I have already made a post some time ago to ask about a good design for LRU caching (in C++). You can find the question, the answer and some code there:
Better understanding the LRU algorithm
I have now tried to multi-thread this code (using pthread) and came up with some really unexpected results. Before even attempting to use locking, I have created a system in which each thread accesses its own cache (see code). I run this code on a 4-core processor. I tried to run it with 1 thread and with 4 threads. When it runs on 1 thread I do 1 million lookups in the cache; on 4 threads, each thread does 250K lookups. I was expecting to get a time reduction with 4 threads but got the opposite: 1 thread runs in 2.2 seconds, while 4 threads run in more than 6 seconds. I just can't make sense of this result.
Is something wrong with my code? Can this be explained somehow (thread management takes time). It would be great to have the feedback from experts. Thanks a lot -
I compile this code with: c++ -o cache cache.cpp -std=c++0x -O3 -lpthread
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <errno.h>
#include <sys/time.h>
#include <list>
#include <cstdlib>
#include <cstdio>
#include <memory>
#include <list>
#include <unordered_map>
#include <stdint.h>
#include <iostream>
typedef uint32_t data_key_t;
using namespace std;
//using namespace std::tr1;
/// A cubic tile of float samples identified by a cache key.
///
/// Each instance owns a heap buffer of tileSize * tileSize * tileSize floats
/// that is released in the destructor. Instances are non-copyable because a
/// copy would share (and later double-free) the same buffer.
class TileData
{
public:
    data_key_t theKey;   ///< Key this tile was created for.
    float *data;         ///< Owned sample buffer, allocated in the constructor.
    static const uint32_t tileSize = 32;
    static const uint32_t tileDataBlockSize;

    /// Allocates the sample buffer for the tile identified by `key`.
    TileData(const data_key_t &key) : theKey(key), data(NULL)
    {
        // Assign to the MEMBER. Declaring a new local here (`float *data = ...`)
        // would shadow the member, leak the allocation and leave `data` NULL.
        data = new float [tileSize * tileSize * tileSize];
    }

    /// Releases the sample buffer.
    ~TileData()
    {
        /* std::cerr << "delete " << theKey << std::endl; */
        delete [] data;   // delete[] on a null pointer is a no-op
    }

    // Owning a raw buffer makes the implicit copy operations unsafe.
    TileData(const TileData &) = delete;
    TileData &operator=(const TileData &) = delete;
};
typedef shared_ptr<TileData> TileDataPtr; // automatic memory management!
/// Creates the tile for `theKey` and returns it wrapped in a shared pointer.
/// No disk access happens here: the tile is simply constructed in memory.
TileDataPtr loadDataFromDisk(const data_key_t &theKey)
{
    TileDataPtr freshTile(new TileData(theKey));
    return freshTile;
}
class CacheLRU
{
public:
list<TileDataPtr> linkedList;
unordered_map<data_key_t, TileDataPtr> hashMap;
CacheLRU() : cacheHit(0), cacheMiss(0) {}
TileDataPtr getData(data_key_t theKey)
{
unordered_map<data_key_t, TileDataPtr>::const_iterator iter = hashMap.find(theKey);
if (iter != hashMap.end()) {
TileDataPtr ret = iter->second;
linkedList.remove(ret);
linkedList.push_front(ret);
++cacheHit;
return ret;
}
else {
++cacheMiss;
TileDataPtr ret = loadDataFromDisk(theKey);
linkedList.push_front(ret);
hashMap.insert(make_pair<data_key_t, TileDataPtr>(theKey, ret));
if (linkedList.size() > MAX_LRU_CACHE_SIZE) {
const TileDataPtr dropMe = linkedList.back();
hashMap.erase(dropMe->theKey);
linkedList.remove(dropMe);
}
return ret;
}
}
static const uint32_t MAX_LRU_CACHE_SIZE = 100;
uint32_t cacheMiss, cacheHit;
};
// Number of worker threads; the 1,000,000 lookups are divided evenly among them.
int numThreads = 1;

/// Thread entry point: performs this thread's share of lookups on a private
/// CacheLRU and prints the elapsed wall-clock time to stderr.
///
/// @param data unused.
/// @return always NULL.
void *testCache(void *data)
{
    (void)data;
    const int lookups = (int)(1000000.f / numThreads);

    // Wall-clock timing. clock() would report CPU time summed over every
    // thread of the process, which inflates per-thread figures.
    struct timeval tv1, tv2;
    gettimeofday(&tv1, NULL);
    printf("Starting thread, lookups %d\n", lookups);

    CacheLRU cache;   // private to this thread, so no locking is needed
    for (int i = 0; i < lookups; ++i) {
        // NOTE(review): random() uses shared global state and may serialize
        // threads internally - confirm, or switch to a per-thread generator.
        int key = random() % 300;
        TileDataPtr tileDataPtr = cache.getData(key);
    }

    gettimeofday(&tv2, NULL);
    const double elapsed = (double)(tv2.tv_sec - tv1.tv_sec)
                         + (double)(tv2.tv_usec - tv1.tv_usec) / 1000000.0;
    std::cerr << "Time (sec): " << elapsed << std::endl;
    // A void* function must return a value; falling off the end is undefined.
    return NULL;
}
/// Starts numThreads worker threads running testCache(), waits for all of
/// them, and prints the total wall-clock time as "seconds.microseconds".
int main()
{
    // Heap array instead of a variable-length array (VLAs are a compiler
    // extension in C++).
    std::unique_ptr<pthread_t[]> thr(new pthread_t[numThreads]);
    struct timeval tv1, tv2;
    // Measuring time before starting the threads...
    gettimeofday(&tv1, NULL);
#if 0
    CacheLRU *c1 = new CacheLRU;
    (*testCache)(c1);
#else
    // Only join the threads that were actually created.
    int started = 0;
    for (; started < numThreads; ++started) {
        const int rc = pthread_create(&thr[started], NULL, testCache, (void*)NULL);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed for thread %d (error %d)\n", started, rc);
            break;
        }
        //pthread_detach(thr[i]);
    }
    for (int i = 0; i < started; ++i) {
        pthread_join(thr[i], NULL);
    }
#endif
    // Measuring time after threads finished...
    gettimeofday(&tv2, NULL);
    // Borrow one second when the microsecond part would go negative.
    if (tv1.tv_usec > tv2.tv_usec)
    {
        tv2.tv_sec--;
        tv2.tv_usec += 1000000;
    }
    // %06ld zero-pads the microseconds: 2 s + 5000 us must print as
    // "2.005000", not "2.5000".
    printf("Result - %ld.%06ld\n", (long)(tv2.tv_sec - tv1.tv_sec),
           (long)(tv2.tv_usec - tv1.tv_usec));
    return 0;
}
A thousand apologies, by keeping debugging the code I realised I made a really bad beginner's mistake, if you look at that code:
// Constructor as originally written (quoted to show the bug): the declaration
// in the body creates a new LOCAL `data` that shadows the member, so the
// member keeps the NULL from the initializer list and the allocation is lost.
TileData(const data_key_t &key) : theKey(key), data(NULL)
{
float *data = new float [tileSize * tileSize * tileSize];
}
from the TileData class, where data is supposed to actually be a member variable of the class... So the right code should be:
// Corrected TileData, abridged: only the members relevant to the fix are
// shown (theKey, tileSize and numAlloc are not declared in this excerpt).
class TileData
{
public:
float *data;
TileData(const data_key_t &key) : theKey(key), data(NULL)
{
// Assigns the member instead of declaring a shadowing local.
data = new float [tileSize * tileSize * tileSize];
// Counts allocations; the counter's declaration is not part of this excerpt.
numAlloc++;
}
};
I am so sorry about that! It's a mistake I have done in the past, and I guess prototyping is great, but it sometimes lead to do such stupid mistakes.
I ran the code with 1 and 4 threads and do now see the speedup. 1 thread takes about 2.3 seconds, 4 threads takes 0.92 seconds.
Thanks all for your help, and sorry if I made you lose your time ;-)
I don't have a concrete answer yet. I can think of several possibilities. One is that testCache() is using random(), which is almost certainly implemented with a single global mutex. (Thus all of your threads are competing for the mutex, which is now ping-ponging between the caches.) ((That's assuming that random() is actually thread-safe on your system.))
Next, testCache() is accessing a CacheLRU which is implemented with unordered_maps and shared_ptrs. The unordered_maps, in particular, might be implemented with some kind of global mutex underneath that is causing all of your threads to compete for access.
To really diagnose what is going on here you should do something much simpler inside of testCache(). (First try just taking the sqrt() of an input variable 250K times (vs. 1M times). Then try linearly accessing a C array of size 250K (or 1M). Slowly build up to the complex thing you are currently doing.)
Another possibility has to do with the pthread_join. pthread_join doesn't return until all the threads are done. So if one is taking longer than the others, you are measuring the slowest one. Your computation here seems balanced, but perhaps your OS is doing something unexpected? (Like mapping several threads to one core (perhaps because you have a hyper-threaded processor?, or one thread is moving from one core to another in the middle of the run (perhaps because the OS thinks it is smart when it is not.)
This will be a bit of a "build it up" answer. I'm running your code on a Fedora 16 Linux system with a 4-core AMD cpu and 16GB of RAM.
I can confirm that I'm seeing similar "slower with more threads" behaviour. I removed the random function, which doesn't improve things at all.
I'm going to make some other minor changes.
Related
I use thread lib to create multiple thread for my c++ program, and I call a executable program in each thread by using system() command. The executable program is multithreading itself.
so I want to ask if there is a limit for threads counts for the executable program called by system() could used in each thread , If there are some rule in thread library or standard library to limit the usage of thread for sub excutable program called by system()command?
Above is my question, If you have any question , you could read my code example below.
please ignore the progress_bar.h, it is nothing to do with my question, it's a head file which is used to show progress bar.
parallel.h is like:
#ifndef parallel_h
#define parallel_h
#include <vector>
#include <functional>
#include <atomic>
#include <thread>
#include "progress_bar.h"
//simple thread pool implementation
//updateFun should be thread-safe!
/// Calls `updateFun` once for every element of `scheduledTasks`, distributing
/// the calls over worker threads, and returns after all workers have finished.
///
/// @param scheduledTasks tasks to process; nothing happens when it is empty.
/// @param updateFun      invoked concurrently from several threads.
/// @param maxThreads     upper bound on worker threads; 0 is treated as 1 so
///                       that the tasks are still processed.
/// @param progressBar    when true, the progress display is advanced after
///                       each task.
template <class T>
void processInParallel(const std::vector<T>& scheduledTasks,
                       std::function<void(const T&)> updateFun,
                       size_t maxThreads, bool progressBar)
{
    if (scheduledTasks.empty()) return;

    const size_t taskCount = scheduledTasks.size();
    std::atomic<size_t> jobId(0);
    ProgressPercent progress(taskCount);
    if (progressBar) progress.advance(0);

    auto threadWorker = [&jobId, &scheduledTasks, &updateFun,
                         &progress, progressBar, taskCount]()
    {
        while (true)
        {
            // Atomically claim the next index. The counter may run past
            // taskCount by at most one per worker, which is harmless.
            const size_t claimed = jobId.fetch_add(1);
            if (claimed >= taskCount)
            {
                return;
            }
            updateFun(scheduledTasks[claimed]);
            // NOTE(review): called from several threads at once; assumes
            // ProgressPercent::advance is thread-safe - confirm.
            if (progressBar) progress.advance();
        }
    };

    // With zero workers the tasks would be dropped silently.
    const size_t requested = (maxThreads == 0) ? 1 : maxThreads;
    std::vector<std::thread> threads(std::min(requested, taskCount));
    for (size_t i = 0; i < threads.size(); ++i)
    {
        threads[i] = std::thread(threadWorker);
    }
    for (size_t i = 0; i < threads.size(); ++i)
    {
        threads[i].join();
    }
}
#endif /* parallel_h */
in main.cpp
#include "parallel.h"
#include <cstdlib>
#include <functional>
#include <iostream>
#include <vector>
/// Runs one external command per job id, using up to five worker threads.
int main(int argc, char** argv){
    (void)argc;
    (void)argv;
    std::vector<int> jobids = {1,2,3,4,5};
    std::function<void(const int& jobid)> testfunc = [](const int& jobid)
    {
        // Each worker blocks in system() until its child process exits.
        const int status = std::system("Call another executable program here!");
        if (status != 0)
        {
            std::cerr << "job " << jobid << ": command returned " << status << std::endl;
        }
    };
    size_t threadnum = 5;
    // processInParallel takes the task list first, then the callback.
    processInParallel(jobids, testfunc, threadnum, true);
    return 0;
}
can anyone give me an answer?
There are two main limits for number of threads in Linux.
One is a limit for total amount of threads in system that can be checked in:
/proc/sys/kernel/threads-max. This limit is system-wide and it gives you insight how many threads can be run by kernel.
It's worth to look also on /proc/sys/vm/max_map_count as it contains information about how many virtual memory areas a process can own.
The second one is indirect limit connected with amount of virtual memory:
number of threads = total virtual memory / (stack size*1024*1024)
As you can create new threads with custom value of stack size this limit can vary. Increasing process virtual memory or decreasing stack size for new threads can allow you to run more threads within single process. Check ulimit for more information.
If your distribution is systemd-based you'd like to look also on UserTasksMax setting.
I have to apologize for my poor English first.
I'm learning hardware transactional memory now and I'm using the spin_rw_mutex.h in TBB to implement the transaction block in C++. speculative_spin_rw_mutex is a class in the spin_rw_mutex.h is a mutex which have already implemented the RTM interface of intel TSX.
The example I used to test RTM is very simple. I created the Account class and I transfer money from one account to another randomly. All accounts are in an accounts array and the size is 100. The random function is in boost.(I think STL has the same random function). The transfer function is protected with the speculative_spin_rw_mutex. I used tbb::parallel_for and tbb::task_scheduler_init to control concurrency. All transfer methods are called in the lambda of paraller_for. The total transfer times is 1 million. The strange thing is when the task_scheduler_init is set as 2 the program is the fastest (8 seconds). In fact my CPU is i7 6700k which has 8 threads. In the range of 8 and 50,000, the performance of the program is almost no change (11 to 12 seconds). When I increase the task_scheduler_init to 100,000, the run time will increase to about 18 seconds.
I tried to use profiler to analyze the program and I found the hotspot function is the mutex. However I think the rate of transaction roll-back is not so high. I don't know why the program is so slow.
Somebody says that the false sharing slows down the performance, as a result, I tried to use
std::vector<Account, cache_aligned_allocator<Account>> cache_aligned_accounts(AccountsSIZE,Account(1000));
to replace the orignal array
Account* accounts[AccountsSIZE];
to avoid the false sharing. It seems nothing changed;
Here is my new codes.
#include <tbb/spin_rw_mutex.h>
#include <iostream>
#include "tbb/task_scheduler_init.h"
#include "tbb/task.h"
#include "boost/random.hpp"
#include <ctime>
#include <tbb/parallel_for.h>
#include <tbb/spin_mutex.h>
#include <tbb/cache_aligned_allocator.h>
#include <vector>
using namespace tbb;
// Global mutex acquired by transfer() for the duration of every transfer.
tbb::speculative_spin_rw_mutex mu;
/// A bank account holding an integer balance. Provides no synchronization of
/// its own; callers serialize access externally.
class Account {
private:
    int balance;
public:
    /// Creates an account whose balance starts at `ba`.
    Account(int ba) : balance(ba) {
    }
    /// Returns the current balance. Const so it works on const accounts.
    int getBalance() const {
        return balance;
    }
    /// Replaces the balance with `ba`.
    void setBalance(int ba) {
        balance = ba;
    }
};
// Moves `amount` from `from` to `to` while holding the global mutex `mu`.
// Throws std::invalid_argument("Illegal amount!") and changes nothing when
// `from` holds less than `amount`.
void transfer(Account &from, Account &to, int amount) {
    speculative_spin_rw_mutex::scoped_lock guard(mu);
    const int available = from.getBalance();
    if (available < amount) {
        throw std::invalid_argument("Illegal amount!");
    }
    from.setBalance(available - amount);
    // Read `to` only after `from` was updated so that a self-transfer
    // (from and to being the same account) nets out to zero.
    const int credited = to.getBalance() + amount;
    to.setBalance(credited);
}
// Number of accounts taking part in the benchmark.
const int AccountsSIZE = 100;
// Random number generator and distributions.
// One engine shared by the whole program, seeded from the current time.
boost::random::mt19937 gener(time(0));
// Picks an account index in [0, AccountsSIZE - 1].
boost::random::uniform_int_distribution<> distIndex(0, AccountsSIZE - 1);
// Picks a transfer amount in [1, 1000].
boost::random::uniform_int_distribution<> distAmount(1, 1000);
/*
Function of transfer money
*/
void all_transfer_task() {
task_scheduler_init init(10000);//Set the number of tasks can be run together
/*
Initial accounts, using cache_aligned_allocator to avoid false sharing
*/
std::vector<Account, cache_aligned_allocator<Account>> cache_aligned_accounts(AccountsSIZE,Account(1000));
const int TransferTIMES = 10000000;
//All transfer tasks
parallel_for(0, TransferTIMES, 1, [&](int i) {
try {
transfer(cache_aligned_accounts[distIndex(gener)], cache_aligned_accounts[distIndex(gener)], distAmount(gener));
}
catch (const std::exception& e)
{
//cerr << e.what() << endl;
}
//std::cout << distIndex(gener) << std::endl;
});
std::cout << cache_aligned_accounts[0].getBalance() << std::endl;
int total_balance = 0;
for (size_t i = 0; i < AccountsSIZE; i++)
{
total_balance += (cache_aligned_accounts[i].getBalance());
}
std::cout << total_balance << std::endl;
}
As Intel TSX works at cache-line granularity, false sharing is definitely the thing to start with. Unfortunately, cache_aligned_allocator does not do what you are probably expecting: it aligns the whole std::vector, but you need each individual Account to occupy a whole cache line to prevent false sharing.
While I can't reproduce your benchmark, I see here two possible causes for this behavior:
"Too many cooks boil the soup": you use a single spin_rw_mutex that is locked by all the transfers by all the threads. Seems to me that your transfers execute sequentially. This would explain why the profile sees a hot point there. The Intel page warns against performance degradation in such case.
Throughput vs. speed: On an i7, in a couple of benchmarks, I could notice that when you use more cores, each core runs a little bit slower, so that the overall time of fixed-size loops is longer. However, counting the overall throughput (i.e. the total number of transactions that happen in all these parallel loops), the throughput is much higher (although not fully proportional to the number of cores).
I'd rather opt for the first case, but the second is not to eliminate.
I have a piece of code that I use to test various containers (e.g. deque and a circular buffer) when passing data from a producer (thread 1) to a consumer (thread 2). A data is represented by a struct with a pair of timestamps. First timestamp is taken before push in the producer, and the second one is taken when data is popped by the consumer.
The container is protected with a pthread spinlock.
The machine runs redhat 5.5 with 2.6.18 kernel (old!), it is a 4-core system with hyperthreading disabled. gcc 4.7 with -std=c++11 flag was used in all tests.
Producer acquires the lock, timestamps the data and pushes it into the queue, unlocks and sleeps in a busy loop for 2 microseconds (the only reliable way I found to sleep for precisely 2 micros on that system).
Consumer locks, pops the data, timestamps it and generates some statistics (running mean delay and standard deviation). The stats is printed every 5 seconds (M is the mean, M2 is the std dev) and reset. I used gettimeofday() to obtain the timestamps, which means that the mean delay number can be thought of as the percentage of delays that exceed 1 microsecond.
Most of the time the output looks like this:
CNT=2500000 M=0.00935 M2=0.910238
CNT=2500000 M=0.0204112 M2=1.57601
CNT=2500000 M=0.0045016 M2=0.372065
but sometimes (probably 1 trial out of 20) like this:
CNT=2500000 M=0.523413 M2=4.83898
CNT=2500000 M=0.558525 M2=4.98872
CNT=2500000 M=0.581157 M2=5.05889
(note the mean number is much worse than in the first case, and it never recovers as the program runs).
I would appreciate thoughts on why this could happen. Thanks.
#include <iostream>
#include <string.h>
#include <stdexcept>
#include <sys/time.h>
#include <deque>
#include <thread>
#include <cstdint>
#include <cmath>
#include <unistd.h>
#include <xmmintrin.h> // _mm_pause()
/// Returns the current wall-clock time from gettimeofday() as microseconds
/// since the Unix epoch.
int64_t timestamp() {
    struct timeval tv;
    gettimeofday(&tv, 0);
    // Widen before multiplying: with a 32-bit long, 1000000L * tv_sec overflows.
    return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
// Incrementally maintained mean and sum of squared deviations (Welford-style
// update). The caller supplies the sample count with every update.
struct StatsM2 {
    double m = 0;     // running mean
    double m2 = 0;    // running sum of squared deviations from the mean
    long count = 0;   // sample count as last passed to update()

    StatsM2() {}

    // Folds sample `x` into the statistics; `c` is the number of samples
    // seen so far, including this one.
    void update(long x, long c) {
        count = c;
        const double offsetBefore = x - m;
        m += offsetBefore / count;
        const double offsetAfter = x - m;
        m2 += offsetBefore * offsetAfter;
    }

    // Clears everything back to the freshly constructed state.
    void reset() {
        m = 0;
        m2 = 0;
        count = 0;
    }

    // Sample variance; 0 until at least two samples were recorded.
    double getM2() {
        if (count <= 1) {
            return 0.;
        }
        return m2 / (count - 1);
    }

    // Standard deviation, i.e. the square root of getM2().
    double getDeviation() {
        const double variance = getM2();
        return std::sqrt(variance);
    }

    // Current running mean.
    double getM() {
        return m;
    }
};
// Spins (without yielding) until at least `usec` microseconds have elapsed
// according to timestamp(), and returns the last timestamp that was read.
int64_t busyloop_microsec_sleep(unsigned long usec) {
    const int64_t begin = timestamp();
    int64_t deadline = begin;
    deadline += usec;
    int64_t now = begin;
    for (; now < deadline; now = timestamp()) {
        // busy wait
    }
    return now;
}
// One queue item carrying two timestamps in microseconds.
struct Data {
    // Stamps the production time at construction. time_consumed starts at 0
    // so the object never holds an indeterminate value when it is copied.
    Data() : time_produced(timestamp() ), time_consumed(0) {}
    int64_t time_produced;   // set at construction
    int64_t time_consumed;   // filled in by the consumer after popping
};
// Microseconds the producer busy-waits between pushes.
int64_t sleep_interval = 2;
// Latency statistics, updated only by the consumer.
StatsM2 statsm2;
// Shared queue; every access is made while holding `spin`.
std::deque<Data> queue;
// Loop flags cleared by main() to stop the threads.
// NOTE(review): plain bools written by one thread and read by another without
// synchronization - consider std::atomic<bool>.
bool producer_running = true;
bool consumer_running = true;
// Spinlock protecting `queue`; initialised in main().
pthread_spinlock_t spin;
// Producer thread body: until producer_running is cleared, pushes one freshly
// timestamped Data under the spinlock, then busy-waits sleep_interval
// microseconds before the next push.
void producer() {
producer_running = true;
while(producer_running) {
pthread_spin_lock(&spin);
// Data() takes its production timestamp here, i.e. while the lock is held.
queue.push_back(Data() );
pthread_spin_unlock(&spin);
busyloop_microsec_sleep(sleep_interval);
}
}
// Consumer thread body: until consumer_running is cleared, pops items under
// the spinlock, records the produce-to-consume delay in statsm2, and prints
// and resets the statistics every print_at items.
void consumer() {
int64_t count = 0;
// Items per report: five seconds' worth at one item per sleep_interval microseconds.
int64_t print_at = 1000000/sleep_interval * 5;
Data data;
consumer_running = true;
while (consumer_running) {
pthread_spin_lock(&spin);
if (queue.empty() ) {
// Nothing queued: release the lock and retry immediately (spins without pausing).
pthread_spin_unlock(&spin);
// _mm_pause();
continue;
}
data = queue.front();
queue.pop_front();
pthread_spin_unlock(&spin);
++count;
// The consume timestamp is taken after the lock has been released.
data.time_consumed = timestamp();
statsm2.update(data.time_consumed - data.time_produced, count);
if (count >= print_at) {
// The value labelled "M2" is the standard deviation (getDeviation()), not the raw second moment.
std::cerr << "CNT=" << count << " M=" << statsm2.getM() << " M2=" << statsm2.getDeviation() << "\n";
statsm2.reset();
count = 0;
}
}
}
// Runs the producer and consumer threads for 40 seconds, then asks both to
// stop and waits for them. Exits with status 2 if the spinlock cannot be
// initialised.
int main(void) {
    // pthread_spin_init() reports failure with a positive error number, so
    // the result has to be compared against 0 rather than tested for "< 0".
    if (pthread_spin_init(&spin, PTHREAD_PROCESS_PRIVATE) != 0)
        exit(2);
    std::thread consumer_thread(consumer);
    std::thread producer_thread(producer);
    sleep(40);
    consumer_running = false;
    producer_running = false;
    consumer_thread.join();
    producer_thread.join();
    // Both threads have finished, so the lock is no longer in use.
    pthread_spin_destroy(&spin);
    return 0;
}
EDIT:
I believe that 5 below is the only thing that can explain 1/2 second latency. When on the same core, each would run for a long time and only then switch to the other.
The rest of the things on the list are too small to cause a 1/2 second delay.
You can use pthread_setaffinity_np to pin your threads to specific cores. You can try different combinations and see how performance changes.
EDIT #2:
More things you should take care of: (who said testing was simple...)
1. Make sure the consumer is already running when the producer starts producing. Not too important in your case as the producer is not really producing in a tight loop.
2. This is very important: you divide by count every time, which is not the right thing to do for your stats. This means that the first measurement in every stats window weight a lot more than the last. To measure the median you have to collect all the values. Measuring the average and min/max, without collecting all numbers, should give you a good enough picture of the latency.
It's not surprising, really.
1. The time is taken in Data(), but then the container spends time calling malloc.
2. Are you running 64 bit or 32? In 32 bit gettimeofday is a system call while in 64 bit it's a VDSO that doesn't get into the kernel... you may want to time gettimeofday itself and record the variance. Or enroll your own using rdtsc.
The best would be to use cycles instead of micros because micros are really too big for this scenario... only the rounding to micros gets you very much skewed when dealing with such a small scale of things
3. Are you guaranteed to not get preempted between producer and consumer? I guess that not. But this should not happen very frequently on a box dedicated to testing...
4. Is it 4 cores on a single socket or 2? if it's a 2 socket box, you want to have the 2 threads on the same socket, or you pay (at least) double for data transfer.
5. Make sure the threads are not running on the same core.
6. If the Data you transfer and the additional data (container node) are sharing cache lines (kind of likely) with other Data+node, the producer would be delayed by the consumer when it writes to the consumed timestamp. This is called false sharing. You can eliminate this by padding/aligning to 64 bytes and using an intrusive container.
gettimeofday is not a good way to profile computation overhead. It is the wall clock and your computer is multiprocessing. Even you think you are not running anything else, the OS scheduler always has some other activities to keep the system running. To profile your process overhead, you have to at least raise the priority of the process you are profiling. Also use high resolution timer or cpu ticks to do the timing measure.
Question:
3 while loops below contain code that has been commented out. I search for ("TAG1", "TAG2", and "TAG3") for easy identification. I simply want the while loops to wait on the condition tested to become true before proceeding while minimizing CPU resources as much as possible. I first tried using Boost condition variables, but there's a race condition. Putting the thread to sleep for 'x' microseconds is inefficient because there is no way to precisely time the wakeup. Finally, boost::this_thread::yield() does not seem to do anything. Probably because I only have 2 active threads on a dual-core system. Specifically, how can I make the three tagged areas below run more efficiently while introducing as little unnecessary blocking as possible.
BACKGROUND
Objective:
I have an application that logs a lot of data. After profiling, I found that much time is consumed on the logging operations (logging text or binary to a file on the local hard disk). My objective is to reduce the latency on logData calls by replacing non-threaded direct write calls with calls to a threaded buffered stream logger.
Options Explored:
Upgrade 2005-era slow hard disk to SSD...possible. Cost is not prohibitive...but involves a lot of work... more than 200 computers would have to be upgraded...
Boost ASIO...I don't need all the proactor / networking overhead, looking for something simpler and more light-weight.
Design:
Producer and consumer thread pattern, the application writes data into a buffer and a background thread then writes it to disk sometime later. So the ultimate goal is to have the writeMessage function called by the application layer return as fast as possible while data is correctly / completely logged to the log file in a FIFO order sometime later.
Only one application thread, only one writer thread.
Based on ring buffer. The reason for this decision is to use as few locks as possible and ideally...and please correct me if I'm wrong...I don't think I need any.
Buffer is a statically-allocated character array, but could move it to the heap if needed / desired for performance reasons.
Buffer has a start pointer that points to the next character that should be written to the file. Buffer has an end pointer that points to the array index after the last character to be written to the file. The end pointer NEVER passes the start pointer. If a message comes in that is larger than the buffer, then the writer waits until the buffer is emptied and writes the new message to the file directly without putting the over-sized message in the buffer (once the buffer is emptied, the worker thread won't be writing anything so no contention).
The writer (worker thread) only updates the ring buffer's start pointer.
The main (application thread) only updates the ring buffer's end pointer, and again, it only inserts new data into the buffer when there is available space...otherwise it either waits for space in the buffer to become available or writes directly as described above.
The worker thread continuously checks to see if there is data to be written (indicated by the case when the buffer start pointer != buffer end pointer). If there is no data to be written, the worker thread should ideally go to sleep and wake up once the application thread has inserted something into the buffer (and changed the buffer's end pointer such that it no longer points to the same index as the start pointer). What I have below involves while loops continuously checking that condition. It is a very bad / inefficient way of waiting on the buffer.
Results:
On my 2009-era dual-core laptop with SSD, I see that the total write time of the threaded / buffered benchmark vs. direct write is about 1 : 6 (0.609 sec vs. 0.095 sec), but highly variable. Often the buffered write benchmark is actually slower than direct write. I believe that the variability is due to the poor implementation of waiting for space to free up in the buffer, waiting for the buffer to empty, and having the worker-thread wait for work to become available. I have measured that some of the while loops consume over 10000 cycles and I suspect that those cycles are actually competing for hardware resources that the other thread (worker or application) requires to finish the computation being waited on.
Output seems to check out. With TEST mode enabled and a small buffer size of 10 as a stress test, I diffed hundreds of MBs of output and found it to equal the input.
Compiles with current version of Boost (1.55)
Header
#ifndef BufferedLogStream_h
#define BufferedLogStream_h
#include <stdio.h>
#include <iostream>
#include <iostream>
#include <cstdlib>
#include "boost\chrono\chrono.hpp"
#include "boost\thread\thread.hpp"
#include "boost\thread\locks.hpp"
#include "boost\thread\mutex.hpp"
#include "boost\thread\condition_variable.hpp"
#include <time.h>
using namespace std;
// Bytes per benchmark record and number of records written per benchmark run.
#define BENCHMARK_STR_SIZE 128
#define NUM_BENCHMARK_WRITES 524288
// Build switches: TEST selects the tiny stress-test buffer and test driver,
// BENCHMARK compiles the benchmark code. Each one also provides a main().
#define TEST 0
#define BENCHMARK 1
// Timed-wait durations (microseconds) for the worker and application threads.
#define WORKER_LOOP_WAIT_MICROSEC 20
#define MAIN_LOOP_WAIT_MICROSEC 10
#if(TEST)
#define BUFFER_SIZE 10
#else
#define BUFFER_SIZE 33554432 // 32 MiB (33554432 bytes)
#endif
// Buffered file logger: the application thread copies messages into a ring
// buffer and a background worker thread (threadedWriter) writes them to the
// file. The application thread advances `end`; the worker advances `start`.
class BufferedLogStream {
public:
BufferedLogStream();
// Opens `filename` for writing and starts the worker thread.
void openFile(char* filename);
// Flushes the underlying FILE stream.
void flush();
// Waits until the buffer has been drained, then flushes and closes the file.
void close();
// Queues `length` bytes of `message`; a message too large for the buffer is written directly to the file.
inline void writeMessage(const char* message, unsigned int length);
void writeMessage(string message);
// True while the buffer holds data that has not been written yet.
bool operator() () { return start != end; }
private:
// Worker-thread loop that drains the ring buffer into the file.
void threadedWriter();
inline bool hasSomethingToWrite();
inline unsigned int getFreeSpaceInBuffer();
void appendStringToBuffer(const char* message, unsigned int length);
FILE* fp;
// Next byte to be written to the file (advanced by the worker thread).
char* start;
// One past the last buffered byte (advanced by the application thread).
char* end;
// One past the last element of ringbuffer.
char* endofringbuffer;
// Storage lives inside the object, so every instance is at least BUFFER_SIZE bytes large.
char ringbuffer[BUFFER_SIZE];
// Cleared by close() to let the worker exit once the buffer is empty.
bool workerthreadkeepalive;
// mtx/waitforempty: the application thread waits here for space or for the buffer to drain.
boost::mutex mtx;
boost::condition_variable waitforempty;
// workmtx/waitforwork: the worker thread waits here for new data.
boost::mutex workmtx;
boost::condition_variable waitforwork;
#if(TEST)
// Random message used by the stress test, together with its length.
struct testbuffer {
int length;
char message[BUFFER_SIZE * 2];
};
public:
void test();
private:
void getNextRandomTest(testbuffer &tb);
// Reference copy of everything the test logs, for comparing against the logger's output.
FILE* datatowrite;
#endif
#if(BENCHMARK)
public:
void runBenchmark();
private:
void initBenchmarkString();
void runDirectWriteBaseline();
void runBufferedWriteBenchmark();
// Fixed record written repeatedly by both benchmarks.
char benchmarkstr[BENCHMARK_STR_SIZE];
#endif
};
#if(TEST)
// Stress-test driver: logs random messages through the logger into
// replicated.txt (test() also writes the same bytes to orig.txt), then waits
// for a key press.
// NOTE(review): defined in the header, and a second main() is compiled when
// BENCHMARK is also enabled.
int main() {
// Heap-allocated: the object embeds the whole ring buffer. It is never deleted.
BufferedLogStream* bl = new BufferedLogStream();
bl->openFile("replicated.txt");
bl->test();
bl->close();
cout << "Done" << endl;
cin.get();
return 0;
}
#endif
#if(BENCHMARK)
// Benchmark driver: runs the direct-write baseline and the buffered-write
// benchmark, then waits for a key press.
int main() {
// Heap-allocated: the object embeds the whole ring buffer. It is never deleted.
BufferedLogStream* bl = new BufferedLogStream();
bl->runBenchmark();
cout << "Done" << endl;
cin.get();
return 0;
}
#endif //for benchmark
#endif
Implementation
#include "BufferedLogStream.h"
// Starts with no file open and an empty ring buffer: start and end both point
// at the first byte of the storage.
BufferedLogStream::BufferedLogStream()
    : fp(NULL),
      start(ringbuffer),
      end(ringbuffer),
      endofringbuffer(ringbuffer + BUFFER_SIZE),
      workerthreadkeepalive(true) {
}
void BufferedLogStream::openFile(char* filename) {
if(fp) close();
workerthreadkeepalive = true;
boost::thread t2(&BufferedLogStream::threadedWriter, this);
fp = fopen(filename, "w+b");
}
// Flushes the C stdio buffer of the log file. Data still sitting in the ring
// buffer is not affected by this call.
void BufferedLogStream::flush() {
fflush(fp);
}
// Requests worker shutdown, waits until the ring buffer has been drained,
// then flushes and closes the file. Only clears the keep-alive flag when no
// file is open.
void BufferedLogStream::close() {
// Cleared first: the worker exits the next time it finds the buffer empty.
workerthreadkeepalive = false;
if(!fp) return;
// Re-check after every notification or after MAIN_LOOP_WAIT_MICROSEC at the latest.
while(hasSomethingToWrite()) {
boost::unique_lock<boost::mutex> u(mtx);
waitforempty.wait_for(u, boost::chrono::microseconds(MAIN_LOOP_WAIT_MICROSEC));
}
flush();
fclose(fp);
fp = NULL;
}
// Worker-thread loop: writes whatever lies between `start` and `end` to the
// file, advances `start`, and sleeps briefly when there is nothing to write.
// Returns once the buffer is empty and workerthreadkeepalive has been cleared.
void BufferedLogStream::threadedWriter() {
while(true) {
if(start != end) {
// Snapshot `end` once so the region handled in this pass stays fixed.
char* currentend = end;
if(start < currentend) {
// Pending bytes are contiguous: [start, currentend).
fwrite(start, 1, currentend - start, fp);
}
else if(start > currentend) {
// Pending bytes wrap around: the tail [start, endofringbuffer) first, then the head [ringbuffer, currentend).
if(start != endofringbuffer) fwrite(start, 1, endofringbuffer - start, fp);
fwrite(ringbuffer, 1, currentend - ringbuffer, fp);
}
// Mark the region as consumed, then wake a thread waiting on waitforempty.
start = currentend;
waitforempty.notify_one();
}
else { //start == end...no work to do
// Exit only when the buffer is empty and shutdown was requested.
if(!workerthreadkeepalive) return;
// Sleep until notified through waitforwork, or WORKER_LOOP_WAIT_MICROSEC at most.
boost::unique_lock<boost::mutex> u(workmtx);
waitforwork.wait_for(u, boost::chrono::microseconds(WORKER_LOOP_WAIT_MICROSEC));
}
}
}
// True while the ring buffer holds bytes that have not been written yet
// (start and end differ).
bool BufferedLogStream::hasSomethingToWrite() {
return start != end;
}
// Convenience overload: forwards the string's characters and length to the
// (pointer, length) overload.
void BufferedLogStream::writeMessage(string message) {
    const char* text = message.c_str();
    const unsigned int byteCount = message.length();
    writeMessage(text, byteCount);
}
// Number of bytes that can be appended right now. One byte is always kept
// free, so an empty buffer reports BUFFER_SIZE - 1.
unsigned int BufferedLogStream::getFreeSpaceInBuffer() {
    if(end == start) {
        return BUFFER_SIZE-1;   // empty buffer
    }
    if(end < start) {
        return start - end - 1; // free region is the single gap between end and start
    }
    // end > start: free space is the head before start plus the tail after end.
    return (start - ringbuffer) + (endofringbuffer - end) - 1;
}
// Copies `length` bytes of `message` into the ring buffer at `end` and
// advances `end`, wrapping to the start of the storage when the message does
// not fit in the remaining tail. The caller has already checked that enough
// free space exists.
void BufferedLogStream::appendStringToBuffer(const char* message, unsigned int length) {
    if(end + length > endofringbuffer) {
        // Split the copy: fill the tail, then continue at the beginning.
        const int tailRoom = endofringbuffer - end;
        if(tailRoom > 0) {
            memcpy(end, message, tailRoom);
        }
        const int wrappedBytes = length - tailRoom;
        memcpy(ringbuffer, message + tailRoom, wrappedBytes);
        end = ringbuffer + wrappedBytes;
        return;
    }
    // Common case: the whole message fits before the end of the storage.
    memcpy(end, message, length);
    end += length;
}
// Queues `length` bytes of `message` for the worker thread.
// - A message that can never fit (length > BUFFER_SIZE - 1) waits for the
//   buffer to drain and is then written directly to the file.
// - Otherwise the call waits until enough space is free and copies the
//   message into the ring buffer.
// The worker is notified in both cases.
void BufferedLogStream::writeMessage(const char* message, unsigned int length) {
    if(length > BUFFER_SIZE - 1) { //if string is too large for buffer, wait for buffer to empty and bypass buffer, write directly to file
        // No ';' after the condition: a stray semicolon here makes the loop
        // body empty (a pure busy spin) and runs the timed wait only once.
        while(hasSomethingToWrite()) {
            boost::unique_lock<boost::mutex> u(mtx);
            waitforempty.wait_for(u, boost::chrono::microseconds(MAIN_LOOP_WAIT_MICROSEC));
        }
        // The buffer is empty, so the worker has nothing left to write.
        fwrite(message, 1, length, fp);
    }
    else {
        //wait until there is enough free space to insert new string
        while(getFreeSpaceInBuffer() < length) {
            boost::unique_lock<boost::mutex> u(mtx);
            waitforempty.wait_for(u, boost::chrono::microseconds(MAIN_LOOP_WAIT_MICROSEC));
        }
        appendStringToBuffer(message, length);
    }
    waitforwork.notify_one();
}
#if(TEST)
// Fills `tb` with a random run of upper-case letters followed by '\n' and a
// terminating '\0'. tb.length counts the letters plus the newline.
void BufferedLogStream::getNextRandomTest(testbuffer &tb) {
    const int letters = 1 + (rand() % (int)(BUFFER_SIZE * 1.05));
    int pos = 0;
    while(pos < letters) {
        tb.message[pos] = rand() % 26 + 65;   // 65 is 'A'
        ++pos;
    }
    tb.message[pos] = '\n';
    ++pos;
    tb.message[pos] = '\0';
    tb.length = pos;
}
// Stress test: sends 7,000,000 random messages through writeMessage() and
// writes the same bytes directly to orig.txt, so that the logger's output
// file can be compared against it afterwards.
void BufferedLogStream::test() {
cout << "Buffer size is: " << BUFFER_SIZE << endl;
testbuffer tb;
// NOTE(review): the result of fopen is not checked before it is used.
datatowrite = fopen("orig.txt", "w+b");
for(unsigned int i = 0; i < 7000000; i++) {
// Progress output every million messages.
if(i % 1000000 == 0) cout << i << endl;
getNextRandomTest(tb);
writeMessage(tb.message, tb.length);
fwrite(tb.message, 1, tb.length, datatowrite);
}
fflush(datatowrite);
fclose(datatowrite);
}
#endif
#if(BENCHMARK)
// Fills benchmarkstr with random upper-case letters and ends it with '\n'.
// The record is not NUL-terminated.
void BufferedLogStream::initBenchmarkString() {
    const unsigned int lastIndex = BENCHMARK_STR_SIZE - 1;
    unsigned int pos = 0;
    while(pos < lastIndex) {
        benchmarkstr[pos] = rand() % 26 + 65;   // 65 is 'A'
        ++pos;
    }
    benchmarkstr[lastIndex] = '\n';
}
// Baseline measurement: writes the benchmark record NUM_BENCHMARK_WRITES
// times straight to BenchMarkBaseline.txt and prints the elapsed clock() time.
void BufferedLogStream::runDirectWriteBaseline() {
    clock_t starttime = clock();
    // Use a local stream rather than the member `fp`. Leaving `fp` pointing
    // at a closed FILE makes the next openFile() call close() on it, which
    // flushes and closes an already closed stream.
    FILE* baseline = fopen("BenchMarkBaseline.txt", "w+b");
    if(!baseline) {
        cout << "Could not open BenchMarkBaseline.txt" << endl;
        return;
    }
    for(unsigned int i = 0; i < NUM_BENCHMARK_WRITES; i++) {
        fwrite(benchmarkstr, 1, BENCHMARK_STR_SIZE, baseline);
    }
    fflush(baseline);
    fclose(baseline);
    clock_t elapsedtime = clock() - starttime;
    cout << "Direct write baseline took " << ((double) elapsedtime) / CLOCKS_PER_SEC << " seconds." << endl;
}
/**
 * Buffered measurement: opens "BufferedBenchmark.txt" through openFile(),
 * sends `benchmarkstr` through writeMessage() NUM_BENCHMARK_WRITES times,
 * calls close(), and prints the elapsed clock() time in seconds.  A short
 * progress line is printed after each phase.
 */
void BufferedLogStream::runBufferedWriteBenchmark() {
    const clock_t begin = clock();

    openFile("BufferedBenchmark.txt");
    cout << "Opend file" << endl;

    for(unsigned int done = 0; done < NUM_BENCHMARK_WRITES; ++done) {
        writeMessage(benchmarkstr, BENCHMARK_STR_SIZE);
    }
    cout << "Wrote" << endl;

    close();
    cout << "Close" << endl;

    const clock_t ticks = clock() - begin;
    const double seconds = ((double) ticks) / CLOCKS_PER_SEC;
    cout << "Buffered write took " << seconds << " seconds." << endl;
}
/**
 * Benchmark entry point: prints the buffer size, builds the payload string,
 * then runs the direct-write baseline followed by the buffered benchmark.
 */
void BufferedLogStream::runBenchmark() {
    cout << "Buffer size is: ";
    cout << BUFFER_SIZE << endl;

    initBenchmarkString();

    // Baseline first, buffered version second.
    runDirectWriteBaseline();
    runBufferedWriteBenchmark();
}
#endif
Update: November 25, 2013
I updated the code below to use boost::condition_variable, specifically the wait_for() method as recommended by Evgeny Panasyuk. This avoids unnecessarily checking the same condition over and over again. I am currently seeing the buffered version run in about 1/6th the time of the unbuffered / direct-write version. This is not the ideal case because both cases are limited by the hard disk (in my case a 2010 era SSD). I plan to use the code below in an environment where the hard disk will not be the bottleneck and, most if not all of the time, the buffer should have space available to accommodate the writeMessage requests. That brings me to my next question. How big should I make the buffer? I don't mind allocating 32 MB or 64 MB to ensure that it never fills up. The code will be running on systems that can spare that. Intuitively, I feel that it's a bad idea to statically allocate a 32 MB character array. Is it? Anyhow, I expect that when I run the code below for my intended application, the latency of logData() calls will be greatly reduced, which will yield a significant reduction in overall processing time.
If anyone sees any way to make the code below better (faster, more robust, leaner, etc.), please let me know. I appreciate the feedback. Lazin, how would your approach be faster or more efficient than what I have posted below? I kind of like the idea of just having one buffer and making it large enough so that it practically never fills up. Then I don't have to worry about reading from different buffers. Evgeny Panasyuk, I like the approach of using existing code whenever possible, especially if it's an existing boost library. However, I also don't see how the spsc_queue is more efficient than what I have below. I'd rather deal with one large buffer than many smaller ones and have to worry about splitting my input stream on the input and splicing it back together on the output. Your approach would allow me to offload the formatting from the main thread onto the worker thread. That is a clever approach. But I'm not sure yet whether it will save a lot of time, and to realize the full benefit I would have to modify code that I do not own.
//End Update
General solution.
I think you should look at Nagle's algorithm. For one producer and one consumer it would look like this:
At the beginning buffer is empty, worker thread is idle and waiting for the events.
Producer writes data to the buffer and notifies worker thread.
Worker thread woke up and start the write operation.
Producer tries to write another message, but buffer is used by worker, so producer allocates another buffer and writes message to it.
Producer tries to write another message, I/O still in progress so producer writes message to previously allocated buffer.
Worker thread done writing buffer to file and sees that there is another buffer with data so it grabs it and starts to write.
The very first buffer is used by producer to write all consecutive messages, until second write operation in progress.
This scheme will help achieve the low-latency requirement: a single message will be written to disk almost immediately, but a large number of events will be written in large batches for greater throughput.
If your log messages have levels - you can improve this schema a little bit. All error messages have high priority(level) and must be saved on disc immediately (because they are rare but very valuable) but debug and trace messages have low priority and can be buffered to save bandwidth (because they are very frequent but not as valuable as error and info messages). So when you write error message, you must wait until worker thread is done writing your message (and all messages that are in the same buffer) and then continue, but debug and trace messages can be just written to buffer.
Threading.
Spawning a worker thread for each application thread is too costly. You must use a single writer thread for each log file. Write buffers must be shared between threads. Each buffer must have two pointers - commit_pointer and prepare_pointer. All buffer space between the beginning of the buffer and commit_pointer is available to the worker thread. Buffer space between commit_pointer and prepare_pointer is currently being updated by application threads. Invariant: commit_pointer <= prepare_pointer.
Write operations can be performed in two steps.
Prepare write. This operation reserves space in a buffer.
Producer calculates len(message) and atomically updates prepare_pointer;
Old prepare_pointer value and len is saved by consumer;
Commit write.
Producer writes message at the beginning of the reserved buffer space (old prepare_pointer value).
Producer busy-waits until commit_pointer is equal to the old prepare_pointer value that it saved in a local variable.
Producer commit write operation by doing commit_pointer = commit_pointer + len atomically.
To prevent false sharing, len(message) can be rounded to cache line size and all extra space can be filled with spaces.
// pseudocode
void write(const char* message) {
int len = strlen(message); // TODO: round to cache line size
const char* old_prepare_ptr;
// Prepare step
while(1)
{
old_prepare_ptr = prepare_ptr;
if (
CAS(&prepare_ptr,
old_prepare_ptr,
prepare_ptr + len) == old_prepare_ptr
)
break;
// retry if another thread perform prepare op.
}
// Write message
memcpy((void*)old_prepare_ptr, (void*)message, len);
// Commit step
while(1)
{
const char* old_commit_ptr = commit_ptr;
if (
CAS(&commit_ptr,
old_commit_ptr,
old_commit_ptr + len) == old_commit_ptr
)
break;
// retry if another thread commits
}
notify_worker_thread();
}
concurrent_queue<T, Size>
The question that I have is how to make the worker thread go to work as soon as there is work to do and sleep when there is no work.
There is boost::lockfree::spsc_queue - wait-free single-producer single-consumer queue. It can be configured to have compile-time capacity (the size of the internal ringbuffer).
From what I understand, you want something similar to following configuration:
template<typename T, size_t N>
class concurrent_queue
{
// T can be wrapped into struct with padding in order to avoid false sharing
mutable boost::lockfree::spsc_queue<T, boost::lockfree::capacity<N>> q;
mutable mutex m;
mutable condition_variable c;
void wait() const
{
unique_lock<mutex> u(m);
c.wait_for(u, chrono::microseconds(1)); // Or whatever period you need.
// Timeout is required, because modification happens not under mutex
// and notification can be lost.
// Another option is just to use sleep/yield, without notifications.
}
void notify() const
{
c.notify_one();
}
public:
void push(const T &t)
{
while(!q.push(t))
wait();
notify();
}
void pop(T &result)
{
while(!q.pop(result))
wait();
notify();
}
};
When there are elements in queue - pop does not block. And when there is enough space in internal buffer - push does not block.
concurrent<T>
I want to reduce both formatting and write times as much as possible so I plan to reduce both.
Check out Herb Sutter's talk at C++ and Beyond 2012: C++ Concurrency. On page 14 he shows an example of concurrent<T>. Basically it is a wrapper around an object of type T which starts a separate thread for performing all operations on that object. Usage is:
concurrent<ostream*> x(&cout); // starts thread internally
// ...
// x acts as function object.
// Its function call operator accepts an action
// which is performed on wrapped object in separate thread.
int i = 42;
x([i](ostream *out){ *out << "i=" << i; }); // passing lambda as action
You can use similar pattern in order to offload all formatting work to consumer thread.
Small Object Optimization
Otherwise, new buffers are allocated and I want to avoid memory allocation after the buffer stream is constructed.
Above concurrent_queue<T, Size> example uses fixed-size buffer which is fully contained within queue, and does not imply additional allocations.
However, Herb's concurrent<T> example uses std::function to pass action into worker thread. That may incur costly allocation.
std::function implementations may use Small Object Optimization (and most implementations do) - small function objects are in-place copy-constructed in internal buffer, but there is no guarantee, and for function objects bigger than threshold - heap allocation would happen.
There are several options to avoid this allocation:
Implement std::function analog with internal buffer large enough to hold target function objects (for example, you can try to modify boost::function or this version).
Use your own function object which would represent all types of log messages. Basically it would contain just the values required to format a message. As there are potentially different types of messages, consider using boost::variant (which is literally a union coupled with a type tag) to represent them.
Putting it all together, here is proof-of-concept (using second option):
LIVE DEMO
#include <boost/lockfree/spsc_queue.hpp>
#include <boost/optional.hpp>
#include <boost/variant.hpp>
#include <condition_variable>
#include <iostream>
#include <cstddef>
#include <thread>
#include <chrono>
#include <mutex>
using namespace std;
/*********************************************/
template<typename T, size_t N>
class concurrent_queue
{
mutable boost::lockfree::spsc_queue<T, boost::lockfree::capacity<N>> q;
mutable mutex m;
mutable condition_variable c;
void wait() const
{
unique_lock<mutex> u(m);
c.wait_for(u, chrono::microseconds(1));
}
void notify() const
{
c.notify_one();
}
public:
void push(const T &t)
{
while(!q.push(t))
wait();
notify();
}
void pop(T &result)
{
while(!q.pop(result))
wait();
notify();
}
};
/*********************************************/
template<typename T, typename F>
class concurrent
{
typedef boost::optional<F> Job;
mutable concurrent_queue<Job, 16> q; // use custom size
mutable T x;
thread worker;
public:
concurrent(T x)
: x{x}, worker{[this]
{
Job j;
while(true)
{
q.pop(j);
if(!j) break;
(*j)(this->x); // you may need to handle exceptions in some way
}
}}
{}
void operator()(const F &f)
{
q.push(Job{f});
}
~concurrent()
{
q.push(Job{});
worker.join();
}
};
/*********************************************/
// One log record: holds either an int or a double and, when invoked with an
// output stream, prints the value with a label that depends on its type.
struct LogEntry
{
    // Visitor that formats the active alternative of `data` onto *out.
    struct Formatter
    {
        using result_type = void;

        ostream *out;

        void operator()(int x) const
        {
            *out << "integer: " << x << endl;
        }

        void operator()(double x) const
        {
            *out << "floating point: " << x << endl;
        }
    };

    boost::variant<int, double> data;

    // Writes this entry to *out.
    void operator()(ostream *out)
    {
        boost::apply_visitor(Formatter{out}, data);
    }
};
/*********************************************/
int main()
{
concurrent<ostream*, LogEntry> log{&cout};
for(int i=0; i!=1024; ++i)
{
log({i});
log({i/10.});
}
}
I have a program that starts up and within about 5 minutes the virtual size of process is about 13 gigs. It runs on Linux, uses boost, gnu c++ library and various other 3rd party libraries.
After 5 minutes size stays at 13 gigs and rss size steady at around 5 gigs.
I can't just run it in a debugger because at startup about 30 threads are started, each of which starts running its own code, that does various allocations. So stepping through and checking virtual memory at different parts of code at each breakpoint is not feasible.
I thought of changing program to start each thread one at a time to make it easier to track allocation of memory, but before doing this are there any good tools?
Valgrind is fairly slow, maybe tcmalloc could provide the info?
I would use valgrind (perhaps run it an entire night) or else use Boehm GC.
Alternatively, use the proc(5) filesystem to understand (e.g. thru /proc/$pid/statm & /proc/$pid/maps) when a lot of memory gets allocated.
The most important thing is to find memory leaks. If the memory doesn't grow after startup, it is less of an issue.
Perhaps adding instance counters to each class might help (use atomic integers or mutexes to serialize them).
If the program's source code is big (e.g. a million of source lines) so that spending several days/weeks is worth the effort, perhaps customizing the GCC compiler (e.g. with MELT) might be relevant.
a std::set minibenchmark
You mentioned big std::set based upon million rows.
#include <set>
#include <string>
#include <string.h>
#include <cstdio>
#include <cstdlib>
#include <unistd.h>
#include <time.h>
// Small value type for the std::set benchmark: an int key plus its decimal
// text form, stored in a fixed char array so that the members add up to
// 16 bytes.  Ordering is by the integer key only.
class MyElem
{
    int _n;
    char _s[16 - sizeof(int)];

public:
    // Stores k and its "%d" rendering.
    MyElem(int k) : _n(k)
    {
        snprintf(_s, sizeof(_s), "%d", k);
    }

    // Clears both members on destruction.
    ~MyElem()
    {
        _n = 0;
        memset(_s, 0, sizeof(_s));
    }

    // Integer key.
    int n() const { return _n; }

    // Decimal text of the key as a std::string.
    std::string str() const
    {
        std::string text(_s);
        return text;
    }

    // True when this key is strictly smaller than x's key.
    bool less(const MyElem &x) const
    {
        return x._n > _n;
    }
};
// Strict ordering for MyElem, delegating to MyElem::less.
bool operator < (const MyElem& lhs, const MyElem& rhs)
{
    const bool lhsIsSmaller = lhs.less(rhs);
    return lhsIsSmaller;
}
typedef std::set<MyElem> MySet;
// Inserts the keys 0 .. cnt*1024 - 1 into `set`, followed by one extra key
// derived from the current time.
void bench (int cnt, MySet& set)
{
    const long total = (long)cnt*1024;
    long key = 0;
    while (key < total)
    {
        set.insert(MyElem(key));
        ++key;
    }

    // One more element whose value depends on the wall clock.
    time_t now = time(NULL);
    set.insert (((now) & 0xfffffff) * 100);
}
// Benchmark driver.  The optional first argument is the iteration count in
// units of 1024 insertions (default 256, minimum 16).  After timing bench()
// with clock(), it runs `pmap <pid>` to show the process memory map, prints
// the timing, and finally looks the pid up in the set.
int main (int argc, char** argv)
{
    MySet s;

    int c = 256;
    if (argc > 1)
        c = atoi(argv[1]);
    if (c < 16)
        c = 16;
    printf ("c=%d Kiter\n", c);

    const clock_t cstart = clock();
    bench (c, s);
    const clock_t cend = clock();

    const int x = getpid();
    char cmdbuf[64];
    snprintf(cmdbuf, sizeof(cmdbuf), "pmap %d", x);
    printf ("running %s\n", cmdbuf);
    fflush (NULL);   // flush all output before the child process prints
    system(cmdbuf);
    putchar('\n');

    const clock_t ticks = cend - cstart;
    printf ("at end c=%d Kiter clockdiff=%.2f millisec = %.f µs/Kiter\n",
            c, ticks*1.0e-3, (double)ticks/c);

    const bool present = (s.find(x) != s.end());
    if (present)
        printf("set has %d\n", x);
    else
        printf("set don't contain %d\n", x);
    return 0;
}
Notice the 16 bytes sizeof(MyElem). On Debian/Sid/AMD64 with GCC 4.8.1 (intel i3770K processor, 16Gbytes RAM) and compiling that bench with g++ -Wall -O1 tset.cc -o ./tset-01
With 32768 thousands of iterations, so 32M elements:
total 2109592K
(last line above given by pmap)
at end c=32768 Kiter clockdiff=16470.00 millisec = 503 µs/Kiter
Then the implicit time from my zsh
./tset-01 32768 16.77s user 0.54s system 99% cpu 17.343 total
This is about 2.1 Gbytes, so roughly 64.3 bytes per element including the set member overhead (since sizeof(MyElem)==16, the set seems to have a non-negligible cost of perhaps 6 words per element).