So I have some strange behavior in my multi-threading code, and I'm not sure whether something is wrong with it or whether it is just a physical side effect of the multi-sensor system or of the OS.
I'm working on Windows 10, in C++.
I have three cameras, one RealSense and two thermal cameras, which are synchronized by a physical connection in master-slave mode, and my goal is to record huge amounts of data (dozens of GB) to an external hard drive.
The idea I came up with is to use mapped_file from the Boost library to map many files (or one huge one) on the disk, and to use a callback for each camera. Each callback is invoked on a new thread and memcpy's the data block of the frame into the mapped_file on the external HD. In the main function there is another polling loop over the count of written frames, which is an atomic variable (I know this could probably be done more effectively with semaphores, but for now I think that part works more or less); each time the count reaches some limit, the main loop locks the threads with a unique_lock and swaps the pointer of the current mapped_file to the next one (I define and open all the needed files on the external HD in advance).
Now let me show you what all of it looks like.
The callbacks for the cameras: one for the RealSense, which handles 2 frames (depth and RGB) each time, and two for the thermal cameras:
class RSCallback {
public:
char** color_mfd_ptr = new char*;
char** depth_mfd_ptr = new char*;
vector<long long int>& color_depth_ts;
volatile int& idx_depth;
std::atomic<volatile int>& idx_color;
std::mutex& mux;
boost::shared_mutex& shared_mux;
public:
RSCallback(char* cmfd_ptr, char* dmfd_ptr, vector<long long int>& ccdts,
std::atomic<volatile int>& idxc, int& idxd, std::mutex& mx,
boost::shared_mutex& smx) :
idx_color(idxc), idx_depth(idxd),
color_depth_ts(ccdts), mux(mx), shared_mux(smx){
*color_mfd_ptr = cmfd_ptr;
*depth_mfd_ptr = dmfd_ptr;
}
// This operator overloading enables calling
// operator function () on objects of increment
void operator () (const rs2::frame &frame) {
boost::shared_lock<boost::shared_mutex> shared_lock(this->shared_mux);
std::lock_guard<std::mutex> lock(this->mux);
if (rs2::frameset fs = frame.as<rs2::frameset>()) {
const std::chrono::time_point<std::chrono::steady_clock> now =
high_resolution_clock::now();
long long int loc_ts =
std::chrono::duration_cast<std::chrono::nanoseconds>(
now.time_since_epoch()).count();
this->color_depth_ts.push_back(loc_ts);
for (const rs2::frame f: fs) {
auto vf = f.as<rs2::video_frame>();
if (vf.get_bytes_per_pixel() == 2) {
size_t sz = vf.get_data_size();
memcpy((void *) ((uint8_t *) (*depth_mfd_ptr) +
idx_depth * sz), vf.get_data(), sz);
idx_depth++;
} else {
size_t sz = vf.get_data_size();
memcpy((void *) ((uint8_t *) (*color_mfd_ptr) +
idx_color * sz), vf.get_data(), sz);
idx_color.fetch_add(1, std::memory_order_relaxed);
}
}
}
}
};
class TC1Callback {
public:
char** tc_mfd_ptr = new char*;
vector<long long int> &tc_ts;
int& idx_tc;
size_t sz;
std::mutex &mux;
boost::shared_mutex &shared_mux;
public:
TC1Callback(char *tcm_ptr, vector<long long int> &tcts, int& ixtc,
size_t tc_size, std::mutex &mx, boost::shared_mutex &smx) :
tc_ts(tcts), idx_tc(ixtc), sz(tc_size),
mux(mx), shared_mux(smx) {
*tc_mfd_ptr = tcm_ptr;
}
// This operator overloading enables calling
// operator function () on objects of increment
void operator()(const vector <uint8_t> &cur_frame) {
boost::shared_lock<boost::shared_mutex> shared_lock(shared_mux);
std::lock_guard <std::mutex> lock(mux);
memcpy((void *) ((uint8_t *) (*tc_mfd_ptr) + idx_tc * sz), cur_frame.data(), sz);
const std::chrono::time_point <std::chrono::steady_clock> now = high_resolution_clock::now();
long long int loc_ts = std::chrono::duration_cast<std::chrono::nanoseconds>(
now.time_since_epoch()).count();
tc_ts.push_back(loc_ts);
idx_tc++;
}
};
class TC2Callback {
public:
char** tc_mfd_ptr = new char*;
vector<long long int> &tc_ts;
int& idx_tc;
size_t sz;
std::mutex &mux;
boost::shared_mutex &shared_mux;
public:
TC2Callback(char *tcm_ptr, vector<long long int> &tcts, int& ixtc,
size_t tc_size, std::mutex &mx, boost::shared_mutex &smx) :
tc_ts(tcts), idx_tc(ixtc), sz(tc_size),
mux(mx), shared_mux(smx) {
*tc_mfd_ptr = tcm_ptr;
}
// This operator overloading enables calling
// operator function () on objects of increment
void operator()(const vector <uint8_t> &cur_frame) {
boost::shared_lock<boost::shared_mutex> shared_lock(shared_mux);
std::lock_guard <std::mutex> lock(mux);
memcpy((void *) ((uint8_t *) (*tc_mfd_ptr) + idx_tc * sz), cur_frame.data(), sz);
const std::chrono::time_point <std::chrono::steady_clock> now = high_resolution_clock::now();
long long int loc_ts = std::chrono::duration_cast<std::chrono::nanoseconds>(
now.time_since_epoch()).count();
tc_ts.push_back(loc_ts);
idx_tc++;
}
};
Here is the save callback, which just helps to parallelize saving all the data and closing the opened mapped_files:
class SaveCallback {
public:
mapped_file& color_mapped_fd;
mapped_file& depth_mapped_fd;
mapped_file& tc1_mapped_fd;
mapped_file& tc2_mapped_fd;
vector<long long int>& color_depth_ts;
vector<long long int>& tc1_ts;
vector<long long int>& tc2_ts;
int idx;
string save_dir;
public:
SaveCallback(mapped_file& cmfd, mapped_file& dmfd, mapped_file& tc1fd,
mapped_file& tc2fd, vector<long long int>& cdts,
vector<long long int>& tc1ts, vector<long long int>& tc2ts,
int ix, string sdir) : color_mapped_fd(cmfd), depth_mapped_fd(dmfd),
tc1_mapped_fd(tc1fd), color_depth_ts(cdts),
tc1_ts(tc1ts), tc2_ts(tc2ts), tc2_mapped_fd(tc2fd),
idx(ix), save_dir(sdir) {}
// This operator overloading enables calling
// operator function () on objects of increment
void operator()() {
color_mapped_fd.close();
depth_mapped_fd.close();
tc1_mapped_fd.close();
tc2_mapped_fd.close();
ofstream cd_fout;
string color_depth_ts_name = save_dir + to_string(idx) + "color_depth_ts.bin";
cd_fout.open(color_depth_ts_name, ios::binary | ios::out);
cd_fout.write((char *) color_depth_ts.data(),
color_depth_ts.size() * sizeof(long long int));
cd_fout.close();
ofstream tc1_fout;
string tc1_ts_name = save_dir + to_string(idx) + "tc1_ts.bin";
tc1_fout.open(tc1_ts_name, ios::binary | ios::out);
tc1_fout.write((char *) tc1_ts.data(),
tc1_ts.size() * sizeof(long long int));
tc1_fout.close();
ofstream tc2_fout;
string tc2_ts_name = save_dir + to_string(idx) + "tc2_ts.bin";
tc2_fout.open(tc2_ts_name, ios::binary | ios::out);
tc2_fout.write((char *) tc2_ts.data(),
tc2_ts.size() * sizeof(long long int));
tc2_fout.close();
}
};
The main function is the following:
int main() {
string save_dir = "G:/Vista_project/";
//Connect first Thermal Cam with default settings
auto serialNumber = "serial1";
auto wic = wic::findAndConnect(serialNumber);
if (!wic) {
cerr << "Could not connect WIC: " << serialNumber << endl;
return 1;
}
auto defaultRes = wic->doDefaultWICSettings();
if (defaultRes.first != wic::ResponseStatus::Ok) {
cerr << "DoDefaultWICSettings: "
<< wic::responseStatusToStr(defaultRes.first) << endl;
return 2;
}
//Connect second Thermal Cam with default settings
auto serialNumber2 = "serials2";
auto wic2 = wic::findAndConnect(serialNumber2);
if (!wic2) {
cerr << "Could not connect WIC: " << serialNumber2 << endl;
return 1;
}
auto defaultRes2 = wic2->doDefaultWICSettings();
if (defaultRes2.first != wic::ResponseStatus::Ok) {
cerr << "DoDefaultWICSettings: "
<< wic::responseStatusToStr(defaultRes2.first) << endl;
return 2;
}
//Additional settings done in wic example code
// enable advanced features
wic->iKnowWhatImDoing();
// enable advanced features
wic2->iKnowWhatImDoing();
// set advanced radiometry if core supports it
// set core gain
auto gain = wic->setGain(wic::GainMode::High);
// set core gain
auto gain2 = wic2->setGain(wic::GainMode::High);
auto grabber1 = wic->frameGrabber();
grabber1->setup();
auto grabber2 = wic2->frameGrabber();
grabber2->setup();
//Manual mode of camera adjustment
auto status1 = wic->setFFCMode(wic::FFCModes::Manual);
auto status2 = wic2->setFFCMode(wic::FFCModes::Manual);
auto emode = wic::ExternalSyncMode(0x0001); //0x0001
auto resp1 = wic->setExternalSyncMode(emode);
auto emode2 = wic::ExternalSyncMode(0x0002); //0x0002
auto resp2 = wic2->setExternalSyncMode(emode2);
//Sanity check with cameras resolutions
auto resolution = wic->getResolution();
if (resolution.first == 0 || resolution.second == 0) {
cerr << "Invalid resolution, core detection error." << endl;
return 3;
}
auto resolution2 = wic2->getResolution();
if (resolution2.first == 0 || resolution2.second == 0) {
cerr << "Invalid resolution, core detection error." << endl;
return 3;
}
//No-Zoom in thermal cams
auto zoom_video_mode_None = wic::VideoModeZoom(0);
wic->setVideoModeZoom(zoom_video_mode_None);
wic2->setVideoModeZoom(zoom_video_mode_None);
//time to record for one partition before switching to the next memory block to write
int time_to_record = 600;
//cameras fps
int rs_fps = 30 ;
int tc_fps = 9 + 3;
//depth and rgb params
int rgb_ch = 3;
int depth_px_sz = 2;
int tc_px_sz = 2;
//memory allocations size for single image and for total of images per
// memory block (time_to_record function)
size_t total_tc_size = 640LL * 512 * tc_px_sz * tc_fps * time_to_record;
size_t tc_size = 640 * 512 * 2;
long long color_size = 720LL * 1280 * rgb_ch * rs_fps * time_to_record;
long long depth_size = 720LL * 1280 * depth_px_sz * rs_fps * time_to_record;
//number of partitions which gives:
// total time of recording = number_of_records * time_to_record
int number_of_records = 1;
vector <vector<long long int>> HT1_tss_vec(number_of_records);
vector <vector<long long int>> HT2_tss_vec(number_of_records);
vector <vector<long long int>> color_depth_tss(number_of_records);
char **tc1_mfd_ptrs = (char **) new char *[number_of_records];
mapped_file * tc1_mapped_fds = (mapped_file * ) new mapped_file[number_of_records];
char **tc2_mfd_ptrs = (char **) new char *[number_of_records];
mapped_file * tc2_mapped_fds = (mapped_file * ) new mapped_file[number_of_records];
char **color_mfd_ptrs = (char **) new char *[number_of_records];
mapped_file * color_mapped_fds = (mapped_file * ) new mapped_file[number_of_records];
char **depth_mfd_ptrs = (char **) new char *[number_of_records];
mapped_file * depth_mapped_fds = (mapped_file * ) new mapped_file[number_of_records];
for (int l = 0; l < number_of_records; ++l) {
string tc1_file_path = save_dir + to_string(l) + "tc1.bin";
const char *tc1_FileName = tc1_file_path.c_str();
const size_t tc1_FileSize = total_tc_size;
mapped_file_params tc1_params(tc1_FileName);
tc1_params.new_file_size = tc1_FileSize;
tc1_params.flags = mapped_file_base::readwrite;
tc1_mapped_fds[l] = mapped_file(tc1_params);
tc1_mfd_ptrs[l] = tc1_mapped_fds[l].data();
string tc2_file_path = save_dir + to_string(l) + "tc2.bin";
const char *tc2_FileName = tc2_file_path.c_str();
const size_t tc2_FileSize = total_tc_size;
mapped_file_params tc2_params(tc2_FileName);
tc2_params.new_file_size = tc2_FileSize;
tc2_params.flags = mapped_file_base::readwrite;
tc2_mapped_fds[l] = mapped_file(tc2_params);
tc2_mfd_ptrs[l] = tc2_mapped_fds[l].data();
string c_file_path = save_dir + to_string(l) + "color.bin";
const char *c_FileName = c_file_path.c_str();
const std::size_t ColorFileSize = color_size;
mapped_file_params params_c(c_FileName);
params_c.new_file_size = ColorFileSize;
params_c.flags = mapped_file_base::readwrite;
color_mapped_fds[l] = mapped_file(params_c);
color_mfd_ptrs[l] = color_mapped_fds[l].data();
string d_file_path = save_dir + to_string(l) + "depth.bin";
const char *d_FileName = d_file_path.c_str();
const std::size_t FileSize = depth_size;
mapped_file_params params_d(d_FileName);
params_d.new_file_size = FileSize;
params_d.flags = mapped_file_base::readwrite;
depth_mapped_fds[l] = mapped_file(params_d);
depth_mfd_ptrs[l] = depth_mapped_fds[l].data();
}
boost::shared_mutex shared_mux;
std::mutex tc1_mutex;
int idx_tc1 = 0;
auto tc1_callback = TC1Callback(tc1_mfd_ptrs[0], HT1_tss_vec[0], idx_tc1,
tc_size, tc1_mutex, shared_mux);
std::mutex tc2_mutex;
int idx_tc2 = 0;
auto tc2_callback = TC2Callback(tc2_mfd_ptrs[0], HT2_tss_vec[0], idx_tc2,
tc_size, tc2_mutex, shared_mux);
grabber1->bindBufferHandler(tc1_callback);
grabber2->bindBufferHandler(tc2_callback);
std::mutex mux;
rs2::pipeline pipe;
rs2::config cfg;
std::atomic<volatile int> idx_color(0);
int idx_depth = 0;
auto rs_callback = RSCallback(color_mfd_ptrs[0], depth_mfd_ptrs[0],
color_depth_tss[0], idx_color, idx_depth, mux,
shared_mux);
boost::asio::thread_pool thread_pool(number_of_records);
cout << "Recording Started" << endl;
cfg.enable_stream(RS2_STREAM_COLOR, 1280, 720, RS2_FORMAT_RGB8);
cfg.enable_stream(RS2_STREAM_DEPTH, 1280, 720, RS2_FORMAT_Z16);
rs2::pipeline_profile profiles = pipe.start(cfg, rs_callback);
bool start_statusA = grabber1->start();
//cout << "CamA started succefully : " << start_statusA << endl;
bool start_statusB = grabber2->start();
//cout << "CamB started succefully : " << start_statusB << std::endl;
auto save_intrinsics_extrinsics = SaveIntrinsicsExtrinsics(profiles);
post(thread_pool, save_intrinsics_extrinsics);
for (int cur_idx = 0; cur_idx < number_of_records; ++cur_idx) {
while (idx_color.load() < time_to_record * (rs_fps-10)) { //TODO: the last subtracted value is a hyper-parameter (a function of time_to_record, the bigger the value
continue;
}
if(cur_idx == number_of_records - 1){
bool finish_statusB = grabber2->stop();
//cout << "CamB stoped succefully : " << finish_statusB << endl;
bool finish_statusA = grabber1->stop();
//cout << "CamA stoped succefully : " << finish_statusA << endl;
pipe.stop();
}
{
boost::unique_lock<boost::shared_mutex> lock(shared_mux);
auto start = high_resolution_clock::now();
auto save_callback = SaveCallback(color_mapped_fds[cur_idx],
depth_mapped_fds[cur_idx],
tc1_mapped_fds[cur_idx],
tc2_mapped_fds[cur_idx],
rs_callback.color_depth_ts,
tc1_callback.tc_ts,
tc2_callback.tc_ts,
cur_idx, save_dir);
post(thread_pool, save_callback);
if(cur_idx == number_of_records-1){
break;
}
*tc1_callback.tc_mfd_ptr = tc1_mfd_ptrs[cur_idx+1];
tc1_callback.tc_ts = HT1_tss_vec[cur_idx+1];
tc1_callback.idx_tc = 0;
*tc2_callback.tc_mfd_ptr = tc2_mfd_ptrs[cur_idx+1];
tc2_callback.tc_ts = HT2_tss_vec[cur_idx+1];
tc2_callback.idx_tc = 0;
*rs_callback.color_mfd_ptr = color_mfd_ptrs[cur_idx+1];
*rs_callback.depth_mfd_ptr = depth_mfd_ptrs[cur_idx+1];
rs_callback.color_depth_ts = color_depth_tss[cur_idx+1];
rs_callback.idx_color.store(0);
rs_callback.idx_depth = 0;
auto stop = high_resolution_clock::now();
auto duration = duration_cast<nanoseconds>(stop - start);
cout << duration.count() << endl;
}
}
thread_pool.join();
cout << "Finished";
return 0;
}
So the code does the following:
I open number_of_records mapped_files on the external HD in advance and store them in containers.
I busy-wait on the atomic idx_color value until it reaches some constant, and after that I post a SaveCallback to a thread in the thread_pool (it closes the open mapped_file and saves some additional data: the timestamps of the frames). I then swap to the next mapped_file and everything continues to run without losing frames (the swap takes less than 1 millisecond), assuming of course that number_of_records > 1.
There is a mutex for each of the thermal cameras, so the threads of each camera won't interfere with each other, and there is the shared_mutex over all of them, with a unique_lock taken on it in the main loop at the moment I need to swap pointers, because I don't want a memory violation from accessing memory that may already be exhausted by the current recording.
My problem only concerns the two TCs (short for thermal cameras), so you can ignore the RealSense callback if you want to.
The strange behavior I get is that sometimes, in the second camera (tc2), the frames from the end are written to the beginning, overwriting the first frames. It mostly happens when I use number_of_records > 1. But the code that resets the index used for writing to the correct address in the mapped_file is protected by the unique_lock, and I don't see how this is possible.
Anyway, my workaround is to use number_of_records = 1. I still get some side effects where the frame order is distorted, but mostly the frames are synchronized. If you are wondering why I use several fractions instead of one huge file: my hardware and software deal better with this logic, because one huge file with continuous writing is more exhausting for them.
I wonder if I missed something in my multi-threading logic, mutexes and synchronization; I would be glad for a review of this logic in the implemented code.
I see some glitches even when I record only one huge file; if it is smaller, the side effects (jumps in frames, etc.) are less noticeable, so I tend to think it is a side effect of the camera hardware.
Thank you in advance.
P.S. - feel free to ask any question about any part of the code. I wrote down all the generally important aspects that concern me while looking for a potential bug, but maybe you will notice other issues.
You can't run the code, but even just an overview can be helpful.
Once there was a question that I wrote a huge answer to, but the question was deleted and its author refused to undelete it.
So I'm posting a short summary of that question here, and immediately answering it myself, just to share my results.
The question was: given a std::bitset<65536> that is processed (by some formula) bit-by-bit inside an inner loop, how can we speed up this computation?
The outer loop just calls the inner loop many times (let's say 50,000 times), and the outer loop can't be parallelized, because each iteration depends on the results of the previous one.
Example code of this process:
std::bitset<65536> bits{};
uint64_t hash = 0;
for (size_t i = 0; i < 50000; ++i) {
// Process Bits
for (size_t j = 0; j < bits.size(); ++j)
bits[j] = ModifyBit(i, j, hash, bits[j]);
hash = Hash(bits, hash);
}
The code above is just one sample way of processing; it is not the real case. The real case is that we repeatedly process a std::bitset<65536> in such a way that all bits can be processed independently.
The question is how we can process the bits in parallel, as fast as possible, inside the inner loop.
One important note: the formula that modifies the bits is generic, meaning that we don't know it in advance and can't turn it into SIMD instructions.
What we do know is that all bits can be processed independently, and that we need to parallelize this processing. Also, we can't parallelize the outer loop, as each of its iterations depends on the results of the previous one.
Another note is that std::bitset<65536> is quite small, just 1K of 64-bit words. This means that directly using a pool of std::thread or std::async threads will not work, as each thread's work would be only around 50-200 nanoseconds, far too little time to start and stop threads and send work to them. Even a std::mutex takes 75 nanoseconds on my Windows machine (although 20 nanoseconds on Linux), so using std::mutex is also a big overhead.
One may assume that the ModifyBit() function above takes about the same time for each bit; otherwise there is no way to schedule a balanced parallelization of the loop, other than slicing it into very many tiny tasks and hoping that longer tasks are balanced out by several shorter ones.
I implemented a quite large and complex solution for your task, but one that works very fast. On my 4-core (8 hardware threads) laptop I get a 6x multi-core speedup compared to the single-threaded version (your version of the code).
The main idea of the solution below is to implement a very fast multi-core thread pool for running arbitrary tasks with small overhead. My implementation can handle up to 1-10 million tasks per second (depending on CPU speed and core count).
The regular way of asynchronously starting multiple tasks is std::async, or just creating a std::thread. Both of these are considerably slower than my own implementation; they can't give a throughput of 5 million tasks per second like my implementation does. And your code needs millions of tasks per second to be run for good speed. That's why I implemented everything from scratch.
After the fast thread pool is implemented, we can slice your 64K bitset into smaller sub-sets and process these sub-sets in parallel. I sliced the 64K bitset into 16 equal parts (see BitSize / 16 in the code); you can set another number of parts equal to a power of two, but not too many, otherwise the thread-pool overhead becomes too large. Usually it is good to slice into a number of parts equal to twice the number of hardware threads (or 4 times the number of cores).
I implemented several classes in the C++ code. The AtomicMutex class uses std::atomic_flag to implement a very fast replacement for a mutex, based on a spin-locking approach. This AtomicMutex is used to protect the queue of tasks submitted for running on the thread pool.
The RingBuffer class is based on std::vector and implements a simple and fast queue to store any objects. It is implemented using two pointers (head and tail) pointing into the vector. When a new element is added to the queue, the tail pointer is advanced to the right; if this pointer reaches the end of the vector, it wraps around to the 0-th position. In the same way, when an element is taken out of the queue, the head pointer also advances to the right, with wrap-around. The RingBuffer is used to store the thread-pool tasks.
The Queue class is a wrapper around RingBuffer, but with AtomicMutex protection. This spin-lock mutex is used to protect simultaneous adding/removing of elements to/from the queue from multiple worker threads.
Pool implements the multi-core pool of tasks itself. It creates as many worker threads as there are CPU hardware threads (double the number of cores) minus one. Each worker thread just polls new tasks from the queue and executes them immediately, while the main thread adds new tasks to the queue. Pool also has a Wait() capability to wait until all current tasks are finished; this waiting is used as a barrier to wait until the whole 64K bitset is processed (all sub-parts are processed). Pool accepts any lambdas (function closures) to run. You can see that the 64K bitset, sliced into smaller parts, is processed by doing pool.Emplace(lambda), and later pool.Wait() is used to wait until all sub-parts are finished. Exceptions from pool workers are collected and reported to the user if there is any error. While doing Wait(), the pool also runs tasks inside the main thread, so as not to waste one core just waiting for tasks to finish.
Timings reported in the console are measured with std::chrono.
There is the ability to run both versions - single-threaded (your original version) and multi-threaded using all cores. Switching between single and multi is done by passing the MultiThreaded = true template parameter to the function ProcessBitset().
#include <cstdint>
#include <atomic>
#include <vector>
#include <array>
#include <queue>
#include <functional>
#include <thread>
#include <future>
#include <exception>
#include <optional>
#include <memory>
#include <iostream>
#include <iomanip>
#include <bitset>
#include <string>
#include <chrono>
#include <algorithm>
#include <any>
#include <type_traits>
class AtomicMutex {
class LockerC;
public:
void lock() {
while (f_.test_and_set(std::memory_order_acquire))
//f_.wait(true, std::memory_order_acquire)
;
}
void unlock() {
f_.clear(std::memory_order_release);
//f_.notify_all();
}
LockerC Locker() { return LockerC(*this); }
private:
class LockerC {
public:
LockerC() = delete;
LockerC(AtomicMutex & mux) : pmux_(&mux) { mux.lock(); }
LockerC(LockerC const & other) = delete;
LockerC(LockerC && other) : pmux_(other.pmux_) { other.pmux_ = nullptr; }
~LockerC() { if (pmux_) pmux_->unlock(); }
LockerC & operator = (LockerC const & other) = delete;
LockerC & operator = (LockerC && other) = delete;
private:
AtomicMutex * pmux_ = nullptr;
};
std::atomic_flag f_ = ATOMIC_FLAG_INIT;
};
template <typename T>
class RingBuffer {
public:
RingBuffer() : buf_(1 << 8), last_(buf_.size() - 1) {}
T & front() { return buf_[first_]; }
T const & front() const { return buf_[first_]; }
T & back() { return buf_[last_]; }
T const & back() const { return buf_[last_]; }
size_t size() const { return size_; }
bool empty() const { return size_ == 0; }
template <typename ... Args>
void emplace(Args && ... args) {
while (size_ >= buf_.size()) {
std::rotate(&buf_[0], &buf_[first_], &buf_[buf_.size()]);
first_ = 0;
last_ = buf_.size() - 1;
buf_.resize(buf_.size() * 2);
}
++size_;
++last_;
if (last_ >= buf_.size())
last_ = 0;
buf_[last_] = T(std::forward<Args>(args)...);
}
void pop() {
if (size_ == 0)
return;
--size_;
++first_;
if (first_ >= buf_.size())
first_ = 0;
}
private:
std::vector<T> buf_;
size_t first_ = 0, last_ = 0, size_ = 0;
};
template <typename T>
class Queue {
public:
size_t Size() const { return q_.size(); }
bool Empty() const { return q_.size() == 0; }
template <typename ... Args>
void Emplace(Args && ... args) {
auto lock = m_.Locker();
q_.emplace(std::forward<Args>(args)...);
}
T Pop(std::function<void()> const & on_empty = []{},
std::function<void()> const & on_full = []{}) {
while (true) {
if (q_.empty()) {
on_empty();
continue;
}
auto lock = m_.Locker();
if (q_.empty()) {
on_empty();
continue;
}
on_full();
T val = std::move(q_.front());
q_.pop();
return std::move(val);
}
}
std::optional<T> TryPop() {
auto lock = m_.Locker();
if (q_.empty())
return std::nullopt;
T val = std::move(q_.front());
q_.pop();
return std::move(val);
}
private:
AtomicMutex m_;
RingBuffer<T> q_;
};
class RunInDestr {
public:
RunInDestr(std::function<void()> const & f) : f_(f) {}
~RunInDestr() { f_(); }
private:
std::function<void()> const & f_;
};
class Pool {
public:
struct FinishExc {};
struct Worker {
std::unique_ptr<std::atomic<bool>> pdone = std::make_unique<std::atomic<bool>>(true);
std::unique_ptr<std::exception_ptr> pexc = std::make_unique<std::exception_ptr>();
std::unique_ptr<std::thread> thr;
};
Pool(size_t nthreads = size_t(-1)) {
if (nthreads == size_t(-1))
nthreads = std::thread::hardware_concurrency() - 1;
std::cout << "Pool has " << nthreads << " worker threads." << std::endl;
for (size_t i = 0; i < nthreads; ++i) {
workers_.emplace_back(Worker{});
workers_.back().thr = std::make_unique<std::thread>(
[&, pdone = workers_.back().pdone.get(), pexc = workers_.back().pexc.get()]{
try {
std::function<void()> f_done = [pdone]{
pdone->store(true, std::memory_order_relaxed);
}, f_empty = [this]{
CheckFinish();
}, f_full = [pdone]{
pdone->store(false, std::memory_order_relaxed);
};
while (true) {
RunInDestr set_done(f_done);
tasks_.Pop(f_empty, f_full)();
}
} catch (...) {
exc_.store(true, std::memory_order_relaxed);
*pexc = std::current_exception();
}
});
}
}
~Pool() {
Wait();
Finish();
}
void CheckExc() {
if (!exc_.load(std::memory_order_relaxed))
return;
Finish();
throw std::runtime_error("Pool: Exception occured!");
}
void Finish() {
finish_ = true;
for (auto & w: workers_)
try {
w.thr->join();
if (*w.pexc)
std::rethrow_exception(*w.pexc);
} catch (FinishExc const &) {}
workers_.clear();
}
template <typename ... Args>
void Emplace(Args && ... args) {
CheckExc();
tasks_.Emplace(std::forward<Args>(args)...);
}
void Wait() {
while (true) {
auto task = tasks_.TryPop();
if (!task)
break;
(*task)();
}
while (true) {
bool done = true;
for (auto & w: workers_)
if (!w.pdone->load(std::memory_order_relaxed)) {
done = false;
break;
}
if (done)
break;
}
CheckExc();
}
private:
void CheckFinish() {
if (finish_)
throw FinishExc{};
}
Queue<std::function<void()>> tasks_;
std::vector<Worker> workers_;
bool finish_ = false;
std::atomic<bool> exc_ = false;
};
template <bool MultiThreaded = true, size_t BitSize>
void ProcessBitset(Pool & pool, std::bitset<BitSize> & bset,
std::string const & businessLogicCriteria) {
static size_t constexpr block = BitSize / 16;
for (int j = 0; j < BitSize; j += block) {
auto task = [&bset, j]{
int const hi = std::min(j + block, BitSize);
for (int i = j; i < hi; ++i) {
if (i % 2 == 0)
bset[i] = 0;
else
bset[i] = 1;
}
};
if constexpr(MultiThreaded)
pool.Emplace(std::move(task));
else
task();
}
if constexpr(MultiThreaded)
pool.Wait();
}
static auto const gtb = std::chrono::high_resolution_clock::now();
double Time() {
return std::chrono::duration_cast<std::chrono::duration<double>>(
std::chrono::high_resolution_clock::now() - gtb).count();
}
void Compute() {
Pool pool;
std::bitset<65536> bset;
std::string businessLogicCriteria;
int const hi = 50000;
for (int j = 0; j < hi; ++j) {
if ((j & 0x1FFF) == 0 || j + 1 >= hi)
std::cout << j / 1000 << "K (" << std::fixed << std::setprecision(3) << Time() << " sec), " << std::flush;
ProcessBitset(pool, bset, businessLogicCriteria);
businessLogicCriteria = "...";
}
}
void TimeMeasure() {
size_t constexpr A = 1 << 16, B = 1 << 5;
{
Pool pool;
auto const tb = Time();
int64_t volatile x = 0;
for (size_t i = 0; i < A; ++i) {
for (size_t j = 0; j < B; ++j)
pool.Emplace([&]{ x = x + 1; });
pool.Wait();
}
std::cout << "AtomicPool time " << std::fixed << std::setprecision(3) << (Time() - tb)
<< " sec, speed " << A * B / (Time() - tb) / 1000.0 << " empty K-tasks/sec, "
<< 1'000'000 / (A * B / (Time() - tb)) << " sec/M-task, no-collisions "
<< std::setprecision(7) << double(x) / (A * B) << std::endl;
}
{
auto const tb = Time();
//size_t const nthr = std::thread::hardware_concurrency();
size_t constexpr C = A / 8;
std::vector<std::future<void>> asyncs;
int64_t volatile x = 0;
for (size_t i = 0; i < C; ++i) {
asyncs.clear();
for (size_t j = 0; j < B; ++j)
asyncs.emplace_back(std::async(std::launch::async, [&]{ x = x + 1; }));
asyncs.clear();
}
std::cout << "AsyncPool time " << std::fixed << std::setprecision(3) << (Time() - tb)
<< " sec, speed " << C * B / (Time() - tb) / 1000.0 << " empty K-tasks/sec, "
<< 1'000'000 / (C * B / (Time() - tb)) << " sec/M-task, no-collisions "
<< std::setprecision(7) << double(x) / (C * B) << std::endl;
}
}
int main() {
try {
TimeMeasure();
Compute();
return 0;
} catch (std::exception const & ex) {
std::cout << "Exception: " << ex.what() << std::endl;
return -1;
} catch (...) {
std::cout << "Unknown Exception!" << std::endl;
return -1;
}
}
Output for 4 cores (8 hardware threads):
Pool has 7 worker threads.
AtomicPool time 0.903 sec, speed 2321.831 empty K-tasks/sec, 0.431 sec/M-task, no-collisions 0.9999967
AsyncPool time 0.982 sec, speed 266.789 empty K-tasks/sec, 3.750 sec/M-task, no-collisions 0.9999123
Pool has 7 worker threads.
0K (0.074 sec), 8K (0.670 sec), 16K (1.257 sec), 24K (1.852 sec), 32K (2.435 sec), 40K (2.984 sec), 49K (3.650 sec), 49K (3.711 sec),
For comparison below is single-threaded version timings, that is 6x times slower:
0K (0.125 sec), 8K (3.786 sec), 16K (7.754 sec), 24K (11.202 sec), 32K (14.662 sec), 40K (18.056 sec), 49K (21.470 sec), 49K (21.841 sec),
You have this inner loop you want to parallelize:
for (size_t j = 0; j < bits.size(); ++j)
bits[j] = ModifyBit(i, j, hash, bits[j]);
So a good idea is to split it into chunks, and have multiple threads do each chunk in parallel. You can submit chunks to workers easily with a std::atomic<int> counter that increments to identify which chunk to work on. You can also make sure the threads all stop working after one loop before starting the next with a std::barrier:
std::bitset<65536> bits{};
std::thread pool[8]; // Change size accordingly
std::atomic<int> task_number{0};
constexpr std::size_t tasks_per_loop = 32; // Arbitrarily chosen
constexpr std::size_t block_size = (bits.size()+tasks_per_loop-1) / tasks_per_loop;
// (only written to by one thread by the barrier, so not atomic)
uint64_t hash = 0;
int i = 0;
std::barrier barrier(std::size(pool), [&]() {
task_number = 0;
++i;
hash = Hash(bits, hash);
});
for (std::thread& t : pool) {
t = std::thread([&]{
while (i < 50000) {
for (int t; (t = task_number++) < tasks_per_loop;) {
int block_start = t * block_size;
int block_end = std::min(block_start + block_size, bits.size());
for (int j = block_start; j < block_end; ++j) {
bits[j] = ModifyBit(i, j, hash, bits[j]);
}
}
// Wait for other threads to finish and hash
// to be calculated before starting next loop
barrier.arrive_and_wait();
}
});
}
for (std::thread& t : pool) t.join();
(The seemingly easy way of parallelizing the for loop with OpenMP #pragma omp parallel for seemed slower with some testing, perhaps because the tasks were so small)
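For reference, a self-contained sketch of that OpenMP variant might look like the following; ModifyBit() and Hash() here are trivial stand-ins for the question's generic formula, and the chunked schedule is chosen so that no two threads ever write to the same 64-bit word of the bitset:
// Sketch of the OpenMP alternative mentioned above (placeholder formula/hash).
// Compile with e.g.: g++ -O3 -fopenmp openmp_bits.cpp
#include <bitset>
#include <cstddef>
#include <cstdint>

static bool ModifyBit(size_t i, size_t j, uint64_t hash, bool b) {
    return (((i ^ j ^ hash) & 1) != 0) != b;              // placeholder formula
}
static uint64_t Hash(const std::bitset<65536>& bits, uint64_t hash) {
    return hash * 0x9E3779B97F4A7C15ULL + bits.count();   // placeholder hash
}

int main() {
    std::bitset<65536> bits{};
    uint64_t hash = 0;
    for (size_t i = 0; i < 50000; ++i) {
        // schedule(static, 64) hands each thread whole 64-bit words of the
        // bitset, so concurrent writes never touch the same underlying word.
        #pragma omp parallel for schedule(static, 64)
        for (long long j = 0; j < (long long)bits.size(); ++j)
            bits[j] = ModifyBit(i, (size_t)j, hash, bits[j]);
        hash = Hash(bits, hash);   // the outer loop stays serial, as in the question
    }
    return (int)(hash & 1);
}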
Here it is against your implementation running similar code: https://godbolt.org/z/en76Kv4nn
And on my machine, running this a few times with 1 million iterations took 28 to 32 seconds with my approach and 44 to 50 seconds with your general thread pool approach (granted this is much less general because it can't execute arbitrary std::function<void()> tasks).
I'm implementing data processing with multiple threads.
I want to process data in the class DataProcess and merge the data in the class DataStorage.
My problem is that when the data is added to the vector, an exception error sometimes occurs.
In my opinion, each thread works with a different class instance (a different address).
Is it a problem to create a new data-handling class for each thread and process each piece of data that way?
Here is my code.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <thread>
#include <vector>
#include <mutex>
using namespace::std;
static std::mutex m;
class DataStorage
{
private :
std::vector<long long> vecData;
public:
DataStorage()
{
}
~DataStorage()
{
}
void SetDataVectorSize(int size)
{
vecData.clear();
vecData.resize(size);
}
void DataInsertLoop(void* Data, int start, int end)
{
m.lock();
std::vector<long long> const * _v1 = static_cast<std::vector<long long> const *>(Data);
long long num = 0;
for (int idx = start; idx < _v1->size(); ++idx)
{
vecData[idx] = _v1->at(idx);
}
m.unlock();
}
};
class DataProcess
{
private:
int m_index;
long long m_startIndex;
long long m_endIndex;
int m_coreNum;
long long num;
DataStorage* m_mainStorage;
std::vector<long long> m_vecData;
public :
DataProcess(int pindex, long long startindex, long long endindex)
: m_index(pindex), m_startIndex(startindex), m_endIndex(endindex),
m_coreNum(0),m_mainStorage(NULL), num(0)
{
m_vecData.clear();
}
~DataProcess()
{
}
void SetMainAdrr(DataStorage* const mainstorage)
{
m_mainStorage = mainstorage;
}
void SetCoreInCPU(int num)
{
m_coreNum = num;
}
void DataRun()
{
for (long long idx = m_startIndex; idx < m_endIndex; ++idx)
{
num += rand();
m_vecData.push_back(num); //<- exception error position
}
m_mainStorage->DataInsertLoop(&m_vecData, m_startIndex, m_endIndex);
}
};
int main()
{
//auto beginTime = std::chrono::high_resolution_clock::now();
clock_t beginTime, endTime;
DataStorage* main = new DataStorage();
beginTime = clock();
long long totalcount = 200000000;
long long halfdata = totalcount / 2;
std::thread t1,t2;
for (int t = 0; t < 2; ++t)
{
DataProcess* clsDP = new DataProcess(1, 0, halfdata);
clsDP->SetCoreInCPU(2);
clsDP->SetMainAdrr(main);
if (t == 0)
{
t1 = std::thread([&]() {clsDP->DataRun(); });
}
else
{
t2 = std::thread([&]() {clsDP->DataRun(); });
}
}
t1.join(); t2.join();
endTime = clock();
double resultTime = (double)(endTime - beginTime);
std::cout << "Multi Thread " << resultTime / 1000 << " sec" << std::endl;
printf("--------------------\n");
int value = getchar();
}
Interestingly, if none of your threads accesses portions of vecData accessed by another thread, DataStorage::DataInsertLoop should not need to be synchronized at all. That should make processing much faster. That is, after all bugs are fixed... This also means you should not need a mutex at all.
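For illustration, a minimal sketch of what the unsynchronized version could look like, assuming each worker is handed its own disjoint [start, end) slice (an assumption; the posted main() does not currently set the ranges up this way):
#include <vector>

// Illustration only: a DataStorage whose DataInsertLoop needs no mutex,
// provided every worker writes a disjoint [start, end) slice of vecData.
class DataStorage
{
    std::vector<long long> vecData;
public:
    void SetDataVectorSize(int size) { vecData.assign(size, 0); }

    // 'data' holds exactly (end - start) values produced by one worker.
    // Because no two workers share indices, concurrent calls do not race.
    void DataInsertLoop(const std::vector<long long>& data, int start, int end)
    {
        for (int idx = start; idx < end; ++idx)
            vecData[idx] = data[idx - start];
    }
};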
There are other issues with your code... The most easily spotted is a memory leak.
In main:
DataStorage* main = new DataStorage(); // you call new, but never call delete...
// that's a memory leak. Avoid caling
// new() directly.
//
// Also: 'main' is kind of a reserved
// name, don't use it except for the
// program entry point.
// How about this, instead ?
DataStorage dataSrc; // DataSrc has a very small footprint (a few pointers).
// ...
std::thread t1,t2; // why not use an array ?
// as in:
std::vector<std::thread> thrds;
// ...
// You forgot to set the size of your data set before starting, by calling:
dataSrc.SetDataVectorSize(200000000);
for (int t = 0; t < 2; ++t)
{
// ...
// Calling new again, and not delete... Use a smart pointer type
DataProcess* clsDP = new DataProcess(1, 0, halfdata);
// Also, fix the start and end indices (NOTE: code below works for t < 2, but
// probably not for t < 3)
auto clsDP = std::make_unique<DataProcess>(t, t * halfdata, (t + 1) * halfdata);
// You need to keep a reference to these pointers
// Either by storing them in an array, or by passing them to
// the threads. As in, for example:
thrds.emplace_back([dp = std::move(clsDP)]() { dp->DataRun(); });
}
//...
std::for_each(thrds.begin(), thrds.end(), [](auto& t) { t.join(); });
//...
More...
You create a mutex on your very first line of executable code. That's good... somewhat...
static std::mutex m; // a one letter name is a terrible choice for a variable with
// file scope.
Apart from the name, it's not in the right scope... If you want to use a mutex to protect DataStorage::vecData, this mutex should be declared in the same scope as DataStorage::vecData.
One last thing. Have you considered using iterators (aka pointers) as arguments to DataProcess::DataProcess()? This would simplify the code quite a bit, and it would very likely run faster.
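A rough, hypothetical sketch of what that could look like (this is my illustration, not the poster's code): each DataProcess receives iterators into the shared destination vector and writes its slice directly, so the merge step (and the mutex) disappears.
#include <cstdlib>
#include <vector>

// Hypothetical iterator-based DataProcess: it owns a destination slice
// [m_begin, m_end) of the shared vector and fills it in place.
class DataProcess
{
    std::vector<long long>::iterator m_begin, m_end;   // destination slice
    long long num = 0;
public:
    DataProcess(std::vector<long long>::iterator begin,
                std::vector<long long>::iterator end)
        : m_begin(begin), m_end(end) {}

    void DataRun()
    {
        for (auto it = m_begin; it != m_end; ++it) {
            num += std::rand();   // note: rand() itself may not be thread-safe
            *it = num;
        }
    }
};
Each worker thread would then be constructed with a disjoint sub-range of the destination vector (for example, begin() + t * halfdata to begin() + (t + 1) * halfdata), and DataStorage::DataInsertLoop would no longer be needed.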
I'm developing a bioinformatics tool, which requires reading in millions of matrix files (average dimension = (20k, 20k)). They are tab-delimited text files, and they look something like:
0.53 0.11
0.24 0.33
Because the software reads the matrix files one at a time, memory is not an issue, but it's very slow. The following is my current function for reading in a matrix file. I first make a matrix object using a double pointer, then fill in the matrix by looping through an input file.
float** make_matrix(int nrow, int ncol, float val){
float** M = new float *[nrow];
for(int i = 0; i < nrow; i++) {
M[i] = new float[ncol];
for(int j = 0; j < ncol; j++) {
M[i][j] = val;
}
}
return M;
}
float** read_matrix(string fname, int dim_1, int dim_2){
float** K = make_matrix(dim_1, dim_2, 0);
ifstream ifile(fname);
for (int i = 0; i < dim_1; ++i) {
for (int j = 0; j < dim_2; ++j) {
ifile >> K[i][j];
}
}
ifile.clear();
ifile.seekg(0, ios::beg);
return K;
}
Is there a much faster way to do this? From my experience with Python, reading in a matrix file using pandas is so much faster than using Python for-loops. Is there a trick like that in C++?
(added)
Thanks so much everyone for all your suggestions and comments!
The fastest way, by far, is to change the way you write those files: write in a binary format, two ints first (width, height), then just dump your values.
You will be able to load it in just three read calls.
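A minimal sketch of that scheme, assuming a plain int/float layout and names of my own choosing (not the poster's format), could look like this:
#include <cstdio>
#include <vector>

// Sketch: header of two ints (rows, cols) followed by raw float values,
// loaded back with three fread() calls in total.
std::vector<float> load_binary_matrix(const char* fname, int& nrow, int& ncol)
{
    std::vector<float> values;
    if (FILE* f = std::fopen(fname, "rb")) {
        if (std::fread(&nrow, sizeof(int), 1, f) == 1 &&
            std::fread(&ncol, sizeof(int), 1, f) == 1) {
            values.resize(static_cast<std::size_t>(nrow) * ncol);
            std::fread(values.data(), sizeof(float), values.size(), f);
        }
        std::fclose(f);
    }
    return values;
}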
Just for fun, I measured the program posted above (using a 20,000x20,000 ASCII input file, as described) on my Mac Mini (3.2GHz i7 with SSD drive) and found that it took about 102 seconds to parse in the file using the posted code.
Then I wrote a version of the same function that uses the C stdio API (fopen()/fread()/fclose()) and does character-by-character parsing into a 1D float array. This implementation takes about 13 seconds to parse in the file on the same hardware, so it's about 7 times faster.
Both programs were compiled with g++ -O3 test_read_matrix.cpp.
float* faster_read_matrix(string fname, int numRows, int numCols)
{
FILE * fpIn = fopen(fname.c_str(), "r");
if (fpIn == NULL)
{
printf("Couldn't open file [%s] for input!\n", fname.c_str());
return NULL;
}
float* K = new float[numRows*numCols];
// We'll hold the current number in (numberBuf) until we're ready to parse it
char numberBuf[128] = {'\0'};
int numCharsInBuffer = 0;
int curRow = 0, curCol = 0;
while(curRow < numRows)
{
char tempBuf[4*1024]; // an arbitrary size
const size_t bytesRead = fread(tempBuf, 1, sizeof(tempBuf), fpIn);
if (bytesRead <= 0)
{
if (bytesRead < 0) perror("fread");
break;
}
for (size_t i=0; i<bytesRead; i++)
{
const char c = tempBuf[i];
if ((c=='.')||(c=='+')||(c=='-')||(isdigit(c)))
{
if ((numCharsInBuffer+1) < sizeof(numberBuf)) numberBuf[numCharsInBuffer++] = c;
else
{
printf("Error, number string was too long for numberBuf!\n");
}
}
else
{
if (numCharsInBuffer > 0)
{
// Parse the current number-chars we have assembled into (numberBuf) and reset (numberBuf) to empty
numberBuf[numCharsInBuffer] = '\0';
if (curCol < numCols) K[curRow*numCols+curCol] = strtod(numberBuf, NULL);
else
{
printf("Error, too many values in row %i! (Expected %i, found at least %i)\n", curRow, numCols, curCol);
}
curCol++;
}
numCharsInBuffer = 0;
if (c == '\n')
{
curRow++;
curCol = 0;
if (curRow >= numRows) break;
}
}
}
}
fclose(fpIn);
if (curRow != numRows) printf("Warning: I read %i lines in the file, but I expected there would be %i!\n", curRow, numRows);
return K;
}
I am dissatisfied with Jeremy Friesner’s otherwise excellent answer because it:
blames the problem on C++'s I/O system (which it is not)
fixes the problem by circumventing the actual I/O problem, without being explicit about how that is a significant contributor to speed
modifies memory accesses, which may or may not contribute to speed, and does so in a way that may not support very large matrices
The reason his code runs so much faster is that he removes the single most important bottleneck: unoptimized disk access. JWO's original code can be brought up to par with three extra lines of code:
float** read_matrix(std::string fname, int dim_1, int dim_2){
float** K = make_matrix(dim_1, dim_2, 0);
constexpr std::size_t buffer_size = 4*1024; // 1
char buffer[buffer_size]; // 2
std::ifstream ifile(fname);
ifile.rdbuf()->pubsetbuf(buffer, buffer_size); // 3
for (int i = 0; i < dim_1; ++i) {
for (int j = 0; j < dim_2; ++j) {
ifile >> K[i][j];
}
}
// ifile.clear();
// ifile.seekg(0, std::ios::beg);
return K;
}
The addition exactly replicates Friesner’s design, but using the C++ library capabilities without all the extra programming grief on our end.
You’ll notice I also removed a couple lines at the bottom that should be inconsequential to program function and correctness, but which may cause a minor cumulative time issue as well. (If they are not inconsequential, that is a bug and should be fixed!)
How much difference this all makes depends entirely on the quality of the C++ Standard Library implementation. AFAIK the big three modern C++ compilers (MSVC, GCC, and Clang) all have sufficiently-optimized I/O handling to make the issue moot.
locale
One other thing that may also make a difference is to .imbue() the stream with the default "C" locale, which avoids a lot of special handling for numbers in locale-dependent formats other than what your files use. You only need to bother to do this if you have changed your global locale, though.
ifile.imbue(std::locale::classic());
redundant initialization
Another thing that is killing your time is the effort to zero-initialize the array when you create it. Don’t do that if you don’t need it! (You don’t need it here because you know the total extents and will fill them properly. C++17 and later is nice enough to give you a zero value if the input stream goes bad, too. So you get zeros for unread values either way.)
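As a sketch, the non-initializing variant of make_matrix() could be as simple as the following (the function name is mine):
// Sketch: allocate the matrix without the redundant zero-fill; every element
// is expected to be overwritten by the reader anyway.
float** make_matrix_uninitialized(int nrow, int ncol) {
    float** M = new float*[nrow];
    for (int i = 0; i < nrow; i++)
        M[i] = new float[ncol];   // no value-initialization here
    return M;
}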
dynamic memory block size
Finally, keeping memory accesses as an array of arrays should not significantly affect speed, but it still might be worth testing whether you can change it. This assumes that the resulting matrix will never be too large for the memory manager to return as a single block (which would otherwise crash your program).
A common design is to allocate the entire array as a single block, sized for the requested data plus the array of pointers into the rest of the block. This allows you to delete the array in a single delete[] statement. Again, I don't believe this should be an optimization issue you need to care about until your profiler says so.
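A minimal sketch of that single-block layout (illustrative names, not code from this answer): one allocation holds the row-pointer table followed by all the float values, so the whole matrix is released with a single delete[].
#include <cstddef>

// Sketch: row-pointer table and payload share one allocation.
float** make_matrix_single_block(int nrow, int ncol) {
    const std::size_t header  = nrow * sizeof(float*);
    const std::size_t payload = static_cast<std::size_t>(nrow) * ncol * sizeof(float);
    char* block = new char[header + payload];
    float** rows = reinterpret_cast<float**>(block);
    float*  data = reinterpret_cast<float*>(block + header);
    for (int i = 0; i < nrow; ++i)
        rows[i] = data + static_cast<std::size_t>(i) * ncol;
    return rows;
}
// Free it later with: delete[] reinterpret_cast<char*>(M);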
At the risk of the answer being considered incomplete (no code examples), I would like to add to the other answers some additional options for tackling the problem:
Use a binary format (width, height, values...) as the file format, and then use file mapping (MapViewOfFile() on Windows, mmap() on POSIX/Unix systems).
Then you can simply point your "matrix structure" pointer at the mapped address space and you are done. In case you do something like sparse access to the matrix, it can even save some real I/O. If you always access all elements of the matrix (no sparse matrices, etc.), it is still quite elegant and probably faster than malloc/read.
Use replacements for C++ iostream, which is known to be quite slow and should not be used for performance-critical stuff:
Have a look at the {fmt} library, which has become quite popular in recent years and claims to be quite fast.
Back in the day, when I did a lot of numerics on large data sets, I always opted for binary files for storage. (That was back when the fastest CPU you could get your hands on was the Pentium 1, with the floating-point bug :)). Everything was slower then, and memory was much more limited (we had MB, not GB, of RAM in our systems); all in all, nearly 20 years have passed since.
So, as a refresher, I wrote some code to show how much faster you can be than iostream with text files if you do not have extra constraints (such as the endianness of different CPUs, etc.).
So far, my little test only has an iostream version and a binary-file version, with a) stdio fread()-style loading and b) mmap(). Since I'm sitting in front of a Debian Bullseye computer, my code uses Linux-specific calls for the mmap() approach. To run it on Windows, you would have to change a few lines of code and some includes.
Edit: I added a save function using {fmt} now as well.
Edit: I added a load function with stdio now as well.
Edit: To reduce memory workload, I reordered the code somewhat and now only keep 2 matrix instances in memory at any given time.
The program does the following:
Create a 20k x 20k matrix in RAM (in a struct named Matrix_t), with random values slowly generated by std::random.
Write the matrix with iostream to a text file.
Write the matrix with stdio to a binary file.
Create a new matrix textMatrix by loading its data from the text file.
Create a new matrix inMemoryMatrix by loading its data from the binary file with a few fread() calls.
mmap() the binary file and use it under the name mappedMatrix.
Compare each of the loaded matrices to the original randomMatrix to see if the round-trip worked.
Here are the results I got on my machine after compiling this work of wonder with clang++ -O3 -o fmatio fast-matrix-io.cpp -lfmt:
./fmatio
creating random matrix (20k x 20k) (27.0775seconds)
the first 10 floating values in randomMatrix are:
57970.2 -365700 -986079 44657.8 826968 -506928 668277 398241 -828176 394645
saveMatrixAsText_IOSTREAM()
saving matrix with iostream. (192.749seconds)
saveMatrixAsText_FMT(mat0_fmt.txt)
saving matrix with {fmt}. (34.4932seconds)
saveMatrixAsBinary()
saving matrix into a binary file. (30.7591seconds)
loadMatrixFromText_IOSTREAM()
loading matrix from text file with iostream. (102.074seconds)
randomMatrix == textMatrix
comparing randomMatrix with textMatrix. (0.125328seconds)
loadMatrixFromText_STDIO(mat0_fmt.txt)
loading matrix from text file with stdio. (71.2746seconds)
randomMatrix == textMatrix
comparing randomMatrix with textMatrix (stdio). (0.124684seconds)
loadMatrixFromBinary(mat0.bin)
loading matrix from binary file into memory. (0.495685seconds)
randomMatrix == inMemoryMatrix
comparing randomMatrix with inMemoryMatrix. (0.124206seconds)
mapMatrixFromBinaryFile(mat0.bin)
mapping a view to a matrix in a binary file. (4.5883e-05seconds)
randomMatrix == mappedMatrix
comparing randomMatrix with mappedMatrix. (0.158459seconds)
And here is the code:
#include <cinttypes>
#include <memory>
#include <random>
#include <iostream>
#include <fstream>
#include <cstring>
#include <string>
#include <chrono>
#include <limits>
#include <iomanip>
// includes for mmap()...
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <cstdio>
#include <cstdlib>
#include <unistd.h>
// includes for {fmt}...
#include <fmt/core.h>
#include <fmt/os.h>
struct StopWatch {
using Clock = std::chrono::high_resolution_clock;
using TimePoint =
std::chrono::time_point<Clock>;
using Duration =
std::chrono::duration<double>;
void start(const char* description) {
this->description = std::string(description);
tstart = Clock::now();
}
void stop() {
TimePoint tend = Clock::now();
Duration elapsed = tend - tstart;
std::cout << description << " (" << elapsed.count()
<< "seconds)" << std::endl;
}
TimePoint tstart;
std::string description;
};
struct Matrix_t {
uint32_t ncol;
uint32_t nrow;
float values[];
inline uint32_t to_index(uint32_t col, uint32_t row) const {
return ncol * row + col;
}
};
template <class Initializer>
Matrix_t *createMatrix
( uint32_t ncol,
uint32_t nrow,
Initializer initFn
) {
size_t nfloats = ncol*nrow;
size_t nbytes = UINTMAX_C(8) + nfloats * sizeof(float);
Matrix_t * result =
reinterpret_cast<Matrix_t*>(operator new(nbytes));
if (nullptr != result) {
result->ncol = ncol;
result->nrow = nrow;
for (uint32_t row = 0; row < nrow; row++) {
for (uint32_t col = 0; col < ncol; col++) {
result->values[result->to_index(col,row)] =
initFn(ncol,nrow,col,row);
}
}
}
return result;
}
void saveMatrixAsText_IOSTREAM(const char* filePath,
const Matrix_t* matrix) {
std::cout << "saveMatrixAsText_IOSTREAM()" << std::endl;
if (nullptr == matrix) {
std::cout << "cannot save matrix - no matrix!" << std::endl;
}
std::ofstream outFile(filePath);
if (outFile) {
outFile << matrix->ncol << " " << matrix->nrow << std::endl;
const auto defaultPrecision = outFile.precision();
outFile.precision
(std::numeric_limits<float>::max_digits10);
for (uint32_t row = 0; row < matrix->nrow; row++) {
for (uint32_t col = 0; col < matrix->ncol; col++) {
outFile << matrix->values[matrix->to_index(col,row)]
<< " ";
}
outFile << std::endl;
}
} else {
std::cout << "could not open " << filePath << " for writing."
<< std::endl;
}
}
void saveMatrixAsText_FMT(const char* filePath,
const Matrix_t* matrix) {
std::cout << "saveMatrixAsText_FMT(" << filePath << ")"
<< std::endl;
if (nullptr == matrix) {
std::cout << "cannot save matrix - no matrix!" << std::endl;
}
auto outFile = fmt::output_file(filePath);
outFile.print("{} {}\n", matrix->ncol, matrix->nrow);
for (uint32_t row = 0; row < matrix->nrow; row++) {
outFile.print("{}", matrix->values[matrix->to_index(0,row)]);
for (uint32_t col = 1; col < matrix->ncol; col++) {
outFile.print(" {}",
matrix->values[matrix->to_index(col,row)]);
}
outFile.print("\n");
}
}
void saveMatrixAsBinary(const char* filePath,
const Matrix_t* matrix) {
std::cout << "saveMatrixAsBinary()" << std::endl;
FILE * outFile = fopen(filePath, "wb");
if (nullptr != outFile) {
fwrite( &matrix->ncol, 4, 1, outFile);
fwrite( &matrix->nrow, 4, 1, outFile);
size_t nfloats = matrix->ncol * matrix->nrow;
fwrite( &matrix->values, sizeof(float), nfloats, outFile);
fclose(outFile);
} else {
std::cout << "could not open " << filePath << " for writing."
<< std::endl;
}
}
Matrix_t* loadMatrixFromText_IOSTREAM(const char* filePath) {
std::cout << "loadMatrixFromText_IOSTREAM()" << std::endl;
std::ifstream inFile(filePath);
if (inFile) {
uint32_t ncol;
uint32_t nrow;
inFile >> ncol;
inFile >> nrow;
uint32_t nfloats = ncol * nrow;
auto loader =
[&inFile]
(uint32_t , uint32_t , uint32_t , uint32_t )
-> float
{
float value;
inFile >> value;
return value;
};
Matrix_t * matrix = createMatrix( ncol, nrow, loader);
return matrix;
} else {
std::cout << "could not open " << filePath << "for reading."
<< std::endl;
}
return nullptr;
}
Matrix_t* loadMatrixFromText_STDIO(const char* filePath) {
std::cout << "loadMatrixFromText_STDIO(" << filePath << ")"
<< std::endl;
Matrix_t* matrix = nullptr;
FILE * inFile = fopen(filePath, "rt");
if (nullptr != inFile) {
uint32_t ncol;
uint32_t nrow;
fscanf(inFile, "%d %d", &ncol, &nrow);
auto loader =
[&inFile]
(uint32_t , uint32_t , uint32_t , uint32_t )
-> float
{
float value;
fscanf(inFile, "%f", &value);
return value;
};
matrix = createMatrix( ncol, nrow, loader);
fclose(inFile);
} else {
std::cout << "could not open " << filePath << "for reading."
<< std::endl;
}
return matrix;
}
Matrix_t* loadMatrixFromBinary(const char* filePath) {
std::cout << "loadMatrixFromBinary(" << filePath << ")"
<< std::endl;
FILE * inFile = fopen(filePath, "rb");
if (nullptr != inFile) {
uint32_t ncol;
uint32_t nrow;
fread( &ncol, 4, 1, inFile);
fread( &nrow, 4, 1, inFile);
uint32_t nfloats = ncol * nrow;
uint32_t nbytes = nfloats * sizeof(float) + UINT32_C(8);
Matrix_t* matrix =
reinterpret_cast<Matrix_t*>
(operator new (nbytes));
if (nullptr != matrix) {
matrix->ncol = ncol;
matrix->nrow = nrow;
fread( &matrix->values[0], sizeof(float), nfloats, inFile);
fclose(inFile); // close before returning so the file handle does not leak
return matrix;
} else {
std::cout << "could not find memory for the matrix."
<< std::endl;
}
fclose(inFile);
} else {
std::cout << "could not open file "
<< filePath << " for reading." << std::endl;
}
return nullptr;
}
void freeMatrix(Matrix_t* matrix) {
operator delete(matrix);
}
Matrix_t* mapMatrixFromBinaryFile(const char* filePath) {
std::cout << "mapMatrixFromBinaryFile(" << filePath << ")"
<< std::endl;
Matrix_t * matrix = nullptr;
int fd = open( filePath, O_RDONLY);
if (-1 != fd) {
struct stat sb;
if (-1 != fstat(fd, &sb)) {
auto fileSize = sb.st_size;
matrix =
reinterpret_cast<Matrix_t*>
(mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, fd, 0));
if (reinterpret_cast<void*>(matrix) == MAP_FAILED) { // mmap() signals failure with MAP_FAILED, not nullptr
std::cout << "mmap() failed!" << std::endl;
matrix = nullptr;
}
} else {
std::cout << "fstat() failed!" << std::endl;
}
close(fd);
} else {
std::cout << "open() failed!" << std::endl;
}
return matrix;
}
void unmapMatrix(Matrix_t* matrix) {
if (nullptr == matrix)
return;
size_t nbytes =
UINTMAX_C(8) +
sizeof(float) * matrix->ncol * matrix->nrow;
munmap(matrix, nbytes);
}
bool areMatricesEqual( const Matrix_t* m1, const Matrix_t* m2) {
if (nullptr == m1) return false;
if (nullptr == m2) return false;
if (m1->ncol != m2->ncol) return false;
if (m1->nrow != m2->nrow) return false;
// both exist and have same size...
size_t nfloats = m1->ncol * m1->nrow;
size_t nbytes = nfloats * sizeof(float);
return 0 == memcmp( m1->values, m2->values, nbytes);
}
int main(int argc, const char* argv[]) {
std::random_device rdev;
std::default_random_engine reng(rdev());
std::uniform_real_distribution<> rdist(-1.0E6F, 1.0E6F);
StopWatch sw;
auto randomInitFunction =
[&reng,&rdist]
(uint32_t ncol, uint32_t nrow, uint32_t col, uint32_t row)
-> float
{
return rdist(reng);
};
sw.start("creating random matrix (20k x 20k)");
Matrix_t * randomMatrix =
createMatrix(UINT32_C(20000),
UINT32_C(20000),
randomInitFunction);
sw.stop();
if (nullptr != randomMatrix) {
std::cout
<< "the first 10 floating values in randomMatrix are: "
<< std::endl;
std::cout << randomMatrix->values[0];
for (size_t i = 1; i < 10; i++) {
std::cout << " " << randomMatrix->values[i];
}
std::cout << std::endl;
sw.start("saving matrix with iostream.");
saveMatrixAsText_IOSTREAM("mat0_iostream.txt", randomMatrix);
sw.stop();
sw.start("saving matrix with {fmt}.");
saveMatrixAsText_FMT("mat0_fmt.txt", randomMatrix);
sw.stop();
sw.start("saving matrix into a binary file.");
saveMatrixAsBinary("mat0.bin", randomMatrix);
sw.stop();
sw.start("loading matrix from text file with iostream.");
Matrix_t* textMatrix =
loadMatrixFromText_IOSTREAM("mat0_iostream.txt");
sw.stop();
sw.start("comparing randomMatrix with textMatrix.");
if (!areMatricesEqual(randomMatrix, textMatrix)) {
std::cout << "randomMatrix != textMatrix!" << std::endl;
} else {
std::cout << "randomMatrix == textMatrix" << std::endl;
}
sw.stop();
freeMatrix(textMatrix);
textMatrix = nullptr;
sw.start("loading matrix from text file with stdio.");
textMatrix =
loadMatrixFromText_STDIO("mat0_fmt.txt");
sw.stop();
sw.start("comparing randomMatrix with textMatrix (stdio).");
if (!areMatricesEqual(randomMatrix, textMatrix)) {
std::cout << "randomMatrix != textMatrix!" << std::endl;
} else {
std::cout << "randomMatrix == textMatrix" << std::endl;
}
sw.stop();
freeMatrix(textMatrix);
textMatrix = nullptr;
sw.start("loading matrix from binary file into memory.");
Matrix_t* inMemoryMatrix =
loadMatrixFromBinary("mat0.bin");
sw.stop();
sw.start("comparing randomMatrix with inMemoryMatrix.");
if (!areMatricesEqual(randomMatrix, inMemoryMatrix)) {
std::cout << "randomMatrix != inMemoryMatrix!"
<< std::endl;
} else {
std::cout << "randomMatrix == inMemoryMatrix" << std::endl;
}
sw.stop();
freeMatrix(inMemoryMatrix);
inMemoryMatrix = nullptr;
sw.start("mapping a view to a matrix in a binary file.");
Matrix_t* mappedMatrix =
mapMatrixFromBinaryFile("mat0.bin");
sw.stop();
sw.start("comparing randomMatrix with mappedMatrix.");
if (!areMatricesEqual(randomMatrix, mappedMatrix)) {
std::cout << "randomMatrix != mappedMatrix!"
<< std::endl;
} else {
std::cout << "randomMatrix == mappedMatrix" << std::endl;
}
sw.stop();
unmapMatrix(mappedMatrix);
mappedMatrix = nullptr;
freeMatrix(randomMatrix);
} else {
std::cout << "could not create random matrix!" << std::endl;
}
return 0;
}
Please note that binary formats where you simply cast to a struct pointer also depend on how the compiler handles alignment and padding within structures. In my case I was lucky and it worked; on other systems you might have to tweak a little (#pragma pack(4) or something along those lines) to make it work.
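To make that note concrete, here is a minimal sketch (not taken from the program above; MatrixHeader is a hypothetical name and the field order is illustrative) of how the 8-byte header, i.e. the two uint32_t dimension fields, could be pinned down so different compilers agree on its layout:
#pragma pack(push, 4)          // disallow compiler-specific padding in the header
struct MatrixHeader {
    uint32_t ncol;             // matrix dimensions: two uint32_t values, 8 bytes total,
    uint32_t nrow;             // matching the header offset used in unmapMatrix()
};
#pragma pack(pop)
static_assert(sizeof(MatrixHeader) == 8, "on-disk header must stay exactly 8 bytes");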
I have written my own code using TensorFlow's C API to do inference (i.e., using a trained artificial neural network) within a C++ fluid dynamics simulation program. However, at some point the computation stops and gives me this error:
mpirun noticed that process rank 10 with PID 0 on node node134 exited on signal 9 (Killed).
I have meanwhile noticed that this is probably happening because the machine runs out of memory: the moment the computation stops, both RAM and swap are fully occupied.
I do not understand why this is the case, but the only thing I have changed since the program last ran without errors is the code I added to it.
Within the fluid dynamics software I programmed this:
auto t_start_0 = std::chrono::high_resolution_clock::now();
const char* frozenGraphName = "/home/elias/Lr75-57_FPVANN_premix/data/FPV_ANN_tabulated_Standard_500.pb";
const char* inputOperationName = "input_1";
const char* outputOperationName = "dense_2/BiasAdd";
int no_of_inputs = in_mean.size();
int no_of_outputs = out_mean.size();
int cellsAndPatches = (input_f_zeta_PVNorm.size())/no_of_inputs;
std::vector<int64_t> input_dimensions = {cellsAndPatches,no_of_inputs};
std::vector<int64_t> output_dimensions = {cellsAndPatches,no_of_outputs};
Inference* inf = new Inference(frozenGraphName,inputOperationName,outputOperationName,no_of_inputs,no_of_outputs,input_dimensions,output_dimensions,cellsAndPatches);
output_real = inf->doInference(input_f_zeta_PVNorm);
delete inf;
auto t_end_0 = std::chrono::high_resolution_clock::now();
auto total_0 = std::chrono::duration<float, std::milli>(t_end_0 - t_start_0).count();
std::cout << "TOTAL INFERENCE TIME C API: " << total_0 << std::endl;
The constructor of my class Inference looks like this:
Inference::Inference(const char* fgn, const char* iname, const char* oname, int nIn, int nOut, std::vector<int64_t> dimIn,std::vector<int64_t> dimOut, int CP):no_input_sizes(nIn),no_output_sizes(nOut),noCellsPatches(CP)
{
TF_Buffer* graph_def = read_file(fgn);
graph = TF_NewGraph();
status = TF_NewStatus();
TF_ImportGraphDefOptions* graph_opts = TF_NewImportGraphDefOptions();
TF_GraphImportGraphDef(graph, graph_def, graph_opts, status);
if(TF_GetCode(status)!=TF_OK)
{
std::cout << "ERROR: Unable to import graph " << TF_Message(status) << std::endl;
}
num_bytes_in = noCellsPatches*no_input_sizes*sizeof(float);
num_bytes_out = noCellsPatches*no_output_sizes*sizeof(float);
in_dims = dimIn;
out_dims = dimOut;
in_name = strdup(iname);
out_name = strdup(oname);
TF_DeleteImportGraphDefOptions(graph_opts);
TF_DeleteBuffer(graph_def);
}
The doInference-method looks like this:
std::vector<float> Inference::doInference(std::vector<float> inVals)
{
assert((inVals.size()%no_input_sizes)==0);
std::cout << "EFFECTIVE BATCH SIZE: " << inVals.size() << std::endl;
float **normalizedInputs = new float* [noCellsPatches]; // allocate pointers
normalizedInputs[0] = new float [noCellsPatches*no_input_sizes]; // allocate data
// set pointers
for (int i = 1; i < noCellsPatches; ++i) {
normalizedInputs[i] = &normalizedInputs[i-1][no_input_sizes];
}
for(int i=0;i<noCellsPatches;i++)
{
for(int j=0;j<no_input_sizes;j++)
{
normalizedInputs[i][j]=inVals.at(no_input_sizes*i+j);
}
}
const char* iname = in_name;
TF_Operation* input_op = TF_GraphOperationByName(graph,iname); // assure string value is correct by viewing the frozen graph in Tensorboard
TF_Output input = {input_op,0};
inputs = &input;
assert(inputs!=0);
const char* oname = out_name;
TF_Operation* output_op = TF_GraphOperationByName(graph,oname); // assure string value is correct by viewing the frozen graph in Tensorboard
TF_Output output = {output_op,0};
outputs = &output;
assert(outputs!=0);
int64_t in_dims_arr[] = {noCellsPatches,no_input_sizes};
TF_Tensor* input_value = TF_NewTensor(TF_FLOAT,in_dims_arr,2,&normalizedInputs[0][0],num_bytes_in,&Deallocator, 0); // normalizedInputs at Arg 4 before
TF_Tensor* const input_value_const = input_value; // const pointer to TF_Tensor
TF_Tensor* const* input_values = &input_value_const; // pointer to const pointer to TF_Tensor
assert(input_values!=0);
int64_t out_dims_arr[] = {noCellsPatches,no_output_sizes};
TF_Tensor* output_value = TF_AllocateTensor(TF_FLOAT, out_dims_arr, 2, num_bytes_out); // pointer to TF_Tensor //Arg2!
TF_Tensor** output_values = &output_value; // pointer to pointer to TF_Tensor
assert(output_values!=0);
std::cout << "Running session..." << std::endl;
TF_SessionOptions* sess_opts = TF_NewSessionOptions();
int limitCPUThreads = 1; // if you want to limit the inference to a number of CPU Threads you can do that here
int limitNumberOfCPUs = 0;
if((limitCPUThreads!=0)&&(limitNumberOfCPUs!=0))
{
std::cout << "ERROR! You cannnot limit both number of CPUs and number of threads!" << std::endl;
}
if((limitCPUThreads!=0)&&(limitNumberOfCPUs==0))
{
std::cout << "WARNING! You are limiting CPU inference to " << limitCPUThreads << " CPU Thread(s) / Core(s)!" << std::endl;
uint8_t intra_op_parallelism_threads = limitCPUThreads; // for operations that can be parallelized internally, such as matrix multiplication
uint8_t inter_op_parallelism_threads = limitCPUThreads; // for operations that are independent in your TensorFlow graph because there is no directed path between them in the dataflow graph
uint8_t config[]={0x10,intra_op_parallelism_threads,0x28,inter_op_parallelism_threads};
TF_SetConfig(sess_opts,config,sizeof(config),status);
if (TF_GetCode(status) != TF_OK)
{
printf("ERROR: %s\n", TF_Message(status));
}
}
if((limitCPUThreads==0)&&(limitNumberOfCPUs!=0)) // SOMETHING HERE STILL SEEMS TO BE WRONG!
{
std::cout << "WARNING! You are limiting CPU inference to " << limitNumberOfCPUs << " CPU(s)!" << std::endl;
uint8_t numberOfCPUs = limitNumberOfCPUs;
uint8_t config[] = {0xa, 0x7, 0xa, 0x3, 0x43, 0x50, 0x55, 0x10, 0x01};
std::cout << config << std::endl;
TF_SetConfig(sess_opts,config,sizeof(config),status);
if (TF_GetCode(status) != TF_OK)
{
printf("ERROR: %s\n", TF_Message(status));
}
}
TF_Session* session = TF_NewSession(graph, sess_opts, status);
assert(TF_GetCode(status)==TF_OK);
auto t_start = std::chrono::high_resolution_clock::now();
TF_SessionRun(session,nullptr,inputs,input_values,1,outputs,output_values,1,nullptr,0,nullptr,status);
auto t_end = std::chrono::high_resolution_clock::now();
auto total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
std::cout << "time required for inference: " << total << std::endl;
float* out_vals = static_cast<float*>(TF_TensorData(*output_values));
std::vector<float> results(no_output_sizes*noCellsPatches,0);
for(int i=0;i<noCellsPatches;i++)
{
for(int j=0;j<no_output_sizes;j++)
{
results.at(i*no_output_sizes+j) = *out_vals;
out_vals++;
}
}
std::cout << "Successfully ran session!" << std::endl;
TF_CloseSession(session,status);
TF_DeleteSession(session,status);
TF_DeleteSessionOptions(sess_opts);
delete [] normalizedInputs[0];
delete [] normalizedInputs;
return results;
}
Is there some kind of memory leak that I did not recognize? Or what else could be the reason it works for a few hundred timesteps and then crashes?
Thanks in advance!
I'd like to store the structure below on disk and be able to read it back again (C++):
struct pixels {
std::vector<cv::Point> indexes;
cv::Mat values;
};
I've tried to use ofstream and ifstream, but they need the size of the variable, which I don't really know how to calculate in this situation. It's not a simple struct with a few ints and doubles. Is there any way to do this in C++, preferably without using any third-party libraries?
(I'm actually coming from the Matlab language. It was easy to do it in that language using save: save(filename, variables)).
Edit:
I've just tried Boost Serialization. Unfortunately it's very slow for my use case.
Several approaches come to mind, with various pros and cons.

Use OpenCV's XML/YAML persistence functionality:
  - XML format (portable)
  - YAML format (portable)
  - JSON format (portable)

Use Boost.Serialization:
  - plain text format (portable)
  - XML format (portable)
  - binary format (non-portable)

Raw data to std::fstream:
  - binary format (non-portable)
By "portable" I mean that the data files written on an arbitrary platform+compiler can be read on any other platform+compiler. By "non-portable", I mean that's not necessarily the case. Endiannes matters, and compilers could possibly make a difference too. You could add additional handling for such situations at the cost of performance. In this answer, I'll assume you're reading and writing on the same machine.
First here are includes, common data structures and utility functions we will use:
#include <opencv2/opencv.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/archive/xml_oarchive.hpp>
#include <boost/archive/xml_iarchive.hpp>
#include <boost/filesystem.hpp>
#include <boost/serialization/vector.hpp>
#include <chrono>
#include <fstream>
#include <vector>
// ============================================================================
using std::chrono::high_resolution_clock;
using std::chrono::duration_cast;
using std::chrono::microseconds;
namespace ba = boost::archive;
namespace bs = boost::serialization;
namespace fs = boost::filesystem;
// ============================================================================
struct pixels
{
std::vector<cv::Point> indexes;
cv::Mat values;
};
struct test_results
{
bool matches;
double write_time_ms;
double read_time_ms;
size_t file_size;
};
// ----------------------------------------------------------------------------
bool validate(pixels const& pix_out, pixels const& pix_in)
{
bool result(true);
result &= (pix_out.indexes == pix_in.indexes);
result &= (cv::countNonZero(pix_out.values != pix_in.values) == 0);
return result;
}
pixels generate_data()
{
pixels pix;
for (int i(0); i < 10000; ++i) {
pix.indexes.emplace_back(i, 2 * i);
}
pix.values = cv::Mat(1024, 1024, CV_8UC3);
cv::randu(pix.values, 0, 256);
return pix;
}
void dump_results(std::string const& label, test_results const& results)
{
std::cout << label << "\n";
std::cout << "Matched = " << (results.matches ? "true" : "false") << "\n";
std::cout << "Write time = " << results.write_time_ms << " ms\n";
std::cout << "Read time = " << results.read_time_ms << " ms\n";
std::cout << "File size = " << results.file_size << " bytes\n";
std::cout << "\n";
}
// ============================================================================
Using OpenCV FileStorage
The first obvious choice is to use the serialization functionality OpenCV provides -- cv::FileStorage, cv::FileNode and cv::FileNodeIterator. There's a nice tutorial in the 2.4.x documentation, which I can't seem to find right now in the new docs.
The advantage here is that we already have support for cv::Mat and cv::Point, so there's very little to implement.
However, all the formats provided are textual, so there will be a fairly large cost in reading and writing the values (especially for the cv::Mat). It may be advantageous to save/load the cv::Mat using cv::imread/cv::imwrite and serialize the filename. I'll leave this to the reader to implement and benchmark.
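As a rough, untested sketch of that variant (only the saving side is shown, and values.png is a hypothetical filename), reusing the pixels struct from above:
void save_pixels_imgref(pixels const& pix, cv::FileStorage& fs)
{
    std::string const values_file("values.png");
    cv::imwrite(values_file, pix.values);        // lossless for the 8UC3 test data
    fs << "indexes" << "[";
    for (auto const& index : pix.indexes) {
        fs << index;
    }
    fs << "]";
    fs << "values_file" << values_file;          // store only the path in the document
}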
// ============================================================================
void save_pixels(pixels const& pix, cv::FileStorage& fs)
{
fs << "indexes" << "[";
for (auto const& index : pix.indexes) {
fs << index;
}
fs << "]";
fs << "values" << pix.values;
}
void load_pixels(pixels& pix, cv::FileStorage& fs)
{
cv::FileNode n(fs["indexes"]);
if (n.type() != cv::FileNode::SEQ) {
throw std::runtime_error("Input format error: `indexes` is not a sequence.");;
}
pix.indexes.clear();
cv::FileNodeIterator it(n.begin()), it_end(n.end());
cv::Point pt;
for (; it != it_end; ++it) {
(*it) >> pt;
pix.indexes.push_back(pt);
}
fs["values"] >> pix.values;
}
// ----------------------------------------------------------------------------
test_results test_cv_filestorage(std::string const& file_name, pixels const& pix)
{
test_results results;
pixels pix_in;
high_resolution_clock::time_point t1 = high_resolution_clock::now();
{
cv::FileStorage fs(file_name, cv::FileStorage::WRITE);
save_pixels(pix, fs);
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
{
cv::FileStorage fs(file_name, cv::FileStorage::READ);
load_pixels(pix_in, fs);
}
high_resolution_clock::time_point t3 = high_resolution_clock::now();
results.matches = validate(pix, pix_in);
results.write_time_ms = static_cast<double>(duration_cast<microseconds>(t2 - t1).count()) / 1000;
results.read_time_ms = static_cast<double>(duration_cast<microseconds>(t3 - t2).count()) / 1000;
results.file_size = fs::file_size(file_name);
return results;
}
// ============================================================================
Using Boost Serialization
Another potential approach is to use the Boost.Serialization library, which you mention you have already tried. We have three options here for the archive format: two are textual (and portable), and one is binary (non-portable, but much more efficient).
There's more work to do here. We need to provide good serialization for cv::Mat, cv::Point and our pixels structure. Support for std::vector is provided, and to handle XML, we need to generate key-value pairs.
In the case of the two textual formats, it may again be advantageous to save the cv::Mat as an image and only serialize the path; the reader is free to try this approach. For the binary format it would most likely be a tradeoff between space and time. Again, feel free to test this (you could even use cv::imencode and cv::imdecode).
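As an untested sketch of the cv::imencode idea (the function name is illustrative; a real implementation would use Boost's split save/load mechanism, and only the saving half is shown here):
template<class Archive>
void save_mat_encoded(Archive& ar, cv::Mat const& mat)
{
    std::vector<uchar> buf;
    cv::imencode(".png", mat, buf);                    // trades CPU time for smaller archives
    ar & boost::serialization::make_nvp("png", buf);   // std::vector support comes from boost/serialization/vector.hpp
}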
// ============================================================================
namespace boost { namespace serialization {
template<class Archive>
void serialize(Archive &ar, cv::Mat& mat, const unsigned int)
{
int cols, rows, type;
bool continuous;
if (Archive::is_saving::value) {
cols = mat.cols; rows = mat.rows; type = mat.type();
continuous = mat.isContinuous();
}
ar & boost::serialization::make_nvp("cols", cols);
ar & boost::serialization::make_nvp("rows", rows);
ar & boost::serialization::make_nvp("type", type);
ar & boost::serialization::make_nvp("continuous", continuous);
if (Archive::is_loading::value)
mat.create(rows, cols, type);
if (continuous) {
size_t const data_size(rows * cols * mat.elemSize());
ar & boost::serialization::make_array(mat.ptr(), data_size);
} else {
size_t const row_size(cols * mat.elemSize());
for (int i = 0; i < rows; i++) {
ar & boost::serialization::make_array(mat.ptr(i), row_size);
}
}
}
template<class Archive>
void serialize(Archive &ar, cv::Point& pt, const unsigned int)
{
ar & boost::serialization::make_nvp("x", pt.x);
ar & boost::serialization::make_nvp("y", pt.y);
}
template<class Archive>
void serialize(Archive &ar, ::pixels& pix, const unsigned int)
{
ar & boost::serialization::make_nvp("indexes", pix.indexes);
ar & boost::serialization::make_nvp("values", pix.values);
}
}}
// ----------------------------------------------------------------------------
template <typename OArchive, typename IArchive>
test_results test_bs_filestorage(std::string const& file_name
, pixels const& pix
, bool binary = false)
{
test_results results;
pixels pix_in;
high_resolution_clock::time_point t1 = high_resolution_clock::now();
{
std::ios::openmode mode(std::ios::out);
if (binary) mode |= std::ios::binary;
std::ofstream ofs(file_name.c_str(), mode);
OArchive oa(ofs);
oa & boost::serialization::make_nvp("pixels", pix);
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
{
std::ios::openmode mode(std::ios::in);
if (binary) mode |= std::ios::binary;
std::ifstream ifs(file_name.c_str(), mode);
IArchive ia(ifs);
ia & boost::serialization::make_nvp("pixels", pix_in);
}
high_resolution_clock::time_point t3 = high_resolution_clock::now();
results.matches = validate(pix, pix_in);
results.write_time_ms = static_cast<double>(duration_cast<microseconds>(t2 - t1).count()) / 1000;
results.read_time_ms = static_cast<double>(duration_cast<microseconds>(t3 - t2).count()) / 1000;
results.file_size = fs::file_size(file_name);
return results;
}
// ============================================================================
Raw Data to std::fstream
If we don't care about portability of the data files, we can just do the minimal amount of work to dump and restore the memory. With some effort (at the cost of speed) you could make this more flexible.
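One hypothetical way to buy a little flexibility (not benchmarked here; names are illustrative) is to prefix the file with a small magic/version header so that layout changes can at least be detected when loading:
#include <cstdint>
#include <fstream>
struct raw_header
{
    char     magic[4];   // e.g. "PIX0", identifies the file format
    uint32_t version;    // bump whenever the on-disk layout changes
};
void write_raw_header(std::ofstream& ofs)
{
    raw_header hdr{{'P', 'I', 'X', '0'}, 1u};
    ofs.write(reinterpret_cast<char const*>(&hdr), sizeof(hdr));
}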
// ============================================================================
void save_pixels(pixels const& pix, std::ofstream& ofs)
{
size_t index_count(pix.indexes.size());
ofs.write(reinterpret_cast<char const*>(&index_count), sizeof(index_count));
ofs.write(reinterpret_cast<char const*>(&pix.indexes[0]), sizeof(cv::Point) * index_count);
int cols(pix.values.cols), rows(pix.values.rows), type(pix.values.type());
bool continuous(pix.values.isContinuous());
ofs.write(reinterpret_cast<char const*>(&cols), sizeof(cols));
ofs.write(reinterpret_cast<char const*>(&rows), sizeof(rows));
ofs.write(reinterpret_cast<char const*>(&type), sizeof(type));
ofs.write(reinterpret_cast<char const*>(&continuous), sizeof(continuous));
if (continuous) {
size_t const data_size(rows * cols * pix.values.elemSize());
ofs.write(reinterpret_cast<char const*>(pix.values.ptr()), data_size);
} else {
size_t const row_size(cols * pix.values.elemSize());
for (int i(0); i < rows; ++i) {
ofs.write(reinterpret_cast<char const*>(pix.values.ptr(i)), row_size);
}
}
}
void load_pixels(pixels& pix, std::ifstream& ifs)
{
size_t index_count(0);
ifs.read(reinterpret_cast<char*>(&index_count), sizeof(index_count));
pix.indexes.resize(index_count);
ifs.read(reinterpret_cast<char*>(&pix.indexes[0]), sizeof(cv::Point) * index_count);
int cols, rows, type;
bool continuous;
ifs.read(reinterpret_cast<char*>(&cols), sizeof(cols));
ifs.read(reinterpret_cast<char*>(&rows), sizeof(rows));
ifs.read(reinterpret_cast<char*>(&type), sizeof(type));
ifs.read(reinterpret_cast<char*>(&continuous), sizeof(continuous));
pix.values.create(rows, cols, type);
if (continuous) {
size_t const data_size(rows * cols * pix.values.elemSize());
ifs.read(reinterpret_cast<char*>(pix.values.ptr()), data_size);
} else {
size_t const row_size(cols * pix.values.elemSize());
for (int i(0); i < rows; ++i) {
ifs.read(reinterpret_cast<char*>(pix.values.ptr(i)), row_size);
}
}
}
// ----------------------------------------------------------------------------
test_results test_raw(std::string const& file_name, pixels const& pix)
{
test_results results;
pixels pix_in;
high_resolution_clock::time_point t1 = high_resolution_clock::now();
{
std::ofstream ofs(file_name.c_str(), std::ios::out | std::ios::binary);
save_pixels(pix, ofs);
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
{
std::ifstream ifs(file_name.c_str(), std::ios::in | std::ios::binary);
load_pixels(pix_in, ifs);
}
high_resolution_clock::time_point t3 = high_resolution_clock::now();
results.matches = validate(pix, pix_in);
results.write_time_ms = static_cast<double>(duration_cast<microseconds>(t2 - t1).count()) / 1000;
results.read_time_ms = static_cast<double>(duration_cast<microseconds>(t3 - t2).count()) / 1000;
results.file_size = fs::file_size(file_name);
return results;
}
// ============================================================================
Complete main()
Let's run all the tests for the various approaches and compare the results.
Code:
// ============================================================================
int main()
{
namespace ba = boost::archive;
pixels pix(generate_data());
auto r_c_xml = test_cv_filestorage("test.cv.xml", pix);
auto r_c_yaml = test_cv_filestorage("test.cv.yaml", pix);
auto r_c_json = test_cv_filestorage("test.cv.json", pix);
auto r_b_txt = test_bs_filestorage<ba::text_oarchive, ba::text_iarchive>("test.bs.txt", pix);
auto r_b_xml = test_bs_filestorage<ba::xml_oarchive, ba::xml_iarchive>("test.bs.xml", pix);
auto r_b_bin = test_bs_filestorage<ba::binary_oarchive, ba::binary_iarchive>("test.bs.bin", pix, true);
auto r_b_raw = test_raw("test.raw", pix);
// ----
dump_results("OpenCV - XML", r_c_xml);
dump_results("OpenCV - YAML", r_c_yaml);
dump_results("OpenCV - JSON", r_c_json);
dump_results("Boost - TXT", r_b_txt);
dump_results("Boost - XML", r_b_xml);
dump_results("Boost - Binary", r_b_bin);
dump_results("Raw", r_b_raw);
return 0;
}
// ============================================================================
Console output (i7-4930k, Win10, MSVC 2013)
NB: We're testing this with 10000 indexes and values being a 1024x1024 BGR image.
OpenCV - XML
Matched = true
Write time = 257.563 ms
Read time = 257.016 ms
File size = 12323677 bytes
OpenCV - YAML
Matched = true
Write time = 135.498 ms
Read time = 311.999 ms
File size = 16353873 bytes
OpenCV - JSON
Matched = true
Write time = 137.003 ms
Read time = 312.528 ms
File size = 16353873 bytes
Boost - TXT
Matched = true
Write time = 1293.84 ms
Read time = 1210.94 ms
File size = 11333696 bytes
Boost - XML
Matched = true
Write time = 4890.82 ms
Read time = 4042.75 ms
File size = 62095856 bytes
Boost - Binary
Matched = true
Write time = 12.498 ms
Read time = 4 ms
File size = 3225813 bytes
Raw
Matched = true
Write time = 8.503 ms
Read time = 2.999 ms
File size = 3225749 bytes
Conclusion
Looking at the results, the textual Boost.Serialization formats are abhorrently slow -- I see what you meant. Saving values separately would definitely bring a significant benefit here. The binary approach is quite good if portability is not an issue, and you could still fix that at a reasonable cost.
OpenCV performs much better, XML being balanced on reads and writes, YAML/JSON (apparently identical) being faster on writes, but slower on reads. Still rather sluggish, so writing values as an image and saving filename might still be of benefit.
The raw approach is the fastest (no surprise), but also inflexible. You could make some improvements, of course, but it seems to need a lot more code than using a binary Boost.Archive -- not really worth it here. Still, if you're doing everything on the same machine, this may do the job.
Personally I'd go for the binary Boost approach, and tweak it if you need cross-platform capability.