I have written my own code using Tensorflow's C API to do inference (= using a trained artificial neural network) within a C++ Fluid Dynamics simulation program. However at some point the computation stops and gives me this error:
mpirun noticed that process rank 10 with PID 0 on node node134 exited on signal 9 (Killed).
I have since noticed that this probably happens because the machine runs out of memory: at the moment the computation stops, both RAM and swap are fully occupied.
I do not understand why this is the case, but the only thing I have changed since the program last ran without errors is the code I added to it.
Within the fluid dynamics software I programmed this:
auto t_start_0 = std::chrono::high_resolution_clock::now();
const char* frozenGraphName = "/home/elias/Lr75-57_FPVANN_premix/data/FPV_ANN_tabulated_Standard_500.pb";
const char* inputOperationName = "input_1";
const char* outputOperationName = "dense_2/BiasAdd";
int no_of_inputs = in_mean.size();
int no_of_outputs = out_mean.size();
int cellsAndPatches = (input_f_zeta_PVNorm.size())/no_of_inputs;
std::vector<int64_t> input_dimensions = {cellsAndPatches,no_of_inputs};
std::vector<int64_t> output_dimensions = {cellsAndPatches,no_of_outputs};
Inference* inf = new Inference(frozenGraphName,inputOperationName,outputOperationName,no_of_inputs,no_of_outputs,input_dimensions,output_dimensions,cellsAndPatches);
output_real = inf->doInference(input_f_zeta_PVNorm);
delete inf;
auto t_end_0 = std::chrono::high_resolution_clock::now();
auto total_0 = std::chrono::duration<float, std::milli>(t_end_0 - t_start_0).count();
std::cout << "TOTAL INFERENCE TIME C API: " << total_0 << std::endl;
The constructor of my class Inference looks like this:
/**
 * Creates an Inference object around a frozen TensorFlow graph.
 *
 * @param fgn    path of the frozen graph file, handed to read_file()
 * @param iname  name of the graph's input operation (copied with strdup)
 * @param oname  name of the graph's output operation (copied with strdup)
 * @param nIn    number of input values per cell/patch
 * @param nOut   number of output values per cell/patch
 * @param dimIn  input tensor dimensions, stored in in_dims
 * @param dimOut output tensor dimensions, stored in out_dims
 * @param CP     number of cells/patches, i.e. rows in one batch
 *
 * An import failure is only reported on std::cout; construction continues.
 */
Inference::Inference(const char* fgn, const char* iname, const char* oname, int nIn, int nOut, std::vector<int64_t> dimIn,std::vector<int64_t> dimOut, int CP)
    : no_input_sizes(nIn), no_output_sizes(nOut), noCellsPatches(CP)
{
    // Load the serialized graph definition and import it into a new graph.
    TF_Buffer* serializedGraph = read_file(fgn);
    graph = TF_NewGraph();
    status = TF_NewStatus();
    TF_ImportGraphDefOptions* importOptions = TF_NewImportGraphDefOptions();
    TF_GraphImportGraphDef(graph, serializedGraph, importOptions, status);
    const bool importSucceeded = (TF_GetCode(status) == TF_OK);
    if (!importSucceeded)
    {
        std::cout << "ERROR: Unable to import graph " << TF_Message(status) << std::endl;
    }

    // Byte sizes of one complete input / output batch of floats.
    num_bytes_in = noCellsPatches*no_input_sizes*sizeof(float);
    num_bytes_out = noCellsPatches*no_output_sizes*sizeof(float);

    in_dims = dimIn;
    out_dims = dimOut;

    // Keep private copies of the operation names.
    in_name = strdup(iname);
    out_name = strdup(oname);

    // The import helpers are only needed during construction.
    TF_DeleteImportGraphDefOptions(importOptions);
    TF_DeleteBuffer(serializedGraph);
}
The doInference-method looks like this:
std::vector<float> Inference::doInference(std::vector<float> inVals)
{
assert((inVals.size()%no_input_sizes)==0);
std::cout << "EFFECTIVE BATCH SIZE: " << inVals.size() << std::endl;
float **normalizedInputs = new float* [noCellsPatches]; // allocate pointers
normalizedInputs[0] = new float [noCellsPatches*no_input_sizes]; // allocate data
// set pointers
for (int i = 1; i < noCellsPatches; ++i) {
normalizedInputs[i] = &normalizedInputs[i-1][no_input_sizes];
}
for(int i=0;i<noCellsPatches;i++)
{
for(int j=0;j<no_input_sizes;j++)
{
normalizedInputs[i][j]=inVals.at(no_input_sizes*i+j);
}
}
const char* iname = in_name;
TF_Operation* input_op = TF_GraphOperationByName(graph,iname); // assure string value is correct by viewing the frozen graph in Tensorboard
TF_Output input = {input_op,0};
inputs = &input;
assert(inputs!=0);
const char* oname = out_name;
TF_Operation* output_op = TF_GraphOperationByName(graph,oname); // assure string value is correct by viewing the frozen graph in Tensorboard
TF_Output output = {output_op,0};
outputs = &output;
assert(outputs!=0);
int64_t in_dims_arr[] = {noCellsPatches,no_input_sizes};
TF_Tensor* input_value = TF_NewTensor(TF_FLOAT,in_dims_arr,2,&normalizedInputs[0][0],num_bytes_in,&Deallocator, 0); // normalizedInputs at Arg 4 before
TF_Tensor* const input_value_const = input_value; // const pointer to TF_Tensor
TF_Tensor* const* input_values = &input_value_const; // pointer to const pointer to TF_Tensor
assert(input_values!=0);
int64_t out_dims_arr[] = {noCellsPatches,no_output_sizes};
TF_Tensor* output_value = TF_AllocateTensor(TF_FLOAT, out_dims_arr, 2, num_bytes_out); // pointer to TF_Tensor //Arg2!
TF_Tensor** output_values = &output_value; // pointer to pointer to TF_Tensor
assert(output_values!=0);
std::cout << "Running session..." << std::endl;
TF_SessionOptions* sess_opts = TF_NewSessionOptions();
int limitCPUThreads = 1; // if you want to limit the inference to a number of CPU Threads you can do that here
int limitNumberOfCPUs = 0;
if((limitCPUThreads!=0)&&(limitNumberOfCPUs!=0))
{
std::cout << "ERROR! You cannnot limit both number of CPUs and number of threads!" << std::endl;
}
if((limitCPUThreads!=0)&&(limitNumberOfCPUs==0))
{
std::cout << "WARNING! You are limiting CPU inference to " << limitCPUThreads << " CPU Thread(s) / Core(s)!" << std::endl;
uint8_t intra_op_parallelism_threads = limitCPUThreads; // for operations that can be parallelized internally, such as matrix multiplication
uint8_t inter_op_parallelism_threads = limitCPUThreads; // for operationss that are independent in your TensorFlow graph because there is no directed path between them in the dataflow graph
uint8_t config[]={0x10,intra_op_parallelism_threads,0x28,inter_op_parallelism_threads};
TF_SetConfig(sess_opts,config,sizeof(config),status);
if (TF_GetCode(status) != TF_OK)
{
printf("ERROR: %s\n", TF_Message(status));
}
}
if((limitCPUThreads==0)&&(limitNumberOfCPUs!=0)) // HIER SCHEINT NOCH ETWAS NICHT ZU STIMMEN!
{
std::cout << "WARNING! You are limiting CPU inference to " << limitNumberOfCPUs << " CPU(s)!" << std::endl;
uint8_t numberOfCPUs = limitNumberOfCPUs;
uint8_t config[] = {0xa, 0x7, 0xa, 0x3, 0x43, 0x50, 0x55, 0x10, 0x01};
std::cout << config << std::endl;
TF_SetConfig(sess_opts,config,sizeof(config),status);
if (TF_GetCode(status) != TF_OK)
{
printf("ERROR: %s\n", TF_Message(status));
}
}
TF_Session* session = TF_NewSession(graph, sess_opts, status);
assert(TF_GetCode(status)==TF_OK);
auto t_start = std::chrono::high_resolution_clock::now();
TF_SessionRun(session,nullptr,inputs,input_values,1,outputs,output_values,1,nullptr,0,nullptr,status);
auto t_end = std::chrono::high_resolution_clock::now();
auto total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
std::cout << "time required for inference: " << total << std::endl;
float* out_vals = static_cast<float*>(TF_TensorData(*output_values));
std::vector<float> results(no_output_sizes*noCellsPatches,0);
for(int i=0;i<noCellsPatches;i++)
{
for(int j=0;j<no_output_sizes;j++)
{
results.at(i*no_output_sizes+j) = *out_vals;
out_vals++;
}
}
std::cout << "Successfully ran session!" << std::endl;
TF_CloseSession(session,status);
TF_DeleteSession(session,status);
TF_DeleteSessionOptions(sess_opts);
delete [] normalizedInputs[0];
delete [] normalizedInputs;
return results;
}
Is there some kind of memory leak that I did not recognize? Or what could be the reason it works for some hundred timesteps and then crashes?
Thanks in advance!
Related
Suppose I analyze binary operations and use LLVM to obtain their operators and operands through static analysis. If I also want to obtain the concrete runtime value of each operand and print it, can I achieve that with the CallInst::Create method, or is there another way?
I tried function instrumentation: in my pass, I insert a function call in front of each binary operator, but I do not get any output. As a first step, suppose I only print the line and column numbers.
/// Instruments every instruction of \p F that carries a debug location with a
/// call to the coverage hook, passing the instruction's source line and column.
///
/// Declarations for the coverage hook and the binary-operand hook are added to
/// the module first. Always returns true, i.e. reports the function as
/// modified.
bool Instrument::runOnFunction(Function &F) {
  auto FunctionName = F.getName().str();
  outs() << "Running " << PASS_DESC << " on function " << FunctionName << "\n";
  outs() << "Instrument Instructions\n";

  LLVMContext &Context = F.getContext();
  Module *M = F.getParent();
  Type *VoidType = Type::getVoidTy(Context);
  Type *Int32Type = Type::getInt32Ty(Context);
  Type *Int8Type = Type::getInt8Ty(Context);

  // Make sure both runtime hooks are declared in the module.
  M->getOrInsertFunction(COVERAGE_FUNCTION_NAME, VoidType, Int32Type,
                         Int32Type);
  M->getOrInsertFunction(BINOP_OPERANDS_FUNCTION_NAME, VoidType, Int8Type,
                         Int32Type, Int32Type, Int32Type, Int32Type);

  for (inst_iterator Iter = inst_begin(F), E = inst_end(F); Iter != E; ++Iter) {
    Instruction &Inst = (*Iter);
    // PHI nodes and exception-handling pads must stay at the start of their
    // basic block; a call inserted in front of them yields invalid IR.
    if (isa<PHINode>(Inst) || Inst.isEHPad()) {
      continue;
    }
    llvm::DebugLoc DebugLoc = Inst.getDebugLoc();
    if (!DebugLoc) {
      continue; // no source position to report
    }
    int Line = DebugLoc.getLine();
    int Col = DebugLoc.getCol();
    // The call is inserted before Inst, so the iterator never revisits it.
    instrumentCoverage(M, Inst, Line, Col);
  }
  return true;
}
// Runtime hook for instrumented code: prints "<line>, <col>" followed by a
// newline to stdout.
// NOTE(review): the instrumentation pass looks this function up by the plain
// symbol name "__coverage__"; if this file is compiled as C++ the name is
// mangled unless it is declared extern "C" - confirm how it is built.
void __coverage__(int line, int col) {
printf("%d, %d\n",line,col);
}
/// Inserts a call `__coverage__(Line, Col)` immediately before \p I.
///
/// \param M    module that contains \p I; receives the hook declaration
/// \param I    instruction in front of which the call is placed
/// \param Line source line passed to the hook as an i32 constant
/// \param Col  source column passed to the hook as an i32 constant
void instrumentCoverage(Module *M, Instruction &I, int Line, int Col) {
  auto &Context = M->getContext();
  auto *Int32Type = Type::getInt32Ty(Context);
  auto *VoidType = Type::getVoidTy(Context);

  auto LineVal = ConstantInt::get(Int32Type, Line);
  auto ColVal = ConstantInt::get(Int32Type, Col);
  std::vector<Value *> Args = {LineVal, ColVal};

  // getOrInsertFunction declares the hook when it is missing. The former
  // getFunction lookup returned null in that case, and the null callee was
  // passed straight to CallInst::Create.
  auto CoverageFunction =
      M->getOrInsertFunction("__coverage__", VoidType, Int32Type, Int32Type);
  CallInst *Call = CallInst::Create(CoverageFunction, Args, "", &I);
  // Give the call the location of the instruction it reports on, so that
  // functions with debug info keep a location on every call.
  Call->setDebugLoc(I.getDebugLoc());
}
I see some strange behavior in my multi-threaded code, and I am not sure whether something is wrong with the code or whether it is just a physical side effect of the multi-sensor system or of the OS.
I'm working on windows 10, c++.
I have three cameras, one RealSense and two thermal cameras, which are synchronized through a physical connection in master-slave mode, and my goal is to record huge amounts of data (dozens of GB) to an external hard drive.
My idea is to use mapped_file from the Boost library to map many files (or one huge file) on the external HD, and to register a callback for each camera. Each callback runs on its own thread and memcpy's the data block (the frame) into the mapped_file on the external HD. In the main function, a polling loop watches the number of written frames (an atomic variable; I know this could probably be done more effectively with semaphores, but for now I think that part works more or less). Each time the count reaches some limit, the loop blocks the callback threads with a unique_lock and swaps the pointers from the current mapped_file to the next one (I define and open all the needed files on the external HD in advance).
Now I'll let show you how all of it look like:
The callbacks for the cameras, one for real sense which handles each time 2 frames, depth and rgb, and two for thermal cameras:
// Frame callback for the RealSense pipeline: copies every color and depth
// frame of an incoming frameset into the current memory-mapped output files
// and records one timestamp per frameset.
//
// Changes relative to the previous version: the stray "+" (a leftover diff
// marker) in front of the color memcpy is gone, the member-initializer list
// follows the declaration order, and frames are no longer copied per loop
// iteration.
class RSCallback {
public:
    // Heap-allocated slots holding the current write base of each file.
    // NOTE(review): presumably a heap slot is used so that copies of this
    // callback still observe pointer swaps made through the original object;
    // the slots are never freed - confirm that is acceptable.
    char** color_mfd_ptr = new char*;
    char** depth_mfd_ptr = new char*;
    vector<long long int>& color_depth_ts;   // one timestamp (ns) per frameset
    volatile int& idx_depth;                 // number of depth frames written
    std::atomic<volatile int>& idx_color;    // number of color frames written
    std::mutex& mux;                         // serializes calls of this callback
    boost::shared_mutex& shared_mux;         // held shared while writing
public:
    RSCallback(char* cmfd_ptr, char* dmfd_ptr, vector<long long int>& ccdts,
               std::atomic<volatile int>& idxc, int& idxd, std::mutex& mx,
               boost::shared_mutex& smx) :
            color_depth_ts(ccdts), idx_depth(idxd), idx_color(idxc),
            mux(mx), shared_mux(smx) {
        *color_mfd_ptr = cmfd_ptr;
        *depth_mfd_ptr = dmfd_ptr;
    }

    // Invoked for every frame delivered by the pipeline; frames that are not
    // framesets are ignored.
    void operator () (const rs2::frame &frame) {
        boost::shared_lock<boost::shared_mutex> shared_lock(this->shared_mux);
        std::lock_guard<std::mutex> lock(this->mux);
        if (rs2::frameset fs = frame.as<rs2::frameset>()) {
            const std::chrono::time_point<std::chrono::steady_clock> now =
                    high_resolution_clock::now();
            long long int loc_ts =
                    std::chrono::duration_cast<std::chrono::nanoseconds>(
                            now.time_since_epoch()).count();
            this->color_depth_ts.push_back(loc_ts);
            for (const rs2::frame& f : fs) {
                auto vf = f.as<rs2::video_frame>();
                const size_t sz = vf.get_data_size();
                if (vf.get_bytes_per_pixel() == 2) {
                    // Two bytes per pixel is treated as the depth stream.
                    uint8_t* const dst =
                            reinterpret_cast<uint8_t*>(*depth_mfd_ptr) + idx_depth * sz;
                    memcpy(dst, vf.get_data(), sz);
                    idx_depth++;
                } else {
                    // Everything else is treated as the color stream.
                    uint8_t* const dst =
                            reinterpret_cast<uint8_t*>(*color_mfd_ptr) + idx_color.load() * sz;
                    memcpy(dst, vf.get_data(), sz);
                    idx_color.fetch_add(1, std::memory_order_relaxed);
                }
            }
        }
    }
};
class TC1Callback {
public:
char** tc_mfd_ptr = new char*;
vector<long long int> &tc_ts;
int& idx_tc;
size_t sz;
std::mutex &mux;
boost::shared_mutex &shared_mux;
public:
TC1Callback(char *tcm_ptr, vector<long long int> &tcts, int& ixtc,
size_t tc_size, std::mutex &mx, boost::shared_mutex &smx) :
tc_ts(tcts), idx_tc(ixtc), sz(tc_size),
mux(mx), shared_mux(smx) {
*tc_mfd_ptr = tcm_ptr;
}
// This operator overloading enables calling
// operator function () on objects of increment
void operator()(const vector <uint8_t> &cur_frame) {
boost::shared_lock<boost::shared_mutex> shared_lock(shared_mux);
std::lock_guard <std::mutex> lock(mux);
memcpy((void *) ((uint8_t *) (*tc_mfd_ptr) + idx_tc * sz), cur_frame.data(), sz);
const std::chrono::time_point <std::chrono::steady_clock> now = high_resolution_clock::now();
long long int loc_ts = std::chrono::duration_cast<std::chrono::nanoseconds>(
now.time_since_epoch()).count();
tc_ts.push_back(loc_ts);
idx_tc++;
}
};
class TC2Callback {
public:
char** tc_mfd_ptr = new char*;
vector<long long int> &tc_ts;
int& idx_tc;
size_t sz;
std::mutex &mux;
boost::shared_mutex &shared_mux;
public:
TC2Callback(char *tcm_ptr, vector<long long int> &tcts, int& ixtc,
size_t tc_size, std::mutex &mx, boost::shared_mutex &smx) :
tc_ts(tcts), idx_tc(ixtc), sz(tc_size),
mux(mx), shared_mux(smx) {
*tc_mfd_ptr = tcm_ptr;
}
// This operator overloading enables calling
// operator function () on objects of increment
void operator()(const vector <uint8_t> &cur_frame) {
boost::shared_lock<boost::shared_mutex> shared_lock(shared_mux);
std::lock_guard <std::mutex> lock(mux);
memcpy((void *) ((uint8_t *) (*tc_mfd_ptr) + idx_tc * sz), cur_frame.data(), sz);
const std::chrono::time_point <std::chrono::steady_clock> now = high_resolution_clock::now();
long long int loc_ts = std::chrono::duration_cast<std::chrono::nanoseconds>(
now.time_since_epoch()).count();
tc_ts.push_back(loc_ts);
idx_tc++;
}
};
There is the save callback which just helps to parallelize the saving of all the data and close the opened mapped_file's
class SaveCallback {
public:
mapped_file& color_mapped_fd;
mapped_file& depth_mapped_fd;
mapped_file& tc1_mapped_fd;
mapped_file& tc2_mapped_fd;
vector<long long int>& color_depth_ts;
vector<long long int>& tc1_ts;
vector<long long int>& tc2_ts;
int idx;
string save_dir;
public:
SaveCallback(mapped_file& cmfd, mapped_file& dmfd, mapped_file& tc1fd,
mapped_file& tc2fd, vector<long long int>& cdts,
vector<long long int>& tc1ts, vector<long long int>& tc2ts,
int ix, string sdir) : color_mapped_fd(cmfd), depth_mapped_fd(dmfd),
tc1_mapped_fd(tc1fd), color_depth_ts(cdts),
tc1_ts(tc1ts), tc2_ts(tc2ts), tc2_mapped_fd(tc2fd),
idx(ix), save_dir(sdir) {}
// This operator overloading enables calling
// operator function () on objects of increment
void operator()() {
color_mapped_fd.close();
depth_mapped_fd.close();
tc1_mapped_fd.close();
tc2_mapped_fd.close();
ofstream cd_fout;
string color_depth_ts_name = save_dir + to_string(idx) + "color_depth_ts.bin";
cd_fout.open(color_depth_ts_name, ios::binary | ios::out);
cd_fout.write((char *) color_depth_ts.data(),
color_depth_ts.size() * sizeof(long long int));
cd_fout.close();
ofstream tc1_fout;
string tc1_ts_name = save_dir + to_string(idx) + "tc1_ts.bin";
tc1_fout.open(tc1_ts_name, ios::binary | ios::out);
tc1_fout.write((char *) tc1_ts.data(),
tc1_ts.size() * sizeof(long long int));
tc1_fout.close();
ofstream tc2_fout;
string tc2_ts_name = save_dir + to_string(idx) + "tc2_ts.bin";
tc2_fout.open(tc2_ts_name, ios::binary | ios::out);
tc2_fout.write((char *) tc2_ts.data(),
tc2_ts.size() * sizeof(long long int));
tc2_fout.close();
}
};
The main function is the following:
// Records frames from two thermal cameras and one RealSense camera into
// memory-mapped files under save_dir.
//
// Flow: connect and configure both thermal cameras, create number_of_records
// sets of mapped files, register the camera callbacks, start all streams, then
// for each record wait until enough color frames have arrived, hand the
// finished files to a SaveCallback on the thread pool and switch the callbacks
// to the next set of files.
//
// Returns 0 on success, 1 if a thermal camera cannot be connected, 2 if its
// default settings cannot be applied, 3 if it reports an invalid resolution.
int main() {
string save_dir = "G:/Vista_project/";
// Connect the first thermal camera with default settings
auto serialNumber = "serial1";
auto wic = wic::findAndConnect(serialNumber);
if (!wic) {
cerr << "Could not connect WIC: " << serialNumber << endl;
return 1;
}
auto defaultRes = wic->doDefaultWICSettings();
if (defaultRes.first != wic::ResponseStatus::Ok) {
cerr << "DoDefaultWICSettings: "
<< wic::responseStatusToStr(defaultRes.first) << endl;
return 2;
}
// Connect the second thermal camera with default settings
auto serialNumber2 = "serials2";
auto wic2 = wic::findAndConnect(serialNumber2);
if (!wic2) {
cerr << "Could not connect WIC: " << serialNumber2 << endl;
return 1;
}
auto defaultRes2 = wic2->doDefaultWICSettings();
if (defaultRes2.first != wic::ResponseStatus::Ok) {
cerr << "DoDefaultWICSettings: "
<< wic::responseStatusToStr(defaultRes2.first) << endl;
return 2;
}
// Additional settings taken from the wic example code
// enable advanced features
wic->iKnowWhatImDoing();
// enable advanced features
wic2->iKnowWhatImDoing();
// set advanced radiometry if core supports it
// set core gain
auto gain = wic->setGain(wic::GainMode::High);
// set core gain
auto gain2 = wic2->setGain(wic::GainMode::High);
auto grabber1 = wic->frameGrabber();
grabber1->setup();
auto grabber2 = wic2->frameGrabber();
grabber2->setup();
// Manual mode of camera adjustment
auto status1 = wic->setFFCMode(wic::FFCModes::Manual);
auto status2 = wic2->setFFCMode(wic::FFCModes::Manual);
// The two cameras get different external sync modes (0x0001 and 0x0002).
// NOTE(review): presumably master and slave - confirm against the wic API.
auto emode = wic::ExternalSyncMode(0x0001); //0x0001
auto resp1 = wic->setExternalSyncMode(emode);
auto emode2 = wic::ExternalSyncMode(0x0002); //0x0002
auto resp2 = wic2->setExternalSyncMode(emode2);
// NOTE(review): gain, status1/2 and resp1/2 are never checked.
// Sanity check of the camera resolutions
auto resolution = wic->getResolution();
if (resolution.first == 0 || resolution.second == 0) {
cerr << "Invalid resolution, core detection error." << endl;
return 3;
}
auto resolution2 = wic2->getResolution();
if (resolution2.first == 0 || resolution2.second == 0) {
cerr << "Invalid resolution, core detection error." << endl;
return 3;
}
// No zoom on the thermal cameras
auto zoom_video_mode_None = wic::VideoModeZoom(0);
wic->setVideoModeZoom(zoom_video_mode_None);
wic2->setVideoModeZoom(zoom_video_mode_None);
// Duration of one partition (one set of mapped files) before switching to
// the next set. NOTE(review): presumably seconds, given the fps factors below.
int time_to_record = 600;
// Camera frame rates used for sizing the files
int rs_fps = 30 ;
// NOTE(review): presumably a nominal 9 fps plus 3 fps of headroom - confirm.
int tc_fps = 9 + 3;
// Color channel count and bytes per pixel of the depth / thermal frames
int rgb_ch = 3;
int depth_px_sz = 2;
int tc_px_sz = 2;
// Sizes in bytes: tc_size is one thermal frame (640x512, 2 bytes per pixel);
// the other three are the totals for one partition, i.e. frame size times
// fps times time_to_record.
size_t total_tc_size = 640LL * 512 * tc_px_sz * tc_fps * time_to_record;
size_t tc_size = 640 * 512 * 2;
long long color_size = 720LL * 1280 * rgb_ch * rs_fps * time_to_record;
long long depth_size = 720LL * 1280 * depth_px_sz * rs_fps * time_to_record;
// Number of partitions:
// total time of recording = number_of_records * time_to_record
int number_of_records = 1;
// One timestamp vector per partition and stream
vector <vector<long long int>> HT1_tss_vec(number_of_records);
vector <vector<long long int>> HT2_tss_vec(number_of_records);
vector <vector<long long int>> color_depth_tss(number_of_records);
// Per-partition mapped files and their data pointers.
// NOTE(review): none of these new[] arrays is deleted in this function.
char **tc1_mfd_ptrs = (char **) new char *[number_of_records];
mapped_file * tc1_mapped_fds = (mapped_file * ) new mapped_file[number_of_records];
char **tc2_mfd_ptrs = (char **) new char *[number_of_records];
mapped_file * tc2_mapped_fds = (mapped_file * ) new mapped_file[number_of_records];
char **color_mfd_ptrs = (char **) new char *[number_of_records];
mapped_file * color_mapped_fds = (mapped_file * ) new mapped_file[number_of_records];
char **depth_mfd_ptrs = (char **) new char *[number_of_records];
mapped_file * depth_mapped_fds = (mapped_file * ) new mapped_file[number_of_records];
// Create and map the four output files of every partition up front.
for (int l = 0; l < number_of_records; ++l) {
// NOTE(review): the second '+' below is a unary plus on the string literal;
// it looks like a typo but does not change the resulting path.
string tc1_file_path = save_dir + to_string(l) + +"tc1.bin";
const char *tc1_FileName = tc1_file_path.c_str();
const size_t tc1_FileSize = total_tc_size;
mapped_file_params tc1_params(tc1_FileName);
tc1_params.new_file_size = tc1_FileSize;
tc1_params.flags = mapped_file_base::readwrite;
tc1_mapped_fds[l] = mapped_file(tc1_params);
tc1_mfd_ptrs[l] = tc1_mapped_fds[l].data();
string tc2_file_path = save_dir + to_string(l) + "tc2.bin";
const char *tc2_FileName = tc2_file_path.c_str();
const size_t tc2_FileSize = total_tc_size;
mapped_file_params tc2_params(tc2_FileName);
tc2_params.new_file_size = tc2_FileSize;
tc2_params.flags = mapped_file_base::readwrite;
tc2_mapped_fds[l] = mapped_file(tc2_params);
tc2_mfd_ptrs[l] = tc2_mapped_fds[l].data();
string c_file_path = save_dir + to_string(l) + "color.bin";
const char *c_FileName = c_file_path.c_str();
const std::size_t ColorFileSize = color_size;
mapped_file_params params_c(c_FileName);
params_c.new_file_size = ColorFileSize;
params_c.flags = mapped_file_base::readwrite;
color_mapped_fds[l] = mapped_file(params_c);
color_mfd_ptrs[l] = color_mapped_fds[l].data();
string d_file_path = save_dir + to_string(l) + "depth.bin";
const char *d_FileName = d_file_path.c_str();
const std::size_t FileSize = depth_size;
mapped_file_params params_d(d_FileName);
params_d.new_file_size = FileSize;
params_d.flags = mapped_file_base::readwrite;
depth_mapped_fds[l] = mapped_file(params_d);
depth_mfd_ptrs[l] = depth_mapped_fds[l].data();
}
// shared_mux is taken shared by every camera callback and exclusively by the
// loop below while it switches files; each camera also has its own mutex.
boost::shared_mutex shared_mux;
std::mutex tc1_mutex;
int idx_tc1 = 0;
auto tc1_callback = TC1Callback(tc1_mfd_ptrs[0], HT1_tss_vec[0], idx_tc1,
tc_size, tc1_mutex, shared_mux);
std::mutex tc2_mutex;
int idx_tc2 = 0;
auto tc2_callback = TC2Callback(tc2_mfd_ptrs[0], HT2_tss_vec[0], idx_tc2,
tc_size, tc2_mutex, shared_mux);
// NOTE(review): if bindBufferHandler stores a copy of the callback, the copy
// shares the heap pointer slot and all reference members with the object
// kept here - confirm the parameter type in the wic API.
grabber1->bindBufferHandler(tc1_callback);
grabber2->bindBufferHandler(tc2_callback);
std::mutex mux;
rs2::pipeline pipe;
rs2::config cfg;
std::atomic<volatile int> idx_color(0);
int idx_depth = 0;
auto rs_callback = RSCallback(color_mfd_ptrs[0], depth_mfd_ptrs[0],
color_depth_tss[0], idx_color, idx_depth, mux,
shared_mux);
// One pool thread per partition for the save tasks
boost::asio::thread_pool thread_pool(number_of_records);
cout << "Recording Started" << endl;
cfg.enable_stream(RS2_STREAM_COLOR, 1280, 720, RS2_FORMAT_RGB8);
cfg.enable_stream(RS2_STREAM_DEPTH, 1280, 720, RS2_FORMAT_Z16);
rs2::pipeline_profile profiles = pipe.start(cfg, rs_callback);
bool start_statusA = grabber1->start();
//cout << "CamA started succefully : " << start_statusA << endl;
bool start_statusB = grabber2->start();
//cout << "CamB started succefully : " << start_statusB << std::endl;
auto save_intrinsics_extrinsics = SaveIntrinsicsExtrinsics(profiles);
post(thread_pool, save_intrinsics_extrinsics);
for (int cur_idx = 0; cur_idx < number_of_records; ++cur_idx) {
// Busy-wait until enough color frames have been written for this partition.
// NOTE(review): this spins a full core; the partition length is driven by the
// color stream only, so the thermal indices are whatever they happen to be
// when the threshold is reached.
while (idx_color.load() < time_to_record * (rs_fps-10)) { //TODO: the subtracted value (10) is a hyper-parameter that depends on time_to_record
continue;
}
// After the last partition, stop all streams before saving.
if(cur_idx == number_of_records - 1){
bool finish_statusB = grabber2->stop();
//cout << "CamB stoped succefully : " << finish_statusB << endl;
bool finish_statusA = grabber1->stop();
//cout << "CamA stoped succefully : " << finish_statusA << endl;
pipe.stop();
}
{
// Exclusive lock: no camera callback can write while the files are switched.
boost::unique_lock<boost::shared_mutex> lock(shared_mux);
auto start = high_resolution_clock::now();
auto save_callback = SaveCallback(color_mapped_fds[cur_idx],
depth_mapped_fds[cur_idx],
tc1_mapped_fds[cur_idx],
tc2_mapped_fds[cur_idx],
rs_callback.color_depth_ts,
tc1_callback.tc_ts,
tc2_callback.tc_ts,
cur_idx, save_dir);
// The save task runs on a pool thread and only holds references to the
// timestamp vectors passed above.
post(thread_pool, save_callback);
if(cur_idx == number_of_records-1){
break;
}
// Switch every callback to the next partition's files and reset its index.
// NOTE(review): tc_ts and color_depth_ts are reference members, so the three
// vector assignments below do not rebind them; they copy the next (empty)
// vector into the vector the callback was constructed with. That vector is
// also the one the save task just posted is about to write, on another
// thread. This looks like a data race and a loss of timestamps - confirm.
*tc1_callback.tc_mfd_ptr = tc1_mfd_ptrs[cur_idx+1];
tc1_callback.tc_ts = HT1_tss_vec[cur_idx+1];
tc1_callback.idx_tc = 0;
*tc2_callback.tc_mfd_ptr = tc2_mfd_ptrs[cur_idx+1];
tc2_callback.tc_ts = HT2_tss_vec[cur_idx+1];
tc2_callback.idx_tc = 0;
*rs_callback.color_mfd_ptr = color_mfd_ptrs[cur_idx+1];
*rs_callback.depth_mfd_ptr = depth_mfd_ptrs[cur_idx+1];
rs_callback.color_depth_ts = color_depth_tss[cur_idx+1];
rs_callback.idx_color.store(0);
rs_callback.idx_depth = 0;
// Print how long the switch took, in nanoseconds.
auto stop = high_resolution_clock::now();
auto duration = duration_cast<nanoseconds>(stop - start);
cout << duration.count() << endl;
}
}
// Wait for all posted save tasks to finish.
thread_pool.join();
cout << "Finished";
return 0;
}
So the code does the following:
I open in advance number_of_records mapped_file's on the external HD and save them into a containers.
I busy-wait on the atomic idx_color value until it reaches some constant. After that I send a SaveCallback to a thread in the thread_pool (it closes the open mapped_file's and saves some additional data: the timestamps of the frames), swap to the next mapped_file, and everything continues to run without losing frames, since the swap takes less than 1 millisecond (this of course only applies if number_of_records > 1).
There is a mutex for each thermal camera, so that the threads of the different cameras do not block each other, and there is a shared_mutex shared by all of them. The main loop takes a unique_lock on it at the moment it needs to swap pointers, because I don't want a memory access violation from writing to a mapped file that the current recording has already filled.
My problem is only concerning the two tc (a shortcut for thermal cameras), so you can ignore the real sense callback if you want to.
The strange behavior I get is that sometimes, for the second camera (tc2), the frames from the end of the recording are written to the beginning of the file, overwriting the first frames. It mostly happens when I use number_of_records > 1. But the code that resets the write index to the correct address in the mapped_file is protected by the unique_lock, so I don't see how this is possible.
Anyway, my workaround is to use number_of_records=1. I still get some side effects where the order of some frames is distorted, but mostly the frames are synchronized. If you ask yourself why I use several partitions instead of one huge file: my hardware and software cope better with this approach, because continuous writing to one huge file is more demanding.
I wonder whether I missed something in my multi-threading logic, mutexes and synchronization, and I would be glad for a review of this logic and of the code that implements it.
I see some glitches even when I record only one huge file; if the file is smaller, the side effects (jumps between frames, etc.) are less noticeable, so I tend to think it is a side effect of the camera hardware.
Thank you in advance.
P.S - feel free to ask any question about any part of the code, I wrote all the generally important aspects which concern me looking for a potential bug, but maybe you will notice another issues.
You can't run the code, but even just an overview would be helpful.
I'm developing a bioinformatic tool, which requires reading in millions of matrix files (average dimension = (20k, 20k)). They are tab-delimited text files, and they look something like:
0.53 0.11
0.24 0.33
Because the software reads the matrix files one at a time, memory is not an issue, but it's very slow. The following is my current function for reading in a matrix file. I first make a matrix object using a double pointer, then fill in the matrix by looping through an input file .
// Allocates an nrow x ncol matrix as an array of separately allocated rows
// and sets every element to val. The caller owns the result and must
// delete[] each row and then the row array itself.
float** make_matrix(int nrow, int ncol, float val){
    float** rows = new float *[nrow];
    int r = 0;
    while (r < nrow) {
        float* row = new float[ncol];
        // Fill the freshly allocated row from first to last element.
        for (float* cell = row; cell != row + ncol; ++cell) {
            *cell = val;
        }
        rows[r] = row;
        ++r;
    }
    return rows;
}
// Reads a dim_1 x dim_2 matrix of whitespace-separated numbers from the text
// file fname, in row-major order.
//
// The matrix comes from make_matrix(dim_1, dim_2, 0); the caller owns it.
// Reading stops at the first value that cannot be extracted (missing file,
// premature end of file, malformed token), and every element not read stays
// 0. The former version kept attempting an extraction for every remaining
// element after the stream had failed, and rewound a stream that was
// destroyed immediately afterwards; both were pure overhead.
float** read_matrix(string fname, int dim_1, int dim_2){
    float** K = make_matrix(dim_1, dim_2, 0);
    ifstream ifile(fname);
    for (int i = 0; i < dim_1 && ifile; ++i) {
        float* row = K[i];
        for (int j = 0; j < dim_2; ++j) {
            if (!(ifile >> row[j])) {
                break;   // stream failed: nothing further can be read
            }
        }
    }
    return K;
}
Is there a much faster way to do this? From my experience with python, reading in a matrix file using pandas is so much faster than using python for-loops. Is there a trick like that in c++?
(added)
Thanks so much everyone for all your suggestions and comments!
The fastest way, by far, is to change the way you write those files: write in binary format, two int first (width, height) then just dump your values.
You will be able to load it in just three read calls.
Just for fun, I measured the program posted above (using a 20,000x20,000 ASCII input file, as described) on my Mac Mini (3.2GHz i7 with SSD drive) and found that it took about 102 seconds to parse in the file using the posted code.
Then I wrote a version of the same function that uses the C stdio API (fopen()/fread()/fclose()) and does character-by-character parsing into a 1D float array. This implementation takes about 13 seconds to parse in the file on the same hardware, so it's about 7 times faster.
Both programs were compiled with g++ -O3 test_read_matrix.cpp.
// Reads a numRows x numCols matrix of numbers from the text file fname into a
// single row-major float array, using block-wise fread and manual tokenizing.
//
// Returns NULL if the file cannot be opened; otherwise a new[]-allocated array
// of numRows*numCols floats that the caller must delete[]. Elements for which
// the file holds no value are 0. Problems (over-long number, too many values
// in a row, too few lines) are reported on stdout but do not abort the read.
//
// Fixes relative to the previous version:
//  - a last line without trailing newline is no longer dropped;
//  - 'e'/'E' belong to a number, so scientific notation (1e-2) is one value;
//  - read errors are detected with ferror (fread returns an unsigned count,
//    so the old "< 0" test could never fire);
//  - sizes and indices are computed in size_t instead of int;
//  - the array is zero-initialized, so missing values are not indeterminate;
//  - isdigit receives an unsigned char value, as the C library requires.
float* faster_read_matrix(std::string fname, int numRows, int numCols)
{
   FILE * fpIn = fopen(fname.c_str(), "r");
   if (fpIn == NULL)
   {
      printf("Couldn't open file [%s] for input!\n", fname.c_str());
      return NULL;
   }

   const size_t totalCells = ((numRows > 0) && (numCols > 0))
                           ? static_cast<size_t>(numRows) * static_cast<size_t>(numCols)
                           : 0;
   float* K = new float[totalCells]();   // () value-initializes to 0

   // We'll hold the current number in (numberBuf) until we're ready to parse it
   char numberBuf[128] = {'\0'};
   size_t numCharsInBuffer = 0;
   int curRow = 0, curCol = 0;

   // Parses the characters collected in (numberBuf), stores the value at
   // (curRow, curCol) and resets (numberBuf) to empty. No-op when it is empty.
   auto storePendingNumber = [&]()
   {
      if (numCharsInBuffer == 0) return;
      numberBuf[numCharsInBuffer] = '\0';
      if (curCol < numCols) K[static_cast<size_t>(curRow)*numCols+curCol] = strtof(numberBuf, NULL);
      else
      {
         printf("Error, too many values in row %i! (Expected %i, found at least %i)\n", curRow, numCols, curCol);
      }
      curCol++;
      numCharsInBuffer = 0;
   };

   bool done = (curRow >= numRows);
   while(!done)
   {
      char tempBuf[4*1024];  // an arbitrary size
      const size_t bytesRead = fread(tempBuf, 1, sizeof(tempBuf), fpIn);
      if (bytesRead == 0)
      {
         if (ferror(fpIn)) perror("fread");
         break;  // end of file or read error
      }

      for (size_t i=0; (i<bytesRead)&&(!done); i++)
      {
         const char c = tempBuf[i];
         const bool isNumberChar = (c=='.')||(c=='+')||(c=='-')||(c=='e')||(c=='E')
                                 ||(isdigit(static_cast<unsigned char>(c)) != 0);
         if (isNumberChar)
         {
            if ((numCharsInBuffer+1) < sizeof(numberBuf)) numberBuf[numCharsInBuffer++] = c;
            else
            {
               printf("Error, number string was too long for numberBuf!\n");
            }
         }
         else
         {
            // Any other character separates numbers; a newline also ends the row.
            storePendingNumber();
            if (c == '\n')
            {
               curRow++;
               curCol = 0;
               done = (curRow >= numRows);
            }
         }
      }
   }

   // The file may end without a final newline: flush the number still being
   // assembled and count the unterminated last row.
   if (curRow < numRows)
   {
      storePendingNumber();
      if (curCol > 0)
      {
         curRow++;
         curCol = 0;
      }
   }

   fclose(fpIn);

   if (curRow != numRows) printf("Warning:  I read %i lines in the file, but I expected there would be %i!\n", curRow, numRows);

   return K;
}
I am dissatisfied with Jeremy Friesner’s otherwise excellent answer because it:
blames the problem to be with C++'s I/O system (which it is not)
fixes the problem by circumventing the actual I/O problem without being explicit about how it is a significant contributor to speed
modifies memory accesses which (may or may not) contribute to speed, and does so in a way that very large matrices may not be supported
The reason his code runs so much faster is because he removes the single most important bottleneck: unoptimized disk access. JWO’s original code can be brought to match with three extra lines of code:
// Reads a dim_1 x dim_2 matrix of whitespace-separated numbers from fname,
// using a caller-provided 4 KiB stream buffer for the file.
//
// Fixes relative to the previous version:
//  - values are extracted from ifile (the old code read from an undeclared
//    stream named ss and did not compile);
//  - the buffer size is a compile-time constant, so buffer is a regular array
//    and not a variable-length array, which standard C++ does not have;
//  - pubsetbuf is called before the file is opened; the effect of calling it
//    on an already opened file is implementation-defined and may be nothing.
float** read_matrix(std::string fname, int dim_1, int dim_2){
  float** K = make_matrix(dim_1, dim_2, 0);

  const std::size_t buffer_size = 4*1024;     // 1
  char buffer[buffer_size];                   // 2  (declared before ifile, so it outlives the stream)

  std::ifstream ifile;
  ifile.rdbuf()->pubsetbuf(buffer, static_cast<std::streamsize>(buffer_size));  // 3
  ifile.open(fname);

  for (int i = 0; i < dim_1; ++i) {
    for (int j = 0; j < dim_2; ++j) {
      ifile >> K[i][j];
    }
  }

  return K;
}
The addition exactly replicates Friesner’s design, but using the C++ library capabilities without all the extra programming grief on our end.
You’ll notice I also removed a couple lines at the bottom that should be inconsequential to program function and correctness, but which may cause a minor cumulative time issue as well. (If they are not inconsequential, that is a bug and should be fixed!)
How much difference this all makes depends entirely on the quality of the C++ Standard Library implementation. AFAIK the big three modern C++ compilers (MSVC, GCC, and Clang) all have sufficiently-optimized I/O handling to make the issue moot.
locale
One other thing that may also make a difference is to .imbue() the stream with the default "C" locale, which avoids a lot of special handling for numbers in locale-dependent formats other than what your files use. You only need to bother to do this if you have changed your global locale, though.
// std::locale::classic() is the "C" locale; std::locale("") would instead
// select the user's environment locale, which is the opposite of the intent.
ifile.imbue(std::locale::classic());
redundant initialization
Another thing that is killing your time is the effort to zero-initialize the array when you create it. Don’t do that if you don’t need it! (You don’t need it here because you know the total extents and will fill them properly. C++11 and later is nice enough to give you a zero value if a numeric extraction fails, too. So you get zeros for unread values either way.)
dynamic memory block size
Finally, keeping memory accesses to an array of array should not significantly affect speed, but it still might be worth testing if you can change it. This is assuming that the resulting matrix will never be too large for the memory manager to return as a single block (and consequently crash your program).
A common design is to allocate the entire array as a single block, with the requested size plus size for the array of pointers to the rest of the block. This allows you to delete the array in a single delete[] statement. Again, I don’t believe this should be an optimization issue you need to care about until your profiler says so.
At the risk of the answer being considered incomplete (no code examples), I would like to add to the other answers additional options how to tackle the problem:
Use a binary format (width,height, values...) as file format and then use file mapping (MapViewOfFile() on Windows, mmap() or so on posix/unix systems).
Then, you can simply point your "matrix structure" pointer to the mapped address space and you are done. And in case, you do something like sparse access to the matrix, it can even save some real IO. If you always do full access to all elements of the matrix (no sparse matrices etc.), it is still quite elegant and probably faster than malloc/read.
Replacements for c++ iostream, which is known to be quite slow and should not be used for performance critical stuff:
Have a look at the {fmt} library, which has become quite popular in recent years and claims to be quite fast.
Back in the days when I did a lot of numerics on large data sets, I always opted for binary files for storage. (That was when the fastest CPU you could get your hands on was the Pentium 1, the one with the floating point bug :).) Back then everything was slower and memory was much more limited (we had MB, not GB, as units for RAM in our systems); all in all, nearly 20 years have passed since.
So, as a refresher, I did write some code to show, how much faster than iostream and text files you can do if you do not have extra constraints (such as endianess of different cpus etc.).
So far, my little test only has an iostream and a binary file version with a) stdio fread() kind of loading and b) mmap(). Since I sit in front of a debian bullseye computer, my code uses linux specific stuff for the mmap() approach. To run it on Windows, you have to change a few lines of code and some includes.
Edit: I added a save function using {fmt} now as well.
Edit: I added a load function with stdio now as well.
Edit: To reduce memory workload, I reordered the code somewhat
and now only keep 2 matrix instances in memory at any given time.
The program does the following:
create a 20k x 20k matrix in ram (in a struct named Matrix_t). With random values, slowly generated by std::random.
Write the matrix with iostream to a text file.
Write the matrix with stdio to a binary file.
Create a new matrix textMatrix by loading its data from the text file.
Create a new matrix inMemoryMatrix by loading its data from the binary file with a few fread() calls.
mmap() the binary file and use it under the name mappedMatrix.
Compare each of the loaded matrices to the original randomMatrix to see if the round-trip worked.
Here the results I got on my machine after compiling this work of wonder with clang++ -O3 -o fmatio fast-matrix-io.cpp -lfmt:
./fmatio
creating random matrix (20k x 20k) (27.0775seconds)
the first 10 floating values in randomMatrix are:
57970.2 -365700 -986079 44657.8 826968 -506928 668277 398241 -828176 394645
saveMatrixAsText_IOSTREAM()
saving matrix with iostream. (192.749seconds)
saveMatrixAsText_FMT(mat0_fmt.txt)
saving matrix with {fmt}. (34.4932seconds)
saveMatrixAsBinary()
saving matrix into a binary file. (30.7591seconds)
loadMatrixFromText_IOSTREAM()
loading matrix from text file with iostream. (102.074seconds)
randomMatrix == textMatrix
comparing randomMatrix with textMatrix. (0.125328seconds)
loadMatrixFromText_STDIO(mat0_fmt.txt)
loading matrix from text file with stdio. (71.2746seconds)
randomMatrix == textMatrix
comparing randomMatrix with textMatrix (stdio). (0.124684seconds)
loadMatrixFromBinary(mat0.bin)
loading matrix from binary file into memory. (0.495685seconds)
randomMatrix == inMemoryMatrix
comparing randomMatrix with inMemoryMatrix. (0.124206seconds)
mapMatrixFromBinaryFile(mat0.bin)
mapping a view to a matrix in a binary file. (4.5883e-05seconds)
randomMatrix == mappedMatrix
comparing randomMatrix with mappedMatrix. (0.158459seconds)
And here is the code:
#include <cinttypes>
#include <memory>
#include <new>
#include <random>
#include <iostream>
#include <fstream>
#include <cstring>
#include <string>
#include <chrono>
#include <limits>
#include <iomanip>
// includes for mmap()...
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <cstdio>
#include <cstdlib>
#include <unistd.h>
// includes for {fmt}...
#include <fmt/core.h>
#include <fmt/os.h>
/// Small timing helper: start() stores a label and the current time,
/// stop() prints the label and the elapsed seconds to std::cout.
struct StopWatch {
    using Clock = std::chrono::high_resolution_clock;
    using TimePoint = std::chrono::time_point<Clock>;
    using Duration = std::chrono::duration<double>;

    /// Remembers the label first, then takes the start timestamp.
    void start(const char* description) {
        this->description.assign(description);
        tstart = Clock::now();
    }

    /// Prints "<label> (<elapsed>seconds)" followed by a flush.
    void stop() {
        const TimePoint finished = Clock::now();
        const double seconds = Duration(finished - tstart).count();
        std::cout << description << " (" << seconds << "seconds)" << std::endl;
    }

    TimePoint tstart;
    std::string description;
};
/// Row-major float matrix stored as one contiguous block: two 32-bit
/// dimensions followed directly by the values.
struct Matrix_t {
    uint32_t ncol;   // number of columns
    uint32_t nrow;   // number of rows
    float values[];  // flexible array member (compiler extension in C++)

    /// Flat index of element (col, row) in row-major order.
    /// Computed in size_t so the product cannot wrap at 2^32 elements.
    inline size_t to_index(uint32_t col, uint32_t row) const {
        return static_cast<size_t>(ncol) * row + col;
    }
};
/// Allocates an ncol x nrow matrix and fills every element from initFn.
///
/// @param ncol    number of columns
/// @param nrow    number of rows
/// @param initFn  callable invoked as initFn(ncol, nrow, col, row) for every
///                element, rows outermost, returning the value to store
/// @return the new matrix, or nullptr when the allocation fails
template <class Initializer>
Matrix_t *createMatrix
( uint32_t ncol,
  uint32_t nrow,
  Initializer initFn
) {
    // Widen before multiplying: a 32-bit product wraps for large matrices.
    const size_t nfloats = static_cast<size_t>(ncol) * nrow;
    // 8 bytes of header (ncol + nrow) precede the values.
    const size_t nbytes = UINTMAX_C(8) + nfloats * sizeof(float);
    // The nothrow form reports failure as nullptr, which is what the check
    // below (and the callers) expect; plain operator new would throw instead.
    Matrix_t * result =
        static_cast<Matrix_t*>(operator new(nbytes, std::nothrow));
    if (nullptr != result) {
        result->ncol = ncol;
        result->nrow = nrow;
        // Row-major traversal visits the values in storage order, so a running
        // index is equivalent to ncol * row + col.
        size_t next = 0;
        for (uint32_t row = 0; row < nrow; row++) {
            for (uint32_t col = 0; col < ncol; col++) {
                result->values[next++] = initFn(ncol,nrow,col,row);
            }
        }
    }
    return result;
}
/// Writes the matrix as text with iostreams: a "ncol nrow" header line, then
/// one line per row with every value followed by a single space.
///
/// @param filePath  file to create or overwrite
/// @param matrix    matrix to save; nullptr is reported and nothing is written
void saveMatrixAsText_IOSTREAM(const char* filePath,
                               const Matrix_t* matrix) {
    std::cout << "saveMatrixAsText_IOSTREAM()" << std::endl;
    if (nullptr == matrix) {
        std::cout << "cannot save matrix - no matrix!" << std::endl;
        return;  // previously fell through and dereferenced the null pointer
    }
    std::ofstream outFile(filePath);
    if (outFile) {
        outFile << matrix->ncol << " " << matrix->nrow << '\n';
        // max_digits10 guarantees the text round-trips to the same float.
        outFile.precision
            (std::numeric_limits<float>::max_digits10);
        for (uint32_t row = 0; row < matrix->nrow; row++) {
            for (uint32_t col = 0; col < matrix->ncol; col++) {
                outFile << matrix->values[matrix->to_index(col,row)]
                        << " ";
            }
            // '\n' rather than std::endl: no need to flush after every row.
            outFile << '\n';
        }
    } else {
        std::cout << "could not open " << filePath << " for writing."
                  << std::endl;
    }
}
/// Writes the matrix as text with {fmt}: a "ncol nrow" header line, then one
/// line per row with the values separated by single spaces.
///
/// @param filePath  file to create or overwrite
/// @param matrix    matrix to save; nullptr is reported and nothing is written
void saveMatrixAsText_FMT(const char* filePath,
                          const Matrix_t* matrix) {
    std::cout << "saveMatrixAsText_FMT(" << filePath << ")"
              << std::endl;
    if (nullptr == matrix) {
        std::cout << "cannot save matrix - no matrix!" << std::endl;
        return;  // previously fell through and dereferenced the null pointer
    }
    auto outFile = fmt::output_file(filePath);
    outFile.print("{} {}\n", matrix->ncol, matrix->nrow);
    for (uint32_t row = 0; row < matrix->nrow; row++) {
        // Guard the first column so a matrix with zero columns reads nothing.
        if (matrix->ncol > 0) {
            outFile.print("{}", matrix->values[matrix->to_index(0,row)]);
        }
        for (uint32_t col = 1; col < matrix->ncol; col++) {
            outFile.print(" {}",
                          matrix->values[matrix->to_index(col,row)]);
        }
        outFile.print("\n");
    }
}
/// Writes the matrix in binary form: ncol, nrow (4 bytes each, host byte
/// order) followed by all values as raw floats.
///
/// @param filePath  file to create or overwrite
/// @param matrix    matrix to save; nullptr is reported and nothing is written
void saveMatrixAsBinary(const char* filePath,
                        const Matrix_t* matrix) {
    std::cout << "saveMatrixAsBinary()" << std::endl;
    if (nullptr == matrix) {
        std::cout << "cannot save matrix - no matrix!" << std::endl;
        return;
    }
    FILE * outFile = fopen(filePath, "wb");
    if (nullptr != outFile) {
        // Widen before multiplying: a 32-bit product wraps for large matrices.
        const size_t nfloats =
            static_cast<size_t>(matrix->ncol) * matrix->nrow;
        const bool written =
            fwrite(&matrix->ncol, 4, 1, outFile) == 1 &&
            fwrite(&matrix->nrow, 4, 1, outFile) == 1 &&
            fwrite(matrix->values, sizeof(float), nfloats, outFile) == nfloats;
        // fclose() flushes buffered data, so its result matters as well.
        const bool closed = (0 == fclose(outFile));
        if (!written || !closed) {
            std::cout << "could not write matrix to " << filePath << "."
                      << std::endl;
        }
    } else {
        std::cout << "could not open " << filePath << " for writing."
                  << std::endl;
    }
}
/// Loads a matrix from a text file with iostreams. The file starts with
/// "ncol nrow" followed by the values in row-major order.
///
/// @param filePath  text file to read
/// @return the matrix built by createMatrix(), or nullptr when the file
///         cannot be opened or the size header cannot be read
Matrix_t* loadMatrixFromText_IOSTREAM(const char* filePath) {
    std::cout << "loadMatrixFromText_IOSTREAM()" << std::endl;
    std::ifstream inFile(filePath);
    if (inFile) {
        uint32_t ncol = 0;
        uint32_t nrow = 0;
        // Refuse to build a matrix from a header that did not parse.
        if (!(inFile >> ncol >> nrow)) {
            std::cout << "could not read matrix size from " << filePath << "."
                      << std::endl;
            return nullptr;
        }
        // createMatrix() calls the loader once per element in row-major order,
        // which matches the order of the values in the file.
        auto loader =
            [&inFile]
            (uint32_t , uint32_t , uint32_t , uint32_t )
            -> float
            {
                float value = 0.0f;
                inFile >> value;
                return value;
            };
        return createMatrix( ncol, nrow, loader);
    } else {
        std::cout << "could not open " << filePath << " for reading."
                  << std::endl;
    }
    return nullptr;
}
/// Loads a matrix from a text file with stdio. The file starts with
/// "ncol nrow" followed by the values in row-major order.
///
/// @param filePath  text file to read
/// @return the matrix built by createMatrix(), or nullptr when the file
///         cannot be opened or the size header cannot be read
Matrix_t* loadMatrixFromText_STDIO(const char* filePath) {
    std::cout << "loadMatrixFromText_STDIO(" << filePath << ")"
              << std::endl;
    Matrix_t* matrix = nullptr;
    // Plain "r": the "t" suffix is not a standard fopen() mode.
    FILE * inFile = fopen(filePath, "r");
    if (nullptr != inFile) {
        uint32_t ncol = 0;
        uint32_t nrow = 0;
        // SCNu32 is the matching conversion for uint32_t; "%d" expects int*.
        if (2 == fscanf(inFile, "%" SCNu32 " %" SCNu32, &ncol, &nrow)) {
            // createMatrix() calls the loader once per element in row-major
            // order, which matches the order of the values in the file.
            auto loader =
                [&inFile]
                (uint32_t , uint32_t , uint32_t , uint32_t )
                -> float
                {
                    float value = 0.0f;  // defined result if a value is missing
                    fscanf(inFile, "%f", &value);
                    return value;
                };
            matrix = createMatrix( ncol, nrow, loader);
        } else {
            std::cout << "could not read matrix size from " << filePath << "."
                      << std::endl;
        }
        fclose(inFile);
    } else {
        std::cout << "could not open " << filePath << " for reading."
                  << std::endl;
    }
    return matrix;
}
/// Loads a matrix from a binary file: ncol, nrow (4 bytes each, host byte
/// order) followed by all values as raw floats.
///
/// @param filePath  binary file to read
/// @return a newly allocated matrix (release it with operator delete), or
///         nullptr when the file cannot be opened, is truncated, or memory
///         cannot be allocated
Matrix_t* loadMatrixFromBinary(const char* filePath) {
    std::cout << "loadMatrixFromBinary(" << filePath << ")"
              << std::endl;
    FILE * inFile = fopen(filePath, "rb");
    if (nullptr == inFile) {
        std::cout << "could not open file "
                  << filePath << " for reading." << std::endl;
        return nullptr;
    }
    Matrix_t* matrix = nullptr;
    uint32_t ncol = 0;
    uint32_t nrow = 0;
    if (1 == fread( &ncol, 4, 1, inFile) &&
        1 == fread( &nrow, 4, 1, inFile)) {
        // Widen before multiplying: 32-bit arithmetic wraps for large matrices.
        const size_t nfloats = static_cast<size_t>(ncol) * nrow;
        const size_t nbytes = nfloats * sizeof(float) + UINT32_C(8);
        // The nothrow form makes the nullptr check below meaningful.
        matrix = static_cast<Matrix_t*>(operator new (nbytes, std::nothrow));
        if (nullptr != matrix) {
            matrix->ncol = ncol;
            matrix->nrow = nrow;
            if (nfloats != fread( &matrix->values[0], sizeof(float), nfloats, inFile)) {
                std::cout << "could not read matrix data from "
                          << filePath << "." << std::endl;
                operator delete(matrix);
                matrix = nullptr;
            }
        } else {
            std::cout << "could not find memory for the matrix."
                      << std::endl;
        }
    } else {
        std::cout << "could not read matrix size from "
                  << filePath << "." << std::endl;
    }
    // Single exit: the file is closed on the success path too (it used to leak).
    fclose(inFile);
    return matrix;
}
/// Hands the matrix storage back through the global operator delete.
void freeMatrix(Matrix_t* matrix) {
    void* const storage = matrix;
    ::operator delete(storage);
}
/// Maps a binary matrix file read-only into memory and returns it as a
/// Matrix_t view.
///
/// @param filePath  binary file (8-byte header followed by raw floats)
/// @return pointer to the mapped file, or nullptr when open(), fstat() or
///         mmap() fails; release it with munmap()
Matrix_t* mapMatrixFromBinaryFile(const char* filePath) {
    std::cout << "mapMatrixFromBinaryFile(" << filePath << ")"
              << std::endl;
    Matrix_t * matrix = nullptr;
    int fd = open( filePath, O_RDONLY);
    if (-1 != fd) {
        struct stat sb;
        if (-1 != fstat(fd, &sb)) {
            const size_t fileSize = static_cast<size_t>(sb.st_size);
            void* mapping =
                mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, fd, 0);
            // mmap() signals failure with MAP_FAILED, never with nullptr.
            if (MAP_FAILED == mapping) {
                std::cout << "mmap() failed!" << std::endl;
            } else {
                matrix = static_cast<Matrix_t*>(mapping);
            }
        } else {
            std::cout << "fstat() failed!" << std::endl;
        }
        // The mapping stays valid after the descriptor is closed.
        close(fd);
    } else {
        std::cout << "open() failed!" << std::endl;
    }
    return matrix;
}
/// Unmaps a matrix view; the mapped length is recomputed from the
/// dimensions stored in the matrix itself. A null pointer is ignored.
void unmapMatrix(Matrix_t* matrix) {
    if (nullptr != matrix) {
        const size_t payloadBytes =
            sizeof(float) * matrix->ncol * matrix->nrow;
        // 8 header bytes (ncol + nrow) precede the values.
        const size_t mappedBytes = UINTMAX_C(8) + payloadBytes;
        munmap(matrix, mappedBytes);
    }
}
/// Compares two matrices for identical dimensions and bitwise-identical values.
///
/// @return false when either pointer is null or the dimensions differ;
///         otherwise the result of a byte-wise comparison of the values
bool areMatricesEqual( const Matrix_t* m1, const Matrix_t* m2) {
    if (nullptr == m1) return false;
    if (nullptr == m2) return false;
    if (m1->ncol != m2->ncol) return false;
    if (m1->nrow != m2->nrow) return false;
    // both exist and have same size...
    // Widen before multiplying: a 32-bit product wraps for large matrices and
    // would compare only part of the data.
    const size_t nfloats = static_cast<size_t>(m1->ncol) * m1->nrow;
    const size_t nbytes = nfloats * sizeof(float);
    return 0 == memcmp( m1->values, m2->values, nbytes);
}
int main(int argc, const char* argv[]) {
std::random_device rdev;
std::default_random_engine reng(rdev());
std::uniform_real_distribution<> rdist(-1.0E6F, 1.0E6F);
StopWatch sw;
auto randomInitFunction =
[&reng,&rdist]
(uint32_t ncol, uint32_t nrow, uint32_t col, uint32_t row)
-> float
{
return rdist(reng);
};
sw.start("creating random matrix (20k x 20k)");
Matrix_t * randomMatrix =
createMatrix(UINT32_C(20000),
UINT32_C(20000),
randomInitFunction);
sw.stop();
if (nullptr != randomMatrix) {
std::cout
<< "the first 10 floating values in randomMatrix are: "
<< std::endl;
std::cout << randomMatrix->values[0];
for (size_t i = 1; i < 10; i++) {
std::cout << " " << randomMatrix->values[i];
}
std::cout << std::endl;
sw.start("saving matrix with iostream.");
saveMatrixAsText_IOSTREAM("mat0_iostream.txt", randomMatrix);
sw.stop();
sw.start("saving matrix with {fmt}.");
saveMatrixAsText_FMT("mat0_fmt.txt", randomMatrix);
sw.stop();
sw.start("saving matrix into a binary file.");
saveMatrixAsBinary("mat0.bin", randomMatrix);
sw.stop();
sw.start("loading matrix from text file with iostream.");
Matrix_t* textMatrix =
loadMatrixFromText_IOSTREAM("mat0_iostream.txt");
sw.stop();
sw.start("comparing randomMatrix with textMatrix.");
if (!areMatricesEqual(randomMatrix, textMatrix)) {
std::cout << "randomMatrix != textMatrix!" << std::endl;
} else {
std::cout << "randomMatrix == textMatrix" << std::endl;
}
sw.stop();
freeMatrix(textMatrix);
textMatrix = nullptr;
sw.start("loading matrix from text file with stdio.");
textMatrix =
loadMatrixFromText_STDIO("mat0_fmt.txt");
sw.stop();
sw.start("comparing randomMatrix with textMatrix (stdio).");
if (!areMatricesEqual(randomMatrix, textMatrix)) {
std::cout << "randomMatrix != textMatrix!" << std::endl;
} else {
std::cout << "randomMatrix == textMatrix" << std::endl;
}
sw.stop();
freeMatrix(textMatrix);
textMatrix = nullptr;
sw.start("loading matrix from binary file into memory.");
Matrix_t* inMemoryMatrix =
loadMatrixFromBinary("mat0.bin");
sw.stop();
sw.start("comparing randomMatrix with inMemoryMatrix.");
if (!areMatricesEqual(randomMatrix, inMemoryMatrix)) {
std::cout << "randomMatrix != inMemoryMatrix!"
<< std::endl;
} else {
std::cout << "randomMatrix == inMemoryMatrix" << std::endl;
}
sw.stop();
freeMatrix(inMemoryMatrix);
inMemoryMatrix = nullptr;
sw.start("mapping a view to a matrix in a binary file.");
Matrix_t* mappedMatrix =
mapMatrixFromBinaryFile("mat0.bin");
sw.stop();
sw.start("comparing randomMatrix with mappedMatrix.");
if (!areMatricesEqual(randomMatrix, mappedMatrix)) {
std::cout << "randomMatrix != mappedMatrix!"
<< std::endl;
} else {
std::cout << "randomMatrix == mappedMatrix" << std::endl;
}
sw.stop();
unmapMatrix(mappedMatrix);
mappedMatrix = nullptr;
freeMatrix(randomMatrix);
} else {
std::cout << "could not create random matrix!" << std::endl;
}
return 0;
}
Please note, that binary formats where you simply cast to a struct pointer also depend on how the compiler does alignment and padding within structures. In my case, I was lucky and it worked. On other systems, you might have to tweak a little (#pragma pack(4) or something along that line) to make it work.
I'm trying to run FP16 person-detection-retail-0013 and person-reidentification-retail-0079 on Intel Neural Compute Stick hardware, but once I run the application to load the nets on the device I get this exception:
[INFERENCE ENGINE EXCEPTION] Dynamic batch is not supported
I've loaded the net with the max batch size set to 1, and I started my project from the pedestrian tracker demo in the OpenVINO toolkit:
main.cpp --> CreatePedestrianTracker
// Ask for a batch of 16 first, then drop to 1 when the device does not report
// dynamic-batch support or when querying that setting throws.
CnnConfig reid_config(reid_model, reid_weights);
reid_config.max_batch_size = 16;
try {
    if (ie.GetConfig(deviceName, CONFIG_KEY(DYN_BATCH_ENABLED)).as<std::string>() !=
        PluginConfigParams::YES) {
        reid_config.max_batch_size = 1;
        // The literal used to be split across two source lines, which does not compile.
        std::cerr << "[DEBUG] Dynamic batch is not supported for " << deviceName
                  << ". Fall back to batch 1." << std::endl;
    }
}
catch (const InferenceEngine::details::InferenceEngineException& e) {
    reid_config.max_batch_size = 1;
    std::cerr << e.what() << " for " << deviceName << ". Fall back to batch 1." << std::endl;
}
Cnn.cpp --> void CnnBase::InferBatch
/// Runs inference over `frames` in chunks of at most the input blob's first
/// dimension and passes the outputs of every chunk to `fetch_results`.
///
/// @param frames         images to process, in order
/// @param fetch_results  called after each Infer() with the outputs and the
///                       number of frames in that chunk
void CnnBase::InferBatch(
    const std::vector<cv::Mat>& frames,
    std::function<void(const InferenceEngine::BlobMap&, size_t)> fetch_results) const {
    const size_t batch_size = input_blob_->getTensorDesc().getDims()[0];
    const size_t num_imgs = frames.size();

    size_t first = 0;
    while (first < num_imgs) {
        const size_t remaining = num_imgs - first;
        const size_t chunk = std::min(batch_size, remaining);

        // Copy this chunk's frames into consecutive slots of the input blob.
        for (size_t slot = 0; slot != chunk; ++slot) {
            matU8ToBlob<uint8_t>(frames[first + slot], input_blob_, slot);
        }

        // SetBatch() is skipped when the device name mentions MYRIAD or HDDL.
        const bool skipSetBatch =
            (deviceName_.find("MYRIAD") != std::string::npos) ||
            (deviceName_.find("HDDL") != std::string::npos);
        if (!skipSetBatch) {
            infer_request_.SetBatch(chunk);
        }

        infer_request_.Infer();
        fetch_results(outputs_, chunk);

        first += batch_size;
    }
}
I suppose that the problem could be the topology of the detection net, but I ask if anyone has had the same problem and solved the issue.
Thanks.
I am afraid the MYRIAD plugin does not support dynamic batch. Please try an updated version of the demo. You can find it, for example, here: https://github.com/opencv/open_model_zoo/tree/master/demos/pedestrian_tracker_demo
The demo is updated not to use dynamic batch at all.
I tried the conventional way of passing an array to a wrapper function in which I'm using insertOne to insert data using for loop. No build issues, but while running, I'm hitting this error: Microsoft C++ exception: mongocxx::v_noabi::bulk_write_exception at memory location 0x000000B26C12DF30. Here is my source code.
// Builds four EUI / IP pairs, pushes them to MongoDB, then spins forever.
int main() {
    char EUI[][20] = { "10205E3710014240", "10205e37100142cc" ,"10205E6910001E58", "10205E371001426C" };
    char IP[][15] = { "192.168.85.117" , "192.168.85.114", "192.168.85.186", "192.168.85.168" };
    int entryCount = 4;
    push_data(IP, EUI, entryCount);
    for (;;) {
    }
}
void push_data(char IP[][15], char EUI[][20], int count)
{
mongocxx::instance inst{};
mongocxx::client conn{ mongocxx::uri{} };
auto collection = conn["new"]["collection"];
int a;
builder::stream::document builder{};
auto in_array = builder << "subdocs" << builder::stream::open_array;
for (a = 0; a<count; a++) {
in_array = in_array << builder::stream::open_document << EUI[a] << IP[a]
<< builder::stream::close_document;
}
auto after_array = in_array << builder::stream::close_array;
bsoncxx::document::value doc = after_array << builder::stream::finalize;
bsoncxx::document::view view = doc.view();
for (a = 0; a < count; a++) {
collection.insert_one(doc.view());
}
auto cursor = collection.find({});
for (auto&& doc : cursor) {
std::cout << bsoncxx::to_json(doc) << std::endl;
}
}
Almost certainly, an exception has been thrown from collection.insert_one(doc.view());. You should catch that exception (by using try, and catch), and inspect the contents of the exception, which should tell you more about what is going wrong.