I am using Tensorflow's C API to do inference within a parallelized simulation. As I wanted AVX support I compiled Tensorflow from source. I linked it and compiled everything using wmake.
Now if I start a normal (non-parallelized) simulation run, everything works fine. But if I parallelize it I get this error immediately after starting the simulation run:
[node134:18796] *** Process received signal ***
[node134:18796] Signal: Segmentation fault (11)
[node134:18796] Signal code: Address not mapped (1)
[node134:18796] Failing at address: (nil)
[node134:18796] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x3ef20)[0x7fec1c96ff20]
[node134:18796] [ 1] /home/elias/OpenFOAM/elias-4.1/platforms/linux64GccDPInt32Opt/lib/libtensorflow_framework.so(hwloc_bitmap_and+0x14)[0x7fec01c21534]
[node134:18796] [ 2] /usr/lib/x86_64-linux-gnu/libopen-pal.so.20(opal_hwloc_base_filter_cpus+0x380)[0x7febe59d6b80]
[node134:18796] [ 3] /usr/lib/x86_64-linux-gnu/openmpi/lib/openmpi/mca_ess_pmi.so(+0x2b4e)[0x7febe4902b4e]
[node134:18796] [ 4] /usr/lib/x86_64-linux-gnu/libopen-rte.so.20(orte_init+0x22e)[0x7febe5c2a1de]
[node134:18796] [ 5] /usr/lib/x86_64-linux-gnu/libmpi.so.20(ompi_mpi_init+0x30e)[0x7febffdbc27e]
[node134:18796] [ 6] /usr/lib/x86_64-linux-gnu/libmpi.so.20(MPI_Init+0x6b)[0x7febffddd2ab]
[node134:18796] [ 7] /opt/OpenFOAM/OpenFOAM-4.1/platforms/linux64GccDPInt32Opt/lib/openmpi-system/libPstream.so(_ZN4Foam8UPstream4initERiRPPc+0x1f)[0x7fec1c72843f]
[node134:18796] [ 8] /opt/OpenFOAM/OpenFOAM-4.1/platforms/linux64GccDPInt32Opt/lib/libOpenFOAM.so(_ZN4Foam7argListC1ERiRPPcbbb+0x719)[0x7fec1db36ed9]
[node134:18796] [ 9] tabulatedCombustionFoam(+0x279b8)[0x55fe6eb489b8]
[node134:18796] [10] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7)[0x7fec1c952b97]
[node134:18796] [11] tabulatedCombustionFoam(+0x30a0a)[0x55fe6eb51a0a]
[node134:18796] *** End of error message ***
I tried to fix it on my own but so far I do not see any progress. Can somebody recognize the reason for this issue?
Thanks in advance!
Edit: I did not suspect the code itself, because it works under the following conditions:
- without parallelization
- in parallel with the standard prebuilt C API release that can be downloaded
Here is the relevant part of the "main":
// Time one complete inference call made through the TensorFlow C API wrapper.
auto t_start_0 = std::chrono::high_resolution_clock::now();

const char* frozenGraphName = "/home/elias/Lr75-57_FPVANN_premix/data/FPV_ANN_tabulated_Standard_500.pb";

// The std::string objects must stay alive for as long as the c_str() pointers are used.
const std::string iON = string(input_layer_name);
const char* inputOperationName = iON.c_str();
const std::string oON = string(output_layer_name) + "/BiasAdd";
const char* outputOperationName = oON.c_str();

// Layer widths come from the normalisation vectors; the batch size is the
// number of complete input rows contained in the flat input vector.
int no_of_inputs = in_mean.size();
int no_of_outputs = out_mean.size();
int cellsAndPatches = (input_f_zeta_PVNorm.size())/no_of_inputs;
std::vector<int64_t> input_dimensions = {cellsAndPatches,no_of_inputs};
std::vector<int64_t> output_dimensions = {cellsAndPatches,no_of_outputs};

// Automatic storage instead of new/delete: the object is destroyed on every
// exit path, including when doInference throws.
Inference inf;
bool success = inf.doInference(frozenGraphName,inputOperationName,outputOperationName,no_of_inputs,no_of_outputs,input_dimensions,output_dimensions,cellsAndPatches,input_f_zeta_PVNorm,output_real,limit_cores);

auto t_end_0 = std::chrono::high_resolution_clock::now();
auto total_0 = std::chrono::duration<float, std::milli>(t_end_0 - t_start_0).count();
std::cout << "TOTAL INFERENCE TIME C API: " << total_0 << std::endl;
This is the header file:
#include "c_api.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include <assert.h>
#include <vector>
#include <algorithm>
#include <iterator>
#include <cstdlib>
#include <iostream>
#include <chrono>
#include <ctime>
#include <memory>
#include <unistd.h>
#include <thread>
/// Deallocator callback for TF_Buffer payloads.
/// The payload is obtained with std::malloc in this header, so it is released
/// with std::free. The length argument of the callback signature is unused.
static void DeallocateBuffer(void* data, size_t)
{
    std::free(data);
}
/// Reads a whole file into a newly created TF_Buffer.
///
/// @param file  Path of the file to read (opened in binary mode).
/// @return A buffer owning a malloc'ed copy of the file contents, or nullptr if
///         the file cannot be opened, is empty, cannot be read completely, or
///         memory cannot be allocated. The caller releases a non-null result
///         with TF_DeleteBuffer.
static TF_Buffer* ReadBufferFromFile(const char* file)
{
    std::FILE* f = std::fopen(file, "rb");
    if (f == nullptr)
    {
        return nullptr;
    }

    // Determine the file size by seeking to the end and back.
    long fsize = -1;
    if (std::fseek(f, 0, SEEK_END) == 0)
    {
        fsize = std::ftell(f);
    }
    if (fsize < 1 || std::fseek(f, 0, SEEK_SET) != 0)
    {
        std::fclose(f);
        return nullptr;
    }

    const std::size_t length = static_cast<std::size_t>(fsize);
    void* data = std::malloc(length);
    if (data == nullptr)
    {
        std::fclose(f);
        return nullptr;
    }

    // Read everything as one item so a short read is reported as failure.
    const std::size_t items = std::fread(data, length, 1, f);
    std::fclose(f);
    if (items != 1)
    {
        std::free(data);
        return nullptr;
    }

    TF_Buffer* buf = TF_NewBuffer();
    if (buf == nullptr)
    {
        std::free(data);
        return nullptr;
    }
    buf->data = data;
    buf->length = length;
    buf->data_deallocator = DeallocateBuffer;
    return buf;
}
/// Deallocator callback passed to TF_NewTensor.
/// The tensor data is obtained with std::malloc by the caller, so it is
/// released with std::free. The length and user-argument parameters of the
/// callback signature are unused (the third parameter was previously named `arg`).
static void DeallocateTensor(void* data, std::size_t, void*)
{
    std::free(data);
}
/// Entry point for running a frozen TensorFlow graph through the C API.
class Inference
{
public:
    /// Runs one inference call.
    ///
    /// Arguments, in order: frozen graph file name, input operation name,
    /// output operation name, number of inputs per row, number of outputs per
    /// row, input dimensions, output dimensions, number of rows (batch size),
    /// flat input values, flat output values (written by the call), and the
    /// maximum number of cores to use.
    bool doInference(const char*,const char*,const char*,int,int,std::vector<int64_t>,std::vector<int64_t>,int,std::vector<float>&,std::vector<float>&,int);
};
#endif // INFERENCEC_H
This is the .C file:
#include "inferenceC.H"
bool Inference::doInference(const char* fgn, const char* iname, const char* oname, int nIn, int nOut, std::vector<int64_t> dimIn,std::vector<int64_t> dimOut, int CP, std::vector<float> &inVals, std::vector<float> &outVals, int maxCores)
TF_Buffer* graph_def = ReadBufferFromFile(fgn);
if (graph_def == nullptr)
std::cout << "Can't read buffer from file" << std::endl;
TF_Graph* graph = TF_NewGraph();
TF_Status* status = TF_NewStatus();
TF_ImportGraphDefOptions* graph_opts = TF_NewImportGraphDefOptions();
TF_GraphImportGraphDef(graph, graph_def, graph_opts, status);
std::cout << "ERROR: Unable to import graph " << TF_Message(status) << std::endl;
//int num_bytes_in = CP*nIn*sizeof(float);
//int num_bytes_out = CP*nOut*sizeof(float);
std::cout << "Effective batch size: " << (inVals.size()/nIn) << std::endl;
TF_Output input = {TF_GraphOperationByName(graph, iname), 0};
TF_Output output = {TF_GraphOperationByName(graph, oname), 0};
const std::vector<std::int64_t> dims = {CP,nIn};
std::size_t data_size = sizeof(float);
for (auto i : dims)
data_size *= i;
auto data = static_cast<float*>(std::malloc(data_size));
std::copy(inVals.begin(), inVals.end(), data);
TF_Tensor* input_value = TF_NewTensor(TF_FLOAT,dims.data(), static_cast<int>(dims.size()),data, data_size,DeallocateTensor, nullptr);
const std::vector<int64_t> outdims = {CP,nOut};
std::size_t outdata_size = sizeof(float);
for (auto i : outdims)
outdata_size *= i;
TF_Tensor* output_value = nullptr;
std::cout << "Running session..." << std::endl;
TF_SessionOptions* sess_opts = TF_NewSessionOptions();
uint8_t intra_op_parallelism_threads = maxCores; // for operations that can be parallelized internally, such as matrix multiplication
uint8_t inter_op_parallelism_threads = maxCores; // for operationss that are independent in your TensorFlow graph because there is no directed path between them in the dataflow graph
uint8_t config[]={0x10,intra_op_parallelism_threads,0x28,inter_op_parallelism_threads};
if (TF_GetCode(status) != TF_OK)
printf("ERROR: %s\n", TF_Message(status));
TF_Session* session = TF_NewSession(graph, sess_opts, status);
auto t_start = std::chrono::high_resolution_clock::now();
TF_SessionRun(session, nullptr, &input, &input_value, 1, &output, &output_value, 1, nullptr, 0, nullptr, status);
auto t_end = std::chrono::high_resolution_clock::now();
auto total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
std::cout << "time required for TF_SessionRun: " << total << std::endl;
float* out_vals = static_cast<float*>(TF_TensorData(output_value));
std::vector<float> results(nOut*CP,0);
for(int i=0;i<CP;i++)
for(int j=0;j<nOut;j++)
results.at(i*nOut+j) = *out_vals;
std::cout << "Successfully ran session!" << std::endl;
outVals = results;
return 0;
As the following link shows, this was not a code error; it was a TensorFlow problem that has been resolved on the current master branch:
I would like to measure the maximum memory usage of abc.exe on random tests generated by gen.exe. How could I do that?
My code that runs abc.exe on tests from gen.exe looks like this:
#include <bits/stdc++.h>
using namespace std;
// Stress-test driver: for test index i = 0, 1, 2, ... it feeds i to the
// generator (gen.exe, which writes test.in) and then runs the program under
// test (abc.exe) on that input. A non-zero status from either command is
// reported and the loop continues. The loop never ends; stop it manually.
int main()
{
    int i = 0;
    while (true)
    {
        const string si = to_string(i);
        cout << i << "\n";
        if (system(("echo " + si + "| ./gen.exe > test.in").c_str())) // gen.exe is test generator
        {
            cout << "gen error\n";
        }
        if (system(("./abc.exe < test.in > a.out"))) // abc.exe is the program I want to test
        {
            cout << "abc error\n";
        }
        // Move on to the next test index; otherwise the same test would be rerun forever.
        ++i;
    }
}
I know that I can use time -v ./abc.exe, but that prints the memory usage to the terminal, and I'd like to be able to save it in a variable.
You can use getrusage( RUSAGE_CHILDREN, ... ) to obtain the maximum resident memory. Note that this call will return the maximum memory used by the biggest child at that point in time.
In the example below I used boost::process because it gives better control but it's up to you to use std::system or not, works the same way.
#include <string>
#include <cstdint>
#include <string.h>
#include <iostream>
#include <boost/process/child.hpp>
#include <sys/resource.h>
namespace bp = boost::process;
int parent( const std::string& exename )
// Loop from 0 to 10 megabytes
for ( int j=0; j<10; ++j )
// Command name is the name of this executable plus one argument with size
std::string gencmd = exename + " " + std::to_string(j);
// Start process
bp::child child( gencmd );
// Wait for it to allocate memory
// Query the memory usage at this point in time
struct rusage ru;
getrusage( RUSAGE_CHILDREN, &ru );
std::cerr << "Loop:" << j << " mem:"<< ru.ru_maxrss/1024. << " MB" << std::endl;
// Wait for process to quit
if ( child.exit_code()!=0 )
std::cerr << "Error executing child:" << child.exit_code() << std::endl;
return 1;
return 0;
// Child role: allocates and touches `size` megabytes, keeps them for two
// seconds so the parent can sample the memory usage, then frees them.
//
// size: number of megabytes to allocate; must not be negative.
// Returns 0 on success, 1 if size is negative or the allocation fails.
int child( int size ) {
    if ( size<0 )
    {
        return 1;
    }

    // Allocate "size" megabytes explicitly. The product is computed in size_t
    // so it cannot overflow int for sizes of 2048 MB and more.
    const size_t memsize = static_cast<size_t>(size)*1024*1024;
    uint8_t* ptr = static_cast<uint8_t*>(malloc( memsize ));
    if ( ptr==nullptr )
    {
        // malloc(0) may legitimately return a null pointer.
        if ( memsize!=0 )
        {
            return 1;
        }
    }
    else
    {
        // Write to every byte so the pages really become resident.
        memset( ptr, size, memsize );
    }

    // Wait for the parent to sample our memory usage
    sleep( 2 );

    // Free memory
    free( ptr );
    return 0;
}
int main( int argc, char* argv[] )
// Without arguments, it is the parent.
// Pass the name of the binary
if ( argc==1 ) return parent( argv[0] );
return child( std::atoi( argv[1] ) );
It prints
$ ./env_test
Loop:0 mem:0 MB
Loop:1 mem:3.5625 MB
Loop:2 mem:4.01953 MB
Loop:3 mem:5.05469 MB
Loop:4 mem:6.04688 MB
Loop:5 mem:7.05078 MB
Loop:6 mem:7.78516 MB
Loop:7 mem:8.97266 MB
Loop:8 mem:9.82031 MB
Loop:9 mem:10.8867 MB
If you cannot use the Boost libraries, you have to do a little more work, but it is still feasible.
If you just want to know the maximum size ever of your children processes then the following works with std::system:
#include <cstdio>
#include <string>
#include <iostream>
#include <sstream>
#include <string.h>
#include <unistd.h>
#include <sys/resource.h>
// With an argument: child role, allocates and touches that many megabytes and
// exits. Without arguments: parent role, re-runs this executable through
// std::system with sizes 0..9 and prints the peak resident set size reported
// for terminated children after each run.
int main(int argc, char* argv[]) {
    if (argc > 1) {
        const long megabytes = ::atol(argv[1]);
        if (megabytes < 0) {
            // A negative value would wrap to an enormous size_t.
            return 1;
        }
        const size_t memsize = static_cast<size_t>(megabytes) * 1024 * 1024;
        void* ptr = ::malloc(memsize);
        if (ptr == nullptr) {
            // malloc(0) may legitimately return a null pointer.
            return memsize == 0 ? 0 : 1;
        }
        // Touch every byte so the pages become resident. The block is
        // deliberately not freed: the process exits right away.
        memset(ptr, 0, memsize);
        return 0;
    }

    for (int j = 0; j < 10; ++j) {
        std::ostringstream cmd;
        cmd << argv[0] << " " << j;
        int res = std::system(cmd.str().c_str());
        if (res < 0) {
            fprintf(stderr, "ERROR system: %s\n", strerror(errno));
        }

        struct rusage ru;
        res = getrusage(RUSAGE_CHILDREN, &ru);
        if (res != 0) {
            fprintf(stderr, "ERROR getrusage: %s\n", strerror(errno));
            continue;
        }
        // ru_maxrss is a long; print it with the matching conversion.
        const long maxmem = ru.ru_maxrss;
        fprintf(stderr, "Loop:%d MaxMem:%ld\n", j, maxmem);
    }
    return 0;
}
It prints
Loop:0 MaxMem:3552
Loop:1 MaxMem:4192
Loop:2 MaxMem:5148
Loop:3 MaxMem:6228
Loop:4 MaxMem:7364
Loop:5 MaxMem:8456
Loop:6 MaxMem:9120
Loop:7 MaxMem:10188
Loop:8 MaxMem:11324
Loop:9 MaxMem:12256
However if you want to keep track of the memory usage during the child process execution you cannot use std::system(). First, you need to call fork() to spawn a new process and then execv() to execute a bash command.
#include <string>
#include <cstdint>
#include <string.h>
#include <unistd.h>
#include <iostream>
#include <sys/resource.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <vector>
// Runs this executable ten times through "/bin/bash -c", asking it to
// allocate 0..9 megabytes, polls the child without blocking, and prints the
// largest ru_maxrss seen for the children after each run.
//
// exename: path of this executable; it is re-run with the size as its argument.
// Returns 0 on success and 2 if fork or waitpid fails. A child whose execv
// fails terminates itself with exit status 1.
int parent(const std::string& exename) {
    // Loop from 0 to 10 megabytes
    for (int j = 0; j < 10; ++j) {
        // Command name is the name of this executable plus one argument with size
        const std::string gencmd = exename + " " + std::to_string(j);

        // Start process
        const pid_t pid = fork();
        if (pid < 0) {
            std::cerr << "fork:" << strerror(errno) << std::endl;
            return 2;
        }
        if (pid == 0) {  // child
            const char* args[] = {"/bin/bash", "-c", gencmd.c_str(), nullptr};
            execv("/bin/bash", const_cast<char**>(args));
            // Only reached when execv failed. Leave with _exit so the forked
            // copy does not return into the caller's loop.
            std::cerr << "execv error: " << strerror(errno) << std::endl;
            _exit(1);
        }

        // parent
        long maxmem = 0;
        bool reaped = false;
        while (!reaped) {
            int status = 0;
            const pid_t rid = ::waitpid(pid, &status, WNOHANG);
            if (rid < 0) {
                if (errno == EINTR) {
                    continue;
                }
                if (errno != ECHILD) {
                    std::cerr << "waitpid:" << strerror(errno) << std::endl;
                    return 2;
                }
                // No such child any more: treat it as finished.
                reaped = true;
            } else if (rid == pid) {
                // The wait macros take the status word, not the pid.
                if (WIFEXITED(status) || WIFSIGNALED(status)) {
                    reaped = true;
                }
            }

            // Query the memory usage at this point in time. The last sample is
            // taken after the child has been reaped, which is when
            // RUSAGE_CHILDREN includes it.
            struct rusage ru;
            const int res = getrusage(RUSAGE_CHILDREN, &ru);
            if (res != 0) {
                if (errno != ECHILD) {
                    std::cerr << "getrusage:" << errno << strerror(errno) << std::endl;
                }
            } else if (maxmem < ru.ru_maxrss) {
                maxmem = ru.ru_maxrss;
            }
        }
        std::cerr << "Loop:" << j << " mem:" << maxmem / 1024. << " MB" << std::endl;
    }
    return 0;
}
// Child role: allocates and touches `size` megabytes, then frees them.
//
// size: number of megabytes to allocate; must not be negative.
// Returns 0 on success, 1 if size is negative or the allocation fails.
//
// NOTE(review): the original comments mention waiting for the parent to
// sample the memory usage, but no wait is present in the visible code, so
// none is performed here.
int child(int size) {
    if (size < 0) {
        return 1;
    }

    // Allocate "size" megabytes explicitly. The product is computed in size_t
    // so it cannot overflow int for sizes of 2048 MB and more.
    const size_t memsize = static_cast<size_t>(size) * 1024 * 1024;
    uint8_t* ptr = static_cast<uint8_t*>(malloc(memsize));
    if (ptr == nullptr) {
        // malloc(0) may legitimately return a null pointer.
        return memsize == 0 ? 0 : 1;
    }

    // Write to every byte so the pages really become resident.
    memset(ptr, size, memsize);

    // Free memory
    free(ptr);
    return 0;
}
// Selects the role of this binary: with no extra arguments it is the parent
// and receives its own executable name; otherwise the first argument is
// parsed as the size handed to the child role.
int main(int argc, char* argv[]) {
    const bool run_as_child = (argc != 1);
    if (run_as_child) {
        const int requested_size = std::atoi(argv[1]);
        return child(requested_size);
    }
    return parent(argv[0]);
}
The result on my machine is:
$ ./fork_test
Loop:0 mem:3.22656 MB
Loop:1 mem:3.69922 MB
Loop:2 mem:4.80859 MB
Loop:3 mem:5.92578 MB
Loop:4 mem:6.87109 MB
Loop:5 mem:8.05469 MB
Loop:6 mem:8.77344 MB
Loop:7 mem:9.71875 MB
Loop:8 mem:10.7422 MB
Loop:9 mem:11.6797 MB
There is a video about this post.
I'm trying to print the frame stack (call stack) from within C/C++ code.
Below is a demo found on the internet:
print call stack in C or C++
#include <cassert>
#include <cstdlib> // std::free, strtoul
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <cxxabi.h> // __cxa_demangle
#include <elfutils/libdwfl.h> // Dwfl*
#include <execinfo.h> // backtrace
#include <unistd.h> // getpid
using namespace std;
// https://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
/// Converts a mangled C++ symbol name into its readable form.
///
/// @param name  Symbol name to demangle; may be null.
/// @return The demangled name, `name` unchanged if it cannot be demangled,
///         or an empty string if `name` is null.
std::string demangle(const char* name) {
    if (name == nullptr) {
        return std::string();
    }

    int status = -4;  // sentinel that is overwritten by __cxa_demangle
    // __cxa_demangle returns a malloc'ed buffer (or null); std::free releases it.
    std::unique_ptr<char, void(*)(void*)> res {
        abi::__cxa_demangle(name, nullptr, nullptr, &status),
        std::free
    };
    return (status == 0) ? res.get() : name;
}
/// Formats one stack frame as "<address> <function>[ at <file>:<line>]\n".
///
/// @param dwfl  libdwfl session used for symbol and line lookup.
/// @param ip    Instruction address of the frame.
/// @return The formatted line; the function is "<unknown>" when no symbol is
///         found and the " at file:line" part is omitted when no line
///         information is available.
std::string debug_info(Dwfl* dwfl, void* ip) {
    const uintptr_t ip2 = reinterpret_cast<uintptr_t>(ip);
    Dwfl_Module* module = dwfl_addrmodule(dwfl, ip2);

    char const* name = module ? dwfl_module_addrname(module, ip2) : nullptr;
    const std::string function_name = name ? demangle(name) : "<unknown>";

    int line_num = -1;
    // Must start as null: it is tested below even when no line record exists.
    char const* file_name = nullptr;
    if (module) {
        if (Dwfl_Line* dwfl_line = dwfl_module_getsrc(module, ip2)) {
            Dwarf_Addr addr;
            file_name = dwfl_lineinfo(dwfl_line, &addr, &line_num, nullptr, nullptr, nullptr);
        }
    }

    std::stringstream ss;
    ss << std::setw(16)<<std::setfill('0') <<ip << ' ' << function_name;
    if (file_name) {
        ss << " at " << file_name << ':' << line_num;
    }
    ss << std::endl;
    return ss.str();
}
std::string gen_framestack_backtrace() {
// Initialize Dwfl.
Dwfl* dwfl = nullptr;
Dwfl_Callbacks callbacks = {};
char* debuginfo_path = nullptr;
callbacks.find_elf = dwfl_linux_proc_find_elf;
callbacks.find_debuginfo = dwfl_standard_find_debuginfo;
callbacks.debuginfo_path = &debuginfo_path;
dwfl = dwfl_begin(&callbacks);
int r;
r = dwfl_linux_proc_report(dwfl, getpid());
r = dwfl_report_end(dwfl, nullptr, nullptr);
// Loop over stack frames.
std::stringstream ss;
void* stack[512];
int stack_size = ::backtrace(stack, sizeof stack / sizeof *stack);
for (int i = 0; i < stack_size; ++i) {
ss << i << ": ";
// Works.
ss << debug_info(dwfl, stack[i]);
#if 0
// TODO intended to do the same as above, but segfaults,
// so possibly UB In above function that does not blow up by chance?
void *ip = stack[i];
std::string function;
int line = -1;
char const* file;
uintptr_t ip2 = reinterpret_cast<uintptr_t>(ip);
Dwfl_Module* module = dwfl_addrmodule(dwfl, ip2);
char const* name = dwfl_module_addrname(module, ip2);
function = name ? demangle(name) : "<unknown>";
// TODO if I comment out this line it does not blow up anymore.
if (Dwfl_Line* dwfl_line = dwfl_module_getsrc(module, ip2)) {
Dwarf_Addr addr;
file = dwfl_lineinfo(dwfl_line, &addr, &line, nullptr, nullptr, nullptr);
ss << ip << ' ' << function;
if (file)
ss << " at " << file << ':' << line;
ss << std::endl;
return ss.str();
// Innermost demo function: captures the current backtrace and writes it to
// standard output followed by a newline.
void my_func_2() {
    const std::string trace = gen_framestack_backtrace();
    std::cout << trace << std::endl;
}
void my_func_1(double f) {
void my_func_1(int i) {
// Demo driver: prints two backtraces per iteration, one through each
// my_func_1 overload. The iteration count is the first command-line argument
// (parsed with base auto-detection) and defaults to 1.
int main(int argc, char **argv) {
    long long unsigned int n = 1;
    if (argc > 1) {
        n = strtoul(argv[1], NULL, 0);
    }

    long long unsigned int i = 0;
    while (i < n) {
        my_func_1(1); // line 122
        my_func_1(2.0); // line 123
        ++i;
    }
}
$ sudo apt install libdw-dev libunwind-dev
$ g++ -fno-pie -ggdb3 -O0 -no-pie -o a.out -std=c++11 -Wall -Wextra -pedantic-errors test.cpp -ldw -lunwind -ggdb
$ ./a.out
0: 000000000x401ab1 stacktrace[abi:cxx11]() at /home/wxq/test/test7.cpp:71
1: 000000000x401c11 my_func_2() at /home/wxq/test/test7.cpp:106
2: 000000000x401ca2 my_func_1(int) at /home/wxq/test/test7.cpp:117
3: 000000000x401d01 main at /home/wxq/test/test7.cpp:128
4: 000x7f3e4ee4dbf6 __libc_start_main at ../csu/libc-start.c:310
5: 000000000x401479 _start at ../csu/libc-start.c:-1
0: 000000000x401ab1 stacktrace[abi:cxx11]() at /home/wxq/test/test7.cpp:71
1: 000000000x401c11 my_func_2() at /home/wxq/test/test7.cpp:106
2: 000000000x401c8f my_func_1(double) at /home/wxq/test/test7.cpp:112
3: 000000000x401d16 main at /home/wxq/test/test7.cpp:129
4: 000x7f3e4ee4dbf6 __libc_start_main at ../csu/libc-start.c:310
5: 000000000x401479 _start at ../csu/libc-start.c:-1
But the above solution cannot print function arguments.
Does anyone have a solution for printing a backtrace from within a C/C++ program, just like the gdb bt command?
For example, like the output below:
#0 createObj1 (handle=0x5555559291c0, shimHandle=0x55555595a850) at /home/wxq/setup.cpp:983
#1 0x00007ffff60a3763 in initialize (this=0x55555595a850, config=...)at /home/wxq/test.cpp:197
#2 0x00007ffff60a24f2 in create_extended (setup=0x5555559291c0) at /home/wxq/test.cpp:509
#3 0x0000555555555538 in main (argc=5, argv=0x7fffffffe0e8) at /home/wxq/core_model.cpp:145
Mxnet c++ inference with MXPredSetInput segmentation fault
1. background
I tried the example at https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp and it ran successfully.
But when I try to deploy MXNet in C++ with my own model, I get a segmentation fault:
[17:33:07] src/nnvm/legacy_json_util.cc:209: Loading symbol saved by previous version v1.2.1. Attempting to upgrade...
Signal: SIGSEGV (Segmentation fault)
2. code with error:
MXPredSetInput(pred_hnd, "data", image_data.data(), static_cast<mx_uint>(image_size));
3. tips
At first I thought the input data shape was incompatible with the model's input layer. But I asked the model designer: it is a ResNet model with convolution layers only, so any input shape should be OK.
4. Download the model:
Download the model files and put them into the model directory.
5. Code (also available at https://github.com/jaysimon/mxnet_cpp_infere):
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <vector>
#include <memory>
#include <thread>
#include <iomanip>
#include <opencv2/opencv.hpp>
// Path for c_predict_api
#include <mxnet/c_predict_api.h>
const mx_float DEFAULT_MEAN = 117.0;
// Returns a copy of `input` with leading and trailing whitespace removed.
// Whitespace is whatever std::isspace reports for the current C locale.
static std::string trim(const std::string& input) {
  // std::isspace requires a value representable as unsigned char; taking the
  // character as unsigned char keeps bytes >= 0x80 from becoming negative.
  auto not_space = [](unsigned char ch) {
    return !std::isspace(ch);
  };
  auto output = input;
  output.erase(output.begin(), std::find_if(output.begin(), output.end(), not_space));
  output.erase(std::find_if(output.rbegin(), output.rend(), not_space).base(), output.end());
  return output;
}
// Read file to buffer
// Reads a whole file into a null-terminated, heap-allocated buffer.
// If the file cannot be opened or read completely, the object is left empty:
// GetLength() returns 0 and GetBuffer() returns nullptr.
class BufferFile {
 public :
  std::string file_path_;
  std::size_t length_ = 0;
  std::unique_ptr<char[]> buffer_;

  // Loads `file_path` in binary mode; reports problems on std::cerr.
  explicit BufferFile(const std::string& file_path)
    : file_path_(file_path) {
    std::ifstream ifs(file_path.c_str(), std::ios::in | std::ios::binary);
    if (!ifs) {
      std::cerr << "Can't open the file. Please check " << file_path << ". \n";
      return;
    }

    ifs.seekg(0, std::ios::end);
    const std::streamoff end_pos = ifs.tellg();
    if (end_pos < 0) {
      // tellg() reports failure as -1, which must not be used as a size.
      std::cerr << "Can't determine the size of " << file_path << ". \n";
      return;
    }
    const std::size_t size = static_cast<std::size_t>(end_pos);
    ifs.seekg(0, std::ios::beg);
    std::cout << file_path.c_str() << " ... " << size << " bytes\n";

    // Buffer as null terminated to be converted to string
    std::unique_ptr<char[]> data(new char[size + 1]);
    data[size] = 0;
    ifs.read(data.get(), static_cast<std::streamsize>(size));
    if (static_cast<std::size_t>(ifs.gcount()) != size) {
      std::cerr << "Can't read the whole file " << file_path << ". \n";
      return;
    }

    // Publish the result only after a complete read.
    buffer_.swap(data);
    length_ = size;
  }

  // Number of bytes read, not counting the terminating null.
  std::size_t GetLength() const {
    return length_;
  }

  // Pointer to the null-terminated contents, or nullptr if nothing was read.
  char* GetBuffer() const {
    return buffer_.get();
  }
};
// Loads an image, resizes it to resize_size and writes mean-subtracted pixel
// values as mx_float into image_data, using three separate output cursors
// (r, g, b) that start at offsets 0, size/3 and 2*size/3.
//   image_file  - path handed to cv::imread
//   image_data  - destination; the cursors below assume room for
//                 rows * cols * channels values
//   channels    - used for the size computation and the channels > 1 test
//   resize_size - target size handed to resize()
//   mean_data   - not referenced anywhere in the visible body
// NOTE(review): braces and possibly whole statements are missing from this
// listing, so the exact nesting below cannot be confirmed.
void GetImageFile(const std::string& image_file,
mx_float* image_data, int channels,
cv::Size resize_size, const mx_float* mean_data = nullptr) {
// Read all kinds of file into a BGR color 3 channels image
cv::Mat im_ori = cv::imread(image_file, cv::IMREAD_COLOR);
if (im_ori.empty()) {
std::cerr << "Can't open the image. Please check " << image_file << ". \n";
// NOTE(review): no return is visible after the error message, so resize()
// below would be reached with an empty image - confirm against the full file.
cv::Mat im;
resize(im_ori, im, resize_size);
int size = im.rows * im.cols * channels;
// Planar output layout: one plane per colour, each size / 3 values long.
mx_float* ptr_image_r = image_data;
mx_float* ptr_image_g = image_data + size / 3;
mx_float* ptr_image_b = image_data + size / 3 * 2;
float mean_b, mean_g, mean_r;
mean_b = mean_g = mean_r = DEFAULT_MEAN;
// The DEFAULT_MEAN assignment above is immediately overwritten by these
// per-channel constants.
mean_b = 103.06;
mean_g = 115.9;
mean_r = 123.15;
for (int i = 0; i < im.rows; i++) {
auto data = im.ptr<uchar>(i);
for (int j = 0; j < im.cols; j++) {
// Source bytes are consumed in b, g, r order and written to the b, g and r
// planes after subtracting the matching mean.
if (channels > 1) {
*ptr_image_b++ = static_cast<mx_float>(*data++) - mean_b;
*ptr_image_g++ = static_cast<mx_float>(*data++) - mean_g;
*ptr_image_r++ = static_cast<mx_float>(*data++) - mean_r;
// LoadSynsets
// Code from : https://github.com/pertusa/mxnet_predict_cc/blob/master/mxnet_predict.cc
// Reads a synset file in which every line is "<synset-id> <label text>".
// Returns the label texts in file order; each keeps the whitespace that
// separates it from the id. Returns an empty vector (after reporting on
// std::cerr) if the file cannot be opened.
std::vector<std::string> LoadSynset(const std::string& synset_file) {
  std::vector<std::string> output;

  std::ifstream fi(synset_file.c_str());
  if (!fi.is_open()) {
    std::cerr << "Error opening synset file " << synset_file << std::endl;
    return output;
  }

  std::string synset, lemma;
  while (fi >> synset) {
    // The rest of the line after the id is the label.
    std::getline(fi, lemma);
    output.push_back(lemma);
  }
  return output;
}
// Prints every score in `data` and then the best one together with its label.
//   data   - one score per class
//   synset - one label per class; expected to have the same size as `data`
// A size mismatch is reported on std::cerr and printing continues; if the
// best index has no label, "<unknown>" is printed instead.
void PrintOutputResult(const std::vector<float>& data, const std::vector<std::string>& synset) {
  if (data.size() != synset.size()) {
    std::cerr << "Result data and synset size do not match!" << std::endl;
  }

  float best_accuracy = 0.0;
  std::size_t best_idx = 0;

  for (std::size_t i = 0; i < data.size(); ++i) {
    std::cout << "Accuracy[" << i << "] = " << std::setprecision(8) << data[i] << std::endl;

    if (data[i] > best_accuracy) {
      best_accuracy = data[i];
      best_idx = i;
    }
  }

  // Guard the lookup: with mismatched or empty inputs the index may be out of range.
  const std::string best_label =
      best_idx < synset.size() ? trim(synset[best_idx]) : std::string("<unknown>");

  std::cout << "Best Result: " << best_label << " (id=" << best_idx << ", " <<
            "accuracy=" << std::setprecision(8) << best_accuracy << ")" << std::endl;
}
// Feeds one image to a predictor, reads back output 0 and prints the result.
//   pred_hnd    - predictor handle used for all MXPred* calls below
//   image_data  - flat input values passed as the "data" input
//   nd_hnd      - NDList handle; only tested for non-null in the visible code
//   synset_file - path handed to LoadSynset
//   i           - not used in the visible body; the loop below declares its
//                 own `i`, which hides this parameter
// NOTE(review): several statements appear to be missing from this listing
// (nothing follows the "Do Predict Forward", "Release NDList" and "Release
// Predictor" comments), and no MX* return value is checked.
void predict(PredictorHandle pred_hnd, const std::vector<mx_float> &image_data,
NDListHandle nd_hnd, const std::string &synset_file, int i) {
auto image_size = image_data.size();
// Set Input
//>>>>>>>>>>>>>>>>>>>> Problem code <<<<<<<<<<<<<<<<<<<<<<<
// NOTE(review): this is the call the question reports as crashing; presumably
// the element count must match the input shape the predictor was created
// with - confirm against the caller.
MXPredSetInput(pred_hnd, "data", image_data.data(), static_cast<mx_uint>(image_size));
// <<<<<<<<<<<<<<<<<<<<<<< Problem code <<<<<<<<<<<<<<<<<<<<<<<
// Do Predict Forward
mx_uint output_index = 0;
mx_uint* shape = nullptr;
mx_uint shape_len;
// Get Output Result
MXPredGetOutputShape(pred_hnd, output_index, &shape, &shape_len);
// The output element count is the product of all output dimensions.
std::size_t size = 1;
for (mx_uint i = 0; i < shape_len; ++i) { size *= shape[i]; }
std::vector<float> data(size);
MXPredGetOutput(pred_hnd, output_index, &(data[0]), static_cast<mx_uint>(size));
// Release NDList
if (nd_hnd) {
// Release Predictor
// Synset path for your model, you have to modify it
auto synset = LoadSynset(synset_file);
// Print Output Data
PrintOutputResult(data, synset);
// Program entry: loads the model files, reads and preprocesses the test image
// given as argv[1], creates one predictor (or num_threads predictors when
// argv[2] is given) and runs predict() on the image.
// NOTE(review): braces and several argument lines are missing from this
// listing (for example the tails of the MXPredCreate calls), so the comments
// below describe only what is visible.
int main(int argc, char* argv[]) {
if (argc < 2) {
std::cout << "No test image here." << std::endl
<< "Usage: ./image-classification-predict apple.jpg [num_threads]" << std::endl;
std::string test_file(argv[1]);
int num_threads = 1;
if (argc == 3)
num_threads = std::atoi(argv[2]);
// Models path for your model, you have to modify it
std::string json_file = "../model/rfcn_dcn_chicken-0000.json";
std::string param_file = "../model/rfcn_dcn_chicken-0000.params";
std::string synset_file = "../model/synset.txt";
std::string nd_file = "../model/mean_224.nd";
BufferFile json_data(json_file);
BufferFile param_data(param_file);
// Parameters
int dev_type = 1; // 1: cpu, 2: gpu
int dev_id = 0; // arbitrary.
mx_uint num_input_nodes = 1; // 1 for feedforward
const char* input_key[1] = { "data" };
const char** input_keys = input_key;
// Image size and channels
int width = 1000;
int height = 562;
int channels = 3;
// One input with four shape entries (index range 0..4 into input_shape_data).
const mx_uint input_shape_indptr[2] = { 0, 4 };
// NOTE(review): only the first and last of the four initialisers are visible
// here; confirm that the full shape multiplies out to image_size below, since
// predict() passes image_size elements to MXPredSetInput.
const mx_uint input_shape_data[4] = { 1,
static_cast<mx_uint>(width) };
if (json_data.GetLength() == 0 || param_data.GetLength() == 0) {
auto image_size = static_cast<std::size_t>(width * height * channels);
// Read Mean Data
const mx_float* nd_data = nullptr;
NDListHandle nd_hnd = nullptr;
BufferFile nd_buf(nd_file);
// The mean file is optional: it is only parsed when it could be read.
if (nd_buf.GetLength() > 0) {
mx_uint nd_index = 0;
mx_uint nd_len;
const mx_uint* nd_shape = nullptr;
const char* nd_key = nullptr;
mx_uint nd_ndim = 0;
MXNDListCreate(static_cast<const char*>(nd_buf.GetBuffer()),
&nd_hnd, &nd_len);
MXNDListGet(nd_hnd, nd_index, &nd_key, &nd_data, &nd_shape, &nd_ndim);
// Read Image Data
std::vector<mx_float> image_data(image_size);
GetImageFile(test_file, image_data.data(), channels, cv::Size(width, height), nd_data);
if (num_threads == 1) {
// Create Predictor
// NOTE(review): pred_hnd is not initialised and the result of MXPredCreate is
// not checked in the visible code; if creation fails, predict() would use an
// indeterminate handle - verify.
PredictorHandle pred_hnd;
MXPredCreate(static_cast<const char*>(json_data.GetBuffer()),
static_cast<const char*>(param_data.GetBuffer()),
predict(pred_hnd, image_data, nd_hnd, synset_file, 0);
} else {
// Create Predictor
std::vector<PredictorHandle> pred_hnds(num_threads, nullptr);
MXPredCreateMultiThread(static_cast<const char*>(json_data.GetBuffer()),
static_cast<const char*>(param_data.GetBuffer()),
for (auto hnd : pred_hnds)
// One thread per predictor handle; each receives its own copy of image_data.
std::vector<std::thread> threads;
for (int i = 0; i < num_threads; i++)
threads.emplace_back(predict, pred_hnds[i], image_data, nd_hnd, synset_file, i);
for (int i = 0; i < num_threads; i++)
printf("run successfully\n");
WINDOWS 10, amd64
Built tensorflow GPU enabled C++ static libraries with CMAKE GUI + MSBUILD
Built successful.
LABEL_IMAGE tutorial example execution times :
... Main.cc execution : 9.17 secs
... Label_image.py execution (tensorflow) : 10.34 secs
... Label_image.py execution (tensorflow-gpu) : 1.62 secs
Any idea why? Thanks a lot.
Main.cc with minor customizations :
#define NOMINMAX
#include <fstream>
#include <utility>
#include <vector>
#include "tensorflow/cc/ops/const_op.h"
#include "tensorflow/cc/ops/image_ops.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/default_device.h"
#include "tensorflow/core/graph/graph_def_builder.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/util/command_line_flags.h"
// These are all common classes it's handy to reference with no namespace.
using tensorflow::Flag;
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::string;
using tensorflow::int32;
// Reads the complete contents of `filename` through `env` and stores them as
// the value of the scalar string tensor `output`.
// Returns the error of the failing file operation, or DataLoss if fewer bytes
// than the reported file size could be read.
static Status ReadEntireFile(tensorflow::Env* env, const string& filename, Tensor* output) {
  tensorflow::uint64 file_size = 0;
  TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size));

  // Read() is given this string's storage as scratch space, so it must be
  // sized to hold the whole file before the call.
  string contents;
  contents.resize(file_size);

  std::unique_ptr<tensorflow::RandomAccessFile> file;
  TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file));

  tensorflow::StringPiece data;
  TF_RETURN_IF_ERROR(file->Read(0, file_size, &data, &(contents)[0]));
  if (data.size() != file_size) {
    return tensorflow::errors::DataLoss("Truncated read of '", filename, "' expected ", file_size, " got ", data.size());
  }
  output->scalar<string>()() = data.ToString();
  return Status::OK();
}
// Given an image file name, read in the data, try to decode it as an image,
// resize it to the requested size, and then scale the values as desired.
// Reads an image file, decodes it (PNG, GIF, BMP, otherwise JPEG), casts the
// pixels to uint8 and adds a leading batch dimension; the resulting tensor is
// appended to `out_tensors`.
// input_height, input_width, input_mean and input_std are currently unused
// because the resize and normalisation steps below are commented out.
// Returns the first error from reading the file, building the graph, or
// creating and running the session.
Status ReadTensorFromImageFile(const string file_name, const int input_height, const int input_width, const float input_mean, const float input_std, std::vector<Tensor>* out_tensors) {
  auto root = tensorflow::Scope::NewRootScope();
  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)

  string output_name = "dim";

  // read file_name into a tensor named input
  Tensor input(tensorflow::DT_STRING, tensorflow::TensorShape());
  TF_RETURN_IF_ERROR(ReadEntireFile(tensorflow::Env::Default(), file_name, &input));

  // use a placeholder to read input data
  auto file_reader = Placeholder(root.WithOpName("input"), tensorflow::DataType::DT_STRING);
  std::vector<std::pair<string, tensorflow::Tensor>> inputs = { { "input", input }, };

  // Now try to figure out what kind of file it is and decode it.
  const int wanted_channels = 3;
  tensorflow::Output image_reader;
  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
    image_reader = DecodePng(root.WithOpName("png_reader"), file_reader, DecodePng::Channels(wanted_channels));
  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
    // gif decoder returns 4-D tensor, remove the first dim
    image_reader = Squeeze(root.WithOpName("squeeze_first_dim"), DecodeGif(root.WithOpName("gif_reader"), file_reader));
  } else if (tensorflow::StringPiece(file_name).ends_with(".bmp")) {
    image_reader = DecodeBmp(root.WithOpName("bmp_reader"), file_reader);
  } else {
    // Assume if it's neither a PNG, a GIF nor a BMP then it must be a JPEG.
    image_reader = DecodeJpeg(root.WithOpName("jpeg_reader"), file_reader, DecodeJpeg::Channels(wanted_channels));
  }

  // Keep the image data as uint8 (no conversion to float is done here).
  auto uint8_caster = Cast(root.WithOpName("uint8_caster"), image_reader, tensorflow::DT_UINT8);

  // The convention for image ops in TensorFlow is that all images are expected
  // to be in batches, so that they're four-dimensional arrays with indices of
  // [batch, height, width, channel]. Because we only have a single image, we
  // have to add a batch dimension of 1 to the start with ExpandDims().
  auto dims_expander = ExpandDims(root.WithOpName(output_name), uint8_caster, 0);

  // Bilinearly resize the image to fit the required dimensions.
  //auto resized = ResizeBilinear(root, dims_expander,Const(root.WithOpName("size"), { input_height, input_width }));

  // Subtract the mean and divide by the scale.
  //Div(root.WithOpName(output_name), Sub(root, resized, { input_mean }),{ input_std });

  // This runs the GraphDef network definition that we've just constructed, and
  // returns the results in the output tensor. The scope must be serialised
  // into the GraphDef and loaded into the session before Run() can find the ops.
  tensorflow::GraphDef graph;
  TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));

  tensorflow::SessionOptions options;
  std::unique_ptr<tensorflow::Session> session(tensorflow::NewSession(options));
  if (!session) {
    return tensorflow::errors::Internal("Failed to create session for image preprocessing");
  }
  TF_RETURN_IF_ERROR(session->Create(graph));
  TF_RETURN_IF_ERROR(session->Run({ inputs }, { output_name }, {}, out_tensors));
  return Status::OK();
}
// Reads a model graph definition from disk, and creates a session object you
// can use to run it.
// Loads a binary GraphDef from `graph_file_name`, creates a new session in
// `*session` (replacing whatever it held) and loads the graph into it.
// Returns NotFound if the graph file cannot be read, Internal if no session
// can be created, or the error reported by Session::Create.
Status LoadGraph(const string& graph_file_name, std::unique_ptr<tensorflow::Session>* session) {
  tensorflow::GraphDef graph_def;
  Status load_graph_status = ReadBinaryProto(tensorflow::Env::Default(), graph_file_name, &graph_def);
  if (!load_graph_status.ok()) {
    return tensorflow::errors::NotFound("Failed to load compute graph at '",graph_file_name, "'");
  }

  // The caller passes an empty pointer, so the session has to be created here
  // before it can be dereferenced.
  tensorflow::SessionOptions options;
  session->reset(tensorflow::NewSession(options));
  if (!*session) {
    return tensorflow::errors::Internal("Failed to create session for '", graph_file_name, "'");
  }

  Status session_create_status = (*session)->Create(graph_def);
  if (!session_create_status.ok()) {
    return session_create_status;
  }
  return Status::OK();
}
int main(int argc, char* argv[]) {
// These are the command-line flags the program can understand.
// They define where the graph and input data is located, and what kind of
// input the model expects. If you train your own model, or use something
// other than inception_v3, then you'll need to update these.
string image = "tensorflow/examples/label_image/data/grace_hopper.jpg";
string graph = "tensorflow/examples/label_image/data/faster_rcnn_resnet101_coco_11_06_2017/frozen_inference_graph.pb";
string labels = "/tensorflow/tensorflow/examples/label_image/data/faster_rcnn_resnet101_coco_11_06_2017/graph.pbtxt";
int32 input_width = 299;
int32 input_height = 299;
float input_mean = 0;
float input_std = 255;
string input_layer = "image_tensor:0";
std::vector<string> output_layer = { "detection_boxes:0", "detection_scores:0", "detection_classes:0", "num_detections:0" };
string o_layer = "detection_boxes:0, detection_scores : 0, detection_classes : 0, num_detections : 0"; //dummy for Flag structure
bool self_test = false;
string root_dir = "/tensorflow/";
std::vector<Flag> flag_list = {
Flag("image", &image, "image to be processed"),
Flag("graph", &graph, "graph to be executed"),
Flag("labels", &labels, "name of file containing labels"),
Flag("input_width", &input_width, "resize image to this width in pixels"),
Flag("input_height", &input_height,
"resize image to this height in pixels"),
Flag("input_mean", &input_mean, "scale pixel values to this mean"),
Flag("input_std", &input_std, "scale pixel values to this std deviation"),
Flag("input_layer", &input_layer, "name of input layer"),
Flag("output_layer", &o_layer, "name of output layer"),
Flag("self_test", &self_test, "run a self test"),
Flag("root_dir", &root_dir,
"interpret image and graph file names relative to this directory"),
string usage = tensorflow::Flags::Usage(argv[0], flag_list);
const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
if (!parse_result) {
LOG(ERROR) << usage;
return -1;
// We need to call this to set up global state for TensorFlow.
tensorflow::port::InitMain(argv[0], &argc, &argv);
if (argc > 1) {
LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
return -1;
// First we load and initialize the model.
std::unique_ptr<tensorflow::Session> session;
string graph_path = tensorflow::io::JoinPath(root_dir, graph);
Status load_graph_status = LoadGraph(graph_path, &session);
if (!load_graph_status.ok()) {
LOG(ERROR) << load_graph_status;
return -1;
// Get the image from disk as a float array of numbers, resized and normalized
// to the specifications the main graph expects.
std::vector<Tensor> resized_tensors;
string image_path = tensorflow::io::JoinPath(root_dir, image);
LOG(ERROR) << "Detection Basla....";
Status read_tensor_status = ReadTensorFromImageFile(image_path, input_height, input_width, input_mean, input_std, &resized_tensors);
if (!read_tensor_status.ok()) {
LOG(ERROR) << read_tensor_status;
return -1;
const Tensor resized_tensor = resized_tensors[0];
// Actually run the image through the model.
std::vector<Tensor> outputs;
Status run_status = session->Run({ { input_layer, resized_tensor } }, { output_layer }, {}, &outputs);
LOG(ERROR) << "Detection Bit......";
if (!run_status.ok()) {
LOG(ERROR) << "Running model failed: " << run_status;
return -1;
tensorflow::TTypes<float>::Flat scores = outputs[1].flat<float>();
tensorflow::TTypes<float>::Flat classes = outputs[2].flat<float>();
tensorflow::TTypes<float>::Flat num_detections = outputs[3].flat<float>();
auto boxes = outputs[0].flat_outer_dims<float, 3>();
LOG(ERROR) << "num_detections:" << num_detections(0) << "," << outputs[0].shape().DebugString();
for (size_t i = 0; i < num_detections(0) && i < 20; ++i)
if (scores(i) > 0.5)
LOG(ERROR) << i << ",score:" << scores(i) << ",class:" << classes(i) << ",box:" << "," << boxes(0, i, 0) << "," << boxes(0, i, 1) << "," << boxes(0, i, 2) << "," << boxes(0, i, 3);
return 0;
After a successful build, I ran the code and got a "_pywrap_tensorflow_internal.pyd not found" message.
I searched the PC and found one in the python/tensorflow path.
I copied that one to the execution path and everything was OK, except that the GPU was not being used.
Suddenly something whispered to me:
"Hey, you immortal!! You should take the recently generated
pywrap_tensorflow_internal.dll, rename it to _pywrap_tensorflow_internal.pyd,
and copy it to the execution path."
After doing that, the GPU is being used.
I am having a problem serializing an array of unsigned char to a file with GZIP compression using protobuf, while experimenting with the library.
I think the problem might have to do with my syntax or a misuse of the API.
I have also tried std::fstream.
FYI, the build environment is Windows 8.1 with VS2013.
syntax = "proto3";
package Recipe;
// A scene whose image samples are stored as a packed repeated int32 field.
message Scene
{
    repeated int32 imageData = 1 [packed=true];
}
#include <iostream>
#include <fstream>
#include <ostream>
#include <istream>
#include <string>
#include <cstdint>
#include "Scene.pb.h"
#include <google\protobuf\io\zero_copy_stream_impl.h>
#include <google\protobuf\io\gzip_stream.h>
// Inclusive range of the random values written into the buffer.
int const _MIN = 0;
int const _MAX = 255;
// Number of random bytes to generate.
unsigned int const _SIZE = 65200000;
// Gzip compression level. Valid levels are 0-9; the previous value of 10 is
// out of range and made writing through GzipOutputStream fail.
unsigned int const _COMPRESSION_LEVEL = 9;
// NOTE(review): identifiers that start with an underscore followed by an
// uppercase letter are reserved for the implementation; renaming them needs a
// coordinated change in every user of these constants.
void randWithinUnsignedCharSize(uint8_t * buffer, unsigned int size)
for (size_t i = 0; i < size; ++i)
buffer[i] = _MIN + (rand() % static_cast<int>(_MAX - _MIN + 1));
using namespace google::protobuf::io;
int main()
Recipe::Scene * scene = new Recipe::Scene();
uint8_t * imageData = new uint8_t[_SIZE];
randWithinUnsignedCharSize(imageData, _SIZE);
for (size_t i = 0; i < _SIZE; i++)
std::cout << "scene->imagedata_size() " << scene->imagedata_size() << std::endl;
std::ofstream output("scene.art", std::ofstream::out | std::ofstream::trunc | std::ofstream::binary);
OstreamOutputStream outputFileStream(&output);
GzipOutputStream::Options options;
options.format = GzipOutputStream::GZIP;
options.compression_level = _COMPRESSION_LEVEL;
GzipOutputStream gzipOutputStream(&outputFileStream, options);
if (!scene->SerializeToZeroCopyStream(&gzipOutputStream)) {
std::cerr << "Failed to write scene." << std::endl;
return -1;
Recipe::Scene * scene1 = new Recipe::Scene();
std::ifstream input("scene.art", std::ifstream::in | std::ifstream::binary);
IstreamInputStream inputFileStream(&input);
GzipInputStream gzipInputStream(&inputFileStream);
if (!scene1->ParseFromZeroCopyStream(&gzipInputStream)) {
std::cerr << "Failed to parse scene." << std::endl;
return -1;
std::cout << "scene1->imagedata_size() " << scene1->imagedata_size() <<std::endl;
return 0;
You seem to have a typo in your code. According to the documentation, the compression level must be in the range 0-9, but your code sets it to 10.
Your example works for me when corrected to:
unsigned int const _COMPRESSION_LEVEL = 9;