I'm currently porting my old OpenCV C code to the C++ interface of OpenCV 2/3, and I'm not quite sure about the equivalents of some old functions. Pretty early on I ran into an issue with cvZero. The only replacement I found was setting the matrix content via Mat::setTo. However, since it has to handle multi-channel scalars and different data types, setTo iterates through all elements of the matrix and sets them one after another, while cvZero basically did a memset. What is the recommended way, using the C++ interface, when I just want to clear my image to black?
Thanks!
yourMat = cv::Mat::zeros(yourMat.size(), yourMat.type()) does not seem to allocate new memory but only overwrites the existing Mat data (memory was previously allocated, otherwise .size() would be 0). I'm not sure whether memset is used internally, but the sample code below takes about 50% longer for the .setTo version than for the cv::Mat::zeros version. Note that I didn't measure the overhead of the pixel manipulation itself, which should be practically identical in both versions.
#include <opencv2/opencv.hpp>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>

int main(int argc, char* argv[])
{
cv::Mat input = cv::imread("C:/StackOverflow/Input/Lenna.png");
srand(time(NULL));
cv::Mat a = input;
cv::Mat b = input;
cv::imshow("original", a);
b = cv::Mat::zeros(a.size(), a.type());
std::vector<int> randX;
std::vector<int> randY;
std::vector<cv::Vec3b> randC;
int n = 500000;
randX.resize(n);
randY.resize(n);
randC.resize(n);
for (unsigned int i = 0; i < n; ++i)
{
randX[i] = rand() % input.cols;
randY[i] = rand() % input.rows;
randC[i] = cv::Vec3b(rand()%255, rand()%255, rand()%255);
}
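// version 1: write a random pixel, then clear b with cv::Mat::zeros, n times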
clock_t start1 = clock();
for (unsigned int i = 0; i < randX.size(); ++i)
{
b.at<cv::Vec3b>(randY[i], randX[i]) = randC[i];
b = cv::Mat::zeros(b.size(), b.type());
}
clock_t end1 = clock();
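// version 2: write a random pixel, then clear b with setTo, n times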
clock_t start2 = clock();
for (unsigned int i = 0; i < randX.size(); ++i)
{
b.at<cv::Vec3b>(randY[i], randX[i]) = randC[i];
b.setTo( cv::Scalar(0, 0, 0));
}
clock_t end2 = clock();
std::cout << "time1 = " << ( (end1 - start1) / CLOCKS_PER_SEC ) << " seconds" << std::endl;
std::cout << "time2 = " << ((end2 - start2) / CLOCKS_PER_SEC) << " seconds" << std::endl;
cv::imshow("a", a);
cv::imshow("b", b);
cv::waitKey(0);
return 0;
}
gives me output:
time1 = 14 seconds
time2 = 21 seconds
on my machine (Release mode) (no IPP).
and a black image for both a and b, which indicates that no new memory was allocated but the existing Mat memory was reused.
int n = 250000; will produce output
time1 = 6 seconds
time2 = 10 seconds
This doesn't answer whether memset is used internally or whether it is as fast as cvZero, but at least you now know how to set a matrix to zero faster than with .setTo.
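If you really want cvZero-like behaviour and your Mat is continuous in memory, a plain memset over the data block should do the same job. This is just a minimal sketch under that assumption (clearToBlack is my own helper name, not part of the OpenCV API):
// Sketch: zero a cv::Mat with memset, assuming its data is one continuous block;
// fall back to setTo otherwise.
#include <opencv2/opencv.hpp>
#include <cstring>

void clearToBlack(cv::Mat& m)
{
    if (m.isContinuous())
        std::memset(m.data, 0, m.total() * m.elemSize()); // one contiguous memset
    else
        m.setTo(cv::Scalar::all(0));                       // row-by-row fallback
}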
Related
I'm trying to optimize my code using multithreading, and it's not just that the program isn't twice as fast as it's supposed to be on this dual-core computer, it is MUCH SLOWER. I just want to know whether I'm doing something wrong, or whether it's normal that multithreading doesn't help in this case. I made this recreation of how I used the multithreading, and on my computer the parallel version takes about 4 times as long as the normal version:
#include <iostream>
#include <random>
#include <thread>
#include <chrono>
#include <numeric>
using namespace std;
default_random_engine ran;
inline bool get(){
return ran() % 3;
}
void normal_serie(unsigned repetitions, unsigned &result){
for (unsigned i = 0; i < repetitions; ++i)
result += get();
}
unsigned parallel_series(unsigned repetitions){
const unsigned hardware_threads = std::thread::hardware_concurrency();
cout << "Threads in this computer: " << hardware_threads << endl;
const unsigned threads_number = (hardware_threads != 0) ? hardware_threads : 2;
const unsigned its_per_thread = repetitions / threads_number;
unsigned *results = new unsigned[threads_number]();
std::thread *threads = new std::thread[threads_number - 1];
for (unsigned i = 0; i < threads_number - 1; ++i)
threads[i] = std::thread(normal_serie, its_per_thread, std::ref(results[i]));
normal_serie(its_per_thread, results[threads_number - 1]);
for (unsigned i = 0; i < threads_number - 1; ++i)
threads[i].join();
auto result = std::accumulate(results, results + threads_number, 0);
delete[] results;
delete[] threads;
return result;
}
int main()
{
constexpr unsigned repetitions = 100000000;
auto to = std::chrono::high_resolution_clock::now();
cout << parallel_series(repetitions) << endl;
auto tf = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(tf - to).count();
cout << "Parallel duration: " << duration << "ms" << endl;
to = std::chrono::high_resolution_clock::now();
unsigned r = 0;
normal_serie(repetitions, r);
cout << r << endl;
tf = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(tf - to).count();
cout << "Normal duration: " << duration << "ms" << endl;
return 0;
}
Things that I already know but didn't include, to keep this code short:
I should set a max_iterations_per_thread, because you don't want to do just 10 iterations per thread; but in this case we are doing 100 million iterations, so that is not going to happen.
The number of iterations must be divisible by the number of threads, otherwise the remainder is silently dropped and the code does not do all the work (see the sketch after this list for one way to handle that).
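A minimal sketch of handling a repetition count that is not divisible by the thread count: split off the remainder and give it to the thread running in the caller. Variable names follow the code above.
// Sketch: distribute repetitions so nothing is lost when
// repetitions % threads_number != 0; the calling thread takes the remainder.
const unsigned its_per_thread = repetitions / threads_number;
const unsigned remainder = repetitions % threads_number;
for (unsigned i = 0; i < threads_number - 1; ++i)
    threads[i] = std::thread(normal_serie, its_per_thread, std::ref(results[i]));
// the calling thread does its share plus the leftover iterations
normal_serie(its_per_thread + remainder, results[threads_number - 1]);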
This is the output that I get on my computer:
Threads in this computer: 2
66665160
Parallel duration: 4545ms
66664432
Normal duration: 1019ms
(Partially solved by making these changes:)
inline bool get(default_random_engine &ran){
return ran() % 3;
}
void normal_serie(unsigned repetitions, unsigned &result){
default_random_engine eng;
unsigned saver_result = 0;
for (unsigned i = 0; i < repetitions; ++i)
saver_result += get(eng);
result += saver_result;
}
All your threads are tripping over each other fighting for access to ran which can only perform one operation at a time because it only has one state and each operation advances its state. There is no point in running operations in parallel if the vast majority of each operation involves a choke point that cannot support any concurrency.
All elements of results are likely to share a cache line, which means there is lots of inter-core communication going on.
Try modifying normal_serie to accumulate into a local variable and only write it to results at the end, as in the sketch below.
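A minimal sketch of what that looks like, combined with a per-thread engine so nothing is shared: results[i] is written exactly once, so the threads neither contend on ran nor keep invalidating each other's cache line during the loop. Names match the code in the question.
// Sketch: per-thread engine + thread-local accumulator.
void normal_serie(unsigned repetitions, unsigned &result){
    std::default_random_engine eng;        // private to this thread, no sharing
    unsigned local = 0;                    // accumulate locally
    for (unsigned i = 0; i < repetitions; ++i)
        local += (eng() % 3 != 0);         // same 0/1 value the original get() produced
    result = local;                        // single write to the shared array at the end
}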
How can I use the XGBOOST library (https://github.com/dmlc/xgboost/) in C++? I have found Python and Java APIs, but I can't find an API for C++.
I ended up using the C API; see below for an example:
#include <xgboost/c_api.h>
#include <iostream>

// create the train data
const int cols = 3, rows = 5;
float train[rows][cols];
for (int i=0;i<rows;i++)
for (int j=0;j<cols;j++)
train[i][j] = (i+1) * (j+1);
float train_labels[rows];
for (int i=0;i<rows;i++)
train_labels[i] = 1+i*i*i;
// convert to DMatrix
DMatrixHandle h_train[1];
XGDMatrixCreateFromMat((float *) train, rows, cols, -1, &h_train[0]);
// load the labels
XGDMatrixSetFloatInfo(h_train[0], "label", train_labels, rows);
// read back the labels, just a sanity check
bst_ulong bst_result;
const float *out_floats;
XGDMatrixGetFloatInfo(h_train[0], "label" , &bst_result, &out_floats);
for (unsigned int i=0;i<bst_result;i++)
std::cout << "label[" << i << "]=" << out_floats[i] << std::endl;
// create the booster and load some parameters
BoosterHandle h_booster;
XGBoosterCreate(h_train, 1, &h_booster);
XGBoosterSetParam(h_booster, "booster", "gbtree");
XGBoosterSetParam(h_booster, "objective", "reg:linear");
XGBoosterSetParam(h_booster, "max_depth", "5");
XGBoosterSetParam(h_booster, "eta", "0.1");
XGBoosterSetParam(h_booster, "min_child_weight", "1");
XGBoosterSetParam(h_booster, "subsample", "0.5");
XGBoosterSetParam(h_booster, "colsample_bytree", "1");
XGBoosterSetParam(h_booster, "num_parallel_tree", "1");
// perform 200 learning iterations
for (int iter=0; iter<200; iter++)
XGBoosterUpdateOneIter(h_booster, iter, h_train[0]);
// predict
const int sample_rows = 5;
float test[sample_rows][cols];
for (int i=0;i<sample_rows;i++)
for (int j=0;j<cols;j++)
test[i][j] = (i+1) * (j+1);
DMatrixHandle h_test;
XGDMatrixCreateFromMat((float *) test, sample_rows, cols, -1, &h_test);
bst_ulong out_len;
const float *f;
XGBoosterPredict(h_booster, h_test, 0,0,&out_len,&f);
for (unsigned int i=0;i<out_len;i++)
std::cout << "prediction[" << i << "]=" << f[i] << std::endl;
// free xgboost internal structures
XGDMatrixFree(h_train[0]);
XGDMatrixFree(h_test);
XGBoosterFree(h_booster);
Use the XGBoost C API.
#include <xgboost/c_api.h>
#include <cassert>
#include <cmath>
#include <iostream>

BoosterHandle booster;
const char *model_path = "/path/of/model";
// create booster handle first
XGBoosterCreate(NULL, 0, &booster);
// by default, the seed will be set 0
XGBoosterSetParam(booster, "seed", "0");
// load model
XGBoosterLoadModel(booster, model_path);
const int feat_size = 100;
const int num_row = 1;
float feat[num_row][feat_size];
// create some fake data for predicting
for (int i = 0; i < num_row; ++i) {
for(int j = 0; j < feat_size; ++j) {
feat[i][j] = (i + 1) * (j + 1);
}
}
// convert 2d array to DMatrix
DMatrixHandle dtest;
XGDMatrixCreateFromMat(reinterpret_cast<float*>(feat),
num_row, feat_size, NAN, &dtest);
// predict
bst_ulong out_len;
const float *f;
XGBoosterPredict(booster, dtest, 0, 0, &out_len, &f);
assert(out_len == num_row);
std::cout << f[0] << std::endl;
// free memory
XGDMatrixFree(dtest);
XGBoosterFree(booster);
Note that when you want to load an existing model (as the code above shows), you have to ensure that the data format used for training is the same as the one used for prediction. So if you predict with XGBoosterPredict, which accepts a dense matrix as its parameter, you have to use a dense matrix for training as well.
Training with the libsvm format and predicting with a dense matrix may cause wrong predictions, as the XGBoost FAQ says:
“Sparse” elements are treated as if they were “missing” by the tree booster, and as zeros by the linear booster. For tree models, it is important to use consistent data formats during training and scoring.
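With the dense C-API path above, this mostly comes down to building the training and prediction DMatrix the same way, in particular with the same missing-value marker. A minimal sketch, using NAN as the marker in both places; train_data, test_data and their row counts are placeholder names for your own buffers:
// Sketch: same dense layout and same missing-value marker (NAN) for both
// the training and the prediction DMatrix (placeholder buffer names).
DMatrixHandle h_train, h_test;
XGDMatrixCreateFromMat(reinterpret_cast<float*>(train_data), train_rows, cols, NAN, &h_train);
XGDMatrixCreateFromMat(reinterpret_cast<float*>(test_data), test_rows, cols, NAN, &h_test);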
Here is what you need: https://github.com/EmbolismSoil/xgboostpp
#include "xgboostpp.h"
#include <algorithm>
#include <iostream>
int main(int argc, const char* argv[])
{
auto nsamples = 2;
auto xgb = XGBoostPP(argv[1], 3); // there are 4 feature columns and 3 labels (the three iris classes); for regression, use nlabel = 1
//result = array([[9.9658281e-01, 2.4966884e-03, 9.2058454e-04],
// [9.9608469e-01, 2.4954407e-03, 1.4198524e-03]], dtype=float32)
XGBoostPP::Matrix features(2, 4);
features <<
5.1, 3.5, 1.4, 0.2,
4.9, 3.0, 1.4, 0.2;
XGBoostPP::Matrix y;
auto ret = xgb.predict(features, y);
if (ret != 0){
std::cout << "predict error" << std::endl;
}
std::cout << "intput : \n" << features << std::endl << "output: \n" << y << std::endl;
}
In case training in Python is okay and you only need to run the prediction in C++, there is a nice tool that generates static if/else code from a trained model:
https://github.com/popcorn/xgb2cpp
I ended up using this after spending a day trying to load and use an xgboost model in C++ without success. The code generated by xgb2cpp worked instantly and also has the nice benefit that it does not have any dependencies.
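Purely as an illustration of what "static if/else code" means here: the generated file is a dependency-free function with one nested if/else block per boosted tree whose leaf values are summed. The function name, thresholds, and leaf values below are made up, not actual xgb2cpp output:
// Hypothetical shape of such generated code (all names and numbers invented):
#include <vector>

float xgb_predict(const std::vector<float>& f) {
    float sum = 0.0f;
    // tree 0
    if (f[2] < 2.45f) { sum += 0.43f; }
    else {
        if (f[3] < 1.75f) { sum += 0.11f; } else { sum += -0.22f; }
    }
    // ... one block like this per tree ...
    return sum;
}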
There is no example I am aware of. There is a c_api.h file that contains a C/C++ API for the package, and you'll have to find your way around using it. I just did that; it took me a few hours of reading the code and trying a few things out, but eventually I managed to create a working C++ example of xgboost.
To solve this problem, we run the xgboost program from our C++ source code.
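If you go that route, a minimal sketch is to shell out to the xgboost binary with a config file, the way the CLI demos do; the file names and options below are placeholders, so check the CLI documentation of your xgboost version for the exact parameters:
// Sketch: invoke the xgboost CLI from C++ (paths and options are placeholders).
#include <cstdlib>

int run_xgboost_prediction() {
    // assumes an xgboost binary on PATH, a prepared config file, and a saved model
    return std::system("xgboost my.conf task=pred model_in=my_model.bin test:data=test.libsvm");
}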
I need to do something like this in the fastest way possible (O(1) would be perfect):
for (int j = 0; j < V; ++j)
{
if(!visited[j]) required[j]=0;
}
I came up with this solution:
for (int j = 0; j < V; ++j)
{
required[j]=visited[j]&required[j];
}
This made the program run 3 times faster, but I believe there is an even better way to do this. Am I right?
Btw, required and visited are dynamically allocated arrays:
bool *required;
bool *visited;
required = new bool[V];
visited = new bool[V];
In the case where you're using a list of simple objects, you are most likely best served by the functionality provided by the C++ Standard Library. Structures like std::valarray and std::vector are recognized and optimized very effectively by all modern compilers.
Much debate exists as to how much you can rely on your compiler, but one thing is guaranteed: your compiler was built alongside the standard library, and relying on it for basic functionality (such as your problem) is generally a safe bet.
Never be afraid to run your own time tests and race your compiler! It's a fun exercise and one that is ever increasingly difficult to achieve.
Construct a valarray (highly optimized in c++11 and later):
std::valarray<bool> valRequired(required, V);
std::valarray<bool> valVisited(visited, V);
valRequired &= valVisited;
Alternatively, you could do it in one line using std::transform:
std::transform(required, required + V, visited, required, [](bool r, bool v){ return r & v; });
Edit: while fewer lines is not faster, your compiler will likely vectorize this operation.
I also tested their timing:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <valarray>

int main(int argc, const char * argv[]) {
auto clock = std::chrono::high_resolution_clock{};
{
bool visited[5] = {1,0,1,0,0};
bool required[5] = {1,1,1,0,1};
auto start = clock.now();
for (int i = 0; i < 5; ++i) {
required[i] &= visited[i];
}
auto end = clock.now();
std::cout << "1: " << (end - start).count() << std::endl;
}
{
bool visited[5] = {1,0,1,0,0};
bool required[5] = {1,1,1,0,1};
auto start = clock.now();
for (int i = 0; i < 5; ++i) {
required[i] = visited[i] & required[i];
}
auto end = clock.now();
std::cout << "2: " << (end - start).count() << std::endl;
}
{
bool visited[5] = {1,0,1,0,0};
bool required[5] = {1,1,1,0,1};
auto start = clock.now();
std::transform(required, required + 5, visited, required, [](bool r, bool v){ return r & v; });
auto end = clock.now();
std::cout << "3: " << (end - start).count() << std::endl;
}
{
bool visited[5] = {1,0,1,0,0};
bool required[5] = {1,1,1,0,1};
std::valarray<bool> valVisited(visited, 5);
std::valarray<bool> valrequired(required, 5);
auto start = clock.now();
valrequired &= valVisited;
auto end = clock.now();
std::cout << "4: " << (end - start).count() << std::endl;
}
}
Output:
1: 102
2: 55
3: 47
4: 45
Program ended with exit code: 0
Along the lines of @AlanStokes' suggestion, use packed binary data and combine it 512 bits at a time with the AVX-512 intrinsic _mm512_and_epi64. Be prepared for your hair getting messed up.
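A portable sketch of the packed-bits idea, without the AVX-512 intrinsics: store the flags as 64-bit words instead of one bool per element, so a single AND combines 64 flags at a time (and an auto-vectorizing compiler can go wider still). This assumes you are free to change the storage layout from bool* to packed words:
// Sketch: visited/required kept as packed 64-bit words rather than bool arrays.
#include <cstdint>
#include <vector>

void and_into(std::vector<std::uint64_t>& required, const std::vector<std::uint64_t>& visited)
{
    for (std::size_t i = 0; i < required.size(); ++i)
        required[i] &= visited[i];   // 64 flags combined per iteration
}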
Note: I've posted this also on Eigen forum here
I want to premultiply 3xN matrices by a 3x3 matrix, i.e., to transform 3D points, like
p_dest = T * p_source
after initializing the matrices:
Eigen::Matrix<double, 3, Eigen::Dynamic> points = Eigen::Matrix<double, 3, Eigen::Dynamic>::Random(3, NUMCOLS);
Eigen::Matrix<double, 3, Eigen::Dynamic> dest = Eigen::Matrix<double, 3, Eigen::Dynamic>(3, NUMCOLS);
int NT = 100;
I have evaluated this two versions
// eigen direct multiplication
for (int i = 0; i < NT; i++){
Eigen::Matrix3d T = Eigen::Matrix3d::Random();
dest.noalias() = T * points;
}
and
// col multiplication
for (int i = 0; i < NT; i++){
Eigen::Matrix3d T = Eigen::Matrix3d::Random();
for (int c = 0; c < points.cols(); c++){
dest.col(c) = T * points.col(c);
}
}
The NT repetitions are done just to compute an average time.
I am surprised that the column-by-column multiplication is about 4-5 times faster than the direct multiplication
(and the direct multiplication is even slower if I do not use .noalias(), but that is expected since it then makes a temporary copy).
I've tried changing NUMCOLS from 0 to 1000000 and the relation is linear.
I'm using Visual Studio 2013 and compiling in Release mode.
The next figure shows the number of columns of the matrix on X and the average time for a single operation on Y; blue is the col-by-col multiplication, red is the direct matrix multiplication.
Any suggestion why this happens?
Short answer
You're timing the lazy (and therefore not actually performed) evaluation in the col-multiplication version, versus the lazy (but actually performed) evaluation in the direct version.
Long answer
Instead of code snippets, let's look at a full MCVE. First, "your" version:
#include <Eigen/Dense>
#include <chrono>
#include <cstdlib>
#include <ctime>
#include <iostream>

using Eigen::Matrix3Xd;

void ColMult(Matrix3Xd& dest, Matrix3Xd& points)
{
Eigen::Matrix3d T = Eigen::Matrix3d::Random();
for (int c = 0; c < points.cols(); c++){
dest.col(c) = T * points.col(c);
}
}
void EigenDirect(Matrix3Xd& dest, Matrix3Xd& points)
{
Eigen::Matrix3d T = Eigen::Matrix3d::Random();
dest.noalias() = T * points;
}
int main(int argc, char *argv[])
{
srand(time(NULL));
int NUMCOLS = 100000 + rand();
Matrix3Xd points = Matrix3Xd::Random(3, NUMCOLS);
Matrix3Xd dest = Matrix3Xd(3, NUMCOLS);
Matrix3Xd dest2 = Matrix3Xd(3, NUMCOLS);
int NT = 200;
// eigen direct multiplication
auto beg1 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < NT; i++)
{
EigenDirect(dest, points);
}
auto end1 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_seconds = end1-beg1;
// col multiplication
auto beg2 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < NT; i++)
{
ColMult(dest2, points);
}
auto end2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_seconds2 = end2-beg2;
std::cout << "Direct time: " << elapsed_seconds.count() << "\n";
std::cout << "Col time: " << elapsed_seconds2.count() << "\n";
std::cout << "Eigen speedup: " << elapsed_seconds2.count() / elapsed_seconds.count() << "\n\n";
return 0;
}
With this code (and SSE turned on), I get:
Direct time: 0.449301
Col time: 0.10107
Eigen speedup: 0.224949
Same 4-5× slowdown you complained of. Why?!?! Before we get to the answer, let's modify the code a bit so that the dest matrix is sent to an ostream. Add std::ostream outPut(0); to the beginning of main() and, before stopping the timers, add outPut << dest << "\n\n"; and outPut << dest2 << "\n\n";. The std::ostream outPut(0); doesn't output anything (I'm pretty sure the badbit is set), but it does cause Eigen's operator<< to be called, which forces the evaluation of the matrix.
NOTE: if we used outPut << dest(1,1) then dest would be evaluated only enough to output the single element in the col multiplication method.
We then get
Direct time: 0.447298
Col time: 0.681456
Eigen speedup: 1.52349
as a result, which is what we expected. Note that the Eigen direct method took (almost) exactly the same time, meaning the evaluation took place even without the added ostream, whereas the col method all of a sudden took much longer.
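A lighter-weight way to make sure the benchmarked result is actually consumed, without streaming the whole matrix, is to fold it into a value the compiler cannot discard; a minimal sketch (sink is my own name):
// Sketch: consume the results so the timed work cannot be skipped or optimized away.
volatile double sink = 0.0;
// ... timed loops as above ...
sink = sink + dest.sum();    // forces dest to be fully evaluated and read
sink = sink + dest2.sum();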
I'm using OpenCV in C++ and I'm stuck on a point. I need to do the following:
if(src(I) < aNumber)
do operation1
else
do operation2
A for loop takes 100+ ms for a 1000x750 image. I don't want to use a for loop because it takes a lot of time. I want to use one or more OpenCV functions with which I can edit some of the values in the matrix. For example, my array is
[1 4 5;
4 6 2;
3 2 1]
I want:
if(an element of mat < 4)
pow(element,2)
else
element--;
According to this if-else
[1 3 4;
3 5 4;
9 4 1]
is going to be my result matrix.
Does anybody know any functions to handle this except using two for loops?
You may want to check out compare. Example:
//Mat mask; compare(src, 10.0, mask, CMP_LT);
Mat mask = src < 10.0;
Depending on the actual operation you wish to perform, you may be able to use the result from compare directly; otherwise you could take a look at the gpu module, in particular the Per-element Operations.
Personally, I feel that OpenCV should be treated a bit like MATLAB: avoid loops, use matrices, and try to use the built-in functions whenever possible (even if they are just implemented as a loop, it saves you typing out the same thing again and again).
EDIT: The following is an example piece of code that achieves the task in your updated question, both with loops and with the built-in matrix operators:
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
//#include <opencv2/gpu/gpu.hpp>
using namespace cv;
#include <iostream>
using namespace std;
int main(int argc, char** argv)
{
// Load Image
Mat matImage = imread("Test.png");
// Convert to Grayscale
Mat matGray(matImage.rows, matImage.cols, CV_8UC1);
cvtColor(matImage, matGray, CV_BGR2GRAY);
double time, dThreshold = 50.0;
//int TIMES = 1000;
//namedWindow("Display", WINDOW_NORMAL);
//char chKey;
//imshow("Display", matGray);
//chKey = '\0'; while (chKey != '') chKey = waitKey(0);
//----------------------------- Loop Method -------------------------------
time = (double) getTickCount();
//for (int k = 0; k < TIMES; k++)
//{
Mat matLoop = matGray.clone();
for (int i = 0; i < matLoop.rows; ++i)
{
for (int j = 0; j < matLoop.cols; ++j)
{
uchar& unValue = matLoop.at<uchar>(i, j);
if (unValue < dThreshold)
unValue = pow(unValue, 2);
else
unValue--;
}
}
//}
time = 1000*((double)getTickCount() - time)/getTickFrequency();
//time /= TIMES;
cout << "Loop Method Time: " << time << " milliseconds." << endl;
//imshow("Display", matLoop);
//chKey = '\0'; while (chKey != '') chKey = waitKey(0);
//---------------------------- Matrix Method ------------------------------
time = (double) getTickCount();
//for (int i = 0; i < TIMES; i++)
//{
Mat matMask, matMatrix;
matMask = matGray < dThreshold;
bitwise_and(matGray, matMask, matMatrix);
pow(matMatrix, 2.0, matMatrix);
subtract(matGray, 1.0, matMatrix, ~matMask);
//}
time = 1000*((double)getTickCount() - time)/getTickFrequency();
//time /= TIMES;
cout << "Matrix Method Time: " << time << " milliseconds." << endl;
//imshow("Display", matMatrix);
//chKey = '\0'; while (chKey != '') chKey = waitKey(0);
return 0;
}
As well as reducing the number of lines of code you need to type from 12 to 5, the matrix method is also faster. Enabling the timing loops (with TIMES = 1000;), I get the following times for a medium sized image:
Loop Method Time: 9.19669 milliseconds.
Matrix Method Time: 2.82657 milliseconds.
With the gpu module I am sure that you could reduce the second time further, but unfortunately I don't have a suitable graphics card attached to my current system.