Opencv weird speed performance of convertTo and LUT - c++
Environment
OpenCV 4.4.0
CentOS 7.2 docker
gcc 7.3.1
What I want to do
I'm doing some CV deep-learning deployment optimization. Some of my models need their input to be normalized, so the preprocessing normalization became an optimization point. Pixel values are in [0, 255], so the first approach is multiplying each by 1 / 255.0 — that is my first method. After some googling I found LUT, which theoretically should be faster than floating-point multiplication. So I wrote the code below to test the two methods:
Test code
#include <dirent.h>

#include <chrono>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

#include "opencv2/imgcodecs/imgcodecs.hpp"
#include "opencv2/imgproc/imgproc.hpp"
// Collect the names of regular files in `path` whose extension equals
// `suffix` (e.g. ".jpg") into `files`.
//
// Returns the number of directory entries scanned (including ".", "..",
// subdirectories and non-matching files), or 0 if the directory could not
// be opened. Callers wanting the number of matches should use files.size().
int getFiles(const std::string path, std::vector<std::string>& files, std::string suffix)
{
    int iFileCnt = 0;
    DIR* dirptr = NULL;
    struct dirent* dirp;
    if ((dirptr = opendir(path.c_str())) == NULL) {
        return 0;
    }
    while ((dirp = readdir(dirptr)) != NULL) {
        if (dirp->d_type == DT_REG) {
            // Use strrchr so the LAST '.' is matched ("a.b.jpg" compares
            // ".jpg"), and guard against names with no '.' at all:
            // strchr/strrchr return NULL there, and the original code fed
            // that NULL straight into strcmp (undefined behavior).
            const char* ext = strrchr(dirp->d_name, '.');
            if (ext != NULL && strcmp(ext, suffix.c_str()) == 0) {
                files.push_back(dirp->d_name);
            }
        }
        ++iFileCnt;
    }
    closedir(dirptr);
    return iFileCnt;
}
int main(int argc, char* argv[])
{
std::string pic_dir = argv[1];
int loop_count = 10;
if (argc >= 3) {
loop_count = std::stoi(argv[2]);
}
float FACTOR = 1 / 255.0;
std::vector<cv::Size> sizes = {
{299, 299},
{416, 416},
{512, 512},
{640, 640},
{960, 540},
{1920, 1080}
};
// std::vector<cv::Size> sizes = {
// {1920, 1080},
// {960, 540},
// {640, 640},
// {512, 512},
// {416, 416},
// {299, 299}
// };
cv::Mat table(1, 256, CV_32FC1);
auto ptr = table.ptr<float>(0);
for (int i = 0; i < 256; ++i) {
ptr[i] = float(i) * FACTOR;
}
std::vector<std::string> pic_files;
getFiles(pic_dir, pic_files, ".jpg");
std::vector<cv::Mat> image_mats(pic_files.size());
for (int i = 0; i < pic_files.size(); ++i) {
std::string one_pic_path = pic_dir + "/" + pic_files[i];
image_mats[i] = cv::imread(one_pic_path);
}
for (auto& one_size : sizes) {
std::cout << "size: " << one_size << std::endl;
double time_1 = 0;
double time_2 = 0;
for (auto& one_mat : image_mats) {
cv::Mat tmp_image;
cv::resize(one_mat, tmp_image, one_size);
for (int i = 0; i < loop_count; ++i) {
auto t_1_1 = std::chrono::steady_clock::now();
cv::Mat out_1;
tmp_image.convertTo(out_1, CV_32FC3, FACTOR);
auto t_1_2 = std::chrono::steady_clock::now();
time_1 += std::chrono::duration<double, std::milli>(t_1_2 - t_1_1).count();
auto t_2_1 = std::chrono::steady_clock::now();
cv::Mat out_2;
cv::LUT(tmp_image, table, out_2);
auto t_2_2 = std::chrono::steady_clock::now();
time_2 += std::chrono::duration<double, std::milli>(t_2_2 - t_2_1).count();
auto diff = cv::sum(out_1 - out_2);
if (diff[0] > 1E-3) {
std::cout << diff << std::endl;
}
}
}
size_t count = loop_count * image_mats.size();
auto average_time_1 = time_1 / count;
auto average_time_2 = time_2 / count;
auto promote_percent = (average_time_1 - average_time_2) / average_time_1 * 100;
printf("total pic num: %d, loop %d times\n", pic_files.size(), loop_count);
printf("method_1, total %f ms, average %f ms\n", time_1, average_time_1);
printf("method_2, total %f ms, average %f ms, promote: %.2f%\n", time_2, average_time_2,
promote_percent);
printf("\n");
}
return 0;
}
Weird performance
What I want to test is the speed difference between the two methods at different input sizes, while verifying that their outputs are equal. I used 128 pictures of various sizes for the test. Here are the puzzling results:
1. result of the code above
size: [299 x 299]
total pic num: 128, loop 10 times
method_1, total 38.872174 ms, average 0.030369 ms
method_2, total 330.688332 ms, average 0.258350 ms, promote: -750.71%
size: [416 x 416]
total pic num: 128, loop 10 times
method_1, total 103.708926 ms, average 0.081023 ms
method_2, total 689.972421 ms, average 0.539041 ms, promote: -565.30%
size: [512 x 512]
total pic num: 128, loop 10 times
method_1, total 267.989430 ms, average 0.209367 ms
method_2, total 450.809036 ms, average 0.352195 ms, promote: -68.22%
size: [640 x 640]
total pic num: 128, loop 10 times
method_1, total 757.269510 ms, average 0.591617 ms
method_2, total 551.951118 ms, average 0.431212 ms, promote: 27.11%
size: [960 x 540]
total pic num: 128, loop 10 times
method_1, total 1095.167540 ms, average 0.855600 ms
method_2, total 760.330269 ms, average 0.594008 ms, promote: 30.57%
size: [1920 x 1080]
total pic num: 128, loop 10 times
method_1, total 4944.142104 ms, average 3.862611 ms
method_2, total 3471.176202 ms, average 2.711856 ms, promote: 29.79%
2. comment the diff part:
//auto diff = cv::sum(out_1 - out_2);
//if (diff[0] > 1E-3) {
// std::cout << diff << std::endl;
//}
size: [299 x 299]
total pic num: 128, loop 10 times
method_1, total 246.356823 ms, average 0.192466 ms
method_2, total 361.859598 ms, average 0.282703 ms, promote: -46.88%
size: [416 x 416]
total pic num: 128, loop 10 times
method_1, total 516.542233 ms, average 0.403549 ms
method_2, total 719.191240 ms, average 0.561868 ms, promote: -39.23%
size: [512 x 512]
total pic num: 128, loop 10 times
method_1, total 839.599260 ms, average 0.655937 ms
method_2, total 342.608080 ms, average 0.267663 ms, promote: 59.19%
size: [640 x 640]
total pic num: 128, loop 10 times
method_1, total 1384.348467 ms, average 1.081522 ms
method_2, total 524.382672 ms, average 0.409674 ms, promote: 62.12%
size: [960 x 540]
total pic num: 128, loop 10 times
method_1, total 1796.153597 ms, average 1.403245 ms
method_2, total 688.210851 ms, average 0.537665 ms, promote: 61.68%
size: [1920 x 1080]
total pic num: 128, loop 10 times
method_1, total 7707.945924 ms, average 6.021833 ms
method_2, total 3812.262622 ms, average 2.978330 ms, promote: 50.54%
3. Uncomment the diff part but reverse the sizes vector
std::vector<cv::Size> sizes = {
{1920, 1080},
{960, 540},
{640, 640},
{512, 512},
{416, 416},
{299, 299}
};
...
auto diff = cv::sum(out_1 - out_2);
if (diff[0] > 1E-3) {
std::cout << diff << std::endl;
}
size: [1920 x 1080]
total pic num: 128, loop 10 times
method_1, total 4933.384896 ms, average 3.854207 ms
method_2, total 3563.611341 ms, average 2.784071 ms, promote: 27.77%
size: [960 x 540]
total pic num: 128, loop 10 times
method_1, total 887.353187 ms, average 0.693245 ms
method_2, total 917.995079 ms, average 0.717184 ms, promote: -3.45%
size: [640 x 640]
total pic num: 128, loop 10 times
method_1, total 492.562282 ms, average 0.384814 ms
method_2, total 525.089826 ms, average 0.410226 ms, promote: -6.60%
size: [512 x 512]
total pic num: 128, loop 10 times
method_1, total 181.900041 ms, average 0.142109 ms
method_2, total 159.691528 ms, average 0.124759 ms, promote: 12.21%
size: [416 x 416]
total pic num: 128, loop 10 times
method_1, total 77.030586 ms, average 0.060180 ms
method_2, total 221.307936 ms, average 0.172897 ms, promote: -187.30%
size: [299 x 299]
total pic num: 128, loop 10 times
method_1, total 38.139366 ms, average 0.029796 ms
method_2, total 112.203023 ms, average 0.087659 ms, promote: -194.19%
4. Comment the diff part and reverse the sizes vector
std::vector<cv::Size> sizes = {
{1920, 1080},
{960, 540},
{640, 640},
{512, 512},
{416, 416},
{299, 299}
};
...
//auto diff = cv::sum(out_1 - out_2);
//if (diff[0] > 1E-3) {
// std::cout << diff << std::endl;
//}
size: [1920 x 1080]
total pic num: 128, loop 10 times
method_1, total 8021.875493 ms, average 6.267090 ms
method_2, total 3849.222334 ms, average 3.007205 ms, promote: 52.02%
size: [960 x 540]
total pic num: 128, loop 10 times
method_1, total 605.553580 ms, average 0.473089 ms
method_2, total 477.145896 ms, average 0.372770 ms, promote: 21.21%
size: [640 x 640]
total pic num: 128, loop 10 times
method_1, total 268.076975 ms, average 0.209435 ms
method_2, total 169.015667 ms, average 0.132043 ms, promote: 36.95%
size: [512 x 512]
total pic num: 128, loop 10 times
method_1, total 117.419851 ms, average 0.091734 ms
method_2, total 94.436479 ms, average 0.073778 ms, promote: 19.57%
size: [416 x 416]
total pic num: 128, loop 10 times
method_1, total 73.963177 ms, average 0.057784 ms
method_2, total 221.397616 ms, average 0.172967 ms, promote: -199.33%
size: [299 x 299]
total pic num: 128, loop 10 times
method_1, total 38.046131 ms, average 0.029724 ms
method_2, total 113.839007 ms, average 0.088937 ms, promote: -199.21%
Question
I know the CPU's working state may fluctuate, so timings will not be exactly reproducible. But why does code outside the timed sections have such an influence on the measured speed of convertTo and LUT?
Related
c++ Changing from second into days,hours.minutes and second
cout << "Please enter the elapsed time in seconds or (0 to end the program): " ; cin >> totalSeconds; days = totalSeconds / 86400 % 60; hours = totalSeconds/ 3600 % 60; minutes = totalSeconds/ 60 % 60; seconds = totalSeconds % 60; The small number works fine but once I start using bigger number it's not working anyone knows why? Here are the logs : Please enter the elapsed time in seconds or (0 to end the program): 62 The equivalent time of 62 seconds in days:hours:minutes:seconds is: :0:0:1:2 Please enter the elapsed time in seconds or (0 to end the program): 9630 The equivalent time of 9630 seconds in days:hours:minutes:seconds is: :0:2:40:30 Please enter the elapsed time in seconds or (0 to end the program): 216000 The equivalent time of 216000 seconds in days:hours:minutes:seconds is: :2:0:0:0 both the 62 and 9630 second works fine but not the 216000s
Let's start with your formula for days: days = totalSeconds / 86400 % 60; If totalSeconds is 5270400 (61 days), that equation will compute 1 for days. That's probably not your only bug. Your modulus parameter of 60 on the right of the % doesn't appear correct or it's not needed as you are using it. This is probably what you want without being clever. days = totalSeconds / 86400; totalSeconds = totalSeconds % 86400; hours = totalSeconds / 3600; totalSeconds = totalSeconds % 3600; minutes = totalSeconds / 60; totalSeconds = totalSeconds % 60; seconds = totalSeconds;
Huge latency spikes while running simple code
I have a simple benchmark that demonstrates performance of busywait threads. It runs in two modes: first one simply gets two timepoints sequentially, second one iterates through vector and measures duration of an iteration. I see that two sequential calls of clock::now() takes about 50 nanoseconds on the average and one average iteration through vector takes about 100 nanoseconds. But sometimes these operations are executed with a huge delay: about 50 microseconds in the first case and 10 milliseconds (!) in the second case. Test runs on single isolated core so context switches do not occur. I also call mlockall in beginning of the program so I assume that page faults do not affect the performance. Following additional optimizations were also applied: kernel boot parameters: intel_idle.max_cstate=0 idle=halt irqaffinity=0,14 isolcpus=4-13,16-27 pti=off spectre_v2=off audit=0 selinux=0 nmi_watchdog=0 nosoftlockup=0 rcu_nocb_poll rcu_nocbs=19-20 nohz_full=19-20; rcu[^c] kernel threads moved to a housekeeping CPU core 0; network card RxTx queues moved to a housekeeping CPU core 0; writeback kernel workqueue moved to a housekeeping CPU core 0; transparent_hugepage disabled; Intel CPU HyperThreading disabled; swap file/partition is not used. 
Environment: System details: Default Archlinux kernel: 5.1.9-arch1-1-ARCH #1 SMP PREEMPT Tue Jun 11 16:18:09 UTC 2019 x86_64 GNU/Linux that has following PREEMPT and HZ settings: CONFIG_HZ_300=y CONFIG_HZ=300 CONFIG_PREEMPT=y Hardware details: RAM: 256GB CPU(s): 28 On-line CPU(s) list: 0-27 Thread(s) per core: 1 Core(s) per socket: 14 Socket(s): 2 NUMA node(s): 2 Vendor ID: GenuineIntel CPU family: 6 Model: 79 Model name: Intel(R) Xeon(R) CPU E5-2690 v4 # 2.60GHz Stepping: 1 CPU MHz: 3200.011 CPU max MHz: 3500.0000 CPU min MHz: 1200.0000 BogoMIPS: 5202.68 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 35840K NUMA node0 CPU(s): 0-13 NUMA node1 CPU(s): 14-27 Example code: struct TData { std::vector<char> Data; TData() = default; TData(size_t aSize) { for (size_t i = 0; i < aSize; ++i) { Data.push_back(i); } } }; using TBuffer = std::vector<TData>; TData DoMemoryOperation(bool aPerform, const TBuffer& aBuffer, size_t& outBufferIndex) { if (!aPerform) { return TData {}; } const TData& result = aBuffer[outBufferIndex]; if (++outBufferIndex == aBuffer.size()) { outBufferIndex = 0; } return result; } void WarmUp(size_t aCyclesCount, bool aPerform, const TBuffer& aBuffer) { size_t bufferIndex = 0; for (size_t i = 0; i < aCyclesCount; ++i) { auto data = DoMemoryOperation(aPerform, aBuffer, bufferIndex); } } void TestCycle(size_t aCyclesCount, bool aPerform, const TBuffer& aBuffer, Measurings& outStatistics) { size_t bufferIndex = 0; for (size_t i = 0; i < aCyclesCount; ++i) { auto t1 = std::chrono::steady_clock::now(); { auto data = DoMemoryOperation(aPerform, aBuffer, bufferIndex); } auto t2 = std::chrono::steady_clock::now(); auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count(); outStatistics.AddMeasuring(diff, t2); } } int Run(int aCpu, size_t aDataSize, size_t aBufferSize, size_t aCyclesCount, bool aAllocate, bool aPerform) { if (mlockall(MCL_CURRENT | MCL_FUTURE)) { throw std::runtime_error("mlockall failed"); } 
std::cout << "Test parameters" << ":\ndata size=" << aDataSize << ",\nnumber of elements=" << aBufferSize << ",\nbuffer size=" << aBufferSize * aDataSize << ",\nnumber of cycles=" << aCyclesCount << ",\nallocate=" << aAllocate << ",\nperform=" << aPerform << ",\nthread "; SetCpuAffinity(aCpu); TBuffer buffer; if (aPerform) { buffer.resize(aBufferSize); std::fill(buffer.begin(), buffer.end(), TData { aDataSize }); } WaitForKey(); std::cout << "Running..."<< std::endl; WarmUp(aBufferSize * 2, aPerform, buffer); Measurings statistics; TestCycle(aCyclesCount, aPerform, buffer, statistics); statistics.Print(aCyclesCount); WaitForKey(); if (munlockall()) { throw std::runtime_error("munlockall failed"); } return 0; } And following results are received: First: StandaloneTests --run_test=MemoryAccessDelay --cpu=19 --data-size=280 --size=67108864 --count=1000000000 --allocate=1 --perform=0 Test parameters: data size=280, number of elements=67108864, buffer size=18790481920, number of cycles=1000000000, allocate=1, perform=0, thread 14056 on cpu 19 Statistics: min: 16: max: 18985: avg: 18 0 - 10 : 0 (0 %): - 10 - 100 : 999993494 (99 %): min: 40: max: 117130: avg: 40 100 - 1000 : 946 (0 %): min: 380: max: 506236837: avg: 43056598 1000 - 10000 : 5549 (0 %): min: 56876: max: 70001739: avg: 7341862 10000 - 18985 : 11 (0 %): min: 1973150818: max: 14060001546: avg: 3644216650 Second: StandaloneTests --run_test=MemoryAccessDelay --cpu=19 --data-size=280 --size=67108864 --count=1000000000 --allocate=1 --perform=1 Test parameters: data size=280, number of elements=67108864, buffer size=18790481920, number of cycles=1000000000, allocate=1, perform=1, thread 3264 on cpu 19 Statistics: min: 36: max: 4967479: avg: 48 0 - 10 : 0 (0 %): - 10 - 100 : 964323921 (96 %): min: 60: max: 4968567: avg: 74 100 - 1000 : 35661548 (3 %): min: 122: max: 4972632: avg: 2023 1000 - 10000 : 14320 (0 %): min: 1721: max: 33335158: avg: 5039338 10000 - 100000 : 130 (0 %): min: 10010533: max: 1793333832: avg: 
541179510 100000 - 1000000 : 0 (0 %): - 1000000 - 4967479 : 81 (0 %): min: 508197829: max: 2456672083: avg: 878824867 Any ideas what is the reason of such huge delays and how it may be investigated?
In: TData DoMemoryOperation(bool aPerform, const TBuffer& aBuffer, size_t& outBufferIndex); It returns a std::vector<char> by value. That involves a memory allocation and data copying. The memory allocations can do a syscall (brk or mmap) and memory mappings related syscalls are notorious for being slow. When timings include syscalls one cannot expect low variance. You may like to run your application with /usr/bin/time --verbose <app> or perf -ddd <app> to see the number of page faults and context switches.
How to get hours value greater than 24 hours in python 2.7?
time1 = timedelta(days=2, hours=6.20) time2 = timedelta(hours=20.10) sum_time = time1 + time2 print str(sum_time) print sum_time.total_seconds() / 3600 Output: 3 days, 2:18:00 74.3 How to get output 74:18:00 ?
With total_Seconds / 3600 you only get the hours in decimal format. You can use divmod to break down the seconds into full hours, minutes and seconds: divmod(a, b) Take two (non complex) numbers as arguments and return a pair of numbers consisting of their quotient and remainder when using long division The code would look like this (I added 34 seconds in time2 to check if the seconds part is correct): from datetime import timedelta time1 = timedelta(days=2, hours=6.20) time2 = timedelta(hours=20.10, seconds=34) sum_time = time1 + time2 print str(sum_time) hours, seconds = divmod(sum_time.total_seconds(), 3600) minutes, seconds = divmod(seconds, 60) print "%d:%02d:%02d" % (hours, minutes, seconds) and the output will be: 3 days, 2:18:34 74:18:34 The result of the first divmod is 74 hours (quotient) and a remainder of 1114 seconds. The second divmod is feeded with the remaining seconds from the line before (1114) and gives a result of 18 minutes and 34 seconds.
Sorting algorithm timing discrepancies between g++ and Visual Studio?
So I have implemented a heap, merge, and quicksort. I time the three sorts all the same way. double DiffClocks(clock_t clock1, clock_t clock2){ double diffticks = clock1 - clock2; double diffsecs = diffticks / CLOCKS_PER_SEC; return diffsecs; } Then with each sort, I time them the same way. Just repeated for each different sort. void heapsort(int myArray[], int n){ clock_t begin, end; begin = clock(); heapSortMain(myArray, n); end = clock(); double elapsedTime = heapDiffClocks(end, begin); std::cout << '\t' << elapsedTime; } All three of the sorts are working. I have a function that verifies the arrays are sorted after executing each sort. My question is, why do I have such a big difference between the timing when running on g++ and on Visual Studio? My output from Visual Studio 2012: n Heap Merge Quick 100 0 0 0 1000 0 0 0 10000 0.01 0 0 100000 0.14 0.02 0.03 1000000 1.787 0.22 0.33 10000000 24.116 2.475 6.956 My output from g++ 4.7.2 n Heap Merge Quick 100 0 0 0 1000 0 0 0 10000 0 0 0.01 100000 0.05 0.02 0.02 1000000 0.59 0.33 0.29 10000000 10.78 3.79 3.3 I used a standard bubbleDown and swap implementation with heap. A recursive mergesort with a merge to merge the two sorted subarrays. A recursive quicksort with a median of 3 pivot and partition function. I have always understood quicksort to be the fastest general sorting algorithm. On VS it really lags behind merge, and heap just goes up quickly when I hit 10 million on VS.
Clamping to "easy" numbers
So I'm trying to make a graphing application, and I'm using Desmos as a base for that. The thing I'm struggling with is the way Desmos handles the subdivisions of the axes. When you zoom in or out the scales are always on "easy" simple numbers like 5, 100, 1000 etc. So my question is: how does one go about simplifying their scale with any level of zoom? BTW: Using C++
I was going to write a description of how to do this in general, but then I realize that the code may be easier than explaining. Most important step: define precisely what you mean by "easy simple" numbers. Example #1: 1, 2, 4, 8, 16, 32, 64, 128, ... , 1073741824, ... These are powers of two. So, a straightforward ceil(log(x)/log(2.0)) will solve it. Example #2: 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, ... There is a mixture of powers of two, and some multiples of it. Let's take a closer look. A subset of these can be described as powers of ten. Changing the formula to ceil(log(x)/log(10.0)) will solve it. For each power-of-ten, its multiples by 2.0 and 5.0 are also "easy simple numbers". Inside each iteration, after checking the power-of-ten value, also check the two multiples. If it fits inside one of the multiple, that value can be returned as result. Code The following code is only meant to explain the concept. It is not efficient - an efficient version should have used logarithm to get the result in O(1) time. #include <iostream> #include <vector> #include <limits> #include <stdexcept> #include <algorithm> using namespace std; double getNiceAxisLength(double value, double baseLength, double step, const std::vector<double>& subSteps) { typedef std::vector<double>::const_iterator VecDoubleIter; if (value < 0.0) { throw std::invalid_argument("Error: value must be non-negative. Take absolute value if necessary."); } if (baseLength <= 0.0) { throw std::invalid_argument("Error: baseLength must be positive."); } if (step <= 1.0) { throw std::invalid_argument("Error: step must be strictly greater than 1."); } for (VecDoubleIter iter = subSteps.begin(); iter != subSteps.end(); ++iter) { double subStep = *iter; if (subStep <= 1.0 || subStep >= step) { throw std::invalid_argument("Error: each subStep must be strictly greater than 1, and strictly smaller than step."); } } // make ascending. 
std::vector<double> sortedSubSteps(subSteps.begin(), subSteps.end()); std::sort(sortedSubSteps.begin(), sortedSubSteps.end()); if (value <= baseLength) { return baseLength; } double length = baseLength; double terminateLength = numeric_limits<double>::max() / step; while (length < terminateLength) { for (VecDoubleIter iter = sortedSubSteps.begin(); iter != sortedSubSteps.end(); ++iter) { double subStep = *iter; if (value <= length * subStep) { return (length * subStep); } } double nextLength = length * step; if (value <= nextLength) { return nextLength; } length = nextLength; } return baseLength; } int main() { double baseLength = 1.0; double step = 10.0; std::vector<double> subSteps; subSteps.push_back(2.5); subSteps.push_back(5); for (int k = 0; k < 1000; k += ((k >> 2) + 1)) { double value = k; double result = getNiceAxisLength(value, baseLength, step, subSteps); cout << "k: " << value << " result: " << result << endl; } cout << "Hello world!" << endl; return 0; } Output k: 0 result: 1 k: 1 result: 1 k: 2 result: 2.5 k: 3 result: 5 k: 4 result: 5 k: 6 result: 10 k: 8 result: 10 k: 11 result: 25 k: 14 result: 25 k: 18 result: 25 k: 23 result: 25 k: 29 result: 50 k: 37 result: 50 k: 47 result: 50 k: 59 result: 100 k: 74 result: 100 k: 93 result: 100 k: 117 result: 250 k: 147 result: 250 k: 184 result: 250 k: 231 result: 250 k: 289 result: 500 k: 362 result: 500 k: 453 result: 500 k: 567 result: 1000 k: 709 result: 1000 k: 887 result: 1000 Hello world! Hello world!