I was curious about vector lookup vs map lookup and wrote a little test program for it. It seems like vector is always faster the way I'm using it. Is there something else I should take into consideration here? Is the test biased in any way? The results of a run are at the bottom. They are in nanoseconds, but gcc doesn't seem to support that resolution on my platform.
Using string for the lookup would of course change things a lot.
The compile line I'm using is this: g++ -O3 --std=c++0x -o lookup lookup.cpp
#include <iostream>
#include <vector>
#include <map>
#include <unordered_map>
#include <chrono>
#include <algorithm>
// Global side-effect sink: incremented by every func() call so the
// compiler cannot optimize the lookup loops away entirely.
unsigned dummy = 0;

/// Element type stored in the vector; carries the id used as the lookup key.
class A
{
public:
    A(unsigned id) : m_id(id){}

    /// Lookup key. Marked const so it can be called through const
    /// references/elements (backward compatible with all existing callers).
    unsigned id() const { return m_id; }

    /// Dummy payload: touches the global counter so that a successful
    /// lookup has an observable side effect.
    void func()
    {
        //making sure its not optimized away
        dummy++;
    }

private:
    unsigned m_id;
};
// Mapped value type stored in both maps. Carries no key of its own;
// the containers supply the key.
class B
{
public:
// Dummy payload: bumps the global counter so a map lookup followed by
// func() has an observable side effect and is not optimized away.
void func()
{
//making sure its not optimized away
dummy++;
}
};
// Benchmark driver: times lookup of every id (0..N-1) in a std::vector
// (linear find_if), a std::unordered_map and a std::map, for element
// counts 10 .. 10,000,000, and prints the totals in nanoseconds.
int main()
{
    std::vector<A> v;
    std::unordered_map<unsigned, B> u;
    std::map<unsigned, B> m;
    unsigned elementCount = 1;

    // Total lookup times (nanoseconds) per element count.
    struct Times
    {
        unsigned long long v;
        unsigned long long u;
        unsigned long long m;
    };
    std::map<unsigned, Times> timesMap;

    while(elementCount != 10000000)
    {
        elementCount *= 10;
        // NOTE(review): the containers are never cleared between passes, so
        // the vector accumulates duplicate ids from earlier passes (find_if
        // still matches the first occurrence) and the maps silently skip
        // re-inserted keys.
        for(unsigned i = 0; i < elementCount; ++i)
        {
            v.emplace_back(A(i));
            u.insert(std::make_pair(i, B()));
            m.insert(std::make_pair(i, B()));
        }

        // Fixed: 'start' was declared as time_point<steady_clock> but
        // initialized from high_resolution_clock::now(); that only compiles
        // on platforms where high_resolution_clock is an alias of
        // steady_clock. 'auto' keeps the clock consistent and portable.
        auto start = std::chrono::high_resolution_clock::now();
        // linear scan for each id in turn
        for(unsigned i = 0; i < elementCount; ++i)
        {
            auto findItr = std::find_if(std::begin(v), std::end(v),
                [&i](A & a){ return a.id() == i; });
            findItr->func();
        }
        auto tp0 = std::chrono::high_resolution_clock::now() - start;
        unsigned long long vTime = std::chrono::duration_cast<std::chrono::nanoseconds>(tp0).count();

        start = std::chrono::high_resolution_clock::now();
        // hash lookup; every key exists so operator[] never inserts here
        for(unsigned i = 0; i < elementCount; ++i)
        {
            u[i].func();
        }
        auto tp1 = std::chrono::high_resolution_clock::now() - start;
        unsigned long long uTime = std::chrono::duration_cast<std::chrono::nanoseconds>(tp1).count();

        start = std::chrono::high_resolution_clock::now();
        // balanced-tree lookup
        for(unsigned i = 0; i < elementCount; ++i)
        {
            m[i].func();
        }
        auto tp2 = std::chrono::high_resolution_clock::now() - start;
        unsigned long long mTime = std::chrono::duration_cast<std::chrono::nanoseconds>(tp2).count();

        timesMap.insert(std::make_pair(elementCount ,Times{vTime, uTime, mTime}));
    }

    for(auto & itr : timesMap)
    {
        std::cout << "Element count: " << itr.first << std::endl;
        std::cout << "std::vector time: " << itr.second.v << std::endl;
        std::cout << "std::unordered_map time: " << itr.second.u << std::endl;
        std::cout << "std::map time: " << itr.second.m << std::endl;
        std::cout << "-----------------------------------" << std::endl;
    }

    // print the side-effect counter so the lookups cannot be elided
    std::cout << dummy;
}
./lookup
Element count: 10
std::vector time: 0
std::unordered_map time: 0
std::map time: 1000
-----------------------------------
Element count: 100
std::vector time: 0
std::unordered_map time: 3000
std::map time: 13000
-----------------------------------
Element count: 1000
std::vector time: 2000
std::unordered_map time: 29000
std::map time: 138000
-----------------------------------
Element count: 10000
std::vector time: 22000
std::unordered_map time: 287000
std::map time: 1610000
-----------------------------------
Element count: 100000
std::vector time: 72000
std::unordered_map time: 1539000
std::map time: 8994000
-----------------------------------
Element count: 1000000
std::vector time: 746000
std::unordered_map time: 12654000
std::map time: 154060000
-----------------------------------
Element count: 10000000
std::vector time: 8001000
std::unordered_map time: 123608000
std::map time: 2279362000
-----------------------------------
33333330
I'm not at all shocked the vector tested better than anything else. The asm code for it (actual disassembly) breaks down to this (on my Apple LLVM 4.2 at full opt):
0x100001205: callq 0x100002696 ; symbol stub for: std::__1::chrono::steady_clock::now()
0x10000120a: testl %r13d, %r13d
0x10000120d: leaq -272(%rbp), %rbx
0x100001214: je 0x100001224 ; main + 328 at main.cpp:78
0x100001216: imull $10, %r14d, %ecx
0x10000121a: incl 7896(%rip) ; dummy
0x100001220: decl %ecx
0x100001222: jne 0x10000121a ; main + 318 [inlined] A::func() at main.cpp:83
main + 318 at main.cpp:83
0x100001224: movq %rax, -280(%rbp)
0x10000122b: callq 0x100002696 ; symbol stub for: std::__1::chrono::
Note the 'loop' (the jne 0x10000121a). The "find_if" has been completely optimized out, and the result is effectively a sweep over the array with a decrementing register to count how many times to increment the global. Thats all that is being done; there is no searching of any kind undergone in this.
So yeah, it's how you're using it.
First, you don't seem to clear your containers between tests. So they don't contain what you think they do.
Second, according to your times, your vector exhibits linear time, which is something that just can't be, as complexity is O(N*N) in your algorithm. Probably it WAS optimized away. Instead of trying to combat optimization, I would suggest just turning it off.
Third, your values are too predictable for a vector. This can impact it dramatically. Try random values (or a random_shuffle())
Related
I wrote a program which uses std::thread::hardware_concurrency to get how many threads my computer can support. Then I divide the size of the array by N to get N blocks, and I create N threads to calculate the sum of each block. Here is the code:
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <numeric>
#include <thread>
#include <vector>
#include <stdlib.h>
// Worst-case (maximum) time, in microseconds, spent inside any single
// accumulate_block call. NOTE(review): this is written concurrently by the
// worker threads without synchronization -- a data race; it should be a
// std::atomic (or guarded by a mutex) for the max-update below to be safe.
int64_t thread_cost_time = 0;
// Functor run by each worker thread: accumulates [first, last) into
// `result` and records how long that accumulation took.
template <typename Iterator, typename T> struct accumulate_block {
void operator()(Iterator first, Iterator last, T &result) {
using namespace std::chrono;
auto start = std::chrono::high_resolution_clock::now();
result = std::accumulate(first, last, result);
auto stop = std::chrono::high_resolution_clock::now();
auto thread_time =
std::chrono::duration_cast<microseconds>(stop - start).count();
// keep the slowest block's duration (racy read-modify-write, see above)
thread_cost_time = std::max(thread_time, thread_cost_time);
}
};
/// Sums [first, last) with `num` threads and returns init + total.
/// The first num-1 blocks (of length/num elements each) run on worker
/// threads; the calling thread sums the final block, which also absorbs
/// any remainder when num does not divide the length evenly.
/// @param first,last  range to accumulate
/// @param init        starting value added to the combined partial sums
/// @param num         number of threads to use (must be >= 1)
template <typename Iterator, typename T>
T parallel_accumulate(Iterator first, Iterator last, T &init, uint64_t num) {
  uint64_t length = std::distance(first, last);
  // The thread count comes straight from the caller; the usual heuristic
  // (clamping hardware_concurrency() by a minimum batch size) is left out
  // on purpose so the benchmark can sweep arbitrary counts.
  const uint64_t num_threads = num;
  const uint64_t block_size = length / num_threads;
  std::vector<T> results(num_threads);
  std::vector<std::thread> threads(num_threads - 1);
  Iterator block_start = first;
  for (uint64_t i = 0; i < num_threads - 1; i++) {
    Iterator block_end = block_start;
    std::advance(block_end, block_size);
    // each worker sums one block into its own slot of `results`
    threads[i] = std::thread{accumulate_block<Iterator, T>(), block_start,
                             block_end, std::ref(results[i])};
    block_start = block_end;
  }
  // the calling thread handles the final block (including any remainder)
  accumulate_block<Iterator, T>()(block_start, last, results[num_threads - 1]);
  std::for_each(threads.begin(), threads.end(),
                std::mem_fn(&std::thread::join));
  return std::accumulate(results.begin(), results.end(), init);
}
// Benchmark driver: for thread counts 2..31, compares parallel_accumulate
// against a single-threaded std::accumulate over 100,000,000 elements,
// averaging 10 iterations per thread count.
int main(int argc, char *argv[]) {
// constexpr const uint64_t sz = 1000000000;
for (int number = 2; number < 32; number++) {
int64_t parr = 0;
int64_t single = 0;
int64_t thread_trivial = 0;
std::cout
<< "--------------------------------------------------------------"
<< std::endl;
std::cout << "---------------------thread: " << number
<< "-----------------------" << std::endl;
int iter_times = 10;
for (int iter = 0; iter < iter_times; iter++) {
thread_cost_time = 0;
constexpr const uint64_t sz = 100000000 ;
// NOTE(review): the 100M-element array is rebuilt on every iteration;
// hoisting it out of the loop would shorten runs without affecting the
// timed sections.
std::vector<uint64_t> arr;
for (uint32_t i = 0; i < sz; i++) {
arr.emplace_back(i);
}
using namespace std::chrono;
auto start = std::chrono::high_resolution_clock::now();
uint64_t init = 0;
parallel_accumulate<decltype(arr.begin()), uint64_t>(
arr.begin(), arr.end(), std::ref(init), number);
auto stop = std::chrono::high_resolution_clock::now();
parr += std::chrono::duration_cast<microseconds>(stop - start).count();
// rough thread create/destroy overhead: wall time minus the slowest
// worker's in-thread accumulate time
thread_trivial +=
std::chrono::duration_cast<microseconds>(stop - start).count() -
thread_cost_time;
uint64_t init_ = 0;
uint64_t arr_sz = arr.size();
// uint64_t block_sz = arr.size() / 2;
start = std::chrono::high_resolution_clock::now();
// NOTE(review): the return value is discarded, so an optimizing build may
// remove this accumulate entirely; assign it (init_ = std::accumulate(...))
// and print/use it to keep the single-thread baseline honest.
std::accumulate(arr.begin(), arr.end(), init_);
// std::cout << init_ << std::endl;
stop = std::chrono::high_resolution_clock::now();
single += std::chrono::duration_cast<microseconds>(stop - start).count();
}
std::cout << "parallel " << parr / iter_times<< std::endl;
std::cout << "single thread " << single / iter_times<< std::endl;
std::cout << "parr is "
<< static_cast<double>(single) / static_cast<double>(parr)
<< "X fast" << std::endl;
std::cout << "thread create and destory time " << thread_trivial / iter_times
<< std::endl;
}
}
I record the time of multithread and single thread.
I can only achieve at most 6.57x faster than using only one thread, even though std::thread::hardware_concurrency tells me I have 12 threads that can run simultaneously.
There is no lock contention in this program. I also recorded the time to create and destroy the threads, and even if I subtract it, I still cannot achieve a 12x speedup.
I think maybe thread schedule will make multithreads slow, but I have 12 threads, It shouldn't achieve only 6.57x faster.
I think maybe multithreads will decrease the hit ratio of cache,but I'm not quite sure.
So how can I achieve 12X faster than use only one thread?
Here is my static of my program
threads
parallel
single
faster
2
324868
633777
1.95
3
218584
633777
2.87
4
167169
633777
3.77
5
136542
633777
4.64
6
113207
633777
5.48
7
147324
633777
4.27
8
136768
633777
4.67
You could run my code to get the data from 2 threads to 31 threads
Apparently, at least on my Intel core i7, std::thread::hardware_concurrency() returns the number of hardware threads available. On hardware with simultaneous multi-threading typically 2 hardware threads share time on a single hardware core. The hardware core switches transparently between the 2 hardware threads. That means you only get about half the speedup factor that you might expect based on the result of std::thread::hardware_concurrency().
In practice each hardware thread will stall from time to time for various reasons, e.g. waiting for data to arrive from memory, giving the other hardware thread extra processing time. Typically simultaneous multi-threading (or Hyper-threading as Intel calls it) will give you an extra 15% of performance that way, so you may expect a speedup factor of up to (12/2)*(115/100) = 6.9.
Overheads, including the one you mention, but also in my experience the increased working-set size, can further reduce the speed-up factor.
I encountered a similar problem like this question : Timed vector vs map vs unordered_map lookup
But my case is only vector vs unordered_map on a small scale of elements (0-100, mostly 0-20). So I changed the author @gitgregor's code:
#include <iostream>
#include <vector>
#include <map>
#include <unordered_map>
#include <chrono>
#include <algorithm>
// Global side-effect sink so the timed lookups cannot be optimized away.
unsigned dummy = 0;

// Benchmark: linear vector scan vs unordered_map lookup for small element
// counts (10 and 100), probing with 10,000 random keys per configuration.
int main()
{
    std::vector<unsigned> v;
    std::unordered_map<unsigned, unsigned> u;
    unsigned elementCount = 1;

    // Total lookup times (nanoseconds) per element count.
    struct Times
    {
        unsigned long long v;
        unsigned long long u;
    };
    std::map<unsigned, Times> timesMap;

    while (elementCount != 100)
    {
        // start each configuration from empty containers
        v.clear();
        u.clear();
        elementCount *= 10;

        std::vector<unsigned int> tmp;
        tmp.reserve(elementCount);
        for (unsigned i = 0; i < elementCount; ++i)
        {
            tmp.push_back(std::rand()%50000);
        }
        // fill vector and unordered_map with the same random numbers
        for (const auto integer : tmp)
        {
            v.emplace_back(integer);
            u.insert(std::make_pair(integer, 1));
        }

        // fill a test set with 10000 random numbers to test lookup.
        // Fixed: the original loop condition was "i < tmp2.size()" right
        // after reserve(); reserve() changes only capacity, not size, so the
        // loop body never ran and both benchmarks timed zero lookups.
        std::vector<unsigned> tmp2;
        tmp2.reserve(10000);
        for (int i = 0; i < 10000; i++)
        {
            tmp2.push_back(std::rand()%50000);
        }

        // Fixed: use 'auto' instead of time_point<steady_clock>, which only
        // matched high_resolution_clock::now() on some platforms.
        auto start = std::chrono::high_resolution_clock::now();
        for (const auto integer : tmp2)
        {
            auto findItr = std::find(std::begin(v), std::end(v), integer);
            if (findItr != v.end())
            {
                dummy++;
            }
        }
        auto tp0 = std::chrono::high_resolution_clock::now() - start;
        unsigned long long vTime = std::chrono::duration_cast<std::chrono::nanoseconds>(tp0).count();

        start = std::chrono::high_resolution_clock::now();
        for (const auto integer : tmp2)
        {
            // NOTE(review): operator[] default-inserts 0 for absent keys, so
            // this branch counts MISSES (value still 0) and mutates the map
            // while it is being timed; u.count(integer) would avoid both.
            const bool res = u[integer] == 0;
            if (res)
            {
                dummy++;
            }
        }
        auto tp1 = std::chrono::high_resolution_clock::now() - start;
        unsigned long long uTime = std::chrono::duration_cast<std::chrono::nanoseconds>(tp1).count();

        timesMap.insert(std::make_pair(elementCount, Times{ vTime, uTime }));
    }

    for (auto& itr : timesMap)
    {
        std::cout << "Element count: " << itr.first << std::endl;
        std::cout << "std::vector time: " << itr.second.v << std::endl;
        std::cout << "std::unordered_map time: " << itr.second.u << std::endl;
        std::cout << "-----------------------------------" << std::endl;
    }
    // print the side-effect counter so the lookups cannot be elided
    std::cout << dummy;
}
I turned off optimization and have random numbers to fill vector and unordered_map, and use a number set of 10000 random numbers to test lookup. But the results are not consistent at all:
First run:
Element count: 10
std::vector time: 100
std::unordered_map time: 100
-----------------------------------
Element count: 100
std::vector time: 0
std::unordered_map time: 100
-----------------------------------
Second Run:
Element count: 10
std::vector time: 200
std::unordered_map time: 200
-----------------------------------
Element count: 100
std::vector time: 100
std::unordered_map time: 100
-----------------------------------
Third Run:
Element count: 10
std::vector time: 100
std::unordered_map time: 0
-----------------------------------
Element count: 100
std::vector time: 100
std::unordered_map time: 0
-----------------------------------
And the results look also strange with only numbers : 0, 100 and 200.
Does anyone have some idea why?
I found the real reason for your code not measuring correctly: you had the loop for (int i = 0; i < tmp2.size(); i++), which runs 0 times because the size of tmp2 is 0 at the start (reserve() changes only the capacity, not the size). Hence you were testing against 0 integers from tmp2, and hence you measured almost 0 time (no operations were made).
I modified your code to fix the issue above, also fixed some compilation issues, also set the number of integers (iterations) to 1 million instead of 10 thousand, also computed the average running time (in nanoseconds) instead of the total time, and also added a std::map measurement in case you want to choose it. Down below is the full corrected code.
Try it online!
#include <iostream>
#include <vector>
#include <map>
#include <unordered_map>
#include <chrono>
#include <algorithm>
#include <cstdint>
// Sum of all per-run hit counters; volatile so the compiler cannot prove
// the benchmark loops unused and delete them.
volatile size_t dummy_total = 0;
// Benchmark: average per-probe lookup time (ns) of linear vector scan vs
// std::map vs std::unordered_map, for element counts 10, 20, 40, 80, 160,
// using 1,000,000 random probes per configuration.
int main()
{
std::vector<unsigned> v;
std::unordered_map<unsigned, unsigned> u;
std::map<unsigned, unsigned> m;
unsigned elementCount = 5;
// average nanoseconds per probe for each container
struct Times
{
double v = 0;
double m = 0;
double u = 0;
};
std::map<unsigned, Times> timesMap;
while (elementCount <= 80)
{
size_t dummy = 0;
// start each configuration from empty containers
v.clear();
u.clear();
m.clear();
elementCount *= 2;
std::vector<unsigned int> tmp;
tmp.reserve(elementCount);
for (unsigned i = 0; i < elementCount; ++i)
{
tmp.push_back(std::rand()%50000);
}
// fill vector and both maps with the same random numbers
for (const auto integer : tmp)
{
v.emplace_back(integer);
u.insert(std::make_pair(integer, 1));
m.insert(std::make_pair(integer, 1));
}
// fill a test set with 1,000,000 random numbers to probe with
std::vector<unsigned> tmp2(1000000);
for (int i = 0; i < tmp2.size(); i++)
{
tmp2[i] = std::rand()%50000;
}
// time the linear scan over the vector
std::chrono::time_point<std::chrono::high_resolution_clock> start = std::chrono::high_resolution_clock::now();
for (const auto integer : tmp2)
{
auto findItr = std::find(std::begin(v), std::end(v), integer);
if (findItr != v.end())
{
++dummy;
}
}
auto tp0 = std::chrono::high_resolution_clock::now() - start;
double vTime = double(std::chrono::duration_cast<std::chrono::nanoseconds>(tp0).count()) / tmp2.size();
// time unordered_map lookups; count() does not mutate the map
start = std::chrono::high_resolution_clock::now();
for (const auto integer : tmp2)
{
const bool res = u.count(integer) != 0;
if (res)
{
++dummy;
}
}
auto tp1 = std::chrono::high_resolution_clock::now() - start;
double uTime = double(std::chrono::duration_cast<std::chrono::nanoseconds>(tp1).count()) / tmp2.size();
// time ordered-map lookups
start = std::chrono::high_resolution_clock::now();
for (const auto integer : tmp2)
{
const bool res = m.count(integer) != 0;
if (res)
{
++dummy;
}
}
auto tp2 = std::chrono::high_resolution_clock::now() - start;
double mTime = double(std::chrono::duration_cast<std::chrono::nanoseconds>(tp2).count()) / tmp2.size();
dummy_total = dummy_total + dummy;
timesMap.insert(std::make_pair(elementCount, Times{ vTime, mTime, uTime }));
}
for (auto& itr : timesMap)
{
std::cout << "Element count: " << itr.first << std::endl;
std::cout << "std::vector time: " << itr.second.v << " ns" << std::endl;
std::cout << "std::map time: " << itr.second.m << " ns" << std::endl;
std::cout << "std::unordered_map time: " << itr.second.u << " ns" << std::endl;
std::cout << "-----------------------------------" << std::endl;
}
// print the accumulated hit count so the loops stay observable
std::cout << dummy_total;
}
Output:
Element count: 10
std::vector time: 12.8182 ns
std::map time: 26.4334 ns
std::unordered_map time: 10.2652 ns
-----------------------------------
Element count: 20
std::vector time: 24.1431 ns
std::map time: 33.9809 ns
std::unordered_map time: 13.0953 ns
-----------------------------------
Element count: 40
std::vector time: 60.7386 ns
std::map time: 42.3911 ns
std::unordered_map time: 20.1641 ns
-----------------------------------
Element count: 80
std::vector time: 102.167 ns
std::map time: 52.1565 ns
std::unordered_map time: 10.2345 ns
-----------------------------------
Element count: 160
std::vector time: 190.878 ns
std::map time: 68.7916 ns
std::unordered_map time: 13.5962 ns
-----------------------------------
18726
I have the following critical place in the code: I need to look up from 64-byte array around 1'000'000 times.
Minimal code:
#include <iostream>
#include <stdint.h>
#include <random>
#include <chrono>
#include <ctime>
#define TYPE uint8_t
#define n_lookup 64

// Benchmark: ~1,000,000 random lookups into a 64-byte table; prints the
// average cost per element in nanoseconds.
int main(){
    const int n_indices = 1000000;
    TYPE lookup[n_lookup];
    TYPE indices[n_indices];
    TYPE result[n_indices];

    // preparations
    std::default_random_engine generator;
    // Fixed: uniform_int_distribution's bounds are INCLUSIVE, so the
    // original distribution(0, n_lookup) could produce 64 and read one past
    // the end of lookup[]. The upper bound must be n_lookup - 1.
    std::uniform_int_distribution<int> distribution(0, n_lookup - 1);
    for (int i=0; i < n_indices; i++) indices[i] = distribution(generator);
    for (int i=0; i < n_lookup; i++) lookup[i] = distribution(generator);

    std::chrono::time_point<std::chrono::system_clock> start = std::chrono::system_clock::now();
    // main loop: gather lookup[indices[i]] into result[i]
    for (int i=0; i < n_indices; i++) {
        result[i] = lookup[indices[i]];
    }
    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();

    std::chrono::duration<double> elapsed_seconds = end - start;
    std::cout << "computation took " << elapsed_seconds.count() * 1e9 / n_indices << " ns per element"<< std::endl;

    // printing random numbers to avoid code elimination
    std::cout << result[12] << result[45];
    return 0;
}
After compiling with g++ lookup.cpp -std=gnu++11 -O3 -funroll-loops I get a bit less than 1ns per element on modern CPU.
I need this operation to work 2-3 times faster (without threads). How can I do this?
P.S. I also was investigating AVX512 (512 bits is exactly the size of lookup table!) instruction set, but it lacks 8-bit gather operations!
indices and result vectors are in different places in memory, but accessed at the same time. This leads to cache misses. I suggest you merge result and indices into one vector. Here is the code:
#include <iostream>
#include <stdint.h>
#include <random>
#include <chrono>
#include <ctime>
#define TYPE uint8_t
#define n_lookup 64

// Variant with indices and results interleaved in one array so the
// benchmark touches a single dense stream of memory.
int main(){
    const int n_indices = 2000000;
    TYPE lookup[n_lookup];
    // Merge indices and result: even slots hold the index, the following
    // odd slot receives the looked-up value.
    TYPE ind_res[n_indices];

    // preparations
    std::default_random_engine generator;
    // Fixed: uniform_int_distribution's bounds are INCLUSIVE, so the
    // original distribution(0, n_lookup) could emit 64 and index one past
    // the end of lookup[]. Clamp the upper bound to n_lookup - 1.
    std::uniform_int_distribution<int> distribution(0, n_lookup - 1);
    for (int i=0; i < n_indices; i += 2) ind_res[i] = distribution(generator);
    for (int i=0; i < n_lookup; i++) lookup[i] = distribution(generator);

    std::chrono::time_point<std::chrono::system_clock> start = std::chrono::system_clock::now();
    // main loop:
    for (int i=0; i < n_indices; i += 2) {
        ind_res[i+1] = lookup[ind_res[i]]; // denser access here, fewer cache misses
    }
    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();

    std::chrono::duration<double> elapsed_seconds = end - start;
    std::cout << "computation took " << elapsed_seconds.count() * 1e9 / n_indices << " ns per element"<< std::endl;

    // printing random numbers to avoid code elimination
    std::cout << ind_res[24] << ind_res[90];
    return 0;
}
My tests show that this code runs much faster.
with -march=native this is what your loops compiles to:
movq %rax, %rbx
xorl %eax, %eax
.L145:
movzbl 128(%rsp,%rax), %edx
movzbl 64(%rsp,%rdx), %edx
movb %dl, 1000128(%rsp,%rax)
addq $1, %rax
cmpq $1000000, %rax
jne .L145
I'm struggling to see how that gets any quicker without parallelisation.
By changing TYPE to int32_t, it gets vectorised:
vpcmpeqd %ymm2, %ymm2, %ymm2
movq %rax, %rbx
xorl %eax, %eax
.L145:
vmovdqa -8000048(%rbp,%rax), %ymm1
vmovdqa %ymm2, %ymm3
vpgatherdd %ymm3, -8000304(%rbp,%ymm1,4), %ymm0
vmovdqa %ymm0, -4000048(%rbp,%rax)
addq $32, %rax
cmpq $4000000, %rax
jne .L145
vzeroupper
Might that help?
First of all, there is a bug: distribution(0, 64) produces numbers from 0 to 64 inclusive, and the index 64 does not fit into the 64-element array.
You can speed up the lookup 2x by looking up two values a time:
#include <iostream>
#include <stdint.h>
#include <random>
#include <chrono>
#include <ctime>
#define TYPE uint8_t
#define TYPE2 uint16_t
#define n_lookup 64
// Looks up two values per step: adjacent index bytes are paired into a
// 16-bit index into a precomputed table whose entries hold both results.
void tst() {
const int n_indices = 1000000;// has to be multiple of 2
TYPE lookup[n_lookup];
TYPE indices[n_indices];
TYPE result[n_indices];
// pair table: entry (i << 8) | j holds (lookup[i] << 8) | lookup[j];
// only entries whose both bytes are < n_lookup are ever read
TYPE2 lookup2[n_lookup * 256];
// preparations
std::default_random_engine generator;
std::uniform_int_distribution<int> distribution(0, n_lookup-1);
for (int i = 0; i < n_indices; i++) indices[i] = distribution(generator);
for (int i = 0; i < n_lookup; i++) lookup[i] = distribution(generator);
for (int i = 0; i < n_lookup; ++i) {
for (int j = 0; j < n_lookup; ++j) {
lookup2[(i << 8) | j] = (lookup[i] << 8) | lookup[j];
}
}
std::chrono::time_point<std::chrono::system_clock> start = std::chrono::system_clock::now();
// NOTE(review): casting TYPE* (uint8_t) to TYPE2* (uint16_t) violates
// strict aliasing (and alignment rules) and is formally UB, even though it
// works on common compilers -- confirm, or rebuild the pairs with memcpy.
TYPE2* indices2 = (TYPE2*)indices;
TYPE2* result2 = (TYPE2*)result;
// main loop: one 16-bit table hit produces two 8-bit results
for (int i = 0; i < n_indices / 2; ++i) {
*result2++ = lookup2[*indices2++];
}
std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
// verify the paired lookup against the straightforward per-byte lookup
for (int i = 0; i < n_indices; i++) {
if (result[i] != lookup[indices[i]]) {
std::cout << "!!!!!!!!!!!!!ERROR!!!!!!!!!!!!!";
}
}
std::chrono::duration<double> elapsed_seconds = end - start;
std::cout << "computation took " << elapsed_seconds.count() * 1e9 / n_indices << " ns per element" << std::endl;
// printing random numbers to avoid code elimination
std::cout << result[12] << result[45];
}
// Entry point: run the benchmark, then wait for a key press so a console
// window spawned by an IDE stays open.
int main() {
tst();
std::cin.get();
return 0;
}
Your code is already really fast. However
(on my system) the execution is about 4.858 % faster when you change
const int n_indices = 1000000;
to
const int n_indices = 1048576; // 2^20
This is not much, but it's something.
I am currently trying to benchmark various implementations of large loop performing arbitrary jobs, and I found myself with a very slow version when using boost transform iterators and boost counting_iterators.
I designed a small code that benchmark two loops that sums the product of all integers between 0 and SIZE-1 with an arbitrary integer (that I choose to be 1 in my example in order to avoid overflow).
Here's my code:
//STL
#include <iostream>
#include <algorithm>
#include <functional>
#include <chrono>
//Boost
#include <boost/iterator/transform_iterator.hpp>
#include <boost/iterator/counting_iterator.hpp>
//Compile using
// g++ ./main.cpp -o test -std=c++11
//Launch using
// ./test 1
#define NRUN 10
#define SIZE 128*1024*1024

// Callable that scales an int by the constant factor fixed at construction.
struct MultiplyByN
{
    MultiplyByN(size_t factor) : m_N(factor) {}

    // Returns i * N, widened to size_t.
    size_t operator()(int i) const
    {
        return i * m_N;
    }

    const size_t m_N;
};
// Compares a hand-written summation loop against std::accumulate over
// boost transform/counting iterators; each variant runs NRUN times and the
// minimum runtime is reported.
int main(int argc, char* argv[] )
{
// NOTE(review): argv[1] is used without checking argc; running the program
// with no argument passes a null pointer to std::stoi.
int N = std::stoi( argv[1] );
size_t sum = 0;
//Initialize chrono helpers
auto start = std::chrono::steady_clock::now();
auto stop = std::chrono::steady_clock::now();
auto diff = stop - start;
double msec=std::numeric_limits<double>::max(); //Set min runtime to ridiculously high value
MultiplyByN op(N);
//Perform multiple run in order to get minimal runtime
for(int k = 0; k< NRUN; k++)
{
sum = 0;
start = std::chrono::steady_clock::now();
// plain indexed loop: sum of op(i) for i in [0, SIZE)
for(int i=0;i<SIZE;i++)
{
sum += op(i);
}
stop = std::chrono::steady_clock::now();
diff = stop - start;
//Compute minimum runtime
msec = std::min( msec, std::chrono::duration<double, std::milli>(diff).count() );
}
std::cout << "First version : Sum of values is "<< sum << std::endl;
std::cout << "First version : Minimal Runtime was "<< msec << " msec "<< std::endl;
msec=std::numeric_limits<double>::max(); //Reset min runtime to ridiculously high value
//Perform multiple run in order to get minimal runtime
for(int k = 0; k< NRUN; k++)
{
start = std::chrono::steady_clock::now();
//Functional way to express the summation
sum = std::accumulate( boost::make_transform_iterator(boost::make_counting_iterator(0), op ),
boost::make_transform_iterator(boost::make_counting_iterator(SIZE), op ),
(size_t)0, std::plus<size_t>() );
stop = std::chrono::steady_clock::now();
diff = stop - start;
//Compute minimum runtime
msec = std::min( msec, std::chrono::duration<double, std::milli>(diff).count() );
}
std::cout << "Second version : Sum of values is "<< sum << std::endl;
std::cout << "Second version version : Minimal Runtime was "<< msec << " msec "<< std::endl;
return EXIT_SUCCESS;
}
And the output I get:
./test 1
First version : Sum of values is 9007199187632128
First version : Minimal Runtime was 433.142 msec
Second version : Sum of values is 9007199187632128
Second version version : Minimal Runtime was 10910.7 msec
The "functional" version of my loop that uses std::accumulate is 25 times slower than the simple loop version, why so ?
Thank you in advance for your help
Based on your comment in the code, you've compiled this with
g++ ./main.cpp -o test -std=c++11
Since you didn't specify the optimization level, g++ used the default setting, which is -O0 i.e. no optimization.
That means that the compiler didn't inline anything. Template libraries like the standard library or boost depend on inlining for performance. Additionally, the compiler will produce a lot of extra code, that's far from optimal -- it doesn't make any sense to make performance comparisons on such binaries.
Recompile with optimization enabled, and try your test again to get meaningful results.
There is something that baffles me with integer arithmetic in tutorials. To be precise, integer division.
The seemingly preferred method is by casting the divisor into a float, then rounding the float to the nearest whole number, then cast that back into integer:
#include <cmath>
// Round-to-nearest integer division via a float detour: widen to float,
// divide, round half away from zero, then narrow back to int.
int round_divide_by_float_casting(int a, int b){
    const float quotient = static_cast<float>(a) / static_cast<float>(b);
    return static_cast<int>(std::roundf(quotient));
}
Yet this seems like scratching your left ear with your right hand. I use:
// Round-to-nearest integer division using pure integer arithmetic: the
// correction term (remainder * 2) / b is 1 exactly when the remainder is
// at least half of b.
int round_divide (int a, int b){
    const int quotient  = a / b;
    const int remainder = a % b;
    return quotient + remainder * 2 / b;
}
It's no breakthrough, but the fact that it is not standard makes me wonder if I am missing anything?
Despite my (albeit limited) testing, I couldn't find any scenario where the two methods give me different results. Did someone run into some sort of scenario where the int → float → int casting produced more accurate results?
Arithmetic solution
If one defined what your function should return, it would be described as something like: "f(a, b) returns the integer closest to the real quotient of a divided by b."
Thus, the question can be summarized as: can we define this closest integer using only integer division. I think we can.
There is exactly two candidates as the closest integer: a / b and (a / b) + 1(1). The selection is easy, if a % b is closer to 0 as it is to b, then a / b is our result. If not, (a / b) + 1 is.
One could then write something similar to, ignoring optimization and good practices:
// Round-to-nearest integer division: picks whichever of quot and quot + 1
// is closer to the real quotient, by comparing the remainder against
// b - remainder.
int divide(int a, int b)
{
    const int quot = a / b;
    const int rem = a % b;
    // rem closer to 0 than to b -> truncation already rounded correctly
    return (rem < b - rem) ? quot : quot + 1;
}
While this definition satisfies our needs, one could optimize it by not computing the division of a by b twice, with the use of std::div():
// Same rounding rule as above, but obtains quotient and remainder with a
// single std::div() call instead of two separate divisions.
int divide(int a, int b)
{
    const std::div_t dv = std::div(a, b);
    return dv.quot + (dv.rem >= b - dv.rem ? 1 : 0);
}
The analysis of the problem we did earlier assures us of the well defined behaviour of our implementation.
(1) There is just one last thing to check: how does it behave when a or b is negative? This is left to the reader ;).
Benchmark
#include <iostream>
#include <iomanip>
#include <string>
// solutions
#include <cmath>
#include <cstdlib>
// benchmark
#include <algorithm>
#include <array>
#include <chrono>
#include <ctime>
#include <functional>
#include <limits>
#include <random>
//
// Solutions
//
namespace
{
    // Round-to-nearest division via a float detour.
    int round_divide_by_float_casting(int a, int b) {
        const float ratio = static_cast<float>(a) / static_cast<float>(b);
        return static_cast<int>(roundf(ratio));
    }

    // Round-to-nearest division using only integer arithmetic; the term
    // (a % b) * 2 / b contributes 1 when the remainder is >= b / 2.
    int round_divide_by_modulo(int a, int b) {
        const int quotient = a / b;
        const int remainder = a % b;
        return quotient + remainder * 2 / b;
    }

    // Round-to-nearest division with one hardware division via std::div:
    // round up when the remainder is at least as close to b as to 0.
    int divide_by_quotient_comparison(int a, int b)
    {
        const std::div_t parts = std::div(a, b);
        return (parts.rem >= b - parts.rem) ? parts.quot + 1 : parts.quot;
    }
}
//
// benchmark
//
/// Fills ranges with uniformly distributed random ints over the whole
/// int range; the engine is seeded once with the current time.
class Randomizer
{
    std::mt19937 _rng_engine;
    std::uniform_int_distribution<int> _distri;
public:
    Randomizer() : _rng_engine(std::time(0)), _distri(std::numeric_limits<int>::min(), std::numeric_limits<int>::max())
    {
    }

    /// Overwrites [begin, end) with fresh random numbers.
    template<class ForwardIt>
    void operator()(ForwardIt begin, ForwardIt end)
    {
        // Fixed: the original std::generate(..., std::bind(_distri, _rng_engine))
        // decay-copied the engine into the bind object, so every call replayed
        // the identical sequence (dividends == divisors in the benchmark,
        // making every quotient 1). The lambda draws from the member engine
        // itself, advancing its state across calls.
        std::generate(begin, end, [this]() { return _distri(_rng_engine); });
    }
};
// Minimal stopwatch over the monotonic clock: started at construction,
// read via end<DurationUnit>().
class Clock
{
    std::chrono::time_point<std::chrono::steady_clock> _start;
public:
    // Current instant on the monotonic clock.
    static inline std::chrono::time_point<std::chrono::steady_clock> now()
    {
        return std::chrono::steady_clock::now();
    }

    Clock() : _start(now())
    {
    }

    // Elapsed time since construction, expressed in DurationUnit ticks.
    template<class DurationUnit>
    std::size_t end()
    {
        const auto elapsed = now() - _start;
        return std::chrono::duration_cast<DurationUnit>(elapsed).count();
    }
};
//
// Entry point
//
// Benchmarks the three rounding-division implementations over identical
// random inputs and prints the average per-call time of each.
int main()
{
Randomizer randomizer;
std::array<int, 1000> dividends; // SCALE THIS UP (1'000'000 would be great)
std::array<int, dividends.size()> divisors;
std::array<int, dividends.size()> results;
// NOTE(review): the random divisors span the full int range, so a divisor
// of 0 (division by zero, UB) or the INT_MIN / -1 pair is possible -- the
// inputs should be filtered before relying on these timings.
randomizer(std::begin(dividends), std::end(dividends));
randomizer(std::begin(divisors), std::end(divisors));
{
// time the float-casting version
Clock clock;
auto dividend = std::begin(dividends);
auto divisor = std::begin(divisors);
auto result = std::begin(results);
for ( ; dividend != std::end(dividends) ; ++dividend, ++divisor, ++result)
{
*result = round_divide_by_float_casting(*dividend, *divisor);
}
const float unit_time = clock.end<std::chrono::nanoseconds>() / static_cast<float>(results.size());
std::cout << std::setw(40) << "round_divide_by_float_casting(): " << std::setprecision(3) << unit_time << " ns\n";
}
{
// time the pure-integer modulo version
Clock clock;
auto dividend = std::begin(dividends);
auto divisor = std::begin(divisors);
auto result = std::begin(results);
for ( ; dividend != std::end(dividends) ; ++dividend, ++divisor, ++result)
{
*result = round_divide_by_modulo(*dividend, *divisor);
}
const float unit_time = clock.end<std::chrono::nanoseconds>() / static_cast<float>(results.size());
std::cout << std::setw(40) << "round_divide_by_modulo(): " << std::setprecision(3) << unit_time << " ns\n";
}
{
// time the std::div-based version
Clock clock;
auto dividend = std::begin(dividends);
auto divisor = std::begin(divisors);
auto result = std::begin(results);
for ( ; dividend != std::end(dividends) ; ++dividend, ++divisor, ++result)
{
*result = divide_by_quotient_comparison(*dividend, *divisor);
}
const float unit_time = clock.end<std::chrono::nanoseconds>() / static_cast<float>(results.size());
std::cout << std::setw(40) << "divide_by_quotient_comparison(): " << std::setprecision(3) << unit_time << " ns\n";
}
}
Outputs:
g++ -std=c++11 -O2 -Wall -Wextra -Werror main.cpp && ./a.out
round_divide_by_float_casting(): 54.7 ns
round_divide_by_modulo(): 24 ns
divide_by_quotient_comparison(): 25.5 ns
Demo
The two arithmetic solutions' performances are not distinguishable (their benchmark converges when you scale the bench size up).
Which one is faster really depends on the processor and on the range of the integers involved (and using double instead of float would resolve most of the range issues).
For modern "big" CPUs like x86-64 and ARM, integer division and floating point division are roughly the same effort, and converting an integer to a float or vice versa is not a "hard" task (the conversion even does the correct rounding directly), so the resulting operations most likely look like this:
atmp = (float) a;
btmp = (float) b;
resfloat = divide atmp/btmp;
return = to_int_with_rounding(resfloat)
About four machine instructions.
On the other hand, your code uses two divides, one modulo and a multiply, which is quite likely longer on such a processor.
tmp = a/b;
tmp1 = a % b;
tmp2 = tmp1 * 2;
tmp3 = tmp2 / b;
tmp4 = tmp + tmp3;
So five instructions, and three of those are "divide" (unless the compiler is clever enough to reuse a / b for a % b - but it's still two distinct divides).
Of course, if you are outside the range of number of digits that a float or double can hold without losing digits (24 significand bits for float, 53 for double), then your method MAY be better (assuming there is no overflow in the integer math).
On top of all that, since the first form is used by "everyone", it's the one that the compiler recognises and can optimise.
Obviously, the results depend on both the compiler being used and the processor it runs on, but these are my results from running the code posted above, compiled through clang++ (v3.9-pre-release, pretty close to released 3.8).
round_divide_by_float_casting(): 32.5 ns
round_divide_by_modulo(): 113 ns
divide_by_quotient_comparison(): 80.4 ns
However, the interesting thing I find when I look at the generated code:
xorps %xmm0, %xmm0
cvtsi2ssl 8016(%rsp,%rbp), %xmm0
xorps %xmm1, %xmm1
cvtsi2ssl 4016(%rsp,%rbp), %xmm1
divss %xmm1, %xmm0
callq roundf
cvttss2si %xmm0, %eax
movl %eax, 16(%rsp,%rbp)
addq $4, %rbp
cmpq $4000, %rbp # imm = 0xFA0
jne .LBB0_7
is that the round is actually a call. Which really surprises me, but explains why on some machines (particularly more recent x86 processors), it is faster.
g++ gives better results with -ffast-math, which gives around:
round_divide_by_float_casting(): 17.6 ns
round_divide_by_modulo(): 43.1 ns
divide_by_quotient_comparison(): 18.5 ns
(This is with increased count to 100k values)
Prefer the standard solution. Use std::div family of functions declared in cstdlib.
See: http://en.cppreference.com/w/cpp/numeric/math/div
Casting to float and then to int may be very inefficient on some architectures, for example, microcontrollers.
Thanks for the suggestions so far. To shed some light I made a test setup to compare performance.
#include <iostream>
#include <string>
#include <cmath>
#include <cstdlib>
#include <chrono>
using namespace std;
// Rounded integer division via floating point: compute a/b in single
// precision and round half away from zero (roundf semantics).
// NOTE: float has only a 24-bit significand, so results can be inexact for
// operands with magnitude beyond ~2^24; use double for a wider exact range.
// C-style casts replaced with intent-revealing, greppable static_cast.
int round_divide_by_float_casting(int a, int b) {
    return static_cast<int>(roundf(a / static_cast<float>(b)));
}
// Rounded integer division done purely in integer arithmetic: take the
// truncated quotient, then add 1 exactly when twice the remainder reaches
// the divisor (i.e. the discarded fraction is >= .5, for non-negative a).
int round_divide_by_modulo(int a, int b) {
    const int truncated = a / b;
    const int correction = a % b * 2 / b; // 1 when remainder >= b/2
    return truncated + correction;
}
// Rounded integer division using std::div to obtain quotient and remainder
// from one operation: round up exactly when the remainder is at least half
// the divisor (ties round up, matching roundf for non-negative operands).
//
// BUG FIX: the comparison was inverted (`dv.rem <= b - dv.rem`), which
// rounded UP for small remainders — even exact divisions, e.g. (4,2)
// returned 3 — and DOWN for large ones, e.g. (8,3) returned 2.
int divide_by_quotient_comparison(int a, int b)
{
    const std::div_t dv = std::div(a, b);
    int result = dv.quot;
    if (b - dv.rem <= dv.rem) { // i.e. 2*dv.rem >= b: discarded fraction >= .5
        ++result;
    }
    return result;
}
int main()
{
    const int itr = 1000;
    // The divide helpers are pure functions; if their return values are
    // discarded, an optimizing compiler may delete the timed loops entirely
    // and the benchmark measures nothing. Folding every result into a
    // volatile sink forces the calls to actually execute.
    volatile long long sink = 0;

    auto begin = chrono::steady_clock::now();
    for (int i = 0; i < itr; i++) {
        for (int j = 10; j < itr + 1; j++) {
            sink += divide_by_quotient_comparison(i, j);
        }
    }
    auto end = std::chrono::steady_clock::now();
    cout << "divide_by_quotient_comparison(,) function took: "
        << chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count()
        << endl;

    begin = chrono::steady_clock::now();
    for (int i = 0; i < itr; i++) {
        for (int j = 10; j < itr + 1; j++) {
            sink += round_divide_by_float_casting(i, j);
        }
    }
    end = std::chrono::steady_clock::now();
    cout << "round_divide_by_float_casting(,) function took: "
        << chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count()
        << endl;

    begin = chrono::steady_clock::now();
    for (int i = 0; i < itr; i++) {
        for (int j = 10; j < itr + 1; j++) {
            sink += round_divide_by_modulo(i, j);
        }
    }
    end = std::chrono::steady_clock::now();
    cout << "round_divide_by_modulo(,) function took: "
        << chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count()
        << endl;

    return 0;
}
The results I got on my machine (i7 with Visual Studio 2015) was as follows: the modulo arithmetic was about twice as fast as the int → float → int casting method. The method relying on std::div_t (suggested by #YSC and #teroi) was faster than the int → float → int, but slower than the modulo arithmetic method.
A second test was performed to avoid certain compiler optimizations pointed out by #YSC:
#include <iostream>
#include <string>
#include <cmath>
#include <cstdlib>
#include <chrono>
#include <vector>
using namespace std;
// Float-based rounding division: divide in single-precision floating point,
// round half away from zero with roundf, then truncate back to int.
int round_divide_by_float_casting(int a, int b) {
    const float quotient = static_cast<float>(a) / static_cast<float>(b);
    return static_cast<int>(roundf(quotient));
}
// Integer-only rounding division. The remainder term contributes exactly 1
// when the remainder is at least half of b (for non-negative operands), so
// the truncated quotient gets bumped up precisely on fractions >= .5.
int round_divide_by_modulo(int a, int b) {
    const int whole = a / b;
    const int round_up = a % b * 2 / b;
    return whole + round_up;
}
// Rounded integer division via std::div (one machine division yields both
// quotient and remainder): increment the quotient when the discarded
// fraction is at least one half, i.e. when 2*rem >= b.
//
// BUG FIX: the original condition `dv.rem <= b - dv.rem` was inverted —
// it incremented on SMALL remainders (so exact divisions such as (4,2)
// yielded 3) and failed to increment on large ones ((8,3) yielded 2).
int divide_by_quotient_comparison(int a, int b)
{
    const std::div_t dv = std::div(a, b);
    int result = dv.quot;
    if (b - dv.rem <= dv.rem) { // equivalent to 2*dv.rem >= b
        ++result;
    }
    return result;
}
int main()
{
    const int itr = 100;

    // Pre-generate random operands so the timed loops measure only the
    // division strategies (and random values defeat constant folding).
    vector<int> randi, randj;
    randi.reserve(itr);
    randj.reserve(itr);
    for (int i = 0; i < itr; i++) {
        randi.push_back(rand());
        int rj = rand();
        if (rj == 0) // the divide functions cannot take a zero divisor
            rj++;
        randj.push_back(rj);
    }

    // Reserve the full result capacity up front: otherwise the timed loops
    // also measure vector reallocations, not just the division code.
    vector<int> f, m, q;
    const int total = itr * itr;
    f.reserve(total);
    m.reserve(total);
    q.reserve(total);

    while (true) {
        auto begin = chrono::steady_clock::now();
        for (int i = 0; i < itr; i++) {
            for (int j = 0; j < itr; j++) {
                q.push_back( divide_by_quotient_comparison(randi[i] , randj[j]) );
            }
        }
        auto end = std::chrono::steady_clock::now();
        cout << "divide_by_quotient_comparison(,) function took: "
            << chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count()
            << endl;

        begin = chrono::steady_clock::now();
        for (int i = 0; i < itr; i++) {
            for (int j = 0; j < itr; j++) {
                f.push_back( round_divide_by_float_casting(randi[i], randj[j]) );
            }
        }
        end = std::chrono::steady_clock::now();
        cout << "round_divide_by_float_casting(,) function took: "
            << chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count()
            << endl;

        begin = chrono::steady_clock::now();
        for (int i = 0; i < itr; i++) {
            for (int j = 0; j < itr; j++) {
                m.push_back( round_divide_by_modulo(randi[i], randj[j]) );
            }
        }
        end = std::chrono::steady_clock::now();
        cout << "round_divide_by_modulo(,) function took: "
            << chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count()
            << endl;

        cout << endl;
        // clear() keeps capacity, so subsequent passes stay allocation-free.
        f.clear();
        m.clear();
        q.clear();
    }
    return 0;
}
In this second test the slowest was divide_by_quotient_comparison(), which relies on std::div, followed by round_divide_by_float_casting(), and the fastest again was round_divide_by_modulo(). However, this time the performance difference was much, much smaller — less than 20%.