Incorrect timing in release mode - c++

I'm trying to measure time of execution of the following code:
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <chrono>
/// Advances a 64-bit linear congruential generator by one step.
///
/// @param LCG_state current generator state
/// @return the next state, `LCG_state * multiplier + increment`, with the
///         arithmetic wrapping modulo 2^64 (unsigned 64-bit arithmetic)
uint64_t LCG(uint64_t LCG_state)
{
    const uint64_t multiplier = 2862933555777941757;
    const uint64_t increment = 1422359891750319841;
    return LCG_state * multiplier + increment;
}
/// Runs 640,000,000 LCG steps and prints the elapsed wall-clock time in seconds.
///
/// Note that `w` is written on every iteration but never read afterwards.
int main()
{
    using Clock = std::chrono::high_resolution_clock;

    const auto started = Clock::now();

    uint64_t LCG_state = 333;
    uint32_t w;
    for (int step = 0; step != 640000000; ++step)
    {
        LCG_state = LCG(LCG_state);
        w = LCG_state >> 32;  // upper 32 bits of the state
    }

    const auto finished = Clock::now();
    const std::chrono::nanoseconds elapsed =
        std::chrono::duration_cast<std::chrono::nanoseconds>(finished - started);
    // Nanosecond tick count scaled to seconds for display.
    printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
    return 0;
}
I'm building with the Release configuration in Code::Blocks (because I think I should if I want to measure it properly). The problem is that the measured time is 0 s every time. What's more, the same happens if I wrap the loop in another loop:
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <chrono>
/// One step of a 64-bit linear congruential generator.
///
/// @param LCG_state current generator state
/// @return the successor state; the multiply-add wraps modulo 2^64
uint64_t LCG(uint64_t LCG_state)
{
    const uint64_t scaled = LCG_state * 2862933555777941757;
    return scaled + 1422359891750319841;
}
/// Repeats the 640,000,000-step LCG loop 10,000 times and prints the total
/// elapsed wall-clock time in seconds.
///
/// As in the single-loop version, `w` is assigned but never read.
int main()
{
    using Clock = std::chrono::high_resolution_clock;

    const auto started = Clock::now();

    uint64_t LCG_state = 333;
    uint32_t w;
    for (int pass = 0; pass < 10000; ++pass)
    {
        for (int step = 0; step < 640000000; ++step)
        {
            LCG_state = LCG(LCG_state);
            w = LCG_state >> 32;  // upper 32 bits of the state
        }
    }

    const auto finished = Clock::now();
    const std::chrono::nanoseconds elapsed =
        std::chrono::duration_cast<std::chrono::nanoseconds>(finished - started);
    printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
    return 0;
}
The measured time is still 0 s. In debug mode everything works correctly, but timing a debug build makes no sense, right? In particular, I would like to compare it with, for example, this:
#include <stdint.h>
#include <iostream>
// Two-word generator state; next() reads and rewrites both words.
uint64_t s[2] = {5, 11};
// Global slot for a generated value (distinct from the local sum inside next()).
uint64_t result;

/// Produces the next 64-bit value and advances the two-word state `s`.
///
/// @return the sum of the two state words as they were on entry
uint64_t next(void) {
    const uint64_t previous = s[0];
    const uint64_t current = s[1];
    const uint64_t mixed = previous ^ (previous << 23); // a
    s[0] = current;
    s[1] = mixed ^ current ^ (mixed >> 18) ^ (current >> 5); // b, c
    return current + previous;
}
/// Calls next() 160,000,000 times, storing each value in the global `result`.
/// Nothing is printed; the output statements were left disabled by the author.
int main()
{
    int remaining = 160000000;
    while (remaining > 0)
    {
        result = next();
        --remaining;
    }
    return 0;
}
I want to know what is faster. How to measure it right? Why is the execution time 0 seconds, does the code not execute at all?

You can add an empty asm statement dependent on the variable w
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <chrono>
/// Computes the next state of a 64-bit linear congruential generator.
///
/// @param LCG_state current generator state
/// @return `LCG_state * a + c`, wrapping modulo 2^64
uint64_t LCG(uint64_t LCG_state)
{
    const uint64_t a = 2862933555777941757;
    const uint64_t c = 1422359891750319841;
    uint64_t successor = LCG_state;
    successor *= a;
    successor += c;
    return successor;
}
/// Times 640,000,000 LCG steps and prints the elapsed wall-clock time.
///
/// Each iteration passes `w` through an empty inline-asm statement as a
/// read/write operand, so the compiler has to treat `w` as used and modified.
int main()
{
    using Clock = std::chrono::high_resolution_clock;

    const auto started = Clock::now();

    uint64_t LCG_state = 333;
    uint32_t w;
    for (int step = 0; step != 640000000; ++step)
    {
        LCG_state = LCG(LCG_state);
        w = LCG_state >> 32;  // upper 32 bits of the state
        __asm__ volatile("" : "+g" (w) : :);
    }

    const auto finished = Clock::now();
    const std::chrono::nanoseconds elapsed =
        std::chrono::duration_cast<std::chrono::nanoseconds>(finished - started);
    printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
    return 0;
}
This is opaque to the compiler and will prevent the loop from being optimized out

Related

How to improve the performance of code when dealing with large input in C++?

How would it be possible to make this code run faster in C++. The code takes a lot of time to run. The purpose is to determine how many gates are required to handle a prescribed
arrivals-and-departures schedule.
#include <vector>
// One scheduled aircraft: when it arrives and when it leaves, in seconds.
struct Airplane {
    int arrival_time_seconds;
    int departure_time_seconds;
};

// Holds a copy of an arrivals/departures schedule and reports its peak load.
class Schedule {
public:
    Schedule(const std::vector<Airplane>& airplanes) : airplanes_(airplanes) {}

    // Returns the largest NumberOfPlanes() value observed at any plane's
    // arrival time, or 0 for an empty schedule. Quadratic in the number of
    // planes: every arrival triggers a full scan.
    int MaximumNumberOfPlanes() const {
        int busiest = 0;
        for (std::vector<Airplane>::size_type idx = 0; idx < airplanes_.size(); ++idx) {
            const int present = NumberOfPlanes(airplanes_[idx].arrival_time_seconds);
            busiest = (present > busiest) ? present : busiest;
        }
        return busiest;
    }

private:
    // Counts planes with arrival < time_seconds <= departure. The arrival
    // bound is strict, so a plane arriving exactly at time_seconds is not
    // counted.
    int NumberOfPlanes(int time_seconds) const {
        int present = 0;
        for (const Airplane& candidate : airplanes_) {
            const bool already_arrived = candidate.arrival_time_seconds < time_seconds;
            const bool not_yet_departed = time_seconds <= candidate.departure_time_seconds;
            if (already_arrived && not_yet_departed) {
                ++present;
            }
        }
        return present;
    }

    const std::vector<Airplane> airplanes_;
};
A lot of people stated that this can be made O(N), and it is possible to some extent. At least I was able to make it O(max(N,86400)) which is better than your version for N>294 and better than a O(NlogN) for N>6788.
I assume that if a plane departs the next day it has a departure_time_seconds = 86400 (the number of seconds in a day), while all arrival_time_seconds are lower than 86400.
You can compile a vector of the change in the number of planes in O(N) and then use it to compute the current number of planes in the airport at every second in O(86400):
int MaximumNumberOfPlanes2() const {
int delta[24 * 60 * 60 + 1] = { 0 };
for (const Airplane& x : airplanes_) {
delta[x.arrival_time_seconds]++;
delta[x.departure_time_seconds]--;
}
int rv = 0;
int np = 0;
for (int i = 0; i < 24 * 60 * 60; ++i) {
np += delta[i];
rv = std::max(rv, np);
}
return rv;
}
A test program with some timing:
#include <vector>
#include <iostream>
#include <fstream>
#include <random>
#include <chrono>
#include <queue>
// Builds 100000 random planes, then times MaximumNumberOfPlanes() against
// MaximumNumberOfPlanes2() on the same schedule and prints both results.
int main()
{
    using namespace std;
    using namespace std::chrono;

    default_random_engine eng;
    uniform_int_distribution<int> arr_dist(0, 24*60*60);
    gamma_distribution<double> dep_dist(5, 3);

    const int kPlaneCount = 100000;
    std::vector<Airplane> a;
    int generated = 0;
    while (generated < kPlaneCount) {
        const int arrival = arr_dist(eng);
        // Stay is 20 minutes plus a gamma-distributed number of minutes.
        const long stay_minutes = 20 + lround(dep_dist(eng));
        int departure = arrival + stay_minutes * 60;
        // Departures are clamped to the end of the day.
        departure = min(departure, 24*60*60);
        a.push_back({ arrival, departure });
        ++generated;
    }

    Schedule s(a);

    {
        const steady_clock::time_point start = steady_clock::now();
        const int mnp = s.MaximumNumberOfPlanes();
        const steady_clock::time_point stop = steady_clock::now();
        const duration<double> elapsed = stop - start;
        std::cout << "MaximumNumberOfPlanes : " << mnp << " - Elapsed: " << elapsed.count() << " s\n";
    }

    {
        const steady_clock::time_point start = steady_clock::now();
        const int mnp = s.MaximumNumberOfPlanes2();
        const steady_clock::time_point stop = steady_clock::now();
        const duration<double> elapsed = stop - start;
        std::cout << "MaximumNumberOfPlanes2: " << mnp << " - Elapsed: " << elapsed.count() << " s\n";
    }

    return 0;
}
This gives (on my laptop):
MaximumNumberOfPlanes : 2572 - Elapsed: 48.8979 s
MaximumNumberOfPlanes2: 2572 - Elapsed: 0.0010778 s

std::chrono - fixed time step loop

I'm trying to make a fixed time step loop using <chrono>.
This is my code:
#include <iostream>
#include <chrono>
// Accumulator-based loop: each outer pass adds the time elapsed since `start`
// to `accumulator`, and the inner loop prints one counter value for every
// `timePerFrame` seconds accumulated.
int main()
{
    using Clock = std::chrono::steady_clock;
    using Seconds = std::chrono::duration<double>;

    Clock::time_point start;
    const double timePerFrame = 1.0 / 60.0;
    double accumulator = 0.0;
    int i = 0;
    for (;;)
    {
        start = Clock::now();
        while (!(accumulator < timePerFrame))
        {
            std::cout << ++i << std::endl;
            accumulator -= timePerFrame;
            //update();
        }
        const Seconds spent = std::chrono::duration_cast<Seconds>(Clock::now() - start);
        accumulator += spent.count();
        //render();
    }
    return 0;
}
The value of variable "i" is printed fewer than 60 times a second. The same thing happens when I change "timePerFrame" to "1.0". What is wrong with it?
#include <iostream>
#include <chrono>
#include <thread>
// Prints a counter once per 1/60 s by sleeping until an absolute deadline
// that is advanced by exactly one frame on every pass.
int main()
{
    namespace chr = std::chrono;
    using Framerate = chr::duration<chr::steady_clock::rep, std::ratio<1, 60>>;

    const Framerate one_frame{1};
    auto next = chr::steady_clock::now() + one_frame;
    int i = 0;
    for (;;)
    {
        std::cout << ++i << std::endl;
        //update();
        std::this_thread::sleep_until(next);
        next += one_frame;
        //render();
    }
    return 0;
}
Here's the same thing with a busy loop:
// Same fixed-rate loop as the sleeping version, but it spins on the clock
// until the deadline passes instead of sleeping.
int main()
{
    namespace chr = std::chrono;
    using Framerate = chr::duration<chr::steady_clock::rep, std::ratio<1, 60>>;

    const Framerate one_frame{1};
    auto next = chr::steady_clock::now() + one_frame;
    int i = 0;
    for (;;)
    {
        std::cout << ++i << std::endl;
        //update();
        while (chr::steady_clock::now() < next)
        {
            // busy-wait
        }
        next += one_frame;
        //render();
    }
    return 0;
}

Vectorizing a program increases runtime

I am asked to vectorize a larger program. Before I started with the big program I wanted to see the effect of vectorization in isolated case. For this I created two programs that should show the idea of the outstanding transformation. One with an array of structs (no vec) and struct of arrays (with vec). I expected that the soa would outperform the aos by far, but it doesn't.
measured program loop A
// Array-of-structs kernel: each iteration reads fields a and b of one element
// and stores their sum in that element's field c.
for (int i = 0; i < NUM; i++) {
ptr[i].c = ptr[i].a + ptr[i].b;
}
full program:
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Element type of the array-of-structs benchmark: the timed loop computes
// c = a + b for every element.
struct myStruct {
double a, b, c;
};
#define NUM 100000000
high_resolution_clock::time_point t1, t2, t3;
int main(int argc, char* argsv[]) {
struct myStruct *ptr = (struct myStruct *) malloc(NUM * sizeof(struct myStruct));
for (int i = 0; i < NUM; i++) {
ptr[i].a = i;
ptr[i].b = 2 * i;
}
t1 = high_resolution_clock::now();
for (int i = 0; i < NUM; i++) {
ptr[i].c = ptr[i].a + ptr[i].b;
}
t2 = high_resolution_clock::now();
long dur = duration_cast<microseconds>( t2 - t1 ).count();
cout << "took "<<dur << endl;
double sum = 0;
for (int i = 0; i < NUM; i++) {
sum += ptr[i].c;
}
cout << "sum is "<< sum << endl;
}
measured program loop B
// Struct-of-arrays kernel: element-wise C = A + B over three separate arrays,
// annotated with `#pragma simd`.
#pragma simd
for (int i = 0; i < NUM; i++) {
C[i] = A[i] + B[i];
}
full program:
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
#include <omp.h>
#include <chrono>
using namespace std;
using namespace std::chrono;
#define NUM 100000000
high_resolution_clock::time_point t1, t2, t3;
int main(int argc, char* argsv[]) {
double *A = (double *) malloc(NUM * sizeof(double));
double *B = (double *) malloc(NUM * sizeof(double));
double *C = (double *) malloc(NUM * sizeof(double));
for (int i = 0; i < NUM; i++) {
A[i] = i;
B[i] = 2 * i;
}
t1 = high_resolution_clock::now();
#pragma simd
for (int i = 0; i < NUM; i++) {
C[i] = A[i] + B[i];
}
t2 = high_resolution_clock::now();
long dur = duration_cast<microseconds>( t2 - t1 ).count();
cout << "Aos "<<dur << endl;
double sum = 0;
for (int i = 0; i < NUM; i++) {
sum += C[i];
}
cout << "sum "<<sum;
}
I compile with
icpc vectorization_aos.cpp -qopenmp --std=c++11 -cxxlib=/lrz/mnt/sys.x86_64/compilers/gcc/4.9.3/
icpc (v16)
compiled and executed on an Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz
in my test cases program A takes around 300ms, B 350ms. If I add unnecessary additional data to the struct in A it becomes increasingly slower (as more memory has to be loaded)
the -O3 flag does not have any impact on run-time
removing the #pragma simd directive also has no impact. So either it's auto-vectorized or my vectorization does not work at all.
Questions:
am I missing something? Is this the way how one would vectorize a program?
Why is program 2 slower? Maybe the program is both times just memory bandwidth bound and I need to increase the computation density?
Are there programs/code snippets that show the impact of vectorization better, and how can I verify that my program is actually executed vectorized?

Avoid blas when involving temporary memory allocation?

I have a program that computes the matrix product x'Ay repeatedly. Is it better practice to compute this by making calls to MKL's blas, i.e. cblas_dgemv and cblas_ddot, which requires allocating memory to a temporary vector, or is better to simply take the sum of x_i * a_ij * y_j? In other words, does MKL's blas theoretically add any value?
I benchmarked this for my laptop. There was virtually no difference in each of the tests, other than g++_no_blas performed twice as poorly as the other tests (why?). There was also no difference between O2, O3 and Ofast.
g++_blas_static 57ms
g++_blas_dynamic 58ms
g++_no_blas 100ms
icpc_blas_static 57ms
icpc_blas_dynamic 58ms
icpc_no_blas 58ms
util.h
#ifndef UTIL_H
#define UTIL_H
#include <random>
#include <memory>
#include <iostream>
// Small helper bundling a default-seeded random engine with a uniform
// distribution built from the bounds (0.0, 1.0), plus factories for
// random-filled buffers.
struct rng
{
    rng() : unif(0.0, 1.0)
    {
    }

    std::default_random_engine re;
    std::uniform_real_distribution<double> unif;

    // Draws the next value from the uniform distribution.
    double rand_double()
    {
        return unif(re);
    }

    // Returns a freshly allocated N*N row-major matrix of random values.
    // The element count is computed in std::size_t: in `unsigned` arithmetic
    // N * N wraps once N exceeds 65535 (with 32-bit unsigned), which would
    // under-allocate the buffer.
    std::unique_ptr<double[]> generate_square_matrix(const unsigned N)
    {
        const std::size_t side = N;
        const std::size_t count = side * side;
        std::unique_ptr<double[]> p(new double[count]);
        // Filling linearly visits elements in the same order as a row-by-row
        // (i, j) traversal, so the generated sequence is unchanged.
        for (std::size_t k = 0; k < count; ++k)
        {
            p[k] = rand_double();
        }
        return p;
    }

    // Returns a freshly allocated vector of N random values.
    std::unique_ptr<double[]> generate_vector(const unsigned N)
    {
        std::unique_ptr<double[]> p(new double[N]);
        for (unsigned i = 0; i < N; ++i)
        {
            p[i] = rand_double();
        }
        return p;
    }
};
#endif // UTIL_H
main.cpp
#include <iostream>
#include <iomanip>
#include <memory>
#include <chrono>
#include "util.h"
#include "mkl.h"
// Computes x' * A * y for a row-major n-by-n matrix A using BLAS:
// first temp = A * y via cblas_dgemv, then dot(temp, x) via cblas_ddot.
//
// The scratch vector lives on the heap. The previous `double temp[n]` was a
// variable-length array, which is not standard C++ and places n doubles on
// the stack.
double vtmv_blas(double* x, double* A, double* y, const unsigned n)
{
    std::unique_ptr<double[]> temp(new double[n]);
    // beta is 0.0, so the uninitialised contents of temp are not read.
    cblas_dgemv(CblasRowMajor, CblasNoTrans, n, n, 1.0, A, n, y, 1, 0.0, temp.get(), 1);
    return cblas_ddot(n, temp.get(), 1, x, 1);
}
// Computes x' * A * y for a row-major n-by-n matrix A with plain loops,
// accumulating x[i] * A[i][j] * y[j] over all (i, j) in row-major order.
//
// The element offset is computed in std::size_t: in `unsigned` arithmetic
// i * n + j wraps for large n (n > 65535 with 32-bit unsigned) and would
// read the wrong elements.
double vtmv_non_blas(double* x, double* A, double* y, const unsigned n)
{
    const std::size_t dim = n;
    double r = 0;
    for (std::size_t i = 0; i < dim; ++i)
    {
        const double* row = A + i * dim;
        for (std::size_t j = 0; j < dim; ++j)
        {
            r += x[i] * row[j] * y[j];
        }
    }
    return r;
}
// Benchmark driver: builds a random N x N matrix and two random vectors,
// times a single call to vtmv_blas and prints the result and the elapsed
// milliseconds.
// NOTE(review): the closing brace of main is missing from this listing.
int main()
{
// Print floating-point values in fixed notation with two decimals.
std::cout << std::fixed;
std::cout << std::setprecision(2);
constexpr unsigned N = 10000;
rng r;
std::unique_ptr<double[]> A = r.generate_square_matrix(N);
std::unique_ptr<double[]> x = r.generate_vector(N);
std::unique_ptr<double[]> y = r.generate_vector(N);
// Only the vtmv_blas call sits between the two clock reads.
// NOTE(review): system_clock is not guaranteed to be monotonic; steady_clock
// is the usual choice for measuring intervals.
auto start = std::chrono::system_clock::now();
const double prod = vtmv_blas(x.get(), A.get(), y.get(), N);
auto end = std::chrono::system_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
end - start);
std::cout << "Result: " << prod << std::endl;
std::cout << "Time (ms): " << duration.count() << std::endl;
GCC without BLAS performs poorly because it does not use vectorized SIMD instructions, while all the others do. icpc will auto-vectorize your loop.
You don't show your matrix size, but generally gemv is memory bound. As the matrix is much larger than a temp vector, eliminating it may not be able to increase the performance a lot.

Fastest way to determine whether elements of a vector y occur in a vector x

I have the following problem: I have two vectors x and y of type double that are increasingly sorted and I would like to obtain a vector z indicating whether an element of y is present in x. Up to now, I have used std::binary_search in a for-loop as illustrated below, but I think there should be a faster way making use of the fact that also x is sorted?
The issue is that this needs to be super fast as it turns out to be the bottleneck in my code.
For those familiar with R, I need an equivalent to match(y, x, nomatch = 0L) > 0L.
#include <iostream>
#include <algorithm>
#include <vector>
// For every element of y, runs a binary search over the whole of x and
// prints the resulting 0/1 flags separated by spaces.
int main() {
    using namespace std;

    vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

    vector<bool> z(y.size());
    for (vector<bool>::size_type k = 0; k < y.size(); ++k) {
        z[k] = binary_search(x.begin(), x.end(), y[k]);
    }

    for (bool flag : z) {
        cout << flag << " ";
    }
    return 0;
}
EDIT
Here are representative sample data for my problem:
#include <iostream>
#include <algorithm>
#include <vector>
#include <cstdlib>
#include <ctime>
// function generator:
// Returns the next std::rand() value divided by 10e+7 (i.e. 1e8).
double RandomNumber ()
{
    const int raw = std::rand();
    return raw / 10e+7;
}
// Builds representative test data: x holds n random values, y holds n random
// values followed by a copy of x; both are then sorted ascending.
int main() {
    using namespace std;

    std::srand ( unsigned ( std::time(0) ) );

    // 5000 is representative
    const int n = 5000;

    std::vector<double> x (n);
    std::generate (x.begin(), x.end(), RandomNumber);

    std::vector<double> y (n);
    std::generate (y.begin(), y.end(), RandomNumber);
    // Append every element of x so that each x value also occurs in y.
    y.insert(y.end(), x.begin(), x.end());

    std::sort(x.begin(), x.end());
    std::sort(y.begin(), y.end());
    return 0;
}
You can use std::set_intersection:
#include <vector>
#include <algorithm>
#include <iterator>
#include <iostream>
// Prints the values common to the sorted vectors x and y, each followed by a
// space, as computed by std::set_intersection.
int main()
{
    const std::vector<double> x {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    const std::vector<double> y {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

    std::vector<double> z;
    std::set_intersection(x.begin(), x.end(),
                          y.begin(), y.end(),
                          std::back_inserter(z));

    for (const double value : z) {
        std::cout << value << " ";
    }
}
Edit
To address Dieter Lücking point in the comments, here is a version that more closely matches R's match function:
#include <vector>
#include <deque>
#include <algorithm>
#include <iterator>
#include <functional>
#include <memory>
#include <iostream>
// For each element of the sorted vector y, reports whether an equivalent
// element exists in the sorted vector x.
//
// @param y sorted values to look up
// @param x sorted values to search in
// @return one flag per element of y, in y's order
//
// std::set_intersection copies its matches from the first range (y), so
// storing them as reference_wrappers lets each match be mapped back to its
// position in y by address.
template <typename T>
std::deque<bool> match(const std::vector<T>& y, const std::vector<T>& x)
{
    std::vector<std::reference_wrapper<const T>> z {};
    z.reserve(std::min(y.size(), x.size()));
    std::set_intersection(std::cbegin(y), std::cend(y),
                          std::cbegin(x), std::cend(x),
                          std::back_inserter(z));
    std::deque<bool> result(y.size(), false);
    for (const auto& e : z) {
        result[std::distance(std::addressof(y.front()), std::addressof(e.get()))] = true;
    }
    // set_intersection emits only min(m, n) copies of a value that occurs m
    // times in y and n times in x, always the leading ones of the run in y.
    // Propagate the flag along each run of equivalent y values so repeated
    // elements of y are all reported as present. y is sorted, so neighbours
    // are equivalent exactly when !(y[i - 1] < y[i]).
    for (std::size_t i = 1; i < y.size(); ++i) {
        if (result[i - 1] && !(y[i - 1] < y[i])) {
            result[i] = true;
        }
    }
    return result;
}
// Prints the match flags of y against x as a run of 0/1 digits with no
// separator.
int main()
{
    const std::vector<double> x {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    const std::vector<double> y {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

    const std::deque<bool> matches = match(y, x);
    for (const bool flag : matches) {
        std::cout << flag;
    }
}
I picked up all your codes, Dieter timing sample and the sample data of 5000 random doubles of the OP to perform a more complete timing of all the alternatives. This is the code:
#include <chrono>
#include <iostream>
#include <algorithm>
#include <vector>
#include <iterator>
#include <cstdlib>
#include <ctime>
#include <assert.h>
#include <deque>
#include <functional>
#include <memory>
using namespace std;
// Returns the next std::rand() value divided by 10e+7 (i.e. 1e8).
double RandomNumber ()
{
    const int raw = std::rand();
    return raw / 10e+7;
}
// For each element of the sorted vector y, reports whether an equivalent
// element exists in the sorted vector x.
//
// @param y sorted values to look up
// @param x sorted values to search in
// @return one flag per element of y, in y's order
//
// std::set_intersection copies its matches from the first range (y), so
// storing them as reference_wrappers lets each match be mapped back to its
// position in y by address.
template <typename T>
std::deque<bool> match(const std::vector<T>& y, const std::vector<T>& x)
{
    std::vector<std::reference_wrapper<const T>> z {};
    z.reserve(std::min(y.size(), x.size()));
    std::set_intersection(y.cbegin(), y.cend(),
                          x.cbegin(), x.cend(),
                          std::back_inserter(z));
    std::deque<bool> result(y.size(), false);
    for (const auto& e : z) {
        result[std::distance(std::addressof(y.front()), std::addressof(e.get()))] = true;
    }
    // set_intersection emits only min(m, n) copies of a value that occurs m
    // times in y and n times in x, always the leading ones of the run in y.
    // Propagate the flag along each run of equivalent y values so repeated
    // elements of y are all reported as present. y is sorted, so neighbours
    // are equivalent exactly when !(y[i - 1] < y[i]).
    for (std::size_t i = 1; i < y.size(); ++i) {
        if (result[i - 1] && !(y[i - 1] < y[i])) {
            result[i] = true;
        }
    }
    return result;
}
// Benchmark driver: for NTESTS repetitions, generates sorted test vectors x
// and y, runs seven "is y[i] present in x" implementations, accumulates the
// nanoseconds each one takes, cross-checks part of the results with assert,
// and finally prints the accumulated totals.
int main() {
const int NTESTS = 10;
// Accumulated nanoseconds per approach, summed over all repetitions.
long long time1 = 0;
long long time2 = 0;
long long time3 = 0;
long long time3_prime = 0;
long long time4 = 0;
long long time5 = 0;
long long time6 = 0;
for (int i = 0; i < NTESTS; ++i){
// The generator is re-seeded from the current time on every repetition.
std::srand ( unsigned ( std::time(0) ) );
// 5000 is representative
int n = 5000;
std::vector<double> x (n);
std::generate (x.begin(), x.end(), RandomNumber);
std::vector<double> y (n);
std::generate (y.begin(), y.end(), RandomNumber);
// Append every element of x to y, so y ends up with 2n elements and every
// x value also occurs in y.
for(std::vector<double>::const_iterator i = x.begin(); i != x.end(); i++) {
y.push_back(*i);
}
std::sort(x.begin(), x.end());
std::sort(y.begin(), y.end());
// One result container per approach, each sized like y (z3_prime is
// assigned from match() later).
vector<bool> z1(y.size());
vector<unsigned char> z2(y.size());
vector<unsigned char> z3(y.size());
std::deque<bool> z3_prime;
vector<bool> z4(y.size());
std::vector<bool> z5(y.size());
std::vector<bool> z6(y.size());
// Original
{
// Baseline: an independent binary search over all of x for every y[i].
auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i != y.size(); ++i) {
z1[i] = binary_search(x.begin(), x.end(), y[i]);
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time1 += duration.count();
}
// Original (replacing vector<bool> by vector<unsigned char>)
{
auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i != y.size(); ++i) {
z2[i] = binary_search(x.begin(), x.end(), y[i]);
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time2 += duration.count();
}
{ // Dieter Lücking set_intersection
// Two-index merge walk; on a match only iy advances, so repeated values in
// y are all flagged.
auto start = std::chrono::high_resolution_clock::now();
size_t ix = 0;
size_t iy = 0;
while(ix < x.size() && iy < y.size())
{
if(x[ix] < y[iy]) ++ix;
else if(y[iy] < x[ix]) ++iy;
else {
z3[iy] = 1;
// ++ix; Not this if one vector is not uniquely sorted
++iy;
}
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time3 += duration.count();
}
// Std::set_intersection
{
// The timed region includes match()'s internal allocations and the
// assignment of the returned deque.
auto start = std::chrono::high_resolution_clock::now();
z3_prime = match(y, x);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time3_prime += duration.count();
}
{ // Ed Heal
// Merge walk that advances both indices on a match.
auto start = std::chrono::high_resolution_clock::now();
int i_x = 0, i_y = 0;
while (i_x < x.size() && i_y < y.size())
{
if (x[i_x] == y[i_y]) {
//cout << "In both" << x[i_x] << endl;
z4[i_y] = true;
++i_x;
++i_y;
} else if (x[i_x] < y[i_y]) {
++i_x;
} else {
z4[i_y] = false;
++i_y;
}
}
/* for (; i_y < y.size(); ++i_y) {
//Empty
} */
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time4 += duration.count();
}
{ // JacquesdeHooge
// Binary search restarted from the previous hit position instead of
// x.begin().
auto start = std::chrono::high_resolution_clock::now();
auto it_x = x.begin();
int i = 0;
for (; i < (int)y.size(); ++i) {
it_x = std::lower_bound(it_x, x.end(), y[i]);
if (it_x == x.end()) break;
z5[i] = *it_x == y[i];
}
// Every y element from index i onwards is marked as not found.
std::fill(z5.begin() + i, z5.end(), false);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time5 += duration.count();
}
{ // Skizz
auto start = std::chrono::high_resolution_clock::now();
vector<double>::iterator a = x.begin(), b = y.begin();
// NOTE(review): i is incremented on every loop step, including steps that
// advance only a; it therefore equals b's offset in y only as long as the
// *a < *b branch is never taken.
int i = 0;
while (a != x.end () && b != y.end ())
{
if (*a == *b) {
z6[i] = true;
++a;
++b;
}
else
{
z6[i] = false;
if (*a < *b)
{
++a;
}
else
{
++b;
}
}
i++;
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time6 += duration.count();
}
// Only the first 5000 flags of each result are compared with the
// binary-search baseline z1.
assert (std::equal(z1.begin(), z1.begin() + 5000, z2.begin()));
assert (std::equal(z1.begin(), z1.begin() + 5000, z3.begin()));
assert (std::equal(z1.begin(), z1.begin() + 5000, z3_prime.begin()));
assert (std::equal(z1.begin(), z1.begin() + 5000, z4.begin()));
assert (std::equal(z1.begin(), z1.begin() + 5000, z5.begin()));
assert (std::equal(z1.begin(), z1.begin() + 5000, z6.begin()));
}
// The printed figures are totals over all NTESTS repetitions, not averages.
cout << "Original - vector<bool>: \t\t" << time1 << " ns\n";
cout << "Original - vector<unsigned char>: \t" << time2 << " ns\n";
cout << "Set intersection (Daniel): \t\t" << time3_prime << " ns\n";
cout << "Set intersection (Dieter Lücking): \t" << time3 << " ns\n";
cout << "Ed Heal: \t\t\t\t" << time4 << " ns\n";
cout << "JackesdeHooge: \t\t\t\t" << time5 << " ns\n";
cout << "Skizz: \t\t\t\t\t" << time6 << " ns\n";
cout << endl;
return 0;
}
My results with g++ 5.2.1, -std=c++11 and -O3:
Original - vector<bool>: 10152069 ns
Original - vector<unsigned char>: 8686619 ns
Set intersection (Daniel): 1768855 ns
Set intersection (Dieter Lücking): 1617106 ns
Ed Heal: 1446596 ns
JackesdeHooge: 3998958 ns
Skizz: 1385193 ns
*Please note Ed Heal and Skizz solutions are essentially the same.
Since both vectors are sorted, you have to apply bin search only on the remainder part of the second vector.
So if, for example, you don't find x[i] before y[j], you're certain you also won't find x[i + 1] before y[j]. To find a match for x[i + 1] it therefore suffices to apply binary search starting at y[j].
Off the top of my head, I can only think of this:-
// Linear merge-style walk over two sorted vectors: a scans x, b scans y, and
// the walk stops as soon as either iterator reaches its end.
vector<double>::iterator a = x.begin(), b = y.begin();
while (a != x.end () && b != y.end ())
{
if (*a == *b)
{
// value is in both containers
// Only a advances here; b is advanced by the else branch on a later pass
// once *a has moved past *b.
++a;
}
else
{
// Advance whichever iterator points at the smaller value.
if (*a < *b)
{
++a;
}
else
{
++b;
}
}
}
Perhaps this algorithm will be better as the two vectors are sorted. The time complexity is linear.
#include <iostream>
#include <algorithm>
#include <vector>
// Flags, for each element of the sorted vector y, whether it occurs in the
// sorted vector x, using a single linear merge-style pass, then prints the
// flags separated by spaces.
int main() {
    using namespace std;
    vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
    // Value-initialised, so every flag starts out false; only hits are written.
    vector<bool> z(y.size());
    vector<double>::size_type i_x = 0, i_y = 0;
    while (i_x < x.size() && i_y < y.size())
    {
        if (x[i_x] == y[i_y]) {
            cout << "In both" << x[i_x] << endl;
            z[i_y] = true;
            // Advance only i_y: y may repeat a value (9.3 appears twice
            // here), and the next y element must still be compared against
            // the same x element. Advancing i_x as well would leave the
            // repeated value unflagged.
            ++i_y;
        } else if (x[i_x] < y[i_y]) {
            ++i_x;
        } else {
            // y[i_y] is smaller than every remaining x value: not present.
            ++i_y;
        }
    }
    // Any y elements left once x is exhausted keep their initial false flag.
    for (vector<bool>::const_iterator i = z.begin(); i != z.end(); ++i)
        cout << *i << " ";
    return 0;
}
An implementation of @JacquesdeHooge's answer:
// Returns one flag per element of y telling whether that value occurs in x.
// Both vectors must be sorted ascending. Each lookup is a lower_bound that
// resumes from where the previous one ended rather than from x.begin().
std::vector<bool> ComputeMatchFlags(const std::vector<double>& x,
                                    const std::vector<double>& y) {
    std::vector<bool> found(y.size(), false);
    std::vector<double>::const_iterator cursor = x.begin();
    for (std::vector<double>::size_type k = 0; k < y.size(); ++k) {
        cursor = std::lower_bound(cursor, x.end(), y[k]);
        if (cursor == x.end()) {
            // x is exhausted: every remaining flag stays false.
            return found;
        }
        found[k] = (*cursor == y[k]);
    }
    return found;
}
When you have found an element (or a place in the array the element would have been), you don't need to consider elements that occur before that any more. So use the result of the previous find instead of x.begin().
Since std::binary_search does not return an iterator, use std::lower_bound instead. Also consider std::find (yes linear search, it might be actually faster, depending on your data).
If this doesn't bring enough improvement, try std::unordered_set instead of an array.
Just a timing of binary search and set intersection with the improvement of using std::vector<unsigned char>:
#include <chrono>
#include <iostream>
#include <algorithm>
#include <vector>
// Times three ways of flagging which elements of y occur in x on a small
// fixed sample, and prints each duration in nanoseconds followed by the
// resulting flags.
int main() {
using namespace std;
// Original
{
vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
// The timed region includes the construction of the result vector z.
auto start = std::chrono::high_resolution_clock::now();
vector<bool> z(y.size());
for (size_t i = 0; i != y.size(); ++i)
z[i] = binary_search(x.begin(), x.end(), y[i]);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
cout << "vector<bool>: " << duration.count() << "ns\n";
// The unsigned() conversion prints each flag as a number.
for (auto i = z.begin(); i != z.end(); ++i)
cout << unsigned(*i) << " ";
cout << '\n';
}
// Original (replacing vector<bool> by vector<unsigned char>)
{
vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
auto start = std::chrono::high_resolution_clock::now();
vector<unsigned char> z(y.size());
for (size_t i = 0; i != y.size(); ++i)
z[i] = binary_search(x.begin(), x.end(), y[i]);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
cout << "vector<unsigned char>: " << duration.count() << "ns\n";
// unsigned() keeps the unsigned char flags from being printed as characters.
for (auto i = z.begin(); i != z.end(); ++i)
cout << unsigned(*i) << " ";
cout << '\n';
}
// Similar to std::set_intersection
{
vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
auto start = std::chrono::high_resolution_clock::now();
vector<unsigned char> z(y.size());
// Two-index merge walk over both sorted vectors; on a match only iy
// advances, so a value repeated in y is flagged every time.
size_t ix = 0;
size_t iy = 0;
while(ix < x.size() && iy < y.size())
{
if(x[ix] < y[iy]) ++ix;
else if(y[iy] < x[ix]) ++iy;
else {
z[iy] = 1;
// ++ix; Not this if one vector is not uniquely sorted
++iy;
}
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
cout << "set intersection: " << duration.count() << "ns\n";
for (auto i = z.begin(); i != z.end(); ++i)
cout << unsigned(*i) << " ";
cout << '\n';
}
return 0;
}
Compiled with g++ -std=c++11 -O3 (g++ 4.84) gives:
vector<bool>: 3622ns
0 0 1 0 1 0 1 1 0
vector<unsigned char>: 1635ns
0 0 1 0 1 0 1 1 0
set intersection: 1299ns
0 0 1 0 1 0 1 1 0