I have a program that computes the matrix product x'Ay repeatedly. Is it better practice to compute this by making calls to MKL's blas, i.e. cblas_dgemv and cblas_ddot, which requires allocating memory to a temporary vector, or is better to simply take the sum of x_i * a_ij * y_j? In other words, does MKL's blas theoretically add any value?
I benchmarked this for my laptop. There was virtually no difference in each of the tests, other than g++_no_blas performed twice as poorly as the other tests (why?). There was also no difference between O2, O3 and Ofast.
g++_blas_static 57ms
g++_blas_dynamic 58ms
g++_no_blas 100ms
icpc_blas_static 57ms
icpc_blas_dynamic 58ms
icpc_no_blas 58ms
util.h
#ifndef UTIL_H
#define UTIL_H
#include <random>
#include <memory>
#include <iostream>
struct rng
{
rng() : unif(0.0, 1.0)
{
}
std::default_random_engine re;
std::uniform_real_distribution<double> unif;
double rand_double()
{
return unif(re);
}
std::unique_ptr<double[]> generate_square_matrix(const unsigned N)
{
std::unique_ptr<double[]> p (new double[N * N]);
for (unsigned i = 0; i < N; ++i)
{
for (unsigned j = 0; j < N; ++j)
{
p.get()[i*N + j] = rand_double();
}
}
return p;
}
std::unique_ptr<double[]> generate_vector(const unsigned N)
{
std::unique_ptr<double[]> p (new double[N]);
for (unsigned i = 0; i < N; ++i)
{
p.get()[i] = rand_double();
}
return p;
}
};
#endif // UTIL_H
main.cpp
#include <iostream>
#include <iomanip>
#include <memory>
#include <chrono>
#include "util.h"
#include "mkl.h"
double vtmv_blas(double* x, double* A, double* y, const unsigned n)
{
double temp[n];
cblas_dgemv(CblasRowMajor, CblasNoTrans, n, n, 1.0, A, n, y, 1, 0.0, temp, 1);
return cblas_ddot(n, temp, 1, x, 1);
}
double vtmv_non_blas(double* x, double* A, double* y, const unsigned n)
{
double r = 0;
for (unsigned i = 0; i < n; ++i)
{
for (unsigned j = 0; j < n; ++j)
{
r += x[i] * A[i*n + j] * y[j];
}
}
return r;
}
int main()
{
std::cout << std::fixed;
std::cout << std::setprecision(2);
constexpr unsigned N = 10000;
rng r;
std::unique_ptr<double[]> A = r.generate_square_matrix(N);
std::unique_ptr<double[]> x = r.generate_vector(N);
std::unique_ptr<double[]> y = r.generate_vector(N);
auto start = std::chrono::system_clock::now();
const double prod = vtmv_blas(x.get(), A.get(), y.get(), N);
auto end = std::chrono::system_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
end - start);
std::cout << "Result: " << prod << std::endl;
std::cout << "Time (ms): " << duration.count() << std::endl;
GCC no blas is poor because it does not use vectorized SMID instructions, while others all do. icpc will auto-vectorize you loop.
You don't show your matrix size, but generally gemv is memory bound. As the matrix is much larger than a temp vector, eliminating it may not be able to increase the performance a lot.
I have the following problem: I have two vectors x and y of type double that are increasingly sorted and I would like to obtain a vector z indicating whether an element of y is present in x. Up to now, I have used std::binary_search in a for-loop as illustrated below, but I think there should be a faster way making use of the fact that also x is sorted?
The issue is that this needs to be super fast as it turns out to be the bottleneck in my code.
For those familiar with R, I need an equivalent to match(y, x, nomatch = 0L) > 0L.
#include <iostream>
#include <algorithm>
#include <vector>
int main() {
using namespace std;
vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
vector<bool> z(y.size());
for (int i = 0; i != y.size(); ++i)
z[i] = binary_search(x.begin(), x.end(), y[i]);
for (vector<bool>::const_iterator i = z.begin(); i != z.end(); ++i)
cout << *i << " ";
return 0;
}
EDIT
Here are representative sample data for my problem:
#include <iostream>
#include <algorithm>
#include <vector>
#include <cstdlib>
#include <ctime>
// function generator:
double RandomNumber () { return (std::rand() / 10e+7); }
int main() {
using namespace std;
std::srand ( unsigned ( std::time(0) ) );
// 5000 is representative
int n = 5000;
std::vector<double> x (n);
std::generate (x.begin(), x.end(), RandomNumber);
std::vector<double> y (n);
std::generate (y.begin(), y.end(), RandomNumber);
for(std::vector<double>::const_iterator i = x.begin(); i != x.end(); i++) {
y.push_back(*i);
}
std::sort(x.begin(), x.end());
std::sort(y.begin(), y.end());
return 0;
}
You can use std::set_itersection:
#include <vector>
#include <algorithm>
#include <iterator>
#include <iostream>
int main()
{
std::vector<double> x {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
std::vector<double> y {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
std::vector<double> z {};
std::set_intersection(std::cbegin(x), std::cend(x),
std::cbegin(y), std::cend(y),
std::back_inserter(z));
std::copy(std::cbegin(z), std::cend(z),
std::ostream_iterator<double> {std::cout, " "});
}
Edit
To address Dieter Lücking point in the comments, here is a version that more closely matches R's match function:
#include <vector>
#include <deque>
#include <algorithm>
#include <iterator>
#include <functional>
#include <memory>
#include <iostream>
template <typename T>
std::deque<bool> match(const std::vector<T>& y, const std::vector<T>& x)
{
std::vector<std::reference_wrapper<const T>> z {};
z.reserve(std::min(y.size(), x.size()));
std::set_intersection(std::cbegin(y), std::cend(y),
std::cbegin(x), std::cend(x),
std::back_inserter(z));
std::deque<bool> result(y.size(), false);
for (const auto& e : z) {
result[std::distance(std::addressof(y.front()), std::addressof(e.get()))] = true;
}
return result;
}
int main()
{
std::vector<double> x {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
std::vector<double> y {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
const auto matches = match(y, x);
std::copy(std::cbegin(matches), std::cend(matches),
std::ostream_iterator<bool> {std::cout});
}
I picked up all your codes, Dieter timing sample and the sample data of 5000 random doubles of the OP to perform a more complete timing of all the alternatives. This is the code:
#include <chrono>
#include <iostream>
#include <algorithm>
#include <vector>
#include <iterator>
#include <cstdlib>
#include <ctime>
#include <assert.h>
#include <deque>
#include <functional>
#include <memory>
using namespace std;
double RandomNumber () { return (std::rand() / 10e+7); }
template <typename T>
std::deque<bool> match(const std::vector<T>& y, const std::vector<T>& x)
{
std::vector<std::reference_wrapper<const T>> z {};
z.reserve(std::min(y.size(), x.size()));
std::set_intersection(y.cbegin(), y.cend(),
x.cbegin(), x.cend(),
std::back_inserter(z));
std::deque<bool> result(y.size(), false);
for (const auto& e : z) {
result[std::distance(std::addressof(y.front()), std::addressof(e.get()))] = true;
}
return result;
}
int main() {
const int NTESTS = 10;
long long time1 = 0;
long long time2 = 0;
long long time3 = 0;
long long time3_prime = 0;
long long time4 = 0;
long long time5 = 0;
long long time6 = 0;
for (int i = 0; i < NTESTS; ++i){
std::srand ( unsigned ( std::time(0) ) );
// 5000 is representative
int n = 5000;
std::vector<double> x (n);
std::generate (x.begin(), x.end(), RandomNumber);
std::vector<double> y (n);
std::generate (y.begin(), y.end(), RandomNumber);
for(std::vector<double>::const_iterator i = x.begin(); i != x.end(); i++) {
y.push_back(*i);
}
std::sort(x.begin(), x.end());
std::sort(y.begin(), y.end());
vector<bool> z1(y.size());
vector<unsigned char> z2(y.size());
vector<unsigned char> z3(y.size());
std::deque<bool> z3_prime;
vector<bool> z4(y.size());
std::vector<bool> z5(y.size());
std::vector<bool> z6(y.size());
// Original
{
auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i != y.size(); ++i) {
z1[i] = binary_search(x.begin(), x.end(), y[i]);
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time1 += duration.count();
}
// Original (replacing vector<bool> by vector<unsigned char>)
{
auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i != y.size(); ++i) {
z2[i] = binary_search(x.begin(), x.end(), y[i]);
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time2 += duration.count();
}
{ // Dieter Lücking set_intersection
auto start = std::chrono::high_resolution_clock::now();
size_t ix = 0;
size_t iy = 0;
while(ix < x.size() && iy < y.size())
{
if(x[ix] < y[iy]) ++ix;
else if(y[iy] < x[ix]) ++iy;
else {
z3[iy] = 1;
// ++ix; Not this if one vector is not uniquely sorted
++iy;
}
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time3 += duration.count();
}
// Std::set_intersection
{
auto start = std::chrono::high_resolution_clock::now();
z3_prime = match(y, x);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time3_prime += duration.count();
}
{ // Ed Heal
auto start = std::chrono::high_resolution_clock::now();
int i_x = 0, i_y = 0;
while (i_x < x.size() && i_y < y.size())
{
if (x[i_x] == y[i_y]) {
//cout << "In both" << x[i_x] << endl;
z4[i_y] = true;
++i_x;
++i_y;
} else if (x[i_x] < y[i_y]) {
++i_x;
} else {
z4[i_y] = false;
++i_y;
}
}
/* for (; i_y < y.size(); ++i_y) {
//Empty
} */
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time4 += duration.count();
}
{ // JacquesdeHooge
auto start = std::chrono::high_resolution_clock::now();
auto it_x = x.begin();
int i = 0;
for (; i < (int)y.size(); ++i) {
it_x = std::lower_bound(it_x, x.end(), y[i]);
if (it_x == x.end()) break;
z5[i] = *it_x == y[i];
}
std::fill(z5.begin() + i, z5.end(), false);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time5 += duration.count();
}
{ // Skizz
auto start = std::chrono::high_resolution_clock::now();
vector<double>::iterator a = x.begin(), b = y.begin();
int i = 0;
while (a != x.end () && b != y.end ())
{
if (*a == *b) {
z6[i] = true;
++a;
++b;
}
else
{
z6[i] = false;
if (*a < *b)
{
++a;
}
else
{
++b;
}
}
i++;
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
time6 += duration.count();
}
assert (std::equal(z1.begin(), z1.begin() + 5000, z2.begin()));
assert (std::equal(z1.begin(), z1.begin() + 5000, z3.begin()));
assert (std::equal(z1.begin(), z1.begin() + 5000, z3_prime.begin()));
assert (std::equal(z1.begin(), z1.begin() + 5000, z4.begin()));
assert (std::equal(z1.begin(), z1.begin() + 5000, z5.begin()));
assert (std::equal(z1.begin(), z1.begin() + 5000, z6.begin()));
}
cout << "Original - vector<bool>: \t\t" << time1 << " ns\n";
cout << "Original - vector<unsigned char>: \t" << time2 << " ns\n";
cout << "Set intersection (Daniel): \t\t" << time3_prime << " ns\n";
cout << "Set intersection (Dieter Lücking): \t" << time3 << " ns\n";
cout << "Ed Heal: \t\t\t\t" << time4 << " ns\n";
cout << "JackesdeHooge: \t\t\t\t" << time5 << " ns\n";
cout << "Skizz: \t\t\t\t\t" << time6 << " ns\n";
cout << endl;
return 0;
}
My results with g++ 5.2.1 -std::c++11 and -O3:
Original - vector: 10152069 ns
Original - vector: 8686619 ns
Set intersection (Daniel): 1768855 ns
Set intersection (Dieter Lücking): 1617106 ns
Ed Heal: 1446596 ns
JackesdeHooge: 3998958 ns
Skizz: 1385193 ns
*Please note Ed Heal and Skizz solutions are essentially the same.
Since both vectors are sorted, you have to apply bin search only on the remainder part of the second vector.
So if you e.g. don't find x [i] in before y [j], you're certain you also won't find x [i + 1] before y [j]. In finding a match for x [i + 1] it therefore suffices to apply bin search starting with y [j].
Off the top of my head, I can only think of this:-
vector<double>::iterator a = x.begin(), b = y.begin();
while (a != x.end () && b != y.end ())
{
if (*a == *b)
{
// value is in both containers
++a;
}
else
{
if (*a < *b)
{
++a;
}
else
{
++b;
}
}
}
Perhaps this algorithm will be better as the two vectors are sorted. The time complexity is linear.
#include <iostream>
#include <algorithm>
#include <vector>
int main() {
using namespace std;
vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
vector<bool> z(y.size());
int i_x = 0, i_y = 0;
while (i_x < x.size() && i_y < y.size())
{
if (x[i_x] == y[i_y]) {
cout << "In both" << x[i_x] << endl;
z[i_y] = true;
++i_x;
++i_y;
} else if (x[i_x] < y[i_y]) {
++i_x;
} else {
z[i_y] = false;
++i_y;
}
}
for (; i_y < y.size(); ++i_y) {
//Empty
}
for (vector<bool>::const_iterator i = z.begin(); i != z.end(); ++i)
cout << *i << " ";
return 0;
}
An implementation of #JacquesdeHooge's answer:
std::vector<bool> ComputeMatchFlags(const std::vector<double>& x,
const std::vector<double>& y) {
std::vector<bool> found(y.size());
auto it_x = x.begin();
int i = 0;
for (; i < (int)y.size(); ++i) {
it_x = std::lower_bound(it_x, x.end(), y[i]);
if (it_x == x.end()) break;
found[i] = *it_x == y[i];
}
std::fill(found.begin() + i, found.end(), false);
return found;
}
When you have found an element (or a place in the array the element would have been), you don't need to consider elements that occur before that any more. So use the result of the previous find instead of x.begin().
Since std::binary_search does not return an iterator, use std::lower_bound instead. Also consider std::find (yes linear search, it might be actually faster, depending on your data).
If this doesn't bring enough improvement, try std::unordered_set instead of an array.
Just a timing of binary search and set intersection with the improvement of using std::vector:
#include <chrono>
#include <iostream>
#include <algorithm>
#include <vector>
int main() {
using namespace std;
// Original
{
vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
auto start = std::chrono::high_resolution_clock::now();
vector<bool> z(y.size());
for (size_t i = 0; i != y.size(); ++i)
z[i] = binary_search(x.begin(), x.end(), y[i]);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
cout << "vector<bool>: " << duration.count() << "ns\n";
for (auto i = z.begin(); i != z.end(); ++i)
cout << unsigned(*i) << " ";
cout << '\n';
}
// Original (replacing vector<bool> by vector<unsigned char>)
{
vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
auto start = std::chrono::high_resolution_clock::now();
vector<unsigned char> z(y.size());
for (size_t i = 0; i != y.size(); ++i)
z[i] = binary_search(x.begin(), x.end(), y[i]);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
cout << "vector<unsigned char>: " << duration.count() << "ns\n";
for (auto i = z.begin(); i != z.end(); ++i)
cout << unsigned(*i) << " ";
cout << '\n';
}
// Similar to std::set_intersection
{
vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};
auto start = std::chrono::high_resolution_clock::now();
vector<unsigned char> z(y.size());
size_t ix = 0;
size_t iy = 0;
while(ix < x.size() && iy < y.size())
{
if(x[ix] < y[iy]) ++ix;
else if(y[iy] < x[ix]) ++iy;
else {
z[iy] = 1;
// ++ix; Not this if one vector is not uniquely sorted
++iy;
}
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
cout << "set intersection: " << duration.count() << "ns\n";
for (auto i = z.begin(); i != z.end(); ++i)
cout << unsigned(*i) << " ";
cout << '\n';
}
return 0;
}
Compiled with g++ -std=c++11 -O3 (g++ 4.84) gives:
vector<bool>: 3622ns
0 0 1 0 1 0 1 1 0
vector<unsigned char>: 1635ns
0 0 1 0 1 0 1 1 0
set intersection: 1299ns
0 0 1 0 1 0 1 1 0