Armadillo C++ bad performance ifft - c++

I have current test code
#include <iostream>
#define ARMA_DONT_USE_WRAPPER
#include <armadillo>
using namespace std::complex_literals;
int main()
{
arma::cx_mat testMat { };
testMat.set_size(40, 19586);
auto nPositions = static_cast<arma::sword>(floor(19586/2));
arma::cx_rowvec a_vec {19586, arma::fill::randu};
arma::cx_rowvec b_vec {19586, arma::fill::randu};
arma::cx_rowvec c_vec {19586, arma::fill::randu};
for (size_t nCo=0; nCo < 3; nCo++) {
arma::rowvec d {19586, arma::fill::randu};
for(size_t iDop = 0; iDop < 40; ++iDop)
{
arma::cx_rowvec signalFi = (b_vec % arma::exp(-1i*M_PI*a_vec));
testMat.row(iDop) += arma::ifft(arma::shift(arma::fft(signalFi), nPositions).eval() % c_vec).eval();
}
}
return 0;
}
I am trying to perform some computation.
StopWatch shared performance for each iteration around : 300 ms, which is bad performance for my needs.
Is someone which can explain what i am doing wrong or some tricks how can i increase the performance.
I used .eval() to perform 'eager' evaluation.
gcc 11.2
armadillo 10.8.2
Release Mode -O3
Updated Version. Is possible to redesign the ifft function ?
Test Code
#include <iostream>
#include <fftw3.h>
#include <armadillo>
#include "StopWatch.h"
using namespace std;
inline arma::cx_mat ifftshift(arma::cx_mat const &axx)
{
return arma::shift(axx, -ceil(axx.n_rows/2), 0);
}
void ifft(arma::cx_mat &inMat, arma::cx_mat &outMat)
{
size_t N = inMat.n_rows;
size_t n_cols = inMat.n_cols;
for (size_t index = 0; index < n_cols; ++index)
{
fftw_complex *in1 = reinterpret_cast<fftw_complex *>(inMat.colptr(index));
fftw_complex *out1 = reinterpret_cast<fftw_complex *>(outMat.colptr(index));
fftw_plan pl_ifft_cx1 = fftw_plan_dft_1d(N, in1, out1, FFTW_BACKWARD, FFTW_ESTIMATE);
fftw_execute_dft(pl_ifft_cx1, in1, out1);
}
outMat /= N;
}
int main()
{
arma::cx_mat B;
B << std::complex<double>(+1.225e-01,+8.247e-01) << std::complex<double>(+4.078e-01,+5.632e-01) << std::complex<double>(+8.866e-01,+8.386e-01) << arma::endr
<< std::complex<double>(+5.958e-01,+1.015e-01) << std::complex<double>(+7.857e-01,+4.267e-01) << std::complex<double>(+7.997e-01,+9.176e-01) << arma::endr
<< std::complex<double>(+1.877e-01,+3.378e-01) << std::complex<double>(+2.921e-01,+9.651e-01) << std::complex<double>(+1.056e-01,+6.901e-01) << arma::endr
<< std::complex<double>(+2.322e-01,+6.990e-01) << std::complex<double>(+1.547e-01,+4.256e-01) << std::complex<double>(+9.094e-01,+1.194e-01) << arma::endr
<< std::complex<double>(+3.917e-01,+3.886e-01) << std::complex<double>(+2.166e-01,+4.962e-01) << std::complex<double>(+9.777e-01,+4.464e-01) << arma::endr;
arma::cx_mat output(5,3);
arma::cx_mat shifted = ifftshift(B);
arma::cx_mat arma_result = arma::ifft(shifted);
B.print("B");
arma_result.print("arma_result");
ifft(shifted, output);
output.print("output");
return 0;
}

I just tried a similar operation with my own library and, according to my measurements, you are correct that each iteration of the loop shouldn't take more than 1 millisecond (instead of 300 ms).
This is the equivalent code, sorry that this is not an Armadillo answer, I am just pointing out what are the concrete goals for minimizing operations and allocations.
#include<multi/adaptors/fftw.hpp>
#include<multi/array.hpp>
namespace fftw = multi::fftw;
int main() {
multi::array<std::complex<double>, 1> const arr = n_random_complex<double>(19586);
multi::array<std::complex<double>, 1> res(arr.extensions()); // output allocated only once
fftw::plan fdft{arr, res, fftw::forward}; // fftw plan and internal buffers allocated only once
auto const N = 40;
for(int i = 0; i != N; ++i) { // each iteration takes ~1ms in an intel-i7
fdft(arr.base(), res.base()); // fft operation with precalculated plan
std::rotate(res.begin(), res.begin() + res.size()/2, res.end()); // rotation (shift on size/2) done in place, no allocation either
}
}
The full code and library is here: https://gitlab.com/correaa/boost-multi/-/blob/master/adaptors/fftw/test/shift.cpp#L45-58 (the extra code is for the timing measurement).
What is also telling is that I tried to do all the possible mistakes to pessimize the code.
To try to mimic what I think Armadillo is doing "wrong"... allocating inside the loop and making copies all the time. But what I get is that each iteration take 1.5 milliseconds.
My conclusion is that something is terribly wrong in your Armadillo usage or in the library itself.
multi::array<std::complex<double>, 1> const arr = n_random_complex<double>(19586); BOOST_REQUIRE(arr.size() == 19586);
auto const N = 40;
for(int i = 0; i != N; ++i) {
multi::array<std::complex<double>, 1> res(arr.extensions(), 0.);
fftw::plan fdft{arr, res, fftw::forward};
fdft(arr.base(), res.base());
multi::array<std::complex<double>, 1> res_copy(arr.extensions(), 0.);
std::rotate_copy(res.begin(), res.begin() + res.size()/2, res.end(), res_copy.begin());
}

Related

Eliminate Intermediate Eigen Arrays

Does Eigen make any intermediate array for calculation of x or Eigen just put the values into simd registers and do the calculation?
In general, how to know how many intermediates did Eigen make?
Will Eigen allocate new memory for the intermediates in every cycle of the loop?
Is there anyway to ensure that eigen would not make any intermediate? Does it have a macro like "EIGEN_NO_INTERMEDIATE"?
#include <Eigen/Eigen>
#include <iostream>
using namespace Eigen;
template<typename T>
void fill(T& x) {
for (int i = 0; i < x.size(); ++i) x.data()[i] = i + 1;
}
int main() {
int n = 10; // n is actually about 400
ArrayXXf x(n, n);
ArrayXf y(n);
fill(x);
fill(y);
for (int i = 0; i < 10; ++i) { // many cycles
x = x * ((x.colwise() / y).rowwise() / y.transpose()).exp();
}
std::cout << x << "\n";
}
You can add a hook into the DenseStorage constructor like so:
#include <iostream>
static long int nb_temporaries;
inline void on_temporary_creation(long int size) {
if(size!=0) nb_temporaries++;
}
// must be defined before including any Eigen header!
#define EIGEN_DENSE_STORAGE_CTOR_PLUGIN { on_temporary_creation(size); }
#define VERIFY_EVALUATION_COUNT(XPR,N) {\
nb_temporaries = 0; \
XPR; \
if(nb_temporaries!=N) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
}
#include <Eigen/Core>
using namespace Eigen;
template<typename T>
void fill(T& x) { for(int i=0; i<x.size(); ++i) x(i)= i+1; }
int main() {
int n=10;
ArrayXXf x(n,n); fill(x);
ArrayXf y(n); fill(y);
for(int i=0; i<10; ++i)
{
VERIFY_EVALUATION_COUNT( x = x * ((x.colwise()/y).rowwise()/y.transpose()).exp(), 0);
}
std::cout << x << '\n';
}
Essentially, this is what Eigen does in its testsuite at some points:
See here for the original definition in the testsuite and here for an example usage in the testsuite.
Alternatively, if you only care about intermediate memory allocations, you can try the macro EIGEN_RUNTIME_NO_MALLOC -- this would allow fixed-sized expressions to evaluate into temporaries, as they would only allocate on the stack.

Casting a BigMatrix/array to a Armadillo matrix

I have a big.matrix that I want to cast to an arma::Mat so that I can use the linear algebra functionality of Armadillo.
However, I can't seem to get the cast to work.
As far as I can gather from reading, both are internally stored in column major format, and the actual matrix component of a big.matrix is simply a pointer of type <T> (char/short/int/double)
The following code compiles, but the cast to the arma::Mat doesn't work, segfaulting when iterating over the cast matrix.
#include <RcppArmadillo.h>
using namespace Rcpp;
// [[Rcpp::depends(BH, bigmemory, RcppArmadillo)]]
#include <bigmemory/BigMatrix.h>
template <typename T>
void armacast(const arma::Mat<T>& M) {
// This segfaults
for (int j = 0; j < 2; j++) {
for (int i = 0; i < 2; i++) {
std::cout << M.at(j, i) << std::endl;
}
}
std::cout << "Success!" << std::endl;
}
// [[Rcpp::export]]
void armacast(SEXP pDat) {
XPtr<BigMatrix> xpDat(pDat);
if (xpDat->matrix_type() == 8) {
// I can iterate over this *mat and get sensible output.
double *mat = (double *)xpDat->matrix();
for (int j = 0; j < 2; j++) {
for (int i = 0; i < 2; i++) {
std::cout << *mat + 2 * (j + 0) + i << std::endl;
}
}
armacast((const arma::Mat<double> &)mat);
} else {
std::cout << "Not implemented yet!" << std::endl;
}
}
In R:
library(Rcpp)
library(RcppArmadillo)
library(bigmemory)
sourceCpp("armacast.cpp")
m <- as.big.matrix(matrix(1:4, 2), type="double")
armacast(m#address)
Great question! We may spin this into another Rcpp Gallery post.
There is one important detail you may have glossed over. Bigmemory objects are external so that we get R to not let its memory management interfere. Armadillo does have constructors for this (and please read the docs and warnings there) so in a first instance
we can just do
arma::mat M( (double*) xpDat->matrix(), xpDat->nrow(), xpDat->ncol(), false);
where we using a pointer to the matrix data, as well as row and column counts. A complete version:
// [[Rcpp::export]]
void armacast(SEXP pDat) {
XPtr<BigMatrix> xpDat(pDat);
if (xpDat->matrix_type() == 8) {
arma::mat M(mat, xpDat->nrow(), xpDat>-ncol(), false);
M.print("Arma matrix M");
} else {
std::cout << "Not implemented yet!" << std::endl;
}
}
It correctly invokes the print method from Armadillo:
R> armacast(m#address)
Arma matrix M
1.0000 3.0000
2.0000 4.0000
R>

Assertion error in a simple C++ program using boost:odeint

I'm sorry if this is immediately obvious, but I am very new to C++ coming from a Python / MATLAB / Mathematica background. I've written a simple solver for the classic 1D heat equation using a finite difference spatial discretization in order to play around with the capabilities of the Odeint library and compare the performance with other libraries. The code should be quite self-explanatory:
#include <iostream>
#include <boost/math/constants/constants.hpp>
#include <boost/array.hpp>
#include <boost/numeric/odeint.hpp>
using namespace std;
using namespace boost::numeric::odeint;
const double a_sq = 1;
const int p = 10;
const double h = 1 / p;
double pi = boost::math::constants::pi<double>();
typedef boost::array<double, p> state_type;
void heat_equation(const state_type &x, state_type &dxdt, double t)
{
int i;
for (i=1; i<p; i++)
{
dxdt[i] = a_sq * (dxdt[i+1] - 2*dxdt[i] + dxdt[i-1]) / h / h;
}
dxdt[0] = 0;
dxdt[p] = 0;
}
void initial_conditions(state_type &x)
{
int i;
for (i=0; i<=p; i++)
{
x[i] = sin(pi*i*h);
}
}
void write_heat_equation(const state_type &x, const double t)
{
cout << t << '\t' << x[0] << '\t' << x[p] << '\t' << endl;
}
int main()
{
state_type x;
initial_conditions(x);
integrate(heat_equation, x, 0.0, 10.0, 0.1, write_heat_equation);
}
This compiles just fine on Ubuntu 14.04 using g++ 4.8.2 and the latest boost library from the Ubuntu repository. When I run the resulting executable, however, I get the following error:
***** Internal Program Error - assertion (i < N) failed in T& boost::array<T, N>::operator[](boost::array<T, N>::size_type) [with T = double; long unsigned int N = 10ul; boost::array<T, N>::reference = double&; boost::array<T, N>::size_type = long unsigned int]:
/usr/include/boost/array.hpp(123): out of range
Aborted (core dumped)
Unfortunately, this isn't particularly helpful to my novice brain and I'm at a loss as to how to fix this. What's causing the error?
Counting the elements of an array or a vector of N-elements start by zero. The last element has index N-1. So, you need to change your for-loop to iterate from i to p-1 and you need to modify the line dxdt[p] = 0; to dxdt[p-1] = 0:
for (i=1; i<p-1; i++)
{
dxdt[i] = a_sq * (dxdt[i+1] - 2*dxdt[i] + dxdt[i-1]) / h / h;
}
dxdt[0] = 0;
dxdt[p-1] = 0;

Why is accumulate faster than a simple for cycle?

I was testing algorithms and run into this weird behavior, when std::accumulate is faster than a simple for cycle.
Looking at the generated assembler I'm not much wiser :-) It seems that the for cycle is optimized into MMX instructions, while accumulate expands into a loop.
This is the code. The behavior manifests with -O3 optimization level, gcc 4.7.1
#include <vector>
#include <chrono>
#include <iostream>
#include <random>
#include <algorithm>
using namespace std;
int main()
{
const size_t vsize = 100*1000*1000;
vector<int> x;
x.reserve(vsize);
mt19937 rng;
rng.seed(chrono::system_clock::to_time_t(chrono::system_clock::now()));
uniform_int_distribution<uint32_t> dist(0,10);
for (size_t i = 0; i < vsize; i++)
{
x.push_back(dist(rng));
}
long long tmp = 0;
for (size_t i = 0; i < vsize; i++)
{
tmp += x[i];
}
cout << "dry run " << tmp << endl;
auto start = chrono::high_resolution_clock::now();
long long suma = accumulate(x.begin(),x.end(),0);
auto end = chrono::high_resolution_clock::now();
cout << "Accumulate runtime " << chrono::duration_cast<chrono::nanoseconds>(end-start).count() << " - " << suma << endl;
start = chrono::high_resolution_clock::now();
suma = 0;
for (size_t i = 0; i < vsize; i++)
{
suma += x[i];
}
end = chrono::high_resolution_clock::now();
cout << "Manual sum runtime " << chrono::duration_cast<chrono::nanoseconds>(end-start).count() << " - " << suma << endl;
return 0;
}
When you pass the 0 to accumulate, you are making it accumulate using an int instead of a long long.
If you code your manual loop like this, it will be equivalent:
int sumb = 0;
for (size_t i = 0; i < vsize; i++)
{
sumb += x[i];
}
suma = sumb;
or you can call accumulate like this:
long long suma = accumulate(x.begin(),x.end(),0LL);
I have some different results using Visual Studio 2012
// original code
Accumulate runtime 93600 ms
Manual sum runtime 140400 ms
Note that the original std::accumulate code isn't equivalent to the for loop because the third parameter to std::accumulate is an int 0 value. It performs the summation using an int and only at the end stores the result in a long long. Changing the third parameter to 0LL forces the algorithm to use a long long accumulator and results in the following times.
// change std::accumulate initial value -> 0LL
Accumulate runtime 265200 ms
Manual sum runtime 140400 ms
Since the final result fits in an int I changed suma and std::accumulate back to using only int values. After this change the MSVC 2012 compiler was able to auto-vectorize the for loop and resulted in the following times.
// change suma from long long to int
Accumulate runtime 93600 ms
Manual sum runtime 46800 ms
After fixing the accumulate issue others noted I tested with both Visual Studio 2008 & 2010 and accumulate was indeed faster than the manual loop.
Looking at the disassembly I saw some additional iterator checking being done in the manual loop so I switched to just a raw array to eliminate it.
Here's what I ended up testing with:
#include <Windows.h>
#include <iostream>
#include <numeric>
#include <stdlib.h>
int main()
{
const size_t vsize = 100*1000*1000;
int* x = new int[vsize];
for (size_t i = 0; i < vsize; i++) x[i] = rand() % 1000;
LARGE_INTEGER start,stop;
long long suma = 0, sumb = 0, timea = 0, timeb = 0;
QueryPerformanceCounter( &start );
suma = std::accumulate(x, x + vsize, 0LL);
QueryPerformanceCounter( &stop );
timea = stop.QuadPart - start.QuadPart;
QueryPerformanceCounter( &start );
for (size_t i = 0; i < vsize; ++i) sumb += x[i];
QueryPerformanceCounter( &stop );
timeb = stop.QuadPart - start.QuadPart;
std::cout << "Accumulate: " << timea << " - " << suma << std::endl;
std::cout << " Loop: " << timeb << " - " << sumb << std::endl;
delete [] x;
return 0;
}
Accumulate: 633942 - 49678806711
Loop: 292642 - 49678806711
Using this code, the manual loop easily beats accumulate. The big difference is the compiler unrolled the manual loop 4 times, otherwise the generated code is almost identical.

Generating random numbers with uniform distribution using Thrust

I need to generate a vector with random numbers between 0.0 and 1.0 using Thrust. The only documented example I could find produces very large random numbers (thrust::generate(myvector.begin(), myvector.end(), rand).
I'm sure the answer is simple, but I would appreciate any suggestions.
Thrust has random generators you can use to produce sequences of random numbers. To use them with a device vector you will need to create a functor which returns a different element of the random generator sequence. The most straightforward way to do this is using a transformation of a counting iterator. A very simple complete example (in this case generating random single precision numbers between 1.0 and 2.0) could look like:
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
struct prg
{
float a, b;
__host__ __device__
prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};
__host__ __device__
float operator()(const unsigned int n) const
{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
int main(void)
{
const int N = 20;
thrust::device_vector<float> numbers(N);
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin,
index_sequence_begin + N,
numbers.begin(),
prg(1.f,2.f));
for(int i = 0; i < N; i++)
{
std::cout << numbers[i] << std::endl;
}
return 0;
}
In this example, the functor prg takes the lower and upper bounds of the random number as an argument, with (0.f,1.f) as the default. Note that in order to have a different vector each time you call the transform operation, you should used a counting iterator initialised to a different starting value.
It might not be a direct answer to your question but, cuRand library is quite powerful in this concept. You may both generate random numbers at GPU and CPU, and it contains many distribution functions (normal distribution etc).
Search for the title: "An NVIDIA CURAND implementation" on this link: http://adnanboz.wordpress.com/tag/nvidia-curand/
//Create a new generator
curandCreateGenerator(&m_prng, CURAND_RNG_PSEUDO_DEFAULT);
//Set the generator options
curandSetPseudoRandomGeneratorSeed(m_prng, (unsigned long) mainSeed);
//Generate random numbers
curandGenerateUniform(m_prng, d_randomData, dataCount);
One note is that, do not generate the generator again and again, it makes some precalculations. Calling curandGenerateUniform is quite fast and produces values between 0.0 and 1.0.
The approach suggested by #talonmies has a number of useful characteristics. Here's another approach that mimics the example you quoted:
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <iostream>
#define DSIZE 5
__host__ static __inline__ float rand_01()
{
return ((float)rand()/RAND_MAX);
}
int main(){
thrust::host_vector<float> h_1(DSIZE);
thrust::generate(h_1.begin(), h_1.end(), rand_01);
std::cout<< "Values generated: " << std::endl;
for (unsigned i=0; i<DSIZE; i++)
std::cout<< h_1[i] << " : ";
std::cout<<std::endl;
return 0;
}
similar to the example you quoted, this uses rand(), and therefore can only be used to generate host vectors. Likewise it will produce the same sequence each time unless you re-seed rand() appropriately.
There are already satisfactory answers to this questions. In particular, the OP and Robert Crovella have dealt with thrust::generate while talonmies has proposed using thrust::transform.
I think there is another possibility, namely, using thrust::for_each, so I'm posting a fully worked example using such a primitive, just for the record.
I'm also timing the different solutions.
THE CODE
#include <iostream>
#include <thrust\host_vector.h>
#include <thrust\generate.h>
#include <thrust\for_each.h>
#include <thrust\execution_policy.h>
#include <thrust\random.h>
#include "TimingCPU.h"
/**************************************************/
/* RANDOM NUMBERS GENERATION STRUCTS AND FUNCTION */
/**************************************************/
template<typename T>
struct rand_01 {
__host__ T operator()(T& VecElem) const { return (T)rand() / RAND_MAX; }
};
template<typename T>
struct rand_01_for_each {
__host__ void operator()(T& VecElem) const { VecElem = (T)rand() / RAND_MAX; }
};
template<typename T>
__host__ T rand_01_fcn() { return ((T)rand() / RAND_MAX); }
struct prg
{
float a, b;
__host__ __device__
prg(float _a = 0.f, float _b = 1.f) : a(_a), b(_b) {};
__host__ __device__
float operator()(const unsigned int n) const
{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
/********/
/* MAIN */
/********/
int main() {
TimingCPU timerCPU;
const int N = 2 << 18;
//const int N = 64;
const int numIters = 50;
thrust::host_vector<double> h_v1(N);
thrust::host_vector<double> h_v2(N);
thrust::host_vector<double> h_v3(N);
thrust::host_vector<double> h_v4(N);
printf("N = %d\n", N);
double timing = 0.;
for (int k = 0; k < numIters; k++) {
timerCPU.StartCounter();
thrust::transform(thrust::host, h_v1.begin(), h_v1.end(), h_v1.begin(), rand_01<double>());
timing = timing + timerCPU.GetCounter();
}
printf("Timing using transform = %f\n", timing / numIters);
timing = 0.;
for (int k = 0; k < numIters; k++) {
timerCPU.StartCounter();
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin,
index_sequence_begin + N,
h_v2.begin(),
prg(0.f, 1.f));
timing = timing + timerCPU.GetCounter();
}
printf("Timing using transform and internal Thrust random generator = %f\n", timing / numIters);
timing = 0.;
for (int k = 0; k < numIters; k++) {
timerCPU.StartCounter();
thrust::for_each(h_v3.begin(), h_v3.end(), rand_01_for_each<double>());
timing = timing + timerCPU.GetCounter();
}
timerCPU.StartCounter();
printf("Timing using for_each = %f\n", timing / numIters);
//std::cout << "Values generated: " << std::endl;
//for (int k = 0; k < N; k++)
// std::cout << h_v3[k] << " : ";
//std::cout << std::endl;
timing = 0.;
for (int k = 0; k < numIters; k++) {
timerCPU.StartCounter();
thrust::generate(h_v4.begin(), h_v4.end(), rand_01_fcn<double>);
timing = timing + timerCPU.GetCounter();
}
timerCPU.StartCounter();
printf("Timing using generate = %f\n", timing / numIters);
//std::cout << "Values generated: " << std::endl;
//for (int k = 0; k < N; k++)
// std::cout << h_v4[k] << " : ";
//std::cout << std::endl;
//std::cout << "Values generated: " << std::endl;
//for (int k = 0; k < N * 2; k++)
// std::cout << h_v[k] << " : ";
//std::cout << std::endl;
return 0;
}
On a laptop Core i5 platform, I had the following timings
N = 2097152
Timing using transform = 33.202298
Timing using transform and internal Thrust random generator = 264.508662
Timing using for_each = 33.155237
Timing using generate = 35.309399
The timings are equivalent, apart from the second one which uses Thrust's internal random number generator instead of rand().
Please, note that, differently from the other solutions, the one thrust::generate is somewhat more rigid since the function used to generate the random numbers cannot have input parameters. So, for example, it is not possible to scale the input arguments by a constant.