I'm trying to write an R wrapper for the FINUFFT routines for computing the Fourier transform of an unevenly sampled series. I have virtually no experience with C/C++, so I'm working from an example that compares the traditional Fourier transform to the NUFFT. The example code follows.
// this is all you must include for the finufft lib...
#include "finufft.h"
#include <complex>
// also needed for this example...
#include <stdio.h>
#include <stdlib.h>
using namespace std;
int main(int argc, char* argv[])
/* Simple example of calling the FINUFFT library from C++, using plain
arrays of C++ complex numbers, with a math test. Barnett 3/10/17
Double-precision version (see example1d1f for single-precision)
Compile with:
g++ -fopenmp example1d1.cpp -I ../src ../lib-static/libfinufft.a -o example1d1 -lfftw3 -lfftw3_omp -lm
or if you have built a single-core version:
g++ example1d1.cpp -I ../src ../lib-static/libfinufft.a -o example1d1 -lfftw3 -lm
Usage: ./example1d1
*/
{
  int M = 1e6;            // number of nonuniform points
  int N = 1e6;            // number of modes
  double acc = 1e-9;      // desired accuracy
  nufft_opts opts; finufft_default_opts(&opts);
  complex<double> I = complex<double>(0.0,1.0);  // the imaginary unit
  // generate some random nonuniform points (x) and complex strengths (c):
  double *x = (double *)malloc(sizeof(double)*M);
  complex<double>* c = (complex<double>*)malloc(sizeof(complex<double>)*M);
  for (int j=0; j<M; ++j) {
    x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
    c[j] = 2*((double)rand()/RAND_MAX)-1 + I*(2*((double)rand()/RAND_MAX)-1);
  }
  // allocate output array for the Fourier modes:
  complex<double>* F = (complex<double>*)malloc(sizeof(complex<double>)*N);
  // call the NUFFT (with iflag=+1): note N and M are typecast to BIGINT
  int ier = finufft1d1(M,x,c,+1,acc,N,F,opts);
  int n = 142519;         // check the answer just for this mode...
  complex<double> Ftest = complex<double>(0,0);
  for (int j=0; j<M; ++j)
    Ftest += c[j] * exp(I*(double)n*x[j]);
  int nout = n+N/2;       // index in output array for freq mode n
  double Fmax = 0.0;      // compute inf norm of F
  for (int m=0; m<N; ++m) {
    double aF = abs(F[m]);
    if (aF>Fmax) Fmax=aF;
  }
  double err = abs(F[nout] - Ftest)/Fmax;
  printf("1D type-1 NUFFT done. ier=%d, err in F[%d] rel to max(F) is %.3g\n",ier,n,err);
  free(x); free(c); free(F);
  return ier;
}
Much of this I don't need, such as generating the test series and comparing to the traditional FFT. Further, I want to return the values of the transform, not just an error code indicating success. Below is my code.
#include "finufft.h"
#include <complex>
#include <Rcpp.h>
#include <stdlib.h>
using namespace Rcpp;
using namespace std;
// [[Rcpp::export]]
ComplexVector finufft(int M, NumericVector x, ComplexVector c, int N) {
  // From example code for finufft, sets precision and default options
  double acc = 1e-9;
  nufft_opts opts; finufft_default_opts(&opts);
  // allocate output array for the finufft routine:
  complex<double>* F = (complex<double>*)malloc(sizeof(complex<double>*)*N);
  // Change vector inputs from R types to C++ types
  double* xd = as< double* >(x);
  complex<double>* cd = as< complex<double>* >(c);
  // call the NUFFT (with iflag=-1): note N and M are typecast to BIGINT
  int ier = finufft1d1(M,xd,cd,-1,acc,N,F,opts);
  ComplexVector Fd = as<ComplexVector>(*F);
  return Fd;
}
When I try to source this in RStudio, I get the error "no matching function for call to 'as(std::complex<double>*&)'", pointing to the line declaring Fd towards the end. I believe the error indicates that either the function as isn't defined (which I know is false) or that the argument to as isn't the correct type. The examples here include one using as to convert to a NumericVector, so unless there's some complication with complex values, I don't see why it should be a problem here.
I know there are potential problems using two namespaces, but I don't believe that's the issue here. My best guess is that there's an issue with how I'm trying to use pointers, but I lack the experience to identify it and I can't find any similar examples online to guide me.
Rcpp::as<T> converts from an R data type (SEXP) to a C++ data type, e.g. Rcpp::ComplexVector. This does not fit your situation, where you try to convert from a C-style array to C++. Fortunately Rcpp::Vector, which is the basis for Rcpp::ComplexVector, has a constructor for this task: Vector (InputIterator first, InputIterator last). For the other direction (going from C++ to C-style array) you can use vector.begin() or &vector[0].
However, one needs a reinterpret_cast to convert between Rcomplex* and std::complex<double>*. That should cause no problems, though, since Rcomplex (a.k.a. double complex in C) and std::complex<double> are compatible.
A minimal example:
#include <Rcpp.h>
#include <complex>
using namespace Rcpp;
// [[Rcpp::export]]
ComplexVector foo(ComplexVector v) {
  std::complex<double>* F = reinterpret_cast<std::complex<double>*>(v.begin());
  int N = v.length();
  // do something with F
  ComplexVector Fd(reinterpret_cast<Rcomplex*>(F),
                   reinterpret_cast<Rcomplex*>(F + N));
  return Fd;
}
/*** R
set.seed(42)
foo(runif(4)*(1+1i))
*/
Result:
> Rcpp::sourceCpp('56675308/code.cpp')
> set.seed(42)
> foo(runif(4)*(1+1i))
[1] 0.9148060+0.9148060i 0.9370754+0.9370754i 0.2861395+0.2861395i 0.8304476+0.8304476i
BTW, you can move these reinterpret_casts out of sight by using std::vector<std::complex<double>> as argument and return types for your function. Rcpp does the rest for you. This also helps get rid of the naked malloc:
#include <Rcpp.h>
// dummy function with reduced signature
int finufft1d1(int M, double *xd, std::complex<double> *cd, int N, std::complex<double> *Fd) {
  return 0;
}
// [[Rcpp::export]]
std::vector<std::complex<double>> finufft(int M,
                                          std::vector<double> x,
                                          std::vector<std::complex<double>> c,
                                          int N) {
  // allocate output array for the finufft routine:
  std::vector<std::complex<double>> F(N);
  // Change vector inputs from R types to C++ types
  double* xd = x.data();
  std::complex<double>* cd = c.data();
  std::complex<double>* Fd = F.data();
  int ier = finufft1d1(M, xd, cd, N, Fd);
  return F;
}
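Putting the two snippets together, here is an untested sketch of what the full wrapper could look like, assuming finufft1d1 has the signature and opts handling used in the example at the top of the question (the name finufft1d1_r, the iflag of -1 and the default accuracy are placeholder choices, not part of the FINUFFT API):
#include <Rcpp.h>
#include <complex>
#include <vector>
#include "finufft.h"   // assumed to be on the include path when compiling

// [[Rcpp::export]]
std::vector<std::complex<double>> finufft1d1_r(std::vector<double> x,
                                               std::vector<std::complex<double>> c,
                                               int N,
                                               double acc = 1e-9) {
  // default options, as in the example code
  nufft_opts opts; finufft_default_opts(&opts);
  int M = static_cast<int>(x.size());      // number of nonuniform points
  std::vector<std::complex<double>> F(N);  // output Fourier modes
  // same call shape as in the example (opts passed by value), iflag = -1 as in the question
  int ier = finufft1d1(M, x.data(), c.data(), -1, acc, N, F.data(), opts);
  if (ier != 0) Rcpp::stop("finufft1d1 returned error code %d", ier);
  return F;
}
To compile it you still have to tell sourceCpp where finufft.h and the compiled library live, e.g. by setting PKG_CXXFLAGS and PKG_LIBS before calling Rcpp::sourceCpp, analogous to the g++ flags in the example's comment.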
I am using Coin-Or's rehearse to implement linear programming.
I need a modulo constraint. Example: x shall be a multiple of 3.
OsiCbcSolverInterface solver;
CelModel model(solver);
CelNumVar x;
CelIntVar z;
unsigned int mod = 3;
// Maximize
solver.setObjSense(-1.0);
model.setObjective(x);
model.addConstraint(x <= 7.5);
// The modulo constraint:
model.addConstraint(x == z * mod);
The result for x should be 6. However, z is set to 2.5, which should not be possible as I declared it as a CelIntVar.
How can I enforce z to be an integer?
I never used that lib, but I think you should follow the tests.
The core message comes from the readme:
If you want some of your variables to be integers, use CelIntVar instead of CelNumVar. You must bind the solver to an Integer Linear Programming solver as well, for example Coin-cbc.
Looking at Rehearse/tests/testRehearse.cpp -> exemple4() (shown here abbreviated, not copied verbatim):
OsiClpSolverInterface *solver = new OsiClpSolverInterface();
CelModel model(*solver);
...
CelIntVar x1("x1");
...
solver->initialSolve(); // this is the relaxation (and maybe presolving)!
...
CbcModel cbcModel(*solver); // MIP-solver
cbcModel.branchAndBound(); // Use MIP-solver
printf("Solution for x1 : %g\n", model.getSolutionValue(x1, *cbcModel.solver()));
printf("Solution objvalue = : %g\n", cbcModel.solver()->getObjValue());
This kind of usage (use Osi to get the LP solver; build the MIP solver on top of that Osi-provided LP solver and call branchAndBound) basically follows Cbc's internal interface (with Python's cylp this looks similar).
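Applied to the modulo model from the question, the pattern would look roughly like the sketch below. It is untested, it only uses the Rehearse calls visible in the question and in the test snippet above, the Rehearse include is a guess (check testRehearse.cpp for the real one), and whatever setup is hidden behind the "..." lines in the test still has to be carried over:
// Untested sketch; the Rehearse include is hypothetical -- use whatever
// Rehearse/tests/testRehearse.cpp includes, and keep any setup elided as "..." there.
#include "CelModel.h"                 // hypothetical Rehearse header
#include "OsiClpSolverInterface.hpp"
#include "CbcModel.hpp"
#include <cstdio>

int main() {
    OsiClpSolverInterface solver;
    CelModel model(solver);

    CelNumVar x;
    CelIntVar z;                      // integer variable, as the readme requires
    unsigned int mod = 3;

    solver.setObjSense(-1.0);         // maximize
    model.setObjective(x);
    model.addConstraint(x <= 7.5);
    model.addConstraint(x == z * mod);   // the modulo constraint

    solver.initialSolve();            // LP relaxation only

    CbcModel cbcModel(solver);        // hand the problem to the MIP solver
    cbcModel.branchAndBound();        // this is what enforces integrality of z

    printf("x = %g\n", model.getSolutionValue(x, *cbcModel.solver()));
    printf("z = %g\n", model.getSolutionValue(z, *cbcModel.solver()));
    return 0;
}
With branchAndBound doing the integer search, x should come out as 6 and z as 2, instead of the fractional solution from the LP relaxation.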
Just as a reference: this is the official CoinOR Cbc (Rehearse-free) example from here:
// Copyright (C) 2005, International Business Machines
// Corporation and others. All Rights Reserved.
#include "CbcModel.hpp"
// Using CLP as the solver
#include "OsiClpSolverInterface.hpp"
int main (int argc, const char *argv[])
{
  OsiClpSolverInterface solver1;
  // Read in example model in MPS file format
  // and assert that it is a clean model
  int numMpsReadErrors = solver1.readMps("../../Mps/Sample/p0033.mps","");
  assert(numMpsReadErrors==0);
  // Pass the solver with the problem to be solved to CbcModel
  CbcModel model(solver1);
  // Do complete search
  model.branchAndBound();
  /* Print the solution. CbcModel clones the solver so we
     need to get current copy from the CbcModel */
  int numberColumns = model.solver()->getNumCols();
  const double * solution = model.bestSolution();
  for (int iColumn=0;iColumn<numberColumns;iColumn++) {
    double value=solution[iColumn];
    if (fabs(value)>1.0e-7&&model.solver()->isInteger(iColumn))
      printf("%d has value %g\n",iColumn,value);
  }
  return 0;
}
I am trying to convert MATLAB code to a C++ MEX file in order to run a few computations more efficiently. I am using the Armadillo library with BLAS and LAPACK for a few matrix operations, which involve interpolating data to apply a delay.
However, I am receiving inconsistent output from my MEX file. If I run the same MEX file with the same input, sometimes I receive the correct output, and occasionally it will output a HUGE number (e.g. instead of on the order of 100, it is on the order of 10^246).
I am very new to C++ coding and have exhausted my general knowledge base. I believe the problem is in my interpolation step, because I am able to consistently output the correct delay matrix, which is the preceding step.
Does anyone have any idea what I am doing wrong to produce this?
In Matlab I call:
mex test.cpp -lblas -llapack
[outData] = test( squeeze(inData(:,:,ang,:)) , params, angles(ang),1);
My mex file is generally:
#include <math.h>
#include <mex.h>
#include <armadillo>
#include "armaMex.hpp"
using namespace std;  //avoid having to scope with std:: before commands
using namespace arma; //avoid having to scope with arma:: before commands
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){
    // ============== INITIALIZE =============
    // Initialize Data
    const mwSize *dims;
    int cDim,dDim,aDim,numDims; // Dimension variables
    int m, n, a; // Loop variables
    mxArray *fs_p, *f0_p, *prf_p, *pval_p, *c_p; // Parameter pointers
    const double *fs,*f0,*prf,*pval, *c, *ang; // Parameter variables
    const int *nthreads;
    // Initialize pointers for param variables
    pval_p = mxGetField(prhs[1],0,"pval"); //note that your parameters need these exact names
    fs_p = mxGetField(prhs[1],0,"fs");
    f0_p = mxGetField(prhs[1],0,"f0");
    prf_p = mxGetField(prhs[1],0,"prf");
    c_p = mxGetField(prhs[1],0,"c");
    // Initialize parameters
    pval = mxGetPr(pval_p);
    fs = mxGetPr(fs_p);
    f0 = mxGetPr(pval_p);
    prf = mxGetPr(prf_p);
    c = mxGetPr(c_p);
    ang = (double*)mxGetData(prhs[2]);
    nthreads = (int*)mxGetData(prhs[3]);
    dims = mxGetDimensions(prhs[0]);
    numDims = (int)mxGetNumberOfDimensions(prhs[0]);
    dDim=(int)dims[0];cDim=(int)dims[1];aDim=(int)dims[2];
    //Read in channel Data
    cube data_in = armaGetCubePr(prhs[0]);
    (....... simple calculations that look okay ... )
    cube data_out(dDim, bDim, aDim);
    cube delayedData(dDim, aDim, bDim);
    vec delayArray(dDim); //need to define these tmp variables bc subcube fcn otherwise gives me errors idk
    vec tmpIN(dDim);
    vec tmpOut(dDim);
    vec tmpOUTdata(dDim);
    for(m=0;m<bDim;m++){
        for(n=0;n<cDim;n++){
            for (a=0;a<aDim;a++){
                delayArray = tdelays.subcube(0,n,m,dDim-1,n,m);
                tmpIN = data_in.subcube(0,n,a,dDim-1,n,a);
                tmpOUTdata = data_out.subcube(0,m,a,dDim-1,m,a);
                interp1(timeArray, tmpIN , delayArray, tmpOut, "linear",0);
                data_out.subcube(0,m,a,dDim-1,m,a) = tmpOUTdata + tmpOut;
            }
        }
    }
    // Define output data
    plhs[0] = armaCreateMxMatrix(data_out.n_rows, data_out.n_cols, data_out.n_slices);
    armaSetCubePr(plhs[0], data_out);
    return;
}
When calling a Fortran subroutine from a C++ function, and the C++ function is called inside an OpenMP parallel for construct, the Fortran subroutine returns different values from time to time. It is a blackbox subroutine that should return the same result with the same input (50 arguments). I parallelized the subroutine call in order to run it for hundreds of different input combinations. If I run the program twice and print the results from each subroutine execution, the results are not the same.
Details about the problem :
The serial version is consistent and working fine, giving the same answer all the time;
The subroutine does not use pseudo random generated numbers;
The subroutine calls other subroutines in the same .F90 file;
There is no nesting and no openmp pragmas or includes inside fortran subroutines;
If I try to use OpenMP API functions inside Fortran subroutines they return gibberish;
I am using -fautomatic, -fopenmp and -frecursive when compiling with gfortran (btw I am using gcc 5.2.0), and all subroutines were made RECURSIVE. Everything is compiling and linking fine, and the problem appears when I run the .exe.
The Fortran subroutines do not access I/O. All variables are passed through arguments. There are no COMMON or SAVED blocks. All subroutines use dummy arguments and output variables are explicitly initialized inside every subroutine;
I am not using any OpenMP clauses with #pragma omp parallel for.
The number of discrepancies between results is reduced if the number of threads is smaller than the number of available processors. Binding threads with processors does not solve the problem.
The code is huge, but I have managed to simplify it into an example that illustrates the problem:
//StdAfx.h
#include "other.h"
#include <omp.h>
//...many other includes
//first.cpp
#include "StdAfx.h"
typedef struct
{
    float x[51];
    float result;
} A;
typedef A *B;
B old=NULL;
int size;
float weight;
int var;
int main()
{
    size = 100;
    old = new (nothrow) A[size];
    long* control=NULL;
    control = new long[size];
    int kk;
    //...
    //changing some control[] values to -1
    var = 5; weight = 0.7;
    //...
    #pragma omp parallel for
    for (kk=0; kk<=size-1; kk++)
    {
        if (control[kk]>-1) old[kk].result = calcresult(old[kk].x,kk);
    }
    //...
    delete [] old;
    old = NULL;
    delete [] control;
    control = NULL;
}
float calcresult(float *x, int k)
{
    int dev=0;
    double kresult;
    dev = 10;
    kresult = othercalcresult(&x[0],k);
    kresult += (weight*dev*double(1.0/var));
    return(kresult);
}
//other.h
float othercalcresult(float *x, int anyk=0);
//other.cpp
extern "C" {
void _stdcall fdlf_(int VET[93],int *N, double *extresult);
}
double anothercalcresult(float *x, int *iVet)
{
int iN=1;
double extresult=0.0;
//stuff here
//original fortran subroutine has 50 arguments
fdlf_(iVet,&iN,&extresult);
return(extresult);
}
float othercalcresult(float *x, int anyk=0)
{
unsigned int i,ii;
float otherresult=0.0;
int ulimit;
//ulimit = 10;
//iVet is a two dimensional array iVet
int** iVet = new int*[numcenarios_anaprog_local];
for (ii=0; ii<ulimit; ii++) iVet[ii]=new int[93];
//initialize new vector
for (i=0; i<ulimit; i++)
for (ii=0; ii<93; ii++)
iVet[i][ii]=(100*i)+ii;
double* partialresult=NULL;
partialresult= new double[ulimit];
for (int jj=0;jj<ulimit;jj++) partialresult[jj] = 0.0;
//stuff here
for (i=0;i<ulimit;i++) partialresult[i] = anothercalcresult(x,iVet[i])
for (i=0;i<ulimit;i++) otherresult+=float(partialresult[i]);
return(otherresult);
}
//EXT.F90
RECURSIVE SUBROUTINE AUXSUB1(N,VALUE1)
  INTEGER N
  REAL*8 VALUE1
  VALUE1 = 1 / (2 ** N)
  RETURN
END SUBROUTINE AUXSUB1

RECURSIVE SUBROUTINE AUXSUB2(N,VALUE2)
  INTEGER N
  REAL*8 VALUE2
  VALUE2 = 1 / (3 ** N)
  RETURN
END SUBROUTINE AUXSUB2

RECURSIVE SUBROUTINE FDLF(VET,N,EXTRESULT)
  INTEGER VET(93),N
  REAL*8 VALUE1, VALUE2, EXTRESULT
  VALUE1 = 0.
  VALUE2 = 0.
  EXTRESULT = 0.0
  CALL AUXSUB1(N,VALUE1)
  CALL AUXSUB2(N,VALUE2)
  DO I=1,93
     IF (I.LT.47) THEN
        EXTRESULT = EXTRESULT + VALUE1
     ELSE
        EXTRESULT = EXTRESULT + VALUE2
     END IF
  END DO
  EXTRESULT = 1 / EXTRESULT
  RETURN
END SUBROUTINE FDLF
I am trying to get GPc (https://github.com/SheffieldML/GPc) working in MATLAB, using MEX files. I got the examples working, and I extracted the part I'm currently interested in into a standalone C++ program, which works just fine. However, when I try to do the same in a MEX file and run it through MATLAB, I'm getting some errors, in particular:
MKL ERROR: Parameter 4 was incorrect on entry to DPOTRF.
or
** On entry to DPOTRF parameter number 4 had an illegal value
depending on whether I use the system version of MKL or the one Matlab carries along. The call to dpotrf is:
dpotrf_(type, nrows, vals, nrows, info);
with all variables valid (type="U", nrows=40, vals = double[40*40]) and with the interface:
extern "C" void dpotrf_(
const char* t, // whether upper or lower triangluar 'U' or 'L'
const int &n, // (input)
double *a, // a[n][lda] (input/output)
const int &lda, // (input)
int &info // (output)
);
(both are taken from GPc). LDA was originally supplied as ncols (which I believe is incorrect, but I haven't asked the library author about it yet), but it shouldn't make a difference, because this is called on a square matrix.
I feared that there might be a problem with the references, so I changed the interface header to accept int* (like in http://www.netlib.org/clapack/clapack-3.2.1-CMAKE/SRC/dpotrf.c), but that started giving me segfaults, which made me think the references there are right.
Does anybody have an idea what might be wrong?
I've tried to reproduce with an example on my end, but I'm not seeing any errors. In fact the result is identical to MATLAB's.
mex_chol.cpp
#include "mex.h"
#include "lapack.h"
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    // verify arguments
    if (nrhs != 1 || nlhs > 1) {
        mexErrMsgTxt("Wrong number of arguments.");
    }
    if (!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0])) {
        mexErrMsgTxt("Input must be a real double matrix.");
    }
    if (mxGetM(prhs[0]) != mxGetN(prhs[0])) {
        mexErrMsgTxt("Input must be a symmetric positive-definite matrix.");
    }
    // copy input matrix to output (its contents will be overwritten)
    plhs[0] = mxDuplicateArray(prhs[0]);
    // pointer to data
    double *A = mxGetPr(plhs[0]);
    mwSignedIndex n = mxGetN(plhs[0]);
    // perform matrix factorization
    mwSignedIndex info = 0;
    dpotrf("U", &n, A, &n, &info);
    // check if call was successful
    if (info < 0) {
        mexErrMsgTxt("Parameters had an illegal value.");
    } else if (info > 0) {
        mexErrMsgTxt("Matrix is not positive-definite.");
    }
}
Note that MATLAB already ships with BLAS/LAPACK headers and libraries (Intel MKL implementation). In fact, this is the function prototype for dpotrf in $MATLABROOT\extern\include\lapack.h:
#define dpotrf FORTRAN_WRAPPER(dpotrf)
extern void dpotrf(
    char *uplo,
    ptrdiff_t *n,
    double *a,
    ptrdiff_t *lda,
    ptrdiff_t *info
);
Here is how you compile the above C++ code:
>> mex -largeArrayDims mex_chol.cpp libmwblas.lib libmwlapack.lib
Finally let's test the MEX function:
% some random symmetric positive semidefinite matrix
A = gallery('randcorr',10);
% our MEX-version of Cholesky decomposition
chol2 = @(A) triu(mex_chol(A));
% compare
norm(chol(A) - chol2(A)) % I get 0
(Note that the MEX code returns the working matrix as is, whereas the LAPACK routine only overwrites half of the matrix. So I used TRIU to zero out the other half and extract the upper part.)
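For completeness: if you keep GPc's own extern declaration rather than MATLAB's lapack.h, the likely mismatch is the integer width, since MATLAB's LAPACK expects ptrdiff_t-sized integers (as in the prototype above) while GPc declares int references. A rough sketch of a matching declaration follows; the underscored symbol name is an assumption that differs by platform and BLAS, and this is not GPc's actual header:
// Sketch only: mirrors the pointer/ptrdiff_t convention from MATLAB's lapack.h
// instead of GPc's int-reference interface. The symbol name dpotrf_ is assumed.
#include <cstddef>

extern "C" void dpotrf_(const char *uplo,            // 'U' or 'L'
                        const std::ptrdiff_t *n,     // order of the matrix
                        double *a,                   // overwritten with the factor
                        const std::ptrdiff_t *lda,   // leading dimension (>= n)
                        std::ptrdiff_t *info);       // 0 on success

// Hypothetical call site replacing dpotrf_(type, nrows, vals, nrows, info):
// std::ptrdiff_t n = nrows, lda = nrows, info = 0;
// dpotrf_(type, &n, vals, &lda, &info);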
I've been looking at Thrust and I stumbled upon a question that almost (but not quite) answered mine: Finding the maximum element value AND its position using CUDA Thrust
The example posted in there in the answer works fine, but how to do the same thing with raw pointers? Let us assume this code which I believe to be correct (ignore the kernel configuration, it's for simplicity):
float* d_A;
const unsigned int noElems = 10;
cudaMalloc(&d_A, noElems * sizeof(float));
initDeviceVector<<<1, noElems>>>(d_A);
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_A);
thrust::device_vector<float>::iterator iter =
thrust::max_element(d_ptr, d_ptr + noElems);
I can't quite figure out how to extract the position using iter and raw pointers.
Thank you for your time.
There are probably a number of ways to do this. However, working directly from your code, we can compare the value of iter to a device_ptr if we convert it to a suitable device pointer first.
The following fully worked example demonstrates this:
$ cat t436.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/extrema.h>
#include <stdio.h>
__global__ void initDeviceVector(float *data){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    data[idx] = idx%7;
}
int main(){
    float* d_A;
    const unsigned int noElems = 10;
    cudaMalloc(&d_A, noElems * sizeof(float));
    initDeviceVector<<<1, noElems>>>(d_A);
    thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_A);
    thrust::device_vector<float>::iterator iter = thrust::max_element(d_ptr, d_ptr + noElems);
    int pos = thrust::device_pointer_cast(&(iter[0])) - d_ptr;
    printf("pos = %d\n", pos);
    return 0;
}
$ nvcc -arch=sm_20 -o t436 t436.cu
$ ./t436
pos = 6
$