I am trying to convert matlab code to a c++ mex file in order to run a few computations more efficiently. I am using the armadillo library with blas and lapack for a few matrix operations, which involves interpolating data to apply a delay.
However, I am receiving an inconsistent output from my mex file. If I run the same mex file with the same input, sometimes I receive the correct output, and occasionally it will output a HUGE number (i.e. instead of on the order of 100, it is on the order of 10^246).
I am very new to C++ coding, and have exhausted my general knowledge base. I believe the problem is in my interpolation step, because I am able to consistently output the correct delay matrix, which is the preceding step.
Does anyone have any idea what I am doing to produce this?
In Matlab I call:
mex test.cpp -lblas -llapack
[outData] = test( squeeze(inData(:,:,ang,:)) , params, angles(ang),1);
My mex file is generally:
#include <math.h>
#include <mex.h>
#include <armadillo>
#include "armaMex.hpp"
using namespace std; //avoid having to scope with std:: before commands
using namespace arma; //avoid having to scope with std:: before commands
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){
// ============== INITIALIZE =============
// Initialize Data
const mwSize *dims;
int cDim,dDim,aDim,numDims; // Dimension variables
int m, n, a; // Loop variables
mxArray *fs_p, *f0_p, *prf_p, *pval_p, *c_p; // Parameter pointers
const double *fs,*f0,*prf,*pval, *c, *ang; // Parameter variables
const int *nthreads;
// Initialize pointers for param variables
pval_p = mxGetField(prhs[1],0,"pval"); //note that your parameters need these exact names
fs_p = mxGetField(prhs[1],0,"fs");
f0_p = mxGetField(prhs[1],0,"f0");
prf_p = mxGetField(prhs[1],0,"prf");
c_p = mxGetField(prhs[1],0,"c");
// Initialize parameters
pval = mxGetPr(pval_p);
fs = mxGetPr(fs_p);
f0 = mxGetPr(pval_p);
prf = mxGetPr(prf_p);
c = mxGetPr(c_p);
ang = (double*)mxGetData(prhs[2]);
nthreads = (int*)mxGetData(prhs[3]);
dims = mxGetDimensions(prhs[0]);
numDims = (int)mxGetNumberOfDimensions(prhs[0]);
dDim=(int)dims[0];cDim=(int)dims[1];aDim=(int)dims[2];
//Read in channel Data
cube data_in = armaGetCubePr(prhs[0]);
(....... simple calculations that look okay ... )
cube data_out(dDim, bDim, aDim);
cube delayedData(dDim, aDim, bDim);
vec delayArray(dDim); //need to define these tmp variables bc subcube fcn otherwise gives me errors idk
vec tmpIN(dDim);
vec tmpOut(dDim);
vec tmpOUTdata(dDim);
for(m=0;m<bDim;m++){
for(n=0;n<cDim;n++){
for (a=0;a<aDim;a++){
delayArray = tdelays.subcube(0,n,m,dDim-1,n,m);
tmpIN = data_in.subcube(0,n,a,dDim-1,n,a);
tmpOUTdata = data_out.subcube(0,m,a,dDim-1,m,a);
interp1(timeArray, tmpIN , delayArray, tmpOut, "linear",0);
data_out.subcube(0,m,a,dDim-1,m,a) = tmpOUTdata +tmpOut;
}
}
}
// Define output data
plhs[0] = armaCreateMxMatrix(data_out.n_rows, data_out.n_cols, data_out.n_slices);
armaSetCubePr(plhs[0], data_out);
return
}
Related
I'm trying to write an R wrapper for the FINUFFT routines for calculating the FFT of an unevenly sampled series. I have virtually no experience with C/C++, so I'm working from an example that compares the traditional Fourier transform to the NUFFT. The example code follows.
// this is all you must include for the finufft lib...
#include "finufft.h"
#include <complex>
// also needed for this example...
#include <stdio.h>
#include <stdlib.h>
using namespace std;
int main(int argc, char* argv[])
/* Minimal double-precision driver for the FINUFFT library using plain arrays
   of C++ complex numbers; one output mode is verified against a direct sum.
   (See example1d1f for the single-precision variant.)  Barnett 3/10/17

   Compile with:
   g++ -fopenmp example1d1.cpp -I ../src ../lib-static/libfinufft.a -o example1d1 -lfftw3 -lfftw3_omp -lm
   or if you have built a single-core version:
   g++ example1d1.cpp -I ../src ../lib-static/libfinufft.a -o example1d1 -lfftw3 -lm

   Usage: ./example1d1
*/
{
  int M = 1e6;                      // how many nonuniform points
  int N = 1e6;                      // how many Fourier modes
  double acc = 1e-9;                // requested accuracy
  nufft_opts opts;
  finufft_default_opts(&opts);
  complex<double> I = complex<double>(0.0,1.0);   // imaginary unit

  // random nonuniform locations and complex strengths
  double *x = (double *)malloc(sizeof(double)*M);
  complex<double>* c = (complex<double>*)malloc(sizeof(complex<double>)*M);
  for (int k=0; k<M; ++k) {
    x[k] = M_PI*(2*((double)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
    c[k] = 2*((double)rand()/RAND_MAX)-1 + I*(2*((double)rand()/RAND_MAX)-1);
  }

  // storage for the output modes
  complex<double>* F = (complex<double>*)malloc(sizeof(complex<double>)*N);

  // type-1 transform with iflag=+1 (N and M are typecast to BIGINT)
  int ier = finufft1d1(M,x,c,+1,acc,N,F,opts);

  // direct evaluation of a single mode for comparison
  int n = 142519;
  complex<double> Ftest = complex<double>(0,0);
  for (int k=0; k<M; ++k) {
    Ftest += c[k] * exp(I*(double)n*x[k]);
  }
  int nout = n+N/2;                 // position of mode n in the output array

  // infinity norm of the output
  double Fmax = 0.0;
  for (int idx=0; idx<N; ++idx) {
    double magnitude = abs(F[idx]);
    Fmax = (magnitude>Fmax) ? magnitude : Fmax;
  }

  double err = abs(F[nout] - Ftest)/Fmax;
  printf("1D type-1 NUFFT done. ier=%d, err in F[%d] rel to max(F) is %.3g\n",ier,n,err);

  free(x);
  free(c);
  free(F);
  return ier;
}
Much of this I don't need, such as generating the test series and comparing to the traditional FFT. Further, I want to return the values of the transform, not just an error code indicating success. Below is my code.
#include "finufft.h"
#include <complex>
#include <Rcpp.h>
#include <stdlib.h>
using namespace Rcpp;
using namespace std;
// [[Rcpp::export]]
// Type-1 1D NUFFT (iflag = -1) of the strengths `c` at the nonuniform points
// `x`, returning `N` Fourier modes.
//
//   M  number of nonuniform points; must not exceed length(x) or length(c)
//   x  nonuniform point locations
//   c  complex strengths
//   N  number of output modes
//
// Raises an R error if the sizes are inconsistent or if finufft1d1 reports a
// non-zero status.
ComplexVector finufft(int M, NumericVector x, ComplexVector c, int N) {
  if (M < 0 || N < 0 || M > x.size() || M > c.size()) {
    Rcpp::stop("M and N must be non-negative and M must not exceed length(x) or length(c)");
  }
  // From example code for finufft, sets precision and default options
  double acc = 1e-9;
  nufft_opts opts; finufft_default_opts(&opts);
  // Let R own the output storage. This removes the malloc (which was sized
  // with sizeof(pointer) instead of sizeof(complex<double>) and never freed)
  // and the final copy.
  ComplexVector Fd(N);
  // Rcpp::as<> converts SEXP -> C++ objects, not vectors -> raw pointers; the
  // underlying storage is reached through begin(). Rcomplex and
  // std::complex<double> share the same layout, hence the reinterpret_cast.
  double* xd = x.begin();
  complex<double>* cd = reinterpret_cast<complex<double>*>(c.begin());
  complex<double>* F = reinterpret_cast<complex<double>*>(Fd.begin());
  // call the NUFFT (with iflag=-1): note N and M are typecast to BIGINT
  int ier = finufft1d1(M,xd,cd,-1,acc,N,F,opts);
  if (ier != 0) {
    Rcpp::stop("finufft1d1 failed with error code %d", ier);
  }
  return Fd;
}
When I try to source this in Rstudio, I get the error "no matching function for call to 'as(std::complex<double>*&)'", pointing to the line declaring Fd towards the end. I believe the error indicates that either the function 'as' isn't defined (which I know is false), or the argument to 'as' isn't the correct type. The examples here include one using 'as' to convert to a NumericVector, so unless there's some complication with complex values I don't see why it should be a problem here.
I know there are potential problems using two namespaces, but I don't believe that's the issue here. My best guess is that there's an issue with how I'm trying to use pointers, but I lack the experience to identify it and I can't find any similar examples online to guide me.
Rcpp::as<T> converts from an R data type (SEXP) to a C++ data type, e.g. Rcpp::ComplexVector. This does not fit your situation, where you try to convert from a C-style array to C++. Fortunately Rcpp::Vector, which is the basis for Rcpp::ComplexVector, has a constructor for this task: Vector (InputIterator first, InputIterator last). For the other direction (going from C++ to C-style array) you can use vector.begin() or &vector[0].
However, one needs a reinterpret_cast to convert between Rcomplex* and std::complex<double>*. That should cause no problems, though, since Rcomplex (a.k.a. complex double in C) and std::complex<double> are compatible.
A minimal example:
#include <Rcpp.h>
#include <complex>
using namespace Rcpp;
// [[Rcpp::export]]
ComplexVector foo(ComplexVector v) {
  // Expose the vector's storage as a [first, last) range of std::complex<double>.
  const int count = v.length();
  std::complex<double>* first = reinterpret_cast<std::complex<double>*>(v.begin());
  std::complex<double>* last = first + count;
  // do something with the range [first, last)
  // Build the result with the iterator-pair constructor, casting back to Rcomplex*.
  return ComplexVector(reinterpret_cast<Rcomplex*>(first),
                       reinterpret_cast<Rcomplex*>(last));
}
/*** R
set.seed(42)
foo(runif(4)*(1+1i))
*/
Result:
> Rcpp::sourceCpp('56675308/code.cpp')
> set.seed(42)
> foo(runif(4)*(1+1i))
[1] 0.9148060+0.9148060i 0.9370754+0.9370754i 0.2861395+0.2861395i 0.8304476+0.8304476i
BTW, you can move these reinterpret_casts out of sight by using std::vector<std::complex<double>> as argument and return types for your function. Rcpp does the rest for you. This also helps getting rid of the naked malloc:
#include <Rcpp.h>
// dummy function with reduced signature
int finufft1d1(int M, double *xd, std::complex<double> *cd, int N, std::complex<double> *Fd) {
  // Placeholder for the library routine: no argument is read or written.
  (void)M;
  (void)xd;
  (void)cd;
  (void)N;
  (void)Fd;
  const int status = 0;  // always reports success
  return status;
}
// [[Rcpp::export]]
std::vector<std::complex<double>> finufft(int M,
                                          std::vector<double> x,
                                          std::vector<std::complex<double>> c,
                                          int N) {
  // Output modes live in a std::vector, which is returned by value.
  std::vector<std::complex<double>> F(N);
  // Hand the contiguous storage of each vector straight to the C-style routine.
  int ier = finufft1d1(M, x.data(), c.data(), N, F.data());
  (void)ier;  // status is not inspected here
  return F;
}
I want to draw n random variables from the Normal(mean, sigma2) distribution using the function vdRngGaussian.
One way to do this is with the command
vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, n, x, mean, sqrt(sigma2) )
Instead of this I want to use a for-loop. The mex-code I wrote is
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <mkl.h>
#include "mkl_vml.h"
#include "mex.h"
#include "matrix.h"
#include "mkl_vsl.h"
#include <time.h>
#define SEED time(NULL)
double normal(double mean, double sigma2, VSLStreamStatePtr stream);
/* main function */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
double *x, mean, sigma2;
VSLStreamStatePtr stream;
vslNewStream( &stream, VSL_BRNG_MT19937, SEED );
/* make pointers to input data */
mean = (double)mxGetScalar(prhs[0]);
sigma2= (double)mxGetScalar(prhs[1]);
/* make pointers to output data */
plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
x = mxGetPr(plhs[0]);
x[0] = normal( mean, sigma2, stream);
/* Deleting the stream */
vslDeleteStream( &stream );
return;
}
/* Draws a single Normal(mean, sigma2) sample from the given stream. */
double normal(double mean, double sigma2, VSLStreamStatePtr stream)
{
    double sample;
    /* the generator takes the standard deviation, hence the square root */
    const double sigma = sqrt(sigma2);
    vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &sample, mean, sigma);
    return sample;
}
When I run the code with the command
for i=1:5
out(i) = normalc(0.0, 1.0);
end
I get the following results:
-1.1739 -1.1739 -1.1739 -1.1739 -1.1739
If I call my function 5 times without a for-loop I get these results
-0.2720, 2.1457, -1.2397, 0.7501, 0.1490
Could you please help me.
Thank you very much.
The stream you are creating with vslNewStream takes a seed (the initial condition of the random number generator) as an argument, and you are basing this on the output of the time function from the C time library. The problem is that it returns a number of seconds. This resolution is too coarse, and you end up with the same seed when you call it in a for loop. (Assuming the for loop executes quickly and you are lucky with the start time.)
Perhaps use a different seed, like the high_resolution_clock from the C++11 <chrono> header. You may still get the epoch time, but in different units:
using namespace std::chrono;
// Use high_resolution_clock's own time_point/duration types: the standard does
// not guarantee that it is the same clock as system_clock, so mixing the two
// does not compile on every implementation.
high_resolution_clock::time_point now = high_resolution_clock::now();
high_resolution_clock::duration epoch = now.time_since_epoch();
// The same instant expressed at three resolutions; any of them can be a seed.
long long ns = duration_cast<nanoseconds>(epoch).count();
long long mic = duration_cast<microseconds>(epoch).count();
long long ms = duration_cast<milliseconds>(epoch).count();
Demo (cpp.sh).
Alternatively, you can use elapsed CPU clocks from MKL's mkl_get_cpu_clocks function, but this goes back to zero when you shutdown your machine. Or just the time in seconds as a double with MKL's dsecnd.
I am trying to get GPc (https://github.com/SheffieldML/GPc) working in Matlab, using mex-files. I got the examples working, I took the bit I'm currently being interested in out as a standalone C++ program, that works just fine. However, when I try to do the same in a mex and run it through Matlab, I'm getting some errors, in particular:
MKL ERROR: Parameter 4 was incorrect on entry to DPOTRF.
or
** On entry to DPOTRF parameter number 4 had an illegal value
depending on whether I use the system version of MKL or the one Matlab carries along. The call to dpotrf is:
dpotrf_(type, nrows, vals, nrows, info);
with all variables valid (type="U", nrows=40, vals = double[40*40]) and with the interface:
// Declaration of the external dpotrf_ routine with C linkage. The integer
// arguments are declared as C++ references.
// NOTE(review): this assumes the library's integer type is a 32-bit `int`;
// confirm against the LAPACK build that is actually linked.
extern "C" void dpotrf_(
const char* t, // whether upper or lower triangular: 'U' or 'L'
const int &n, // (input)
double *a, // a[n][lda] (input/output)
const int &lda, // (input)
int &info // (output)
);
(both are taken from GPc). LDA was originally supplied as ncols (which I believe is incorrect, but I haven't asked the library author about it yet), but it shouldn't make a difference, because this is called on a square matrix.
I feared that there might be a problem with the references, so I changed the interface header to accept int* (like in http://www.netlib.org/clapack/clapack-3.2.1-CMAKE/SRC/dpotrf.c), but that started giving me segfaults, which made me think the references there are right.
Does anybody have an idea what might be wrong?
I've tried to reproduce with an example on my end, but I'm not seeing any errors. In fact the result is identical to MATLAB's.
mex_chol.cpp
#include "mex.h"
#include "lapack.h"
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
// verify arguments
if (nrhs != 1 || nlhs > 1) {
mexErrMsgTxt("Wrong number of arguments.");
}
if (!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0])) {
mexErrMsgTxt("Input must be a real double matrix.");
}
if (mxGetM(prhs[0]) != mxGetN(prhs[0])) {
mexErrMsgTxt("Input must be a symmetric positive-definite matrix.");
}
// copy input matrix to output (its contents will be overwritten)
plhs[0] = mxDuplicateArray(prhs[0]);
// pointer to data
double *A = mxGetPr(plhs[0]);
mwSignedIndex n = mxGetN(plhs[0]);
// perform matrix factorization
mwSignedIndex info = 0;
dpotrf("U", &n, A, &n, &info);
// check if call was successful
if (info < 0) {
mexErrMsgTxt("Parameters had an illegal value.");
} else if (info > 0) {
mexErrMsgTxt("Matrix is not positive-definite.");
}
}
Note that MATLAB already ships with BLAS/LAPACK headers and libraries (Intel MKL implementation). In fact this is what $MATLABROOT\extern\include\lapack.h has as function prototype for dpotrf:
#define dpotrf FORTRAN_WRAPPER(dpotrf)
extern void dpotrf(
char *uplo,
ptrdiff_t *n,
double *a,
ptrdiff_t *lda,
ptrdiff_t *info
);
Here is how you compile the above C++ code:
>> mex -largeArrayDims mex_chol.cpp libmwblas.lib libmwlapack.lib
Finally let's test the MEX function:
% some random symmetric positive semidefinite matrix
A = gallery('randcorr',10);
% our MEX-version of Cholesky decomposition
% (anonymous functions are introduced with @)
chol2 = @(A) triu(mex_chol(A));
% compare
norm(chol(A) - chol2(A)) % I get 0
(Note that the MEX code returns the working matrix as is, where the LAPACK routine only overwrites half of the matrix. So I used TRIU to zero-out the other half and extract the upper part).
I have created a simulator in C/C++ that is supposed to output the results in a .mat file that can be imported into some visualization tools in Matlab.
During the simulation results are stored in a data buffer. The buffer is a std::map<const char *, double *>, where the string should be the same name as the corresponding matlab struct field, and the double* is the buffered data.
At the end of the simulation I then use the following code to write the buffered data into a .mat file
const char **fieldnames; // Declared and populated in another class method
int numFields; // Declared in another method. Equal to fieldnames length.
int buffer_size; // Declared in another method. Equal to number of timesteps in simulation.
std::map<const char *, double *> field_data;
std::map<const char *, mxArray *> field_matrices;
// Open .mat file
// (the returned handle is not checked before it is used below)
MATFile *pmat = matOpen(filename.str().c_str(), "w");
// Create an empty Matlab struct of the right size
mxArray *SimData_struct = mxCreateStructMatrix(1,1,this->numFields,this->fieldnames);
// every field becomes a column vector with buffer_size rows
int rows=this->buffer_size, cols=1;
for(int i=0; i<this->numFields; i++) {
// Create an empty matlab array for each struct field
field_matrices[this->fieldnames[i]] = mxCreateDoubleMatrix(rows, cols, mxREAL);
// Copy data from buffers to struct fields
// NOTE(review): both maps are keyed by const char*, so lookups presumably
// compare pointer values rather than string contents -- confirm that the same
// pointers were used when field_data was filled.
memcpy(mxGetPr(field_matrices[this->fieldnames[i]]), this->field_data[this->fieldnames[i]], rows * cols * sizeof(double));
// Insert arrays into the struct
mxSetField(SimData_struct,0,this->fieldnames[i],field_matrices[this->fieldnames[i]]);
}
// Write the struct to the file under object_name (return value not checked)
matPutVariable(pmat, object_name.str().c_str(), SimData_struct);
I can compile and start the simulation, but it dies with an error when the matPutVariable command is reached. The error I get is terminate called after throwing an instance of 'matrix::serialize::WrongSize'. I have tried to google for more information, but have been unable to find something that could help me.
Mathworks support helped me to identify the cause of the issue. My application uses boost 1.55, but Matlab uses 1.49. There was a clash between those dependencies that was solved by adding an additional external dependencies directory path.
-Wl,-rpath={matlab path}/bin/glnxa64
I tried to reproduce the error with a simple example, but I don't see the problem. Here is my code:
test_mat_api.cpp
#include "mat.h"
#include <algorithm>
// Writes a 1x1 struct with two 10x1 double fields ("a" filled with 0, "b"
// filled with 1) to out.mat under the name "my_struct".
// Returns 0 on success and 1 if the file could not be opened, an array could
// not be allocated, or the variable could not be written.
int main()
{
    // output MAT-file
    MATFile *pmat = matOpen("out.mat", "w");
    if (pmat == NULL) {
        return 1;
    }

    // create a scalar struct array with two fields
    const char *fieldnames[2] = {"a", "b"};
    mxArray *s = mxCreateStructMatrix(1, 1, 2, fieldnames);
    if (s == NULL) {
        matClose(pmat);
        return 1;
    }

    // fill struct fields
    for (mwIndex i=0; i<2; i++) {
        // 10x1 vector
        mxArray *arr = mxCreateDoubleMatrix(10, 1, mxREAL);
        if (arr == NULL) {
            mxDestroyArray(s);
            matClose(pmat);
            return 1;
        }
        double *x = mxGetPr(arr);
        std::fill(x, x+10, static_cast<double>(i));

        // assign field (the struct takes ownership of arr)
        mxSetField(s, 0, fieldnames[i], arr);
    }

    // write struct to MAT-file; a non-zero status means the write failed
    const int putStatus = matPutVariable(pmat, "my_struct", s);

    // cleanup
    mxDestroyArray(s);
    const int closeStatus = matClose(pmat);

    return (putStatus == 0 && closeStatus == 0) ? 0 : 1;
}
First I compile the standalone program:
>> mex -client engine -largeArrayDims test_map_api.cpp
Next I run the executable:
>> !test_map_api.exe
Finally I load the created MAT-file in MATLAB:
>> whos -file out.mat
Name Size Bytes Class Attributes
my_struct 1x1 512 struct
>> load out.mat
>> my_struct
my_struct =
a: [10x1 double]
b: [10x1 double]
>> (my_struct.b)'
ans =
1 1 1 1 1 1 1 1 1 1
So everything runs successfully (I'm using MATLAB R2014a on Windows x64).
I started implementing a few m-files in C++ in order to reduce run times. The m-files produce n-dimensional points and evaluate function values at these points. The functions are user-defined and they are passed to m-files and mex-files as function handles. The mex-files use mexCallMATLAB with feval for finding function values.
I constructed the below example where a function handle fn constructed in the Matlab command line is passed to matlabcallingmatlab.m and mexcallingmatlab.cpp routines. With a freshly opened Matlab, mexcallingmatlab evaluates this function 200000 in 241.5 seconds while matlabcallingmatlab evaluates it in 0.81522 seconds therefore a 296 times slow-down with the mex implementation. These times are the results of the second runs as the first runs seem to be larger probably due to some overhead associated first time loading the program etc.
I have spent many days searching online on this problem and tried some suggestions on it. I tried different mex compiling flags to optimize the mex but there was almost no difference in performance. A previous post in Stackoverflow stated that upgrading Matlab was the solution but I am using probably the latest version MATLAB Version: 8.1.0.604 (R2013a) on Mac OS X Version: 10.8.4. I did compile the mex file with and without –largeArrayDims flag but this didn’t make any difference either. Some suggested that the content of the function handle could be directly coded in the cpp file but this is impossible as I would like to provide this code to any user with any type of function with a vector input and real number output.
As far as I found out, mex files need to go through feval function for using a function handle whereas m-files can directly call function handles provided that Matlab version is newer than some version.
Any help would be greatly appreciated.
simple function handle created in the Matlab command line:
fn = @(x) x'*x
matlabcallingmatlab.m :
function matlabcallingmatlab( fn )
% MATLABCALLINGMATLAB Call the function handle FN 200000 times.
%   FN is evaluated on a 2x1 vector whose first entry is always 0 and whose
%   second entry runs through 0, 1, ..., 199999. The results are discarded.
point = [0; 0];
for k = 0 : 199999
    point(2) = k;
    f = fn( point );
end
mexcallingmatlab.cpp:
#include "mex.h"
#include <cstring>
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[] )
{
mxArray *lhs[1], *rhs[2]; //parameters to be passed to feval
double f, *xptr, x[] = {0.0, 0.0}; // x: input to f and f=f(x)
int n = 2, nbytes = n * sizeof(double); // n: dimension of input x to f
// prhs[0] is the function handle as first argument to feval
rhs[0] = const_cast<mxArray *>( prhs[0] );
// rhs[1] contains input x to the function
rhs[1] = mxCreateDoubleMatrix( n, 1, mxREAL);
xptr = mxGetPr( rhs[1] );
for (int i = 0; i < 200000; ++i)
{
x[1] = double(i); // change input
memcpy( xptr, x, nbytes ); // now rhs[1] has new x
mexCallMATLAB(1, lhs, 2, rhs, "feval");
f = *mxGetPr( lhs[0] );
}
}
Compilation of mex file:
>> mex -v -largeArrayDims mexcallingmatlab.cpp
So I tried to implement this myself, and I think I found the reason for the slowness.
Basically your code has a small memory leak: you are not freeing the lhs mxArray returned from the call to mexCallMATLAB. It is not exactly a memory leak, seeing that the MATLAB memory manager takes care of freeing the memory when the MEX-file exits:
MATLAB allocates dynamic memory to store the mxArrays in plhs.
MATLAB automatically deallocates the dynamic memory when you clear the MEX-file.
However, if heap space is at a premium, call mxDestroyArray when
you are finished with the mxArrays plhs points to.
Still explicit is better than implicit... So your code is really stressing the deallocator of the MATLAB memory manager :)
mexcallingmatlab.cpp
#include "mex.h"
#ifndef N
#define N 100
#endif
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
// validate input/output arguments
if (nrhs != 1) {
mexErrMsgTxt("One input argument required.");
}
if (mxGetClassID(prhs[0]) != mxFUNCTION_CLASS) {
mexErrMsgTxt("Input must be a function handle.");
}
if (nlhs > 1) {
mexErrMsgTxt("Too many output arguments.");
}
// allocate output
plhs[0] = mxCreateDoubleMatrix(N, 1, mxREAL);
double *out = mxGetPr(plhs[0]);
// prepare for mexCallMATLAB: val = feval(#fh, zeros(2,1))
mxArray *lhs, *rhs[2];
rhs[0] = mxDuplicateArray(prhs[0]);
rhs[1] = mxCreateDoubleMatrix(2, 1, mxREAL);
double *xptr = mxGetPr(rhs[1]) + 1;
for (int i=0; i<N; ++i) {
*xptr = i;
mexCallMATLAB(1, &lhs, 2, rhs, "feval");
out[i] = *mxGetPr(lhs);
mxDestroyArray(lhs);
}
// cleanup
mxDestroyArray(rhs[0]);
mxDestroyArray(rhs[1]);
}
MATLAB
% function handle under test (anonymous functions are introduced with @)
fh = @(x) x'*x;
N = 2e5;
% MATLAB
tic
out = zeros(N,1);
for i=0:N-1
out(i+1) = feval(fh, [0;i]);
end
toc
% MEX
mex('-largeArrayDims', sprintf('-DN=%d',N), 'mexcallingmatlab.cpp')
tic
out2 = mexcallingmatlab(fh);
toc
% check results
assert(isequal(out,out2))
Running the above benchmark a couple of times (to warm it up), I get the following consistent results:
Elapsed time is 0.732890 seconds. % pure MATLAB
Elapsed time is 1.621439 seconds. % MEX-file
Nowhere near the slow times you initially had! Still, the pure MATLAB part is about twice as fast, probably because of the overhead of calling an external MEX-function.
(My system: Win8 running 64-bit R2013a)
There's absolutely no reason to expect that a MEX file is, in general, faster than an M file. The only reason that this is often true is that many loops in MATLAB incur a lot of function call overhead, along with parameter checking and such. Rewriting that in C eliminates the overhead, and gives your C compiler a chance to optimize the code.
In this case, there's nothing for the C compiler to optimize... it MUST make the MATLAB interface call for every iteration. In fact, the MATLAB optimizer will do a better job, since it can, in some cases "see" into the function.
In other words, forget using MEX to speed up this program.
There is some overhead cost in calls from mex to Matlab and vice versa. The overhead per call is small, but it really adds up in a tight loop like this. As your testing indicates, pure Matlab can be much faster in this case! Your other option is to eliminate the mexCallMATLAB call and do everything in pure C++.