Weighted Variance and Weighted Standard Deviation in C++ - c++

Hi I'm trying to calculate the weighted variance and weighted standard deviation of a series of ints or floats. I found these links:
http://math.tutorvista.com/statistics/standard-deviation.html#weighted-standard-deviation
http://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weightsd.pdf (warning pdf)
Here are my template functions so far. Variance and standard deviation work fine but for the life of me I can't get the weighted versions to match the test case at the bottom of the pdf:
template <class T>
inline float Mean( T samples[], int count )
{
float mean = 0.0f;
if( count >= 1 )
{
for( int i = 0; i < count; i++ )
mean += samples[i];
mean /= (float) count;
}
return mean;
}
template <class T>
inline float Variance( T samples[], int count )
{
float variance = 0.0f;
if( count > 1 )
{
float mean = 0.0f;
for( int i = 0; i < count; i++ )
mean += samples[i];
mean /= (float) count;
for( int i = 0; i < count; i++ )
{
float sum = (float) samples[i] - mean;
variance += sum*sum;
}
variance /= (float) count - 1.0f;
}
return variance;
}
template <class T>
inline float StdDev( T samples[], int count )
{
return sqrtf( Variance( samples, count ) );
}
template <class T>
inline float VarianceWeighted( T samples[], T weights[], int count )
{
float varianceWeighted = 0.0f;
if( count > 1 )
{
float sumWeights = 0.0f, meanWeighted = 0.0f;
int numNonzero = 0;
for( int i = 0; i < count; i++ )
{
meanWeighted += samples[i]*weights[i];
sumWeights += weights[i];
if( ((float) weights[i]) != 0.0f ) numNonzero++;
}
if( sumWeights != 0.0f && numNonzero > 1 )
{
meanWeighted /= sumWeights;
for( int i = 0; i < count; i++ )
{
float sum = samples[i] - meanWeighted;
varianceWeighted += weights[i]*sum*sum;
}
varianceWeighted *= ((float) numNonzero)/((float) count*(numNonzero - 1.0f)*sumWeights); // this should be right but isn't?!
}
}
return varianceWeighted;
}
template <class T>
inline float StdDevWeighted( T samples[], T weights[], int count )
{
return sqrtf( VarianceWeighted( samples, weights, count ) );
}
Test case:
int samples[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23 };
printf( "%.2f\n", StdDev( samples, 9 ) );
int weights[] = { 1, 1, 0, 0, 4, 1, 2, 1, 0 };
printf( "%.2f\n", StdDevWeighted( samples, weights, 9 ) );
Result:
7.46
1.94
Should be:
7.46
5.82
I think the problem is that weighted variance has a few different interpretations and I don't know which one is standard. I found this variation:
http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Weighted_incremental_algorithm
template <class T>
inline float VarianceWeighted( T samples[], T weights[], int count )
{
float varianceWeighted = 0.0f;
if( count > 1 )
{
float sumWeights = 0.0f, meanWeighted = 0.0f, m2 = 0.0f;
for( int i = 0; i < count; i++ )
{
float temp = weights[i] + sumWeights,
delta = samples[i] - meanWeighted,
r = delta*weights[i]/temp;
meanWeighted += r;
m2 += sumWeights*delta*r; // Alternatively, m2 += weights[i] * delta * (samples[i]−meanWeighted)
sumWeights = temp;
}
varianceWeighted = (m2/sumWeights)*((float) count/(count - 1));
}
return varianceWeighted;
}
Result:
7.46
5.64
I also tried looking at boost and esutil but they didn't help much:
http://www.boost.org/doc/libs/1_48_0/boost/accumulators/statistics/weighted_variance.hpp
http://esutil.googlecode.com/svn-history/r269/trunk/esutil/stat/util.py
I don't need an entire statistics library, and more importantly, I want to understand the implementation.
Can someone please post functions to calculate these correctly?
Bonus points if your functions can do it in a single pass.
Also, does anyone know if weighted variance gives the same result as ordinary variance with repeated values? For example, would the variance of samples[] = { 1, 2, 3, 3 } be the same as weighted variance of samples[] = { 1, 2, 3 }, weights[] = { 1, 1, 2 }?
Update: here is a google docs spreadsheet I have set up to explore the problem. Unfortunately my answers are nowhere close to the NIST pdf. I think the problem is in the unbias step, but I can't see how to fix it.
https://docs.google.com/spreadsheet/ccc?key=0ApzPh5nRin0ldGNNYjhCUTlWTks2TGJrZW4wQUcyZnc&usp=sharing
The result is a weighted variance of 3.77, which is the square of the weighted standard deviation of 1.94 I got in my c++ code.
I am attempting to install octave on my Mac OS X setup so that I can run their var() function with weights, but it is taking forever to install it with brew. I am deeply into yak shaving now.

float mean(uint16_t* x, uint16_t n) {
uint16_t sum_xi = 0;
int i;
for (i = 0; i < n; i++) {
sum_xi += x[i];
}
return (float) sum_xi / n;
}
/**
* http://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weigmean.pdf
*/
float weighted_mean(uint16_t* x, uint16_t* w, uint16_t n) {
int sum_wixi = 0;
int sum_wi = 0;
int i;
for (i = 0; i < n; i++) {
sum_wixi += w[i] * x[i];
sum_wi += w[i];
}
return (float) sum_wixi / (float) sum_wi;
}
float variance(uint16_t* x, uint16_t n) {
float mean_x = mean(x, n);
float dist, dist2;
float sum_dist2 = 0;
int i;
for (i = 0; i < n; i++) {
dist = x[i] - mean_x;
dist2 = dist * dist;
sum_dist2 += dist2;
}
return sum_dist2 / (n - 1);
}
/**
* http://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf
*/
float weighted_variance(uint16_t* x, uint16_t* w, uint16_t n) {
float xw = weighted_mean(x, w, n);
float dist, dist2;
float sum_wi_times_dist2 = 0;
int sum_wi = 0;
int n_prime = 0;
int i;
for (i = 0; i < n; i++) {
dist = x[i] - xw;
dist2 = dist * dist;
sum_wi_times_dist2 += w[i] * dist2;
sum_wi += w[i];
if (w[i] > 0)
n_prime++;
}
if (n_prime > 1) {
return sum_wi_times_dist2 / ((float) ((n_prime - 1) * sum_wi) / n_prime);
} else {
return 0.0f;
}
}
/**
* http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Weighted_incremental_algorithm
*/
float weighted_incremental_variance(uint16_t* x, uint16_t* w, uint16_t n) {
uint16_t sumweight = 0;
float mean = 0;
float M2 = 0;
int n_prime = 0;
uint16_t temp;
float delta;
float R;
int i;
for (i = 0; i < n; i++) {
if (w[i] == 0)
continue;
temp = w[i] + sumweight;
delta = x[i] - mean;
R = delta * w[i] / temp;
mean += R;
M2 += sumweight * delta * R;
sumweight = temp;
n_prime++;
}
if (n_prime > 1) {
float variance_n = M2 / sumweight;
return variance_n * n_prime / (n_prime - 1);
} else {
return 0.0f;
}
}
void main(void) {
uint16_t n = 9;
uint16_t x[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23 };
uint16_t w[] = { 1, 1, 0, 0, 4, 1, 2, 1, 0 };
printf("%f\n", weighted_variance(x, w, n)); /* outputs: 33.900002 */
printf("%f\n", weighted_incremental_variance(x, w, n)); /* outputs: 33.900005 */
}

Solution
You accidentally added an extra term "count" in the denominator of the variance update term.
When using the correction below I get your expected answer of
5.82
FYI, one way to pick up on things like this when you are doing a code review is to do a "dimensional analysis". The "units" of the equation were wrong. You were effectively dividing by an order N squared term when it should be an order N term.
Before
template <class T>
inline float VarianceWeighted( T samples[], T weights[], int count )
{
...
varianceWeighted *= ((float) numNonzero)/((float) count*(numNonzero - 1.0f)*sumWeights); // this should be right but isn't?!
...
}
After
Removing "count" this line should be replaced by
template <class T>
inline float VarianceWeighted( T samples[], T weights[], int count )
{
...
varianceWeighted *= ((float) numNonzero)/((float) (numNonzero - 1.0f)*sumWeights); // removed count term
...
}

Here's a much shorter version with a working Demo :
#include <iostream>
#include <vector>
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/stats.hpp>
#include <boost/accumulators/statistics/weighted_variance.hpp>
#include <boost/accumulators/statistics/variance.hpp>
namespace ba = boost::accumulators;
int main() {
std::vector<double> numbers{2, 3, 5, 7, 11, 13, 17, 19, 23};
std::vector<double> weights{1, 1, 0, 0, 4, 1, 2, 1, 0 };
ba::accumulator_set<double, ba::stats<ba::tag::variance > > acc;
ba::accumulator_set<double, ba::stats<ba::tag::weighted_variance > , double > acc_weighted;
double n = numbers.size();
double N = n;
for(size_t i = 0 ; i<numbers.size() ; i++ ) {
acc ( numbers[i] );
acc_weighted( numbers[i] , ba::weight = weights[i] );
if(weights[i] == 0) {
n=n-1;
}
};
std::cout << "Sample Standard Deviation, s: " << std::sqrt(ba::variance(acc) *N/(N-1)) << std::endl;
std::cout << "Weighted Sample Standard Deviation, s: " << std::sqrt(ba::weighted_variance(acc_weighted)*n/(n-1)) << std::endl;
}
Make note that n must reflect the number of samples with nonzero weights, hence extra n=n-1; line.
Sample Standard Deviation, s: 7.45729
Weighted Sample Standard Deviation, s: 5.82237

Related

Is it possible to use CUDA parallelizing this nested for loop?

I want to speed up this nested for loop, just start learn CUDA, how could I use CUDA to parallel this c++ code ?
#define PI 3.14159265
using namespace std;
int main()
{
int nbint = 2;
int hits = 20;
int nbinp = 2;
float _theta, _phi, _l, _m, _n, _k = 0, delta = 5;
float x[20],y[20],z[20],a[20],t[20];
for (int i = 0; i < hits; ++i)
{
x[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
y[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
z[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
a[i] = rand() / (float)(RAND_MAX / 100);
}
float maxforall = 1e-6;
float theta0;
float phi0;
for (int i = 0; i < nbint; i++)
{
_theta = (0.5 + i)*delta;
for (int j = 0; j < nbinp; j++)
{
_phi = (0.5 + j)*delta / _theta;
_l = sin(_theta* PI / 180.0)*cos(_phi* PI / 180.0);
_m = sin(_theta* PI / 180.0)*sin(_phi* PI / 180.0);
_n = cos(_theta* PI / 180.0);
for (int k = 0; k < hits; k++)
{
_k = -(_l*x[k] + _m*y[k] + _n*z[k]);
t[k] = a[k] - _k;
}
qsort(t, 0, hits - 1);
float max = t[0];
for (int k = 0; k < hits; k++)
{
if (max < t[k])
max = t[k];
}
if (max > maxforall)
{
maxforall = max;
}
}
}
return 0;
}
I want to put innermost for loop and the sort part(maybe the whole nested loop) into parallel. After sort those array I found the maximum of all arrays. I use maximum to simplify the code. The reason I need sort is that maximum represent
here is a continuous time information(all arrays contain time information). The sort part make those time from lowest to highest. Then I compare the a specific time interval(not a single value). The compare process almost like I choose maximum but with a continuous interval not a single value.
Your 3 nested loops calculate nbint*nbinp*hits values. Since each of those values is independent from each other, all values can be calculated in parallel.
You stated in your comments that you have a commutative and associative "filter condition" which reduces the output to a single scalar value. This can be exploited to avoid sorting and storing the temporary values. Instead, we can calculate the values on-the-fly and then apply a parallel reduction to determine the end result.
This can be done in "raw" CUDA, below I implemented this idea using thrust. The main idea is to run grid_op nbint*nbinp*hits times in parallel. In order to find out the three original "loop indices" from the single scalar index which is passed to grid_op the algorithm from this SO question is used.
thrust::transform_reduce performs the on-the-fly transformation and the subsequent parallel reduction (here thrust::maximum is used as a substitute).
#include <cmath>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tuple.h>
// ### BEGIN utility for demo ####
#include <iostream>
#include <thrust/random.h>
thrust::host_vector<float> random_vector(const size_t N)
{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
thrust::host_vector<float> temp(N);
for(size_t i = 0; i < N; i++) {
temp[i] = u01(rng);
}
return temp;
}
// ### END utility for demo ####
template <typename... Iterators>
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
return thrust::make_zip_iterator(thrust::make_tuple(its...));
}
template <typename ZipIterator>
class grid_op
{
public:
grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2) : zipIt(zipIt), dim1(dim1), dim2(dim2){}
__host__ __device__
float operator()(std::size_t index) const
{
const auto coords = unflatten_3d_index(index, dim1, dim2);
const auto values = zipIt[thrust::get<2>(coords)];
const float delta = 5;
const float _theta = (0.5f + thrust::get<0>(coords))*delta;
const float _phi = (0.5f + thrust::get<1>(coords))*delta / _theta;
const float _l = sin(_theta* M_PI / 180.0)*cos(_phi* M_PI / 180.0);
const float _m = sin(_theta* M_PI / 180.0)*sin(_phi* M_PI / 180.0);
const float _n = cos(_theta* M_PI / 180.0);
const float _k = -(_l*thrust::get<0>(values) + _m*thrust::get<1>(values) + _n*thrust::get<2>(values));
return (thrust::get<3>(values) - _k);
}
private:
__host__ __device__
thrust::tuple<std::size_t, std::size_t, std::size_t>
unflatten_3d_index(std::size_t index, std::size_t dim1, std::size_t dim2) const
{
// taken from https://stackoverflow.com/questions/29142417/4d-position-from-1d-index
std::size_t x = index % dim1;
std::size_t y = ( ( index - x ) / dim1 ) % dim2;
std::size_t z = ( ( index - y * dim1 - x ) / (dim1 * dim2) );
return thrust::make_tuple(x,y,z);
}
ZipIterator zipIt;
std::size_t dim1;
std::size_t dim2;
};
template <typename ZipIterator>
grid_op<ZipIterator> make_grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2)
{
return grid_op<ZipIterator>(zipIt, dim1, dim2);
}
int main()
{
const int nbint = 3;
const int nbinp = 4;
const int hits = 20;
const std::size_t N = nbint * nbinp * hits;
thrust::device_vector<float> d_x = random_vector(hits);
thrust::device_vector<float> d_y = random_vector(hits);
thrust::device_vector<float> d_z = random_vector(hits);
thrust::device_vector<float> d_a = random_vector(hits);
auto zipIt = zip(d_x.begin(), d_y.begin(), d_z.begin(), d_a.begin());
auto countingIt = thrust::counting_iterator<std::size_t>(0);
auto unary_op = make_grid_op(zipIt, nbint, nbinp);
auto binary_op = thrust::maximum<float>();
const float init = 0;
float max = thrust::transform_reduce(
countingIt, countingIt+N,
unary_op,
init,
binary_op
);
std::cout << "max = " << max << std::endl;
}

Gradient descent converging towards the wrong value

I'm trying to implement a gradient descent algorithm in C++. Here's the code I have so far :
#include <iostream>
double X[] {163,169,158,158,161,172,156,161,154,145};
double Y[] {52, 68, 49, 73, 71, 99, 50, 82, 56, 46 };
double m, p;
int n = sizeof(X)/sizeof(X[0]);
int main(void) {
double alpha = 0.00004; // 0.00007;
m = (Y[1] - Y[0]) / (X[1] - X[0]);
p = Y[0] - m * X[0];
for (int i = 1; i <= 8; i++) {
gradientStep(alpha);
}
return 0;
}
double Loss_function(void) {
double res = 0;
double tmp;
for (int i = 0; i < n; i++) {
tmp = Y[i] - m * X[i] - p;
res += tmp * tmp;
}
return res / 2.0 / (double)n;
}
void gradientStep(double alpha) {
double pg = 0, mg = 0;
for (int i = 0; i < n; i++) {
pg += Y[i] - m * X[i] - p;
mg += X[i] * (Y[i] - m * X[i] - p);
}
p += alpha * pg / n;
m += alpha * mg / n;
}
This code converges towards m = 2.79822, p = -382.666, and an error of 102.88. But if I use my calculator to find out the correct linear regression model, I find that the correct values of m and p should respectively be 1.601 and -191.1.
I also noticed that the algorithm won't converge for alpha > 0.00007, which seems quite low, and the value of p barely changes during the 8 iterations (or even after 2000 iterations).
What's wrong with my code?
Here's a good overview of the algorithm I'm trying to implement. The values of theta0 and theta1 are called p and m in my program.
Other implementation in python
More about the algorithm
This link gives a comprehensive view of the algorithm; it turns out I was following a completely wrong approach.
The following code does not work properly (and I have no plans to work on it further), but should put on track anyone who's confronted to the same problem as me :
#include <vector>
#include <iostream>
typedef std::vector<double> vect;
std::vector<double> y, omega(2, 0), omega2(2, 0);;
std::vector<std::vector<double>> X;
int n = 10;
int main(void) {
/* Initialize x so that each members contains (1, x_i) */
/* Initialize x so that each members contains y_i */
double alpha = 0.00001;
display();
for (int i = 1; i <= 8; i++) {
gradientStep(alpha);
display();
}
return 0;
}
double f_function(const std::vector<double> &x) {
double c;
for (unsigned int i = 0; i < omega.size(); i++) {
c += omega[i] * x[i];
}
return c;
}
void gradientStep(double alpha) {
for (int i = 0; i < n; i++) {
for (unsigned int j = 0; j < X[0].size(); j++) {
omega2[j] -= alpha/(double)n * (f_function(X[i]) - y[i]) * X[i][j];
}
}
omega = omega2;
}
void display(void) {
double res = 0, tmp = 0;
for (int i = 0; i < n; i++) {
tmp = y[i] - f_function(X[i]);
res += tmp * tmp; // Loss functionn
}
std::cout << "omega = ";
for (unsigned int i = 0; i < omega.size(); i++) {
std::cout << "[" << omega[i] << "] ";
}
std::cout << "\tError : " << res * .5/(double)n << std::endl;
}

opencv: Rigid Transformation between two 3D point clouds

I have two 3D point clouds, and I'd like to use opencv to find the rigid transformation matrix (translation, rotation, constant scaling among all 3 axes).
I've found an estimateRigidTransformation function, but it's only for 2D points apparently
In addition, I've found estimateAffine3D, but it doesn't seem to support rigid transformation mode.
Do I need to just write my own rigid transformation function?
I did not find the required functionality in OpenCV so I have written my own implementation. Based on ideas from OpenSFM.
cv::Vec3d
CalculateMean(const cv::Mat_<cv::Vec3d> &points)
{
cv::Mat_<cv::Vec3d> result;
cv::reduce(points, result, 0, CV_REDUCE_AVG);
return result(0, 0);
}
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3d> &points1, const cv::Mat_<cv::Vec3d> points2)
{
/* Calculate centroids. */
cv::Vec3d t1 = -CalculateMean(points1);
cv::Vec3d t2 = -CalculateMean(points2);
cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
T1(0, 3) = t1[0];
T1(1, 3) = t1[1];
T1(2, 3) = t1[2];
cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
T2(0, 3) = -t2[0];
T2(1, 3) = -t2[1];
T2(2, 3) = -t2[2];
/* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
* which is used for scale calculation.
*/
cv::Mat_<double> C(3, 3, 0.0);
double p1Rms = 0, p2Rms = 0;
for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
cv::Vec3d p1 = points1(ptIdx, 0) + t1;
cv::Vec3d p2 = points2(ptIdx, 0) + t2;
p1Rms += p1.dot(p1);
p2Rms += p2.dot(p2);
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
C(i, j) += p2[i] * p1[j];
}
}
}
cv::Mat_<double> u, s, vh;
cv::SVD::compute(C, s, u, vh);
cv::Mat_<double> R = u * vh;
if (cv::determinant(R) < 0) {
R -= u.col(2) * (vh.row(2) * 2.0);
}
double scale = sqrt(p2Rms / p1Rms);
R *= scale;
cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
R.copyTo(M.colRange(0, 3).rowRange(0, 3));
cv::Mat_<double> result = T2 * M * T1;
result /= result(3, 3);
return result.rowRange(0, 3);
}
I've found PCL to be a nice adjunct to OpenCV. Take a look at their Iterative Closest Point (ICP) example. The provided example registers the two point clouds and then displays the rigid transformation.
Here's my rmsd code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
typedef struct
{
float m[4][4];
} MATRIX;
#define vdiff2(a,b) ( ((a)[0]-(b)[0]) * ((a)[0]-(b)[0]) + \
((a)[1]-(b)[1]) * ((a)[1]-(b)[1]) + \
((a)[2]-(b)[2]) * ((a)[2]-(b)[2]) )
static double alignedrmsd(float *v1, float *v2, int N);
static void centroid(float *ret, float *v, int N);
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx);
static void crossproduct(float *ans, float *pt1, float *pt2);
static void mtx_root(MATRIX *mtx);
static int almostequal(MATRIX *a, MATRIX *b);
static void mulpt(MATRIX *mtx, float *pt);
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y);
static void mtx_identity(MATRIX *mtx);
static void mtx_trans(MATRIX *mtx, float x, float y, float z);
static int mtx_invert(float *mtx, int N);
static float absmaxv(float *v, int N);
/*
calculate rmsd between two structures
Params: v1 - first set of points
v2 - second set of points
N - number of points
mtx - return for transfrom matrix used to align structures
Returns: rmsd score
Notes: mtx can be null. Transform will be rigid. Inputs must
be previously aligned for sequence alignment
*/
double rmsd(float *v1, float *v2, int N, float *mtx)
{
float cent1[3];
float cent2[3];
MATRIX tmtx;
MATRIX tempmtx;
MATRIX move1;
MATRIX move2;
int i;
double answer;
float *temp1 = 0;
float *temp2 = 0;
int err;
assert(N > 3);
temp1 = malloc(N * 3 * sizeof(float));
temp2 = malloc(N * 3 * sizeof(float));
if(!temp1 || !temp2)
goto error_exit;
centroid(cent1, v1, N);
centroid(cent2, v2, N);
for(i=0;i<N;i++)
{
temp1[i*3+0] = v1[i*3+0] - cent1[0];
temp1[i*3+1] = v1[i*3+1] - cent1[1];
temp1[i*3+2] = v1[i*3+2] - cent1[2];
temp2[i*3+0] = v2[i*3+0] - cent2[0];
temp2[i*3+1] = v2[i*3+1] - cent2[1];
temp2[i*3+2] = v2[i*3+2] - cent2[2];
}
err = getalignmtx(temp1, temp2, N, &tmtx);
if(err == -1)
goto error_exit;
mtx_trans(&move1, -cent2[0], -cent2[1], -cent2[2]);
mtx_mul(&tempmtx, &move1, &tmtx);
mtx_trans(&move2, cent1[0], cent1[1], cent1[2]);
mtx_mul(&tmtx, &tempmtx, &move2);
memcpy(temp2, v2, N * sizeof(float) * 3);
for(i=0;i<N;i++)
mulpt(&tmtx, temp2 + i * 3);
answer = alignedrmsd(v1, temp2, N);
free(temp1);
free(temp2);
if(mtx)
memcpy(mtx, &tmtx.m, 16 * sizeof(float));
return answer;
error_exit:
free(temp1);
free(temp2);
if(mtx)
{
for(i=0;i<16;i++)
mtx[i] = 0;
}
return sqrt(-1.0);
}
/*
calculate rmsd between two aligned structures (trivial)
Params: v1 - first structure
v2 - second structure
N - number of points
Returns: rmsd
*/
static double alignedrmsd(float *v1, float *v2, int N)
{
double answer =0;
int i;
for(i=0;i<N;i++)
answer += vdiff2(v1 + i *3, v2 + i * 3);
return sqrt(answer/N);
}
/*
compute the centroid
*/
static void centroid(float *ret, float *v, int N)
{
int i;
ret[0] = 0;
ret[1] = 0;
ret[2] = 0;
for(i=0;i<N;i++)
{
ret[0] += v[i*3+0];
ret[1] += v[i*3+1];
ret[2] += v[i*3+2];
}
ret[0] /= N;
ret[1] /= N;
ret[2] /= N;
}
/*
get the matrix needed to align two structures
Params: v1 - reference structure
v2 - structure to align
N - number of points
mtx - return for rigid body alignment matrix
Notes: only calculates rotation part of matrix.
assumes input has been aligned to centroids
*/
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx)
{
MATRIX A = { {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,1}} };
MATRIX At;
MATRIX Ainv;
MATRIX temp;
float tv[3];
float tw[3];
float tv2[3];
float tw2[3];
int k, i, j;
int flag = 0;
float correction;
correction = absmaxv(v1, N * 3) * absmaxv(v2, N * 3);
for(k=0;k<N;k++)
for(i=0;i<3;i++)
for(j=0;j<3;j++)
A.m[i][j] += (v1[k*3+i] * v2[k*3+j])/correction;
while(flag < 3)
{
for(i=0;i<4;i++)
for(j=0;j<4;j++)
At.m[i][j] = A.m[j][i];
memcpy(&Ainv, &A, sizeof(MATRIX));
/* this will happen if all points are in a plane */
if( mtx_invert((float *) &Ainv, 4) == -1)
{
if(flag == 0)
{
crossproduct(tv, v1, v1+3);
crossproduct(tw, v2, v2+3);
}
else
{
crossproduct(tv2, tv, v1);
crossproduct(tw2, tw, v2);
memcpy(tv, tv2, 3 * sizeof(float));
memcpy(tw, tw2, 3 * sizeof(float));
}
for(i=0;i<3;i++)
for(j=0;j<3;j++)
A.m[i][j] += tv[i] * tw[j];
flag++;
}
else
flag = 5;
}
if(flag != 5)
return -1;
mtx_mul(&temp, &At, &A);
mtx_root(&temp);
mtx_mul(mtx, &temp, &Ainv);
return 0;
}
/*
get the crossproduct of two vectors.
Params: ans - return pinter for answer.
pt1 - first vector
pt2 - second vector.
Notes: crossproduct is at right angles to the two vectors.
*/
static void crossproduct(float *ans, float *pt1, float *pt2)
{
ans[0] = pt1[1] * pt2[2] - pt1[2] * pt2[1];
ans[1] = pt1[0] * pt2[2] - pt1[2] * pt2[0];
ans[2] = pt1[0] * pt2[1] - pt1[1] * pt2[0];
}
/*
Denman-Beavers square root iteration
*/
static void mtx_root(MATRIX *mtx)
{
MATRIX Y = *mtx;
MATRIX Z;
MATRIX Y1;
MATRIX Z1;
MATRIX invY;
MATRIX invZ;
MATRIX Y2;
int iter = 0;
int i, ii;
mtx_identity(&Z);
do
{
invY = Y;
invZ = Z;
if( mtx_invert((float *) &invY, 4) == -1)
return;
if( mtx_invert((float *) &invZ, 4) == -1)
return;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
Y1.m[i][ii] = 0.5 * (Y.m[i][ii] + invZ.m[i][ii]);
Z1.m[i][ii] = 0.5 * (Z.m[i][ii] + invY.m[i][ii]);
}
Y = Y1;
Z = Z1;
mtx_mul(&Y2, &Y, &Y);
}
while(!almostequal(&Y2, mtx) && iter++ < 20 );
*mtx = Y;
}
/*
Check two matrices for near-enough equality
Params: a - first matrix
b - second matrix
Returns: 1 if almost equal, else 0, epsilon 0.0001f.
*/
static int almostequal(MATRIX *a, MATRIX *b)
{
int i, ii;
float epsilon = 0.001f;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
if(fabs(a->m[i][ii] - b->m[i][ii]) > epsilon)
return 0;
return 1;
}
/*
multiply a point by a matrix.
Params: mtx - matrix
pt - the point (transformed)
*/
static void mulpt(MATRIX *mtx, float *pt)
{
float ans[4] = {0};
int i;
int ii;
for(i=0;i<4;i++)
{
for(ii=0;ii<3;ii++)
{
ans[i] += pt[ii] * mtx->m[ii][i];
}
ans[i] += mtx->m[3][i];
}
pt[0] = ans[0];
pt[1] = ans[1];
pt[2] = ans[2];
}
/*
multiply two matrices.
Params: ans - return pointer for answer.
x - first matrix
y - second matrix.
Notes: ans may not be equal to x or y.
*/
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y)
{
int i;
int ii;
int iii;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
ans->m[i][ii] = 0;
for(iii=0;iii<4;iii++)
ans->m[i][ii] += x->m[i][iii] * y->m[iii][ii];
}
}
/*
create an identity matrix.
Params: mtx - return pointer.
*/
static void mtx_identity(MATRIX *mtx)
{
int i;
int ii;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
if(i==ii)
mtx->m[i][ii] = 1.0f;
else
mtx->m[i][ii] = 0;
}
}
/*
create a translation matrix.
Params: mtx - return pointer for matrix.
x - x translation.
y - y translation.
z - z translation
*/
static void mtx_trans(MATRIX *mtx, float x, float y, float z)
{
mtx->m[0][0] = 1;
mtx->m[0][1] = 0;
mtx->m[0][2] = 0;
mtx->m[0][3] = 0;
mtx->m[1][0] = 0;
mtx->m[1][1] = 1;
mtx->m[1][2] = 0;
mtx->m[1][3] = 0;
mtx->m[2][0] = 0;
mtx->m[2][1] = 0;
mtx->m[2][2] = 1;
mtx->m[2][3] = 0;
mtx->m[3][0] = x;
mtx->m[3][1] = y;
mtx->m[3][2] = z;
mtx->m[3][3] = 1;
}
/*
matrix invert routine
Params: mtx - the matrix in raw format, in/out
N - width and height
Returns: 0 on success, -1 on fail
*/
static int mtx_invert(float *mtx, int N)
{
int indxc[100]; /* these 100s are the only restriction on matrix size */
int indxr[100];
int ipiv[100];
int i, j, k;
int irow, icol;
double big;
double pinv;
int l, ll;
double dum;
double temp;
assert(N <= 100);
for(i=0;i<N;i++)
ipiv[i] = 0;
for(i=0;i<N;i++)
{
big = 0.0;
/* find biggest element */
for(j=0;j<N;j++)
if(ipiv[j] != 1)
for(k=0;k<N;k++)
if(ipiv[k] == 0)
if(fabs(mtx[j*N+k]) >= big)
{
big = fabs(mtx[j*N+k]);
irow = j;
icol = k;
}
ipiv[icol]=1;
if(irow != icol)
for(l=0;l<N;l++)
{
temp = mtx[irow * N + l];
mtx[irow * N + l] = mtx[icol * N + l];
mtx[icol * N + l] = temp;
}
indxr[i] = irow;
indxc[i] = icol;
/* if biggest element is zero matrix is singular, bail */
if(mtx[icol* N + icol] == 0)
goto error_exit;
pinv = 1.0/mtx[icol * N + icol];
mtx[icol * N + icol] = 1.0;
for(l=0;l<N;l++)
mtx[icol * N + l] *= pinv;
for(ll=0;ll<N;ll++)
if(ll != icol)
{
dum = mtx[ll * N + icol];
mtx[ll * N + icol] = 0.0;
for(l=0;l<N;l++)
mtx[ll * N + l] -= mtx[icol * N + l]*dum;
}
}
/* unscramble matrix */
for (l=N-1;l>=0;l--)
{
if (indxr[l] != indxc[l])
for (k=0;k<N;k++)
{
temp = mtx[k * N + indxr[l]];
mtx[k * N + indxr[l]] = mtx[k * N + indxc[l]];
mtx[k * N + indxc[l]] = temp;
}
}
return 0;
error_exit:
return -1;
}
/*
get the asolute maximum of an array
*/
static float absmaxv(float *v, int N)
{
float answer;
int i;
for(i=0;i<N;i++)
if(answer < fabs(v[i]))
answer = fabs(v[i]);
return answer;
}
#include <stdio.h>
/*
debug utlitiy
*/
static void printmtx(FILE *fp, MATRIX *mtx)
{
int i, ii;
for(i=0;i<4;i++)
{
for(ii=0;ii<4;ii++)
fprintf(fp, "%f, ", mtx->m[i][ii]);
fprintf(fp, "\n");
}
}
int rmsdmain(void)
{
float one[4*3] = {0,0,0, 1,0,0, 2,1,0, 0,3,1};
float two[4*3] = {0,0,0, 0,1,0, 1,2,0, 3,0,1};
MATRIX mtx;
double diff;
int i;
diff = rmsd(one, two, 4, (float *) &mtx.m);
printf("%f\n", diff);
printmtx(stdout, &mtx);
for(i=0;i<4;i++)
{
mulpt(&mtx, two + i * 3);
printf("%f %f %f\n", two[i*3], two[i*3+1], two[i*3+2]);
}
return 0;
}
I took #vagran's implementation and added RANSAC on top of it, since estimateRigidTransform2d does it and it was helpful for me since my data is noisy. (Note: This code doesn't have constant scaling along all 3 axes; you can add it back in easily by comparing to vargran's).
cv::Vec3f CalculateMean(const cv::Mat_<cv::Vec3f> &points)
{
if(points.size().height == 0){
return 0;
}
assert(points.size().width == 1);
double mx = 0.0;
double my = 0.0;
double mz = 0.0;
int n_points = points.size().height;
for(int i = 0; i < n_points; i++){
double x = double(points(i)[0]);
double y = double(points(i)[1]);
double z = double(points(i)[2]);
mx += x;
my += y;
mz += z;
}
return cv::Vec3f(mx/n_points, my/n_points, mz/n_points);
}
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> points2)
{
/* Calculate centroids. */
cv::Vec3f t1 = CalculateMean(points1);
cv::Vec3f t2 = CalculateMean(points2);
cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
T1(0, 3) = double(-t1[0]);
T1(1, 3) = double(-t1[1]);
T1(2, 3) = double(-t1[2]);
cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
T2(0, 3) = double(t2[0]);
T2(1, 3) = double(t2[1]);
T2(2, 3) = double(t2[2]);
/* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
* which is used for scale calculation.
*/
cv::Mat_<double> C(3, 3, 0.0);
for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
cv::Vec3f p1 = points1(ptIdx) - t1;
cv::Vec3f p2 = points2(ptIdx) - t2;
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
C(i, j) += double(p2[i] * p1[j]);
}
}
}
cv::Mat_<double> u, s, vt;
cv::SVD::compute(C, s, u, vt);
cv::Mat_<double> R = u * vt;
if (cv::determinant(R) < 0) {
R -= u.col(2) * (vt.row(2) * 2.0);
}
cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
R.copyTo(M.colRange(0, 3).rowRange(0, 3));
cv::Mat_<double> result = T2 * M * T1;
result /= result(3, 3);
return result;
}
cv::Mat_<double> RANSACFindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> &points2)
{
cv::Mat points1Homo;
cv::convertPointsToHomogeneous(points1, points1Homo);
int iterations = 100;
int min_n_points = 3;
int n_points = points1.size().height;
std::vector<int> range(n_points);
cv::Mat_<double> best;
int best_inliers = -1;
// inlier points should be projected within this many units
float threshold = .02;
std::iota(range.begin(), range.end(), 0);
auto gen = std::mt19937{std::random_device{}()};
for(int i = 0; i < iterations; i++) {
std::shuffle(range.begin(), range.end(), gen);
cv::Mat_<cv::Vec3f> points1subset(min_n_points, 1, cv::Vec3f(0,0,0));
cv::Mat_<cv::Vec3f> points2subset(min_n_points, 1, cv::Vec3f(0,0,0));
for(int j = 0; j < min_n_points; j++) {
points1subset(j) = points1(range[j]);
points2subset(j) = points2(range[j]);
}
cv::Mat_<float> rigidT = FindRigidTransform(points1subset, points2subset);
cv::Mat_<float> rigidT_float = cv::Mat::eye(4, 4, CV_32F);
rigidT.convertTo(rigidT_float, CV_32F);
std::vector<int> inliers;
for(int j = 0; j < n_points; j++) {
cv::Mat_<float> t1_3d = rigidT_float * cv::Mat_<float>(points1Homo.at<cv::Vec4f>(j));
if(t1_3d(3) == 0) {
continue; // Avoid 0 division
}
float dx = (t1_3d(0)/t1_3d(3) - points2(j)[0]);
float dy = (t1_3d(1)/t1_3d(3) - points2(j)[1]);
float dz = (t1_3d(2)/t1_3d(3) - points2(j)[2]);
float square_dist = dx * dx + dy * dy + dz * dz;
if(square_dist < threshold * threshold){
inliers.push_back(j);
}
}
int n_inliers = inliers.size();
if(n_inliers > best_inliers) {
best_inliers = n_inliers;
best = rigidT;
}
}
return best;
}
#vagran Thanks for the code! Seems to work very well.
I do have a little terminology suggestion though. Since you are estimating and applying a scale during the transformation, it is a 7-parameter transformation, or Helmert / similarity transformation. And in a rigid transformation, no scaling is applied because all Euclidiean distances need to be reserved.
I would've added this as comment, but don't have enough points.. D: sorry for that.
rigid transformation: https://en.wikipedia.org/wiki/Rigid_transformation
Helmert transformation: https://www.researchgate.net/publication/322841143_Parameter_estimation_in_3D_affine_and_similarity_transformation_implementation_of_variance_component_estimation

TBB Reduction causes segfault when using more than 1 logical core

I implemented the Jacobi algorithm using TBB and it works just fine. Then I parallelized the convergence calculation using a reduction, but for some reason if I use more than 1 logical core I get an segmentation fault and i can't figure out why.
I can use more than 1 thread on a system that has only 1 logical core.
The same implementation using OpenMP works without a hassle
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/blocked_range.h>
#include <tbb/task_scheduler_init.h>
#include <tbb/tick_count.h>
// ----------------------------------------------------------------
#define SIZE 1024
#define RESIDUO 0.0009f*SIZE
#define THREADS 2
using namespace tbb;
// ----------------------------------------------------------------
struct Sum {
float ret;
float (*a)[SIZE];
float (*x);
float (*b);
Sum(float A[SIZE][SIZE], float X[SIZE], float B[SIZE]) : ret(0), a(A), x(X), b(B) {}
Sum( Sum&, split ) {ret = 0;}
void operator()( const blocked_range<int>& r ) {
float temp = ret;
for( int i = r.begin(); i != r.end(); i++ ) {
float sum = 0.0f;
for(int j = 0; j < SIZE; j++){
sum += a[i][j] * x[j];
}
temp += pow(b[i] - sum, 2);
}
ret = temp;
}
void join( Sum& rhs ) {ret += rhs.ret;}
};
/*
// || b - Ax ||
*/
int converge(float a[SIZE][SIZE], float x[SIZE], float b[SIZE]){
Sum total(a, x, b);
parallel_reduce( blocked_range<int>(0, SIZE), total );
float norm = sqrt(total.ret);
printf("Ret: %f | Residuo: %f\n", total.ret, norm);
return (norm <= RESIDUO);
}
// ----------------------------------------------------------------
float randomFloat()
{
float r = (float)rand()/(float)RAND_MAX;
return r;
}
// ----------------------------------------------------------------
int check_ddm(float (*a)[SIZE]){
float sum = 0.0f;
int i = 0, j = 0;
for(i = 0; i < SIZE; i++){
sum = 0.0f;
for(j = 0; j < SIZE; j++){
if(i != j){
sum += a[i][j];
}
}
if(a[i][i] < sum){
printf("line: %d, sum: %f, a[i][i]: %f \n", i, sum, a[i][i]);
for(j = 0; j < SIZE; j++){
if(i != j) printf("%f ", a[i][j]);
else printf("(%f) ", a[i][j]);
}
printf("\n");
return 0;
}
}
return 1;
}
// ----------------------------------------------------------------
int generate_ddm(float (*a)[SIZE], float *b)
{
int i = 0, j = 0;
float line = 0.0f;
for(i = 0; i < SIZE; i++){
line = 0.0f;
for(j = 0; j < SIZE; j++){
if(i != j){
a[i][j] = randomFloat();
}
line += a[i][j];
}
a[i][i] = SIZE;
b[i] = line + SIZE;
}
return check_ddm(a);
}
// ----------------------------------------------------------------
int main( )
{
float (*x)[SIZE] = (float(*)[SIZE])malloc(sizeof *x * 2);
float (*a)[SIZE] = (float(*)[SIZE])malloc(sizeof *a * SIZE);
float (*b) = (float*)malloc(sizeof(float) * SIZE);
int i = 0, j = 0;
float delta = 0.0f;
int read = 0;
int write = 1;
srand(time(NULL));
tbb::task_scheduler_init init(THREADS);
// set up initial solution
for(i = 0; i < SIZE; i++){
x[0][i] = i;
x[1][i] = i;
}
// generate a diagonal dominant matrix
if(!generate_ddm(a, b)){
printf("Array generated is not ddm!\n");
return 1;
}
tick_count startTime = tick_count::now();
while(!converge(a, x[write], b)){
read = !read;
write = !write;
parallel_for(blocked_range<int>(0,SIZE),
[&] (const blocked_range<int>& r) {
for (int i = r.begin(); i < r.end(); i++) {
float delta = 0.0f;
for(int j = 0; j < SIZE; j++){
if(j != i){
delta += a[i][j] * x[read][j];
}
}
x[write][i] = (b[i] - delta) / a[i][i];
}
});
}
tick_count lastTime = tick_count::now();
float walltime = (lastTime - startTime).seconds();
printf("tbb %f\n", walltime);
converge(a, x[write], b);
printf("x0: %f | x%d: %f\n", x[write][0], SIZE-1, x[write][SIZE-1]);
free(a);
free(b);
free(x);
return 0;
}
The segfault occurs on the following line inside the Sum class:
sum += a[i][j] * x[j];
And if I change that line to
float tmpa = a[i][j];
float tmpx = x[j];
sum += tmpa * tmpx;
The error continues to be on
sum += tmpa * tmpx;
In the original version, the "splitting constructor" left a, x, and b undefined. They need to be copied from the incoming Sum& argument. E.g., change the splitting constructor to:
Sum( Sum& s, split ) {a=s.a; b=s.b; x=s.x; ret = 0;}
Changing the Class to a lambda expression solved the problem. It maybe a bug in TBB's parallel_reduce
int converge(float a[SIZE][SIZE], float x[SIZE], float b[SIZE]){
float val = 0.0f;
val = parallel_reduce(
blocked_range<int>(0, SIZE),
0.0f,
[&]( const blocked_range<int>& r, float init )->float {
float temp = init;
for(int i = r.begin(); i != r.end(); i++ ) {
float sum = 0.0f;
for(int j = 0; j < SIZE; j++){
sum += a[i][j] * x[j];
}
temp += pow(b[i] - sum, 2);
}
return temp;
},
[]( float x, float y)->float{
return x+y;
}
);
float norm = sqrt(val);
printf("Ret: %f | Residuo: %f\n", val, norm);
return (norm <= RESIDUO);
}

bandpass butterworth filter implementation in C++

I am implementing an image analysis algorithm using openCV and c++, but I found out openCV doesnt have any function for Butterworth Bandpass filter officially.
in my project I have to pass a time series of pixels into the Butterworth 5 order filter and the function will return the filtered time series pixels. Butterworth(pixelseries,order, frequency), if you have any idea to help me of how to start please let me know. Thank you
EDIT :
after getting help, finally I come up with the following code. which can calculate the Numerator Coefficients and Denominator Coefficients, but the problem is that some of the numbers is not as same as matlab results. here is my code:
#include <iostream>
#include <stdio.h>
#include <vector>
#include <math.h>
using namespace std;
#define N 10 //The number of images which construct a time series for each pixel
#define PI 3.14159
double *ComputeLP( int FilterOrder )
{
double *NumCoeffs;
int m;
int i;
NumCoeffs = (double *)calloc( FilterOrder+1, sizeof(double) );
if( NumCoeffs == NULL ) return( NULL );
NumCoeffs[0] = 1;
NumCoeffs[1] = FilterOrder;
m = FilterOrder/2;
for( i=2; i <= m; ++i)
{
NumCoeffs[i] =(double) (FilterOrder-i+1)*NumCoeffs[i-1]/i;
NumCoeffs[FilterOrder-i]= NumCoeffs[i];
}
NumCoeffs[FilterOrder-1] = FilterOrder;
NumCoeffs[FilterOrder] = 1;
return NumCoeffs;
}
double *ComputeHP( int FilterOrder )
{
double *NumCoeffs;
int i;
NumCoeffs = ComputeLP(FilterOrder);
if(NumCoeffs == NULL ) return( NULL );
for( i = 0; i <= FilterOrder; ++i)
if( i % 2 ) NumCoeffs[i] = -NumCoeffs[i];
return NumCoeffs;
}
double *TrinomialMultiply( int FilterOrder, double *b, double *c )
{
int i, j;
double *RetVal;
RetVal = (double *)calloc( 4 * FilterOrder, sizeof(double) );
if( RetVal == NULL ) return( NULL );
RetVal[2] = c[0];
RetVal[3] = c[1];
RetVal[0] = b[0];
RetVal[1] = b[1];
for( i = 1; i < FilterOrder; ++i )
{
RetVal[2*(2*i+1)] += c[2*i] * RetVal[2*(2*i-1)] - c[2*i+1] * RetVal[2*(2*i-1)+1];
RetVal[2*(2*i+1)+1] += c[2*i] * RetVal[2*(2*i-1)+1] + c[2*i+1] * RetVal[2*(2*i-1)];
for( j = 2*i; j > 1; --j )
{
RetVal[2*j] += b[2*i] * RetVal[2*(j-1)] - b[2*i+1] * RetVal[2*(j-1)+1] +
c[2*i] * RetVal[2*(j-2)] - c[2*i+1] * RetVal[2*(j-2)+1];
RetVal[2*j+1] += b[2*i] * RetVal[2*(j-1)+1] + b[2*i+1] * RetVal[2*(j-1)] +
c[2*i] * RetVal[2*(j-2)+1] + c[2*i+1] * RetVal[2*(j-2)];
}
RetVal[2] += b[2*i] * RetVal[0] - b[2*i+1] * RetVal[1] + c[2*i];
RetVal[3] += b[2*i] * RetVal[1] + b[2*i+1] * RetVal[0] + c[2*i+1];
RetVal[0] += b[2*i];
RetVal[1] += b[2*i+1];
}
return RetVal;
}
double *ComputeNumCoeffs(int FilterOrder)
{
double *TCoeffs;
double *NumCoeffs;
int i;
NumCoeffs = (double *)calloc( 2*FilterOrder+1, sizeof(double) );
if( NumCoeffs == NULL ) return( NULL );
TCoeffs = ComputeHP(FilterOrder);
if( TCoeffs == NULL ) return( NULL );
for( i = 0; i < FilterOrder; ++i)
{
NumCoeffs[2*i] = TCoeffs[i];
NumCoeffs[2*i+1] = 0.0;
}
NumCoeffs[2*FilterOrder] = TCoeffs[FilterOrder];
free(TCoeffs);
return NumCoeffs;
}
double *ComputeDenCoeffs( int FilterOrder, double Lcutoff, double Ucutoff )
{
int k; // loop variables
double theta; // PI * (Ucutoff - Lcutoff) / 2.0
double cp; // cosine of phi
double st; // sine of theta
double ct; // cosine of theta
double s2t; // sine of 2*theta
double c2t; // cosine 0f 2*theta
double *RCoeffs; // z^-2 coefficients
double *TCoeffs; // z^-1 coefficients
double *DenomCoeffs; // dk coefficients
double PoleAngle; // pole angle
double SinPoleAngle; // sine of pole angle
double CosPoleAngle; // cosine of pole angle
double a; // workspace variables
cp = cos(PI * (Ucutoff + Lcutoff) / 2.0);
theta = PI * (Ucutoff - Lcutoff) / 2.0;
st = sin(theta);
ct = cos(theta);
s2t = 2.0*st*ct; // sine of 2*theta
c2t = 2.0*ct*ct - 1.0; // cosine of 2*theta
RCoeffs = (double *)calloc( 2 * FilterOrder, sizeof(double) );
TCoeffs = (double *)calloc( 2 * FilterOrder, sizeof(double) );
for( k = 0; k < FilterOrder; ++k )
{
PoleAngle = PI * (double)(2*k+1)/(double)(2*FilterOrder);
SinPoleAngle = sin(PoleAngle);
CosPoleAngle = cos(PoleAngle);
a = 1.0 + s2t*SinPoleAngle;
RCoeffs[2*k] = c2t/a;
RCoeffs[2*k+1] = s2t*CosPoleAngle/a;
TCoeffs[2*k] = -2.0*cp*(ct+st*SinPoleAngle)/a;
TCoeffs[2*k+1] = -2.0*cp*st*CosPoleAngle/a;
}
DenomCoeffs = TrinomialMultiply(FilterOrder, TCoeffs, RCoeffs );
free(TCoeffs);
free(RCoeffs);
DenomCoeffs[1] = DenomCoeffs[0];
DenomCoeffs[0] = 1.0;
for( k = 3; k <= 2*FilterOrder; ++k )
DenomCoeffs[k] = DenomCoeffs[2*k-2];
return DenomCoeffs;
}
void filter(int ord, double *a, double *b, int np, double *x, double *y)
{
int i,j;
y[0]=b[0] * x[0];
for (i=1;i<ord+1;i++)
{
y[i]=0.0;
for (j=0;j<i+1;j++)
y[i]=y[i]+b[j]*x[i-j];
for (j=0;j<i;j++)
y[i]=y[i]-a[j+1]*y[i-j-1];
}
for (i=ord+1;i<np+1;i++)
{
y[i]=0.0;
for (j=0;j<ord+1;j++)
y[i]=y[i]+b[j]*x[i-j];
for (j=0;j<ord;j++)
y[i]=y[i]-a[j+1]*y[i-j-1];
}
}
int main(int argc, char *argv[])
{
//Frequency bands is a vector of values - Lower Frequency Band and Higher Frequency Band
//First value is lower cutoff and second value is higher cutoff
double FrequencyBands[2] = {0.25,0.375};//these values are as a ratio of f/fs, where fs is sampling rate, and f is cutoff frequency
//and therefore should lie in the range [0 1]
//Filter Order
int FiltOrd = 5;
//Pixel Time Series
/*int PixelTimeSeries[N];
int outputSeries[N];
*/
//Create the variables for the numerator and denominator coefficients
double *DenC = 0;
double *NumC = 0;
//Pass Numerator Coefficients and Denominator Coefficients arrays into function, will return the same
NumC = ComputeNumCoeffs(FiltOrd);
for(int k = 0; k<11; k++)
{
printf("NumC is: %lf\n", NumC[k]);
}
//is A in matlab function and the numbers are correct
DenC = ComputeDenCoeffs(FiltOrd, FrequencyBands[0], FrequencyBands[1]);
for(int k = 0; k<11; k++)
{
printf("DenC is: %lf\n", DenC[k]);
}
double y[5];
double x[5]={1,2,3,4,5};
filter(5, DenC, NumC, 5, x, y);
return 1;
}
I get this resutls for my code :
B= 1,0,-5,0,10,0,-10,0,5,0,-1
A= 1.000000000000000, -4.945988709743181, 13.556489496973796, -24.700711850327743,
32.994881546824828, -33.180726698160655, 25.546126213403539, -14.802008410165968,
6.285430089797051, -1.772929809750849, 0.277753012228403
but if I want to test the coefficinets in same frequency band in MATLAB, I get the following results:
>> [B, A]=butter(5, [0.25,0.375])
B = 0.0002, 0, -0.0008, 0, 0.0016, 0, -0.0016, 0, 0.0008, 0, -0.0002
A = 1.0000, -4.9460, 13.5565, -24.7007, 32.9948, -33.1806, 25.5461, -14.8020, 6.2854, -1.7729, 0.2778
I have test this website :http://www.exstrom.com/journal/sigproc/ code, but the result is equal as mine, not matlab. anybody knows why? or how can I get the same result as matlab toolbox?
I know this is a post on an old thread, and I would usually leave this as a comment, but I'm apparently not able to do that.
In any case, for people searching for similar code, I thought I would post the link from where this code originates (it also has C code for other types of Butterworth filter coefficients and some other cool signal processing code).
The code is located here:
http://www.exstrom.com/journal/sigproc/
Additionally, I think there is a piece of code which calculates said scaling factor for you already.
/**********************************************************************
sf_bwbp - calculates the scaling factor for a butterworth bandpass filter.
The scaling factor is what the c coefficients must be multiplied by so
that the filter response has a maximum value of 1.
*/
double sf_bwbp( int n, double f1f, double f2f )
{
int k; // loop variables
double ctt; // cotangent of theta
double sfr, sfi; // real and imaginary parts of the scaling factor
double parg; // pole angle
double sparg; // sine of pole angle
double cparg; // cosine of pole angle
double a, b, c; // workspace variables
ctt = 1.0 / tan(M_PI * (f2f - f1f) / 2.0);
sfr = 1.0;
sfi = 0.0;
for( k = 0; k < n; ++k )
{
parg = M_PI * (double)(2*k+1)/(double)(2*n);
sparg = ctt + sin(parg);
cparg = cos(parg);
a = (sfr + sfi)*(sparg - cparg);
b = sfr * sparg;
c = -sfi * cparg;
sfr = b - c;
sfi = a - b - c;
}
return( 1.0 / sfr );
}
I finally found it.
I just need to implement the following code from matlab source code to c++ . "the_mandrill" were right, I need to add the normalizing constant into the coefficient:
kern = exp(-j*w*(0:length(b)-1));
b = real(b*(kern*den(:))/(kern*b(:)));
EDIT:
and here is the final edition, which the whole code will return numbers exactly equal to MATLAB :
double *ComputeNumCoeffs(int FilterOrder,double Lcutoff, double Ucutoff, double *DenC)
{
double *TCoeffs;
double *NumCoeffs;
std::complex<double> *NormalizedKernel;
double Numbers[11]={0,1,2,3,4,5,6,7,8,9,10};
int i;
NumCoeffs = (double *)calloc( 2*FilterOrder+1, sizeof(double) );
if( NumCoeffs == NULL ) return( NULL );
NormalizedKernel = (std::complex<double> *)calloc( 2*FilterOrder+1, sizeof(std::complex<double>) );
if( NormalizedKernel == NULL ) return( NULL );
TCoeffs = ComputeHP(FilterOrder);
if( TCoeffs == NULL ) return( NULL );
for( i = 0; i < FilterOrder; ++i)
{
NumCoeffs[2*i] = TCoeffs[i];
NumCoeffs[2*i+1] = 0.0;
}
NumCoeffs[2*FilterOrder] = TCoeffs[FilterOrder];
double cp[2];
double Bw, Wn;
cp[0] = 2*2.0*tan(PI * Lcutoff/ 2.0);
cp[1] = 2*2.0*tan(PI * Ucutoff / 2.0);
Bw = cp[1] - cp[0];
//center frequency
Wn = sqrt(cp[0]*cp[1]);
Wn = 2*atan2(Wn,4);
double kern;
const std::complex<double> result = std::complex<double>(-1,0);
for(int k = 0; k<11; k++)
{
NormalizedKernel[k] = std::exp(-sqrt(result)*Wn*Numbers[k]);
}
double b=0;
double den=0;
for(int d = 0; d<11; d++)
{
b+=real(NormalizedKernel[d]*NumCoeffs[d]);
den+=real(NormalizedKernel[d]*DenC[d]);
}
for(int c = 0; c<11; c++)
{
NumCoeffs[c]=(NumCoeffs[c]*den)/b;
}
free(TCoeffs);
return NumCoeffs;
}
There are code which could be found online implementing butterworth filter. If you use the source code to try to get result matching MATLAB results, there will be the same problem.Basically the result you got from the code hasn't been normalized, and in the source code there is a variable sff in bwhp.c. If you set that to 1, the problem will be easily solved.
I recommend you to use this source code and
the source code and usage could be found here
I added the final edition of function ComputeNumCoeffs to the program and fix "FilterOrder" (k<11 to k<2*FiltOrd+1). Maybe it will save someone's time.
f1=0.5Gz, f2=10Gz, fs=127Gz/2
In MatLab
a={1.000000000000000,-3.329746259105707, 4.180522138699884,-2.365540522960743,0.514875789136976};
b={0.041065495448784, 0.000000000000000,-0.082130990897568, 0.000000000000000,0.041065495448784};
Program:
#include <iostream>
#include <stdio.h>
#include <vector>
#include <math.h>
#include <complex>
using namespace std;
#define N 10 //The number of images which construct a time series for each pixel
#define PI 3.1415926535897932384626433832795
double *ComputeLP(int FilterOrder)
{
double *NumCoeffs;
int m;
int i;
NumCoeffs = (double *)calloc(FilterOrder+1, sizeof(double));
if(NumCoeffs == NULL) return(NULL);
NumCoeffs[0] = 1;
NumCoeffs[1] = FilterOrder;
m = FilterOrder/2;
for(i=2; i <= m; ++i)
{
NumCoeffs[i] =(double) (FilterOrder-i+1)*NumCoeffs[i-1]/i;
NumCoeffs[FilterOrder-i]= NumCoeffs[i];
}
NumCoeffs[FilterOrder-1] = FilterOrder;
NumCoeffs[FilterOrder] = 1;
return NumCoeffs;
}
double *ComputeHP(int FilterOrder)
{
double *NumCoeffs;
int i;
NumCoeffs = ComputeLP(FilterOrder);
if(NumCoeffs == NULL) return(NULL);
for(i = 0; i <= FilterOrder; ++i)
if(i % 2) NumCoeffs[i] = -NumCoeffs[i];
return NumCoeffs;
}
double *TrinomialMultiply(int FilterOrder, double *b, double *c)
{
int i, j;
double *RetVal;
RetVal = (double *)calloc(4 * FilterOrder, sizeof(double));
if(RetVal == NULL) return(NULL);
RetVal[2] = c[0];
RetVal[3] = c[1];
RetVal[0] = b[0];
RetVal[1] = b[1];
for(i = 1; i < FilterOrder; ++i)
{
RetVal[2*(2*i+1)] += c[2*i] * RetVal[2*(2*i-1)] - c[2*i+1] * RetVal[2*(2*i-1)+1];
RetVal[2*(2*i+1)+1] += c[2*i] * RetVal[2*(2*i-1)+1] + c[2*i+1] * RetVal[2*(2*i-1)];
for(j = 2*i; j > 1; --j)
{
RetVal[2*j] += b[2*i] * RetVal[2*(j-1)] - b[2*i+1] * RetVal[2*(j-1)+1] +
c[2*i] * RetVal[2*(j-2)] - c[2*i+1] * RetVal[2*(j-2)+1];
RetVal[2*j+1] += b[2*i] * RetVal[2*(j-1)+1] + b[2*i+1] * RetVal[2*(j-1)] +
c[2*i] * RetVal[2*(j-2)+1] + c[2*i+1] * RetVal[2*(j-2)];
}
RetVal[2] += b[2*i] * RetVal[0] - b[2*i+1] * RetVal[1] + c[2*i];
RetVal[3] += b[2*i] * RetVal[1] + b[2*i+1] * RetVal[0] + c[2*i+1];
RetVal[0] += b[2*i];
RetVal[1] += b[2*i+1];
}
return RetVal;
}
double *ComputeNumCoeffs(int FilterOrder,double Lcutoff, double Ucutoff, double *DenC)
{
double *TCoeffs;
double *NumCoeffs;
std::complex<double> *NormalizedKernel;
double Numbers[11]={0,1,2,3,4,5,6,7,8,9,10};
int i;
NumCoeffs = (double *)calloc(2*FilterOrder+1, sizeof(double));
if(NumCoeffs == NULL) return(NULL);
NormalizedKernel = (std::complex<double> *)calloc(2*FilterOrder+1, sizeof(std::complex<double>));
if(NormalizedKernel == NULL) return(NULL);
TCoeffs = ComputeHP(FilterOrder);
if(TCoeffs == NULL) return(NULL);
for(i = 0; i < FilterOrder; ++i)
{
NumCoeffs[2*i] = TCoeffs[i];
NumCoeffs[2*i+1] = 0.0;
}
NumCoeffs[2*FilterOrder] = TCoeffs[FilterOrder];
double cp[2];
//double Bw;
double Wn;
cp[0] = 2*2.0*tan(PI * Lcutoff/ 2.0);
cp[1] = 2*2.0*tan(PI * Ucutoff/2.0);
//Bw = cp[1] - cp[0];
//center frequency
Wn = sqrt(cp[0]*cp[1]);
Wn = 2*atan2(Wn,4);
//double kern;
const std::complex<double> result = std::complex<double>(-1,0);
for(int k = 0; k<2*FilterOrder+1; k++)
{
NormalizedKernel[k] = std::exp(-sqrt(result)*Wn*Numbers[k]);
}
double b=0;
double den=0;
for(int d = 0; d<2*FilterOrder+1; d++)
{
b+=real(NormalizedKernel[d]*NumCoeffs[d]);
den+=real(NormalizedKernel[d]*DenC[d]);
}
for(int c = 0; c<2*FilterOrder+1; c++)
{
NumCoeffs[c]=(NumCoeffs[c]*den)/b;
}
free(TCoeffs);
return NumCoeffs;
}
double *ComputeDenCoeffs(int FilterOrder, double Lcutoff, double Ucutoff)
{
int k; // loop variables
double theta; // PI * (Ucutoff - Lcutoff)/2.0
double cp; // cosine of phi
double st; // sine of theta
double ct; // cosine of theta
double s2t; // sine of 2*theta
double c2t; // cosine 0f 2*theta
double *RCoeffs; // z^-2 coefficients
double *TCoeffs; // z^-1 coefficients
double *DenomCoeffs; // dk coefficients
double PoleAngle; // pole angle
double SinPoleAngle; // sine of pole angle
double CosPoleAngle; // cosine of pole angle
double a; // workspace variables
cp = cos(PI * (Ucutoff + Lcutoff)/2.0);
theta = PI * (Ucutoff - Lcutoff)/2.0;
st = sin(theta);
ct = cos(theta);
s2t = 2.0*st*ct; // sine of 2*theta
c2t = 2.0*ct*ct - 1.0; // cosine of 2*theta
RCoeffs = (double *)calloc(2 * FilterOrder, sizeof(double));
TCoeffs = (double *)calloc(2 * FilterOrder, sizeof(double));
for(k = 0; k < FilterOrder; ++k)
{
PoleAngle = PI * (double)(2*k+1)/(double)(2*FilterOrder);
SinPoleAngle = sin(PoleAngle);
CosPoleAngle = cos(PoleAngle);
a = 1.0 + s2t*SinPoleAngle;
RCoeffs[2*k] = c2t/a;
RCoeffs[2*k+1] = s2t*CosPoleAngle/a;
TCoeffs[2*k] = -2.0*cp*(ct+st*SinPoleAngle)/a;
TCoeffs[2*k+1] = -2.0*cp*st*CosPoleAngle/a;
}
DenomCoeffs = TrinomialMultiply(FilterOrder, TCoeffs, RCoeffs);
free(TCoeffs);
free(RCoeffs);
DenomCoeffs[1] = DenomCoeffs[0];
DenomCoeffs[0] = 1.0;
for(k = 3; k <= 2*FilterOrder; ++k)
DenomCoeffs[k] = DenomCoeffs[2*k-2];
return DenomCoeffs;
}
void filter(int ord, double *a, double *b, int np, double *x, double *y)
{
int i,j;
y[0]=b[0] * x[0];
for (i=1;i<ord+1;i++)
{
y[i]=0.0;
for (j=0;j<i+1;j++)
y[i]=y[i]+b[j]*x[i-j];
for (j=0;j<i;j++)
y[i]=y[i]-a[j+1]*y[i-j-1];
}
for (i=ord+1;i<np+1;i++)
{
y[i]=0.0;
for (j=0;j<ord+1;j++)
y[i]=y[i]+b[j]*x[i-j];
for (j=0;j<ord;j++)
y[i]=y[i]-a[j+1]*y[i-j-1];
}
}
int main(int argc, char *argv[])
{
(void)argc;
(void)argv;
//Frequency bands is a vector of values - Lower Frequency Band and Higher Frequency Band
//First value is lower cutoff and second value is higher cutoff
//f1 = 0.5Gz f2=10Gz
//fs=127Gz
//Kotelnikov/2=Nyquist (127/2)
double FrequencyBands[2] = {0.5/(127.0/2.0),10.0/(127.0/2.0)};//these values are as a ratio of f/fs, where fs is sampling rate, and f is cutoff frequency
//and therefore should lie in the range [0 1]
//Filter Order
int FiltOrd = 2;//5;
//Pixel Time Series
/*int PixelTimeSeries[N];
int outputSeries[N];
*/
//Create the variables for the numerator and denominator coefficients
double *DenC = 0;
double *NumC = 0;
//Pass Numerator Coefficients and Denominator Coefficients arrays into function, will return the same
printf("\n");
//is A in matlab function and the numbers are correct
DenC = ComputeDenCoeffs(FiltOrd, FrequencyBands[0], FrequencyBands[1]);
for(int k = 0; k<2*FiltOrd+1; k++)
{
printf("DenC is: %lf\n", DenC[k]);
}
printf("\n");
NumC = ComputeNumCoeffs(FiltOrd,FrequencyBands[0],FrequencyBands[1],DenC);
for(int k = 0; k<2*FiltOrd+1; k++)
{
printf("NumC is: %lf\n", NumC[k]);
}
double y[5];
double x[5]={1,2,3,4,5};
filter(5, DenC, NumC, 5, x, y);
return 1;
}