We need to change/reimplement standard DFT implementation in GSL, which is
/*
 * Direct (O(n^2)) complex DFT.
 *
 * For each of the n output bins, accumulates exp(i*theta) * data[j] over all
 * n input points, where theta = 2*pi*sign*(bin*j mod n)/n.  Input and output
 * are accessed through the REAL/IMAG macros with the same stride.
 * Always returns 0.
 */
int
FUNCTION(gsl_dft_complex,transform) (const BASE data[],
                                     const size_t stride, const size_t n,
                                     BASE result[],
                                     const gsl_fft_direction sign)
{
  const double step = 2.0 * ((int) sign) * M_PI / (double) n;
  size_t bin;

  /* FIXME: check that input length == output length and give error */

  for (bin = 0; bin < n; bin++)
    {
      ATOMIC acc_re = 0;
      ATOMIC acc_im = 0;
      size_t phase_index = 0;   /* (bin * sample) mod n, updated incrementally */
      size_t sample = 0;

      while (sample < n)
        {
          const double angle = step * (double) phase_index;

          /* twiddle factor exp(i * angle) */
          const ATOMIC tw_re = (ATOMIC) cos (angle);
          const ATOMIC tw_im = (ATOMIC) sin (angle);

          const ATOMIC x_re = REAL(data,stride,sample);
          const ATOMIC x_im = IMAG(data,stride,sample);

          /* complex multiply-accumulate */
          acc_re += tw_re * x_re - tw_im * x_im;
          acc_im += tw_re * x_im + tw_im * x_re;

          phase_index = (phase_index + bin) % n;
          sample++;
        }

      REAL(result,stride,bin) = acc_re;
      IMAG(result,stride,bin) = acc_im;
    }

  return 0;
}
In this implementation, GSL loops over the whole input vector once for every output bin, so it always produces as many frequency bins as there are input samples. However, we need to compute the DFT for a different number of frequency bins. For instance, we have 4096 samples, but we only need to calculate the DFT for 128 different frequencies. Could you help me define or implement the required DFT behaviour? Thanks in advance.
EDIT: We do not search for first m frequencies.
Actually, is below approach correct for finding DFT result with given frequency bin number?
N = sample size
B = frequency bin size
$X[k] = \sum_{i=0}^{N-1} x[i]\, e^{-j 2\pi k i / B}, \qquad k = 0, \dots, 127$
EDIT: I might have not explained the problem for DFT elaborately, nevertheless, I am happy to provide the answer below:
/// Evaluates a DFT-style sum of a complex signal at an arbitrary list of frequencies.
///
/// @param signal          interleaved complex samples: [re0, im0, re1, im1, ...]
/// @param frequency_band  frequencies at which the sum is evaluated
/// @param result          output, interleaved [re, im] per frequency; resized to
///                        2 * frequency_band.size() when it does not already have that size
/// @param sampling_rate   factor multiplied into the phase together with the frequency
///
/// The phase used for sample n (0-based) at frequency f is
/// -2*PI*sampling_rate*f*(n + 1).
/// NOTE(review): sampling_rate multiplies the frequency, so it behaves like a
/// sample period rather than a rate, and the time index starts at 1 - confirm
/// both against the intended convention.
void compute_dft(const std::vector<double>& signal,
                 const std::vector<double>& frequency_band,
                 std::vector<double>& result,
                 const double sampling_rate)
{
    const size_t bin_count = frequency_band.size();
    const size_t out_len = bin_count << 1;
    if (result.size() != out_len) {
        result.resize(out_len, 0.0);
    }

    // note complex signal assumption: two doubles per sample
    const size_t sample_count = signal.size() >> 1;
    const double phase_scale = -2.0 * PI * sampling_rate;

    for (size_t bin = 0; bin < bin_count; ++bin) {
        const double bin_scale = phase_scale * frequency_band[bin];
        double acc_re = 0.0;
        double acc_im = 0.0;

        for (size_t s = 0; s < sample_count; ++s) {
            const double angle = bin_scale * (s + 1);
            const double tw_re = cos(angle);
            const double tw_im = sin(angle);
            const double x_re = signal[2 * s];
            const double x_im = signal[2 * s + 1];

            // complex multiply-accumulate: acc += twiddle * x
            acc_re += tw_re * x_re - tw_im * x_im;
            acc_im += tw_re * x_im + tw_im * x_re;
        }

        result[2 * bin] = acc_re;
        result[2 * bin + 1] = acc_im;
    }
}
Assuming you just want the first m output frequencies:
/*
 * Direct complex DFT restricted to the first m output bins.
 *
 * Each of the m outputs is a sum over all n input points of
 * exp(i*theta) * data[j], with theta = 2*pi*sign*(bin*j mod n)/n.
 * Input and output are accessed through REAL/IMAG with the same stride.
 * Always returns 0.
 */
int
FUNCTION(gsl_dft_complex,transform) (const BASE data[],
                                     const size_t stride,
                                     const size_t n, // input size
                                     const size_t m, // output size (m <= n)
                                     BASE result[],
                                     const gsl_fft_direction sign)
{
  const double step = 2.0 * ((int) sign) * M_PI / (double) n;
  size_t bin;

  /* FIXME: check that m <= n and give error */

  for (bin = 0; bin < m; bin++) // one pass per requested output bin
    {
      ATOMIC acc_re = 0;
      ATOMIC acc_im = 0;
      size_t phase_index = 0; // (bin * sample) mod n, updated incrementally
      size_t sample;

      for (sample = 0; sample < n; sample++) // visit every input point
        {
          const double angle = step * (double) phase_index;

          /* twiddle factor exp(i * angle) */
          const ATOMIC tw_re = (ATOMIC) cos (angle);
          const ATOMIC tw_im = (ATOMIC) sin (angle);

          const ATOMIC x_re = REAL(data,stride,sample);
          const ATOMIC x_im = IMAG(data,stride,sample);

          acc_re += tw_re * x_re - tw_im * x_im;
          acc_im += tw_re * x_im + tw_im * x_re;

          phase_index = (phase_index + bin) % n;
        }

      REAL(result,stride,bin) = acc_re;
      IMAG(result,stride,bin) = acc_im;
    }

  return 0;
}
Related
Suppose that I have a sequence x(n) which is K * N long and that only the first N elements are different from zero. I'm assuming that N << K, say, for example, N = 10 and K = 100000. I want to calculate the FFT, by FFTW, of such a sequence. This is equivalent to having a sequence of length N and having a zero padding to K * N. Since N and K may be "large", I have a significant zero padding. I'm exploring if I can save some computation time avoid explicit zero padding.
The case K = 2
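Let us begin by considering the case K = 2. In this case, since x(n) = 0 for n ≥ N, the DFT of x(n) can be written as
$$X(k) = \sum_{n=0}^{2N-1} x(n)\, e^{-j 2\pi \frac{n k}{2N}} = \sum_{n=0}^{N-1} x(n)\, e^{-j 2\pi \frac{n k}{2N}}, \qquad k = 0, \dots, 2N-1.$$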
If k is even, namely k = 2 * m, then
$$X(2m) = \sum_{n=0}^{N-1} x(n)\, e^{-j 2\pi \frac{n m}{N}}, \qquad m = 0, \dots, N-1,$$
which means that such values of the DFT can be calculated through an FFT of a sequence of length N, and not K * N.
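If k is odd, namely k = 2 * m + 1, then
$$X(2m+1) = \sum_{n=0}^{N-1} \left[ x(n)\, e^{-j \pi \frac{n}{N}} \right] e^{-j 2\pi \frac{n m}{N}}, \qquad m = 0, \dots, N-1,$$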
which means that such values of the DFT can be again calculated through an FFT of a sequence of length N, and not K * N.
So, in conclusion, I can exchange a single FFT of length 2 * N with 2 FFTs of length N.
The case of arbitrary K
In this case, we have
$$X(k) = \sum_{n=0}^{N-1} x(n)\, e^{-j 2\pi \frac{n k}{K N}}, \qquad k = 0, \dots, KN-1.$$
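On writing k = m * K + t, with m = 0, ..., N - 1 and t = 0, ..., K - 1, we have
$$X(mK + t) = \sum_{n=0}^{N-1} \left[ x(n)\, e^{-j 2\pi \frac{n t}{K N}} \right] e^{-j 2\pi \frac{n m}{N}}.$$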
So, in conclusion, I can exchange a single FFT of length K * N with K FFTs of length N. Since the FFTW has fftw_plan_many_dft, I can expect to have some gaining against the case of a single FFT.
To verify that, I have set up the following code
#include <stdio.h>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include <math.h>
#include <fstream>
#include <fftw3.h>
#include "TimingCPU.h"
#define PI_d 3.141592653589793
/*
 * Benchmark: FFT of an N-point sequence zero padded to N*K points, computed
 *   (a) directly with one FFTW transform of length N*K, and
 *   (b) as K twiddled transforms of length N followed by a reordering.
 * Prints the time of each stage and the relative RMS difference (in percent)
 * between the two spectra.
 *
 * Changes with respect to the original version:
 *   - returns int (void main is not standard C++);
 *   - allocations are checked and every buffer and plan is released;
 *   - the zero-padded buffer is filled in the generation loop, so no memcpy
 *     (whose header was never included) is needed;
 *   - the error check covers all N*K outputs instead of only the first N.
 */
int main() {
    const int N = 10;       // number of non-zero input samples
    const int K = 100000;   // zero-padding factor
    const int total = N * K;

    fftw_complex *h_x = (fftw_complex *)malloc(N * sizeof(fftw_complex));
    fftw_complex *h_xzp = (fftw_complex *)calloc(total, sizeof(fftw_complex));
    fftw_complex *h_xpruning = (fftw_complex *)malloc(total * sizeof(fftw_complex));
    fftw_complex *h_xhatpruning = (fftw_complex *)malloc(total * sizeof(fftw_complex));
    fftw_complex *h_xhatpruning_temp = (fftw_complex *)malloc(total * sizeof(fftw_complex));
    fftw_complex *h_xhat = (fftw_complex *)malloc(total * sizeof(fftw_complex));

    if (!h_x || !h_xzp || !h_xpruning || !h_xhatpruning || !h_xhatpruning_temp || !h_xhat) {
        fprintf(stderr, "Memory allocation failed\n");
        free(h_x); free(h_xzp); free(h_xpruning);
        free(h_xhatpruning); free(h_xhatpruning_temp); free(h_xhat);
        return 1;
    }

    // --- Random number generation of the data sequence; the first N entries
    //     of the zero-padded buffer receive the same values.
    srand(time(NULL));
    for (int k = 0; k < N; k++) {
        h_x[k][0] = (double)rand() / (double)RAND_MAX;
        h_x[k][1] = (double)rand() / (double)RAND_MAX;
        h_xzp[k][0] = h_x[k][0];
        h_xzp[k][1] = h_x[k][1];
    }

    fftw_plan plan_zp = fftw_plan_dft_1d(total, h_xzp, h_xhat, FFTW_FORWARD, FFTW_ESTIMATE);
    // K contiguous transforms of length N (stride 1, distance N between transforms).
    fftw_plan plan_pruning = fftw_plan_many_dft(1, &N, K, h_xpruning, NULL, 1, N, h_xhatpruning_temp, NULL, 1, N, FFTW_FORWARD, FFTW_ESTIMATE);

    TimingCPU timerCPU;

    // --- Standard approach: one transform of length N*K
    timerCPU.StartCounter();
    fftw_execute(plan_zp);
    printf("Stadard %f\n", timerCPU.GetCounter());

    // --- Step 1: batch k holds x(n) * exp(-j*2*pi*n*k/(K*N))
    timerCPU.StartCounter();
    double factor = -2. * PI_d / (K * N);
    for (int k = 0; k < K; k++) {
        double arg1 = factor * k;
        for (int n = 0; n < N; n++) {
            double arg = arg1 * n;
            double cosarg = cos(arg);
            double sinarg = sin(arg);
            h_xpruning[k * N + n][0] = h_x[n][0] * cosarg - h_x[n][1] * sinarg;
            h_xpruning[k * N + n][1] = h_x[n][0] * sinarg + h_x[n][1] * cosarg;
        }
    }
    printf("Optimized first step %f\n", timerCPU.GetCounter());

    // --- Step 2: the K short transforms
    timerCPU.StartCounter();
    fftw_execute(plan_pruning);
    printf("Optimized second step %f\n", timerCPU.GetCounter());

    // --- Step 3: output p of batch k is sample p*K + k of the full spectrum
    timerCPU.StartCounter();
    for (int k = 0; k < K; k++) {
        for (int p = 0; p < N; p++) {
            h_xhatpruning[p * K + k][0] = h_xhatpruning_temp[p + k * N][0];
            h_xhatpruning[p * K + k][1] = h_xhatpruning_temp[p + k * N][1];
        }
    }
    printf("Optimized third step %f\n", timerCPU.GetCounter());

    // --- Relative RMS error over the whole spectrum
    double rmserror = 0., norm = 0.;
    for (int n = 0; n < total; n++) {
        const double dre = h_xhatpruning[n][0] - h_xhat[n][0];
        const double dim = h_xhatpruning[n][1] - h_xhat[n][1];
        rmserror = rmserror + dre * dre + dim * dim;
        norm = norm + h_xhat[n][0] * h_xhat[n][0] + h_xhat[n][1] * h_xhat[n][1];
    }
    printf("rmserror %f\n", 100. * sqrt(rmserror / norm));

    fftw_destroy_plan(plan_zp);
    fftw_destroy_plan(plan_pruning);
    free(h_x); free(h_xzp); free(h_xpruning);
    free(h_xhatpruning); free(h_xhatpruning_temp); free(h_xhat);
    return 0;
}
The approach I have developed consists of three steps:
Multiplying the input sequence by "twiddle" complex exponentials;
Performing the fftw_many;
Reorganizing the results.
The fftw_many is faster than the single FFTW on K * N input points. However, steps #1 and #3 completely destroy such a gain. I would expect that steps #1 and #3 be computationally much lighter than step #2.
My questions are:
How is it possible that steps #1 and #3 are so much more computationally demanding than step #2?
How can I improve steps #1 and #3 to have a net gain against the "standard" approach?
Thank you very much for any hint.
EDIT
I'm working with Visual Studio 2013 and compiling in Release mode.
Several options to run faster:
Run multi-threaded if you're only running single-threaded and have multiple cores available.
Create and save an FFTW wisdom file, especially if the FFT dimensions are known in advance. Use FFTW_EXHAUSTIVE, and reload the FFTW wisdom instead of recalculating it every time. This is also important if you want your results to be consistent. Since FFTW may compute FFTs differently with different calculated wisdom, and the wisdom results aren't necessarily going to always be the same, different runs of your process may produce different results when both are given identical input data.
If you're on x86, run 64-bit. The FFTW algorithm is extremely register-intensive, and an x86 CPU running in 64-bit mode has a lot more general-purpose registers available than it does when running in 32-bit mode.
Since the FFTW algorithm is so register intensive, I've had good success improving FFTW performance by compiling FFTW with compiler options that prevent the use of prefetching and prevent the implicit inlining of functions.
For the third step you might want to try switching the order of the loops:
// Reordering with the loops swapped: the inner loop walks the destination
// contiguously (dst advances by 1) while the source is read with stride N.
for (int p = 0; p < N; p++) {
    for (int k = 0; k < K; k++) {
        const int dst = p * K + k;
        const int src = p + k * N;
        h_xhatpruning[dst][0] = h_xhatpruning_temp[src][0];
        h_xhatpruning[dst][1] = h_xhatpruning_temp[src][1];
    }
}
since it's generally more beneficial to have the store addresses be contiguous than the load addresses.
Either way you have a cache-unfriendly access pattern though. You could try working with blocks to improve this, e.g. assuming N is a multiple of 4:
// Blocked reordering: handles 4 consecutive values of p per k, so 4 adjacent
// source elements are consumed together. Requires N to be a multiple of 4.
for (int p = 0; p < N; p += 4) {
    for (int k = 0; k < K; k++) {
        for (int p0 = 0; p0 < 4; p0++) {
            const int q = p + p0;
            const int dst = q * K + k;
            const int src = q + k * N;
            h_xhatpruning[dst][0] = h_xhatpruning_temp[src][0];
            h_xhatpruning[dst][1] = h_xhatpruning_temp[src][1];
        }
    }
}
This should help to reduce the churn of cache lines somewhat. If it does then maybe also experiment with block sizes other than 4 to see if there is a "sweet spot".
Following Paul R's comments as well, I have improved my code. Now, the alternative approach is faster than the standard (zero-padded) one. Below is the full C++ program. For steps #1 and #3, I have left the other solutions I tried commented out; they turned out to be slower than, or only as fast as, the uncommented one. I have favoured non-nested for loops, also with a view to a simpler future CUDA parallelization. I'm not yet using multi-threading for FFTW.
#include <stdio.h>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include <math.h>
#include <fstream>
#include <omp.h>
#include <fftw3.h>
#include "TimingCPU.h"
#define PI_d 3.141592653589793
/******************/
/* STEP #1 ON CPU */
/******************/
/**
 * Step #1: builds the K twiddled copies of the N-point input.
 *
 * Entry (k * N + n) of h_xpruning receives h_x[n] * exp(j * factor * k * n)
 * with factor = -2*pi/(K*N), for k in [0, K) and n in [0, N).
 *
 * @param h_xpruning  output, K*N complex values
 * @param h_x         input, N complex values
 * @param N           length of the input sequence
 * @param K           number of twiddled copies
 *
 * Other variants that were tried (nested k/n loops in either order, with and
 * without nested OpenMP parallel regions) were slower than, or only as fast
 * as, this single flattened loop.
 */
void step1CPU(fftw_complex * __restrict h_xpruning, const fftw_complex * __restrict h_x, const int N, const int K) {

    const double factor = -2. * PI_d / (K * N);

#pragma omp parallel for num_threads(8)
    for (int idx = 0; idx < K * N; idx++) {
        // Recover the (copy, sample) pair from the flat index.
        const int copy = idx / N;
        const int sample = idx % N;

        const double angle = factor * copy * sample;
        const double c = cos(angle);
        const double s = sin(angle);

        const double re = h_x[sample][0];
        const double im = h_x[sample][1];

        h_xpruning[idx][0] = re * c - im * s;
        h_xpruning[idx][1] = re * s + im * c;
    }
}
/******************/
/* STEP #3 ON CPU */
/******************/
/**
 * Step #3: reorders the output of the K batched N-point transforms into the
 * natural ordering of the K*N-point spectrum.
 *
 * Output p of batch k (stored at h_xhatpruning_temp[p + k * N]) is sample
 * p * K + k of the full spectrum, for p in [0, N) and k in [0, K).
 *
 * @param h_xhatpruning       output, K*N complex values in natural order
 * @param h_xhatpruning_temp  input, K batches of N complex values
 * @param N                   length of each batch
 * @param K                   number of batches
 *
 * Fix: the flat index p enumerates the source array, whose batches have
 * length N, so the batch index is p / N. The previous p / K only produced
 * batch indices 0..N-1 and left most of the output unwritten.
 *
 * Other variants that were tried (nested p/k loops in either order, nested
 * OpenMP regions, blocking by 2) were not faster than this flattened loop.
 */
void step3CPU(fftw_complex * __restrict h_xhatpruning, const fftw_complex * __restrict h_xhatpruning_temp, const int N, const int K) {

#pragma omp parallel for num_threads(8)
    for (int p = 0; p < K * N; p++) {
        const int col = p % N;   // output index within the batch
        const int row = p / N;   // batch index, 0..K-1
        // col + row * N == p, so the source is read contiguously.
        h_xhatpruning[col * K + row][0] = h_xhatpruning_temp[p][0];
        h_xhatpruning[col * K + row][1] = h_xhatpruning_temp[p][1];
    }
}
/********/
/* MAIN */
/********/
/*
 * Benchmark driver: compares one FFTW transform of the zero-padded sequence
 * (length N*K) against the three-step approach (twiddle, K batched N-point
 * transforms, reorder) and prints per-stage timings and the relative RMS
 * difference (in percent) between the two spectra.
 *
 * Changes with respect to the original version:
 *   - returns int (void main is not standard C++);
 *   - allocations are checked and every buffer and plan is released;
 *   - the zero-padded buffer is filled in the generation loop, so no memcpy
 *     (whose header was never included) is needed;
 *   - the second-step line prints the measured partial time instead of
 *     re-reading the timer;
 *   - the error check covers all N*K outputs, not only the first N;
 *   - unused GPU timing variables are removed.
 */
int main() {

    int N = 10;
    int K = 100000;
    const int total = N * K;

    // --- CPU memory allocations
    fftw_complex *h_x = (fftw_complex *)malloc(N * sizeof(fftw_complex));
    fftw_complex *h_xzp = (fftw_complex *)calloc(total, sizeof(fftw_complex));
    fftw_complex *h_xpruning = (fftw_complex *)malloc(total * sizeof(fftw_complex));
    fftw_complex *h_xhatpruning = (fftw_complex *)malloc(total * sizeof(fftw_complex));
    fftw_complex *h_xhatpruning_temp = (fftw_complex *)malloc(total * sizeof(fftw_complex));
    fftw_complex *h_xhat = (fftw_complex *)malloc(total * sizeof(fftw_complex));

    if (!h_x || !h_xzp || !h_xpruning || !h_xhatpruning || !h_xhatpruning_temp || !h_xhat) {
        fprintf(stderr, "Memory allocation failed\n");
        free(h_x); free(h_xzp); free(h_xpruning);
        free(h_xhatpruning); free(h_xhatpruning_temp); free(h_xhat);
        return 1;
    }

    // --- Random number generation of the data sequence on the CPU; the first
    //     N entries of the zero-padded buffer receive the same values.
    srand(time(NULL));
    for (int k = 0; k < N; k++) {
        h_x[k][0] = (double)rand() / (double)RAND_MAX;
        h_x[k][1] = (double)rand() / (double)RAND_MAX;
        h_xzp[k][0] = h_x[k][0];
        h_xzp[k][1] = h_x[k][1];
    }

    // --- FFTW plans
    fftw_plan h_plan_zp = fftw_plan_dft_1d(total, h_xzp, h_xhat, FFTW_FORWARD, FFTW_ESTIMATE);
    fftw_plan h_plan_pruning = fftw_plan_many_dft(1, &N, K, h_xpruning, NULL, 1, N, h_xhatpruning_temp, NULL, 1, N, FFTW_FORWARD, FFTW_ESTIMATE);

    double totalTimeCPU = 0.;
    double partialTimeCPU;

    /****************************/
    /* STANDARD APPROACH ON CPU */
    /****************************/
    printf("Number of processors available = %i\n", omp_get_num_procs());
    printf("Number of threads              = %i\n", omp_get_max_threads());

    TimingCPU timerCPU;
    timerCPU.StartCounter();
    fftw_execute(h_plan_zp);
    printf("\nStadard on CPU: \t \t %f\n", timerCPU.GetCounter());

    /******************/
    /* STEP #1 ON CPU */
    /******************/
    timerCPU.StartCounter();
    step1CPU(h_xpruning, h_x, N, K);
    partialTimeCPU = timerCPU.GetCounter();
    totalTimeCPU = totalTimeCPU + partialTimeCPU;
    printf("\nOptimized first step CPU: \t %f\n", partialTimeCPU);

    /******************/
    /* STEP #2 ON CPU */
    /******************/
    timerCPU.StartCounter();
    fftw_execute(h_plan_pruning);
    partialTimeCPU = timerCPU.GetCounter();
    totalTimeCPU = totalTimeCPU + partialTimeCPU;
    printf("Optimized second step CPU: \t %f\n", partialTimeCPU);

    /******************/
    /* STEP #3 ON CPU */
    /******************/
    timerCPU.StartCounter();
    step3CPU(h_xhatpruning, h_xhatpruning_temp, N, K);
    partialTimeCPU = timerCPU.GetCounter();
    totalTimeCPU = totalTimeCPU + partialTimeCPU;
    printf("Optimized third step CPU: \t %f\n", partialTimeCPU);

    printf("Total time CPU: \t \t %f\n", totalTimeCPU);

    // --- Relative RMS error over the whole spectrum
    double rmserror = 0., norm = 0.;
    for (int n = 0; n < total; n++) {
        const double dre = h_xhatpruning[n][0] - h_xhat[n][0];
        const double dim = h_xhatpruning[n][1] - h_xhat[n][1];
        rmserror = rmserror + dre * dre + dim * dim;
        norm = norm + h_xhat[n][0] * h_xhat[n][0] + h_xhat[n][1] * h_xhat[n][1];
    }
    printf("\nrmserror %f\n", 100. * sqrt(rmserror / norm));

    fftw_destroy_plan(h_plan_zp);
    fftw_destroy_plan(h_plan_pruning);
    free(h_x); free(h_xzp); free(h_xpruning);
    free(h_xhatpruning); free(h_xhatpruning_temp); free(h_xhat);
    return 0;
}
For the case
N = 10
K = 100000
my timing is the following
Stadard on CPU: 23.895417
Optimized first step CPU: 4.472087
Optimized second step CPU: 4.926603
Optimized third step CPU: 2.394958
Total time CPU: 11.793648
Here's a code snippet that I have from a larger program
const double A = 1;
const int N = 224;  // capacity of the position tables (default lattice: 16 x 14)

// Backing storage for the lattice coordinates. The pointer tables below are
// pointed at these slots by initialize() when an entry is still null, so
// writing through *pos_x_h[n] no longer dereferences an unset pointer.
static double pos_x_storage[N];
static double pos_y_storage[N];

// Per-site coordinate pointers; kept as pointer arrays so code that reads
// *pos_x_h[n] / *pos_y_h[n] keeps working unchanged.
double *pos_x_h[N];
double *pos_y_h[N];
double d_0;
double alpha;

/**
 * Fills the position tables with a rows x columns lattice in which odd rows
 * are shifted by half a column spacing, and sets d_0 and alpha from nu.
 *
 * @param nu       exponent parameter: d_0 = d * (1 - 2^(nu - 8)), d = 1/columns
 * @param rows     number of lattice rows
 * @param columns  number of lattice columns (must be positive)
 *
 * If the arguments are invalid, or rows * columns does not fit in the N
 * table slots, the function returns without modifying any state.
 */
void initialize(double nu, int rows = 16, int columns = 14) {
    // rows > N / columns  <=>  rows * columns > N, without risking overflow.
    if (rows < 0 || columns <= 0 || rows > N / columns) {
        return;
    }

    const double d = 1 / double(columns);
    d_0 = d * (1 - pow(2.0, nu - 8));
    alpha = d - d_0;

    const double dx = d;
    const double dy = d * sqrt(3.0) / 2;

    for (int j = 0; j < rows; j++) {
        for (int i = 0; i < columns; i++) {
            const int n = i + j * columns;
            // Attach storage on first use; pointers set by other code are kept.
            if (pos_x_h[n] == nullptr) pos_x_h[n] = &pos_x_storage[n];
            if (pos_y_h[n] == nullptr) pos_y_h[n] = &pos_y_storage[n];
            *pos_x_h[n] = i * dx + (j % 2) * dx / 2.0;
            *pos_y_h[n] = j * dy;
        }
    }
}
// Entry point: builds the default 16 x 14 lattice for nu = 7.5.
// rows and columns were previously declared but never passed on; they are now
// forwarded explicitly (same values as initialize()'s defaults).
int main(int argc, char *argv[]) {
    (void)argc;  // command-line arguments are not used
    (void)argv;

    const double nu = 7.5;
    const int rows = 16;
    const int columns = 14;

    initialize(nu, rows, columns);
    return 0;
}
The code compiles, but it gives a segmentation fault. I can't see why that's the case. Am I going past the end of the array?
There doesn't seem to be any point in utilizing pos_x_h and pos_y_h as pointer arrays.
Change this:
double *pos_x_h[224];
double *pos_y_h[224];
To this:
double pos_x_h[224];
double pos_y_h[224];
And this:
*pos_x_h[n] = i * dx + (j % 2) * dx / 2.0;
*pos_y_h[n] = j * dy;
To this:
pos_x_h[n] = i * dx + (j % 2) * dx / 2.0;
pos_y_h[n] = j * dy;
If you really insist on utilizing pointer arrays, then you can use this (in addition to the above):
double *pos_x_h_ptr[224];
double *pos_y_h_ptr[224];
for (int n=0; n<224; n++)
{
pos_x_h_ptr[n] = pos_x_h+n;
pos_y_h_ptr[n] = pos_y_h+n;
}
double *pos_x_h[224];
double *pos_y_h[224];
are arrays of pointers, but you use them without allocating memory
*pos_x_h[n] = i * dx + (j % 2) * dx / 2.0;
*pos_y_h[n] = j * dy;
probably something like that
pos_x_h[n] = malloc(sizeof(double));
*pos_x_h[n] = i * dx + (j % 2) * dx / 2.0;
pos_y_h[n] = malloc(sizeof(double));
*pos_y_h[n] = j * dy;
If you need to allocate the memory outside the initialize function (why would you? it is the init function), you can do it in main:
// Give every table entry its own double. malloc returns void*, which needs an
// explicit cast in C++.
for (int i = 0; i < 224; ++i)
{
    pos_x_h[i] = static_cast<double *>(malloc(sizeof(double)));
    pos_y_h[i] = static_cast<double *>(malloc(sizeof(double)));
    if (pos_x_h[i] == NULL || pos_y_h[i] == NULL)
    {
        // Out of memory: stop here instead of dereferencing a null entry later.
        return 1;
    }
}
I have a 3007 x 1644 dimensional matrix of terms and documents. I am trying to assign weights to frequency of terms in each document so I'm using this log entropy formula http://en.wikipedia.org/wiki/Latent_semantic_indexing#Term_Document_Matrix (See entropy formula in the last row).
I'm successfully doing this but my code is running for >7 minutes.
Here's the code:
int N = mat.cols();
for(int i=1;i<=mat.rows();i++){
double gfi = sum(mat(i,colon()))(1,1); //sum of occurrence of terms
double g =0;
if(gfi != 0){// to avoid divide by zero error
for(int j = 1;j<=N;j++){
double tfij = mat(i,j);
double pij = gfi==0?0.0:tfij/gfi;
pij = pij + 1; //avoid log0
double G = (pij * log(pij))/log(N);
g = g + G;
}
}
double gi = 1 - g;
for(int j=1;j<=N;j++){
double tfij = mat(i,j) + 1;//avoid log0
double aij = gi * log(tfij);
mat(i,j) = aij;
}
}
Anyone have ideas how I can optimize this to make it faster? Oh and mat is a RealSparseMatrix from amlpp matrix library.
UPDATE
Code runs on Linux mint with 4gb RAM and AMD Athlon II dual core
Running time before change: > 7mins
After #Kereks answer: 4.1sec
Here's a very naive rewrite that removes some redundancies:
int const N = mat.cols();
double const logN = log(N);
for (int i = 1; i <= mat.rows(); ++i)
{
double const gfi = sum(mat(i, colon()))(1, 1); // sum of occurrence of terms
double g = 0;
if (gfi != 0)
{
for (int j = 1; j <= N; ++j)
{
double const pij = mat(i, j) / gfi + 1;
g += pij * log(pij);
}
g /= logN;
}
for (int j = 1; j <= N; ++j)
{
mat(i,j) = (1 - g) * log(mat(i, j) + 1);
}
}
Also make sure that the matrix data structure is sane (e.g. a flat array accessed in strides; not a bunch of dynamically allocated rows).
Also, I think the first + 1 is a bit silly. You know that x -> x * log(x) is continuous at zero with limit zero, so you should write:
// x * log(x) tends to 0 as x -> 0, so zero entries simply contribute nothing.
// Fix: the accumulated term is the product pij * log(pij), not the sum.
double const pij = mat(i, j) / gfi;
if (pij != 0) { g += pij * log(pij); }
In fact, you might even write the first inner for loop like this, avoiding a division when it isn't needed:
// Inner loop that only divides (and takes a logarithm) for non-zero entries.
for (int j = 1; j <= N; ++j)
{
    double pij = mat(i, j);
    if (pij == 0)
    {
        continue; // zero entries contribute nothing
    }
    pij /= gfi;
    g += pij * log(pij);
}
I have written a global version of Particle Swarm Optimization algorithm in C++.
I tried to write it exactly the same as the MATLAB PSO code that I had written before, but this code generates different and much worse answers.
The MATLAB code is:
% Global-best Particle Swarm Optimization minimising testfunc1.
%
% Fixes with respect to the original script:
%   - pbestfits starts at +Inf. It used to start at 0, and because the
%     fitness (a sum of squares) is never negative, the personal bests were
%     never updated and pbests stayed at the all-zero matrix.
%   - The bounds are row vectors replicated to the population size, so they
%     conform with the numofparticles x numofdims random matrix.
%   - Array sizes and the iteration count use the named constants.
clear all;
numofdims = 30;
numofparticles = 50;
c1 = 2;
c2 = 2;
numofiterations = 1000;

V = zeros(numofparticles, numofdims);

% Position and velocity bounds, one entry per dimension (1 x numofdims).
Xmax = ones(1, numofdims) * 100;
Xmin = -Xmax;
Vmax = 0.2 * (Xmax - Xmin);
Vmin = -Vmax;

pbestfits = inf(numofparticles, 1);
pbests = zeros(numofparticles, numofdims);
worsts = zeros(numofiterations, 1);
bests = zeros(numofiterations, 1);
meanfits = zeros(numofiterations, 1);

% Uniform random initial population inside [Xmin, Xmax].
initialpop = repmat(Xmin, numofparticles, 1) + ...
    repmat(Xmax - Xmin, numofparticles, 1) .* rand(numofparticles, numofdims);
X = initialpop;
fitnesses = testfunc1(X);
[minfit, minfitidx] = min(fitnesses);
gbestfit = minfit;
gbest = X(minfitidx, :);

for t = 1:numofiterations
    % Inertia weight decreases linearly with the iteration number.
    w = 0.9 - 0.7 * (t / numofiterations);

    % Update personal bests.
    for i = 1:numofparticles
        if(fitnesses(i) < pbestfits(i))
            pbestfits(i) = fitnesses(i);
            pbests(i, :) = X(i, :);
        end
    end

    % Velocity and position update, both clamped to their bounds.
    for i = 1:numofparticles
        for j = 1:numofdims
            V(i, j) = min(max((w * V(i, j) + rand * c1 * (pbests(i, j) - X(i, j))...
                + rand * c2 * (gbest(j) - X(i, j))), Vmin(j)), Vmax(j));
            X(i, j) = min(max((X(i, j) + V(i, j)), Xmin(j)), Xmax(j));
        end
    end

    fitnesses = testfunc1(X);
    [minfit, minfitidx] = min(fitnesses);
    if(minfit < gbestfit)
        gbestfit = minfit;
        gbest = X(minfitidx, :);
    end

    % Per-iteration statistics.
    worsts(t) = max(fitnesses);
    bests(t) = gbestfit;
    meanfits(t) = mean(fitnesses);
end
In which, testfunc1 is:
function [fitness] = testfunc1(R)
% TESTFUNC1  Row-wise sum of squares.
%   fitness(i) is the sum of the squared entries of row i of R.
squared = R .^ 2;
fitness = sum(squared, 2);
end
The C++ code is:
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
#define rand_01 ((float)rand() / (float)RAND_MAX)
const int numofdims = 30;       // dimensions per particle
const int numofparticles = 50;  // particles in the swarm
using namespace std;

// Computes the fitness of every particle as the sum of its squared
// coordinates: fitnesses[p] = sum_d X[p][d]^2.
void fitnessfunc(float X[numofparticles][numofdims], float fitnesses[numofparticles])
{
    // Start every accumulator from zero.
    memset(fitnesses, 0, sizeof (float) * numofparticles);

    for(int p = 0; p < numofparticles; ++p)
    {
        const float *coords = X[p];
        int d = 0;
        while(d < numofdims)
        {
            fitnesses[p] += (pow(coords[d], 2));
            ++d;
        }
    }
}
// Returns the arithmetic mean of the first vallength entries of inputval,
// or 0 when vallength is not positive.
//
// Fix: the sum used to be accumulated in an int (dropping the fractional part
// of every partial sum) and divided with integer division.
float mean(float inputval[], int vallength)
{
    if(vallength <= 0)
    {
        return 0.0f;
    }

    // Accumulate in double so rounding error stays small for long inputs.
    double total = 0.0;
    for(int i = 0; i < vallength; i++)
    {
        total += inputval[i];
    }
    return static_cast<float>(total / vallength);
}
// Global-best Particle Swarm Optimization minimising fitnessfunc.
//
// numofiterations  number of iterations; worsts, meanfits and bests must each
//                  hold at least this many elements
// c1, c2           cognitive and social acceleration coefficients
// Xmin, Xmax       per-dimension position bounds
// initialpop       initial particle positions
// worsts, meanfits, bests
//                  per-iteration worst fitness, mean fitness and best-so-far
// gbestfit, gbest  outputs: best fitness found and the position achieving it
//
// Fixes with respect to the original version:
//   - the inertia weight used integer division (t / numofiterations is 0 for
//     every t < numofiterations), so w never decreased from 0.9;
//   - pbestfits and pbests were read before ever being written; they are now
//     seeded from the initial population;
//   - the loop runs numofiterations times instead of a hard-coded 1000.
void PSO(int numofiterations, float c1, float c2,
         float Xmin[numofdims], float Xmax[numofdims], float initialpop[numofparticles][numofdims],
         float worsts[], float meanfits[], float bests[], float *gbestfit, float gbest[numofdims])
{
    float V[numofparticles][numofdims] = {0};
    float X[numofparticles][numofdims];
    float Vmax[numofdims];
    float Vmin[numofdims];
    float pbests[numofparticles][numofdims];
    float pbestfits[numofparticles];
    float fitnesses[numofparticles];

    memcpy(X, initialpop, sizeof(X));
    fitnessfunc(X, fitnesses);

    // Every particle's personal best starts as its initial position.
    memcpy(pbests, X, sizeof(pbests));
    memcpy(pbestfits, fitnesses, sizeof(pbestfits));

    // Global best of the initial population (single scan).
    const float *best = std::min_element(fitnesses, fitnesses + numofparticles);
    *gbestfit = *best;
    memcpy(gbest, X[best - fitnesses], sizeof(float) * numofdims);

    // Velocity limits: 20% of the position range in each dimension.
    for(int i = 0; i < numofdims; i++)
    {
        Vmax[i] = 0.2 * (Xmax[i] - Xmin[i]);
        Vmin[i] = -Vmax[i];
    }

    for(int t = 0; t < numofiterations; t++)
    {
        // Linearly decreasing inertia weight. (t + 1) mirrors the 1-based
        // iteration counter of the MATLAB reference implementation.
        const float w = static_cast<float>(0.9 - 0.7 * (t + 1) / numofiterations);

        // Update personal bests.
        for(int i = 0; i < numofparticles; i++)
        {
            if(fitnesses[i] < pbestfits[i])
            {
                pbestfits[i] = fitnesses[i];
                memcpy(pbests[i], X[i], sizeof(float) * numofdims);
            }
        }

        // Velocity and position update, both clamped to their bounds.
        for(int i = 0; i < numofparticles; i++)
        {
            for(int j = 0; j < numofdims; j++)
            {
                const float v = w * V[i][j] + rand_01 * c1 * (pbests[i][j] - X[i][j])
                                + rand_01 * c2 * (gbest[j] - X[i][j]);
                V[i][j] = std::min(std::max(v, Vmin[j]), Vmax[j]);
                X[i][j] = std::min(std::max(X[i][j] + V[i][j], Xmin[j]), Xmax[j]);
            }
        }

        fitnessfunc(X, fitnesses);
        best = std::min_element(fitnesses, fitnesses + numofparticles);
        if(*best < *gbestfit)
        {
            *gbestfit = *best;
            memcpy(gbest, X[best - fitnesses], sizeof(float) * numofdims);
        }

        // Per-iteration statistics.
        worsts[t] = *std::max_element(fitnesses, fitnesses + numofparticles);
        bests[t] = *gbestfit;
        meanfits[t] = mean(fitnesses, numofparticles);
    }
}
int main()
{
time_t t;
srand((unsigned) time(&t));
float xmin[30], xmax[30];
float initpop[50][30];
float worsts[1000], bests[1000];
float meanfits[1000];
float gbestfit;
float gbest[30];
for(int i = 0; i < 30; i++)
{
xmax[i] = 100;
xmin[i] = -100;
}
for(int i = 0; i < 50; i++)
for(int j = 0; j < 30; j++)
{
initpop[i][j] = rand() % (100 + 100 + 1) - 100;
}
PSO(1000, 2, 2, xmin, xmax, initpop, worsts, meanfits, bests, &gbestfit, gbest);
cout<<"fitness: "<<gbestfit<<endl;
return 0;
}
I have debugged two codes many times but can not find the difference which makes answers different.
It is making me crazy!
May you help me please?
Update:
Please consider that, the function mean is just used for reporting some information and is not used in the optimization procedure.
You've got integer division in the following line
w = 0.9 - 0.7 * (float) (t / numofiterations);
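Because t / numofiterations is evaluated in integer arithmetic, it is 0 for every t < numofiterations, so w will be 0.9 for every iteration. Change it to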
w = 0.9 - 0.7 * t / numofiterations;
The multiplication 0.7 * t is evaluated first and promotes t to a double; the division then promotes numofiterations to a double as well.
In the original line, the parentheses force the division to be done first, and since two integers are involved in that division, nothing is promoted and the result is truncated.
This could be a mistake in function mean:
return (float)(addvalue / vallength);
This is integer division, so the result is truncated down, then cast to float. It is unlikely this is what you want.
I am implementing an image analysis algorithm using openCV and c++, but I found out openCV doesnt have any function for Butterworth Bandpass filter officially.
in my project I have to pass a time series of pixels into the Butterworth 5 order filter and the function will return the filtered time series pixels. Butterworth(pixelseries,order, frequency), if you have any idea to help me of how to start please let me know. Thank you
EDIT :
after getting help, finally I come up with the following code. which can calculate the Numerator Coefficients and Denominator Coefficients, but the problem is that some of the numbers is not as same as matlab results. here is my code:
#include <iostream>
#include <stdio.h>
#include <vector>
#include <math.h>
using namespace std;
#define N 10 //The number of images which construct a time series for each pixel
#define PI 3.14159265358979323846 // full double precision; 3.14159 is off by about 2.7e-6
// Returns the FilterOrder+1 binomial coefficients C(FilterOrder, k),
// k = 0..FilterOrder, in a calloc'ed array that the caller must free().
// Returns NULL if FilterOrder is negative or the allocation fails.
//
// Fix: for FilterOrder == 0 the original wrote NumCoeffs[1] and
// NumCoeffs[-1], both outside the one-element allocation.
double *ComputeLP( int FilterOrder )
{
    if( FilterOrder < 0 ) return( NULL );

    double *NumCoeffs = (double *)calloc( FilterOrder+1, sizeof(double) );
    if( NumCoeffs == NULL ) return( NULL );

    NumCoeffs[0] = 1;
    NumCoeffs[FilterOrder] = 1;

    // Fill the first half with C(n,i) = C(n,i-1) * (n-i+1) / i and mirror it
    // into the second half (the coefficients are symmetric).
    const int m = FilterOrder/2;
    for( int i = 1; i <= m; ++i )
    {
        NumCoeffs[i] = (double) (FilterOrder-i+1)*NumCoeffs[i-1]/i;
        NumCoeffs[FilterOrder-i] = NumCoeffs[i];
    }

    return NumCoeffs;
}
// Returns the coefficients produced by ComputeLP with the sign of every
// odd-indexed entry flipped. The array comes from ComputeLP and is handed to
// the caller unchanged in ownership; NULL is passed through on failure.
double *ComputeHP( int FilterOrder )
{
    double *coeffs = ComputeLP(FilterOrder);
    if( coeffs == NULL ) return( NULL );

    // Visit only the odd indices 1, 3, 5, ... up to FilterOrder.
    for( int odd = 1; odd <= FilterOrder; odd += 2 )
        coeffs[odd] = -coeffs[odd];

    return coeffs;
}
// Combines FilterOrder pairs of complex coefficients (b[k], c[k]) into a
// single coefficient array using an in-place recurrence.
//
// b and c are read at indices 2*i and 2*i+1 for i in [0, FilterOrder); the
// arithmetic below treats each such pair as the (real, imaginary) parts of
// one complex number. The result is a calloc'ed array of 4*FilterOrder
// doubles, also laid out as (real, imaginary) pairs, or NULL if the
// allocation fails.
// NOTE(review): presumably this accumulates the product of the trinomials
// (z^2 + b[k]*z + c[k]); confirm against the reference implementation.
// NOTE(review): RetVal[0..3] are written unconditionally, so FilterOrder is
// assumed to be at least 1.
double *TrinomialMultiply( int FilterOrder, double *b, double *c )
{
int i, j;
double *RetVal;
RetVal = (double *)calloc( 4 * FilterOrder, sizeof(double) );
if( RetVal == NULL ) return( NULL );
// Seed with the first pair: slot 0 <- b[0], slot 1 <- c[0].
RetVal[2] = c[0];
RetVal[3] = c[1];
RetVal[0] = b[0];
RetVal[1] = b[1];
// Fold in pair i. Slots are updated from the highest index downwards so
// each update still reads the previous stage's values of the lower slots.
for( i = 1; i < FilterOrder; ++i )
{
// Highest slot: complex product c[i] * slot(2*i-1).
RetVal[2*(2*i+1)] += c[2*i] * RetVal[2*(2*i-1)] - c[2*i+1] * RetVal[2*(2*i-1)+1];
RetVal[2*(2*i+1)+1] += c[2*i] * RetVal[2*(2*i-1)+1] + c[2*i+1] * RetVal[2*(2*i-1)];
// Middle slots: slot(j) += b[i]*slot(j-1) + c[i]*slot(j-2) (complex).
for( j = 2*i; j > 1; --j )
{
RetVal[2*j] += b[2*i] * RetVal[2*(j-1)] - b[2*i+1] * RetVal[2*(j-1)+1] +
c[2*i] * RetVal[2*(j-2)] - c[2*i+1] * RetVal[2*(j-2)+1];
RetVal[2*j+1] += b[2*i] * RetVal[2*(j-1)+1] + b[2*i+1] * RetVal[2*(j-1)] +
c[2*i] * RetVal[2*(j-2)+1] + c[2*i+1] * RetVal[2*(j-2)];
}
// Slot 1: slot(1) += b[i]*slot(0) + c[i].
RetVal[2] += b[2*i] * RetVal[0] - b[2*i+1] * RetVal[1] + c[2*i];
RetVal[3] += b[2*i] * RetVal[1] + b[2*i+1] * RetVal[0] + c[2*i+1];
// Slot 0: slot(0) += b[i].
RetVal[0] += b[2*i];
RetVal[1] += b[2*i+1];
}
return RetVal;
}
// Builds the 2*FilterOrder+1 numerator coefficients: the coefficients
// returned by ComputeHP are placed at the even indices and every odd index
// is zero. No gain normalisation is applied.
//
// Returns a calloc'ed array that the caller must free(), or NULL if
// FilterOrder is negative or an allocation fails.
//
// Fix: the output array used to be leaked when ComputeHP failed.
double *ComputeNumCoeffs(int FilterOrder)
{
    if( FilterOrder < 0 ) return( NULL );

    double *TCoeffs = ComputeHP(FilterOrder);
    if( TCoeffs == NULL ) return( NULL );

    double *NumCoeffs = (double *)calloc( 2*FilterOrder+1, sizeof(double) );
    if( NumCoeffs == NULL )
    {
        free(TCoeffs);
        return( NULL );
    }

    // Odd slots keep the 0.0 written by calloc.
    for( int i = 0; i <= FilterOrder; ++i )
        NumCoeffs[2*i] = TCoeffs[i];

    free(TCoeffs);
    return NumCoeffs;
}
// Computes the denominator coefficients for a band-pass design of the given
// order and cutoffs.
//
// FilterOrder      filter order (must be at least 1)
// Lcutoff, Ucutoff lower and upper cutoff, multiplied by PI before use
//
// Returns the array produced by TrinomialMultiply (4*FilterOrder doubles),
// with entries 0..2*FilterOrder rewritten to hold the coefficients and
// entry 0 set to 1.0. The caller must free() it. Returns NULL if FilterOrder
// is below 1 or an allocation fails.
//
// Fix: the two work arrays and the TrinomialMultiply result were used without
// checking for allocation failure.
double *ComputeDenCoeffs( int FilterOrder, double Lcutoff, double Ucutoff )
{
    if( FilterOrder < 1 ) return( NULL );

    const double cp = cos(PI * (Ucutoff + Lcutoff) / 2.0);  // cosine of phi
    const double theta = PI * (Ucutoff - Lcutoff) / 2.0;
    const double st = sin(theta);           // sine of theta
    const double ct = cos(theta);           // cosine of theta
    const double s2t = 2.0*st*ct;           // sine of 2*theta
    const double c2t = 2.0*ct*ct - 1.0;     // cosine of 2*theta

    double *RCoeffs = (double *)calloc( 2 * FilterOrder, sizeof(double) ); // z^-2 coefficients
    double *TCoeffs = (double *)calloc( 2 * FilterOrder, sizeof(double) ); // z^-1 coefficients
    if( RCoeffs == NULL || TCoeffs == NULL )
    {
        free(RCoeffs);
        free(TCoeffs);
        return( NULL );
    }

    // One complex (real, imaginary) coefficient pair per pole angle.
    for( int k = 0; k < FilterOrder; ++k )
    {
        const double PoleAngle = PI * (double)(2*k+1)/(double)(2*FilterOrder);
        const double SinPoleAngle = sin(PoleAngle);
        const double CosPoleAngle = cos(PoleAngle);
        const double a = 1.0 + s2t*SinPoleAngle;
        RCoeffs[2*k] = c2t/a;
        RCoeffs[2*k+1] = s2t*CosPoleAngle/a;
        TCoeffs[2*k] = -2.0*cp*(ct+st*SinPoleAngle)/a;
        TCoeffs[2*k+1] = -2.0*cp*st*CosPoleAngle/a;
    }

    double *DenomCoeffs = TrinomialMultiply(FilterOrder, TCoeffs, RCoeffs );
    free(TCoeffs);
    free(RCoeffs);
    if( DenomCoeffs == NULL ) return( NULL );

    // Keep only the real part of each (real, imaginary) pair: shift it up by
    // one slot and put the leading 1.0 in front.
    DenomCoeffs[1] = DenomCoeffs[0];
    DenomCoeffs[0] = 1.0;
    for( int k = 3; k <= 2*FilterOrder; ++k )
        DenomCoeffs[k] = DenomCoeffs[2*k-2];

    return DenomCoeffs;
}
// Applies the recursive filter
//   y[i] = sum_{j=0..ord} b[j]*x[i-j] - sum_{j=1..ord} a[j]*y[i-j]
// to the np samples x[0..np-1], writing y[0..np-1]. Samples before index 0
// are treated as zero. a[0] is not used, i.e. it is assumed to be 1.
//
// ord  filter order; a and b each hold ord+1 coefficients
// np   number of samples in x and y
//
// Fix: the start-up loop used to run up to index ord regardless of np, and
// the main loop ran up to index np inclusive, so both could read x and write
// y past np elements. np is now treated as the sample count and nothing
// outside [0, np) is touched.
void filter(int ord, double *a, double *b, int np, double *x, double *y)
{
    for (int i = 0; i < np; i++)
    {
        // Only min(i, ord) past samples exist at position i.
        const int taps = (i < ord) ? i : ord;
        double acc = 0.0;

        for (int j = 0; j <= taps; j++)
            acc += b[j] * x[i-j];
        for (int j = 0; j < taps; j++)
            acc -= a[j+1] * y[i-j-1];

        y[i] = acc;
    }
}
/*
 * Demo driver: prints the numerator and denominator coefficients of an order-5
 * band-pass design and runs a short signal through the resulting filter.
 * Returns 0 on success and 1 when a coefficient array could not be built.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;
    //Frequency bands is a vector of values - Lower Frequency Band and Higher Frequency Band
    //First value is lower cutoff and second value is higher cutoff
    double FrequencyBands[2] = {0.25,0.375};//same normalised cutoffs as the MATLAB call butter(5, [0.25,0.375])
    //and therefore should lie in the range [0 1]
    //Filter Order
    const int FiltOrd = 5;
    //A band-pass design of order FiltOrd has 2*FiltOrd+1 coefficients per polynomial
    const int CoeffCount = 2*FiltOrd + 1;
    //Pixel Time Series
    /*int PixelTimeSeries[N];
    int outputSeries[N];
    */
    //Create the variables for the numerator and denominator coefficients
    double *DenC = 0;
    double *NumC = 0;
    //Pass Numerator Coefficients and Denominator Coefficients arrays into function, will return the same
    NumC = ComputeNumCoeffs(FiltOrd);
    if( NumC == NULL ) return 1;
    for(int k = 0; k<CoeffCount; k++)
    {
        printf("NumC is: %lf\n", NumC[k]);
    }
    //is A in matlab function and the numbers are correct
    DenC = ComputeDenCoeffs(FiltOrd, FrequencyBands[0], FrequencyBands[1]);
    if( DenC == NULL )
    {
        free(NumC);
        return 1;
    }
    for(int k = 0; k<CoeffCount; k++)
    {
        printf("DenC is: %lf\n", DenC[k]);
    }
    //filter() takes the filter order (2*FiltOrd here) and the index of the last
    //sample, and touches indices 0..order during warm-up, so both buffers hold
    //CoeffCount samples; the input is the original 1..5 ramp padded with zeros.
    double y[CoeffCount];
    double x[CoeffCount]={1,2,3,4,5};
    filter(2*FiltOrd, DenC, NumC, CoeffCount-1, x, y);
    free(NumC);
    free(DenC);
    return 0;
}
I get these results from my code:
B= 1,0,-5,0,10,0,-10,0,5,0,-1
A= 1.000000000000000, -4.945988709743181, 13.556489496973796, -24.700711850327743,
32.994881546824828, -33.180726698160655, 25.546126213403539, -14.802008410165968,
6.285430089797051, -1.772929809750849, 0.277753012228403
But when I compute the coefficients for the same frequency band in MATLAB, I get the following results:
>> [B, A]=butter(5, [0.25,0.375])
B = 0.0002, 0, -0.0008, 0, 0.0016, 0, -0.0016, 0, 0.0008, 0, -0.0002
A = 1.0000, -4.9460, 13.5565, -24.7007, 32.9948, -33.1806, 25.5461, -14.8020, 6.2854, -1.7729, 0.2778
I have tested the code from this website: http://www.exstrom.com/journal/sigproc/, but its result is the same as mine, not MATLAB's. Does anybody know why? Or how can I get the same result as the MATLAB toolbox?
I know this is a post on an old thread, and I would usually leave this as a comment, but I'm apparently not able to do that.
In any case, for people searching for similar code, I thought I would post the link from where this code originates (it also has C code for other types of Butterworth filter coefficients and some other cool signal processing code).
The code is located here:
http://www.exstrom.com/journal/sigproc/
Additionally, I think there is a piece of code which calculates said scaling factor for you already.
/**********************************************************************
sf_bwbp - scaling factor for a butterworth bandpass filter.

n   - filter order (number of factors multiplied together)
f1f - lower cutoff, used as a fraction of pi
f2f - upper cutoff, same units as f1f

Accumulates the complex product of (cot(theta) + sin(p_k)) - i*cos(p_k)
over the n pole angles p_k = pi*(2k+1)/(2n), with
theta = pi*(f2f - f1f)/2, and returns the reciprocal of the real part of
that product. Multiplying the c coefficients by the returned value gives
a filter response whose maximum value is 1.
*/
double sf_bwbp( int n, double f1f, double f2f )
{
    const double cot_theta = 1.0 / tan(M_PI * (f2f - f1f) / 2.0);

    // Running product, kept as separate real and imaginary parts.
    double prod_re = 1.0;
    double prod_im = 0.0;

    int stage = 0;
    while( stage < n )
    {
        const double pole_angle = M_PI * (double)(2*stage+1)/(double)(2*n);
        const double u = cot_theta + sin(pole_angle);
        const double v = cos(pole_angle);

        // Complex multiply by (u - i*v) using three real multiplications.
        const double cross  = (prod_re + prod_im)*(u - v);
        const double direct = prod_re * u;
        const double mixed  = -prod_im * v;

        prod_re = direct - mixed;
        prod_im = cross - direct - mixed;
        ++stage;
    }

    return( 1.0 / prod_re );
}
I finally found it.
I just needed to port the following lines from the MATLAB source code to C++. "the_mandrill" was right: I needed to apply the normalizing constant to the coefficients:
kern = exp(-j*w*(0:length(b)-1));
b = real(b*(kern*den(:))/(kern*b(:)));
EDIT:
And here is the final edition; with it, the whole code returns numbers exactly equal to MATLAB's:
/*
 * ComputeNumCoeffs - normalised numerator (B) coefficients of the band-pass filter.
 *
 * FilterOrder : prototype order; 2*FilterOrder+1 coefficients are produced.
 * Lcutoff     : lower cutoff, used as a fraction of PI.
 * Ucutoff     : upper cutoff, same units as Lcutoff.
 * DenC        : denominator coefficients (at least 2*FilterOrder+1 entries).
 *
 * The raw numerator (ComputeHP values on the even indices, zeros on the odd ones)
 * is scaled following the MATLAB expression
 *     kern = exp(-j*w*(0:length(b)-1));
 *     b = real(b*(kern*den(:))/(kern*b(:)));
 * evaluated at the centre frequency Wn. The ratio is formed in complex arithmetic
 * and only then reduced to its real part; dividing the two real parts separately
 * breaks down (0/0) when the numerator response at Wn is purely imaginary.
 *
 * Returns a calloc'ed array the caller must free(), or NULL on invalid arguments
 * or allocation failure.
 */
double *ComputeNumCoeffs(int FilterOrder,double Lcutoff, double Ucutoff, double *DenC)
{
    if( FilterOrder < 1 || DenC == NULL ) return( NULL );

    const int CoeffCount = 2*FilterOrder+1;

    double *NumCoeffs = (double *)calloc( CoeffCount, sizeof(double) );
    if( NumCoeffs == NULL ) return( NULL );

    double *TCoeffs = ComputeHP(FilterOrder);
    if( TCoeffs == NULL )
    {
        free(NumCoeffs);
        return( NULL );
    }
    for( int i = 0; i < FilterOrder; ++i )
    {
        NumCoeffs[2*i] = TCoeffs[i];
        NumCoeffs[2*i+1] = 0.0;
    }
    NumCoeffs[2*FilterOrder] = TCoeffs[FilterOrder];
    free(TCoeffs);

    // Pre-warped band edges
    double cp[2];
    cp[0] = 2*2.0*tan(PI * Lcutoff/ 2.0);
    cp[1] = 2*2.0*tan(PI * Ucutoff / 2.0);
    //center frequency
    double Wn = sqrt(cp[0]*cp[1]);
    Wn = 2*atan2(Wn,4);

    // Evaluate numerator and denominator polynomials at exp(-j*Wn*k).
    std::complex<double> NumAtWn(0.0, 0.0);
    std::complex<double> DenAtWn(0.0, 0.0);
    for( int k = 0; k < CoeffCount; ++k )
    {
        const std::complex<double> Kernel = std::exp(std::complex<double>(0.0, -Wn*(double)k));
        NumAtWn += Kernel*NumCoeffs[k];
        DenAtWn += Kernel*DenC[k];
    }

    const double Gain = std::real(DenAtWn/NumAtWn);
    for( int k = 0; k < CoeffCount; ++k )
    {
        NumCoeffs[k] = NumCoeffs[k]*Gain;
    }
    return NumCoeffs;
}
There is code available online that implements Butterworth filters. If you use that source code and try to match MATLAB's results, you will run into the same problem. Basically, the result you got from the code hasn't been normalized; in the source code there is a variable sff in bwhp.c. If you set that to 1, the problem is easily solved.
I recommend using this source code;
the source code and its usage can be found here.
I added the final edition of the function ComputeNumCoeffs to the program and fixed the "FilterOrder" handling (k<11 became k<2*FiltOrd+1). Maybe it will save someone some time.
f1=0.5Gz, f2=10Gz, fs=127Gz/2
In MatLab
a={1.000000000000000,-3.329746259105707, 4.180522138699884,-2.365540522960743,0.514875789136976};
b={0.041065495448784, 0.000000000000000,-0.082130990897568, 0.000000000000000,0.041065495448784};
Program:
#include <iostream>
#include <stdio.h>
#include <vector>
#include <math.h>
#include <complex>
using namespace std;
#define N 10 //The number of images which construct a time series for each pixel
#define PI 3.1415926535897932384626433832795
/*
 * ComputeLP - binomial coefficients C(FilterOrder, 0..FilterOrder).
 *
 * Fills the first half with the recurrence C(n,i) = (n-i+1)*C(n,i-1)/i and
 * mirrors each value into the second half.
 *
 * Returns a calloc'ed array of FilterOrder+1 doubles that the caller must free(),
 * or NULL when FilterOrder is negative or the allocation fails.
 */
double *ComputeLP(int FilterOrder)
{
    if(FilterOrder < 0) return(NULL);

    double *NumCoeffs = (double *)calloc(FilterOrder+1, sizeof(double));
    if(NumCoeffs == NULL) return(NULL);

    NumCoeffs[0] = 1;
    // Order 0 has a single coefficient; index 1 would be out of bounds.
    if(FilterOrder == 0) return NumCoeffs;

    NumCoeffs[1] = FilterOrder;
    const int m = FilterOrder/2;
    for(int i = 2; i <= m; ++i)
    {
        NumCoeffs[i] = (double)(FilterOrder-i+1)*NumCoeffs[i-1]/i;
        NumCoeffs[FilterOrder-i] = NumCoeffs[i];
    }
    NumCoeffs[FilterOrder-1] = FilterOrder;
    NumCoeffs[FilterOrder] = 1;
    return NumCoeffs;
}
/*
 * ComputeHP - takes the coefficient array produced by ComputeLP and flips the
 * sign of every odd-indexed entry (indices 1, 3, ... up to FilterOrder).
 *
 * Returns the same heap array ComputeLP handed back (caller frees it), or NULL
 * when ComputeLP returned NULL.
 */
double *ComputeHP(int FilterOrder)
{
    double *Coeffs = ComputeLP(FilterOrder);
    if(Coeffs == NULL) return(NULL);

    // Walk the odd indices only; even-indexed terms keep their sign.
    for(int odd = 1; odd <= FilterOrder; odd += 2)
        Coeffs[odd] = -Coeffs[odd];

    return Coeffs;
}
/*
 * TrinomialMultiply - multiplies FilterOrder trinomials of the form
 *
 *     x^2 + b[k]*x + c[k],   k = 0..FilterOrder-1
 *
 * with complex coefficients. b and c each hold FilterOrder interleaved
 * (re, im) pairs, i.e. 2*FilterOrder doubles.
 *
 * The result holds the 2*FilterOrder coefficients of the product below the
 * leading x^(2*FilterOrder) term (whose coefficient 1 is not stored), highest
 * power first, again as interleaved (re, im) pairs: 4*FilterOrder doubles.
 *
 * Returns a calloc'ed array the caller must free(), or NULL when FilterOrder < 1,
 * b or c is NULL, or the allocation fails.
 */
double *TrinomialMultiply(int FilterOrder, double *b, double *c)
{
    // With no trinomial the initial writes below would land outside the buffer.
    if(FilterOrder < 1 || b == NULL || c == NULL) return(NULL);

    double *RetVal = (double *)calloc(4 * FilterOrder, sizeof(double));
    if(RetVal == NULL) return(NULL);

    // Product of the first trinomial alone: b[0]*x + c[0].
    RetVal[0] = b[0];
    RetVal[1] = b[1];
    RetVal[2] = c[0];
    RetVal[3] = c[1];

    for(int i = 1; i < FilterOrder; ++i)
    {
        const double bRe = b[2*i];
        const double bIm = b[2*i+1];
        const double cRe = c[2*i];
        const double cIm = c[2*i+1];

        // New lowest-order term: c[i] times the previous lowest-order term.
        const int top  = 2*(2*i+1);
        const int prev = 2*(2*i-1);
        RetVal[top]   += cRe * RetVal[prev]   - cIm * RetVal[prev+1];
        RetVal[top+1] += cRe * RetVal[prev+1] + cIm * RetVal[prev];

        // Update in place from the low-order end so that every entry read on the
        // right-hand side still holds its value from the previous product.
        for(int j = 2*i; j > 1; --j)
        {
            double *cur = RetVal + 2*j;
            const double *one = RetVal + 2*(j-1);
            const double *two = RetVal + 2*(j-2);
            cur[0] += bRe * one[0] - bIm * one[1] +
                      cRe * two[0] - cIm * two[1];
            cur[1] += bRe * one[1] + bIm * one[0] +
                      cRe * two[1] + cIm * two[0];
        }

        // The two highest stored terms also pick up the implicit leading 1.
        RetVal[2] += bRe * RetVal[0] - bIm * RetVal[1] + cRe;
        RetVal[3] += bRe * RetVal[1] + bIm * RetVal[0] + cIm;
        RetVal[0] += bRe;
        RetVal[1] += bIm;
    }
    return RetVal;
}
/*
 * ComputeNumCoeffs - normalised numerator (B) coefficients of the band-pass filter.
 *
 * FilterOrder : prototype order; 2*FilterOrder+1 coefficients are produced.
 * Lcutoff     : lower cutoff, used as a fraction of PI.
 * Ucutoff     : upper cutoff, same units as Lcutoff.
 * DenC        : denominator coefficients (at least 2*FilterOrder+1 entries).
 *
 * The raw numerator (ComputeHP values on the even indices, zeros on the odd ones)
 * is scaled by real(den(Wn)/num(Wn)), where both polynomials are evaluated at
 * exp(-j*Wn*k) and Wn is the centre frequency. The ratio is formed in complex
 * arithmetic and only then reduced to its real part; dividing the two real parts
 * separately breaks down (0/0) when the numerator response at Wn is purely
 * imaginary.
 *
 * No fixed-size index table is used, so any FilterOrder >= 1 is supported.
 *
 * Returns a calloc'ed array the caller must free(), or NULL on invalid arguments
 * or allocation failure.
 */
double *ComputeNumCoeffs(int FilterOrder,double Lcutoff, double Ucutoff, double *DenC)
{
    if(FilterOrder < 1 || DenC == NULL) return(NULL);

    const int CoeffCount = 2*FilterOrder+1;

    double *NumCoeffs = (double *)calloc(CoeffCount, sizeof(double));
    if(NumCoeffs == NULL) return(NULL);

    double *TCoeffs = ComputeHP(FilterOrder);
    if(TCoeffs == NULL)
    {
        free(NumCoeffs);
        return(NULL);
    }
    for(int i = 0; i < FilterOrder; ++i)
    {
        NumCoeffs[2*i] = TCoeffs[i];
        NumCoeffs[2*i+1] = 0.0;
    }
    NumCoeffs[2*FilterOrder] = TCoeffs[FilterOrder];
    free(TCoeffs);

    // Pre-warped band edges
    double cp[2];
    cp[0] = 2*2.0*tan(PI * Lcutoff/ 2.0);
    cp[1] = 2*2.0*tan(PI * Ucutoff/2.0);
    //center frequency
    double Wn = sqrt(cp[0]*cp[1]);
    Wn = 2*atan2(Wn,4);

    // Evaluate numerator and denominator polynomials at exp(-j*Wn*k).
    std::complex<double> NumAtWn(0.0, 0.0);
    std::complex<double> DenAtWn(0.0, 0.0);
    for(int k = 0; k < CoeffCount; ++k)
    {
        const std::complex<double> Kernel = std::exp(std::complex<double>(0.0, -Wn*(double)k));
        NumAtWn += Kernel*NumCoeffs[k];
        DenAtWn += Kernel*DenC[k];
    }

    const double Gain = std::real(DenAtWn/NumAtWn);
    for(int k = 0; k < CoeffCount; ++k)
    {
        NumCoeffs[k] = NumCoeffs[k]*Gain;
    }
    return NumCoeffs;
}
/*
 * ComputeDenCoeffs - denominator (A) coefficients of the band-pass filter.
 *
 * FilterOrder : prototype order; the result has 2*FilterOrder+1 meaningful entries.
 * Lcutoff     : lower cutoff, used as a fraction of PI (see theta / cp below).
 * Ucutoff     : upper cutoff, same units as Lcutoff.
 *
 * Builds one complex (re, im) pair per pole for the z^-1 and z^-2 terms, multiplies
 * the resulting trinomials with TrinomialMultiply, then compacts the real parts of
 * the product to the front of the returned buffer with a leading 1.0.
 *
 * Returns a heap buffer the caller must free(), or NULL when FilterOrder < 1 or an
 * allocation fails.
 */
double *ComputeDenCoeffs(int FilterOrder, double Lcutoff, double Ucutoff)
{
    if(FilterOrder < 1) return(NULL);

    const double cp = cos(PI * (Ucutoff + Lcutoff)/2.0);   // cosine of phi
    const double theta = PI * (Ucutoff - Lcutoff)/2.0;
    const double st = sin(theta);                          // sine of theta
    const double ct = cos(theta);                          // cosine of theta
    const double s2t = 2.0*st*ct;                          // sine of 2*theta
    const double c2t = 2.0*ct*ct - 1.0;                    // cosine of 2*theta

    double *RCoeffs = (double *)calloc(2 * FilterOrder, sizeof(double));  // z^-2 coefficients
    double *TCoeffs = (double *)calloc(2 * FilterOrder, sizeof(double));  // z^-1 coefficients
    if(RCoeffs == NULL || TCoeffs == NULL)
    {
        // free(NULL) is a no-op, so this releases whichever allocation succeeded.
        free(RCoeffs);
        free(TCoeffs);
        return(NULL);
    }

    for(int k = 0; k < FilterOrder; ++k)
    {
        const double PoleAngle = PI * (double)(2*k+1)/(double)(2*FilterOrder);
        const double SinPoleAngle = sin(PoleAngle);
        const double CosPoleAngle = cos(PoleAngle);
        const double a = 1.0 + s2t*SinPoleAngle;
        RCoeffs[2*k] = c2t/a;
        RCoeffs[2*k+1] = s2t*CosPoleAngle/a;
        TCoeffs[2*k] = -2.0*cp*(ct+st*SinPoleAngle)/a;
        TCoeffs[2*k+1] = -2.0*cp*st*CosPoleAngle/a;
    }

    double *DenomCoeffs = TrinomialMultiply(FilterOrder, TCoeffs, RCoeffs);
    free(TCoeffs);
    free(RCoeffs);
    if(DenomCoeffs == NULL) return(NULL);

    // The product is stored as interleaved (re, im) pairs; keep only the real
    // parts, shifted up by one slot to make room for the leading 1.0.
    DenomCoeffs[1] = DenomCoeffs[0];
    DenomCoeffs[0] = 1.0;
    for(int k = 3; k <= 2*FilterOrder; ++k)
        DenomCoeffs[k] = DenomCoeffs[2*k-2];

    return DenomCoeffs;
}
/*
 * filter - direct-form IIR filter: for every i in 0..np
 *
 *     y[i] = sum_{j=0..m} b[j]*x[i-j]  -  sum_{j=1..m} a[j]*y[i-j],   m = min(i, ord)
 *
 * ord : filter order; a and b must each hold ord+1 coefficients (a[0] is not read).
 * a   : denominator coefficients.
 * b   : numerator coefficients.
 * np  : index of the LAST sample to process, so x and y must hold np+1 elements.
 * x   : input samples.
 * y   : output samples (written for indices 0..np).
 *
 * The number of taps is clamped to the samples available so far, which also keeps
 * the function inside x/y when np is smaller than ord.
 */
void filter(int ord, double *a, double *b, int np, double *x, double *y)
{
    y[0] = b[0] * x[0];
    for (int i = 1; i < np + 1; i++)
    {
        // During warm-up (i < ord) only i earlier samples exist.
        const int taps = (i < ord) ? i : ord;
        y[i] = 0.0;
        for (int j = 0; j < taps + 1; j++)
            y[i] = y[i] + b[j] * x[i - j];
        for (int j = 0; j < taps; j++)
            y[i] = y[i] - a[j + 1] * y[i - j - 1];
    }
}
/*
 * Demo driver: prints the denominator and numerator coefficients of an order-2
 * band-pass design (0.5 .. 10 with a sampling rate of 127) and runs a short ramp
 * through the resulting filter.
 * Returns 0 on success and 1 when a coefficient array could not be built.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;
    //Frequency bands is a vector of values - Lower Frequency Band and Higher Frequency Band
    //First value is lower cutoff and second value is higher cutoff
    //f1 = 0.5Gz f2=10Gz
    //fs=127Gz
    //Kotelnikov/2=Nyquist (127/2)
    double FrequencyBands[2] = {0.5/(127.0/2.0),10.0/(127.0/2.0)};//cutoffs divided by the Nyquist frequency fs/2, where fs is the sampling rate
    //and therefore should lie in the range [0 1]
    //Filter Order
    const int FiltOrd = 2;//5;
    //A band-pass design of order FiltOrd has 2*FiltOrd+1 coefficients per polynomial
    const int CoeffCount = 2*FiltOrd+1;
    //Pixel Time Series
    /*int PixelTimeSeries[N];
    int outputSeries[N];
    */
    //Create the variables for the numerator and denominator coefficients
    double *DenC = 0;
    double *NumC = 0;
    //Pass Numerator Coefficients and Denominator Coefficients arrays into function, will return the same
    printf("\n");
    //is A in matlab function and the numbers are correct
    DenC = ComputeDenCoeffs(FiltOrd, FrequencyBands[0], FrequencyBands[1]);
    if(DenC == NULL) return 1;
    for(int k = 0; k<CoeffCount; k++)
    {
        printf("DenC is: %lf\n", DenC[k]);
    }
    printf("\n");
    NumC = ComputeNumCoeffs(FiltOrd,FrequencyBands[0],FrequencyBands[1],DenC);
    if(NumC == NULL)
    {
        free(DenC);
        return 1;
    }
    for(int k = 0; k<CoeffCount; k++)
    {
        printf("NumC is: %lf\n", NumC[k]);
    }
    //filter() takes the filter order (2*FiltOrd, matching the CoeffCount
    //coefficients) and the index of the LAST sample, i.e. 4 for five samples.
    //NOTE: if FiltOrd is raised, x and y must grow to at least CoeffCount samples.
    double y[5];
    double x[5]={1,2,3,4,5};
    filter(2*FiltOrd, DenC, NumC, 4, x, y);
    free(NumC);
    free(DenC);
    return 0;
}