Related
I am using C++ to do some matrix calculations using Armadillo library.
I tried to make it similar to the Matlab version.
But when I ran the code, Matlab took about 2 - 3 min while C++ took about 20 min.
I searched a bit and realized that some people also asked why C++ is slower than Matlab in matrix calculations.
But I heard that C++ is way faster than Matlab. So I was wondering whether C++ is generally not as good as Matlab at matrix calculations.
Below is just part of my entire code.
Is there any way I can speed up C++ matrix calculations?
Should I use a different library?
// Value-function iteration: repeat until the largest absolute change `dif`
// falls below `tol` or the pass counter `it` exceeds `itmax`.
while (dif >= tol && it <= itmax) {
it = it + 1;
// Keep the previous iterate so the change can be measured at the end of the pass.
V = Vnew;
// Discounted sum of the three slices of Vnew, weighted by Ptrans(0), Ptrans(1), Ptrans(2).
// NOTE(review): the same Vfuture is used for every c below, i.e. the weights do not
// depend on the current third-dimension state -- confirm this is intended.
Vfuture = beta * (Ptrans(0) * Vnew.slice(0) + Ptrans(1) * Vnew.slice(1) + Ptrans(2) * Vnew.slice(2));
// Maximisation step: for every state (a, b, c) evaluate all candidate choices at once
// as an Na x Nd matrix and keep the best value and its position.
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
// Constant matrices holding Y(c), A(a) and D(b) in every cell.
// NOTE(review): these are allocated and filled on every innermost iteration
// (Na*Nd*Ny times per pass); presumably a large share of the run time -- consider
// adding the scalars directly or hoisting the allocations.
Mat<double> YY(Na, Nd);
YY.fill(Y(c));
Mat<double> AA(Na, Nd);
AA.fill(A(a));
Mat<double> DD(Na, Nd);
DD.fill(D(b));
// Element-wise combination of the state constants with the grids mg_A_v and mg_D_v
// (presumably Na x Nd matrices of candidate choices -- declared outside this fragment).
Mat<double> CC = YY + AA - mg_A_v / R - (mg_D_v - (1 - delta) * DD);
// Candidate values: `%` is Armadillo's element-wise product and pow() is element-wise.
// NOTE(review): mg_A_v / R and pow(mg_D_v, 1 - psi) do not depend on a, b or c.
Mat<double> Val = 1 / (1 - 1 / sig) * pow(pow(CC, psi) % pow(mg_D_v, 1 - psi), (1 - 1 / sig)) + Vfuture;
double max_val = Val.max();
// index_max() yields a linear index; the two lines below decode it for a
// column-major matrix with Na rows.
uword maxindex_val = Val.index_max();
int index_column = maxindex_val / Na; // column
int index_row = maxindex_val - index_column * Na; // row
// Store the maximum and the (row, column) position of the maximiser for this state.
Vnew(a, b, c) = max_val;
maxposition_a(a, b, c) = index_row;
maxposition_d(a, b, c) = index_column;
}
}
}
// Howard improvement
// For H rounds, re-evaluate Vnew at the stored maximisers without maximising again.
for (int h = 0; h < H; h++) {
// Snapshot, so every state in this round reads values from the previous round.
Vhoward = Vnew;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
// Discounted, Ptrans-weighted value at the stored maximiser, over the three slices.
// temphoward(i, j) is overwritten for each k and only used on the next line.
temphoward(i, j) = beta * Vhoward(maxposition_a(i, j, k), maxposition_d(i, j, k), 0) * Ptrans(0) + beta * Vhoward(maxposition_a(i, j, k), maxposition_d(i, j, k), 1) * Ptrans(1) + beta * Vhoward(maxposition_a(i, j, k), maxposition_d(i, j, k), 2) * Ptrans(2);
// utility() is defined outside this fragment; presumably the scalar counterpart of
// the expression used to build Val above -- verify.
Vnew(i, j, k) = temphoward(i, j) + utility(Y(k) + A(i) - A(maxposition_a(i, j, k)) / R - D(maxposition_d(i, j, k)) + (1 - delta) * D(j), D(maxposition_d(i, j, k)), sig, psi);
}
}
}
}
// Convergence measure: largest absolute element-wise change over the whole pass.
tempdiff = abs(V - Vnew);
dif = tempdiff.max();
// Progress output: the change, then the pass number, each on its own line.
cout << dif << endl;
cout << it << endl;
}
And this is the part from the matlab.
% Value-function iteration: repeat until the largest absolute change dif
% falls below tol or the pass counter it exceeds itmax.
while dif >= tol && it <= itmax
tic;
it = it + 1;
% Keep the previous iterate for the convergence check at the end of the pass.
V = Vnew;
% Discounted continuation values: V is flattened to (Na*Nd)-by-Ny and multiplied by P.
vFuture = beta*reshape(V,Na*Nd,Ny)*P;
% Maximisation step over every state (i_a, i_d, i_y).
for i_a = 1:Na %Loop over state variable a
for i_d = 1:Nd %Loop over state variable d
for i_y = 1:Ny %Loop over state variable y
% Candidate values as a column of length Na*Nd. Utility, mg_A_v and mg_D_v are
% defined outside this fragment.
% NOTE(review): vFuture is added whole here; if it has Ny columns, the result has
% more than one column and max() below works column-wise -- confirm the shape of P.
val = reshape(Utility(Y(i_y) + A(i_a) - mg_A_v/R - (mg_D_v - (1-delta)*D(i_d)),mg_D_v),Na*Nd,1) + vFuture;
% Best value and its linear index among the candidates.
[Vnew(i_a,i_d,i_y), indpol(i_a,i_d,i_y)] = max(val);
% Decode the linear index into (row, column) subscripts of an Na-by-Nd grid.
[indpol_ap(i_a,i_d,i_y),indpol_dp(i_a,i_d,i_y)] = ind2sub([Na,Nd],indpol(i_a,i_d,i_y));
end
end
end
% Howard improvement step
% For H rounds, re-evaluate Vnew at the stored maximisers without maximising again.
for h = 1:H
% Snapshot, so every state in this round reads values from the previous round.
Vhoward = Vnew;
for i_a = 1:Na %Loop over state variable a
for i_d = 1:Nd %Loop over state variable d
for i_y = 1:Ny %Loop over state variable y
% Utility at the stored choice plus the discounted product of the 1-by-Ny slice of
% Vhoward at that choice with P.
Vnew(i_a,i_d,i_y) = Utility(Y(i_y) + A(i_a) - A(indpol_ap(i_a,i_d,i_y))/R - ...
(D(indpol_dp(i_a,i_d,i_y)) - (1-delta)*D(i_d)),D(indpol_dp(i_a,i_d,i_y))) ...
+ beta*reshape(Vhoward(indpol_ap(i_a,i_d,i_y),indpol_dp(i_a,i_d,i_y),:),1,Ny)*P;
end
end
end
end
% Convergence measure: largest absolute change over all three dimensions.
dif = max(max(max(abs(V-Vnew))));
% Report pass number, change and elapsed time since the tic at the top of the pass.
disp([it dif toc])
end
This question might be long and I really appreciate your patience. The core problem is I used matlab and c++ to implement an optimization algorithm but they provided me different results(matlab's better).
I am recently studying some evolutionary algorithms and interested in one variant of PSO(Particle Swarm Optimization), which is called Competitive Swarm Optimizer(born in 2015). This is the paper link http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6819057.
The basic idea of this algorithm is to first generate some random particles in searching space and assign them random velocities. At each iteration, we randomly pair them and let every pair of particles compare their objective function values. Winners(with better objective values) keep status quo while losers update themselves by learning from winners(moving toward winners).
Suppose at iteration t, particle i and j are compared and i is better. Then we update particle j for iteration t+1 by the following formulas: $V_j(t+1) = R_1 \otimes V_j(t) + R_2 \otimes (X_i(t) - X_j(t)) + \varphi R_3 \otimes (\bar{x}(t) - X_j(t))$ and $X_j(t+1) = X_j(t) + V_j(t+1)$. If particle j is out of the search space, we simply pull it back to the boundary. R_1, R_2, R_3 are all random vectors uniformly drawn from [0, 1]; the operation 'otimes' ($\otimes$) means elementwise product; phi ($\varphi$) is a parameter; x_bar ($\bar{x}$) is the center of the swarm.
For example, suppose now I want to minimize a 500-d Schwefel function(minimize the maximal absolute element) and I use 250 particles, set phi=0.1, searching space is 500-d [-100, 100]. Matlab could return me something around 35 while C++ got stuck at 85 to 90. I cannot figure out what's the problem.
Let me attach my matlab and c++ code here.
% Driver: minimise the largest absolute coordinate of a 500-dimensional point
% inside the box [-100, 100]^500 with the competitive swarm optimizer.
% Objective as an anonymous function handle ('@' introduces a handle in MATLAB).
Sch = @(x)max(abs(x));
lb = -100 * ones(1, 500); % lower bound of every coordinate
ub = 100 * ones(1, 500); % upper bound of every coordinate
swarmsize = 250; % number of particles
phi = 0.1; % weight of the pull towards the swarm center
maxiter = 10000; % iteration budget passed to cso
% Time the whole optimisation run.
tic
cso(Sch, lb, ub, swarmsize, phi, maxiter);
toc
function [minf, minx] = cso(obj_fun, lb, ub, swarmsize, phi, maxiter)
% CSO  Competitive swarm optimizer (minimisation).
%   [minf, minx] = cso(obj_fun, lb, ub, swarmsize, phi, maxiter)
%   obj_fun   - handle taking a 1-by-D row vector and returning a scalar
%   lb, ub    - 1-by-D lower / upper bounds; every ub must exceed its lb
%   swarmsize - number of particles S
%   phi       - weight of the pull towards the swarm center
%   maxiter   - iteration budget (the loop below runs while iter <= maxiter)
%   minf      - smallest stored objective value
%   minx      - position of the particle holding minf
assert(length(lb) == length(ub), 'Not equal length of bounds');
% Reject the bounds as soon as ANY coordinate has an empty or inverted range.
if any(ub - lb <= 0)
error('Error. \n Upper bound must be greater than lower bound.')
end
S = swarmsize;
D = length(ub);
x = rand(S, D);
x = bsxfun(@plus, lb, bsxfun(@times, ub-lb, x)); % randomly initialize all particles
v = zeros([S D]); % set initial velocities to 0
iter = 0;
pairnum_1 = floor(S / 2);
losers = 1:S;
fx = arrayfun(@(K) obj_fun(x(K, :)), 1:S);
% This draw is overwritten inside the loop; it is kept so the random stream
% consumed before the first iteration stays the same.
randperm_index = randperm(S);
while iter <= maxiter
% Only particles moved in the previous iteration need a fresh objective value.
fx(losers) = arrayfun(@(K) obj_fun(x(K, :)), losers);
swarm_center = mean(x); % calculate center of all particles
randperm_index = randperm(S); % randomly permute all particle indexes
rpairs = [randperm_index(1:pairnum_1); randperm_index(S-pairnum_1+1:S)]'; % random pairs
% cmask(p) is true when the first particle of pair p has the larger (worse) value.
cmask= (fx(rpairs(:, 1)) > fx(rpairs(:, 2)))';
losers = bsxfun(@times, cmask, rpairs(:, 1)) + bsxfun(@times, ~cmask, rpairs(:, 2)); % losers: larger values
winners = bsxfun(@times, ~cmask, rpairs(:, 1)) + bsxfun(@times, cmask, rpairs(:, 2)); % winners: smaller values
R1 = rand(pairnum_1, D);
R2 = rand(pairnum_1, D);
R3 = rand(pairnum_1, D);
% Losers keep a random share of their velocity, move towards their winner and
% are pulled towards the swarm center with weight phi.
v(losers, :) = bsxfun(@times, R1, v(losers, :)) + bsxfun(@times, R2, x(winners, :) - x(losers, :)) + phi * bsxfun(@times, R3, bsxfun(@minus, swarm_center, x(losers, :)));
x(losers, :) = x(losers, :) + v(losers, :);
% Pull coordinates that left the box back onto the violated bound.
maskl = bsxfun(@lt, x(losers, :), lb);
masku = bsxfun(@gt, x(losers, :), ub);
mask = maskl | masku;
x(losers, :) = bsxfun(@times, ~mask, x(losers, :)) + bsxfun(@times, lb, maskl) + bsxfun(@times, ub, masku);
iter = iter + 1;
fprintf('Iter: %d\n', iter);
fprintf('Best fitness: %e\n', min(fx));
end
fprintf('Best fitness: %e\n', min(fx));
[minf, min_index] = min(fx);
minx = x(min_index, :);
end
(I didn't write C++ function.)
#include <algorithm>
#include <cmath>
#include <cstring>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <random>
#include <vector>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Pseudo-random draw in [0, 1] built on rand(). Being a macro, every textual use
// expands to a separate rand() call, i.e. a fresh draw.
#define rand_01 ((double) rand() / RAND_MAX) // generate 0~1 random numbers
// Decimal approximation of pi.
#define PI 3.14159265359
const int numofdims = 500; // problem dimension
const int numofparticles = 250; // number of particles
// Number of pairs formed per iteration (integer division).
const int halfswarm = numofparticles / 2;
// Weight of the pull towards the swarm center in the velocity update.
const double phi = 0.1;
const int maxiter = 10000; // iteration number
// Objective function: largest absolute value among the first d entries of X.
double Sch(double X[], int d); // max(abs(x_i))
// File-wide using-directive: the code below refers to std names unqualified.
using namespace std;
int main(){
clock_t t1,t2;
t1=clock();
srand(time(0)); // random seed
double** X = new double*[numofparticles]; // X for storing all particles
for(int i=0; i<numofparticles; i++)
X[i] = new double[numofdims];
double** V = new double*[numofparticles]; // V for storing velocities
for(int i=0; i<numofparticles; i++)
V[i] = new double[numofdims];
double Xmin[numofdims] = {0}; // lower bounds
double Xmax[numofdims] = {0}; // upper bounds
double* fitnesses = new double[numofparticles]; // objective function values
for(int j=0; j<numofdims; j++)
{
Xmin[j] = -100;
Xmax[j] = 100;
}
for(int i=0; i<numofparticles; i++)
{
for(int j=0; j<numofdims; j++)
{
X[i][j] = Xmin[j] + rand_01 * (Xmax[j] - Xmin[j]); // initialize X
V[i][j] = 0; // initialize V
}
}
for(int i=0; i<numofparticles; i++)
{
fitnesses[i] = Sch(X[i], numofdims); //
}
double minfit = fitnesses[0]; // temporary minimal value
int minidx = 0; // temporary index of minimal value
int* idxofparticles = new int[numofparticles];
for(int i=0; i<numofparticles; i++)
idxofparticles[i] = i;
double* Xmean = new double[numofdims];
int* losers = new int[halfswarm]; // for saving losers indexes
for(int iter=0; iter<maxiter; iter++)
{
random_shuffle(idxofparticles, idxofparticles+numofparticles);
for(int j=0; j<numofdims; j++)
{
for(int i=0; i<numofparticles; i++)
{
Xmean[j] += X[i][j];
}
Xmean[j] = (double) Xmean[j] / numofparticles; // calculate swarm center
}
for(int i = 0; i < halfswarm; i++)
{
// indexes are now random
// compare 1st to (halfswarm+1)th, 2nd to (halfswarm+2)th, ...
if(fitnesses[idxofparticles[i]] < fitnesses[idxofparticles[i+halfswarm]])
{
losers[i] = idxofparticles[i+halfswarm];
for(int j = 0; j < numofdims; j++)
{
V[idxofparticles[i+halfswarm]][j] = rand_01 * V[idxofparticles[i+halfswarm]][j] + rand_01 * (X[idxofparticles[i]][j] - X[idxofparticles[i+halfswarm]][j]) + rand_01 * phi * (Xmean[j] - X[idxofparticles[i+halfswarm]][j]);
X[idxofparticles[i+halfswarm]][j] = min(max((X[idxofparticles[i+halfswarm]][j] + V[idxofparticles[i+halfswarm]][j]), Xmin[j]), Xmax[j]);
}
}
else
{
losers[i] = idxofparticles[i];
for(int j = 0; j < numofdims; j++)
{
V[idxofparticles[i]][j] = rand_01 * V[idxofparticles[i]][j] + rand_01 * (X[idxofparticles[i+halfswarm]][j] - X[idxofparticles[i]][j]) + rand_01 * phi * (Xmean[j] - X[idxofparticles[i]][j]);
X[idxofparticles[i]][j] = min(max((X[idxofparticles[i]][j] + V[idxofparticles[i]][j]), Xmin[j]), Xmax[j]);
}
}
}
// recalculate particles' values
for(int i=0; i<numofparticles; i++)
{
fitnesses[i] = Sch(X[i], numofdims);
if(fitnesses[i] < minfit)
{
minfit = fitnesses[i]; // update minimum
minidx = i; // update index
}
}
if(iter % 1000 == 0)
{
cout << scientific << endl;
cout << minfit << endl;
}
}
cout << scientific << endl;
cout << minfit << endl;
t2=clock();
delete [] X;
delete [] V;
delete [] fitnesses;
delete [] idxofparticles;
delete [] Xmean;
delete [] losers;
float diff ((float)t2-(float)t1);
float seconds = diff / CLOCKS_PER_SEC;
cout << "runtime: " << seconds << "s" <<endl;
return 0;
}
/// Objective function: the largest absolute value among the first d entries of X.
///
/// std::fabs is called explicitly so the floating-point overload is used even when
/// no using-directive is in effect (an unqualified abs can bind to the integer
/// ::abs and truncate the argument).
///
/// @param X  array of at least d values; not modified.
/// @param d  number of entries to inspect.
/// @return   max |X[j]| over j in [0, d); 0 when d <= 0 (X is then not read).
double Sch(double X[], int d)
{
    double result = 0.0;
    for(int j=0; j<d; j++)
    {
        const double magnitude = std::fabs(X[j]);
        if(magnitude > result)
            result = magnitude;
    }
    return result;
}
So, finally, why can't my c++ code reproduce matlab's outcome? Thank you very much.
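I am trying to implement this F(S) function: $F(S) = \frac{L^2}{2E(S)}$, where $E(S) = \sum_{k=1}^{L-1} C_k(S)^2$ and $C_k(S) = \sum_{i=1}^{L-k} s_i s_{i+k}$ for a sequence $S = (s_1, \dots, s_L)$.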
Below is my code, but it is not working:
/// Evaluates F(S) = L^2 / (2 * E(S)) for the sequence in `field`, where
/// E(S) is the sum over lags k = 1 .. L-1 of C_k^2 and
/// C_k is the sum over i = 0 .. L-k-1 (0-based) of field[i] * field[i + k].
///
/// The loops cover every lag up to L-1 and start at element 0; the earlier version
/// skipped the first element and the last lag.
///
/// @param field  the sequence S (taken by value, as in the declaration).
/// @return       L^2 / (2E). When E is 0 (e.g. fewer than two elements) the division
///               is by zero and the result is infinity or NaN.
double EnergyFunction::evaluate(vector<short> field) {
    // Signed length: avoids unsigned wrap-around in the loop bounds for empty input.
    const int length = static_cast<int>(field.size());
    double e = 0.0;
    for (int k = 1; k <= length - 1; k++){
        double c = 0.0;
        for (int i = 0; i + k < length; i++) {
            c += field[i] * field[i + k];
        }
        e += c * c;
    }
    return static_cast<double>(length) * length / ( 2 * e );
}
For example F(S) function should return value 8644 for vector:
1,1,1,-1,-1,-1,1,-1,1,1,-1,1,-1,1,-1,1,-1,-1,1,1,1,1,-1,-1,-1,1,1,1,1,-1,1,-1,1,1,-1,-1,1,1,1,1,-1,-1,-1,1,-1,-1,1,-1,-1,1,1,-1,1,-1,-1,1,1,-1,1,-1,1,-1,1,-1,1,-1,1,1,-1,-1,-1,-1,-1,-1,1,-1,1,1,1,-1,1,1,-1,1,1,-1,1,-1,1,1,1,-1,-1,1,1,-1,-1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1,-1,1,-1,-1,1,-1,-1,1,-1,-1,-1,-1,-1,1,1,1,1,1,-1,-1,-1,1,-1,-1,1,-1,-1,1,-1,-1,1,-1,1,-1,-1,1,1,1,1,1,1,-1,1,-1,1,-1,1,1,1,1,1,1,-1,1,-1,-1,-1,1,-1,1,1,-1,-1,-1,-1,1,-1,-1,-1,1,1,-1,-1,1,1,1,-1,-1,1,1,1,1,-1,1,1,-1,1,-1,-1,1,-1,-1,-1,-1,1,-1,-1,-1,1,-1,-1,1,1,-1,-1,-1,-1,-1,1,-1,-1,-1,1,1,-1,1,1,-1,-1,-1,1,-1,-1,1,-1,-1,-1,1,1,1,-1,-1,-1,-1,1,1,1,-1,1,-1,-1,1,-1,1,1,-1,-1,-1,-1,1,-1,1,1,1,1,1,1,-1,1,1,1,-1,-1,-1,-1,1,-1,1,1,1,1,-1,1,1,1,1,1,-1,-1,-1,1,-1,-1,1,1,1,-1,1,1,1,-1,1,1
I need another pair of eyes to look at my code because I am a bit lost here. :)
after refactoring:
/// Evaluates F(S) = L^2 / (2 * E(S)) for the sequence in `field`: for every lag
/// k = 1 .. L-1 the products field[i] * field[i + k] are summed into c, and the
/// squares of those sums are accumulated into e.
///
/// @param field  the sequence S (taken by value, as in the declaration).
/// @return       L^2 / (2e). When e is 0 (e.g. fewer than two elements) the division
///               is by zero and the result is infinity or NaN.
double EnergyFunction::evaluate(vector<short> field) {
    double e = 0.0;
    const int l = static_cast<int>(field.size());
    for (int k = 1; k < l; k++){
        double c = 0.0;
        // i is the base index and j = i + k the shifted one; stop when j leaves the sequence.
        for (int i = 0, j = k; j < l; i++, j++) {
            c += field[i] * field[j];
        }
        e += c*c;
    }
    // Square the length in floating point so a long sequence cannot overflow int.
    return static_cast<double>(l) * l / ( e+e );
}
explanation:
1. we need to iterate (L-1) times
2. we need to shift the base and offset indexes until we reach the last one
3. c*c and e+e are quicker and easier to read
You are mapping variables into different ranges using the same names, which is always going to be confusing. Better is to keep ranges and names the same as in the math, and only subtract one for 0-base indexes at indexing time. Also might as well use L explicitly:
// Sketch only: the `...` lines stand for the unchanged parts of the function.
// k and i keep their 1-based ranges; 1 is subtracted only where field is indexed.
int L = field.size();
for (int k = 1; k <= L-1; k++){
...
for (int i = 1; i <= L-k; i++) {
// 1-based positions i and i+k, shifted to 0-based subscripts.
c += field[i -1] * field[i+k -1];
...
Hello everyone. I am trying to implement pattern matching with FFT, but I am not sure what the result should be. (I think I am missing something, even though I have read a lot about the problem and tried many different implementations; this one is the best so far.) Here is my FFT correlation function.
/// In-place 2-D DFT of a rows x cols array stored as an array of row pointers.
///
/// Every row is transformed first, then every column (copied through a scratch
/// buffer, because a column is not contiguous in this layout). FFTW does not scale
/// its output, so no normalisation is applied here either.
///
/// Each plan is now released with fftw_destroy_plan (none was before), and the column
/// plan is created once because all columns go through the same scratch buffer.
///
/// @param a        row pointers; a[i] must point to cols fftw_complex values.
/// @param rows     number of rows; nothing is done when <= 0.
/// @param cols     number of columns; nothing is done when <= 0.
/// @param forward  true for FFTW_FORWARD, false for FFTW_BACKWARD.
void fft2d(fftw_complex**& a, int rows, int cols, bool forward = true)
{
    if (rows <= 0 || cols <= 0)
        return;

    const int direction = forward ? FFTW_FORWARD : FFTW_BACKWARD;

    // Row pass: each row is a different array, so each gets its own short-lived plan.
    for (int i = 0; i < rows; ++i)
    {
        fftw_plan row_plan = fftw_plan_dft_1d(cols, a[i], a[i], direction, FFTW_ESTIMATE);
        fftw_execute(row_plan);
        fftw_destroy_plan(row_plan);
    }

    // Column pass: one plan on the scratch buffer, executed once per column.
    fftw_complex* t = (fftw_complex*)fftw_malloc(rows * sizeof(fftw_complex));
    fftw_plan col_plan = fftw_plan_dft_1d(rows, t, t, direction, FFTW_ESTIMATE);
    for (int j = 0; j < cols; ++j)
    {
        for (int i = 0; i < rows; ++i)
        {
            t[i][0] = a[i][j][0];
            t[i][1] = a[i][j][1];
        }
        fftw_execute(col_plan);
        for (int i = 0; i < rows; ++i)
        {
            a[i][j][0] = t[i][0];
            a[i][j][1] = t[i][1];
        }
    }
    fftw_destroy_plan(col_plan);
    fftw_free(t);
}
/// Cross-correlates two bitmaps through the frequency domain and writes the result
/// to "result.bmp".
///
/// Steps: read argv[1] (key image) and argv[2] (big image); pad both with black to a
/// common size; convert to grey levels; forward-transform both; multiply one spectrum
/// by the complex conjugate of the other; zero the low-frequency bins; inverse
/// transform; optionally swap quadrants; write the real part as an inverted grey image.
///
/// The FFTW buffers are now released before returning (they used to leak), an unused
/// bitmap was removed, and the loop-invariant cutoff is computed once.
///
/// @param argc  3, or 4 to additionally swap quadrants of the output.
/// @param argv  argv[1] = key image path, argv[2] = big image path.
/// @return      0 on success, 1 on wrong argument count or unreadable input.
int findCorrelation(int argc, char* argv[])
{
    BMP bigImage;
    BMP keyImage;
    BMP result;
    RGBApixel blackPixel = { 0, 0, 0, 1 };
    // A fourth argument (of any value) switches the quadrant swap on.
    const bool swapQuadrants = (argc == 4);
    if (argc < 3 || argc > 4) {
        cout << "correlation img1.bmp img2.bmp" << endl;
        return 1;
    }
    if (!keyImage.ReadFromFile(argv[1])) {
        return 1;
    }
    if (!bigImage.ReadFromFile(argv[2])) {
        return 1;
    }

    //Preparations
    // Both images are padded to the larger width and the larger height.
    const int maxWidth = std::max(bigImage.TellWidth(), keyImage.TellWidth());
    const int maxHeight = std::max(bigImage.TellHeight(), keyImage.TellHeight());
    const int rowsCount = maxHeight;
    const int colsCount = maxWidth;
    // Keep copies of the originals: SetSize is applied to the working images below.
    BMP bigTemp = bigImage;
    BMP keyTemp = keyImage;
    keyImage.SetSize(maxWidth, maxHeight);
    bigImage.SetSize(maxWidth, maxHeight);
    for (int i = 0; i < rowsCount; ++i)
        for (int j = 0; j < colsCount; ++j) {
            // Copy the original pixel where it exists, black elsewhere.
            RGBApixel p1;
            if (i < bigTemp.TellHeight() && j < bigTemp.TellWidth()) {
                p1 = bigTemp.GetPixel(j, i);
            } else {
                p1 = blackPixel;
            }
            bigImage.SetPixel(j, i, p1);
            RGBApixel p2;
            if (i < keyTemp.TellHeight() && j < keyTemp.TellWidth()) {
                p2 = keyTemp.GetPixel(j, i);
            } else {
                p2 = blackPixel;
            }
            keyImage.SetPixel(j, i, p2);
        }

    //Here is where the transforms begin
    // a: big image, b: key image, c: product spectrum / correlation output.
    fftw_complex **a = (fftw_complex**)fftw_malloc(rowsCount * sizeof(fftw_complex*));
    fftw_complex **b = (fftw_complex**)fftw_malloc(rowsCount * sizeof(fftw_complex*));
    fftw_complex **c = (fftw_complex**)fftw_malloc(rowsCount * sizeof(fftw_complex*));
    for (int i = 0; i < rowsCount; ++i) {
        a[i] = (fftw_complex*)fftw_malloc(colsCount * sizeof(fftw_complex));
        b[i] = (fftw_complex*)fftw_malloc(colsCount * sizeof(fftw_complex));
        c[i] = (fftw_complex*)fftw_malloc(colsCount * sizeof(fftw_complex));
        for (int j = 0; j < colsCount; ++j) {
            // Weighted grey level in the real part, zero imaginary part.
            RGBApixel p1;
            p1 = bigImage.GetPixel(j, i);
            a[i][j][0] = (0.299*p1.Red + 0.587*p1.Green + 0.114*p1.Blue);
            a[i][j][1] = 0.0;
            RGBApixel p2;
            p2 = keyImage.GetPixel(j, i);
            b[i][j][0] = (0.299*p2.Red + 0.587*p2.Green + 0.114*p2.Blue);
            b[i][j][1] = 0.0;
        }
    }
    fft2d(a, rowsCount, colsCount);
    fft2d(b, rowsCount, colsCount);
    result.SetSize(maxWidth, maxHeight);

    // Squared-radius threshold below which spectrum bins are zeroed.
    const double cuttoffCoef = (maxWidth * maxHeight) / 37992.;
    const double cutoffRadius2 = 128 * 128 * cuttoffCoef;
    for (int i = 0; i < rowsCount; ++i)
        for (int j = 0; j < colsCount; ++j) {
            fftw_complex& y = a[i][j];
            fftw_complex& x = b[i][j];
            double u = x[0], v = x[1];
            double m = y[0], n = y[1];
            // c = b * conj(a): (u + iv)(m - in).
            c[i][j][0] = u*m + n*v;
            c[i][j][1] = v*m - u*n;
            // Signed frequency indexes: the upper half of each axis wraps to negative.
            int fx = j;
            if (fx>(colsCount / 2)) fx -= colsCount;
            int fy = i;
            if (fy>(rowsCount / 2)) fy -= rowsCount;
            float r2 = (fx*fx + fy*fy);
            if (r2<cutoffRadius2)
                c[i][j][0] = c[i][j][1] = 0;
        }
    fft2d(c, rowsCount, colsCount, false);

    const int halfCols = colsCount / 2;
    const int halfRows = rowsCount / 2;
    if (swapQuadrants) {
        // Exchange the top-left quadrant with the bottom-right one ...
        for (int i = 0; i < halfRows; ++i)
            for (int j = 0; j < halfCols; ++j) {
                std::swap(c[i][j][0], c[i + halfRows][j + halfCols][0]);
                std::swap(c[i][j][1], c[i + halfRows][j + halfCols][1]);
            }
        // ... and the bottom-left quadrant with the top-right one.
        for (int i = halfRows; i < rowsCount; ++i)
            for (int j = 0; j < halfCols; ++j) {
                std::swap(c[i][j][0], c[i - halfRows][j + halfCols][0]);
                std::swap(c[i][j][1], c[i - halfRows][j + halfCols][1]);
            }
    }

    for (int i = 0; i < rowsCount; ++i)
        for (int j = 0; j < colsCount; ++j) {
            const double& g = c[i][j][0];
            RGBApixel pixel;
            pixel.Alpha = 0;
            // Rounded real part, inverted so larger values come out darker.
            // NOTE(review): g is neither rescaled nor clamped before being stored in the
            // colour channels; if those are 8-bit, out-of-range values wrap -- confirm.
            int gInt = 255 - static_cast<int>(std::floor(g + 0.5));
            pixel.Red = gInt;
            pixel.Green = gInt;
            pixel.Blue = gInt;
            result.SetPixel(j, i, pixel);
        }
    result.WriteToFile("result.bmp");

    // Release every row, then the arrays of row pointers.
    for (int i = 0; i < rowsCount; ++i) {
        fftw_free(a[i]);
        fftw_free(b[i]);
        fftw_free(c[i]);
    }
    fftw_free(a);
    fftw_free(b);
    fftw_free(c);
    return 0;
}
Sample output
This question would probably be more appropriately posted on another site like cross validated (metaoptimize.com used to also be a good one, but it appears to be gone)
That said:
There are two similar operations you can perform with FFT: convolution and correlation. Convolution is used for determining how two signals interact with each other, whereas correlation can be used to express how similar two signals are to each other. Make sure you're doing the right operation, as they're both commonly implemented through a DFT.
For this type of application of DFTs you usually wouldn't extract any useful information in the fourier spectrum unless you were looking for frequencies common to both data sources or whatever (eg, if you were comparing two bridges to see if their supports are spaced similarly).
Your 3rd image looks a lot like the power domain; normally I see the correlation output entirely grey except where overlap occurred. Your code definitely appears to be computing the inverse DFT, so unless I'm missing something the only other explanation I've come up with for the fuzzy look could be some of the "fudge factor" code in there like:
// Excerpt: zeroes the spectrum bin when its squared radius r2 is below the cutoff.
if (r2<128 * 128 * cuttoffCoef)
c[i][j][0] = c[i][j][1] = 0;
As for what you should expect: wherever there are common elements between the two images you'll see a peak. The larger the peak, the more similar the two images are near that region.
Some comments and/or recommended changes:
1) Convolution & correlation are not scale invariant operations. In other words, the size of your pattern image can make a significant difference in your output.
2) Normalize your images before correlation.
When you get the image data ready for the forward DFT pass:
// Excerpt: weighted sum of the colour channels stored as the real part; imaginary part zero.
a[i][j][0] = (0.299*p1.Red + 0.587*p1.Green + 0.114*p1.Blue);
a[i][j][1] = 0.0;
/* ... */
How you grayscale the image is your business (though I would've picked something like sqrt( r*r + b*b + g*g )). However, I don't see you doing anything to normalize the image.
The word "normalize" can take on a few different meanings in this context. Two common types:
normalize the range of values between 0.0 and 1.0
normalize the "whiteness" of the images
3) Run your pattern image through an edge enhancement filter. I've personally made use of canny, sobel, and I think I messed with a few others. As I recall, canny was "quick'n dirty", sobel was more expensive, but I got comparable results when it came time to do correlation. See chapter 24 of the "dsp guide" book that's freely available online. The whole book is worth your time, but if you're low on time then at a minimum chapter 24 will help a lot.
4) Re-scale the output image between [0, 255]; if you want to implement thresholds, do it after this step because the thresholding step is lossy.
My memory on this one is hazy, but as I recall (edited for clarity):
You can scale the final image pixels (before rescaling) between [-1.0, 1.0] by dividing off the largest power spectrum value from the entire power spectrum
The largest power spectrum value is, conveniently enough, the center-most value in the power spectrum (corresponding to the lowest frequency)
If you divide it off the power spectrum, you'll end up doing twice the work; since FFTs are linear, you can delay the division until after the inverse DFT pass to when you're re-scaling the pixels between [0..255].
If after rescaling most of your values end up so black you can't see them, you can use a solution to the ODE y' = y(1 - y) (one example is the sigmoid f(x) = 1 / (1 + exp(-c*x) ), for some scaling factor c that gives better gradations). This has more to do with improving your ability to interpret the results visually than anything you might use to programmatically find peaks.
edit I said [0, 255] above. I suggest you rescale to [128, 255] or some other lower bound that is gray rather than black.
We need to change/reimplement standard DFT implementation in GSL, which is
/* Direct (O(n^2)) discrete Fourier transform of n complex values.
 *
 * data    input values, accessed through REAL/IMAG with the given stride
 * stride  element stride applied to both data and result
 * n       number of input values and of output bins
 * result  output values, written through REAL/IMAG
 * sign    transform direction; enters only as the sign of the phase step
 *
 * Always returns 0.
 */
int
FUNCTION(gsl_dft_complex,transform) (const BASE data[],
                                     const size_t stride, const size_t n,
                                     BASE result[],
                                     const gsl_fft_direction sign)
{
  /* Phase increment of one unit of (bin * sample). */
  const double step = 2.0 * ((int) sign) * M_PI / (double) n;
  size_t bin;

  /* FIXME: check that input length == output length and give error */

  for (bin = 0; bin < n; bin++)
    {
      ATOMIC acc_re = 0;
      ATOMIC acc_im = 0;
      /* phase tracks (bin * sample) mod n, advanced incrementally. */
      size_t phase = 0;
      size_t sample = 0;

      while (sample < n)
        {
          const double angle = step * (double) phase;

          /* acc += exp(i angle) * data[sample] */
          const ATOMIC tw_re = (ATOMIC) cos (angle);
          const ATOMIC tw_im = (ATOMIC) sin (angle);
          const ATOMIC in_re = REAL(data,stride,sample);
          const ATOMIC in_im = IMAG(data,stride,sample);

          acc_re += tw_re * in_re - tw_im * in_im;
          acc_im += tw_re * in_im + tw_im * in_re;

          phase = (phase + bin) % n;
          sample++;
        }

      REAL(result,stride,bin) = acc_re;
      IMAG(result,stride,bin) = acc_im;
    }

  return 0;
}
In this implementation, GSL iterates over input vector twice for sample/input size. However, we need to construct for different frequency bins. For instance, we have 4096 samples, but we need to calculate DFT for 128 different frequencies. Could you help me to define or implement required DFT behaviour? Thanks in advance.
EDIT: We do not search for first m frequencies.
Actually, is below approach correct for finding DFT result with given frequency bin number?
N = sample size
B = frequency bin size
k = 0,...,127 X[k] = SUM(0,N){x[i]*exp(-j*2*pi*k*i/B)}
EDIT: I might have not explained the problem for DFT elaborately, nevertheless, I am happy to provide the answer below:
/// Evaluates a DFT-style sum of an interleaved complex signal at each frequency
/// listed in frequency_band.
///
/// @param signal          interleaved samples: signal[2n] real, signal[2n+1] imaginary.
/// @param frequency_band  frequencies to evaluate, one output bin per entry.
/// @param result          output, interleaved like the input; resized to
///                        2 * frequency_band.size() when it does not already have that size.
/// @param sampling_rate   multiplies the phase of every term.
///                        NOTE(review): it multiplies rather than divides the phase, so it
///                        presumably denotes the sampling interval -- confirm with callers.
void compute_dft(const std::vector<double>& signal,
                 const std::vector<double>& frequency_band,
                 std::vector<double>& result,
                 const double sampling_rate)
{
    const size_t out_len = frequency_band.size() << 1;
    if (result.empty() || result.size() != out_len) {
        result.resize(out_len, 0.0);
    }

    //note complex signal assumption
    const size_t sample_count = signal.size() >> 1;
    const double d_theta = -2.0 * PI * sampling_rate;

    size_t out = 0;
    for (const double f_k : frequency_band) {
        // Phase advance per sample for this frequency.
        const double phase_step = d_theta * f_k;
        double acc_re = 0.0;
        double acc_im = 0.0;

        for (size_t n = 0; n < sample_count; ++n) {
            // The phase index runs from 1 to sample_count, not from 0.
            const double theta = phase_step * (n + 1);
            const double tw_re = cos(theta);
            const double tw_im = sin(theta);
            const double in_re = signal[2*n];
            const double in_im = signal[2*n + 1];

            // acc += (tw_re + i tw_im) * (in_re + i in_im)
            acc_re += tw_re * in_re - tw_im * in_im;
            acc_im += tw_re * in_im + tw_im * in_re;
        }

        result[out++] = acc_re;
        result[out++] = acc_im;
    }
}
Assuming you just want the first m output frequencies:
/* Direct discrete Fourier transform of n complex inputs that computes only the
 * first m output bins.
 *
 * data    input values, accessed through REAL/IMAG with the given stride
 * stride  element stride applied to both data and result
 * n       input size
 * m       output size (m <= n)
 * result  output values, written through REAL/IMAG for bins 0 .. m-1
 * sign    transform direction; enters only as the sign of the phase step
 *
 * Always returns 0.
 */
int
FUNCTION(gsl_dft_complex,transform) (const BASE data[],
                                     const size_t stride,
                                     const size_t n, // input size
                                     const size_t m, // output size (m <= n)
                                     BASE result[],
                                     const gsl_fft_direction sign)
{
  const double phase_unit = 2.0 * ((int) sign) * M_PI / (double) n;
  size_t out_bin, in_idx, twiddle_idx;

  /* FIXME: check that m <= n and give error */

  out_bin = 0;
  while (out_bin < m) // for each of m output bins
    {
      ATOMIC total_re = 0;
      ATOMIC total_im = 0;

      /* twiddle_idx follows (out_bin * in_idx) mod n without multiplying. */
      twiddle_idx = 0;

      for (in_idx = 0; in_idx < n; in_idx++) // for each of n input points
        {
          double theta = phase_unit * (double) twiddle_idx;

          /* total += exp(i theta) * data[in_idx] */
          ATOMIC cos_t = (ATOMIC) cos (theta);
          ATOMIC sin_t = (ATOMIC) sin (theta);
          ATOMIC x_re = REAL(data,stride,in_idx);
          ATOMIC x_im = IMAG(data,stride,in_idx);

          total_re += cos_t * x_re - sin_t * x_im;
          total_im += cos_t * x_im + sin_t * x_re;

          twiddle_idx = (twiddle_idx + out_bin) % n;
        }

      REAL(result,stride,out_bin) = total_re;
      IMAG(result,stride,out_bin) = total_im;
      out_bin++;
    }

  return 0;
}