Simple speed up of C++ OpenMP kernel - c++

I have never worked with OpenMP or optimization of C++, so all help is welcome. I'm probably doing some very stupid things that slow down the process drastically. It doesn't need to be the fastest, but I think some easy tricks will significantly speed it up. Anyone? Thanks a lot!
This function calculates the standard deviation of a patch, given a kernel size and greyscale OpenCV image. The middle pixel of the patch is kept if it is below the given threshold, else it is rejected. This is done for each pixel except the border.
#include "stdafx.h"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/photo/photo.hpp"
#include <stdlib.h>
#include <stdio.h>
#include "utils.h"
#include <windows.h>
#include <string.h>
#include <math.h>
#include <numeric>
using namespace cv;
using namespace std;
Mat low_pass_filter(Mat img, int threshold, int kernelSize)
{
unsigned char *input = (unsigned char*)(img.data);
Mat output = Mat::zeros(img.size(), CV_8UC1);
unsigned char *output_ptr = (unsigned char*)(output.data);
#pragma omp parallel for
for (int i = (kernelSize - 1) / 2; i < img.rows - (kernelSize - 1) / 2; i++){
for (int j = (kernelSize - 1) / 2; j < img.cols - (kernelSize - 1) / 2; j++){
double sum, m, accum, stdev;
vector<double> v;
// Kernel Patch
for (int kx = i - (kernelSize - 1) / 2; kx <= i + (kernelSize - 1) / 2; kx++){
for (int ky = j - (kernelSize - 1) / 2; ky <= j + (kernelSize - 1) / 2; ky++){
v.push_back((double)input[img.step * kx + ky]);//.at<uchar>(kx, ky));
}
}
sum = std::accumulate(std::begin(v), std::end(v), 0.0);
m = sum / v.size();
accum = 0.0;
std::for_each(std::begin(v), std::end(v), [&](const double d) {
accum += (d - m) * (d - m);
});
stdev = sqrt(accum / (v.size() - 1));
if (stdev < threshold){
output_ptr[img.step * i + j] = input[img.step * i + j];
}
}
}
return output;
}

Vector v is not required. Instead of adding items to it, maintain accumulators of d and d*d, and then use variance = E(v²) / E(v)² so that your inner code becomes:
double sum = 0;
double sum2 = 0;
int n = kernelSize * kernelSize;
// Kernel Patch
for (int kx = ...) {
for (int ky = ...) {
sum += d;
sum2 += d*d;
}
}
double mean = sum/n;
double stddev = sqrt(sum2/n - mean*mean);
if (stddev < threshold) {
...;
}
After that, consider that the sum of elements centred around (x+1,y) can be found from the result for (x,y) simply by subtracting all the elements in the previous left-hand column, and adding all the elements in the new right-hand column. An analogous operation works vertically.
Also, check your compiler options - are you auto-vectorizing loops, and using SIMD instructions (if available)?

Related

CUDA Sort Z-Axis 3D Array C++/Thrust

I'm looking to sort a large 3D array along the z-axis.
Example array is X x Y x Z (1000x1000x5)
I'd like to sort along the z-axis so I'd perform 1000x1000 sorts for 5 element along the z-axis.
Edit Update: Tried an attempt to use thrust below. It's functional and I'd store the output back, but this is very slow since I'm sorting 5 elements at a time per (x,y) location:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
int main(){
int x = 1000, y = 1000, z = 5;
float*** unsorted_cube = new float** [x];
for (int i = 0; i < x; i++)
{
// Allocate memory blocks for
// rows of each 2D array
unsorted_cube[i] = new float* [y];
for (int j = 0; j < y; j++)
{
// Allocate memory blocks for
// columns of each 2D array
unsorted_cube[i][j] = new float[z];
}
}
for (int i = 0; i < x; i++)
{
for (int j = 0; j < y; j++)
{
unsorted_cube[i][j][0] = 4.0f;
unsorted_cube[i][j][1] = 3.0f;
unsorted_cube[i][j][2] = 1.0f;
unsorted_cube[i][j][3] = 5.0f;
unsorted_cube[i][j][4] = 2.0f;
}
}
for (int i = 0; i < 5; i++)
{
printf("unsorted_cube first 5 elements to sort at (0,0): %f\n", unsorted_cube[0][0][i]);
}
float* temp_input;
float* temp_output;
float* raw_ptr;
float raw_ptr_out[5];
cudaMalloc((void**)&raw_ptr, N_Size * sizeof(float));
for (int i = 0; i < x; i++)
{
for (int j = 0; j < y; j++)
{
temp_input[0] = unsorted_cube[i][j][0];
temp_input[1] = unsorted_cube[i][j][1];
temp_input[2] = unsorted_cube[i][j][2];
temp_input[3] = unsorted_cube[i][j][3];
temp_input[4] = unsorted_cube[i][j][4];
cudaMemcpy(raw_ptr, temp_input, 5 * sizeof(float), cudaMemcpyHostToDevice);
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(raw_ptr);
thrust::sort(dev_ptr, dev_ptr + 5);
thrust::host_vector<float> host_vec(5);
thrust::copy(dev_ptr, dev_ptr + 5, raw_ptr_out);
if (i == 0 && j == 0)
{
for (int i = 0; i < 5; i++)
{
temp_output[i] = raw_ptr_out[i];
}
printf("sorted_cube[0,0,0] : %f\n", temp_output[0]);
printf("sorted_cube[0,0,1] : %f\n", temp_output[1]);
printf("sorted_cube[0,0,2] : %f\n", temp_output[2]);
printf("sorted_cube[0,0,3] : %f\n", temp_output[3]);
printf("sorted_cube[0,0,4] : %f\n", temp_output[4]);
}
}
}
}
Assuming that the data is in a format where the values in each xy-plane are consecutive in memory: data[((z * y_length) + y) * x_length + x] (which is be best for coalescing memory accesses on the GPU, as well)
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/zip_iterator.h>
void sort_in_z_dir(thrust::device_vector<float> &data,
int x_length, int y_length) { // z_length == 5
auto z_stride = x_length * y_length;
thrust::for_each(
thrust::make_zip_iterator(thrust::make_tuple(
data.begin(),
data.begin() + z_stride,
data.begin() + 2 * z_stride,
data.begin() + 3 * z_stride,
data.begin() + 4 * z_stride)),
thrust::make_zip_iterator(thrust::make_tuple(
data.begin() + z_stride,
data.begin() + 2 * z_stride,
data.begin() + 3 * z_stride,
data.begin() + 4 * z_stride,
data.begin() + 5 * z_stride)),
[] __host__ __device__
(thrust::tuple<float, float, float, float, float> &values) {
float local_data[5] = {thrust::get<0>(values),
thrust::get<1>(values),
thrust::get<2>(values),
thrust::get<3>(values),
thrust::get<4>(values)};
thrust::sort(thrust::seq, local_data, local_data + 5);
thrust::get<0>(values) = local_data[0];
thrust::get<1>(values) = local_data[1];
thrust::get<2>(values) = local_data[2];
thrust::get<3>(values) = local_data[3];
thrust::get<4>(values) = local_data[4];
});
}
This solution is certainly very ugly in terms of hardcoding z_length. One can use some C++ template-"magic" to make z_length into a template parameter, but this seemed to be overkill for this answer about Thrust.
See Convert std::tuple to std::array C++11 and How to convert std::array to std::tuple? for examples on interfacing between arrays and tuples.
The good thing about this solution that up to the sorting algorithm itself it should be pretty much optimal performance-wise. I don't know if thrust::sort is optimized for such small input arrays, but you can replace it by any self written sorting algorithm as I proposed in the comments.
If you want to be able to use different z_length without all this hassle, you might prefer this solution, which sorts in global memory, which is far from optimal, and feels a bit hacky because it uses Thrust pretty much only to launch a kernel. Here you want to have the data ordered the other way around: data[((x * y_length) + y) * z_length + z]
#include <thrust/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
void sort_in_z_dir_alternative(thrust::device_vector<float> &data,
int x_length, int y_length, int z_length) {
int n_threads = x_length * y_length;
thrust::for_each(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(n_threads),
[ddata = thrust::raw_pointer_cast(data.data()), z_length] __host__ __device__ (int idx) {
thrust::sort(thrust::seq,
ddata + z_length * idx,
ddata + z_length * (idx + 1));
});
}
If you are ok with z_length being a template parameter, this might be a solution that combines the best from both worlds (data format like in the first example):
#include <thrust/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
template <int z_length>
void sort_in_z_dir_middle_ground(thrust::device_vector<float> &data,
int x_length, int y_length) {
int n_threads = x_length * y_length; // == z_stride
thrust::for_each(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(n_threads),
[ddata = thrust::raw_pointer_cast(data.data()),
z_length, n_threads] __host__ __device__ (int idx) {
float local_data[z_length];
#pragma unroll
for (int i = 0; i < z_length; ++i) {
local_data[i] = ddata[idx + i * n_threads];
}
thrust::sort(thrust::seq,
local_data,
local_data + z_length);
#pragma unroll
for (int i = 0; i < z_length; ++i) {
ddata[idx + i * n_threads] = local_data[i];
}
});
}

Matlab and C++ yield different outcomes when optimizing Schwefel function by an algorithm similar to PSO

This question might be long and I really appreciate your patience. The core problem is I used matlab and c++ to implement an optimization algorithm but they provided me different results(matlab's better).
I am recently studying some evolutionary algorithms and interested in one variant of PSO(Particle Swarm Optimization), which is called Competitive Swarm Optimizer(born in 2015). This is the paper link http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6819057.
The basic idea of this algorithm is to first generate some random particles in searching space and assign them random velocities. At each iteration, we randomly pair them and let every pair of particles compare their objective function values. Winners(with better objective values) keep status quo while losers update themselves by learning from winners(moving toward winners).
Suppose at iteration t, particle i and j are compared and i is better. Then we update particle j for iteration t+1 by following these formulas. If particle j is out of searching space, we simply pull it back to the boundary. R_1, R_2, R_3 are all random vectors uniformly drawn from [0, 1]; operation 'otimes' means elementwise product; phi is a parameter; x_bar is the center of swarm.
For example, suppose now I want to minimize a 500-d Schwefel function(minimize the maximal absolute element) and I use 250 particles, set phi=0.1, searching space is 500-d [-100, 100]. Matlab could return me something around 35 while C++ got stuck at 85 to 90. I cannot figure out what's the problem.
Let me attach my matlab and c++ code here.
Sch = #(x)max(abs(x))
lb = -100 * ones(1, 500);
ub = 100 * ones(1, 500);
swarmsize = 250;
phi = 0.1;
maxiter = 10000;
tic
cso(Sch, lb, ub, swarmsize, phi, maxiter);
toc
function [minf, minx] = cso(obj_fun, lb, ub, swarmsize, phi, maxiter)
assert(length(lb) == length(ub), 'Not equal length of bounds');
if all(ub - lb <= 0) > 0
error('Error. \n Upper bound must be greater than lower bound.')
end
vhigh = abs(ub - lb);
vlow = -vhigh;
S = swarmsize;
D = length(ub);
x = rand(S, D);
x = bsxfun(#plus, lb, bsxfun(#times, ub-lb, x)); % randomly initalize all particles
v = zeros([S D]); % set initial velocities to 0
iter = 0;
pairnum_1 = floor(S / 2);
losers = 1:S;
fx = arrayfun(#(K) obj_fun(x(K, :)), 1:S);
randperm_index = randperm(S);
while iter <= maxiter
fx(losers) = arrayfun(#(K) obj_fun(x(K, :)), losers);
swarm_center = mean(x); % calculate center all particles
randperm_index = randperm(S); % randomly permuate all particle indexes
rpairs = [randperm_index(1:pairnum_1); randperm_index(S-pairnum_1+1:S)]'; % random pair
cmask= (fx(rpairs(:, 1)) > fx(rpairs(:, 2)))';
losers = bsxfun(#times, cmask, rpairs(:, 1)) + bsxfun(#times, ~cmask, rpairs(:, 2)); % losers who with larger values
winners = bsxfun(#times, ~cmask, rpairs(:, 1)) + bsxfun(#times, cmask, rpairs(:, 2)); % winners who with smaller values
R1 = rand(pairnum_1, D);
R2 = rand(pairnum_1, D);
R3 = rand(pairnum_1, D);
v(losers, :) = bsxfun(#times, R1, v(losers, :)) + bsxfun(#times, R2, x(winners, :) - x(losers, :)) + phi * bsxfun(#times, R3, bsxfun(#minus, swarm_center, x(losers, :)));
x(losers, :) = x(losers, :) + v(losers, :);
maskl = bsxfun(#lt, x(losers, :), lb);
masku = bsxfun(#gt, x(losers, :), ub);
mask = bsxfun(#lt, x(losers, :), lb) | bsxfun(#gt, x(losers, :), ub);
x(losers, :) = bsxfun(#times, ~mask, x(losers, :)) + bsxfun(#times, lb, maskl) + bsxfun(#times, ub, masku);
iter = iter + 1;
fprintf('Iter: %d\n', iter);
fprintf('Best fitness: %e\n', min(fx));
end
fprintf('Best fitness: %e\n', min(fx));
[minf, min_index] = min(fx);
minx = x(min_index, :);
end
(I didn't write C++ function.)
#include <cstring>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <ctime>
#include <iomanip>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define rand_01 ((double) rand() / RAND_MAX) // generate 0~1 random numbers
#define PI 3.14159265359
const int numofdims = 500; // problem dimension
const int numofparticles = 250; // number of particles
const int halfswarm = numofparticles / 2;
const double phi = 0.1;
const int maxiter = 10000; // iteration number
double Sch(double X[], int d); // max(abs(x_i))
using namespace std;
int main(){
clock_t t1,t2;
t1=clock();
srand(time(0)); // random seed
double** X = new double*[numofparticles]; // X for storing all particles
for(int i=0; i<numofparticles; i++)
X[i] = new double[numofdims];
double** V = new double*[numofparticles]; // V for storing velocities
for(int i=0; i<numofparticles; i++)
V[i] = new double[numofdims];
double Xmin[numofdims] = {0}; // lower bounds
double Xmax[numofdims] = {0}; // upper bounds
double* fitnesses = new double[numofparticles]; // objective function values
for(int j=0; j<numofdims; j++)
{
Xmin[j] = -100;
Xmax[j] = 100;
}
for(int i=0; i<numofparticles; i++)
{
for(int j=0; j<numofdims; j++)
{
X[i][j] = Xmin[j] + rand_01 * (Xmax[j] - Xmin[j]); // initialize X
V[i][j] = 0; // initialize V
}
}
for(int i=0; i<numofparticles; i++)
{
fitnesses[i] = Sch(X[i], numofdims); //
}
double minfit = fitnesses[0]; // temporary minimal value
int minidx = 0; // temporary index of minimal value
int* idxofparticles = new int[numofparticles];
for(int i=0; i<numofparticles; i++)
idxofparticles[i] = i;
double* Xmean = new double[numofdims];
int* losers = new int[halfswarm]; // for saving losers indexes
for(int iter=0; iter<maxiter; iter++)
{
random_shuffle(idxofparticles, idxofparticles+numofparticles);
for(int j=0; j<numofdims; j++)
{
for(int i=0; i<numofparticles; i++)
{
Xmean[j] += X[i][j];
}
Xmean[j] = (double) Xmean[j] / numofparticles; // calculate swarm center
}
for(int i = 0; i < halfswarm; i++)
{
// indexes are now random
// compare 1st to (halfswarm+1)th, 2nd to (halfswarm+2)th, ...
if(fitnesses[idxofparticles[i]] < fitnesses[idxofparticles[i+halfswarm]])
{
losers[i] = idxofparticles[i+halfswarm];
for(int j = 0; j < numofdims; j++)
{
V[idxofparticles[i+halfswarm]][j] = rand_01 * V[idxofparticles[i+halfswarm]][j] + rand_01 * (X[idxofparticles[i]][j] - X[idxofparticles[i+halfswarm]][j]) + rand_01 * phi * (Xmean[j] - X[idxofparticles[i+halfswarm]][j]);
X[idxofparticles[i+halfswarm]][j] = min(max((X[idxofparticles[i+halfswarm]][j] + V[idxofparticles[i+halfswarm]][j]), Xmin[j]), Xmax[j]);
}
}
else
{
losers[i] = idxofparticles[i];
for(int j = 0; j < numofdims; j++)
{
V[idxofparticles[i]][j] = rand_01 * V[idxofparticles[i]][j] + rand_01 * (X[idxofparticles[i+halfswarm]][j] - X[idxofparticles[i]][j]) + rand_01 * phi * (Xmean[j] - X[idxofparticles[i]][j]);
X[idxofparticles[i]][j] = min(max((X[idxofparticles[i]][j] + V[idxofparticles[i]][j]), Xmin[j]), Xmax[j]);
}
}
}
// recalculate particles' values
for(int i=0; i<numofparticles; i++)
{
fitnesses[i] = Sch(X[i], numofdims);
if(fitnesses[i] < minfit)
{
minfit = fitnesses[i]; // update minimum
minidx = i; // update index
}
}
if(iter % 1000 == 0)
{
cout << scientific << endl;
cout << minfit << endl;
}
}
cout << scientific << endl;
cout << minfit << endl;
t2=clock();
delete [] X;
delete [] V;
delete [] fitnesses;
delete [] idxofparticles;
delete [] Xmean;
delete [] losers;
float diff ((float)t2-(float)t1);
float seconds = diff / CLOCKS_PER_SEC;
cout << "runtime: " << seconds << "s" <<endl;
return 0;
}
double Sch(double X[], int d)
{
double result=abs(X[0]);
for(int j=0; j<d; j++)
{
if(abs(X[j]) > result)
result = abs(X[j]);
}
return result;
}
So, finally, why can't my c++ code reproduce matlab's outcome? Thank you very much.

Time dependent 1D Schrodinger equation C++

I wrote the code in C++ which solves the time-dependent 1D Schrodinger equation for the anharmonic potential V = x^2/2 + lambda*x^4, using Thomas algorithm. My code is working and I animate the results in Mathematica, to check what is going on. I test the code against the known solution for the harmonic potential (I put lambda = 0), but the animation shows that abs(Psi) is changing with time, and I know that is not correct for the harmonic potential. Actually, I see that in one point it time it becomes constant, but before that is oscillating.
So I understand that I need to have constant magnitude of the wave function over the time interval, but I don't know how to do it, or where am I doing mistake.
Here is my code and the animation for 100 time steps and 100 points on the grid.
#include <iostream>
#include <iomanip>
#include <cmath>
#include <vector>
#include <cstdlib>
#include <complex>
#include <fstream>
using namespace std;
// Mandatory parameters
const int L = 1; //length of domain in x direction
const int tmax = 10; //end time
const int nx = 100, nt = 100; //number of the grid points and time steps respectively
double lambda; //dictates the shape of the potential (we can use lambda = 0.0
// to test the code against the known solution for the harmonic
// oscillator)
complex<double> I(0.0, 1.0); //imaginary unit
// Derived parameters
double delta_x = 1. / (nx - 1);
//spacing between the grid points
double delta_t = 1. / (nt - 1);
//the time step
double r = delta_t / (delta_x * delta_x); //used to simplify expressions for
// the coefficients of the lhs and
// rhs of the matrix eqn
// Algorithm for solving the tridiagonal matrix system
vector<complex<double> > thomas_algorithm(vector<double>& a,
vector<complex<double> >& b,
vector<double>& c,
vector<complex<double> >& d)
{
// Temporary wave function
vector<complex<double> > y(nx + 1, 0.0);
// Modified matrix coefficients
vector<complex<double> > c_prime(nx + 1, 0.0);
vector<complex<double> > d_prime(nx + 1, 0.0);
// This updates the coefficients in the first row
c_prime[0] = c[0] / b[0];
d_prime[0] = d[0] / b[0];
// Create the c_prime and d_prime coefficients in the forward sweep
for (int i = 1; i < nx + 1; i++)
{
complex<double> m = 1.0 / (b[i] - a[i] * c_prime[i - 1]);
c_prime[i] = c[i] * m;
d_prime[i] = (d[i] - a[i] * d_prime[i - 1]) * m;
}
// This gives the value of the last equation in the system
y[nx] = d_prime[nx];
// This is the reverse sweep, used to update the solution vector
for (int i = nx - 1; i > 0; i--)
{
y[i] = d_prime[i] - c_prime[i] * y[i + 1];
}
return y;
}
void calc()
{
// First create the vectors to store the coefficients
vector<double> a(nx + 1, 1.0);
vector<complex<double> > b(nx + 1, 0.0);
vector<double> c(nx + 1, 1.0);
vector<complex<double> > d(nx + 1, 0.0);
vector<complex<double> > psi(nx + 1, 0.0);
vector<complex<double> > phi(nx + 1, 0.0);
vector<double> V(nx + 1, 0.0);
vector<double> x(nx + 1, 0);
vector<vector<complex<double> > > PSI(nt + 1,
vector<complex<double> >(nx + 1,
0.0));
vector<double> prob(nx + 1, 0);
// We don't have the first member of the left diagonal and the last member
// of the right diagonal
a[0] = 0.0;
c[nx] = 0.0;
for (int i = 0; i < nx + 1; i++)
{
x[i] = (-nx / 2) + i; // Values on the x axis
// Eigenfunction of the harmonic oscillator in the ground state
phi[i] = exp(-pow(x[i] * delta_x, 2) / 2) / (pow(M_PI, 0.25));
// Anharmonic potential
V[i] = pow(x[i] * delta_x, 2) / 2 + lambda * pow(x[i] * delta_x, 4);
// The main diagonal coefficients
b[i] = 2.0 * I / r - 2.0 + V[i] * delta_x * delta_x;
}
double sum0 = 0.0;
for (int i = 0; i < nx + 1; i++)
{
PSI[0][i] = phi[i]; // Initial condition for the wave function
sum0 += abs(pow(PSI[0][i], 2)); // Needed for the normalization
}
sum0 = sum0 * delta_x;
for (int i = 0; i < nx + 1; i++)
{
PSI[0][i] = PSI[0][i] / sqrt(sum0); // Normalization of the initial
// wave function
}
for (int j = 0; j < nt; j++)
{
PSI[j][0] = 0.0;
PSI[j][nx] = 0.0; // Boundary conditions for the wave function
d[0] = 0.0;
d[nx] = 0.0; // Boundary conditions for the rhs
// Fill in the current time step vector d representing the rhs
for (int i = 1; i < nx + 1; i++)
{
d[i] = PSI[j][i + 1]
+ (2.0 - 2.0 * I / r - V[i] * delta_x * delta_x) * PSI[j][i]
+ PSI[j][i - 1];
}
// Now solve the tridiagonal system
psi = thomas_algorithm(a, b, c, d);
for (int i = 1; i < nx; i++)
{
PSI[j + 1][i] = psi[i]; // Assign values to the wave function
}
for (int i = 0; i < nx + 1; i++)
{
// Probability density of the wave function in the next time step
prob[i] = abs(PSI[j + 1][i] * conj(PSI[j + 1][i]));
}
double sum = 0.0;
for (int i = 0; i < nx + 1; i++)
{
sum += prob[i] * delta_x;
}
for (int i = 0; i < nx + 1; i++)
{
// Normalization of the wave function in the next time step
PSI[j + 1][i] /= sqrt(sum);
}
}
// Opening files for writing the results
ofstream file_psi_re, file_psi_imag, file_psi_abs, file_potential,
file_phi0;
file_psi_re.open("psi_re.dat");
file_psi_imag.open("psi_imag.dat");
file_psi_abs.open("psi_abs.dat");
for (int i = 0; i < nx + 1; i++)
{
file_psi_re << fixed << x[i] << " ";
file_psi_imag << fixed << x[i] << " ";
file_psi_abs << fixed << x[i] << " ";
for (int j = 0; j < nt + 1; j++)
{
file_psi_re << fixed << setprecision(6) << PSI[j][i].real() << " ";
file_psi_imag << fixed << setprecision(6) << PSI[j][i].imag()
<< " ";
file_psi_abs << fixed << setprecision(6) << abs(PSI[j][i]) << " ";
}
file_psi_re << endl;
file_psi_imag << endl;
file_psi_abs << endl;
}
}
int main(int argc, char **argv)
{
calc();
return 0;
}
The black line is abs(psi), the red one is Im(psi) and the blue one is Re(psi).
(Bear in mind that my computational physics course was ten years ago now)
You say you are solving a time-dependent system, but I don't see any time-dependence (even if lambda != 0). In the Schrodinger Equation, if the potential function does not depend on time then the different equation is called separable because you can solve the time component and spatial component of the differential equation separately.
The general solution in that case is just the solution to the time-independent Schrodinger Equation multiplied by exp(-iE/h_bar). When you plot the magnitude of the probability that term just becomes 1 and so the probability doesn't change over time. In these cases people quite typically just ignore the time component altogether.
All this is to say that since your potential function doesn't depend on time then you aren't solving a time-dependent Schrodinger Equation. The Tridiagonal Matrix Algorithm can only be used to solve ordinary differential equations, whereas if your potential depended on time you would have a partial differential equation and would need a different method to solve it. Also as a result of that plotting the probability density over time is rarely interesting.
As for why your potential is not constant, numerical methods for finding eigenvalues and eigenvectors rarely produce the normalised eigenvectors naturally, so are you manually normalising your eigenvector before computing your probabilities?

Vectors and matrices in C++ for generating a spectrogram

This is my first attempt to generate a spectrogram of a sinusoidal signal with C++.
To generate the spectrogram:
I divided the real sinusoidal signal into B blocks
Applied Hanning window on each block (I assumed there is no overlap). This should give me the inputs for the fft, in[j][k] where k is the block number
Apply fft on in[j][k] for each block and store it.
Here is the script:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <fftw3.h>
#include <iostream>
#include <cmath>
#include <fstream>
using namespace std;
int main(){
int i;
int N = 500; // sampled
int Windowsize = 100;
double Fs = 200; // sampling frequency
double T = 1 / Fs; // sample time
double f = 50; // frequency
double *in;
fftw_complex *out;
double t[N]; // time vector
fftw_plan plan_forward;
std::vector<double> signal(N);
int B = N / Windowsize; //number of blocks
in = (double*)fftw_malloc(sizeof(double) * N);
out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
//Generating the signal
for(int i = 0; i < = N; i++){
t[i] = i * T;
signal[i] = 0.7 * sin(2 * M_PI * f * t[i]);// generate sine waveform
}
//Applying the Hanning window function on each block B
for(int k = 0; i <= B; k++){
for(int j = 0; j <= Windowsize; j++){
double multiplier = 0.5 * (1 - cos(2 * M_PI * j / (N-1))); // Hanning Window
in[j][k] = multiplier * signal[j];
}
plan_forward = fftw_plan_dft_r2c_1d (Windowsize, in, out, FFTW_ESTIMATE );
fftw_execute(plan_forward);
v[j][k]=(20 * log(sqrt(out[i][0] * out[i][0] + out[i][1] * out[i][1]))) / N;
}
fftw_destroy_plan(plan_forward);
fftw_free(in);
fftw_free(out);
return 0;
}
So, the question is: What is the correct way to declare in[j][k] and v[j][k] variables.
Update:I have declared my v [j] [k] as a matrix : double v [5][249]; according to this site :http://www.cplusplus.com/doc/tutorial/arrays/ so now my script looks like:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <fftw3.h>
#include <iostream>
#include <cmath>
#include <fstream>
using namespace std;
int main()
{
int i;
double y;
int N=500;//Number of pints acquired inside the window
double Fs=200;//sampling frequency
int windowsize=100;
double dF=Fs/N;
double T=1/Fs;//sample time
double f=50;//frequency
double *in;
fftw_complex *out;
double t[N];//time vector
double tt[5];
double ff[N];
fftw_plan plan_forward;
double v [5][249];
in = (double*) fftw_malloc(sizeof(double) * N);
out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
plan_forward = fftw_plan_dft_r2c_1d ( N, in, out, FFTW_ESTIMATE );
for (int i=0; i<= N;i++)
{
t[i]=i*T;
in[i] =0.7 *sin(2*M_PI*f*t[i]);// generate sine waveform
}
for (int k=0; k< 5;k++){
for (int i = 0; i<windowsize; i++){
double multiplier = 0.5 * (1 - cos(2*M_PI*i/(windowsize-1)));//Hanning Window
in[i] = multiplier * in[i+k*windowsize];
fftw_execute ( plan_forward );
for (int i = 0; i<= (N/2); i++)
{
v[k][i]=(20*log10(sqrt(out[i][0]*out[i][0]+ out[i][1]*out[i] [1])));//Here I have calculated the y axis of the spectrum in dB
}
}
}
for (int k=0; k< 5;k++)//Center time for each block
{
tt[k]=(2*k+1)*T*(windowsize/2);
}
fstream myfile;
myfile.open("example2.txt",fstream::out);
myfile << "plot '-' using 1:2" << std::endl;
for (int k=0; k< 5;k++){
for (int i = 0; i<= ((N/2)-1); i++)
{
myfile << v[k][i]<< " " << tt[k]<< std::endl;
}
}
myfile.close();
fftw_destroy_plan ( plan_forward );
fftw_free ( in );
fftw_free ( out );
return 0;
}
I do not get errors anymore but the spectrogram plot is not right.
As indicated in FFTW's documentation, the size of the output (out in your case) when using fftw_plan_dft_r2c_1d is not the same as the size of the input. More specifically for an input of N real samples, the output consists of N/2+1 complex values. You may then allocate out with:
out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (N/2 + 1));
For the spectrogram output you will then similarly have (N/2+1) magnitudes for each of the B blocks, resulting in the 2D array:
double** v = new double*[B];
for (int i = 0; i < B; i++){
v[i] = new double[(N/2+1)];
}
Also, note that you may reuse the input buffer in for each iteration (filling it with data for a new block). However since you have chosen to compute an N-point FFT and will be storing smaller blocks of Windowsize samples (in this case N=500 and Windowsize=100), make sure to initialize the remaining samples with zeros:
in = (double*)fftw_malloc(sizeof(double) * N);
for (int i = 0; i < N; i++){
in[i] = 0;
}
Note that in addition to the declaration and allocation of the in and v variables, the code you posted suffers from a few additional issues:
When computing the Hanning window, you should divide by the Windowsize-1 not N-1 (since in your case N correspond to the FFT size).
You are taking the FFT of the same block of signal over and over again since you are always indexing with j in the [0,Windowsize] range. You would most likely want to add an offset each time you process a different block.
Since the FFT size does not change, you only need to create the plan once. At the very least if you are going to create your plan at every iteration, you should similarly destroy it (with fftw_destroy_plan) at every iteration.
And a few additional points which may require some thoughts:
Scaling the log-scaled magnitudes by dividing by N might not do what you think. You are much more likely to want to scale the linear-scale magnitudes (ie. divide the magnitude before taking the logarithm). Note that this will result in a constant offset of the spectrum curve, which for many application is not that significant. If the scaling is important for your application, you may have a look at another answer of mine for more details.
The common formula 20*log10(x) typically used to convert linear scale to decibels uses a base-10 logarithm instead of the natural log (base e~2.7182) function which you've used. This would result in a multiplicative scaling (stretching), which may or may not be significant depending on your application.
To summarize, the following code might be more in line with what you are trying to do:
// Allocate & initialize buffers
in = (double*)fftw_malloc(sizeof(double) * N);
for (int i = 0; i < N; i++){
in[i] = 0;
}
out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (N/2 + 1));
v = new (double*)[B];
for (int i = 0; i < B; i++){
v[i] = new double[(N/2+1)];
}
// Generate the signal
...
// Create the plan once
plan_forward = fftw_plan_dft_r2c_1d (Windowsize, in, out, FFTW_ESTIMATE);
// Applying the Hanning window function on each block B
for(int k = 0; k < B; k++){
for(int j = 0; j < Windowsize; j++){
// Hanning Window
double multiplier = 0.5 * (1 - cos(2 * M_PI * j / (Windowsize-1)));
in[j] = multiplier * signal[j+k*Windowsize];
}
fftw_execute(plan_forward);
for (int j = 0; j <= N/2; j++){
// Factor of 2 is to account for the fact that we are only getting half
// the spectrum (the other half is not return by a R2C plan due to symmetry)
v[k][j] = 2*(out[j][0] * out[j][0] + out[j][1] * out[j][1])/(N*N);
}
// DC component and at Nyquist frequency do not have a corresponding symmetric
// value, so should not have been doubled up above. Correct those special cases.
v[k][0] *= 0.5;
v[k][N/2] *= 0.5;
// Convert to decibels
for (int j = 0; j <= N/2; j++){
// 20*log10(sqrt(x)) is equivalent to 10*log10(x)
// also use some small epsilon (e.g. 1e-5) to avoid taking the log of 0
v[k][j] = 10 * log10(v[k][j] + epsilon);
}
}
// Clean up
fftw_destroy_plan(plan_forward);
fftw_free(in);
fftw_free(out);
// Delete this last one after you've done something useful with the spectrogram
for (int i = 0; i < B; i++){
delete[] v[i];
}
delete[] v;
Looks like you're missing the initial declaration for 'v' altogether, and 'in' is not declared properly.
See this page for a related question about creating 2D arrays in C++. As I understand, fftw_malloc() is basically new() or malloc() but aligns the variable properly for the FFTW algorithm.
Since you're not supplying 'v' to the anything related to FFTW, you could use standard malloc() for that.

C++: replicating matlab's interp1 spline interpolation function

Can anyone give me some direction to replicating MATLAB's interp1 function, using spline interpolation? I tried closely replicating the algorithm on the wikipedia page, but the results don't really match up.
#include <stdio.h>
#include <stdint.h>
#include <iostream>
#include <vector>
//MATLAB: interp1(x,test_array,query_points,'spline')
int main(){
int size = 10;
std::vector<float> test_array(10);
test_array[0] = test_array[4] = test_array[8] = 1;
test_array[1] = test_array[3] = test_array[5] = test_array[7] = test_array[9] = 4;
test_array[2] = test_array[6] = 7;
std::vector<float> query_points;
for (int i = 0; i < 10; i++)
query_points.push_back(i +.05);
int n = (size - 1);
std::vector<float> a(n+1);
std::vector<float> x(n+1); //sample_points vector
for (int i = 0; i < (n+1); i++){
x[i] = i + 1.0;
a[i] = test_array[i];
}
std::vector<float> b(n);
std::vector<float> d(n);
std::vector<float> h(n);
for (int i = 0; i < (n); ++i)
h[i] = x[i+1] - x[i];
std::vector<float> alpha(n);
for (int i = 1; i < n; ++i)
alpha[i] = ((3 / h[i]) * (a[i+1] - a[i])) - ((3 / h[i-1]) * (a[i] - a[i-1]));
std::vector<float> c(n+1);
std::vector<float> l(n+1);
std::vector<float> u(n+1);
std::vector<float> z(n+1);
l[0] = 1.0;
u[0] = z[0] = 0.0;
for (int i = 1; i < n; ++i){
l[i] = (2 * (x[i+1] - x[i-1])) - (h[i-1] * u[i-1]);
u[i] = h[i] / l[i];
z[i] = (alpha[i] - (h[i-1] * z[i-1])) / l[i];
}
l[n] = 1.0;
z[n] = c[n] = 0.0;
for (int j = (n - 1); j >= 0; j--){
c[j] = z[j] - (u[j] * c[j+1]);
b[j] = ((a[j+1] - a[j]) / h[j]) - ((h[j] / 3) * (c[j+1] + (2 * c[j])));
d[j] = (c[j+1] - c[j]) / (3 * h[j]);
}
std::vector<float> output_array(10);
for (int i = 0; i < n-1; i++){
float eval_point = (query_points[i] - x[i]);
output_array[i] = a[i] + (eval_point * b[i]) + ( eval_point * eval_point * c[i]) + (eval_point * eval_point * eval_point * d[i]);
std::cout << output_array[i] << std::endl;
}
system("pause");
return 0;
}
In hindsight, your code seems to be coded properly referring to the Wikipedia article. However, there is something you need to know about interp1 which I don't think you have taken into account when using it to check your answers.
MATLAB's interp1 when you specify the spline flag assumes that the end point conditions are not-a-knot. The algorithm specified on Wikipedia is the code for a natural spline.
As such, this is probably why your points do not match up. FWIW, consult: http://www.cs.tau.ac.il/~turkel/notes/numeng/spline_note.pdf and look at the diagram on the last page. You'll see that not-a-knot splines and natural splines bear the same shape, but have different y-values when your data consists of just the end points of your spline. However, should you have data points in between the end points, all of the different kinds of splines (more or less) have the same y values.
For the sake of completeness, here is the figure extracted from the PDF notes I referenced above:
If you want to use natural splines, use csape instead of interp1. This provides a cubic spline with end conditions. You call csape like this:
pp = csape(x,y);
x and y are the control points defined for your spline. By default, this returns a natural spline, which is what you're after, and is a struct of type ppform. You can then figure out what the spline evaluates to by using fnval:
yval = fnval(pp, xval);
xval and yval is the input x co-ordinate and the output evaluated for the spline at this particular x.
Use this, then check to see if your code matches up with the values provided by csape.
Minor Note
You need the Curve Fitting Toolbox in MATLAB to use csape. If you don't have this, then unfortunately this method will not work.
I think the interp1 is supported by MATLAB CODER.
Just use the CODER to generate the C code and you have what you need.