When I compile the OpenACC code below with GCC 9.3.0 (g++), configured with --enable-languages=c,c++,lto --disable-multilib, it does not use multiple cores, whereas the same code compiled with the pgc++ compiler does use multiple cores.
g++ compilation: g++ -lgomp -Ofast -o jsolve -fopenacc jsolvec.cpp
pgc++ compilation: pgc++ -o jsolvec.exe jsolvec.cpp -fast -Minfo=opt -ta=multicore
Code from OpenACC Tutorial1/solver https://github.com/OpenACCuserGroup/openacc-users-group.git:
// Jacobi iterative method for solving a system of linear equations
// This is guaranteed to converge if the matrix is diagonally dominant,
// so we artificially force the matrix to be diagonally dominant.
// See https://en.wikipedia.org/wiki/Jacobi_method
//
// We solve for vector x in Ax = b
// Rewrite the matrix A as a
// lower triangular (L),
// upper triangular (U),
// and diagonal matrix (D).
//
// Ax = (L + D + U)x = b
//
// rearrange to get: Dx = b - (L+U)x --> x = (b-(L+U)x)/D
//
// we can do this iteratively: x_new = (b-(L+U)x_old)/D
// build with TYPE=double (default) or TYPE=float
// build with TOLERANCE=0.001 (default) or TOLERANCE= any other value
// three arguments:
// vector size
// maximum iteration count
// frequency of printing the residual (every n-th iteration)
#include <cmath>
#include <omp.h>
#include <cstdlib>
#include <iostream>
#include <iomanip>
using std::cout;
#ifndef TYPE
#define TYPE double
#endif
#ifndef TOLERANCE
#define TOLERANCE 0.001
#endif
void
init_simple_diag_dom(int nsize, TYPE* A)
{
int i, j;
// In a diagonally-dominant matrix, the diagonal element
// is greater than the sum of the other elements in the row.
// Scale the matrix so the sum of the row elements is close to one.
for (i = 0; i < nsize; ++i) {
TYPE sum;
sum = (TYPE)0;
for (j = 0; j < nsize; ++j) {
TYPE x;
x = (rand() % 23) / (TYPE)1000;
A[i*nsize + j] = x;
sum += x;
}
// Add the sum to the diagonal element so the row is diagonally dominant
A[i*nsize + i] += sum;
// scale the row so the final matrix is almost an identity matrix
for (j = 0; j < nsize; j++)
A[i*nsize + j] /= sum;
}
} // init_simple_diag_dom
int
main(int argc, char **argv)
{
int nsize; // A[nsize][nsize]
int i, j, iters, max_iters, riter;
double start_time, elapsed_time;
TYPE residual, err, chksum;
TYPE *A, *b, *x1, *x2, *xnew, *xold, *xtmp;
// set matrix dimensions and allocate memory for matrices
nsize = 0;
if (argc > 1)
nsize = atoi(argv[1]);
if (nsize <= 0)
nsize = 1000;
max_iters = 0;
if (argc > 2)
max_iters = atoi(argv[2]);
if (max_iters <= 0)
max_iters = 5000;
riter = 0;
if (argc > 3)
riter = atoi(argv[3]);
if (riter <= 0)
riter = 200;
cout << "nsize = " << nsize << ", max_iters = " << max_iters << "\n";
A = new TYPE[nsize*nsize];
b = new TYPE[nsize];
x1 = new TYPE[nsize];
x2 = new TYPE[nsize];
// generate a diagonally dominant matrix
init_simple_diag_dom(nsize, A);
// zero the x vectors, random values to the b vector
for (i = 0; i < nsize; i++) {
x1[i] = (TYPE)0.0;
x2[i] = (TYPE)0.0;
b[i] = (TYPE)(rand() % 51) / 100.0;
}
start_time = omp_get_wtime();
//
// jacobi iterative solver
//
residual = TOLERANCE + 1.0;
iters = 0;
xnew = x1; // swap these pointers in each iteration
xold = x2;
while ((residual > TOLERANCE) && (iters < max_iters)) {
++iters;
// swap input and output vectors
xtmp = xnew;
xnew = xold;
xold = xtmp;
#pragma acc parallel loop
for (i = 0; i < nsize; ++i) {
TYPE rsum = (TYPE)0;
#pragma acc loop reduction(+:rsum)
for (j = 0; j < nsize; ++j) {
if (i != j) rsum += A[i*nsize + j] * xold[j];
}
xnew[i] = (b[i] - rsum) / A[i*nsize + i];
}
//
// test convergence, sqrt(sum((xnew-xold)**2))
//
residual = 0.0;
#pragma acc parallel loop reduction(+:residual)
for (i = 0; i < nsize; i++) {
TYPE dif;
dif = xnew[i] - xold[i];
residual += dif * dif;
}
residual = sqrt((double)residual);
if (iters % riter == 0 ) cout << "Iteration " << iters << ", residual is " << residual << "\n";
}
elapsed_time = omp_get_wtime() - start_time;
cout << "\nConverged after " << iters << " iterations and " << elapsed_time << " seconds, residual is " << residual << "\n";
//
// test answer by multiplying my computed value of x by
// the input A matrix and comparing the result with the
// input b vector.
//
err = (TYPE)0.0;
chksum = (TYPE)0.0;
for (i = 0; i < nsize; i++) {
TYPE tmp;
xold[i] = (TYPE)0.0;
for (j = 0; j < nsize; j++)
xold[i] += A[i*nsize + j] * xnew[j];
tmp = xold[i] - b[i];
chksum += xnew[i];
err += tmp * tmp;
}
err = sqrt((double)err);
cout << "Solution error is " << err << "\n";
if (err > TOLERANCE)
cout << "****** Final Solution Out of Tolerance ******\n" << err << " > " << TOLERANCE << "\n";
delete[] A;
delete[] b;
delete[] x1;
delete[] x2;
return 0;
}
It's not yet supported in GCC to use OpenACC to schedule parallel loops onto multicore CPUs. Using OpenMP works for that, of course, and you can have code with mixed OpenACC (for GPU offloading, as already present in your code) and OpenMP directives (for CPU parallelization, not yet present in your code), so that the respective mechanism will be used depending on whether compiling with -fopenacc vs. -fopenmp.
As PGI demonstrates, this can certainly be supported in GCC, and we will certainly be able to implement it, but that work has not yet been scheduled or funded for GCC.
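For illustration, here is a minimal sketch (not part of the original code) of what such mixed directives could look like on the main solver loop; each set of pragmas only takes effect under its corresponding flag, so compile with exactly one of -fopenacc or -fopenmp:

// Sketch: the same Jacobi update annotated for both programming models.
// The pragmas of the model that is not enabled are simply ignored.
#pragma omp parallel for
#pragma acc parallel loop
for (int i = 0; i < nsize; ++i) {
    TYPE rsum = (TYPE)0;
    #pragma acc loop reduction(+:rsum)
    for (int j = 0; j < nsize; ++j) {
        if (i != j) rsum += A[i*nsize + j] * xold[j];
    }
    xnew[i] = (b[i] - rsum) / A[i*nsize + i];
}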
I have implemented a fully connected neural network with one hidden layer in C++, using Eigen for the matrix multiplication. It uses minibatch gradient descent.
However, my model cannot get above 50% accuracy on MNIST. I have tried learning rates between 0.0001 and 10. The model does overfit on training sizes < 100 (with ~90% accuracy, which is still pretty bad), albeit extremely slowly.
What might be causing this low accuracy and extremely slow learning? My main concern is that the backpropagation is incorrect. Also, I would prefer not to add any other optimization techniques (learning rate schedules, regularization, etc.).
Feed forward and backprop code:
z1 = (w1 * mbX).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
b2 = b2 - err * ones;
w2 = w2 - (err * a1.transpose());
err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
b1 = b1 - err * ones;
w1 = w1 - (err * mbX.transpose());
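For reference, the minibatch updates I am trying to implement (MSE loss, sigmoid activations; the factor eta = epsilon/minibatch_size is folded into err, and ⊙ denotes the elementwise product) are:

delta2 = (a2 - mbY) ⊙ sigmoid'(z2)
w2 <- w2 - eta * delta2 * a1^T,   b2 <- b2 - eta * (delta2 * ones)
delta1 = (w2^T * delta2) ⊙ sigmoid'(z1)
w1 <- w1 - eta * delta1 * mbX^T,  b1 <- b1 - eta * (delta1 * ones)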
Full program code:
#include <iostream>
#include <fstream>
#include <math.h>
#include <cstdlib>
#include <Eigen/Dense>
#include <vector>
#include <string>
#include <algorithm> // for std::random_shuffle
#include <cstdio>    // for _popen / fprintf
using namespace Eigen;
#define N 30
#define epsilon 0.7
#define epoch 1000
//sizes
const int minibatch_size = 10;
const int training_size = 10000;
const int val_size = 10;
unsigned int num, magic, rows, cols;
//images
unsigned int image[training_size][28][28];
unsigned int val_image[val_size][28][28];
//labels
unsigned int label[training_size];
unsigned int val_label[val_size];
//inputs
MatrixXd X(784, training_size);
MatrixXd Y = MatrixXd::Zero(10, training_size);
//minibatch
MatrixXd mbX(784, minibatch_size);
MatrixXd mbY = MatrixXd::Zero(10, minibatch_size);
//validation
MatrixXd Xv(784, val_size);
MatrixXd Yv = MatrixXd::Zero(10, val_size);
//Image processing courtesy of https://stackoverflow.com/users/11146076/%e5%bc%a0%e4%ba%91%e9%93%ad
unsigned int in(std::ifstream& icin, unsigned int size) {
unsigned int ans = 0;
for (int i = 0; i < size; i++) {
unsigned char x;
icin.read((char*)&x, 1);
unsigned int temp = x;
ans <<= 8;
ans += temp;
}
return ans;
}
void input(std::string ipath, std::string lpath, std::string ipath2, std::string lpath2) {
std::ifstream icin;
//training data
icin.open(ipath, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
for (int i = 0; i < training_size; i++) {
int val = 0;
for (int x = 0; x < rows; x++) {
for (int y = 0; y < cols; y++) {
image[i][x][y] = in(icin, 1);
X(val, i) = image[i][x][y]/255;
val++;
}
}
}
icin.close();
//training labels
icin.open(lpath, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4);
for (int i = 0; i < training_size; i++) {
label[i] = in(icin, 1);
Y(label[i], i) = 1;
}
icin.close();
//validation data
icin.open(ipath2, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
for (int i = 0; i < val_size; i++) {
int val = 0;
for (int x = 0; x < rows; x++) {
for (int y = 0; y < cols; y++) {
val_image[i][x][y] = in(icin, 1);
Xv(val, i) = val_image[i][x][y]/255;
val++;
}
}
}
icin.close();
//validation labels
icin.open(lpath2, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4);
for (int i = 0; i < val_size; i++) {
val_label[i] = in(icin, 1);
Yv(val_label[i], i) = 1;
}
icin.close();
}
//Neural Network calculations
MatrixXd sigmoid(MatrixXd m) {
m *= -1;
return (1/(1 + m.array().exp())).matrix();
}
MatrixXd sigmoid_derivative(MatrixXd m) {
return (sigmoid(m).array() * (1 - sigmoid(m).array())).matrix();
}
//Initialize weights and biases
//hidden layer
VectorXd b1 = MatrixXd::Zero(N, 1);
MatrixXd w1 = MatrixXd::Random(N, 784);
//output
VectorXd b2 = MatrixXd::Zero(10, 1);
MatrixXd w2 = MatrixXd::Random(10, N);
//Initialize intermediate values
MatrixXd z1, z2, a1, a2, z1v, z2v, a1v, a2v;
MatrixXd ones = MatrixXd::Constant(minibatch_size, 1, 1);
int main() {
input("C:\\Users\\Aaron\\Documents\\Test\\train-images-idx3-ubyte\\train-images.idx3-ubyte", "C:\\Users\\Aaron\\Documents\\Test\\train-labels-idx1-ubyte\\train-labels.idx1-ubyte", "C:\\Users\\Aaron\\Documents\\Test\\t10k-images-idx3-ubyte\\t10k-images.idx3-ubyte", "C:\\Users\\Aaron\\Documents\\Test\\t10k-labels-idx1-ubyte\\t10k-labels.idx1-ubyte");
std::cout << "Finished Image Processing" << std::endl;
//std::cout << w1 << std::endl;
std::vector<double> val_ac;
std::vector<double> c;
std::vector<int> order;
for (int i = 0; i < training_size; i++) {
order.push_back(i);
}
for (int i = 0; i < epoch; i++) {
//feed forward
std::random_shuffle(order.begin(), order.end());
for (int j = 0; j < training_size/minibatch_size; j++) {
for (int k = 0; k < minibatch_size; k++) {
int index = order[j * minibatch_size + k];
mbX.col(k) = X.col(index);
mbY.col(k) = Y.col(index);
}
z1 = (w1 * mbX).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
//std::cout << err << std::endl;
b2 = b2 - err * ones;
w2 = w2 - (err * a1.transpose());
err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
//std::cout << err << std::endl;
b1 = b1 - err * ones;
w1 = w1 - (err * mbX.transpose());
}
//validation
z1 = (w1 * X).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
double cost = 1/((double) training_size) * ((a2 - Y).array() * (a2 - Y).array()).matrix().sum();
c.push_back(cost);
int correct = 0;
for (int i = 0; i < training_size; i++) {
double maxP = -1;
int na;
for (int j = 0; j < 10; j++) {
if (a2(j, i) > maxP) {
maxP = a2(j, i);
na = j;
}
}
if (na == label[i]) correct++;
}
val_ac.push_back(((double) correct) / ((double) training_size));
std::cout << "Finished Epoch " << i + 1 << std::endl;
std::cout << "Cost: " << cost << std::endl;
std::cout << "Accuracy: " << ((double) correct) / ((double) training_size) << std::endl;
}
//plot accuracy
FILE * gp = _popen("gnuplot", "w");
fprintf(gp, "set terminal wxt size 600,400 \n");
fprintf(gp, "set grid \n");
fprintf(gp, "set title '%s' \n", "NN");
fprintf(gp, "plot '-' w line, '-' w lines \n");
for (int i = 0; i < epoch; i++) {
fprintf(gp, "%f %f \n", i + 1.0, c[i]);
}
fprintf(gp, "e\n");
//validation accuracy
for (int i = 0; i < epoch; i++) {
fprintf(gp, "%f %f \n", i + 1.0, val_ac[i]);
}
fprintf(gp, "e\n");
fflush(gp);
system("pause");
_pclose(gp);
return 0;
}
Update:
Here is a graph of the accuracy on the training dataset (green) and the loss (purple):
https://i.stack.imgur.com/Ya2yR.png
Here is a graph of the loss for the training data and validation data:
https://imgur.com/a/4gmFCrk
The loss of the validation data is increasing past a certain point, which shows signs of overfitting. However, the accuracy still remains abysmal even on the training data.
unsigned int val_image[val_size][28][28];
Xv(val, i) = val_image[i][x][y]/255;
Can you try again with Xv(val, i) = val_image[i][x][y] / 255.0;
The same applies here:
X(val, i) = image[i][x][y]/255;
With the code as written, Xv is very often 0, and is 1 only when the pixel has value 255. With floating-point division, you'll get values between 0.0 and 1.0.
You'll need to check your code for other places where you may be dividing integers.
N.b.: In C++, 240/255 is 0.
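A minimal, self-contained illustration (with a hypothetical pixel value):

#include <iostream>

int main()
{
    unsigned int pixel = 240;
    std::cout << pixel / 255 << "\n";   // integer division: prints 0
    std::cout << pixel / 255.0 << "\n"; // floating-point division: prints 0.941176
    return 0;
}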
I wrote a function that computes the inverse of a matrix, and then a multithreaded version, since I have to invert arrays larger than 2000 x 2000.
A 1000x1000 array unthreaded takes 2.5 seconds (on an i5-4460, 4 cores, 2.9 GHz),
and multithreaded it takes 7.25 seconds.
I put the threads in the part that takes the most time. What is wrong?
Is it because vectors are used instead of two-dimensional arrays?
This is the minimum code to test both versions:
#include<iostream>
#include <vector>
#include <stdlib.h>
#include <time.h>
#include <math.h>  // for fabs
#include <stdio.h> // for printf
#include <chrono>
#include <thread>
const int NUCLEOS = 8;
#ifdef __linux__
#include <unistd.h> //usleep()
typedef std::chrono::system_clock t_clock; //try to use high_resolution_clock on new linux x64 computer!
#else
typedef std::chrono::high_resolution_clock t_clock;
#pragma warning(disable:4996)
#endif
using namespace std;
std::chrono::time_point<t_clock> start_time, stop_time = start_time; char null_char = '\0';
void timer(char *title = 0, int data_size = 1) { stop_time = t_clock::now(); double us = (double)chrono::duration_cast<chrono::microseconds>(stop_time - start_time).count(); if (title) printf("%s time = %7lgms = %7lg MOPs\n", title, (double)us*1e-3, (double)data_size / us); start_time = t_clock::now(); }
//makes columns 0
void colum_zero(vector< vector<double> > &x, vector< vector<double> > &y, int pos0, int pos1,int dim, int ord);
//returns inverse of x, x is not modified, not threaded
vector< vector<double> > inverse(vector< vector<double> > x)
{
if (x.size() != x[0].size())
{
cout << "ERROR on inverse() not square array" << endl; getchar(); return{};//returns a null
}
size_t dim = x.size();
int i, j, ord;
vector< vector<double> > y(dim,vector<double>(dim,0));//initializes output = 0
//init_2Dvector(y, dim, dim);
//1. Unity array y:
for (i = 0; i < dim; i++)
{
y[i][i] = 1.0;
}
double diagon, coef;
double *ptrx, *ptry, *ptrx2, *ptry2;
for (ord = 0; ord<dim; ord++)
{
//2. Make the diagonal of x equal to 1
int i2;
if (fabs(x[ord][ord])<1e-15) //If that element is 0, a row that contains a non-zero is added
{
for (i2 = ord + 1; i2<dim; i2++)
{
if (fabs(x[i2][ord])>1e-15) break;
}
if (i2 >= dim)
return{};//error, returns null
for (i = 0; i<dim; i++)//add a row without a 0
{
x[ord][i] += x[i2][i];
y[ord][i] += y[i2][i];
}
}
diagon = 1.0/x[ord][ord];
ptry = &y[ord][0];
ptrx = &x[ord][0];
for (i = 0; i < dim; i++)
{
*ptry++ *= diagon;
*ptrx++ *= diagon;
}
//uses the same function but not threaded:
colum_zero(x,y,0,dim,dim,ord);
}//end ord
return y;
}
//threaded version
vector< vector<double> > inverse_th(vector< vector<double> > x)
{
if (x.size() != x[0].size())
{
cout << "ERROR on inverse() not square array" << endl; getchar(); return{};//returns a null
}
int dim = (int) x.size();
int i, ord;
vector< vector<double> > y(dim, vector<double>(dim, 0));//initializes output = 0
//init_2Dvector(y, dim, dim);
//1. Unity array y:
for (i = 0; i < dim; i++)
{
y[i][i] = 1.0;
}
std::thread tarea[NUCLEOS];
double diagon;
double *ptrx, *ptry;// , *ptrx2, *ptry2;
for (ord = 0; ord<dim; ord++)
{
//2. Make the diagonal of x equal to 1
int i2;
if (fabs(x[ord][ord])<1e-15) //If a diagonal element is 0, a row with a non-zero element in this column is added
{
for (i2 = ord + 1; i2<dim; i2++)
{
if (fabs(x[i2][ord])>1e-15) break;
}
if (i2 >= dim)
return{};//error, returns null
for (i = 0; i<dim; i++)//look for a row with a non-zero element in this column and add it, to avoid a later division by 0
{
x[ord][i] += x[i2][i];
y[ord][i] += y[i2][i];
}
}
diagon = 1.0 / x[ord][ord];
ptry = &y[ord][0];
ptrx = &x[ord][0];
for (i = 0; i < dim; i++)
{
*ptry++ *= diagon;
*ptrx++ *= diagon;
}
int pos0 = 0, N1 = dim;//initial array position
if ((N1<1) || (N1>5000))
{
cout << "It is detected out than 1-5000 simulations points=" << N1 << " ABORT or press enter to continue" << endl; getchar();
}
//cout << "Initiation of " << NUCLEOS << " threads" << endl;
for (int thread = 0; thread<NUCLEOS; thread++)
{
int pos1 = (int)((thread + 1)*N1 / NUCLEOS);//next position
tarea[thread] = std::thread(colum_zero, std::ref(x), std::ref(y), pos0, pos1, dim, ord);
pos0 = pos1;//next thread will work at next point
}
for (int thread = 0; thread<NUCLEOS; thread++)
{
tarea[thread].join();
//cout << "Thread num: " << thread << " end\n";
}
}//end ord
return y;
}
//makes columns 0
void colum_zero(vector< vector<double> > &x, vector< vector<double> > &y, int pos0, int pos1,int dim, int ord)
{
double coef;
double *ptrx, *ptry, *ptrx2, *ptry2;
//Zero out column ord, except the diagonal element:
for (int i = pos0; i<pos1; i++)//Begin to end for every thread
{
if (i == ord) continue;
coef = x[i][ord];//element to make 0
if (fabs(coef)<1e-15) continue; //If already zero, it is avoided
ptry = &y[i][0];
ptry2 = &y[ord][0];
ptrx = &x[i][0];
ptrx2 = &x[ord][0];
for (int j = 0; j < dim; j++)
{
*ptry++ -= coef * (*ptry2++);//1st matrix
*ptrx++ -= coef * (*ptrx2++);//2nd matrix
}
}
}
void test_6_inverse(int dim)
{
vector< vector<double> > vec1(dim, vector<double>(dim));
for (int i=0;i<dim;i++)
for (int j = 0; j < dim; j++)
{
vec1[i][j] = (-1.0 + 2.0*rand() / RAND_MAX) * 10000;
}
vector< vector<double> > vec2,vec3;
double ini, end;
ini = (double)clock();
vec2 = inverse(vec1);
end = (double)clock();
cout << "=== Time inverse unthreaded=" << (end - ini) / CLOCKS_PER_SEC << endl;
ini=end;
vec3 = inverse_th(vec1);
end = (double)clock();
cout << "=== Time inverse threaded=" << (end - ini) / CLOCKS_PER_SEC << endl;
cout<<vec2[2][2]<<" "<<vec3[2][2]<<endl;//use the results so the compiler cannot optimize the inverses away
cout << endl;
}
int main()
{
test_6_inverse(1000);
cout << endl << "=== END ===" << endl; getchar();
return 1;
}
After looking deeper into the code of the colum_zero() function, I have seen that one thread rewrites data that is still to be used by other threads, so the threads are not INDEPENDENT of each other. Fortunately, the compiler detects it and avoids it.
Conclusions:
The Gauss-Jordan method alone is not a good candidate for multithreading.
If the multithreaded version is slower even though the work is split correctly across the threads, it may be because one thread's results are used by another.
The main function inverse() works and can be used by other programmers, so this question should not be deleted.
Unanswered question:
Which matrix inversion method could be spread across many independent threads, e.g. to be used on a GPU?
This question might be long and I really appreciate your patience. The core problem is that I implemented an optimization algorithm in MATLAB and in C++, but they give me different results (MATLAB's are better).
I am recently studying some evolutionary algorithms and am interested in one variant of PSO (Particle Swarm Optimization) called the Competitive Swarm Optimizer (introduced in 2015). This is the paper link: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6819057.
The basic idea of this algorithm is to first generate some random particles in the searching space and assign them random velocities. At each iteration, we randomly pair them and let every pair of particles compare their objective function values. Winners (with better objective values) keep the status quo, while losers update themselves by learning from the winners (moving toward the winners).
Suppose at iteration t particles i and j are compared and i is better. Then we update particle j for iteration t+1 with these formulas:

v_j(t+1) = R_1 otimes v_j(t) + R_2 otimes (x_i(t) - x_j(t)) + phi * R_3 otimes (x_bar(t) - x_j(t))
x_j(t+1) = x_j(t) + v_j(t+1)

If particle j is out of the searching space, we simply pull it back to the boundary. R_1, R_2, R_3 are all random vectors drawn uniformly from [0, 1]; the operation 'otimes' means elementwise product; phi is a parameter; x_bar is the center of the swarm.
For example, suppose I now want to minimize a 500-d Schwefel function (minimize the maximal absolute element) and I use 250 particles, set phi = 0.1, and the searching space is 500-d [-100, 100]. MATLAB returns something around 35 while C++ gets stuck at 85 to 90. I cannot figure out what the problem is.
Let me attach my MATLAB and C++ code here.
Sch = @(x)max(abs(x))
lb = -100 * ones(1, 500);
ub = 100 * ones(1, 500);
swarmsize = 250;
phi = 0.1;
maxiter = 10000;
tic
cso(Sch, lb, ub, swarmsize, phi, maxiter);
toc
function [minf, minx] = cso(obj_fun, lb, ub, swarmsize, phi, maxiter)
assert(length(lb) == length(ub), 'Not equal length of bounds');
if all(ub - lb <= 0) > 0
error('Error. \n Upper bound must be greater than lower bound.')
end
vhigh = abs(ub - lb);
vlow = -vhigh;
S = swarmsize;
D = length(ub);
x = rand(S, D);
x = bsxfun(@plus, lb, bsxfun(@times, ub-lb, x)); % randomly initialize all particles
v = zeros([S D]); % set initial velocities to 0
iter = 0;
pairnum_1 = floor(S / 2);
losers = 1:S;
fx = arrayfun(@(K) obj_fun(x(K, :)), 1:S);
randperm_index = randperm(S);
while iter <= maxiter
fx(losers) = arrayfun(@(K) obj_fun(x(K, :)), losers);
swarm_center = mean(x); % calculate the center of all particles
randperm_index = randperm(S); % randomly permute all particle indexes
rpairs = [randperm_index(1:pairnum_1); randperm_index(S-pairnum_1+1:S)]'; % random pair
cmask= (fx(rpairs(:, 1)) > fx(rpairs(:, 2)))';
losers = bsxfun(@times, cmask, rpairs(:, 1)) + bsxfun(@times, ~cmask, rpairs(:, 2)); % losers: those with larger values
winners = bsxfun(@times, ~cmask, rpairs(:, 1)) + bsxfun(@times, cmask, rpairs(:, 2)); % winners: those with smaller values
R1 = rand(pairnum_1, D);
R2 = rand(pairnum_1, D);
R3 = rand(pairnum_1, D);
v(losers, :) = bsxfun(@times, R1, v(losers, :)) + bsxfun(@times, R2, x(winners, :) - x(losers, :)) + phi * bsxfun(@times, R3, bsxfun(@minus, swarm_center, x(losers, :)));
x(losers, :) = x(losers, :) + v(losers, :);
maskl = bsxfun(@lt, x(losers, :), lb);
masku = bsxfun(@gt, x(losers, :), ub);
mask = bsxfun(@lt, x(losers, :), lb) | bsxfun(@gt, x(losers, :), ub);
x(losers, :) = bsxfun(@times, ~mask, x(losers, :)) + bsxfun(@times, lb, maskl) + bsxfun(@times, ub, masku);
iter = iter + 1;
fprintf('Iter: %d\n', iter);
fprintf('Best fitness: %e\n', min(fx));
end
fprintf('Best fitness: %e\n', min(fx));
[minf, min_index] = min(fx);
minx = x(min_index, :);
end
(I didn't structure the C++ code as a function.)
#include <cstring>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <ctime>
#include <iomanip>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define rand_01 ((double) rand() / RAND_MAX) // generate 0~1 random numbers
#define PI 3.14159265359
const int numofdims = 500; // problem dimension
const int numofparticles = 250; // number of particles
const int halfswarm = numofparticles / 2;
const double phi = 0.1;
const int maxiter = 10000; // iteration number
double Sch(double X[], int d); // max(abs(x_i))
using namespace std;
int main(){
clock_t t1,t2;
t1=clock();
srand(time(0)); // random seed
double** X = new double*[numofparticles]; // X for storing all particles
for(int i=0; i<numofparticles; i++)
X[i] = new double[numofdims];
double** V = new double*[numofparticles]; // V for storing velocities
for(int i=0; i<numofparticles; i++)
V[i] = new double[numofdims];
double Xmin[numofdims] = {0}; // lower bounds
double Xmax[numofdims] = {0}; // upper bounds
double* fitnesses = new double[numofparticles]; // objective function values
for(int j=0; j<numofdims; j++)
{
Xmin[j] = -100;
Xmax[j] = 100;
}
for(int i=0; i<numofparticles; i++)
{
for(int j=0; j<numofdims; j++)
{
X[i][j] = Xmin[j] + rand_01 * (Xmax[j] - Xmin[j]); // initialize X
V[i][j] = 0; // initialize V
}
}
for(int i=0; i<numofparticles; i++)
{
fitnesses[i] = Sch(X[i], numofdims); //
}
double minfit = fitnesses[0]; // temporary minimal value
int minidx = 0; // temporary index of minimal value
int* idxofparticles = new int[numofparticles];
for(int i=0; i<numofparticles; i++)
idxofparticles[i] = i;
double* Xmean = new double[numofdims];
int* losers = new int[halfswarm]; // for saving losers indexes
for(int iter=0; iter<maxiter; iter++)
{
random_shuffle(idxofparticles, idxofparticles+numofparticles);
for(int j=0; j<numofdims; j++)
{
for(int i=0; i<numofparticles; i++)
{
Xmean[j] += X[i][j];
}
Xmean[j] = (double) Xmean[j] / numofparticles; // calculate swarm center
}
for(int i = 0; i < halfswarm; i++)
{
// indexes are now random
// compare 1st to (halfswarm+1)th, 2nd to (halfswarm+2)th, ...
if(fitnesses[idxofparticles[i]] < fitnesses[idxofparticles[i+halfswarm]])
{
losers[i] = idxofparticles[i+halfswarm];
for(int j = 0; j < numofdims; j++)
{
V[idxofparticles[i+halfswarm]][j] = rand_01 * V[idxofparticles[i+halfswarm]][j] + rand_01 * (X[idxofparticles[i]][j] - X[idxofparticles[i+halfswarm]][j]) + rand_01 * phi * (Xmean[j] - X[idxofparticles[i+halfswarm]][j]);
X[idxofparticles[i+halfswarm]][j] = min(max((X[idxofparticles[i+halfswarm]][j] + V[idxofparticles[i+halfswarm]][j]), Xmin[j]), Xmax[j]);
}
}
else
{
losers[i] = idxofparticles[i];
for(int j = 0; j < numofdims; j++)
{
V[idxofparticles[i]][j] = rand_01 * V[idxofparticles[i]][j] + rand_01 * (X[idxofparticles[i+halfswarm]][j] - X[idxofparticles[i]][j]) + rand_01 * phi * (Xmean[j] - X[idxofparticles[i]][j]);
X[idxofparticles[i]][j] = min(max((X[idxofparticles[i]][j] + V[idxofparticles[i]][j]), Xmin[j]), Xmax[j]);
}
}
}
// recalculate particles' values
for(int i=0; i<numofparticles; i++)
{
fitnesses[i] = Sch(X[i], numofdims);
if(fitnesses[i] < minfit)
{
minfit = fitnesses[i]; // update minimum
minidx = i; // update index
}
}
if(iter % 1000 == 0)
{
cout << scientific << endl;
cout << minfit << endl;
}
}
cout << scientific << endl;
cout << minfit << endl;
t2=clock();
delete [] X;
delete [] V;
delete [] fitnesses;
delete [] idxofparticles;
delete [] Xmean;
delete [] losers;
float diff ((float)t2-(float)t1);
float seconds = diff / CLOCKS_PER_SEC;
cout << "runtime: " << seconds << "s" <<endl;
return 0;
}
double Sch(double X[], int d)
{
double result=abs(X[0]);
for(int j=0; j<d; j++)
{
if(abs(X[j]) > result)
result = abs(X[j]);
}
return result;
}
So, finally, why can't my C++ code reproduce MATLAB's outcome? Thank you very much.
I wrote C++ code that solves the time-dependent 1D Schrodinger equation for the anharmonic potential V = x^2/2 + lambda*x^4, using the Thomas algorithm. My code works, and I animate the results in Mathematica to check what is going on. I test the code against the known solution for the harmonic potential (I set lambda = 0), but the animation shows that abs(Psi) changes with time, and I know that is not correct for the harmonic potential. Actually, I see that at one point in time it becomes constant, but before that it oscillates.
So I understand that I need the magnitude of the wave function to be constant over the time interval, but I don't know how to achieve that, or where I am making a mistake.
Here is my code and the animation for 100 time steps and 100 points on the grid.
#include <iostream>
#include <iomanip>
#include <cmath>
#include <vector>
#include <cstdlib>
#include <complex>
#include <fstream>
using namespace std;
// Mandatory parameters
const int L = 1; //length of domain in x direction
const int tmax = 10; //end time
const int nx = 100, nt = 100; //number of the grid points and time steps respectively
double lambda; //dictates the shape of the potential (we can use lambda = 0.0
// to test the code against the known solution for the harmonic
// oscillator)
complex<double> I(0.0, 1.0); //imaginary unit
// Derived parameters
double delta_x = 1. / (nx - 1);
//spacing between the grid points
double delta_t = 1. / (nt - 1);
//the time step
double r = delta_t / (delta_x * delta_x); //used to simplify expressions for
// the coefficients of the lhs and
// rhs of the matrix eqn
// Algorithm for solving the tridiagonal matrix system
vector<complex<double> > thomas_algorithm(vector<double>& a,
vector<complex<double> >& b,
vector<double>& c,
vector<complex<double> >& d)
{
// Temporary wave function
vector<complex<double> > y(nx + 1, 0.0);
// Modified matrix coefficients
vector<complex<double> > c_prime(nx + 1, 0.0);
vector<complex<double> > d_prime(nx + 1, 0.0);
// This updates the coefficients in the first row
c_prime[0] = c[0] / b[0];
d_prime[0] = d[0] / b[0];
// Create the c_prime and d_prime coefficients in the forward sweep
for (int i = 1; i < nx + 1; i++)
{
complex<double> m = 1.0 / (b[i] - a[i] * c_prime[i - 1]);
c_prime[i] = c[i] * m;
d_prime[i] = (d[i] - a[i] * d_prime[i - 1]) * m;
}
// This gives the value of the last equation in the system
y[nx] = d_prime[nx];
// This is the reverse sweep, used to update the solution vector
for (int i = nx - 1; i > 0; i--)
{
y[i] = d_prime[i] - c_prime[i] * y[i + 1];
}
return y;
}
void calc()
{
// First create the vectors to store the coefficients
vector<double> a(nx + 1, 1.0);
vector<complex<double> > b(nx + 1, 0.0);
vector<double> c(nx + 1, 1.0);
vector<complex<double> > d(nx + 1, 0.0);
vector<complex<double> > psi(nx + 1, 0.0);
vector<complex<double> > phi(nx + 1, 0.0);
vector<double> V(nx + 1, 0.0);
vector<double> x(nx + 1, 0);
vector<vector<complex<double> > > PSI(nt + 1,
vector<complex<double> >(nx + 1,
0.0));
vector<double> prob(nx + 1, 0);
// We don't have the first member of the left diagonal and the last member
// of the right diagonal
a[0] = 0.0;
c[nx] = 0.0;
for (int i = 0; i < nx + 1; i++)
{
x[i] = (-nx / 2) + i; // Values on the x axis
// Eigenfunction of the harmonic oscillator in the ground state
phi[i] = exp(-pow(x[i] * delta_x, 2) / 2) / (pow(M_PI, 0.25));
// Anharmonic potential
V[i] = pow(x[i] * delta_x, 2) / 2 + lambda * pow(x[i] * delta_x, 4);
// The main diagonal coefficients
b[i] = 2.0 * I / r - 2.0 + V[i] * delta_x * delta_x;
}
double sum0 = 0.0;
for (int i = 0; i < nx + 1; i++)
{
PSI[0][i] = phi[i]; // Initial condition for the wave function
sum0 += abs(pow(PSI[0][i], 2)); // Needed for the normalization
}
sum0 = sum0 * delta_x;
for (int i = 0; i < nx + 1; i++)
{
PSI[0][i] = PSI[0][i] / sqrt(sum0); // Normalization of the initial
// wave function
}
for (int j = 0; j < nt; j++)
{
PSI[j][0] = 0.0;
PSI[j][nx] = 0.0; // Boundary conditions for the wave function
d[0] = 0.0;
d[nx] = 0.0; // Boundary conditions for the rhs
// Fill in the current time step vector d representing the rhs
for (int i = 1; i < nx; i++) // interior points only; i = nx would read past the end of PSI[j]
{
d[i] = PSI[j][i + 1]
+ (2.0 - 2.0 * I / r - V[i] * delta_x * delta_x) * PSI[j][i]
+ PSI[j][i - 1];
}
// Now solve the tridiagonal system
psi = thomas_algorithm(a, b, c, d);
for (int i = 1; i < nx; i++)
{
PSI[j + 1][i] = psi[i]; // Assign values to the wave function
}
for (int i = 0; i < nx + 1; i++)
{
// Probability density of the wave function in the next time step
prob[i] = abs(PSI[j + 1][i] * conj(PSI[j + 1][i]));
}
double sum = 0.0;
for (int i = 0; i < nx + 1; i++)
{
sum += prob[i] * delta_x;
}
for (int i = 0; i < nx + 1; i++)
{
// Normalization of the wave function in the next time step
PSI[j + 1][i] /= sqrt(sum);
}
}
// Opening files for writing the results
ofstream file_psi_re, file_psi_imag, file_psi_abs, file_potential,
file_phi0;
file_psi_re.open("psi_re.dat");
file_psi_imag.open("psi_imag.dat");
file_psi_abs.open("psi_abs.dat");
for (int i = 0; i < nx + 1; i++)
{
file_psi_re << fixed << x[i] << " ";
file_psi_imag << fixed << x[i] << " ";
file_psi_abs << fixed << x[i] << " ";
for (int j = 0; j < nt + 1; j++)
{
file_psi_re << fixed << setprecision(6) << PSI[j][i].real() << " ";
file_psi_imag << fixed << setprecision(6) << PSI[j][i].imag()
<< " ";
file_psi_abs << fixed << setprecision(6) << abs(PSI[j][i]) << " ";
}
file_psi_re << endl;
file_psi_imag << endl;
file_psi_abs << endl;
}
}
int main(int argc, char **argv)
{
calc();
return 0;
}
The black line is abs(psi), the red one is Im(psi) and the blue one is Re(psi).
(Bear in mind that my computational physics course was ten years ago now)
You say you are solving a time-dependent system, but I don't see any time dependence (even if lambda != 0). In the Schrodinger equation, if the potential function does not depend on time, then the differential equation is called separable, because you can solve the time component and the spatial component of the differential equation separately.
The general solution in that case is just the solution to the time-independent Schrodinger equation multiplied by exp(-iEt/h_bar). When you plot the magnitude of the probability, that term just becomes 1, and so the probability doesn't change over time. In these cases people quite typically ignore the time component altogether.
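In symbols (the standard separation-of-variables result, written in the notation of the post):

Psi(x, t) = phi(x) * exp(-iEt/h_bar)
abs(Psi(x, t))^2 = abs(phi(x))^2 * abs(exp(-iEt/h_bar))^2 = abs(phi(x))^2

since abs(exp(-iEt/h_bar)) = 1 for real E.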
All this is to say that since your potential function doesn't depend on time, you aren't really solving a time-dependent Schrodinger equation. The tridiagonal matrix algorithm can only be used to solve ordinary differential equations, whereas if your potential depended on time you would have a partial differential equation and would need a different method to solve it. As a result, plotting the probability density over time is rarely interesting.
As for why your magnitude is not constant: numerical methods for finding eigenvalues and eigenvectors rarely produce normalised eigenvectors naturally, so are you manually normalising your eigenvector before computing your probabilities?
I am having trouble explaining/understanding the following phenomenon:
To test fftw3 I am using the 2D Poisson test case:
laplacian(f(x,y)) = -g(x,y), with periodic boundary conditions.
After applying the Fourier transform to the equation we obtain: F(kx,ky) = G(kx,ky)/(kx² + ky²) (1)
If I take g(x,y) = sin(x) + sin(y), with (x,y) in [0, 2π], I immediately have f(x,y) = g(x,y),
which is what I am trying to obtain with the FFT:
I compute G from g with a forward Fourier transform.
From this I can compute the Fourier transform of f with (1).
Finally, I compute f with the backward Fourier transform (without forgetting to normalize by 1/(nx*ny)).
In practice, the results are pretty bad.
(For instance, the amplitude for N = 256 is twice the amplitude obtained with N = 512.)
Even worse, if I try g(x,y) = sin(x)*sin(y), the curve does not even have the same shape as the solution.
(Note that I must change the equation in this case; I divide the Laplacian by two, so (1) becomes F(kx,ky) = 2*G(kx,ky)/(kx²+ky²).)
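(For reference, the factor of two follows from laplacian(sin(x)*sin(y)) = -2*sin(x)*sin(y): here kx = ky = 1, so kx² + ky² = 2, and recovering f = g from (1) requires the extra factor of 2.)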
Here is the code:
/*
* fftw test -- double precision
*/
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <fftw3.h>
using namespace std;
int main()
{
int N = 128;
int i, j ;
double pi = 3.14159265359;
double *X, *Y ;
X = (double*) malloc(N*sizeof(double));
Y = (double*) malloc(N*sizeof(double));
fftw_complex *out1, *in2, *out2, *in1;
fftw_plan p1, p2;
double L = 2.*pi;
double dx = L/(N - 1);
in1 = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N*N) );
out2 = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N*N) );
out1 = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N*N) );
in2 = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N*N) );
p1 = fftw_plan_dft_2d(N, N, in1, out1, FFTW_FORWARD,FFTW_MEASURE );
p2 = fftw_plan_dft_2d(N, N, in2, out2, FFTW_BACKWARD,FFTW_MEASURE);
for(i = 0; i < N; i++){
X[i] = -pi + i*dx ;
for(j = 0; j < N; j++){
Y[j] = -pi + j*dx ;
in1[i*N + j][0] = sin(X[i]) + sin(Y[j]) ; // row major ordering
//in1[i*N + j][0] = sin(X[i]) * sin(Y[j]) ; // 2nd test case
in1[i*N + j][1] = 0 ;
}
}
fftw_execute(p1); // FFT forward
for ( i = 0; i < N; i++){ // f = g / ( kx² + ky² )
for( j = 0; j < N; j++){
in2[i*N + j][0] = out1[i*N + j][0]/ (i*i+j*j+1e-16);
in2[i*N + j][1] = out1[i*N + j][1]/ (i*i+j*j+1e-16);
//in2[i*N + j][0] = 2*out1[i*N + j][0]/ (i*i+j*j+1e-16); // 2nd test case
//in2[i*N + j][1] = 2*out1[i*N + j][1]/ (i*i+j*j+1e-16);
}
}
fftw_execute(p2); //FFT backward
// checking the results computed
double erl1 = 0.;
for ( i = 0; i < N; i++) {
for( j = 0; j < N; j++){
erl1 += fabs( in1[i*N + j][0] - out2[i*N + j][0]/N/N )*dx*dx;
cout<< i <<" "<< j<<" "<< sin(X[i])+sin(Y[j])<<" "<< out2[i*N+j][0]/N/N <<" "<< endl; // > output
}
}
cout<< erl1 << endl ; // L1 error
fftw_destroy_plan(p1);
fftw_destroy_plan(p2);
fftw_free(out1);
fftw_free(out2);
fftw_free(in1);
fftw_free(in2);
return 0;
}
I can't find any (more) mistakes in my code (I installed the fftw3 library last week), I don't see a problem with the maths either, and I don't think it's the FFT's fault. Hence my predicament. I am all out of ideas and all out of Google as well.
Any help solving this puzzle would be greatly appreciated.
Note:
compiling : g++ test.cpp -lfftw3 -lm
executing : ./a.out > output
and I use gnuplot to plot the curves:
(in gnuplot) splot "output" u 1:2:4 (for the computed solution)
Here are a few points to be modified:
You need to account for all small frequencies, including the negative ones! Index i corresponds to the frequency 2π i/N, but also to the frequency 2π (i-N)/N. In Fourier space, the end of the array matters as much as the beginning! In our case, we keep the smallest-magnitude frequency: it's 2π i/N for the first half of the array, and 2π (i-N)/N for the second half (see the small helper sketched after these points).
Of course, as Paul said, N - 1 should be N in double dx = L/(N - 1); => double dx = L/N;. Using N - 1 does not correspond to a continuous periodic signal, so it would be hard to use it as a test case...
Scaling... I did it empirically.
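As an illustration of the first point, the index-to-wavenumber mapping could be written as a small helper (a sketch only; the full code below inlines the same logic):

// Map DFT index i in [0, N) to the smallest-magnitude integer wavenumber:
// i for the first half of the array, i - N for the second half.
int wavenumber(int i, int N)
{
    return (2 * i < N) ? i : i - N;
}

The denominator of (1) is then kx*kx + ky*ky with kx = wavenumber(i, N) and ky = wavenumber(j, N).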
The result I obtain is closer to the expected one, for both cases. Here is the code:
/*
* fftw test -- double precision
*/
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <fftw3.h>
using namespace std;
int main()
{
int N = 128;
int i, j ;
double pi = 3.14159265359;
double *X, *Y ;
X = (double*) malloc(N*sizeof(double));
Y = (double*) malloc(N*sizeof(double));
fftw_complex *out1, *in2, *out2, *in1;
fftw_plan p1, p2;
double L = 2.*pi;
double dx = L/(N );
in1 = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N*N) );
out2 = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N*N) );
out1 = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N*N) );
in2 = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N*N) );
p1 = fftw_plan_dft_2d(N, N, in1, out1, FFTW_FORWARD,FFTW_MEASURE );
p2 = fftw_plan_dft_2d(N, N, in2, out2, FFTW_BACKWARD,FFTW_MEASURE);
for(i = 0; i < N; i++){
X[i] = -pi + i*dx ;
for(j = 0; j < N; j++){
Y[j] = -pi + j*dx ;
in1[i*N + j][0] = sin(X[i]) + sin(Y[j]) ; // row major ordering
// in1[i*N + j][0] = sin(X[i]) * sin(Y[j]) ; // 2nd test case
in1[i*N + j][1] = 0 ;
}
}
fftw_execute(p1); // FFT forward
for ( i = 0; i < N; i++){ // f = g / ( kx² + ky² )
for( j = 0; j < N; j++){
double fact=0;
in2[i*N + j][0]=0;
in2[i*N + j][1]=0;
if(2*i<N){
fact=((double)i*i);
}else{
fact=((double)(N-i)*(N-i));
}
if(2*j<N){
fact+=((double)j*j);
}else{
fact+=((double)(N-j)*(N-j));
}
if(fact!=0){
in2[i*N + j][0] = out1[i*N + j][0]/fact;
in2[i*N + j][1] = out1[i*N + j][1]/fact;
}else{
in2[i*N + j][0] = 0;
in2[i*N + j][1] = 0;
}
//in2[i*N + j][0] = out1[i*N + j][0];
//in2[i*N + j][1] = out1[i*N + j][1];
// in2[i*N + j][0] = out1[i*N + j][0]*(1.0/(i*i+1e-16)+1.0/(j*j+1e-16)+1.0/((N-i)*(N-i)+1e-16)+1.0/((N-j)*(N-j)+1e-16))*N*N;
// in2[i*N + j][1] = out1[i*N + j][1]*(1.0/(i*i+1e-16)+1.0/(j*j+1e-16)+1.0/((N-i)*(N-i)+1e-16)+1.0/((N-j)*(N-j)+1e-16))*N*N;
//in2[i*N + j][0] = 2*out1[i*N + j][0]/ (i*i+j*j+1e-16); // 2nd test case
//in2[i*N + j][1] = 2*out1[i*N + j][1]/ (i*i+j*j+1e-16);
}
}
fftw_execute(p2); //FFT backward
// checking the results computed
double erl1 = 0.;
for ( i = 0; i < N; i++) {
for( j = 0; j < N; j++){
erl1 += fabs( in1[i*N + j][0] - out2[i*N + j][0]/(N*N))*dx*dx;
cout<< i <<" "<< j<<" "<< sin(X[i])+sin(Y[j])<<" "<< out2[i*N+j][0]/(N*N) <<" "<< endl; // > output
// cout<< i <<" "<< j<<" "<< sin(X[i])*sin(Y[j])<<" "<< out2[i*N+j][0]/(N*N) <<" "<< endl; // > output
}
}
cout<< erl1 << endl ; // L1 error
fftw_destroy_plan(p1);
fftw_destroy_plan(p2);
fftw_free(out1);
fftw_free(out2);
fftw_free(in1);
fftw_free(in2);
return 0;
}
This code is far from perfect; it is neither optimized nor beautiful. But it gives almost what is expected.
Bye,