The following fluid simulation is a translation of a paper by Stam. Something truly terrible has happened. Each time the program is run with a low DIFF=0.01, the values start off small and then rapidly expand, or "blow up". I have checked the math routines carefully. Since the code starts off with one 0.5, mathematically it is multiplying and adding a bunch of zeros, so the end result should be close to zero density and other vectors.
The code is quite long, so I've separated it into chunks and removed extra code. Minus all the beginning and SDL code there are only about 120 lines. I have spent a few hours trying changes to no avail, so help is greatly appreciated.
After some experimentation I believe there may be some floating-point error when DIFF is set too low. When the value is increased from 0.01 to 0.02, the values don't blow up. I don't think this is the entire issue, though.
To be clear, the current answers by 1201ProgramAlarm and vidstige do not resolve the problem.
Sections in bold are important parts, the rest is for completeness.
Beginning stuff, skip
#include <SDL2/SDL.h>
#include <stdio.h>
#include <iostream>
#include <algorithm>
#define IX(i,j) ((i)+(N+2)*(j))
using namespace std;
// Constants
const int SCREEN_WIDTH = 600;
const int SCREEN_HEIGHT = 600; // Should match SCREEN_WIDTH
const int N = 20; // Grid size
const int SIM_LEN = 1000;
const int DELAY_LENGTH = 40; // ms
const float VISC = 0.01;
const float dt = 0.1;
const float DIFF = 0.01;
const bool DISPLAY_CONSOLE = false; // Console or graphics
const bool DRAW_GRID = false; // implement later
const int nsize = (N+2)*(N+2);
Math routines Diffuse routines divide by 1+4*a. Does this imply density must be <= 1?
void set_bnd(int N, int b, vector<float> &x)
{
// removed
}
inline void lin_solve(int N, int b, vector<float> &x, vector<float> &x0, float a, float c)
{
for (int k=0; k<20; k++)
{
for (int i=1; i<=N; i++)
{
for (int j=1; j<=N; j++)
{
x[IX(i,j)] = (x0[IX(i,j)] + a*(x[IX(i-1,j)]+x[IX(i+1,j)]+x[IX(i,j-1)]+x[IX(i,j+1)])) / c;
}
}
set_bnd ( N, b, x );
}
}
// Add forces
void add_source(vector<float> &x, vector<float> &s, float dt)
{
for (int i=0; i<nsize; i++) x[i] += dt*s[i];
}
// Diffusion with Gauss-Seidel relaxation
void diffuse(int N, int b, vector<float> &x, vector<float> &x0, float diff, float dt)
{
float a = dt*diff*N*N;
lin_solve(N, b, x, x0, a, 1+4*a);
}
// Backwards advection
void advect(int N, int b, vector<float> &d, vector<float> &d0, vector<float> &u, vector<float> &v, float dt)
{
float dt0 = dt*N;
for (int i=1; i<=N; i++)
{
for (int j=1; j<=N; j++)
{
float x = i - dt0*u[IX(i,j)];
float y = j - dt0*v[IX(i,j)];
if (x<0.5) x=0.5; if (x>N+0.5) x=N+0.5;
int i0=(int)x; int i1=i0+1;
if (y<0.5) y=0.5; if (y>N+0.5) y=N+0.5;
int j0=(int)y; int j1=j0+1;
float s1 = x-i0; float s0 = 1-s1; float t1 = y-j0; float t0 = 1-t1;
d[IX(i,j)] = s0*(t0*d0[IX(i0,j0)] + t1*d0[IX(i0,j1)]) +
s1*(t0*d0[IX(i1,j0)] + t1*d0[IX(i1,j1)]);
}
}
set_bnd(N, b, d);
}
}
void project(int N, vector<float> &u, vector<float> &v, vector<float> &p, vector<float> &div)
{
float h = 1.0/N;
for (int i=1; i<=N; i++)
{
for (int j=1; j<=N; j++)
{
div[IX(i,j)] = -0.5*h*(u[IX(i+1,j)] - u[IX(i-1,j)] +
v[IX(i,j+1)] - v[IX(i,j-1)]);
p[IX(i,j)] = 0;
}
}
set_bnd(N, 0, div); set_bnd(N, 0, p);
lin_solve(N, 0, p, div, 1, 4);
for (int i=1; i<=N; i++)
{
for (int j=1; j<=N; j++)
{
u[IX(i,j)] -= 0.5*(p[IX(i+1,j)] - p[IX(i-1,j)])/h;
v[IX(i,j)] -= 0.5*(p[IX(i,j+1)] - p[IX(i,j-1)])/h;
}
}
set_bnd(N, 1, u); set_bnd(N, 2, v);
}
Density and velocity solver
// Density solver
void dens_step(int N, vector<float> &x, vector<float> &x0, vector<float> &u, vector<float> &v, float diff, float dt)
{
add_source(x, x0, dt);
swap(x0, x); diffuse(N, 0, x, x0, diff, dt);
swap(x0, x); advect(N, 0, x, x0, u, v, dt);
}
// Velocity solver: addition of forces, viscous diffusion, self-advection
void vel_step(int N, vector<float> &u, vector<float> &v, vector<float> &u0, vector<float> &v0, float visc, float dt)
{
add_source(u, u0, dt); add_source(v, v0, dt);
swap(u0, u); diffuse(N, 1, u, u0, visc, dt);
swap(v0, v); diffuse(N, 2, v, v0, visc, dt);
project(N, u, v, u0, v0);
swap(u0, u); swap(v0, v);
advect(N, 1, u, u0, u0, v0, dt); advect(N, 2, v, v0, u0, v0, dt);
project(N, u, v, u0, v0);
}
I considered floating-point inconsistencies, but after compiling with -ffloat-store the problem still persisted.
The problem is related to a lack of normalization in add_source().
When your density becomes sufficiently stationary (x0 very similar in distribution to x, up to a scale factor), then add_source() effectively multiplies x by about 1+dt, leading to your exponential blowup. High values of DIFF mask this effect by weighing x more heavily over x0 in lin_solve(), meaning that the effective multiplier is brought closer to 1, but is still above 1.
The effect, then is that with every iteration, more mass is added. If it cannot "spread out" fast enough at the edges, it will start piling up. Once the density becomes perfectly stationary, it will increase in mass at an exponential rate determined by 1+dt/(4a).
With your given settings (dt=0.1, a=0.1*0.01*20*20=0.4), this is 1+0.1/1.6 ~ 1.06.
The fix is to normalize in add_source:
x[i] = (x[i]+dt*s[i])/(1.0f+dt);
, or to compute the c argument to lin_solve() as 1+4*a+dt. Either will force the mass to drop.
One source of trouble is in lin_solve. Your i and j loops start at zero, but you reference IX(i-1,j), which will access the out of bounds array element x[-1].
Seeing this I immediately felt I had to answer. I read this article way back when it was published. I've implemented his stuff on Android and just love it. I even met the fellow when speaking at UmeƄ in the early 2000s, and he's a very friendly fellow. And tall. :)
So to the problem. You are not doing a velocity propagation step, I think this is critical for not "blowing up" if I remember correctly.
Related
I'm writing a program which gets the distances from five ultrasonic sensors and uses a trilateration algorithm on every combination of distance values. It first outputs all the coordinates it recieves from the trilateration algorithm, then it filters out values which are too small or too large. However, it keeps outputting values from previous trilateration cycles as the array has not been reset.
#include "NewPing.h"
#include<stdio.h>
#include<stdlib.h>
#define TP0 11
#define EP0 13
#define TP1 3
#define EP1 4
#define TP2 6
#define EP2 2
#define TP3 5
#define EP3 7
#define TP4 10
#define EP4 8
#define MAX_DISTANCE 150
//sensor coordinates
int xcords [5] = {3, 45, 87, 87, 3};
int ycords [5] = {0, 16, 0, 58, 58};
int r = 3;
int n = 5;
float x = 0;
float y = 0;
int MAX_X = 85;
int MAX_Y = 62;
int sensor;
int count = 0;
int resultCount = 0;
bool reading = false;
void setup() {
Serial.begin (9600);
}
void
loop() {
while(count < 1){
count += 1;
int distances [3];
for (int r = 0; r<5; r++){
distances[r] = getValue(r);
if (distances[r] >= MAX_DISTANCE || distances[r] <= 2) {
getValue(r);
}
//Serial.print("Distance =");
//Serial.print(distances[r]);
//Serial.println(" cm ");
}
printCombination(distances, xcords, ycords, n, r);
//trilateration(distances[0], distances[1], distances[2]);
delay(5000);
}
}
int combinationUtil(int distances[], int data[], int dataCords[], int start, int end, int index, int r, float xresults[], float yresults[]);
// Needed for qsort. See http://w...content-available-to-author-only...s.com/reference/cstdlib/qsort/
int compare (const void * a, const void * b)
{ return ( *(int*)a - *(int*)b ); }
// The main function that prints all combinations of size r
// in arr[] of size n. This function mainly uses combinationUtil()
int printCombination(int distances[], int xcords[], int ycords[], int n, int r)
{
// A temporary array to store all combination one by one
int data[r];
int xdataCords[r];
int ydataCords[r];
float xresults[10];// the arrays causing the issue
float yresults[10];
// Sort array to handle duplicates
qsort (distances, n, sizeof(int), compare);
// Print all combination using temprary array 'data[]'
combinationUtil(distances, data, xdataCords, ydataCords, 0, n-1, 0, r, xresults, yresults);
sorting(xresults,yresults);
resultCount = 0;
Serial.print("\n");
}
/* distances[] ---> Input Array
data[] ---> Temporary array to store current combination
start & end ---> Staring and Ending indexes in distances[]
index ---> Current index in data[]
r ---> Size of a combination to be printed */
float* combinationUtil(int distances[], int data[], int xdataCords[], int ydataCords[], int start, int end, int index, int r, float xresults[], float yresults[])
{
// Current combination is ready to be printed, print it
if (index == r)
{
trilateration(data[0], data[1], data[2], xdataCords[0], xdataCords[1], xdataCords[2], ydataCords[0], ydataCords[1], ydataCords[2], xresults, yresults);
resultCount++;
}
// replace index with all possible elements. The condition
// "end-i+1 >= r-index" makes sure that including one element
// at index will make a combination with remaining elements
// at remaining positions
for (int i=start; i<=end && end-i+1 >= r-index; i++)
{
data[index] = distances[i];
xdataCords[index] = xcords[i];
ydataCords[index] = ycords[i];
combinationUtil(distances, data, xdataCords, ydataCords, i+1, end, index+1, r, xresults, yresults);
// Remove duplicates
while (distances[i] == distances[i+1])
i++;
}
}
float* trilateration(int d1,int d2, int d3, int x1, int x2, int x3, int y1, int y2, int y3, float xresults[], float yresults[]){
float va = ((sq(d2) - sq(d3)) - (sq(x2) - sq(x3)) - (sq(y2) - sq(y3))) / (2) ;
float vb = ((sq(d2) - sq(d1)) - (sq(x2) - sq(x1)) - (sq(y2) - sq(y1))) / (2) ;
y = ((vb*(x3-x2)) - (va*(x1-x2))) / (((y1-y2)*(x3-x2)) - ((y3-y2)*(x1-x2)));
x = (va - (y*(y3-y2))) / (x3-x2);
//Serial.print(d1);
//Serial.print(d2);
//Serial.print(d3);
Serial.print("The coordinates are: ");
Serial.print(abs(x));
xresults[resultCount] = abs(x);
Serial.print(", ");
Serial.println(abs(y));
yresults[resultCount] = abs(y);
//sorting(xresults, yresults);
return xresults, yresults;
}
void sorting(float xresults[], float yresults[]){
float xfinal[10];
float yfinal[10];
qsort (xresults, 10, sizeof(int), compare);
qsort (yresults, 10, sizeof(int), compare);
for (int i = 0; i<10; i++){
if (xresults[i] > 3 && xresults[i] < MAX_X){
xfinal[i] = xresults[i];
Serial.print("x=");
Serial.print(xresults[i]);
Serial.print("\n");
}
if (yresults[i] > 10 && yresults[i] < MAX_Y){
yfinal[i] = yresults[i];
Serial.print("y=");
Serial.print(yresults[i]);
Serial.print("\n");
}
}
}
float getValue(int sensor){
NewPing sonar0(TP0, EP0, MAX_DISTANCE);
NewPing sonar1(TP1, EP1, MAX_DISTANCE);
NewPing sonar2(TP2, EP2, MAX_DISTANCE);
NewPing sonar3(TP3, EP3, MAX_DISTANCE);
NewPing sonar4(TP4, EP4, MAX_DISTANCE);
switch(sensor){
case 0:
return sonar0.ping_cm();
case 1:
return sonar1.ping_cm();
case 2:
return sonar2.ping_cm();
case 3:
return sonar3.ping_cm();
case 4:
return sonar4.ping_cm();
}
}
the arrays "xresults and yresults" is the array causing the problem. You can see that I have redefined it every time printCombination is called so it should be cleared right?
This is my output:
The coordinates are: 40.33, 34.88
The coordinates are: 58.62, 8.00
The coordinates are: 42.09, 8.00
The coordinates are: nan, 7.74
The coordinates are: 59.14, 24.17
The coordinates are: 47.81, 13.41
The coordinates are: 47.81, 7.74
x=40.25
x=58.56
y=24.16
x=47.81
y=13.41
x=47.81
x=32.99
New distance call
The coordinates are: 40.65, 35.72
The coordinates are: 58.62, 8.00
The coordinates are: 42.09, 8.00
The coordinates are: nan, 7.28
The coordinates are: 60.07, 25.03
The coordinates are: 47.81, 13.41
The coordinates are: 47.81, 7.28
x=40.50
x=58.56
y=35.56
x=47.81
y=13.41
x=47.81
x=32.99
New distance call
The coordinates are: 24.98, 15.69
The coordinates are: 50.48, 8.00
The coordinates are: 112.35, 8.00
The coordinates are: nan, 25.91
The coordinates are: 14.66, 84.64
The coordinates are: 91.00, 12.26
The coordinates are: 91.00, 25.91
x=14.63
x=50.31
y=15.70
y=12.26
y=25.91
x=32.99
As you can see there are values which are not in the list of coordinates appearing in the print out after the coordinates.
I'm learning C++ and am doing something I'm comfortable with in java to start out. Particle simulation and flocking using a quadtree to cheaply find particles in a region. Everything is working but when I use the quadtree to get the particles from a region it's really slow (about 1s for 5000 calls).
I tried replacing the vector with an array and measuring the execution time of various parts of the function.
Am I making any rooky mistakes like unnecessarily copying objects etc.? I'm using 5000 particles, I can't imagine 1fps is the fastest it can go.
Full code for minimal reproducable example as per request:
main.cpp
#include <string>
#include <iostream>
#include <random>
#include <chrono>
#include <thread>
#include <cmath>
#include "Particle.h"
#include "Quadtree.h"
// Clock
using namespace std::chrono;
using namespace std::this_thread;
// Global constants
const int SCREEN_WIDTH = 640;
const int SCREEN_HEIGHT = 480;
const int desiredFPS = 30;
const int frameTimeMS = int(1000 / (double)desiredFPS);
const int numberOfParticles = 5000;
// Random number generation
std::random_device dev;
std::mt19937 rng(dev());
std::uniform_real_distribution<> dist(0, 1);
Particle particles[numberOfParticles];
Quadtree quadtree = Quadtree(0, 0, SCREEN_WIDTH, SCREEN_HEIGHT);
int main(int argc, char* args[])
{
for (int i = 0; i < numberOfParticles; i++)
{
particles[i] = Particle(dist(rng) * SCREEN_WIDTH, dist(rng) * SCREEN_HEIGHT);
}
// Clock for making all frames equally long and achieving the desired framerate when possible
auto lapStartTime = system_clock::now();
// Main loop
for (int i = 0; i < 1; i++)
{
// Insert the particles into the quadtree
quadtree = Quadtree(0, 0, SCREEN_WIDTH, SCREEN_HEIGHT);
for (int i = 0; i < numberOfParticles; i++)
{
quadtree.insert(&particles[i]);
}
double neighbourhoodRadius = 40;
for (int i = 0; i < numberOfParticles; i++)
{
// THIS IS THE PART THAT IS SLOW
std::vector<Particle*> neighbours = quadtree.getCircle(
particles[i].x,
particles[i].y,
neighbourhoodRadius
);
}
// Update clocks
auto nextFrameTime = lapStartTime + milliseconds(frameTimeMS);
sleep_until(nextFrameTime);
lapStartTime = nextFrameTime;
}
return 0;
}
Quadtree.h
#pragma once
#include <vector>
#include "Particle.h"
#include "Rect.h"
class Quadtree
{
public:
const static int capacity = 10; // Capacity of any section
Quadtree(double px, double py, double width, double height);
Quadtree(Rect r);
bool insert(Particle* p); // Add a particle to the tree
std::vector<Particle*> getCircle(double px, double py, double r);
int numberOfItems(); // Total amount in the quadtree
private:
std::vector<Particle*> particles; // Particles stored by this section
std::vector<Quadtree> sections; // Branches (only if split)
Rect area; // Region occupied by the quadtree
bool isSplit() { return sections.size() > 0; }
void split(); // Split the quadtree into 4 branches
};
Quadtree.cpp
#include <iostream>
#include "Quadtree.h"
Quadtree::Quadtree(double px, double py, double width, double height)
{
area = Rect(px, py, width, height);
sections = {};
particles = {};
}
Quadtree::Quadtree(Rect r)
{
area = r;
sections = {};
particles = {};
}
bool Quadtree::insert(Particle* p)
{
if (area.intersectPoint(p->x, p->y))
{
if (!isSplit() && particles.size() < capacity)
{
particles.push_back(p);
}
else
{
if (!isSplit()) // Capacity is reached and tree is not split yet
{
split();
}
// That this is a reference is very important!
// Otherwise a copy of the tree will be modified
for (Quadtree& s : sections)
{
if (s.insert(p))
{
return true;
}
}
}
return true;
}
else
{
return false;
}
}
std::vector<Particle*> Quadtree::getCircle(double px, double py, double r)
{
std::vector<Particle*> selection = {};
if (!isSplit())
{
// Add all particles from this section that lie within the circle
for (Particle* p : particles)
{
double a = px - p->x;
double b = py - p->y;
if (a * a + b * b <= r * r)
{
selection.push_back(p);
}
}
}
else
{
// The section is split so add all the particles from the
// branches together
for (Quadtree& s : sections)
{
// Check if the branch and the circle even have any intersection
if (s.area.intersectRect(Rect(px - r, py - r, 2 * r, 2 * r)))
{
// Get the particles from the branch and add them to selection
std::vector<Particle*> branchSelection = s.getCircle(px, py, r);
selection.insert(selection.end(), branchSelection.begin(), branchSelection.end());
}
}
}
return selection;
}
void Quadtree::split()
{
sections.push_back(Quadtree(area.getSection(2, 2, 0, 0)));
sections.push_back(Quadtree(area.getSection(2, 2, 0, 1)));
sections.push_back(Quadtree(area.getSection(2, 2, 1, 0)));
sections.push_back(Quadtree(area.getSection(2, 2, 1, 1)));
std::vector<Particle*> oldParticles{ particles };
particles.clear();
for (Particle* p : oldParticles)
{
bool success = insert(p);
}
}
int Quadtree::numberOfItems()
{
if (!isSplit())
{
return particles.size();
}
else
{
int result = 0;
for (Quadtree& q : sections)
{
result += q.numberOfItems();
}
return result;
}
}
Particle.h
#pragma once
class Particle {
public:
double x;
double y;
Particle(double px, double py) : x(px), y(py) {}
Particle() = default;
};
Rect.h
#pragma once
class Rect
{
public:
double x;
double y;
double w;
double h;
Rect(double px, double py, double width, double height);
Rect() : x(0), y(0), w(0), h(0) {}
bool intersectPoint(double px, double py);
bool intersectRect(Rect r);
Rect getSection(int rows, int cols, int ix, int iy);
};
Rect.cpp
#include "Rect.h"
Rect::Rect(double px, double py, double width, double height)
{
x = px;
y = py;
w = width;
h = height;
}
bool Rect::intersectPoint(double px, double py)
{
return px >= x && px < x + w && py >= y && py < y + h;
}
bool Rect::intersectRect(Rect r)
{
return x + w >= r.x && y + h >= r.y && x <= r.x + r.w && y <= r.y + r.w;
}
Rect Rect::getSection(int cols, int rows, int ix, int iy)
{
return Rect(x + ix * w / cols, y + iy * h / rows, w / cols, h / rows);
}
So... In the original code creating the quadtree takes about 0.001s (relatively insignificant), and the neighbor search takes about 0.06s - here is our culprit (as mentioned by the OP).
Passing the std::vector<Particle*> neighbours as a reference to the getCircle function, gets rid of the insert call at the end of the function as well as new vector allocations (hi to everyone saying "oh, it will be optimized away automatically"). The time is reduced to 0.011s.
The nieghbours vector can be taken out of the main loop, and cleared after use, so that it only resizes on the first frame.
I do not see any more immediately obvious targets (without doing a complete rewrite). Maybe I will add something later.
I decided to approach this more systematically: I added an #if switch for every change I made and actually recorded some statistics, instead of eyeballing it. (Evey change is added incrementally, times include tree construction).
original
by reference
out of loop
min time:
0.0638s
0.0127s
0.0094s
avg time:
0.0664s
0.0136s
0.0104s
max time:
0.0713s
0.0157s
0.0137s
All measurements were done on my machine, with optimized build, using QueryPerfoemanceCounter.
I did end up rewriting the whole thing...
Got rid of vectors.
The Quadtree::particles is now Particle* particles[capacity] with a count.
sections is a pointer; isSplit just checks if sections is 0.
Since the total (or maximum) number of particles is known, the number of particles that can be returned by getCircle can't be more than that. So I allocate that much outside of the main loop to store neighbours. Adding another result involves just bumping a pointer (without even a check in release). And resetting it after use is done by setting the count to 0 (see arena or bump allocator).
The maximum number of quadtree nodes can be inferred from the number of particles. So, similarly, splitting just bumps the pointer by 4.
Trying to precompute the Rect in getCircle, or put px, py, r (and/or that rect as well) in a struct (passed as value or reference) does not yield any improvement (or is detremental). (was suggested by Goswin von Brederlow).
Then I flipped the recursion (was suggested by Ted Lyngmo). The temporary stack is, again, preallocated. And then I did the same thing for insert.
rewrite
non-recursive
insert as well
min_time:
0.0077
0.0069
0.0068
avg_time:
0.0089
0.0073
0.0070
max_time:
0.0084
0.0078
0.0074
So in the end the most impactful thing was the very first - not inserting and not creating unnecessary vectors every call, but instead passing the same one by reference.
One last thing - might want to store the quadtree particles separately, since most of the time getCircle is traversing nodes, where particles are not stored.
Otherwise, I do not see how to improve this any more. At this point it would require someone actually smart or crazy...
When using Rcpp,I create a function named rpois_rcpp and l try to call it below in genDataList function, an error occurs and said :
"no matching function for call to 'cpprbinom',
candidate function not viable: no known conversion from 'arma::vec' (aka 'Col') to 'Rcpp::NumericVector' (aka 'Vector<14>') for 3rd argument
arma::vec cpprbinom(int n, double size, NumericVector prob).
Can someone help me ,thanks!
Here is my code:
//create a random matrix X with covariance matrix sigma
// [[Rcpp::export]]
arma::mat mvrnormArma(const int n, arma::vec mu, const int p, const
double rho) {
arma::mat sigma(p, p, arma::fill::zeros);
for (int i = 0; i < sigma.n_rows; ++i) {
for (int j = 0; j < sigma.n_cols; ++j) {
sigma(i,j) = pow(rho, abs((i + 1) - (j + 1)));
}
}
int ncols = sigma.n_cols;
arma::mat Y = arma::randn(n, ncols);
return arma::repmat(mu, 1, n).t() + Y * arma::chol(sigma);
}
//create a vector sampled from poisson distribution with mean vector
//lambda
// [[Rcpp::export]]
arma::vec rpois_rcpp( NumericVector &lambda) {
int n= lambda.length();
unsigned int lambda_i = 0;
IntegerVector sim(n);
for (unsigned int i = 0; i < n; i++) {
sim[i] = R::rpois(lambda[lambda_i]);
// update lambda_i to match next realized value with correct mean
lambda_i++;
}
return as<arma::vec>(sim);
}
//create a vector sampled from binomial distribution with probability
vector prob
// [[Rcpp::export]]
arma::vec cpprbinom(int n, double size, NumericVector prob) {
NumericVector v = no_init(n);
std::transform( prob.begin(), prob.end(), v.begin(), [=](double p){
return R::rbinom(size, p); });
return as<arma::vec>(v);}
// [[Rcpp::export]]44
List genDataList(int n, arma::vec& mu, int p, double rho,
arma::vec& beta, const double SNR, const std::string &
Test_case) {
arma::mat U, V, data, normData, Projection;
arma::vec s, y, means, noise;
data = mvrnormArma(n, mu, p, rho);
normData = arma::normalise(data,2,0);
arma::svd_econ(U,s,V,normData,"right");
Projection = V * trans(V);
beta = Projection * beta;
if(Test_case == "gaussian")
{
means=normData * beta;
y = means + arma::randn(n) * sqrt(arma::var(means) / SNR);}
else if (Test_case == "poisson")
{
means=exp(normData * beta);
y = rpois_rcpp(means);}
else
{
means=exp(normData * beta)/(1 + exp(normData * beta));
y = cpprbinom(n,1,means);}
List ret;
ret["data"] = data;
ret["normData"] = normData;
ret["V"] = V;
ret["beta"] = beta;
ret["y"] = y;
return ret;
}
Thanks for adding your code. When I tried to compile, I got the same error as you, but also an error for the line calling rpois_rcpp()
invalid initialization of reference to type 'Rcpp::NumericVector&'
Pretty much everything seems to be in arma, except the R bindings and calls to the R:: namespace, which takes doubles, ints, etc. It seems the easiest thing to do (to my mind), is just take arma::vec as arguments instead:
arma::vec rpois_rcpp( arma::vec &lambda) {
int n= lambda.n_elem;
and
arma::vec cpprbinom(int n, double size, arma::vec prob) {
You never utilize the fact that lambda and prob are Rcpp::NumericVectors specifically, you just use doubles from them, so this seems the easiest route to me. After those changes, your code compiles fine on my machine. I don't have any test cases to make sure they run as you'd expect, but I imagine they will.
I am trying to optimise some code which runs unreasonably slowly for what is required. The top answer here describes the method I am trying (although I am not 100% sure I am implementing it correctly).
Only a few lines show up repeatedly on the top of the call stack as I pause the program randomly, however I do not know how I could increase the codes performance given these lines.
The essential function of the code is updating a lattice of points repeatedly using the values of the points surrounding a given point. The relevant code for the first line that comes up:
The class definition:
template<typename T> class lattice{
private:
const unsigned int N; //size
std::vector<std::vector<T>> lattice_points =
std::vector<std::vector<T>>(N,std::vector<T>(N)); //array of points
protected:
static double mod(double, double) ;
public:
lattice(unsigned int);
lattice(const lattice&);
lattice& operator=(const lattice&);
~lattice() {};
T point(int, int) const;
void set(int, int, T);
unsigned int size() const;
};
These lines show up quite often:
template <typename T>
T lattice<T>::point(int x, int y) const {
return (*this).lattice_points[x % N][y % N]; //mod for periodic boundaries
};
template <typename T>
void lattice<T>::set(int x, int y, T val) {
this->lattice_points[x % N][y % N] = val; //mod for periodic boundaries
};
They are used here:
angle_lattice update_lattice(const angle_lattice& lat, const parameters& par, double dt) {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<> dis(-0.5,0.5);
double sqrtdt = sqrt(dt);
angle_lattice new_lat(lat.size());
int N = lat.size();
for(int i=0; i < N; i++) {
for(int j=0; j < N; j++) {
double val = lat.point(i,j)+
dt*(-par.Dx*( sin_points(lat, i, j, i+1, j) + sin_points(lat, i, j, i-1, j) )
-par.Dy*( sin_points(lat, i, j, i, j+1) + sin_points(lat, i, j, i, j-1) )
-par.Lx/2*( cos_points(lat, i, j, i+1, j) + cos_points(lat, i, j, i-1, j) -2)
-par.Ly/2*( cos_points(lat, i, j, i, j+1) + cos_points(lat, i, j, i, j-1) -2))
+sqrtdt*2*M_PI*par.Cl*dis(gen);
new_lat.set(i,j,val);
}
}
return new_lat;
};
double sin_points(const angle_lattice& lat, int i1, int j1, int i2, int j2) {
return sin(lat.point(i1, j1) - lat.point(i2, j2));
};
double cos_points(const angle_lattice& lat, int i1, int j1, int i2, int j2) {
return cos(lat.point(i1, j1) - lat.point(i2, j2));
};
here angle_lattice is just a lattice where the template parameter is a angle. The set function is overloaded so that the angle is mod 2pi. The only other two functions that appear in the call stack are cos_points and sin_points , as well as generating the random number, but I assume the latter cannot be helped.
Is there anything that can be done? Help would be appreciated.
Edit: I changed the code following some of the suggestions and now the cosine and sine calculation are the highest. I am not sure what
I am trying to solve differential heat equation using Thomas algorithm.
Physical problem: We have plug, left side is having temperature 0, right side temperature is 1.
For Thomas algorithm I have written a function, which accept three QVector and int value amount of equations.
This is my code:
#include <QCoreApplication>
#include <QVector>
#include <QDebug>
#include <iostream>
using std::cin;
void enterIn(QVector<float> &Array, int Amount_of_elements){
int transit;
for(int i=0;i<Amount_of_elements;i++){
cin>>transit;
Array.push_back(transit);
}
}
QVector<float> shuttle_method(const QVector<float> &below_main_diagonal,
QVector<float> &main_diagonal,
const QVector<float> &beyond_main_diagonal,
const QVector<float> &free_term,
const int N){
QVector <float> c;
QVector <float> d;
for(int i=0;i<N;i++){
main_diagonal[i]*=(-1);
}
QVector<float> x; //result
c.push_back(beyond_main_diagonal[0]/main_diagonal[0]);
d.push_back(-free_term[0]/main_diagonal[0]);
for(int i=1;i<=N-2;i++){
c.push_back(beyond_main_diagonal[i]/(main_diagonal[i]-below_main_diagonal[i]*c[i-1]));
d.push_back( (below_main_diagonal[i]*d[i-1] - free_term[i]) / (main_diagonal[i]- below_main_diagonal[i]*c[i-1]) );
}
x.resize(N);
//qDebug()<<x.size()<<endl;
int n=N-1;
x[n]=(below_main_diagonal[n]*d[n-1]-free_term[n])/(main_diagonal[n]-below_main_diagonal[n]*c[n-1]);
for(int i=n-1;i>=0;i--){
x[i]=c[i]*x[i+1]+d[i];
// qDebug()<<x[i]<<endl;
}
return x;
}
int main()
{
QVector <float> alpha; // below
QVector <float> beta; // main diagonal * (-1)
QVector <float> gamma; // beyond
QVector <float> b; // free term
QVector<float> T;
int cells_x=40; //amount of equations
alpha.resize(cells_x);
beta.resize(cells_x);
gamma.resize(cells_x);
b.resize(cells_x);
T.resize(cells_x);
float dt=0.2,h=0.1;
alpha[0]=0;
for(int i=1;i<cells_x;i++){
alpha[i]= -dt/(h*h);
}
for(int i=0;i<cells_x;i++){
beta[i] = (2*dt)/(h*h)+1;
}
for(int i=0;i<cells_x-1;i++){
gamma[i]= -dt/(h*h);
}
gamma[cells_x-1]=0;
qDebug()<<"alpha= "<<endl<<alpha.size()<<alpha<<endl<<"beta = "<<endl<<beta.size()<<beta<<endl<<"gamma= "<<gamma.size()<<gamma<<endl;
for(int i=0;i<cells_x-1;i++){
T[i]=0;
}
T[cells_x-1]=1;
qDebug()<<endl<<endl<<T<<endl;
//qDebug()<< shuttle_method(alpha,beta,gamma,b,N);
QVector<float> Tn;
Tn.resize(cells_x);
Tn = shuttle_method(alpha,beta,gamma,T,cells_x);
Tn[0]=0;Tn[cells_x-1]=1;
for(int stepTime = 0; stepTime < 50; stepTime++){
Tn = shuttle_method(alpha,beta,gamma,Tn,cells_x);
Tn[0]=0;
Tn[cells_x-1]=1;
qDebug()<<Tn<<endl;
}
return 0;
}
My problem is:
when I compile and run it I am getting this:
Tn <20 items> QVector<float>
0 float
0.000425464 float
0.000664658 float
0.000937085 float
0.00125637 float
0.00163846 float
0.00210249 float
0.00267163 float
0.00337436 float
0.00424581 float
0.00532955 float
0.00667976 float
0.00836396 float
0.0104664 float
0.0130921 float
0.0163724 float
0.0204714 float
0.0255939 float
0.0319961 float
Tn <20 items> QVector<float>
0 float
-0.000425464 float
0.000643385 float
-0.000926707 float
0.00120951 float
-0.00161561 float
0.00202056 float
-0.00263167 float
0.00324078 float
-0.00418065 float
0.00511726 float
-0.00657621 float
0.00802998 float
-0.0103034 float
0.0125688 float
-0.0161171 float
0.0196527 float
-0.0251945 float
0.0307164 float
1 float
Tn <20 items> QVector<float>
0 float
0.000425464 float
0.000664658 float
0.000937085 float
0.00125637 float
0.00163846 float
0.00210249 float
0.00267163 float
0.00337436 float
0.00424581 float
0.00532955 float
0.00667976 float
0.00836396 float
0.0104664 float
0.0130921 float
0.0163724 float
0.0204714 float
0.0255939 float
0.0319961 float
Tn <20 items> QVector<float>
0 float
-0.000425464 float
0.000643385 float
-0.000926707 float
0.00120951 float
-0.00161561 float
0.00202056 float
-0.00263167 float
0.00324078 float
-0.00418065 float
0.00511726 float
-0.00657621 float
0.00802998 float
-0.0103034 float
0.0125688 float
-0.0161171 float
0.0196527 float
-0.0251945 float
0.0307164 float
1 float
Again and again in loop.
I have no idea why I am getting this.
Maybe my mistake is in assign Tn result of my Thomas-method-function?
or in realization of Thomas method? or in boundary conditions?
I got it!
Boundary conditions must be acting to vectors
QVector<float> below_main_diagonal,
QVector<float> main_diagonal,
QVector<float> beyond_main_diagonal
so that T[0] must be 0 and T[N-1] must be 1. We can do it this way:
main_diagonal.first()=1;
main_diagonal.last()=1;
beyond_main_diagonal.first()=0;
below_main_diagonal.last()=0;
and due to this T[0] will be always equal to zero and T[N-1] will be equal to 1;
And in the article where I read about Thomas method the first step was to negate main diagonal, I have done it, but then in the end of the function I must do reverse thing, so:
for(int i(0);i<N;++i){
main_diagonal[i]*=(-1);
}
and we can use this function again, this is not absolutely optimal, but it is working stable.
Then, the whole code will be look like this:
#include <QCoreApplication>
#include <QVector>
#include <QDebug>
#include <iostream>
QVector<float> Thomas_Algorithm( QVector<float> &below_main_diagonal ,
QVector<float> &main_diagonal ,
QVector<float> &beyond_main_diagonal ,
QVector<float> &free_term,
const int N){
QVector<float> x; //vector of result
// checking of input data
if(below_main_diagonal.size()!=main_diagonal.size()||
main_diagonal.size()!=beyond_main_diagonal.size()||
free_term.size()!=main_diagonal.size())
{ qDebug()<<"Error!\n"
"Error with accepting Arrays! Dimensities are different!"<<endl;
x.resize(0);
return x;
}
if(below_main_diagonal[0]!=0){
qDebug()<< "Error!\n"
"First element of below_main_diagonal must be equal to zero!"<<endl;
x.resize(0);
return x;
}
if(beyond_main_diagonal.last()!=0){
qDebug()<< "Error!\n"
"Last element of beyond_main_diagonal must be equal to zero!"<<endl;
x.resize(0);
return x;
}
// end of checking
QVector <float> c;
QVector <float> d;
for(int i=0;i<N;i++){
main_diagonal[i]*=(-1);
}
c.push_back(beyond_main_diagonal[0]/main_diagonal[0]);
d.push_back(-free_term[0]/main_diagonal[0]);
for(int i=1;i<=N-2;i++){
c.push_back(beyond_main_diagonal[i]/(main_diagonal[i]-below_main_diagonal[i]*c[i-1]));
d.push_back( (below_main_diagonal[i]*d[i-1] - free_term[i]) /
(main_diagonal[i]- below_main_diagonal[i]*c[i-1]) );
}
x.resize(N);
int n=N-1;
x[n]=(below_main_diagonal[n]*d[n-1]-free_term[n])/(main_diagonal[n]-below_main_diagonal[n]*c[n-1]);
for(int i=n-1;i>=0;i--){
x[i]=c[i]*x[i+1]+d[i];
}
for(int i(0);i<N;++i){
main_diagonal[i]*=(-1);
}
return x;
}
int main()
{
QVector <float> alpha; // below
QVector <float> beta; // main diagonal * (-1)
QVector <float> gamma; // beyond
QVector <float> b; // free term
QVector<float> T;
int cells_x=30; // amount of steps
alpha.resize(cells_x);
beta.resize(cells_x);
gamma.resize(cells_x);
T.resize(cells_x );
float dt=0.2,h=0.1;
alpha[0]=0;
for(int i=1;i<cells_x-1;i++){
alpha[i]= -dt/(h*h);
}
alpha[cells_x-1]=0;
beta[0]=1;
for(int i=1;i<cells_x-1;i++){
beta[i] = (2*dt)/(h*h)+1;
}
beta[cells_x-1]=1;
gamma[0]=0;
for(int i=1;i<cells_x-1;i++){
gamma[i]= -dt/(h*h);
}
gamma[cells_x-1]=0;
for(int i=0;i<cells_x-1;i++){
T[i]=0;
}
T[cells_x-1]=1;
QVector<float>Tn;
Tn.resize(cells_x);
Tn= Thomas_Algorithm(alpha,beta,gamma,T,cells_x);
// boundary conditions!
beta.first()=1;
beta.last()=1;
gamma.first()=0;
alpha.last()=0;
// and then due to bc we always have T[0]=0 and T[n]=1
for(int stepTime=0;stepTime<100;stepTime++){
Tn = Thomas_Algorithm(alpha,beta,gamma,Tn,cells_x);
qDebug()<<"stepTime = "<<stepTime<<endl<<endl;
qDebug()<<Tn<<endl;
// boundary conditions!
beta.first()=1;
beta.last()=1;
gamma.first()=0;
alpha.last()=0;
// and then due to bc we always have T[0]=0 and T[n]=1
}
return 0;
}
and in the last step we are going to get absolutely "physical" results:
Tn <30 items> QVector<float>
0 float
0.0344828 float
0.0689656 float
0.103448 float
0.137931 float
0.172414 float
0.206897 float
0.24138 float
0.275862 float
0.310345 float
0.344828 float
0.379311 float
0.413793 float
0.448276 float
0.482759 float
0.517242 float
0.551724 float
0.586207 float
0.62069 float
0.655173 float
0.689655 float
0.724138 float
0.758621 float
0.793104 float
0.827586 float
0.862069 float
0.896552 float
0.931035 float
0.965517 float
1 float