How to make the following Halide code more efficient? - c++

The code snippet below is running slower than expected. The authors of this paper http://www.cvlibs.net/publications/Geiger2010ACCV.pdf compute the support points of a 900x700 image in 118 ms. I have implemented their algorithm below in Halide.
In my algorithm, the nested for loops over the image height and width iterate over yi and xi, which are points in output_x and output_y (defined previously but not shown below). On each iteration of the nested loops, a vector top_k is computed and pushed back into support_points.
Realizing this pipeline even for left_buffer.width() == 20 and left_buffer.height() == 20 takes 500 ms, so this implementation is several orders of magnitude slower:
...
int k = 4; // # of support points
vector<pair<Expr, Expr>> support_points(k * left_buffer.width() * left_buffer.height());
// Calculate support pixel for each
Func support("support");
support(x, y) = Tuple(i32(0), i32(0), f32(0));
for (int yi = 0; yi < left_buffer.height(); yi++) {
    for (int xi = 0; xi < left_buffer.width() - 2; xi++) {
        bool left = xi < left_buffer.width() / 4;
        bool center = (xi >= left_buffer.width() / 4 && xi < left_buffer.width() * 3 / 4);
        bool right = xi >= left_buffer.width() * 3 / 4;
        vector<pair<Expr, Expr>> scan_range;
        pair<Expr, Expr> scan_height(0, (Expr) left_buffer.height());
        pair<Expr, Expr> scan_width;
        int which_pred = 0;
        if (left) {
            scan_width = make_pair((Expr) 0, (Expr) left_buffer.width() / 2);
            which_pred = 0;
        }
        else if (center) {
            scan_width = make_pair((Expr) xi - left_buffer.width() / 4, (Expr) left_buffer.width() / 2);
            which_pred = 1;
        }
        else if (right) {
            scan_width = make_pair((Expr) left_buffer.width() / 2, (Expr) left_buffer.width() / 2);
            which_pred = 2;
        }
        else {
            cout << "Error" << endl;
        }
        scan_range = {scan_width, scan_height};
        // cout << "xi " << xi << endl;
        // cout << "yi " << yi << endl;
        // cout << "scan_width= " << scan_width.first << " " << scan_width.second << endl;
        // cout << "scan_height= " << scan_height.first << " " << scan_height.second << endl;
        RDom scanner(scan_range);
        Expr predicate[3] = {scanner.x != xi && scanner.y != yi, scanner.x != 0 && scanner.y != 0, scanner.x != xi && scanner.y != yi};
        scanner.where(predicate[which_pred]);
        std::vector<Expr> top_k(k * 3);
        for (int i = 0; i < k; i++) { // say we want top 4 support points.
            top_k[3*i] = 10000.0f;
            top_k[3*i+1] = 0;
            top_k[3*i+2] = 0;
        }
        Func argmin("argmin");
        argmin() = Tuple(top_k);
        Expr next_val = abs(output_x(xi, yi) - output_x(scanner.x, scanner.y)) + abs(output_y(xi, yi) - output_y(scanner.x, scanner.y));
        Expr next_x = scanner.x;
        Expr next_y = scanner.y;
        top_k = Tuple(argmin()).as_vector();
        // Insert a single element into a sorted list without actually branching
        top_k.push_back(next_val);
        top_k.push_back(next_x);
        top_k.push_back(next_y);
        for (int i = k; i > 0; i--) {
            Expr prev_val = top_k[(i-1)*3];
            Expr prev_x = top_k[(i-1)*3 + 1];
            Expr prev_y = top_k[(i-1)*3 + 2];
            Expr should_swap = top_k[i*3] < prev_val;
            top_k[(i-1)*3] = select(should_swap, top_k[i*3], prev_val);
            top_k[(i-1)*3 + 1] = select(should_swap, top_k[i*3 + 1], prev_x);
            top_k[(i-1)*3 + 2] = select(should_swap, top_k[i*3 + 2], prev_y);
            top_k[i*3] = select(should_swap, prev_val, top_k[i*3]);
            top_k[i*3 + 1] = select(should_swap, prev_x, top_k[i*3 + 1]);
            top_k[i*3 + 2] = select(should_swap, prev_y, top_k[i*3 + 2]);
        }
        // Discard the k+1th element
        top_k.pop_back(); top_k.pop_back(); top_k.pop_back();
        bool cond = xi == 10 && yi == 10;
        cout << xi << " " << yi << " " << cond << endl;
        Expr e = argmin()[0];
        e = print_when(cond, e, "<- argmin() val");
        argmin() = Tuple(top_k);
        argmin.compute_root();
        // argmin.trace_stores();
        argmin.compile_to_lowered_stmt("argmin.html", {}, HTML);
        Realization real = argmin.realize();
        for (int i = 0; i < k; i++) {
            pair<Expr, Expr> c(top_k[3*i+1], top_k[3*i+2]);
            support_points.push_back(c);
        }
    }
}
double t2 = current_time();
cout << (t2-t1)/100 << " ms" << endl;
cout << "executed" << endl;
}
How can I improve efficiency?

It looks like you may be getting a bit confused between the stages of your program. With Halide, the C++ code that works with Exprs, Funcs, etc. is not actually evaluating anything; it is constructing a Halide program, which you can then compile and run. That means the C++ for loops, std::vectors, and so on that you're using all happen at construction time (essentially compile time) of the Halide program. You might think of it like C++ templates, which are evaluated at compile time, versus the C++ code they generate, which is evaluated when your program runs: the C++ code you're writing here plays the role of the template code with respect to the Halide program you are building.
This gets a bit more confusing with the ability to JIT-compile and evaluate a Halide program inside of the same C++ program that builds it (realize).
As it is, I suspect the above program doesn't actually compute the results you expect it to. After the double for loop, what are you hoping to do with support_points? What you have built there is a big array of expressions (pieces of code), not concrete values. And you are JIT-compiling and running a new piece of Halide code each time around those loops (i.e., for every pixel).
I think you may have an easier time understanding what you are building if you stick to ahead-of-time compilation (compile_to_file or generators) for now. That makes the two stages—Halide code generation time, and the runtime of that code inside a separate program—very distinct.
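To make the two stages concrete, here is a minimal sketch of what a single Halide pipeline for this kind of scan can look like. It is not the paper's full algorithm: it assumes output_x and output_y are float-valued Funcs you defined earlier, scans the whole image for every pixel instead of the three per-column ranges, and keeps only the single nearest candidate (via Halide's built-in argmin inline reduction) rather than the top k. The point is the structure: pure Vars in place of the C++ loops, one Func, one realize.
#include "Halide.h"
using namespace Halide;

Func nearest_support(Func output_x, Func output_y, int W, int H) {
    Var x("x"), y("y");
    RDom r(0, W, 0, H);  // candidate region; the paper's per-column ranges would go here
    // L1 distance to the candidate at (r.x, r.y); a large sentinel excludes the pixel itself
    Expr d = select(r.x == x && r.y == y, 1e30f,
                    abs(output_x(x, y) - output_x(r.x, r.y)) +
                    abs(output_y(x, y) - output_y(r.x, r.y)));
    Func support("support");
    support(x, y) = argmin(r, d);  // Tuple of (best r.x, best r.y, best distance) per pixel
    support.compute_root();
    return support;
}
You then JIT and run this once for the whole image, and read concrete coordinates back in ordinary C++ (realize({W, H}) is the spelling in recent Halide releases):
Realization out = nearest_support(output_x, output_y, W, H).realize({W, H});  // W = left_buffer.width(), H = left_buffer.height()
Buffer<int> sx = out[0], sy = out[1];  // support coordinates, now actual numbers
Extending this to the top-k selection is then a matter of adding update definitions (or the sorting-network trick you already wrote) inside this one pipeline, rather than building and JIT-compiling a new pipeline per pixel.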

Related

Add values serially in the same SIMD register

I'm trying to convert this to AVX2:
// parallel arrays
int16_t* Nums = ...
int16_t* Capacities = ...
int** Data = ...
int* freePointer = ...
for (int i = 0; i < n; i++)
{
    if (Nums[i] == 0)
        Capacities[i] = 0;
    else
    {
        Data[i] = freePointer;
        freePointer += Capacities[i];
    }
}
But didn't get too far:
for (int i = 0; i < n; i += 4) // 4 as Data is 64 bits
{
    const __m256i nums = _mm256_loadu_si256((__m256i*)&Nums[i]);
    const __m256i bZeroes = _mm256_cmpeq_epi16(nums, ZEROES256);
    const __m256i capacities = _mm256_loadu_si256((__m256i*)&Capacities[i]);
    const __m256i zeroedCapacities = _mm256_andnot_si256(bZeroes, capacities);
    _mm256_storeu_si256((__m256i*)&Capacities[i], zeroedCapacities);
}
I'm stuck at the else branch: I'm not sure how to add up (prefix sum?) the Capacities into freePointer and assign the "serial" results to Data within the same 256-bit SIMD register.
My terminology is probably off; I hope the code gets across what I'm trying to accomplish.
lane0: freePointer
lane1: freePointer + Capacities[i + 0]
lane2: freePointer + Capacities[i + 0] + Capacities[i + 1]
lane3: freePointer + Capacities[i + 0] + Capacities[i + 1] + Capacities[i + 2]
Basically this is what I want to do in as few instructions as possible, if at all possible. Target is AVX2.
You can find a lot of details here: https://stackoverflow.com/a/69452433/5021064
There you can plug in any type for T and U and see the resulting asm for x86 and ARM.
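For the prefix-sum part specifically, here is a hedged sketch of how the four Data pointers can be produced in one AVX2 register. It assumes sizeof(int) == 4, that Capacities[i..i+3] have already been zeroed where Nums is zero (as your first loop does), and that unconditionally storing Data[i] even for zeroed entries is acceptable; the name scan4 is just for illustration.
#include <immintrin.h>
#include <stdint.h>

// Data[i..i+3] = freePointer + exclusive prefix sum of Capacities[i..i+3];
// returns the advanced freePointer. Capacities are element counts of int.
static inline int* scan4(int** Data, int i, int* freePointer, const int16_t* Capacities)
{
    __m128i cap16 = _mm_loadl_epi64((const __m128i*)&Capacities[i]);        // 4 x i16
    __m256i cap   = _mm256_cvtepi16_epi64(cap16);                           // widen to 4 x i64
    // exclusive prefix sum across the 4 lanes: [0, c0, c0+c1, c0+c1+c2]
    __m256i e  = _mm256_permute4x64_epi64(cap, _MM_SHUFFLE(2, 1, 0, 0));    // [c0, c0, c1, c2]
    e = _mm256_blend_epi32(e, _mm256_setzero_si256(), 0x03);                // [0,  c0, c1, c2]
    __m256i s1 = _mm256_permute4x64_epi64(e, _MM_SHUFFLE(2, 1, 0, 0));      // [0,  0,  c0, c1]
    e = _mm256_add_epi64(e, s1);                                            // [0, c0, c0+c1, c1+c2]
    e = _mm256_add_epi64(e, _mm256_permute2x128_si256(e, e, 0x08));         // += [0, 0, 0, c0]
    // turn element counts into byte offsets (sizeof(int) == 4) and add the base pointer
    __m256i ptrs = _mm256_add_epi64(_mm256_set1_epi64x((int64_t)(intptr_t)freePointer),
                                    _mm256_slli_epi64(e, 2));
    _mm256_storeu_si256((__m256i*)&Data[i], ptrs);
    // advance freePointer by the total of the 4 capacities (lane 3 of the inclusive scan)
    __m128i hi = _mm256_extracti128_si256(_mm256_add_epi64(e, cap), 1);
    return freePointer + _mm_extract_epi64(hi, 1);
}
Inside your i += 4 loop you would call freePointer = scan4(Data, i, freePointer, Capacities);. Whether this actually beats the scalar loop is worth measuring, since the scalar version is mostly pointer bumps.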

Why is this ellipse drawing program so very slow?

I have a program to draw a grid of ellipses with a uniform phase distribution. However, it is very slow.
I'd like my code to be faster, so that I can use, for example, N = 150, M = 150.
How can I speed up this code?
N = 10;
M = 10;
y = 1;
x = 1;
a = 1;
b = 2;
for k = 1:N
    for m = 1:N
        w = rand(1,1);
        for l = 1:N
            for s = 1:N
                if(((l-x)*cos(w*pi)+(s-y)*sin(w*pi)).^2/a^2 + (-(l-x)*sin(w*pi) + (s-y)*cos(w*pi)).^2/b.^2 <= 1)
                    f(l,s) = 1*(cos(0.001)+i*sin(0.001));
                end
            end
        end
        y = y+4;
    end
    y = 1;
    x = x+5;
end
image(arg(f),'CDataMapping','scaled');
This is what the code produces:
Updated:
N = 10;
M = 10;
y = 1;
x = 1;
a = 1;
b = 2;
for x = 1:5:N
    for y = 1:4:N
        w = rand(1);
        for l = 1:N
            for s = 1:N
                if(((l-x).*cos(w.*pi)+(s-y).*sin(w.*pi)).^2/a.^2 + (-(l-x).*sin(w.*pi) + (s-y).*cos(w.*pi)).^2/b.^2 <= 1)
                    f(l,s) = cos(0.001)+i.*sin(0.001);
                end
            end
        end
    end
    y = 1;
end
image(arg(f),'CDataMapping','scaled');
There are many things you can do to speed up the computation. One important one is to remove loops and replace them with vectorized code: Octave works much faster when doing many computations at once, as opposed to one at a time.
For example, instead of
for l = 1:N
    for s = 1:N
        if(((l-x).*cos(w.*pi)+(s-y).*sin(w.*pi)).^2/a.^2 + (-(l-x).*sin(w.*pi) + (s-y).*cos(w.*pi)).^2/b.^2 <= 1)
            f(l,s) = cos(0.001)+i.*sin(0.001);
        end
    end
end
one can write
l = 1:N;
s = (1:N).';
index = ((l-x).*cos(w.*pi)+(s-y).*sin(w.*pi)).^2/a.^2 + (-(l-x).*sin(w.*pi) + (s-y).*cos(w.*pi)).^2/b.^2 <= 1;
f(index) = cos(0.001)+i.*sin(0.001);
However, here we still do too much work, because we compute index at locations that we know will be outside the extent of the ellipse. Ideally we'd find a smaller region around each point (x,y) that we know the ellipse fits into.
Another important thing to do is preallocate the array. f grows within the loop iterations. Instead, one should create f with its final size before the loop starts.
There are also many redundant computations being made. For example, w.*pi is computed multiple times, and so are its cos and sin. You also assign cos(0.001)+i.*sin(0.001) to output pixels over and over again; this could be a constant computed once.
The following code runs in MATLAB in a tiny fraction of a second (though Octave will be a lot slower). I've also separated N and M properly (so the output is not always square) and made the step sizes variables for improved understanding of the code. I'm assigning 1 to the ellipse elements; you can replace that with your constant afterwards by multiplying: f = f * (cos(0.001)+i*sin(0.001)).
N = 150;
M = 200;
a = 5;
b = 10;
x_step = 25;
y_step = 25;
f = zeros(N,M);
for x = x_step/2:x_step:M
    for y = y_step/2:y_step:N
        phi = rand(1)*pi;
        cosphi = cos(phi);
        sinphi = sin(phi);
        l = (1:M)-x;
        s = (1:N).'-y;
        index = (l*cosphi+s*sinphi).^2/a.^2 + (-l*sinphi + s*cosphi).^2/b.^2 <= 1;
        f(index) = 1;
    end
end
I'm not 100% sure what you're trying to do. See the code below and let me know. It took me about 30 s to run the 150 x 150 case; I'm not sure if that's fast enough.
[h,k] = meshgrid(0:150, 0:150);
a = 0.25;
b = 0.5;
phi = reshape( linspace( 0 , 2*pi , numel(h) ), size(h));
theta = linspace(0,2*pi,50)';
x = a*cos(theta);
y = b*sin(theta);
h = h(:);
k = k(:);
phi = phi(:);
ellipseX = arrayfun(@(H,K,F) x*cos(F)-y*sin(F) + H , h,k,phi, 'uni', 0);
ellipseY = arrayfun(@(H,K,F) x*sin(F)+y*cos(F) + K , h,k,phi, 'uni', 0);
ellipse = [ellipseX, ellipseY, repmat({'r'}, size(ellipseX))]';
fill(ellipse{:})

Exponential operator in C++ loop

I wrote C++ code and MATLAB code to compare speed. My C++ code is:
int nrow = dim[0], ncol = dim[1];
double tmp, ldot;
for (int k = ncol - 1; k >= 0; --k){
    grad[k] = 0;
    for (int j = nrow - 1; j >= 0; --j){
        tmp = exp(eta[j + nrow * k]);
        ldot = (-Z[j + nrow * k] + tmp / (1 + tmp));
        grad[k] += A[j] * ldot;
    }
}
My matlab code is:
prob = exp(eta);
prob = prob./(1+prob);
ldot = prob - Z;
grad=sum(repmat(A,1,nGWAS).*ldot);
I ran each version 100 times; it took over 5 seconds for C++ and only 1.2 seconds for MATLAB.
Can anyone help me here? Thanks.
The folks at matlab know very well how to optimize matrix access.
You chose to access it column by column. My initial guess is that the matrix is laid out in memory row by row. This causes your code to run over the whole matrix ncol times. Cache misses all over the place.

SSE optimization of Gaussian blur

I'm working on a school project. I have to optimize part of the code with SSE, but I've been stuck on one part for a few days now.
I don't see any smart way of using vector SSE instructions (inline assembly / intrinsics) in this code (it's part of a Gaussian blur algorithm). I would be glad if somebody could give me just a small hint.
for (int x = x_start; x < x_end; ++x) // vertical blur...
{
    float sum = image[x + (y_start - radius - 1)*image_w];
    float dif = -sum;
    for (int y = y_start - 2*radius - 1; y < y_end; ++y)
    { // inner vertical Radius loop
        float p = (float)image[x + (y + radius)*image_w]; // next pixel
        buffer[y + radius] = p; // buffer pixel
        sum += dif + fRadius*p;
        dif += p; // accumulate pixel blur
        if (y >= y_start)
        {
            float s = 0, w = 0; // border blur correction
            sum -= buffer[y - radius - 1]*fRadius; // addition for fraction blur
            dif += buffer[y - radius] - 2*buffer[y]; // sum up differences: +1, -2, +1
            // cut off accumulated blur area of pixel beyond the border
            // assume: added pixel values beyond border = value at border
            p = (float)(radius - y); // top part to cut off
            if (p > 0)
            {
                p = p*(p-1)/2 + fRadius*p;
                s += buffer[0]*p;
                w += p;
            }
            p = (float)(y + radius - image_h + 1); // bottom part to cut off
            if (p > 0)
            {
                p = p*(p-1)/2 + fRadius*p;
                s += buffer[image_h - 1]*p;
                w += p;
            }
            new_image[x + y*image_w] = (unsigned char)((sum - s)/(weight - w)); // set blurred pixel
        }
        else if (y + radius >= y_start)
        {
            dif -= 2*buffer[y];
        }
    } // for y
} // for x
One more feature you can use is logical operations and masks. For example, instead of:
// process only 1 float
if (p > 0)
    p = p*(p-1)/2 + fRadius*p;
you can write
// processes 4 floats at once; the comparison direction matches the scalar p > 0
const __m128 mask  = _mm_cmpgt_ps(p, _mm_setzero_ps());          // lanes where p > 0
const __m128 p_tmp = p*(p-1)/2 + fRadius*p;                      // computed for all lanes
p = _mm_or_ps(_mm_and_ps(p_tmp, mask), _mm_andnot_ps(mask, p));  // p_tmp where p > 0, else keep p
(The operator syntax on __m128 works with GCC/Clang vector extensions or an operator-overloading wrapper; otherwise spell it out with _mm_mul_ps, _mm_sub_ps, _mm_div_ps and _mm_add_ps.)
I can also recommend using a special library that overloads the operators for you, for example: http://code.compeng.uni-frankfurt.de/projects/vc
The dif variable makes the iterations of the inner loop dependent on each other, so you should try to parallelize the outer loop instead; without operator overloading, though, the code can become unmanageable. For instance, the outer x loop can process several adjacent columns at once, as sketched below.
Also consider rethinking the whole algorithm; the current one doesn't look parallel. Maybe you can give up a little precision, or accept a bit more scalar work?
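To give an idea of what "parallelize the outer loop" can look like, here is a rough sketch (my own, not drop-in code) that processes four adjacent columns per step of the x loop with plain SSE intrinsics. It assumes image and new_image have been converted to float buffers, keeps a 4-lane-wide copy of buffer, and omits the border-correction terms (s and w) entirely; those would be vectorized with the masking trick shown above. The function and parameter names are illustrative.
#include <xmmintrin.h>

// Vertical blur recurrence for columns x..x+3 at once; border correction omitted.
// buffer4 must hold image_h * 4 floats (4 lanes per buffered row).
void vertical_blur_4cols(const float* image, float* new_image, float* buffer4,
                         int image_w, int x, int y_start, int y_end,
                         int radius, float fRadius, float weight)
{
    const __m128 vfRadius = _mm_set1_ps(fRadius);
    const __m128 vInvW    = _mm_set1_ps(1.0f / weight);   // (weight - w) in the full version

    __m128 sum = _mm_loadu_ps(&image[x + (y_start - radius - 1) * image_w]);
    __m128 dif = _mm_sub_ps(_mm_setzero_ps(), sum);

    for (int y = y_start - 2 * radius - 1; y < y_end; ++y)
    {
        __m128 p = _mm_loadu_ps(&image[x + (y + radius) * image_w]);  // next pixel, 4 columns
        _mm_storeu_ps(&buffer4[(y + radius) * 4], p);                 // buffer pixel, 4 lanes
        sum = _mm_add_ps(sum, _mm_add_ps(dif, _mm_mul_ps(vfRadius, p)));
        dif = _mm_add_ps(dif, p);                                     // accumulate pixel blur
        if (y >= y_start)
        {
            __m128 b0 = _mm_loadu_ps(&buffer4[(y - radius - 1) * 4]);
            __m128 b1 = _mm_loadu_ps(&buffer4[(y - radius) * 4]);
            __m128 b2 = _mm_loadu_ps(&buffer4[y * 4]);
            sum = _mm_sub_ps(sum, _mm_mul_ps(b0, vfRadius));           // fraction-blur term
            dif = _mm_add_ps(dif, _mm_sub_ps(b1, _mm_add_ps(b2, b2))); // +1, -2, +1 differences
            _mm_storeu_ps(&new_image[x + y * image_w], _mm_mul_ps(sum, vInvW));
        }
        else if (y + radius >= y_start)
        {
            __m128 b = _mm_loadu_ps(&buffer4[y * 4]);
            dif = _mm_sub_ps(dif, _mm_add_ps(b, b));                  // dif -= 2*buffer[y]
        }
    }
}
Each lane carries an independent column, so the serial dependence through sum and dif stays inside a lane and the recurrence itself is unchanged; converting back to unsigned char and restoring the border terms is where the masking shown above comes in.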

6 dimensional integral by Trapezoid in Fortran using Fortran 90

I need to calculate six dimensional integrals using Trapezoid in Fortran 90 in an efficient way. Here is an example of what I need to do:
F is a numerical (i.e., not analytical) function which is to be integrated over the six variables x1 to x6. I have initially coded a one-dimensional subroutine:
SUBROUTINE trapzd(f,mass,x,nstep,deltam)
  INTEGER nstep,i
  DOUBLE PRECISION mass(nstep+1),f(nstep+1),x,deltam
  x=0.d0
  do i=1,nstep
    x=x+deltam*(f(i)+f(i+1))/2.d0
  end do
  return
END
This seems to work fine in one dimension; however, I don't know how to scale it up to six dimensions. Can I reuse it six times, once for each dimension, or should I write a new subroutine?
If you have a fully coded (no library/API use) version of this in another language like Python, MATLAB or Java, I'd be very glad to have a look and get some ideas.
P.S. This is not school homework. I am a PhD student in Biomedicine and this is part of my research in modeling stem cell activities. I do not have a deep background in coding and mathematics.
Thank you in advance.
You could look at the Monte Carlo integration chapter of the GNU Scientific Library (GSL), which is both a library and, since it is open source, source code that you can study.
Look at section 4.6 of Numerical Recipes in C.
Step one is to reduce the problem using symmetry and analytical dependencies.
Step two is to chain the solution like this (a short code sketch follows the chain):
f2(x2,x3,...,x6) = Integrate(f(x,x2,x3,...,x6), x, 1, x1end)
f3(x3,x4,...,x6) = Integrate(f2(x,x3,...,x6), x, 1, x2end)
f4(x4,...,x6) = ...
f6(x6) = Integrate(f5(x,x6), x, 1, x5end)
result = Integrate(f6(x), x, 1, x6end)
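To make the chaining concrete, here is a rough sketch in C++ (rather than Fortran) of reusing a single 1-D trapezoid rule recursively; the names trapezoid1d/trapezoidNd, the common limits a and b, and the uniform step count are illustrative, not part of the answer above.
#include <functional>
#include <vector>

// Composite trapezoid rule in one variable.
double trapezoid1d(const std::function<double(double)>& g, double a, double b, int nstep)
{
    double h = (b - a) / nstep;
    double s = 0.5 * (g(a) + g(b));
    for (int i = 1; i < nstep; ++i)
        s += g(a + i * h);
    return s * h;
}

// Integrate f(x1,...,xd) over [a,b]^d by chaining 1-D trapezoid rules: the integral over
// the current variable uses, as its integrand, the integral over all remaining variables
// (the f2, f3, ... chain above).
double trapezoidNd(const std::function<double(const std::vector<double>&)>& f,
                   double a, double b, int nstep, int dim,
                   std::vector<double>& point, int level = 0)
{
    if (level == dim)
        return f(point);                    // every coordinate fixed: evaluate F once
    auto g = [&](double x) {
        point[level] = x;                   // fix this dimension's coordinate
        return trapezoidNd(f, a, b, nstep, dim, point, level + 1);
    };
    return trapezoid1d(g, a, b, nstep);
}
Used as
std::vector<double> pt(6);
double result = trapezoidNd(F, 0.0, 1.0, 20, 6, pt);
this already costs 20^6, about 6.4e7, evaluations of F, which is why the other answers lean towards Monte Carlo or adaptive refinement in six dimensions.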
Direct evaluation of multiple integrals is computationally challenging. It might be better to use Monte Carlo, perhaps using importance sampling. However brute force direct integration is sometimes of interest for validation of methods.
The integration routine I use is "QuadMo", written by Luke Mo about 1970. I made it recursive and put it in a module. QuadMo refines the mesh where needed to get the requested integration accuracy. Here is a program that does an n-dimensional integral using QuadMo.
Here is the validation of the program using a Gaussian centered at 0.5 with SD 0.1 in all dimensions, for nDim up to 6, compiled with g95. It runs in a couple of seconds.
nDim   ans         expected    nlvl
  1    0.249       0.251       2
  2    6.185E-02   6.283E-02   2 2
  3    1.538E-02   1.575E-02   2 2 2
  4    3.826E-03   3.948E-03   2 2 2 2
  5    9.514E-04   9.896E-04   2 2 2 2 2
  6    2.366E-04   2.481E-04   2 2 2 2 2 2
Here is the code:
!=======================================================================
module QuadMo_MOD
implicit none
integer::QuadMo_MinLvl=6,QuadMo_MaxLvl=24
integer,dimension(:),allocatable::QuadMo_nlvlk
real*8::QuadMo_Tol=1d-5
real*8,save,dimension(:),allocatable::thet
integer,save::nDim
abstract interface
function QuadMoFunct_interface(thet,k)
real*8::QuadMoFunct_interface
real*8,intent(in)::thet
integer,intent(in),optional::k
end function
end interface
abstract interface
function MultIntFunc_interface(thet)
real*8::MultIntFunc_interface
real*8,dimension(:),intent(in)::thet
end function
end interface
procedure(MultIntFunc_interface),pointer :: stored_func => null()
contains
!----------------------------------------------------------------------
recursive function quadMoMult(funct,lower,upper,k) result(ans)
! very powerful integration routine written by Luke Mo
! then at the Stanford Linear Accelerator Center circa 1970
! QuadMo_Eps is error tolerance
! QuadMo_MinLvl determines initial grid of 2**(MinLvl+1) + 1 points
! to avoid missing a narrow peak, this may need to be increased.
! QuadMo_Nlvl returns number of subinterval refinements required beyond
! QuadMo_MaxLvl
! Modified by making recursive and adding argument k
! for multiple integrals (GuthrieMiller#gmail.com)
real*8::ans
procedure(QuadMoFunct_interface) :: funct
real*8,intent(in)::lower,upper
integer,intent(in),optional::k
real*8::Middle,Left,Right,eps,est,fLeft,fMiddle,fRight
& ,fml,fmr,rombrg,coef,estl,estr,estint,area,abarea
real*8::valint(50,2), Middlex(50), Rightx(50), fmx(50), frx(50)
& ,fmrx(50), estrx(50), epsx(50)
integer retrn(50),i,level
level = 0
QuadMo_nlvlk(k) = 0
abarea = 0
Left = lower
Right = upper
if(present(k))then
fLeft = funct(Left,k)
fMiddle = funct((Left+Right)/2,k)
fRight = funct(Right,k)
else
fLeft = funct(Left)
fMiddle = funct((Left+Right)/2)
fRight = funct(Right)
endif
est = 0
eps = QuadMo_Tol
100 level = level+1
Middle = (Left+Right)/2
coef = Right-Left
if(coef.ne.0) go to 150
rombrg = est
go to 300
150 continue
if(present(k))then
fml = funct((Left+Middle)/2,k)
fmr = funct((Middle+Right)/2,k)
else
fml = funct((Left+Middle)/2)
fmr = funct((Middle+Right)/2)
endif
estl = (fLeft+4*fml+fMiddle)*coef
estr = (fMiddle+4*fmr+fRight)*coef
estint = estl+estr
area= abs(estl)+ abs(estr)
abarea=area+abarea- abs(est)
if(level.ne.QuadMo_MaxLvl) go to 200
QuadMo_nlvlk(k) = QuadMo_nlvlk(k)+1
rombrg = estint
go to 300
200 if(( abs(est-estint).gt.(eps*abarea)).or.
1(level.lt.QuadMo_MinLvl)) go to 400
rombrg = (16*estint-est)/15
300 level = level-1
i = retrn(level)
valint(level, i) = rombrg
go to (500, 600), i
400 retrn(level) = 1
Middlex(level) = Middle
Rightx(level) = Right
fmx(level) = fMiddle
fmrx(level) = fmr
frx(level) = fRight
estrx(level) = estr
epsx(level) = eps
eps = eps/1.4d0
Right = Middle
fRight = fMiddle
fMiddle = fml
est = estl
go to 100
500 retrn(level) = 2
Left = Middlex(level)
Right = Rightx(level)
fLeft = fmx(level)
fMiddle = fmrx(level)
fRight = frx(level)
est = estrx(level)
eps = epsx(level)
go to 100
600 rombrg = valint(level,1)+valint(level,2)
if(level.gt.1) go to 300
ans = rombrg /12
end function quadMoMult
!-----------------------------------------------------------------------
recursive function MultInt(k,func) result(ans)
! MultInt(nDim,func) returns multi-dimensional integral from 0 to 1
! in all dimensions of function func
! variable QuadMo_Mod: nDim needs to be set initially to number of dimensions
procedure(MultIntFunc_interface) :: func
real*8::ans
integer,intent(in)::k
stored_func => func
if(k.eq.nDim)then
if(allocated(thet))deallocate(thet)
allocate (thet(nDim))
if(allocated(QuadMo_nlvlk))deallocate(QuadMo_nlvlk)
allocate(QuadMo_nlvlk(nDim))
endif
if(k.eq.0)then
ans=func(thet)
return
else
ans=QuadMoMult(MultIntegrand,0d0,1d0,k)
endif
end function MultInt
!-----------------------------------------------------------------------
recursive function MultIntegrand(thetARG,k) result(ans)
real*8::ans
real*8,intent(in)::thetARG
integer,optional,intent(in)::k
if(present(k))then
thet(k)=thetARG
else
write(*,*)'MultIntegrand: not expected, k not present!'
stop
endif
ans=MultInt(k-1,stored_func)
end function MultIntegrand
!-----------------------------------------------------------------------
end module QuadMo_MOD
!=======================================================================
module test_MOD
use QuadMo_MOD
implicit none
contains
!-----------------------------------------------------------------------
real*8 function func(thet) ! multidimensional function
! this is the function defined in nDim dimensions
! in this case a Gaussian centered at 0.5 with SD 0.1
real*8,intent(in),dimension(:)::thet
func=exp(-sum(((thet-5d-1)/1d-1)
& *((thet-5d-1)/1d-1))/2)
end function func
!-----------------------------------------------------------------------
end module test_MOD
!=======================================================================
! test program to evaluate multiple integrals
use test_MOD
implicit none
real*8::ans
! these values are set for speed, not accuracy
QuadMo_MinLvl=2
QuadMo_MaxLvl=3
QuadMo_Tol=1d-1
write(*,*)' nDim ans expected nlvl'
do nDim=1,6
! expected answer is (0.1 sqrt(2pi))**nDim
ans=MultInt(nDim,func)
write(*,'(i10,2(1pg10.3),999(i3))')nDim,ans,(0.250663)**nDim
& ,QuadMo_nlvlk
enddo
end
!-----------------------------------------------------------------------
Here is the same program translated to C++:
#include <cstdio>
#include <cmath>
using namespace std;

double MultInt(int k);
double MultIntegrand(double thetARG, int k);
double quadMoMult(double(*funct)(double, int), double lower, double upper, int k);
double funkn(double *thet);
int QuadMo_MinLvl = 2;
int QuadMo_MaxLvl = 3;
double QuadMo_Tol = 0.1;
int *QuadMo_nlvlk;
double *thet;
int nDim;
//double MultInt(int k, double(*func)(double *))
double MultInt(int k)
{
//MultInt(nDim, func) returns multi - dimensional integral from 0 to 1
//in all dimensions of function func
double ans;
if (k == 0)
{
ans = funkn(thet);
}
else
{
ans = quadMoMult(MultIntegrand, 0.0, 1.0, k); //limits hardcoded here
}
return ans;
}
double MultIntegrand(double thetARG, int k)
{
double ans;
if (k > 0)
thet[k] = thetARG;
else
printf("\n***MultIntegrand: not expected, k not present!***\n");
//Recursive call
//ans = MultInt(k - 1, func);
ans = MultInt(k - 1);
return ans;
}
double quadMoMult(double(*funct)(double, int), double lower, double upper, int k)
{
//Integration routine written by Luke Mo
//Stanford Linear Accelerator Center circa 1970
//QuadMo_Eps is error tolerance
//QuadMo_MinLvl determines initial grid of 2 * *(MinLvl + 1) + 1 points
//to avoid missing a narrow peak, this may need to be increased.
//QuadMo_Nlvl returns number of subinterval refinements required beyond
//QuadMo_MaxLvl
//Modified by making recursive and adding argument k
//for multiple integrals(GuthrieMiller#gmail.com)
double ans;
double Middle, Left, Right, eps, est, fLeft, fMiddle, fRight;
double fml, fmr, rombrg, coef, estl, estr, estint, area, abarea;
double valint[51][3], Middlex[51], Rightx[51], fmx[51], frx[51]; //Jack up arrays
double fmrx[51], estrx[51], epsx[51];
int retrn[51];
int i, level;
level = 0;
QuadMo_nlvlk[k] = 0;
abarea = 0.0;
Left = lower;
Right = upper;
if (k > 0)
{
fLeft = funct(Left, k);
fMiddle = funct((Left + Right) / 2, k);
fRight = funct(Right, k);
}
else
{
fLeft = funct(Left,0);
fMiddle = funct((Left + Right) / 2,0);
fRight = funct(Right,0);
}
est = 0.0;
eps = QuadMo_Tol;
l100:
level = level + 1;
Middle = (Left + Right) / 2;
coef = Right - Left;
if (coef != 0.0)
goto l150;
rombrg = est;
goto l300;
l150:
if (k > 0)
{
fml = funct((Left + Middle) / 2.0, k);
fmr = funct((Middle + Right) / 2.0, k);
}
else
{
fml = funct((Left + Middle) / 2.0, 0);
fmr = funct((Middle + Right) / 2.0, 0);
}
estl = (fLeft + 4 * fml + fMiddle)*coef;
estr = (fMiddle + 4 * fmr + fRight)*coef;
estint = estl + estr;
area = abs(estl) + abs(estr);
abarea = area + abarea - abs(est);
if (level != QuadMo_MaxLvl)
goto l200;
QuadMo_nlvlk[k] = QuadMo_nlvlk[k] + 1;
rombrg = estint;
goto l300;
l200:
if ((abs(est - estint) > (eps*abarea)) || (level < QuadMo_MinLvl))
goto l400;
rombrg = (16 * estint - est) / 15;
l300:
level = level - 1;
i = retrn[level];
valint[level][i] = rombrg;
if (i == 1)
goto l500;
if (i == 2)
goto l600;
l400:
retrn[level] = 1;
Middlex[level] = Middle;
Rightx[level] = Right;
fmx[level] = fMiddle;
fmrx[level] = fmr;
frx[level] = fRight;
estrx[level] = estr;
epsx[level] = eps;
eps = eps / 1.4;
Right = Middle;
fRight = fMiddle;
fMiddle = fml;
est = estl;
goto l100;
l500:
retrn[level] = 2;
Left = Middlex[level];
Right = Rightx[level];
fLeft = fmx[level];
fMiddle = fmrx[level];
fRight = frx[level];
est = estrx[level];
eps = epsx[level];
goto l100;
l600:
rombrg = valint[level][1] + valint[level][2];
if (level > 1)
goto l300;
ans = rombrg / 12.0;
return ans;
}
double funkn(double *thet)
{
    //in this case a Gaussian centered at 0.5 with SD 0.1
    //thet[1..nDim] hold the coordinates (1-based, matching the Fortran)
    double sm;
    double sum = 0.0;
    for (int i = 1; i <= nDim; i++)
    {
        sm = (thet[i] - 0.5) / 0.1;
        sum = sum + sm * sm;
    }
    return exp(-sum / 2.0);
}
int main() {
double ans;
printf("\nnDim ans expected nlvl\n");
for (nDim = 1; nDim <= 6; nDim++)
{
//expected answer is(0.1 sqrt(2pi))**nDim
QuadMo_nlvlk = new int[nDim + 1]; //array for x values
thet = new double[nDim + 1]; //array for x values
ans = MultInt(nDim);
printf("\n %d %f %f ", nDim, ans, pow((0.250663),nDim));
for (int j = 1; j <= nDim; j++)
    printf(" %d ", QuadMo_nlvlk[j]);
printf("\n");
}
return 0;
}
Declare relevant parameters globally
int QuadMo_MinLvl = 2;
int QuadMo_MaxLvl = 3;
double QuadMo_Tol = 0.1;
int *QuadMo_nlvlk;
double *thet;
int nDim;
This code is much clearer than the obfuscated, antiquated Fortran version; with some tweaking, the integration limits and tolerances could be parameterised.
There are better algorithms to use, with adaptive techniques that can handle singularities on the surfaces, etc.