convert from recursive to iterative function cuda c++ - c++

I'm working on a genetic program in which I am porting some of the heavy lifting into CUDA. (Previously just OpenMP).
It's not running very fast, and I'm getting an error related to the recursion:
Stack size for entry function '_Z9KScoreOnePdPiS_S_P9CPPGPNode' cannot be statically determined
I've added a lump of the logic which runs on CUDA. I believe it's enough to show how it's working. I'd be happy to hear about other optimizations I could add, but I would really like to take out the recursion if it will speed things up.
Examples on how this could be achieved are very welcome.
// Addition primitive used by action(): returns a + b.
__device__ double Fadd(double a, double b)
{
    const double total = a + b;
    return total;
}
// Subtraction primitive used by action(): returns a - b.
__device__ double Fsubtract(double a, double b)
{
    const double difference = a - b;
    return difference;
}
// Applies the operation selected by fNo to the operands.
//   0 -> Fadd, 1 -> Fsubtract, 2 -> Fmultiply, 3 -> Fdivide; any other
//   function number yields 0.0.
// Only aa and bb are read by the four operations below; cc and dd are
// accepted but unused here.
__device__ double action (int fNo, double aa , double bb, double cc, double dd) {
    if (fNo == 0) {
        return Fadd(aa,bb);
    }
    if (fNo == 1) {
        return Fsubtract(aa,bb);
    }
    if (fNo == 2) {
        return Fmultiply(aa,bb);
    }
    if (fNo == 3) {
        return Fdivide(aa,bb);
    }
    return 0.0;
}
// Recursively evaluates the expression tree rooted at index `node` of
// dev_m_Items.
//   - A terminal node returns var_set[tNo].
//   - A function node evaluates its first fInputs children (at most 4),
//     pads the remaining argument slots with 0.0 and hands all four values
//     to action() together with the node's fNo.
__device__ double solve(int node,CPPGPNode * dev_m_Items,double * var_set) {
    if (dev_m_Items[node].is_terminal) {
        return var_set[dev_m_Items[node].tNo];
    }

    // Slots without a matching child keep their 0.0 default.
    double args[4] = {0.0, 0.0, 0.0, 0.0};
    for (unsigned int slot = 0; slot < 4; ++slot) {
        if (slot < dev_m_Items[node].fInputs) {
            args[slot] = solve(dev_m_Items[node].children[slot],dev_m_Items,var_set);
        }
    }
    return action(dev_m_Items[node].fNo,args[0],args[1],args[2],args[3]);
}
// Kernel: scores the program whose index is blockIdx.x.
// For every fitness case the tree rooted at root_nodes[pid] is evaluated with
// solve() and the absolute difference to targets[case_no] is added onto
// scores[pid]. Entries whose root is -1 are skipped and their score is left
// untouched.
__global__ void KScoreOne(double *scores,int * root_nodes,double * targets,double * cases,CPPGPNode * dev_m_Items) {
    const int pid = blockIdx.x;

    // A root of -1 means this entry does not need to be calculated.
    if (root_nodes[pid] == -1) {
        return;
    }

    for (unsigned int case_no = 0; case_no < FITNESS_CASES; ++case_no) {
        const double result = solve(root_nodes[pid],dev_m_Items,&cases[case_no]);
        const double target = targets[case_no];
        const double error = result - target;
        scores[pid] += abs(error);
    }
}
I'm having trouble making any stack examples work for a large tree structure, which is what this solves.

I've solved this issue now. It was not quite a case of placing the recursive arguments into a stack but it was a very similar system.
As part of the creation of the node tree, I append each node to a vector. I now solve the problem in reverse using Reverse Polish notation (http://en.wikipedia.org/wiki/Reverse_Polish_notation), which fits very nicely as each node contains either a value or a function to perform.
It's also ~20% faster than the recursive version, so I'm pleased!

Related

Heap block modified past request error in C++

I have a Qt application with a class Solver that solves a numeric problem (Bairstow's method for finding the roots of a polynomial by factoring it into trinomials). For smaller instances (5 parameters, in array tabA) it works fine, but when I tried it on larger instances (7 parameters) the application crashed.
After I run the debugger I get the following message:
Heap block at 02989F58 modified at 02989F80 past request 20
I'm not exactly sure what this is about (well, I suppose I get a stack overflow, but I'm not sure where and how), but it points to the line delete[] Q; delete[] W; here is how the debugger pointed to it:
And here is code of that class methods (the error occurs in main method int Bairstow(*parameters*), which work on class fields and returns number which indicate is solution was found, is not existing or can't be found in given number of iteration)
Here's the main method:
// Bairstow iteration: repeatedly splits a quadratic factor x^2 - p*x - r off
// the polynomial whose coefficients are tabA[0..stopien].
//
// Parameters:
//   stopien  degree of the polynomial (tabA holds stopien+1 coefficients)
//   tabA     input coefficients (not modified)
//   tabP     output: p of each factor found, in order
//   tabR     output: r of each factor found, in order
//   eps      both remainder coefficients must be below this in magnitude
//   N        maximum number of iterations per factor
//   p_, r_   starting values for p and r, reused for every factor
//
// Returns:
//   0  the loop ran until the remaining degree dropped below 2
//   1  the highest non-zero coefficient has index < 2 (nothing to factor)
//   2  LiczMOdwrotna reported a singular matrix
//   3  the iteration limit N was reached for some factor
//
// All three work buffers are released on every path past the allocation
// (the original leaked A always, and Q/W on the failure returns).
int solver::Bairstow(int stopien,double const *tabA, double *tabP,double *tabR, double eps, int N, double p_,double r_)
{
    int i, q, Iter, IndDziel = 0;

    // Bail out before allocating anything if the effective degree is < 2.
    i = stopien;
    while (i >= 0 && tabA[i] == 0)
    {
        i--;
    }
    if (i < 2) { return 1; }

    // Working copy of the coefficients; overwritten by each quotient.
    double *A = new double[stopien + 1];
    for (i = 0; i <= stopien; i++)
    {
        A[i] = tabA[i];
    }
    double *Q = new double[stopien - 1 + 1];
    double *W = new double[stopien - 3 + 1];
    double Reszta[2];
    double dqdp[2], dqdr[2];
    double p, r, a, b, c, d;

    int status = 0;
    while (stopien >= 2)
    {
        p = p_;
        r = r_;
        Iter = 0;
        do
        {
            PodzielWiel(stopien, A, p, r, Q, Reszta);
            if (fabs(Reszta[1]) < eps && fabs(Reszta[0]) < eps)
            {
                break; // converged: Q holds the quotient for this factor
            }
            PodzielWiel(stopien - 2, Q, p, r, W, dqdr);
            // Shift Q up by one place (multiply by x) before the second division.
            for (i = stopien - 2; i >= 0; i--)
            {
                Q[i + 1] = Q[i];
            }
            Q[0] = 0;
            PodzielWiel(stopien - 1, Q, p, r, W, dqdp);
            q = LiczMOdwrotna(dqdp[0], dqdr[0], dqdp[1], dqdr[1], a, b, c, d);
            if (q == 1)
            {
                status = 2;
                break;
            }
            p = p - (a * Reszta[0] + b * Reszta[1]);
            r = r - (c * Reszta[0] + d * Reszta[1]);
        } while (++Iter < N);

        if (status != 0)
        {
            break;
        }
        if (Iter == N)
        {
            status = 3;
            break;
        }

        tabP[IndDziel] = p;
        tabR[IndDziel] = r;
        IndDziel++;
        stopien -= 2;
        // Continue with the quotient as the new polynomial.
        for (i = 0; i <= stopien; i++)
        {
            A[i] = Q[i];
        }
    }

    delete[] A;
    delete[] Q;
    delete[] W;
    return status;
}
And two helper methods:
for polynomial division (by trinomial x^2-px-r)
// Divides the polynomial tabA[0..stopien] by the trinomial x^2 - p*x - r.
//
// Parameters:
//   stopien  degree of the dividend (tabA holds stopien+1 coefficients)
//   tabA     dividend coefficients (not modified)
//   p, r     coefficients of the divisor x^2 - p*x - r
//   Q        output quotient; entries 0..stopien-2 are written
//   R        output remainder: R[0] constant term, R[1] linear term
//
// Returns 1 if stopien is negative (nothing is written), otherwise 0.
int solver::PodzielWiel(int stopien,double const *tabA, double p,double r, double *Q,double *R)
{
    if (stopien < 0) return 1;

    const int declaredDegree = stopien; // number of valid entries in tabA is declaredDegree+1
    int i;
    for (i = 0; i <= stopien - 2; i++)
    {
        Q[i] = 0;
    }

    // Skip leading zero coefficients.
    while (stopien >= 0 && tabA[stopien] == 0)
    {
        stopien--;
    }

    if (stopien < 2)
    {
        // Dividend is already of lower degree than the divisor: it is the
        // remainder. tabA[1] only exists when the caller passed degree >= 1.
        R[0] = tabA[0];
        R[1] = (declaredDegree >= 1) ? tabA[1] : 0.0;
        return 0;
    }

    // Scratch copy: the division updates coefficients in place.
    double *A = new double[stopien + 1];
    for (i = 0; i <= stopien; i++)
    {
        A[i] = tabA[i];
    }

    // Synthetic division; also covers stopien == 2, where the single pass
    // yields R[1] = A[1] + p*Q[0] and R[0] = A[0] + r*Q[0].
    for (i = stopien; i > 1; i--)
    {
        Q[i - 2] = A[i];
        A[i - 1] += Q[i - 2] * p;
        A[i - 2] += Q[i - 2] * r;
    }
    R[1] = A[1];
    R[0] = A[0];

    delete [] A;
    return 0;
}
for inversing the 2x2 matrix(first 4 parameters are inputs and next 4 outputs as it):
// Inverts the 2x2 matrix
//     | x  y |
//     | w  z |
// and stores the inverse as
//     | a  b |
//     | c  d |
//
// Returns 1 (outputs untouched) when x*z == y*w, i.e. the matrix is singular;
// returns 0 after writing all four outputs otherwise.
//
// The inverse is computed from the determinant. The previous case analysis
// keyed on the products x*w and y*z, which can underflow to zero for small
// non-zero inputs; in that situation no branch matched and the function
// returned 0 without assigning a, b, c, d.
int Solver::LiczMOdwrotna(double x,double y,double w,double z, double &a,double &b,double &c,double &d)
{
    if (x*z==y*w)
    {
        return 1;
    }
    const double det = x*z - y*w;
    a = z/det;
    b = -y/det;
    c = -w/det;
    d = x/det;
    return 0;
}
And sorry for the possibly poor formatting, but I had to use OCR software because copying from Qt Creator was impossible... even after using "show in explorer", saving the file as txt in a new location (to make it visible outside Qt Creator) and then doing it again... I still couldn't copy anything...

Write a function that may return either one or more values

Suppose I want to write a function that, say, returns the sum of f(x) for x in a certain range.
// Returns f(0) + f(1) + ... + f(99), accumulated in ascending order.
double func() {
    double sum = 0.;
    int i = 0;
    while (i < 100) {
        sum += f(i);
        ++i;
    }
    return sum;
}
But sometimes, in addition to the final sum I also need the partial terms, so I can do
// Returns the 100 individual terms f(0)..f(99) together with their sum.
pair<vector<double>,double> func_terms() {
    vector<double> terms(100);
    double sum = 0.;
    int i = 0;
    for (double& slot : terms) {
        slot = f(i);
        sum += slot;
        ++i;
    }
    return {terms, sum};
}
The thing is, this is code duplication. This seems very harmless in this example, but suppose the function is much larger (which it is in the situation that prompted me to ask this), and the two versions differ in just a handful of lines (in this example the logic is the same; only the latter version stores the term in a vector before adding to sum, and returns a pair with that vector; any other logic is equivalent). Then I will have to write and maintain two nearly-identical versions of the same function, differing only in a couple of lines and in the return statement. My question is whether there is an idiom/pattern/best practice to deal with this kind of problem. Something that would let me share the common code between the two versions.
In short: I can write two functions and have to maintain two nearly-identical versions. Or I can just use the latter but that will be very wasteful whenever I just need the sum, which is unacceptable. What's the best pattern to deal with this?
I reckon that with C++17 one can do something like
// Sums f(0)..f(99). When the template argument `partials` is true, *terms is
// replaced by a 100-element vector holding the individual terms; the caller
// must then pass a valid pointer (terms is dereferenced without a check).
// When `partials` is false, terms is never touched.
template<bool partials>
double func(vector<double>* terms=nullptr) {
    if constexpr (partials) {
        *terms = vector<double>(100);
    }

    double sum = 0.;
    for (int idx = 0; idx < 100; ++idx) {
        if constexpr (!partials) {
            sum += f(idx);
        } else {
            double& slot = (*terms)[idx];
            slot = f(idx);
            sum += slot;
        }
    }
    return sum;
}
Which comes very close to what I intended, apart from using pointers (I can't use references because terms may be empty).
Your question title says "Write a function that may return either one or more values", but it's more than that; as your example shows, the function may also do a lot of different things long before a result is returned. There really is no general solution to such a broad problem.
However, for the specific case you've explained I'd like to offer a low-tech solution. You could simply implement both functions in terms of a third function and give that third function a parameter to determine whether the extra functionality is performed or not.
Here is a C++17 example, in which that third function is called func_impl and more or less hidden inside a namespace to make life easier for the client of func and func_terms:
namespace detail {

// Selects whether func_impl records the individual terms.
enum class FuncOption {
    WithTerms,
    WithoutTerms
};

// Shared implementation: sums f(0)..f(99). With FuncOption::WithTerms the
// returned vector holds the 100 individual terms; otherwise it is empty.
std::tuple<std::vector<double>, double> func_impl(FuncOption option) {
    const bool keepTerms = (option == FuncOption::WithTerms);

    std::vector<double> terms;
    if (keepTerms) {
        terms.resize(100);
    }

    double sum = 0.;
    for (int i = 0; i != 100; ++i) {
        auto const result = f(i);
        if (keepTerms) {
            terms[i] = result;
        }
        sum += result;
    }
    return std::make_tuple(terms, sum);
}

}
// Sum-only entry point. Requests FuncOption::WithoutTerms so func_impl does
// not build a 100-element vector that would be thrown away immediately (the
// original passed WithTerms here).
double func() {
    using namespace detail;
    return std::get<double>(func_impl(FuncOption::WithoutTerms));
}
// Entry point returning the individual terms and their sum. Requests
// FuncOption::WithTerms; the original passed WithoutTerms, so the returned
// vector was always empty.
std::tuple<std::vector<double>, double> func_terms() {
    using namespace detail;
    return func_impl(FuncOption::WithTerms);
}
Whether that's too low-tech is up to you and depends on your exact problem.
Here was a solution that suggested to pass an optional pointer to vector and to fill it only if present. I deleted it as other answers mention it as well and as the latter solution looks much more elegant.
You can abstract your calculation to iterators, so callers remain very simple and no code is copied:
// Builds an iterator that counts upward from i and yields f(value) when
// dereferenced.
auto make_transform_counting_iterator(int i) {
    auto counter = boost::make_counting_iterator(i);
    return boost::make_transform_iterator(counter, f);
}
// Start of the term sequence: the transform iterator positioned at 0.
auto my_begin() {
    auto first = make_transform_counting_iterator(0);
    return first;
}
// One-past-the-end of the term sequence: the transform iterator positioned
// at 100.
auto my_end() {
    auto last = make_transform_counting_iterator(100);
    return last;
}
// Sums every term in [my_begin(), my_end()) starting from 0.0, without
// storing the terms.
double only_sum() {
    const double total = std::accumulate(my_begin(), my_end(), 0.0);
    return total;
}
std::vector<double> fill_terms() {
std::vector<double> result;
std::copy(my_begin(), my_end(), std::back_inserter(result));
return result;
}
One simple way is to write a common function and use an input parameter to select the behavior, like this:
// Shared implementation for the sum-only and sum-plus-terms variants.
//
// terms: optional output. When non-null, each term is appended to *terms;
//        when null, only the sum is computed.
// Returns the sum of all 100 terms.
//
// As in the original snippet, the term for index i is the index itself; it
// stands in for the real per-term computation.
double logic(vector<double>* terms) {
    double sum = 0.;
    for (int i=0; i<100; i++) {
        const double term = i;
        if (terms != nullptr) {
            // terms is a pointer, so members are reached with ->, not '.'.
            terms->push_back(term);
        }
        // Add the local value: indexing through terms would dereference a
        // null pointer in the sum-only case.
        sum += term;
    }
    return sum;
}
// Sum-only variant: runs the shared logic without a terms vector.
double func() {
    const double sum = logic(nullptr);
    return sum;
}
// Variant that also returns the individual terms: logic() fills the local
// vector, which is then returned together with the sum.
pair<vector<double>,double> func_terms() {
    vector<double> terms;
    // Pass the vector declared above; the original passed an undeclared `ret`.
    double sum = logic(&terms);
    return {terms, sum};
}
this method is used in many conditions. The Logic can be very complicated and with many input options. You can use the same logic through different parameters.
But in most cases, We need not that much return values but just different input parameter.
If you are not for:
// Computes the 100 terms f(0)..f(99) first, then sums the stored values.
std::pair<std::vector<double>, double> func_terms() {
    std::vector<double> terms(100);
    int i = 0;
    for (double& slot : terms) {
        slot = f(i);
        ++i;
    }
    const double sum = std::accumulate(terms.begin(), terms.end(), 0.);
    return {terms, sum};
}
then maybe:
// Feeds f(0)..f(99), in order, to the callable `acc` and returns it.
//
// acc is a forwarding reference so that both named accumulators and
// temporaries (such as a lambda written at the call site) are accepted; a
// plain `Accumulator&` cannot bind to a temporary. For an lvalue argument the
// return type is still an lvalue reference to that argument; for a temporary
// the accumulator is returned by value.
template <typename Accumulator>
Accumulator func_helper(Accumulator&& acc) {
    for (int i=0; i<100; i++) {
        acc(f(i));
    }
    return static_cast<Accumulator&&>(acc);
}
// Sum-only variant: the accumulator adds each term onto a local total.
double func()
{
    double sum = 0;
    func_helper([&](double term) { sum += term; });
    return sum;
}
// Variant that also keeps the terms: the accumulator both sums each term and
// appends it to a vector.
std::pair<std::vector<double>, double> func_terms() {
    std::vector<double> terms;
    double sum = 0.;
    func_helper([&sum, &terms](double term) {
        sum += term;
        terms.push_back(term);
    });
    return {terms, sum};
}
The simplest solution for this situation I think would be something like this:
// Returns x squared. The multiplication is done in double so that inputs with
// |x| > 46340 do not overflow int before the conversion.
double f(int x) { return static_cast<double>(x) * x; }
auto terms(int count) {
auto res = vector<double>{};
generate_n(back_inserter(res), count, [i=0]() mutable {return f(i++);});
return res;
}
auto func_terms(int count) {
const auto ts = terms(count);
return make_pair(ts, accumulate(begin(ts), end(ts), 0.0));
}
// Sum-only variant: computes terms and sum via func_terms and keeps only the
// sum.
auto func(int count) {
    auto termsAndSum = func_terms(count);
    return termsAndSum.second;
}
Live version.
But this approach gives func() different performance characteristics to your original version. There are ways around this with the current STL but this highlights an area where the STL is not ideal for composability. The Ranges v3 library offers a better approach to composing algorithms for this type of problem and is in the process of standardization for a future version of C++.
In general there is often a tradeoff between composability / reuse and optimal performance. At its best C++ lets us have our cake and eat it too but this is an example where there is work underway to give standard C++ better approaches to handle this sort of situation.
I worked out an OOP solution, where a base class always compute sum and makes the current term available to derived classes, this way:
// Base class that sums f(0)..f(99) and offers each term to derived classes
// through the useCurrentTerm() hook.
class Func
{
public:
    Func() : sum(0.) {}

    // The class is designed for derivation (virtual hook below), so deleting
    // a derived object through a Func* must be well-defined.
    virtual ~Func() = default;

    // Runs the computation: adds each term onto sum and passes it to the hook.
    // Calling it again keeps adding onto the existing sum.
    void func()
    {
        for (int i=0; i<100; i++)
        {
            double term = f(i);
            sum += term;
            useCurrentTerm(term);
        }
    }

    // Sum accumulated by func() so far.
    double getSum() const { return sum; }

protected:
    // Hook invoked once per term; the base implementation ignores it.
    virtual void useCurrentTerm(double) {} //do nothing

private:
    double f(double d){ return d * 42;}
    double sum;
};
So a derived class can implement the virtual method and expose extra properties (other than sum):
class FuncWithTerms : public Func
{
public:
FuncWithTerms() { terms.reserve(100); }
std::vector<double> getTerms() const { return terms; }
protected:
void useCurrentTerm(double t) { terms.push_back(t); }
private:
std::vector<double> terms;
};
If one doesn't want to expose these classes, could fall back to functions and use them as a façade (yet two functions, but very manageable, now):
double sum_only_func()
{
Func f;
f.func();
return f.getSum();
}
std::pair<std::vector<double>, double> with_terms_func()
{
FuncWithTerms fwt;
fwt.func();
return { fwt.getTerms(), fwt.getSum() };
}

Margin of optimisation in C++ armadillo code

I am trying to migrate a quite complex MATLAB function to C++ using the Armadillo library, but I have serious performance problems (my C++ version is much slower than the MATLAB one, which is a bit odd). I was wondering whether any of you can spot a point where I can improve my code and possibly give me some suggestions. The problem is caused by a part in which I try to minimise a function. This is the MATLAB code:
% Minimise SSE over lambda in [lower_limit, upper_limit]; y0 and x are passed
% through to SSE as its second and third arguments. The objective must be
% given as a function handle (@SSE); '#' is not valid MATLAB syntax.
lambda = fminbnd(@SSE, lower_limit,upper_limit,options, y0, x);
function L=SSE(lambda, alpha, y)
% SSE  Sum of squared deviations of the jglog-transformed columns of y from
% their row-wise mean.
%   Each column of y is transformed with jglog(column, alpha, lambda); L is the
%   sum over all elements of (z - rowMean)^2, accumulated column by column.
nCols=size(y,2);
nRows=size(y,1);
for c=1:nCols
    z(:,c)=jglog(y(:,c),alpha,lambda);
end
L = 0;
rowMean=mean(z,2);
for c=1:nCols
    for r=1:nRows
        dev = z(r,c)-rowMean(r,1);
        L = L + dev^2;
    end
end
end
function z=glog(y,alpha,lambda)
% GLOG  Glog transform: log((y-alpha) + sqrt((y-alpha).^2 + lambda)),
% applied element-wise.
shifted = y-alpha;
z=log(shifted+sqrt(shifted.^2+lambda));
end
function [zj, gmn]=jglog(y,y0,lambda)
% JGLOG  Glog transform of y scaled by gmn, where
%   gmn = exp(mean(log(sqrt((y-y0).^2 + lambda)))).
%   Returns the scaled transform zj and the scale factor gmn.
root = sqrt((y-y0).^2+lambda);
gmn=exp(mean(log(root)));
zj=glog(y,y0,lambda).*gmn;
end
Following this post I downloaded the c++ version of the matlab minimisation code (brent's method). You can find it here.
This is my c++ version.
class SSE_c : public brent::func_base //create functor
{
public:
mat A;
double offset;
virtual double operator() (double lam)
{
return SSE(A,offset,lam);
}
SSE_c(mat a,double of) {A=a;offset=of;}
};
// Wrap the data matrix x and the offset y0 in the SSE functor.
SSE_c fun(x,y0);
// Run the brent minimiser on fun between low_limit and up_limit.
// NOTE(review): the meaning of c, 100 and the two step_threshold arguments,
// and whether lambda receives the minimiser, are not visible here — confirm
// against the brent::glomin declaration.
brent::glomin(low_limit,up_limit,c,100,step_threshold,step_threshold,fun,lambda);
// Objective function: transforms every column of m with jglog(col, ofs, lam)
// and returns the sum of squared deviations of the finite transformed values
// from their row means (row means are computed by FuncOnMatRows).
//
// Side effect: prints "Iteration:<count>" to std::cout and increments the
// external counter `count` on every call.
double SSE(mat& m,double ofs,double lam)
{
    mat z(m.n_rows,m.n_cols);
    for(uint i=0;i<m.n_cols;i++)
        z.col(i) = jglog(m.col(i),ofs,lam);
    std::cout << "Iteration:" << count++;

    vec mean_spec(z.n_rows);
    FuncOnMatRows(z,mean_spec,[](rowvec const& w){return mean(w);});

    double s=0;
    for(uint i=0;i<z.n_cols;i++)
    {
        for(uint j=0;j<z.n_rows;j++)
        {
            const double value = z(j,i);
            if(is_finite(value))
            {
                // Plain multiplication instead of pow(x, 2): this is the
                // innermost loop of a function the minimiser calls repeatedly.
                const double dev = value - mean_spec(j);
                s += dev * dev;
            }
        }
    }
    return s;
}
// Returns glogF(v, ofs, lam) scaled by gmn, where gmn is
// exp(mean(log(sqrt(square(v - ofs) + lam)))). If that log vector contains
// non-finite entries, the mean is taken over its finite entries only.
vec jglog(vec&& v,double ofs,double lam)
{
    vec g=glogF(v,ofs,lam);
    vec interm = log(sqrt(square(v-ofs)+lam));

    double gmn;
    if(!interm.is_finite())
    {
        gmn=exp(mean(interm.elem(find_finite(interm))));
    }
    else
    {
        gmn=exp(mean(interm));
    }

    g = g*gmn;
    return g;
}
// Rvalue overload of glogF: forwards to the lvalue-reference overload.
// Inside the body the named parameter v is an lvalue, so the call below
// cannot select this overload again; it needs glogF(vec&, double, double).
// NOTE(review): that overload is defined after this one in this excerpt, so a
// prior declaration must be visible here — confirm one exists.
vec glogF(vec&& v,double ofs,double lam)
{
return glogF(v,ofs,lam);
}
// Glog transform of v, element-wise:
//   log((v - ofs) + sqrt(square(v - ofs) + lam)).
vec glogF(vec& v,double ofs,double lam)
{
    const vec shifted = v-ofs;
    return log(shifted+sqrt(square(shifted)+lam));
}
template<typename Func>
void FuncOnMatRows(const mat& M,vec& v,Func const & func)
{
for(uint i=0;i<M.n_rows;i++) // operation calculated on rows
{
if(M.row(i).is_finite())
v(i) = func(M.row(i));
else if(!any(M.row(i)>0))
v(i) = NAN;
else
{
rowvec b=M.row(i);
v(i) = func(b.elem(find_finite(b)).t()); //output of .elem is always colvec, so transpose
}
}
}

general tbb issue for calculating fibonacci numbers

I came across the tbb template below as an example of task-based programming for calculating the sum of fibonacci numbers in c++. But when I run it I get a value of 1717986912 which can't be the case. The output should be 3. What am I doing wrong?
// TBB task that computes the n-th Fibonacci number into *sum by spawning two
// child tasks (for n-1 and n-2) and waiting for both.
class FibTask: public task
{
public:
    const long n;      // index of the Fibonacci number to compute
    long * const sum;  // where the result is written

    FibTask( long n_, long* sum_ ) : n(n_), sum(sum_) {}

    // Overrides virtual function task::execute
    task* execute( )
    {
        // Base case: for n < 2 the result is n itself. It must be written to
        // *sum here; the previous cutoff (n < 0) returned without writing
        // anything and let n == 0 and n == 1 spawn children with negative n,
        // so the parent added uninitialised values.
        if( n < 2 )
        {
            *sum = n;
            return NULL;
        }

        long x, y;
        FibTask& a = *new( allocate_child( ) ) FibTask(n-1,&x);
        FibTask& b = *new( allocate_child( ) ) FibTask(n-2,&y);
        // Set ref_count to "two children plus one for the wait".
        set_ref_count(3);
        // Start b running.
        spawn( b );
        // Start a running and wait for all children (a and b).
        spawn_and_wait_for_all( a );
        // Do the sum
        *sum = x+y;
        return NULL;
    }

    // Runs a root FibTask for n and returns its result. It uses no instance
    // state, so it is static; calling it through an object still compiles.
    static long ParallelFib( long n )
    {
        long sum;
        FibTask& a = *new(task::allocate_root( )) FibTask(n,&sum);
        task::spawn_root_and_wait(a);
        return sum;
    }
};
// Program entry point: prints the result of ParallelFib(3).
// main must return int; `long main` is ill-formed.
int main(int argc, char** argv)
{
    // This object only serves as a receiver for the ParallelFib call; it is
    // never spawned and never freed.
    // NOTE(review): TBB tasks are normally created through allocate_root /
    // allocate_child rather than plain new — confirm before deleting it here.
    FibTask * obj = new FibTask(3,0);
    long b = obj->ParallelFib(3);
    std::cout << b;
    return 0;
}
The cutoff is messed here. It must be 2 at least. E.g.:
// Base case for FibTask::execute: for n < 2 the result is n itself, and it is
// written to *sum before returning so the parent task reads a defined value.
if( n<2 ) {
*sum = n;
return NULL;
}
The original example also uses SerialFib as showed here http://www.threadingbuildingblocks.org/docs/help/tbb_userguide/Simple_Example_Fibonacci_Numbers.htm
The inefficient method for calculating Fibonacci numbers using the inefficient blocking-style technique will be even more inefficient without the call to SerialFib().
WARNING: Please note that this example is intended just to demonstrate this particular low-level TBB API and this particular way of using it. It is not intended for reuse unless you are really sure why you are doing this.
Modern high-level API (though, still for the inefficient Fibonacci algorithm) would look like this:
// Fibonacci via tbb::parallel_invoke: below CUTOFF the serial routine is used;
// otherwise Fib(n-1) and Fib(n-2) are computed by two invoked lambdas and
// added.
int Fib(int n) {
    if( n<CUTOFF ) { // 2 is minimum
        return fibSerial(n);
    }

    int left, right;
    tbb::parallel_invoke(
        [&left, n]{ left = Fib(n-1); },
        [&right, n]{ right = Fib(n-2); });
    return left + right;
}

All possible combinations(with repetition) as values in array using recursion

I'm trying to solve a problem in which I need to insert math operations(+/- in this case) between digits or merge them to get a requested number.
For ex.: 123456789 => 123+4-5+6-7+8-9 = 120
My concept is basically generating different combinations of operation codes in array and calculating the expression until it equals some number.
The problem is I can't think of a way to generate every possible combination of math operations using recursion.
Here's the code:
#include <iostream>
#include <algorithm>
using namespace std;

// Symbol placed between two neighbouring digits.
//   noop    - no symbol: the digits are merged into one multi-digit number
//   opplus  - '+'
//   opminus - '-'
enum {noop,opplus,opminus};//opcodes: 0,1,2

int applyOp(int opcode,int x, int y);
int calculate(int *digits,int *opcodes, int length);
void nextCombination(); // declared by the original program; never defined or used here
bool findOpcodes(int *digits,int *opcodes,int length,int slot,int wantedNumber);

// Searches for a way to place '+', '-' or nothing between the digits 1..9 so
// that the expression evaluates to wantedNumber, and prints the first one
// found (or a message if there is none).
int main()
{
    int digits[9] = {1,2,3,4,5,6,7,8,9};
    int wantedNumber = 100;
    // A compile-time constant, so opcodes is an ordinary array rather than a
    // variable-length array (which standard C++ does not have).
    const int length = sizeof(digits)/sizeof(digits[0]);
    int opcodes[length-1];//math symbols
    fill_n(opcodes,length-1,0);//init

    if (findOpcodes(digits,opcodes,length,0,wantedNumber))
    {
        for (int i = 0; i < length; ++i)
        {
            cout << digits[i];
            if (i < length-1)
            {
                if (opcodes[i] == opplus) cout << '+';
                else if (opcodes[i] == opminus) cout << '-';
            }
        }
        cout << " = " << wantedNumber << '\n';
    }
    else
    {
        cout << "no combination found\n";
    }
    return 0;
}

// Backtracking search over the opcode slots. Slots before `slot` are already
// fixed; this call tries every opcode for `slot` and recurses into the next
// one. Returns true as soon as a complete assignment evaluates to
// wantedNumber, leaving that assignment in opcodes.
bool findOpcodes(int *digits,int *opcodes,int length,int slot,int wantedNumber)
{
    if (slot >= length-1)
    {
        return calculate(digits,opcodes,length) == wantedNumber;
    }
    for (int op = noop; op <= opminus; ++op)
    {
        opcodes[slot] = op;
        if (findOpcodes(digits,opcodes,length,slot+1,wantedNumber))
        {
            return true;
        }
    }
    return false;
}

// Applies one opcode to x and y:
//   noop    -> x*10 + y (append digit y to x)
//   opminus -> x - y
//   opplus or any other value -> x + y
int applyOp(int opcode,int x, int y)
{
    if (opcode == noop)
    {
        return x*10 + y;//merge 2 digits together
    }
    if (opcode == opminus)
    {
        return x - y;
    }
    return x + y;
}

// Evaluates the expression formed by `length` digits with opcodes[i] between
// digits[i] and digits[i+1]. Returns 0 for length <= 0.
//
// Merged digits form one operand before the surrounding '+'/'-' is applied,
// so {1,2,3} with {opplus, noop} is 1+23 = 24. The earlier version applied
// every opcode to the running result and produced (1+2)*10+3 = 33.
int calculate(int *digits,int *opcodes, int length)
{
    if (length <= 0)
    {
        return 0;
    }

    int total = 0;
    int pendingOp = opplus;   // operator waiting in front of the current operand
    int operand = digits[0];  // multi-digit number being assembled

    for(int i = 0;i < length-1; ++i)//elem count
    {
        if (opcodes[i] == noop)
        {
            operand = applyOp(noop,operand,digits[i+1]);
        }
        else
        {
            total = applyOp(pendingOp,total,operand);
            pendingOp = opcodes[i];
            operand = digits[i+1];
        }
    }
    return applyOp(pendingOp,total,operand);
}
The key is backtracking. Each level of recursion handles
a single digit; in addition, you'll want to stop the recursion
once you've finished.
The simplest way to do this is to define a Solver class, which
keeps track of the global information, like the generated string
so far and the running total, and make the recursive function
a member. Basically something like:
// Backtracking solver skeleton: keeps the search state (text generated so far,
// running total, solved flag) as members so the recursive doSolve() can share
// it across levels.
class Solver
{
    const std::string input;   // digits to place operators between
    const int target;          // value the expression must reach
    std::string solution;      // expression text built so far
    int total;                 // running total of the expression so far
    bool isSolved;             // set once a solution has been found

    // Recursive step for the digit at pos (declared here, defined elsewhere).
    void doSolve( std::string::const_iterator pos );

public:
    Solver( std::string const& input, int target )
        : input( input )
        , target( target )
    {
    }

    // Resets total and the solved flag, runs the search from the first digit,
    // and returns the solution text or "no solution found".
    std::string solve()
    {
        total = 0;
        isSolved = false;
        doSolve( input.begin() );
        if ( !isSolved )
        {
            return "no solution found";
        }
        return solution;
    }
};
In doSolve, you'll have to first check whether you've finished
(pos == input.end()): if so, set isSolved = total == target
and return immediately; otherwise, try the three possibilities,
(total = 10 * total + toDigit(*pos), total += toDigit(*pos),
and total -= toDigit(*pos)), each time saving the original
total and solution, adding the necessary text to
solution, and calling doSolve with the incremented pos.
On returning from the recursive call, if ! isSolved, restore
the previous values of total and solution, and try the next
possibility. Return as soon as you see isSolved, or when all
three possibilities have been solved.