My aim is to use GSL Monte Carlo integration for an integrand that is evaluated with an arbitrary-precision library (Boost.Multiprecision). I decided to use a multiprecision library because the integral had difficulty converging.
This is the actual mathematical formula describing what I am trying to code. My guess is that the integral does not converge, and therefore returns NaN, because some of the factors in the integrand can take very small values, close to zero.
This is my code:
/// Truncated cosine series: invL + 2*invL * sum_{n=1..n_lim} of
/// exp(-n^2 * c) * cos(n*pi*x*invL) * cos(n*pi*x0*invL),
/// with c = pi * (pi/4) * (2*t) * invtau.
///
/// Each term is evaluated in double precision; only the running sum and the
/// final combination are held in mp::float128.
mp::float128 PDFfunction(double invL, int t, double invtau, double x0, double x, int n_lim) {
    const double decay = M_PI * (M_PI/4) * ((2 * t) * invtau);

    mp::float128 series = 0;
    int mode = 1;
    while (mode <= n_lim) {
        const double damping = exp(-1 * (mode * mode) * decay);
        const double at_x = cos((mode * M_PI * x) * invL);
        const double at_x0 = cos((mode * M_PI * x0) * invL);
        series += damping * at_x * at_x0;
        ++mode;
    }

    return invL + ((2 * invL) * series);
}
The following lines define the integral that I carry out using GSL:
/// Parameter bundle handed to the GSL integrand through its void* argument.
/// Field order matters: the struct is filled by positional aggregate
/// initialization.
struct my_f_params {
    double x0;
    double xt_pos;
    double y0;
    double yt_pos;
    double invLx;
    double invLy;
    double invtau_x;
    double invtau_y;
    int n_lim;
    double tax_rate;
};
double g(double *k, size_t dim, void *p){
struct my_f_params * fp = (struct my_f_params *)p;
mp::float128 temp_pbx = prob1Dbox(fp->invLx, k[0], fp->invtau_x, fp->x0, fp->xt_pos, fp->n_lim);
mp::float128 temp_pby = prob1Dbox(fp->invLy, k[0], fp->invtau_y, fp->y0, fp->yt_pos, fp->n_lim);
mp::float128 AFac = (-2 * k[0] * fp->tax_rate);
mp::float128 res = exp(log(temp_pbx) + log(temp_pby) + AFac);
return res.convert_to<double>();
}
/// Integrates g() over k[0] in [0, 1e7] with the GSL VEGAS Monte Carlo
/// routine and returns the estimate.
///
/// The arguments are packed into a my_f_params and handed to g() through
/// gsl_monte_function::params. One warm-up pass of `calls` evaluations is
/// followed by passes of calls/5 evaluations until
/// gsl_monte_vegas_chisq(s) is within 0.5 of 1.
///
/// Fix: the parameter pointer was written as a mis-encoded "&para;ms"
/// sequence; it must be the address of `params`.
double integrate_integral(const double& x0, const double& xt_pos, const double& y0,
                          const double& yt_pos, const double& invLx, const double& invLy, const double& invtau_x,
                          const double& invtau_y, const int& n_lim, const double& tax_rate){
    double res, err;
    double xl[1] = {0};
    double xu[1] = {10000000};

    struct my_f_params params = {x0, xt_pos, y0, yt_pos, invLx, invLy, invtau_x, invtau_y, n_lim, tax_rate};

    gsl_monte_function G;
    G.f = &g;
    G.dim = 1;
    G.params = &params;

    const size_t calls = 10000;

    gsl_rng_env_setup ();
    gsl_rng *r = gsl_rng_alloc (gsl_rng_default);
    gsl_monte_vegas_state *s = gsl_monte_vegas_alloc (1);

    // Warm-up pass.
    gsl_monte_vegas_integrate (&G, xl, xu, 1, calls, r, s,
                               &res, &err);
    // Refinement passes; the state `s` is reused between calls.
    do
    {
        gsl_monte_vegas_integrate (&G, xl, xu, 1, calls/5, r, s,
                                   &res, &err);
    }
    while (fabs (gsl_monte_vegas_chisq (s) - 1.0) > 0.5);

    gsl_monte_vegas_free (s);
    gsl_rng_free (r);
    return res;
}
When I try to run integrate_integral with x0 = 0; xt_pos = 0; y0 = 0; yt_pos = 10; invLx = invLy = 0.09090909; invtau_x = invtau_y = 0.000661157; n_lim = 1000; tax_rate = 7e-8; I get NaN. Why is this the case? I was not expecting this result, since I used the log-sum-exp trick to avoid possible underflow.
Related
My objective is to parallelise a one-dimensional integral. I have looked around, and I would say that I could do that in two ways: i) implementing OpenMP with ODEINT, boost library integrate_adapative function (see https://www.boost.org/doc/libs/1_56_0/libs/numeric/odeint/doc/html/boost_numeric_odeint/tutorial/parallel_computation_with_openmp_and_mpi.html).; ii) implementing OpenMP with GSL monte carlo integration (as in here: Internal compiler error with nested functions in OpenMP parallel regions).
My problem is that I cannot really understand what they did in both links I provided.
I was wondering whether someone has experience with that, and may point out how I could implement both approaches on my case. Is it OpenMP with boost faster or GSL and OpenMP implementation?
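PDFfunction represents the probability density function that is used within the integrand function. PDFfunction is equivalent to $p(x, t \mid x_0)$, and in LaTeX is expressed as (with $1/L$ = invL, $1/\tau$ = invtau and $N$ = n):
$$p(x, t \mid x_0) = \frac{1}{L} + \frac{2}{L}\sum_{n=1}^{N} \exp\!\left(-\frac{n^{2}\pi^{2} t}{2\tau}\right)\cos\!\left(\frac{n\pi x}{L}\right)\cos\!\left(\frac{n\pi x_0}{L}\right)$$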
And this is how I code it:
/// Truncated cosine series
///   invL + 2*invL * sum_{i=1..n} exp(-i^2 * c) * cos(i*pi*x*invL) * cos(i*pi*x0*invL)
/// with c = pi * (pi/4) * (2*t) * invtau.
///
/// @param invL    multiplies both the cosine arguments and the result
/// @param t       time; NOTE(review): declared int, so a double passed by a
///                caller is truncated before use — confirm this is intended
/// @param invtau  multiplies t in the exponent
/// @param x0      first position entering the cosine product
/// @param x       second position entering the cosine product
/// @param n       number of series terms; n <= 0 returns invL
/// @return        the truncated series value
///
/// Fix: the loop used to increment `n` instead of the index `i`, so `i`
/// stayed at 1 and the loop never terminated.
double PDFfunction(double invL, int t, double invtau, double x0, double x, int n) {
    const double c = M_PI * (M_PI/4) * ((2 * t) * invtau);
    double res = 0;
    for (int i = 1; i <= n; ++i) {
        res += std::exp(-1 * (i * i) * c) * std::cos((i * M_PI * x) * invL) * std::cos((i * M_PI * x0) * invL);
    }
    return invL + ((2 * invL) * res);
}
Composite_at_t is a function that makes use of the PDFfunction to compute pbA and pbB. Composite_at_t is equivalent to $f(t) = 4t\,e^{-2Bt}\,p_A(t)\,p_B(t)$, where $p_A(t) = p(x_t, t \mid x_0)$ and $p_B(t) = p(y_t, t \mid y_0)$.
/// Integrand at time t: the product of two PDFfunction values (one for the
/// x0 -> xt_pos pair, one for the y0 -> yt_pos pair) and the weight
/// 2 * (2t) * exp(-2tB).
double Composite_at_t(double t, double B, double x0, double xt_pos, double y0, double yt_pos, double invLtot, double invtau, int n_lim) {
    const double weight = 2 * (2 * t) * exp(-2 * t * B);

    const double density_a = PDFfunction(invLtot, t, invtau, x0, xt_pos, n_lim);
    const double density_b = PDFfunction(invLtot, t, invtau, y0, yt_pos, n_lim);

    return density_a * density_b * weight;
}
Composite_at_tCLASS is a Func class which computes a first integral over variable t.
/// Functor wrapper around Composite_at_t: stores every argument except t
/// and evaluates the integrand when called with t.
class Composite_at_tCLASS: public Func{
private:
    // Fixed arguments forwarded unchanged on every call.
    double B;
    double x0;
    double xt_pos;
    double y0;
    double yt_pos;
    double invLtot;
    double invtau;
    int n_lim;

public:
    Composite_at_tCLASS(double B_, double x0_, double xt_pos_, double y0_, double yt_pos_,
                        double invLtot_, double invtau_, int n_lim_)
        : B(B_),
          x0(x0_),
          xt_pos(xt_pos_),
          y0(y0_),
          yt_pos(yt_pos_),
          invLtot(invLtot_),
          invtau(invtau_),
          n_lim(n_lim_)
    {}

    /// Evaluates Composite_at_t at `t` with the stored arguments.
    double operator()(const double& t) const{
        const double value = Composite_at_t(t, B, x0, xt_pos, y0, yt_pos, invLtot, invtau, n_lim);
        return value;
    }
};
integrate_CompositeCLASS is the actual function that uses the class Composite_at_tCLASS and perform the integral over t, between 0 and time_lim.
double integrate_CompositeCLASS(double B, double x0, double xt_pos, double y0, double yt_pos, double invLtot, double invtau, int n_lim, double time_lim){
Composite_at_tCLASS f(B, x0, xt_pos, y0, yt_pos, invLtot, invtau, n_lim);
double err_est;
int err_code;
double res = integrate(f, 0, time_lim, err_est, err_code);
return res;
}
For the numerical integration using the GSL library I would use the following code:
/// Parameter bundle handed to the GSL integrand through its void* argument.
/// Field order matters: the struct is filled by positional aggregate
/// initialization.
struct my_f_params {
    double B;
    double x0;
    double xt_pos;
    double y0;
    double yt_pos;
    double invtau;
    int n_lim;
    double invLtot;
};
double g(double *k, size_t dim, void *p){
struct my_f_params * fp = (struct my_f_params *)p;
return Composite_at_t(k[0],fp->B, fp->x0, fp->xt_pos, fp->y0, fp->yt_pos, fp->invLtot, fp->invtau, fp->n_lim);
}
And this is the actual object that perform the GSL integral:
/// Integrates g() over t in [0, time_lim] with the GSL VEGAS Monte Carlo
/// routine and returns the estimate.
///
/// One warm-up pass of `calls` evaluations is followed by passes of calls/5
/// evaluations until gsl_monte_vegas_chisq(s) is within 0.5 of 1.
///
/// Fix: the parameter pointer was written as a mis-encoded "&para;ms"
/// sequence; it must be the address of `params`.
///
/// NOTE(review): `B` is not a parameter of this function and is not declared
/// in this block; it is presumably a file-scope variable — confirm, otherwise
/// this does not compile.
double integrate_integral(const double& invtau, const int& n_lim, const double& invLtot,
                          const double& x0, const double& xt_pos, const double& y0, const double& yt_pos, const double& time_lim){
    double res, err;
    double xl[1] = {0};
    double xu[1] = {time_lim};

    struct my_f_params params = { B, x0, xt_pos, y0, yt_pos, invtau, n_lim, invLtot};

    gsl_monte_function G;
    G.f = &g;
    G.dim = 1;
    G.params = &params;

    const size_t calls = 10000;

    gsl_rng_env_setup ();
    gsl_rng *r = gsl_rng_alloc (gsl_rng_default);
    gsl_monte_vegas_state *s = gsl_monte_vegas_alloc (1);

    // Warm-up pass.
    gsl_monte_vegas_integrate (&G, xl, xu, 1, calls, r, s,
                               &res, &err);
    // Refinement passes; the state `s` is reused between calls.
    do
    {
        gsl_monte_vegas_integrate (&G, xl, xu, 1, calls/5, r, s,
                                   &res, &err);
    }
    while (fabs (gsl_monte_vegas_chisq (s) - 1.0) > 0.5);

    gsl_monte_vegas_free (s);
    gsl_rng_free (r);
    return res;
}
I have a question that I'm stuck in it;
I want to catch the VOF of the phases on the wall by a defined profile. Is there any macro for this purpose, and with what data accessing and looping macro?
I send my code here, so if you can help me, I will be appreciated. but when I apply my code, the fluent crashes out as soon as I initialize the solution.
/* Boundary profile: for every face of thread t, stores heat_ratio * qslip in
 * F_PROFILE(f, t, i). qslip combines a speed term (pi*w*r/30 - U*y/r) with a
 * VOF-weighted shear term and a friction term.
 *
 * NOTE(review): this snippet is elided ("...(and some conditions like
 * above)...") and the brace opened right after DEFINE_PROFILE is never closed
 * in the visible text, so it does not compile as shown. */
DEFINE_PROFILE(heatflux_slip_shoulder_W, t, i)
{
/*Domain *d=Get_Domain(1);*/
int phase_domain_index = 0; /* primary phase index is 0 */
/* NOTE(review): if this profile is hooked on a mixture-level thread, t
 * presumably has no super thread and tm would be NULL, so the
 * THREAD_SUB_THREAD call below would dereference a null pointer — a likely
 * cause of the crash at initialization. Verify against the Fluent UDF manual;
 * THREAD_SUB_THREAD(t, phase_domain_index) may be what is needed. */
Thread *tm = THREAD_SUPER_THREAD(t);
Thread *subthread = THREAD_SUB_THREAD(tm,phase_domain_index);
double p[ND_ND]; /* this will hold the position vector */
/* sigma, tav_stick, tav_fric, Vx, Vy, Vz, V_W and V_T are not used in the
 * visible code. sigma_6, sigma_5, tav_6, tav_5 and tav are used below but not
 * declared in the visible code. */
double x, y, r, qslip, sigma, tav_stick, tav_fric, temp, Vx, Vy, Vz, V_W, V_T, VF;
double w = 1120;
double pi = 3.1415927;
double U = 0.002;
double press = 50000000;
double delta = 0.65;
double etta = 0.7;
double heat_ratio = 0.6383;
double Rshol = 0.0075;
double fric ;
double Rpin = 0.0025;
face_t f;
/*thread_loop_f(f, d)*/
{
begin_f_loop(f,t)
{
/* Volume fraction read from the phase sub-thread on this face. */
VF =F_VOF(f,subthread);
F_CENTROID(p, f, t);
x = p[0];
/* Reads the third centroid component; NOTE(review): out of bounds if
 * ND_ND is 2 — confirm this is a 3D case. */
y = p[2];
/* r is 0 when x and y are both 0, which makes U*y/r below a 0/0. */
r = sqrt((x*x) + (y*y));
/* This condition requires temp >= 297 and temp <= 273 at the same time, so
 * the branch can never execute and sigma_6/sigma_5 are not assigned here. */
temp = F_T(f,t);if ((297.<=temp)&&(temp<=273.))
{
sigma_6 = 325.80263157895 + (-0.171052631578952 * temp);
sigma_5 = 282.355263157897 + (-0.0921052631578974*temp);
}
...(and some conditions like above)...
/* Friction value: linear in temp on [297, 855], zero outside. */
if ((297.<=temp)&&(temp<=855.))
fric = 0.383752244165171 + (-0.000448833034111311 * temp);
else
fric = 0;
tav_6 = sigma_6*1000000/1.732;
tav_5 = sigma_5*1000000/1.732;
/* VOF-weighted blend of the two values. */
tav = VF * tav_6 + (1 - VF ) * tav_5;
qslip = ((pi*w*r/30) - (U*y/r)) * ((etta * (1 - delta) * tav) + (delta * fric * press));
F_PROFILE(f, t, i) = heat_ratio * qslip;
}
end_f_loop(f,t)
}
well, here is my code:
#include "udf.h"
#include "mem.h"
#include "sg.h"
#include "sg_mphase.h"
#include "flow.h"
#include "metric.h"
/* Boundary profile: for every face of thread t, stores heat_ratio * qslip in
 * F_PROFILE(f, t, i). qslip combines a speed term (pi*w*r/30 - U*y/r) with a
 * VOF-weighted shear term and a friction term.
 *
 * NOTE(review): this snippet is elided ("...(and some conditions like
 * above)...") and the brace opened right after DEFINE_PROFILE is never closed
 * in the visible text, so it does not compile as shown. */
DEFINE_PROFILE(heatflux_slip_shoulder_W, t, i)
{
/*Domain *d=Get_Domain(1);*/
int phase_domain_index = 0; /* primary phase index is 0 */
/* NOTE(review): if this profile is hooked on a mixture-level thread, t
 * presumably has no super thread and tm would be NULL, so the
 * THREAD_SUB_THREAD call below would dereference a null pointer — a likely
 * cause of the crash at initialization. Verify against the Fluent UDF manual;
 * THREAD_SUB_THREAD(t, phase_domain_index) may be what is needed. */
Thread *tm = THREAD_SUPER_THREAD(t);
Thread *subthread = THREAD_SUB_THREAD(tm,phase_domain_index);
double p[ND_ND]; /* this will hold the position vector */
/* sigma, tav_stick, tav_fric, Vx, Vy, Vz, V_W and V_T are not used in the
 * visible code. sigma_6, sigma_5, tav_6, tav_5 and tav are used below but not
 * declared in the visible code. */
double x, y, r, qslip, sigma, tav_stick, tav_fric, temp, Vx, Vy, Vz, V_W, V_T, VF;
double w = 1120;
double pi = 3.1415927;
double U = 0.002;
double press = 50000000;
double delta = 0.65;
double etta = 0.7;
double heat_ratio = 0.6383;
double Rshol = 0.0075;
double fric ;
double Rpin = 0.0025;
face_t f;
/*thread_loop_f(f, d)*/
{
begin_f_loop(f,t)
{
/* Volume fraction read from the phase sub-thread on this face. */
VF =F_VOF(f,subthread);
F_CENTROID(p, f, t);
x = p[0];
/* Reads the third centroid component; NOTE(review): out of bounds if
 * ND_ND is 2 — confirm this is a 3D case. */
y = p[2];
/* r is 0 when x and y are both 0, which makes U*y/r below a 0/0. */
r = sqrt((x*x) + (y*y));
/* This condition requires temp >= 297 and temp <= 273 at the same time, so
 * the branch can never execute and sigma_6/sigma_5 are not assigned here. */
temp = F_T(f,t);if ((297.<=temp)&&(temp<=273.))
{
sigma_6 = 325.80263157895 + (-0.171052631578952 * temp);
sigma_5 = 282.355263157897 + (-0.0921052631578974*temp);
}
...(and some conditions like above)...
/* Friction value: linear in temp on [297, 855], zero outside. */
if ((297.<=temp)&&(temp<=855.))
fric = 0.383752244165171 + (-0.000448833034111311 * temp);
else
fric = 0;
tav_6 = sigma_6*1000000/1.732;
tav_5 = sigma_5*1000000/1.732;
/* VOF-weighted blend of the two values. */
tav = VF * tav_6 + (1 - VF ) * tav_5;
qslip = ((pi*w*r/30) - (U*y/r)) * ((etta * (1 - delta) * tav) + (delta * fric * press));
F_PROFILE(f, t, i) = heat_ratio * qslip;
}
end_f_loop(f,t)
}
I have created a simple C++ project in Visual Studio, but have encountered the following linking error:
LNK2019: unresolved external symbol "public: class std::vector > __thiscall solver::euler(class point_3d (__cdecl*)(class point_3d,double),double,int,class point_3d,double)" (?euler#?$solver#Vpoint_3d####QAE?AV?$vector#Vpoint_3d##V?$allocator#Vpoint_3d###std###std##P6A?AVpoint_3d##V4#N#ZNH0N#Z
I have no idea what may be causing this and would be grateful for any advice.
Files that constitute the project:
main.cpp
#include "point_3d.h"
#include "solver.h"
/// Right-hand side of the system integrated in main(): maps the point
/// (x, y, z) to (10*(y - x), x*(28 - z) - y, x*y - (8/3)*z). The time
/// argument `t` is not used.
point_3d lorentz(point_3d val, double t) {
    const double px = val.get_x();
    const double py = val.get_y();
    const double pz = val.get_z();

    const double dx = 10*(py-px);
    const double dy = px*(28.0- pz)- py;
    const double dz = px* py-(8.0/3)* pz;
    return point_3d(dx, dy, dz);
}
/// Opens data/results.json for writing (nothing is written to it here) and
/// runs 50000 explicit Euler steps of size 0.01 on lorentz from (1, 1, 1).
int main(){
    std::ofstream file("data/results.json");

    const double step = 0.01;
    const int step_count = 50000;
    solver<point_3d> sol;
    std::vector<point_3d> result = sol.euler(lorentz, step, step_count, point_3d(1, 1, 1));
}
solver.h
/// Fixed-step ODE integrators for a state type T.
///
/// Every method takes the right-hand side f(state, time), the step size h,
/// the number of steps, the initial state and the start time t0, and returns
/// a vector of states. back_euler additionally takes the number n of inner
/// iterations per step.
template<typename T>
class solver{
    // Function-pointer type of the right-hand side; same type as the
    // spelled-out T (*)(T, double).
    using rhs_fn = T (*)(T, double);

public:
    std::vector<T> euler(rhs_fn f, double h, int steps, T val, double t0 = 0.0);
    std::vector<T> back_euler(rhs_fn f, double h, int steps, T val, double t0 = 0.0, int n = 50);
    std::vector<T> rk_2(rhs_fn f, double h, int steps, T val, double t0 = 0.0);
    std::vector<T> rk_4(rhs_fn f, double h, int steps, T val, double t0 = 0.0);
};
solver.cpp
#include "solver.h"
/// Explicit Euler: y[i+1] = y[i] + h * f(y[i], t), with t advanced by h after
/// each step. Returns steps + 1 states, the first being `val`.
template<typename T>
std::vector<T> solver<T>::euler(T (*f)(T val, double s), double h, int steps, T val, double t0 ) {
    std::vector<T> path(steps + 1);
    path[0] = val;

    double time = t0;
    int step = 0;
    while (step < steps) {
        T& current = path[step];
        path[step + 1] = current + h * f(current, time);
        time += h;
        ++step;
    }
    return path;
}
/// Implicit (backward) Euler. Each step solves
///   y[i+1] = y[i] + h * f(y[i+1], t[i+1])
/// by fixed-point iteration: starting from y[i], the right-hand side is
/// re-evaluated n times with the latest estimate of y[i+1].
///
/// @param f      right-hand side f(state, time)
/// @param h      step size
/// @param steps  number of steps; the result holds steps + 1 states
/// @param val    initial state
/// @param t0     start time
/// @param n      fixed-point iterations per step
///
/// Fix: the inner update used `+=`, which added y[i] + h*f(...) to the
/// previous estimate on every iteration instead of replacing it, so the
/// iterate grew with n rather than converging.
template<typename T>
std::vector<T> solver<T>::back_euler(T (*f)(T val, double s), double h, int steps, T val, double t0, int n) {
    std::vector<T>res;
    res.resize(steps + 1);
    res[0] = val;
    double t = t0;
    for (int i = 0; i < steps; i++) {
        t += h;  // the implicit step evaluates f at the end of the interval
        res[i + 1] = res[i];
        for (int j = 0; j < n; j++) {
            res[i + 1] = res[i] + h * f(res[i + 1], t);
        }
    }
    return res;
}
/// Two-stage midpoint step: a half step k = h * f(y[i], t) / 2 gives the
/// midpoint state k + y[i], and y[i+1] = y[i] + h * f(k + y[i], t + h/2).
/// Returns steps + 1 states, the first being `val`.
template<typename T>
std::vector<T> solver<T>::rk_2(T (*f)(T val, double s), double h, int steps, T val, double t0) {
    std::vector<T> path(steps + 1);
    path[0] = val;

    double time = t0;
    for (int step = 0; step != steps && step < steps; ++step) {
        auto half_increment = h * f(path[step], time) / 2.0;
        auto slope_at_mid = f(half_increment + path[step], time + h / 2.0);
        path[step + 1] = path[step] + h * slope_at_mid;
        time += h;
    }
    return path;
}
/// Classical fourth-order Runge-Kutta step:
///   k1 = f(y, t)
///   k2 = f(y + h/2 * k1, t + h/2)
///   k3 = f(y + h/2 * k2, t + h/2)
///   k4 = f(y + h * k3,   t + h)
///   y_next = y + h * (k1/6 + k2/3 + k3/3 + k4/6)
///
/// @param f      right-hand side f(state, time)
/// @param h      step size
/// @param steps  number of steps; the result holds steps + 1 states
/// @param val    initial state
/// @param t0     start time
///
/// Fix: k2 and k3 were evaluated at t + h/3 and t + 2h/3 while their state
/// arguments were advanced by h/2. With the 1/6, 1/3, 1/3, 1/6 weights both
/// stages belong at t + h/2; the mismatch only shows up when f depends on
/// time.
template<typename T>
std::vector<T> solver<T>::rk_4(T (*f)(T val, double s), double h, int steps, T val, double t0) {
    std::vector<T>res;
    res.resize(steps + 1);
    res[0] = val;
    double t = t0;
    for (int i = 0; i < steps; i++) {
        auto k1 = f(res[i], t);
        auto k2 = f(res[i] + h / 2.0 * k1, t + h / 2.0);
        auto k3 = f(res[i] + h / 2.0 * k2, t + h / 2.0);
        auto k4 = f(res[i] + h * k3, t + h);
        res[i + 1] = res[i] + (k1 / 6 + k2 / 3 + k3 / 3 + k4 / 6) * h;
        t += h;
    }
    return res;
}
point_3d.h
/// Point with three double coordinates plus the arithmetic declared below
/// (point + point, double * point).
///
/// A default constructor (origin) is provided because the solver methods
/// size their result with std::vector<T>::resize, which requires T to be
/// default-constructible; without it solver<point_3d> does not compile.
class point_3d{
    double x;
    double y;
    double z;
public:
    /// Constructs the origin (0, 0, 0).
    point_3d(): x(0.0), y(0.0), z(0.0) {}
    /// Constructs the point (x, y, z).
    point_3d(double x, double y, double z): x(x), y(y), z(z) {}
    double get_x();
    double get_y();
    double get_z();
    /// Component-wise sum.
    point_3d operator+(point_3d other);
    /// Copies the three coordinates of `other`.
    point_3d& operator=(point_3d other);
    /// Scales every coordinate of `p` by `k`.
    friend point_3d operator*(double k, point_3d p);
};
point_3d.cpp
#include "point_3d.h"
double point_3d::get_x() {
return x;
}
double point_3d::get_y() {
return y;
}
double point_3d::get_z() {
return z;
}
/// Component-wise sum of this point and `other`.
point_3d point_3d::operator+(point_3d other) {
    return point_3d(this->x + other.x,
                    this->y + other.y,
                    this->z + other.z);
}
/// Copies the three coordinates of `other` into this point and returns *this.
point_3d& point_3d::operator=(point_3d other) {
    x = other.x;
    y = other.y;
    z = other.z;
    return *this;
}
/// Scales every coordinate of `p` by `k`.
point_3d operator*(double k, point_3d p) {
    return point_3d(k * p.x, k * p.y, k * p.z);
}
You need to include vector library to use std::vector.
#include <vector>
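"Unresolved symbol" means that there is a class or a function declared but not found anywhere in the project (or in the libraries it includes). Here the missing definition is solver<point_3d>::euler: the member functions of a class template must be visible at the point where the template is instantiated, so defining them only in solver.cpp leaves main.cpp with nothing to link against. Move the definitions into solver.h, or explicitly instantiate the template in solver.cpp.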
I have spent countless hours trying to speed up my bilinear interpolation, to no avail. I even tried an SSE version (a double version and a float version), but that was even slower than this version.
Does anyone have any ideas?
/// Bilinear interpolation of the row-major grid `z` (row stride `n`) at the
/// 1-based coordinate (x, y); the result is written to `val`.
///
/// The integer parts of x and y, minus one, select the top-left sample. If
/// that index lies outside [0, nm2] x [0, mm2], `val` is set to `extrapval`.
/// Otherwise `val` is the sum of the four surrounding samples weighted by the
/// fractional parts of x and y.
template <typename T>
__forceinline void interp2_mx(const T& x, const T& y,
                              const T* z,
                              const int32_t& n,
                              const int32_t& mm2,
                              const int32_t& nm2,
                              T& val,
                              const T& extrapval = T(0))
{
    const int64_t x_whole = static_cast<int64_t>(x);
    const int64_t y_whole = static_cast<int64_t>(y);
    const int64_t col = x_whole - 1; // adjust for MATLAB indexing
    const int64_t row = y_whole - 1;

    const bool inside = col >= 0 && col <= nm2 && row >= 0 && row <= mm2;
    if (!inside)
    {
        val = extrapval;
        return;
    }

    const T* top = z + row * n + col;
    const T* bottom = top + n;

    const T fx = x - x_whole;
    const T fy = y - y_whole;
    const T gx = (T)1 - fx;
    const T gy = (T)1 - fy;

    const T w00 = gx * gy * top[0];
    const T w01 = fx * gy * top[1];
    const T w10 = gx * fy * bottom[0];
    const T w11 = fx * fy * bottom[1];
    val = w00 + w01 + w10 + w11;
}
/// Bilinear interpolation of the mz x nz row-major grid `z` at every query
/// point of the mi x ni arrays (xi, yi); results are written to `zi`.
///
/// @param z          source grid, row stride nz
/// @param mz, nz     rows and columns of the source grid
/// @param xi, yi     1-based query coordinates, mi x ni row-major
/// @param mi, ni     rows and columns of the query and output arrays
/// @param zi         output array, mi x ni row-major
/// @param extrapval  value stored for queries outside the grid
///
/// Fixes:
///  - `extrapval` is now forwarded to interp2_mx; it used to be dropped, so
///    out-of-range queries always received T(0) whatever the caller passed.
///  - Row offsets are computed in 64 bits so that m * ni cannot overflow int
///    on large outputs.
template <typename T>
void interp2(const T* z,
             const int32_t& mz, const int32_t& nz,
             const T* xi, const T* yi,
             const int32_t& mi, const int32_t& ni,
             T* zi,
             const T& extrapval = T(0))
{
    const int32_t nzm2 = nz - 2;
    const int32_t mzm2 = mz - 2;
#pragma omp parallel for
    for (int m = 0; m < mi; ++m)
    {
        const int64_t row_offset = static_cast<int64_t>(m) * ni;
        T* line_zi = zi + row_offset;
        const T* x = xi + row_offset;
        const T* y = yi + row_offset;
        for (int n = 0; n < ni; ++n, ++x, ++y, ++line_zi)
        {
            interp2_mx((*x), (*y), z, nz, mzm2, nzm2, (*line_zi), extrapval);
        }
    }
}
Your calculation of xf does a float-to-int64_t-to-float conversion. I assume you know the value is in range, otherwise this would be Undefined Behavior (and mathematically pointless). std::modf() may be the better function as it directly calculates the desired value.
I also think that adjacent pixels have rather related xf & x1mf values, yet you recalculate them. I'm not sure about this as your coordinates seem to be stored indirectly ((*x), (*y)). It may very well be more efficient to calculate those on the fly. Since these pointers may alias the output, they can't be prefetched and the reads will be blocking the memory bus.
I wanted to write my own Newton fractal generator. It uses OpenCL, but that is not the problem: my problem is that, at the moment, only very few pixels converge.
So to explain what I've done so far:
I've selected a function I wanted to use: f(z)=z^4-1 (for testing purposes)
I've calculated the roots of this function: 1+0î, -1+0î, 0+1î, 0-1î
I've written a OpenCL Host and Kernel:
the kernel uses a struct with 4 doubles: re (real), im (imaginary), r (as abs), phi (as argument, polar angle or how you call it)
computes from resolution, zoom and global_work_id the "type" of the pixel and the intensity - where type is the root the newton method is converging to / whether it's diverging
Here's what I get rendered:
Here is the whole kernel:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define pi 3.14159265359
/* Complex number kept in both representations at once: Cartesian (re, im)
 * and polar (r, phi). The create* helpers fill all four fields. */
struct complex {
double im;  /* imaginary part */
double re;  /* real part */
double r;   /* magnitude */
double phi; /* argument (angle) */
};
/* Builds a complex number from magnitude _r and angle _phi and derives the
 * Cartesian fields re = cos(phi) * r, im = sin(phi) * r. */
struct complex createComplexFromPolar(double _r, double _phi){
    struct complex out;
    out.re = cos(_phi)*_r;
    out.im = sin(_phi)*_r;
    out.phi = _phi;
    out.r = _r;
    return out;
}
/* Builds a complex number from Cartesian parts and derives the polar fields.
 *
 * Fix: the angle is computed with atan2(imag, real). atan(imag / real) only
 * covers (-pi/2, pi/2), so every number with a negative real part got the
 * angle of its negation, and real == 0 divided by zero. All polar operations
 * built on phi were therefore wrong in the left half-plane. */
struct complex createComplexFromKarthes(double real, double imag){
    struct complex t;
    t.re = real;
    t.im = imag;
    t.phi = atan2(imag, real);
    t.r = sqrt(t.re*t.re + t.im*t.im);
    return t;
}
/* Rebuilds all four fields from the Cartesian parts (re, im) of t. */
struct complex recreateComplexFromKarthes(struct complex t){
    const struct complex rebuilt = createComplexFromKarthes(t.re, t.im);
    return rebuilt;
}
/* Rebuilds all four fields from the polar parts (r, phi) of t. */
struct complex recreateComplexFromPolar(struct complex t){
    const struct complex rebuilt = createComplexFromPolar(t.r, t.phi);
    return rebuilt;
}
/* Returns z + c, computed on the Cartesian parts. */
struct complex addComplex(const struct complex z, const struct complex c){
    const double sum_re = c.re + z.re;
    const double sum_im = c.im + z.im;
    return createComplexFromKarthes(sum_re, sum_im);
}
/* Returns z - c, computed on the Cartesian parts. */
struct complex subComplex(const struct complex z, const struct complex c){
    const double diff_re = z.re - c.re;
    const double diff_im = z.im - c.im;
    return createComplexFromKarthes(diff_re, diff_im);
}
/* Returns z + n for a real scalar n.
 *
 * Fix: the imaginary part is now carried over from z. It used to be left
 * uninitialized in the temporary, so the result had an indeterminate
 * imaginary part, magnitude and angle. */
struct complex addComplexScalar(const struct complex z, const double n){
    return createComplexFromKarthes(z.re + n, z.im);
}
/* Returns z - n for a real scalar n.
 *
 * Fix: the imaginary part is now carried over from z. It used to be left
 * uninitialized in the temporary, so the result had an indeterminate
 * imaginary part, magnitude and angle. */
struct complex subComplexScalar(const struct complex z, const double n){
    return createComplexFromKarthes(z.re - n, z.im);
}
/* Returns z scaled by the real factor n. */
struct complex multComplexScalar(const struct complex z, const double n) {
    const double scaled_re = z.re * n;
    const double scaled_im = z.im * n;
    return createComplexFromKarthes(scaled_re, scaled_im);
}
/* Returns z * c in polar form: magnitudes multiply, angles add. */
struct complex multComplex(const struct complex z, const struct complex c) {
    const double magnitude = z.r*c.r;
    const double angle = z.phi + c.phi;
    return createComplexFromPolar(magnitude, angle);
}
/* Returns z raised to the integer power i (i >= 1) by repeated
 * multiplication; for i <= 1 the result is z itself.
 *
 * Fix: the loop ran i times starting from t = z, which produced z^(i+1).
 * It now performs i - 1 multiplications. */
struct complex powComplex(const struct complex z, int i){
    struct complex t = z;
    for (int j = 0; j < i - 1; j++){
        t = multComplex(t, z);
    }
    return t;
}
/* Returns z / c in polar form: magnitudes divide, angles subtract.
 * There is no guard for c.r == 0. */
struct complex divComplex(const struct complex z, const struct complex c) {
    const double magnitude = z.r / c.r;
    const double angle = z.phi - c.phi;
    return createComplexFromPolar(magnitude, angle);
}
/* True when the real and imaginary parts of z - c are both within comp of
 * zero in absolute value. */
bool compComplex(const struct complex z, const struct complex c, float comp){
    const struct complex delta = subComplex(z, c);
    return fabs(delta.re) <= comp && fabs(delta.im) <= comp;
}
/* One work-item per pixel. The pixel is mapped to a start value z, then
 *   z <- z - (powComplex(z,4) - 1) / (4 * powComplex(z,3))
 * is iterated until z comes within 0.05 of x1, x2 or x3, |z| reaches 10000,
 * or 30000 iterations have run.
 *
 * resType codes written: 0/1/2 = close to x1/x2/x3, 3 = loop ended without a
 * match, 4 = |z| >= 10000. result receives the iteration count.
 * x4, zo and c are constructed but never used below. */
__kernel void newtonFraktal(__global const int* res, __global const int* zoom, __global int* offset, __global const double* param, __global int* result, __global int* resType){
    const int x = get_global_id(0) + offset[0];
    const int y = get_global_id(1) + offset[1];
    const int xRes = res[0];
    const int yRes = res[1];
    const int pixel = x + xRes * y;

    /* Start value: zoom divided by the pixel offset from the image centre. */
    const double a = (x - (xRes / 2)) == 0 ? 0 : (double)(zoom[0] / (x - (double)(xRes / 2)));
    const double b = (y - (yRes / 2)) == 0 ? 0 : (double)(zoom[1] / (y - (double)(yRes / 2)));

    struct complex z = createComplexFromKarthes(a, b);
    struct complex zo = z;
    struct complex c = createComplexFromKarthes(param[0], param[1]);

    struct complex x1 = createComplexFromKarthes(1,0);
    struct complex x2 = createComplexFromKarthes(-1, 0);
    struct complex x3 = createComplexFromKarthes(0, 1);
    struct complex x4 = createComplexFromKarthes(0, -1);

    resType[pixel] = 3;
    int i = 0;
    while (i < 30000 && fabs(z.r) < 10000){
        const struct complex numerator = subComplexScalar(powComplex(z, 4), 1);
        const struct complex denominator = multComplexScalar(powComplex(z, 3), 4);
        z = subComplex(z, divComplex(numerator, denominator));
        i++;

        int matched = -1;
        if (compComplex(z, x1, 0.05)){
            matched = 0;
        } else if (compComplex(z, x2, 0.05)){
            matched = 1;
        } else if (compComplex(z, x3, 0.05)){
            matched = 2;
        }
        if (matched >= 0){
            resType[pixel] = matched;
            break;
        }
    }

    if (fabs(z.r) >= 10000){
        resType[pixel] = 4;
    }
    result[pixel] = i;
}
And here is the coloration of the image:
// Colours every pixel from the kernel output: `typeRes` selects the colour
// channel(s) and `result` (the iteration count) sets the intensity.
// Both buffers are indexed with xRes as the row stride.
const int xRes = core->getXRes();
const int yRes = core->getYRes();
for (int y = 0; y < fraktal->getHeight(); y++){
    for (int x = 0; x < fraktal->getWidth(); x++){
        const int cell = x + y * xRes;
        int conDiv = genCL->result[cell];
        int type = genCL->typeRes[cell];
        switch (type){
        case 0:
            //converging to x1
            fraktal->setPixel(x, y, 1*conDiv, 1*conDiv, 0, 1);
            break;
        case 1:
            //converging to x2
            fraktal->setPixel(x, y, 0, 0, 1*conDiv, 1);
            break;
        case 2:
            //converging to x3
            fraktal->setPixel(x, y, 0, 1*conDiv, 0, 1);
            break;
        case 3:
            //diverging and interrupted by loop end
            fraktal->setPixel(x, y, 1*conDiv, 0, 0, 1);
            break;
        default:
            //diverging and interrupted by z.r > 10000
            fraktal->setPixel(x, y, 1, 1, 1, 0.1*conDiv);
            break;
        }
    }
}
I had some mistakes in the complex number computations but I check everything today again and again.. I think they should be okay now.. but what else could be the reason that there are just this few start values converging? Did I do something wrong with newton's method?
Thanks for all your help!!
Well, it really helped to run the code as normal C code, as this makes debugging easier. The problem turned out to be a few coding mistakes, which I have now been able to fix: for example, my pow function was broken, and when adding or subtracting a scalar I forgot to set the imaginary part of the temporary complex number. So here is my final OpenCL kernel:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define pi 3.14159265359
/* Complex number kept in both representations at once: Cartesian (re, im)
 * and polar (r, phi). The create* helpers fill all four fields. */
struct complex {
double im;  /* imaginary part */
double re;  /* real part */
double r;   /* magnitude */
double phi; /* argument (angle) */
};
/* Builds a complex number from magnitude _r and angle _phi and derives the
 * Cartesian fields re = r * cos(phi), im = r * sin(phi). */
struct complex createComplexFromPolar(double _r, double _phi){
    struct complex out;
    out.re = _r*cos(_phi);
    out.im = _r*sin(_phi);
    out.r = _r;
    out.phi = _phi;
    return out;
}
/* Builds a complex number from Cartesian parts and derives the polar fields:
 * phi = atan2(imag, real), r = sqrt(re^2 + im^2). */
struct complex createComplexFromKarthes(double real, double imag){
    struct complex out;
    out.re = real;
    out.im = imag;
    out.r = sqrt(out.re*out.re + out.im*out.im);
    out.phi = atan2(imag, real);
    return out;
}
/* Rebuilds all four fields from the Cartesian parts (re, im) of t. */
struct complex recreateComplexFromKarthes(struct complex t){
    const struct complex rebuilt = createComplexFromKarthes(t.re, t.im);
    return rebuilt;
}
/* Rebuilds all four fields from the polar parts (r, phi) of t. */
struct complex recreateComplexFromPolar(struct complex t){
    const struct complex rebuilt = createComplexFromPolar(t.r, t.phi);
    return rebuilt;
}
/* Returns z + c, computed on the Cartesian parts. */
struct complex addComplex(const struct complex z, const struct complex c){
    const double sum_re = c.re + z.re;
    const double sum_im = c.im + z.im;
    return createComplexFromKarthes(sum_re, sum_im);
}
/* Returns z - c, computed on the Cartesian parts. */
struct complex subComplex(const struct complex z, const struct complex c){
    const double diff_re = z.re - c.re;
    const double diff_im = z.im - c.im;
    return createComplexFromKarthes(diff_re, diff_im);
}
/* Returns z + n for a real scalar n; the imaginary part is unchanged. */
struct complex addComplexScalar(const struct complex z, const double n){
    const double shifted_re = z.re + n;
    return createComplexFromKarthes(shifted_re, z.im);
}
/* Returns z - n for a real scalar n; the imaginary part is unchanged. */
struct complex subComplexScalar(const struct complex z, const double n){
    const double shifted_re = z.re - n;
    return createComplexFromKarthes(shifted_re, z.im);
}
/* Returns z scaled by the real factor n. */
struct complex multComplexScalar(const struct complex z, const double n){
    const double scaled_re = z.re * n;
    const double scaled_im = z.im * n;
    return createComplexFromKarthes(scaled_re, scaled_im);
}
/* Returns z * c using the Cartesian product formula
 * (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
 * The polar alternative, createComplexFromPolar(z.r*c.r, z.phi + c.phi),
 * is not used. */
struct complex multComplex(const struct complex z, const struct complex c) {
    const double prod_re = z.re*c.re-z.im*c.im;
    const double prod_im = z.re*c.im+z.im*c.re;
    return createComplexFromKarthes(prod_re, prod_im);
}
/* Returns z raised to the integer power i by i - 1 successive
 * multiplications; for i <= 1 the result is z itself. */
struct complex powComplex(const struct complex z, int i){
    struct complex acc = z;
    int remaining = i - 1;
    while (remaining > 0){
        acc = multComplex(acc, z);
        --remaining;
    }
    return acc;
}
/* Returns z / c in polar form: magnitudes divide, angles subtract.
 * There is no guard for c.r == 0. */
struct complex divComplex(const struct complex z, const struct complex c) {
    const double magnitude = z.r / c.r;
    const double angle = z.phi-c.phi;
    return createComplexFromPolar(magnitude, angle);
}
/* True when z and c differ by at most comp in both the real and the
 * imaginary part. */
bool compComplex(const struct complex z, const struct complex c, float comp){
    const bool re_close = fabs(z.re - c.re) <= comp;
    const bool im_close = fabs(z.im - c.im) <= comp;
    return re_close && im_close;
}
/* Newton fractal for f(z) = z^4 + 1, one work-item per pixel.
 *
 * The pixel offset from the image centre, divided by zoom, is the start
 * value z. Newton's step z <- z - f(z) / f'(z) is iterated until z is within
 * 1e-7 of one of the four roots (+-0.7071068 +- 0.7071068i), |z| reaches the
 * divergence limit, or 6000 iterations have run.
 *
 * resType codes written: 0..3 = converged to x1..x4, 10 = diverged,
 * 11 = iteration limit reached. result receives the iteration count.
 * `param` is not read.
 *
 * Fixes:
 *  - f'(z) = 4 z^3. The derivative was scaled by 3, which makes every step
 *    4/3 of the Newton step (an over-relaxed iteration, not Newton's
 *    method).
 *  - The post-loop divergence check used 1000 while the loop condition used
 *    10000; both now use the same limit. */
__kernel void newtonFraktal(__global const int* res, __global const int* zoom, __global int* offset, __global const double* param, __global int* result, __global int* resType){
    const double divergence_limit = 10000;

    const int x = get_global_id(0) + offset[0];
    const int y = get_global_id(1) + offset[1];
    const int xRes = res[0];
    const int yRes = res[1];
    const int pixel = x + xRes * y;

    const double a = (x - (xRes / 2)) == 0 ? 0 : (double)((x - (double)(xRes / 2)) / zoom[0]);
    const double b = (y - (yRes / 2)) == 0 ? 0 : (double)((y - (double)(yRes / 2)) / zoom[1]);
    struct complex z = createComplexFromKarthes(a, b);

    struct complex x1 = createComplexFromKarthes(0.7071068, 0.7071068);
    struct complex x2 = createComplexFromKarthes(0.7071068, -0.7071068);
    struct complex x3 = createComplexFromKarthes(-0.7071068, 0.7071068);
    struct complex x4 = createComplexFromKarthes(-0.7071068, -0.7071068);
    struct complex f, d;

    resType[pixel] = 11;
    int i = 0;
    while (i < 6000 && fabs(z.r) < divergence_limit){
        f = addComplexScalar(powComplex(z, 4), 1);    /* f(z)  = z^4 + 1 */
        d = multComplexScalar(powComplex(z, 3), 4);   /* f'(z) = 4 z^3   */
        z = subComplex(z, divComplex(f, d));
        i++;
        if (compComplex(z, x1, 0.0000001)){
            resType[pixel] = 0;
            break;
        } else if (compComplex(z, x2, 0.0000001)){
            resType[pixel] = 1;
            break;
        } else if (compComplex(z, x3, 0.0000001)){
            resType[pixel] = 2;
            break;
        } else if (compComplex(z, x4, 0.0000001)){
            resType[pixel] = 3;
            break;
        }
    }
    if (fabs(z.r) >= divergence_limit){
        resType[pixel] = 10;
    }
    result[pixel] = i;
}
hope it might help someone someday.. :)