I'm using CppAD with Ipopt. In some places I have to typecast the AD format to its base format that is double. I have used CppAD::Value(AD var) it gives me output in double format.
But considering line no.27 the code is fine during compile time. Its showing the following error in runtime. The error is :
cppad-20160000.1 error from a known source:
Value: argument is a variable (not a parameter)
Error detected by false result for
Parameter(x)
at line 89 in the file
/usr/include/cppad/local/value.hpp
I have attached the code I'm running for verification. Kindly help. please install Ipopt before running the code.
The type of value is still CppAD::AD . But the Value() is not working.
#include <cppad/ipopt/solve.hpp>
#include <iostream>
namespace {
using CppAD::AD;
class FG_eval {
public:
typedef CPPAD_TESTVECTOR( AD<double> ) ADvector;
void operator()(ADvector& fg, const ADvector& x)
{ assert( fg.size() == 3 );
assert( x.size() == 4 );
// Fortran style indexing
AD<double> x1 = x[0];
AD<double> x2 = x[1];
AD<double> x3 = x[2];
AD<double> x4 = x[3];
// f(x)
fg[0] = x1 * x4 * (x1 + x2 + x3) + x3;
// g_1 (x)
fg[1] = x1 * x2 * x3 * x4;
// g_2 (x)
fg[2] = x1 * x1 + x2 * x2 + x3 * x3 + x4 * x4;
// help on this line
std::cout << CppAD::Value(x1) << std::endl;
//
return;
}
};
}
bool get_started()
{ bool ok = true;
size_t i;
typedef CPPAD_TESTVECTOR( double ) Dvector;
// number of independent variables (domain dimension for f and g)
size_t nx = 4;
// number of constraints (range dimension for g)
size_t ng = 2;
// initial value of the independent variables
Dvector xi(nx);
xi[0] = 1.0;
xi[1] = 5.0;
xi[2] = 5.0;
xi[3] = 1.0;
// lower and upper limits for x
Dvector xl(nx), xu(nx);
for(i = 0; i < nx; i++)
{ xl[i] = 1.0;
xu[i] = 5.0;
}
// lower and upper limits for g
Dvector gl(ng), gu(ng);
gl[0] = 25.0; gu[0] = 1.0e19;
gl[1] = 40.0; gu[1] = 40.0;
// object that computes objective and constraints
FG_eval fg_eval;
// options
std::string options;
// turn off any printing
options += "Integer print_level 0\n";
options += "String sb yes\n";
// maximum number of iterations
options += "Integer max_iter 10\n";
// approximate accuracy in first order necessary conditions;
// see Mathematical Programming, Volume 106, Number 1,
// Pages 25-57, Equation (6)
options += "Numeric tol 1e-6\n";
// derivative testing
options += "String derivative_test second-order\n";
// maximum amount of random pertubation; e.g.,
// when evaluation finite diff
options += "Numeric point_perturbation_radius 0.\n";
// place to return solution
CppAD::ipopt::solve_result<Dvector> solution;
// solve the problem
CppAD::ipopt::solve<Dvector, FG_eval>(
options, xi, xl, xu, gl, gu, fg_eval, solution
);
//
// Check some of the solution values
//
ok &= solution.status == CppAD::ipopt::solve_result<Dvector>::success;
//
double check_x[] = { 1.000000, 4.743000, 3.82115, 1.379408 };
double check_zl[] = { 1.087871, 0., 0., 0. };
double check_zu[] = { 0., 0., 0., 0. };
double rel_tol = 1e-6; // relative tolerance
double abs_tol = 1e-6; // absolute tolerance
for(i = 0; i < nx; i++)
{ ok &= CppAD::NearEqual(
check_x[i], solution.x[i], rel_tol, abs_tol
);
ok &= CppAD::NearEqual(
check_zl[i], solution.zl[i], rel_tol, abs_tol
);
ok &= CppAD::NearEqual(
check_zu[i], solution.zu[i], rel_tol, abs_tol
);
}
return ok;
}
int main()
{
get_started();
return 0;
}
cmakelists.txt
cmake_minimum_required(VERSION 3.5)
project(jnk)
add_compile_options(-std=c++11)
add_executable(${PROJECT_NAME} testCppAD.cpp)
target_link_libraries(${PROJECT_NAME}
ipopt z uv
)
I found out the answer. We have to use an inbuilt function CppAD::Var2Par() before using CppAD::Value(). Will work.
Related
I'm new in the world of C++ and I'm having some trouble with the boost library. In my problem I want to solve a ODE-System with 5 equations. It isn't a stiff problem. As iterative method I used both integreate(rhs, x0, t0, tf, size_step, write_output) and integreate_adaptive(stepper, sys, x0, t0, tf, size_step, write_output). Both these method actually integrate the equations but giving me non-sense results changing the size of the step from 0.001 to 5 almost randomly. The equations and data are correct. What can I do to fix this problem? Here is the code:
#include <iostream>
#include <vector>
#include <boost/numeric/odeint.hpp>
#include <fstream>
#include <boost/array.hpp>
using namespace std;
using namespace boost::numeric::odeint;
//DATA
double Lin = 20000; // kg/h
double Gdry = 15000; // kg/h
double P = 760; // mmHg
double TinH2O = 50; // °C
double ToutH2O = 25; // °C
double Tinair = 20; // °C
double Z = 0.5; // relative humidity
double Cu = 0.26; // kcal/kg*K
double CpL = 1; // kcal/kg*K
double DHev = 580; // kcal/kg
double hga = 4000; // kcal/m/h/K
double hla = 30000; // kcal/m/h/K
double A = -49.705; // Pev 1st coeff mmHg vs °C
double B = 2.71; // Pev 2nd coeff mmHg vs °C
double Usair = 0.62*(A + B*Tinair) / P;
double Uair = Z*Usair;
double Kua = hga / Cu;
double L0 = 19292; // kg/h
typedef vector< double > state_type;
vector <double> pack_height;
vector <double> Umidity;
vector <double> T_liquid;
vector <double> T_gas;
vector <double> Liquid_flow;
vector <double> Gas_flow;
void rhs(const state_type& x , state_type& dxdt , const double z )
{// U Tl Tg L G
double Ti = (hla*x[1] + hga*x[2] + Kua*DHev*(x[0] - 0.62*A / P)) / (hla + hga + Kua*DHev*0.62*B / P);
double Ui = 0.62*(A + B*Ti) / P;
dxdt[0] = Kua*(Ui - x[0]) / Gdry / 100;
dxdt[1] = hla*(x[1] - Ti) / x[3] / CpL / 100;
dxdt[2] = hga*(Ti - x[2]) / Gdry / Cu / 100;
dxdt[3] = Kua*(Ui - x[0]) / 100;
dxdt[4] = Kua*(Ui - x[0]) / 100;
}
void write_output(const state_type& x, const double z)
{
pack_height.push_back(z);
Umidity.push_back(x[0]);
T_liquid.push_back(x[1]);
T_gas.push_back(x[2]);
Liquid_flow.push_back(x[3]);
Gas_flow.push_back(x[4]);
cout << z << " " << x[0] << " " << x[1] << " " << x[2] << " " << x[3] << " " << x[4] << endl;
}
int main()
{
state_type x(5);
x[0] = Uair;
x[1] = ToutH2O;
x[2] = Tinair;
x[3] = L0;
x[4] = Gdry;
double z0 = 0.0;
double zf = 5.5;
double stepsize = 0.001;
integrate( rhs , x , z0 , zf , stepsize , write_output );
return 0;
}
And this is the final results that i get from the prompt:
0 0.00183349 25 20 19292 15000
0.001 0.00183356 25 20 19292 15000
0.0055 0.0018339 25.0002 20.0001 19292 15000
0.02575 0.00183542 25.001 20.0007 19292 15000
0.116875 0.00184228 25.0046 20.003 19292.1 15000.1
0.526938 0.00187312 25.0206 20.0135 19292.6 15000.6
2.37222 0.00201203 25.0928 20.0608 19294.7 15002.7
5.5 0.00224788 25.2155 20.142 19298.2 15006.2
Only the first iteration has the right-asked stepsize.. and obiviously the solution is not the right one.. what can i do? Thank you in advance. :)
If you read the documentation, then you will find that the constant step-size routines are integrate_const and integrate_n_steps, or possibly integrate_adaptive with a non-controlled stepper.
The short call to integrate uses the standard dopri5 stepper with adaptive step size, so that the changing step size is no surprise. You could possibly use the dense output of the stepper to interpolate values at equidistant times.
I am looking for for a fast-SSE-low-precision (~1e-3) exponential function.
I came across this great answer:
/* max. rel. error = 3.55959567e-2 on [-87.33654, 88.72283] */
__m128 FastExpSse (__m128 x)
{
__m128 a = _mm_set1_ps (12102203.0f); /* (1 << 23) / log(2) */
__m128i b = _mm_set1_epi32 (127 * (1 << 23) - 298765);
__m128i t = _mm_add_epi32 (_mm_cvtps_epi32 (_mm_mul_ps (a, x)), b);
return _mm_castsi128_ps (t);
}
Based on the work of Nicol N. Schraudolph: N. N. Schraudolph. "A fast, compact approximation of the exponential function." Neural Computation, 11(4), May 1999, pp.853-862.
Now I would need a "double precision" version: __m128d FastExpSSE (__m128d x).
This is because I don't control the input and output precision, which happen to be double precision, and the two conversions double -> float, then float -> double is eating 50% of the CPU resources.
What changes would be needed?
I naively tried this:
__m128i double_to_uint64(__m128d x) {
x = _mm_add_pd(x, _mm_set1_pd(0x0010000000000000));
return _mm_xor_si128(
_mm_castpd_si128(x),
_mm_castpd_si128(_mm_set1_pd(0x0010000000000000))
);
}
__m128d FastExpSseDouble(__m128d x) {
#define S 52
#define C (1llu << S) / log(2)
__m128d a = _mm_set1_pd(C); /* (1 << 52) / log(2) */
__m128i b = _mm_set1_epi64x(127 * (1llu << S) - 298765llu << 29);
auto y = double_to_uint64(_mm_mul_pd(a, x));
__m128i t = _mm_add_epi64(y, b);
return _mm_castsi128_pd(t);
}
Of course this returns garbage as I don't know what I'm doing...
edit:
About the 50% factor, it is a very rough estimation, comparing the speedup (with respect to std::exp) converting a vector of single precision numbers (great) to the speedup with a list of double precision numbers (not so great).
Here is the code I used:
// gives the result in place
void FastExpSseVector(std::vector<double> & v) { //vector with several millions elements
const auto I = v.size();
const auto N = (I / 4) * 4;
for (int n = 0; n < N; n += 4) {
float a[4] = { float(v[n]), float(v[n + 1]), float(v[n + 2]), float(v[n + 3]) };
__m128 x;
x = _mm_load_ps(a);
auto r = FastExpSse(x);
_mm_store_ps(a, r);
v[n] = a[0];
v[n + 1] = a[1];
v[n + 2] = a[2];
v[n + 3] = a[3];
}
for (int n = N; n < I; ++n) {
v[n] = FastExp(v[n]);
}
}
And here is what I would do if I had this "double precision" version:
void FastExpSseVectorDouble(std::vector<double> & v) {
const auto I = v.size();
const auto N = (I / 2) * 2;
for (int n = 0; n < N; n += 2) {
__m128d x;
x = _mm_load_pd(&v[n]);
auto r = FastExpSseDouble(x);
_mm_store_pd(&v[n], r);
}
for (int n = N; n < I; ++n) {
v[n] = FastExp(v[n]);
}
}
Something like this should do the job. You need to tune the 1.05 constant to get a lower maximal error -- I'm too lazy to do that:
__m128d fastexp(const __m128d &x)
{
__m128d scaled = _mm_add_pd(_mm_mul_pd(x, _mm_set1_pd(1.0/std::log(2.0)) ), _mm_set1_pd(3*1024.0-1.05));
return _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(scaled), 11));
}
This just gets about 2.5% relative precision -- for better precision you may need to add a second term.
Also, for values which overflow or underflow this will result in unspecified values, you can avoid this by clamping the scaled value to some values.
I have a program that solves generally for 1D brownian motion using an Euler's Method.
Being a stochastic process, I want to average it over many particles. But I find that as I ramp up the number of particles, it overloads and i get the std::badalloc error, which I understand is a memory error.
Here is my full code
#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>
#include <cstdlib>
#include <limits>
#include <ctime>
using namespace std;
// Box-Muller Method to generate gaussian numbers
double generateGaussianNoise(double mu, double sigma) {
const double epsilon = std::numeric_limits<double>::min();
const double tau = 2.0 * 3.14159265358979323846;
static double z0, z1;
static bool generate;
generate = !generate;
if (!generate) return z1 * sigma + mu;
double u1, u2;
do {
u1 = rand() * (1.0 / RAND_MAX);
u2 = rand() * (1.0 / RAND_MAX);
} while (u1 <= epsilon);
z0 = sqrt(-2.0 * log(u1)) * cos(tau * u2);
z1 = sqrt(-2.0 * log(u1)) * sin(tau * u2);
return z0 * sigma + mu;
}
int main() {
// Initialize Variables
double gg; // Gaussian Number Picked from distribution
// Integrator
double t0 = 0; // Setting the Time Window
double tf = 10;
double n = 5000; // Number of Steps
double h = (tf - t0) / n; // Time Step Size
// Set Constants
const double pii = atan(1) * 4; // pi
const double eta = 1; // viscous constant
const double m = 1; // mass
const double aa = 1; // radius
const double Temp = 30; // Temperature in Kelvins
const double KB = 1; // Boltzmann Constant
const double alpha = (6 * pii * eta * aa);
// More Constants
const double mu = 0; // Gaussian Mean
const double sigma = 1; // Gaussian Std Deviation
const double ng = n; // No. of pts to generate for Gauss distribution
const double npart = 1000; // No. of Particles
// Initial Conditions
double x0 = 0;
double y0 = 0;
double t = t0;
// Vectors
vector<double> storX; // Vector that keeps displacement values
vector<double> storY; // Vector that keeps velocity values
vector<double> storT; // Vector to store time
vector<double> storeGaussian; // Vector to store Gaussian numbers generated
vector<double> holder; // Placeholder Vector for calculation operations
vector<double> mainstore; // Vector that holds the final value desired
storT.push_back(t0);
// Prepares mainstore
for (int z = 0; z < (n+1); z++) {
mainstore.push_back(0);
}
for (int NN = 0; NN < npart; NN++) {
holder.clear();
storX.clear();
storY.clear();
storT.clear();
storT.push_back(0);
// Prepares holder
for (int z = 0; z < (n+1); z++) {
holder.push_back(0);
storX.push_back(0);
storY.push_back(0);
}
// Gaussian Generator
srand(time(NULL));
for (double iiii = 0; iiii < ng; iiii++) {
gg = generateGaussianNoise(0, 1); // generateGaussianNoise(mu,sigma)
storeGaussian.push_back(gg);
}
// Solver
for (int ii = 0; ii < n; ii++) {
storY[ii + 1] =
storY[ii] - (alpha / m) * storY[ii] * h +
(sqrt(2 * alpha * KB * Temp) / m) * sqrt(h) * storeGaussian[ii];
storX[ii + 1] = storX[ii] + storY[ii] * h;
holder[ii + 1] =
pow(storX[ii + 1], 2); // Finds the displacement squared
t = t + h;
storT.push_back(t);
}
// Updates the Main Storage
for (int z = 0; z < storX.size(); z++) {
mainstore[z] = mainstore[z] + holder[z];
}
}
// Average over the number of particles
for (int z = 0; z < storX.size(); z++) {
mainstore[z] = mainstore[z] / (npart);
}
// Outputs the data
ofstream fout("LangevinEulerTest.txt");
for (int jj = 0; jj < storX.size(); jj++) {
fout << storT[jj] << '\t' << mainstore[jj] << '\t' << storX[jj] << endl;
}
return 0;
}
As you can see, npart is the variable that I change to vary the number of particles. But after each iteration, I do clear my storage vectors like storX,storY... So on paper, the number of particles should not affect memory? I am only just calling the compiler to repeat many more times, and add onto the main storage vector mainstore. I am running my code on a computer with 4GB ram.
Would greatly appreciate it if anyone could point out my errors in logic or suggest improvements.
Edit: Currently the number of particles is set to npart = 1000.
So when I try to ramp it up to like npart = 20000 or npart = 50000, it gives me memory errors.
Edit2 I've edited the code to allocate an extra index to each of the storage vectors. But it does not seem to fix the memory overflow
There is an out of bounds exception in the solver part. storY has size n and you access ii+1 where i goes up to n-1. So for your code provided. storY has size 5000. It is allowed to access with indices between 0 and 4999 (including) but you try to access with index 5000. The same for storX, holder and mainstore.
Also, storeGaussian does not get cleared before adding new variables. It grows by n for each npart loop. You access only the first n values of it in the solver part anyway.
Please note, that vector::clear removes all elements from the vector, but does not necessarily change the vector's capacity (i.e. it's storage array), see the documentation.
This won't cause the problem here, because you'll reuse the same array in the next runs, but it's something to be aware when using vectors.
I'm fairly new with C++ and I'm trying to use it via Rcpp to speed up my R code.
The below code integrates from from t0 to t1- this is done in the "lorenz" function. Test4 integrates using "lorenz" "counts" number of times. However at time "t1" the state of the system is modified in "write_lorenz" before the system is rerun and this is where the problem is. If I run the same program over and over again by calling test4 from R, printing to the screen always produces the same result, however, my returned matrix "u" does not, and seems to eventually converge to whatever "t1" is which is the problem.
My input values don't change so I'm wondering if there a memory leak, or if something else is going on, how to fix it.
Also I'm wondering if my initialization of "u" is incorrect and I should be using the "new" command.
What I tried was
NumericMatrix* u = NULL;
*u = new NumericMatrix;
and then I tried accessing the elements of the matrix as *u(1,2) for instance, but accessing the elements this way caused an error saying u was not a function.
Any help would be greatly appreciated
I modified this code from the following site
http://headmyshoulder.github.io/odeint-v2/examples.html
so I could use it with Rcpp
//###################################R Code ###############################
library(Rcpp)
sourceCpp("test4.cpp")
sigma <- 10.0
R <-28.0
b <- 8.0 / 3.0
a2 <- c(10.0 , 1.0 , 1.0) #initial conditions X0,I0,Y0
L2 <- c(0.0 , 2.0 , 0.1) #initial time, kick time, error
counts <- 2
kick <-1.0; # kick size
pars <-c(sigma,R,b,kick)
test4(a=a,L2=L2,counts=counts,pars= pars)
// C ++ code
//[[Rcpp::depends(BH)]]
//[[Rcpp::depends(RcppEigen)]]
//[[Rcpp::plugins("cpp11")]]
#include <Rcpp.h>
#include <RcppEigen.h>
#include <math.h>
#include <boost/array.hpp>
#include <boost/numeric/odeint.hpp>
#include <boost/algorithm/string.hpp>
using namespace boost::numeric::odeint;
using namespace std;
using namespace Rcpp;
using namespace Eigen;
double sigma =0;
double e =0;
double b =0 ;
double t0 =0;
double t1 = 0;
double kick =0;
double X0 = 0;
double I0 =0;
double Y0 =0;
double N = 0;
int counter =0;
typedef boost::array< double , 3 > state_type;
NumericMatrix u(4,5);
void lorenz( const state_type &x , state_type &dxdt , double t )
{
dxdt[0] = sigma * ( x[1] - x[0] );
dxdt[1] = e * x[0] - x[1] - x[0] * x[2];
dxdt[2] = -b * x[2] + x[0] * x[1];
}
void write_lorenz( const state_type &x , const double t )
{
if(t==t1){
X0 = x[0];
I0 = x[1];
Y0 = x[2];
N = X0+I0+Y0;
X0 = X0 + exp(kick*N);
counter++;
//for some reason cout and u don't match for multiple runs of the
//program
cout << t << '\t' << X0 << '\t' << I0 << '\t' << Y0 << endl;
u(counter,0) = t;
u(counter,1) = X0;
u(counter,2) = I0;
u(counter,3) = Y0;
}
}
// [[Rcpp::export]]
NumericMatrix test4(NumericVector a, NumericVector L2,int counts,NumericVector pars)
{
double error; // control integration error
// initialize model parameters
//maybe these should be parenthesis?
sigma = pars[0]; //# average per capita birh rate 1-10(the mean is 4.7)
e = pars[1]; // # per capita average growth rate
b= pars[2];
kick = pars[3]; // kick size
//cout << sigma << R << b << kick << endl;
//myfile.open (ST);
X0 = a[0]; I0 =a[1]; Y0 = a[2];
int i = 0;
t0 = L2[0];
t1 = L2[1];
error = L2[2];
u(0,0) = t0;
u(0,1) = X0;
u(0,2) = I0;
u(0,3) = Y0;
// initial conditions
for(i=0;i<counts;i++)
{
state_type x = { X0 , I0 , Y0 };
integrate( lorenz , x , t0 , t1 , error , write_lorenz );
}
return u; // the variable I hope will be global
}
Here is a simple adaptation of the pure C++ file you linked to. We simply define a struct of three vectors into which we push the elements of each iterations--as opposed to printing them to standard output.
For data structures that grow, we prefer C++ standard library types (as our types have to be like R types, their internals do not match well to increasing one-by-one which is expensive for them). So at the end, we just copy into an R data.frame.
#include <boost/array.hpp>
#include <boost/numeric/odeint.hpp>
#include <Rcpp.h>
// [[Rcpp::depends(BH)]]
// [[Rcpp::plugins(cpp11)]]
using namespace std;
using namespace boost::numeric::odeint;
const double sigma = 10.0, r = 28.0, b = 8.0 / 3.0;
typedef boost::array< double , 3 > state_type;
void lorenz( const state_type &x , state_type &dxdt , double t ) {
dxdt[0] = sigma * ( x[1] - x[0] );
dxdt[1] = r * x[0] - x[1] - x[0] * x[2];
dxdt[2] = -b * x[2] + x[0] * x[1];
}
struct foo { std::vector<double> a, b, c; };
struct foo f;
void append_lorenz(const state_type &x , const double t ) {
f.a.push_back(x[0]);
f.b.push_back(x[1]);
f.c.push_back(x[2]);
}
using namespace Rcpp;
// [[Rcpp::export]]
DataFrame callMain() {
state_type x = { 10.0 , 1.0 , 1.0 }; // initial conditions
integrate( lorenz , x , 0.0 , 25.0 , 0.1 , append_lorenz );
return DataFrame::create(Named("a") = f.a,
Named("b") = f.b,
Named("c") = f.c);
}
/*** R
res <- callMain()
print(head(res))
*/
Here is the output of the R code (intentially limited to fist few rows):
R> sourceCpp("/tmp/lorenz.cpp")
R> res <- callMain()
R> print(head(res))
a b c
1 10.00000 1.00000 1.00000
2 9.40816 2.99719 1.12779
3 8.92164 5.35684 1.46991
4 8.68193 7.82671 2.05762
5 8.73730 10.42718 2.94783
6 9.11080 13.10452 4.18849
R>
I managed to get my sqrt function to run perfectly, but I'm second guessing if I wrote this code correctly based on the pseudo code I was given.
Here is the pseudo code:
x = 1
repeat 10 times: x = (x + n / x) / 2
return x.
The code I wrote,
#include <iostream>
#include <math.h>
using namespace std;
double my_sqrt_1(double n)
{
double x= 1; x<10; ++x;
return (x+n/x)/2;
}
No, your code is not following your pseudo-code. For example, you're not repeating anything in your code. You need to add a loop to do that:
#include <iostream>
#include <math.h>
using namespace std;
double my_sqrt_1(double n)
{
double x = 1;
for(int i = 0; i < 10; ++i) // repeat 10 times
x = (x+n/x)/2;
return x;
}
Let's analyze your code:
double x = 1;
// Ok, x set to 1
x < 10;
// This is true, as 1 is less than 10, but it is not used anywhere
++x;
// Increment x - now x == 2
return (x + n / x) / 2
// return value is always (2 + n / 2) / 2
As you don't have any loop, function will always exit in the first "iteration" with the return value (2 + n / 2) / 2.
Just as another approach that you can use binary search or the another pretty elegant solution is to use the Newton's method.
Newton's method is a method for finding roots of a function, making use of a function's derivative. At each step, a value is calculated as: x(step) = x(step-1) - f(x(step-1))/f'(x(step-1)) Newton's_method
This might be faster than binary search.My implementation in C++:
double NewtonMethod(double x) {
double eps = 0.0001; //the precision
double x0 = 10;
while( fabs(x-x0) > eps) {
double a = x0*x0-n;
double r = a/(2*x0);
x = x0 - r;
x0 = x;
}
return x;
}
Since people are showing different approaches to calculating the square root, I couldn't resist ;)...
Below is the exact copy (with the original comments, but without preprocessor directives) of the inverse square root implementation from Quake III Arena:
float Q_rsqrt( float number )
{
long i;
float x2, y;
const float threehalfs = 1.5F;
x2 = number * 0.5F;
y = number;
i = * ( long * ) &y; // evil floating point bit level hacking
i = 0x5f3759df - ( i >> 1 ); // what the...?
y = * ( float * ) &i;
y = y * ( threehalfs - ( x2 * y * y ) ); // 1st iteration
// y = y * ( threehalfs - ( x2 * y * y ) ); // 2nd iteration, this can be removed
return y;
}