I searched around and couldn't find the trunc function for C++. I know I can do this:
#include <iostream>
#include <iomanip>
using namespace std;

int main()
{
    double a = 12.566789;
    cout << setprecision(2) << fixed << (int)(a * 100) / 100.0 << endl;
    return 0;
}
but I'm not sure it's the best way to do this. Thank you.
If your C library is so old that it lacks a trunc function (specified in C99), you can easily implement one based on floor and ceil (specified in C89):
double trunc(double d) { return (d > 0) ? floor(d) : ceil(d); }
trunc is there, in <cmath>:
#include <iostream>
#include <cmath>
int main() {
    std::cout << std::trunc(3.141516) << std::endl;
}
I suppose you're looking for something else?
There's a trunc function in C that you can use in C++:
trunc(a*100)/100
Keep in mind that you still have to specify formatting requests, because floating point can't represent all real numbers exactly, and you could get output like 12.5600000001 or 12.55999999 if you don't tell the output code the precision you want.
TL;DR
Use the following for output:
cout << setprecision(2) << fixed << a<< endl;
And the following if you need a truncated result somewhere during a mathematical calculation:
trunc(a*100)/100
(Or better yet, use fixed-point math.)
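As an illustration of that fixed-point suggestion (my own sketch, not part of the answer above, and assuming a non-negative value): store the amount as integer hundredths, so truncating to two decimals is exact.
#include <cstdint>
#include <iomanip>
#include <iostream>

int main()
{
    double a = 12.566789;
    // store hundredths as an integer; the cast truncates toward zero
    std::int64_t cents = static_cast<std::int64_t>(a * 100); // 1256
    std::cout << cents / 100 << '.'
              << std::setw(2) << std::setfill('0') << cents % 100 << '\n'; // prints 12.56
    return 0;
}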
Sure. Use the trunc() function from math.h. It's a C function, but it works as well in C++ as it does in C. If you want to keep a couple digits, you can always:
double a = 12.566789;
double b = trunc(a * 100) / 100.0;
If you're using an ancient C or C++ library that doesn't implement trunc, use boost::math::trunc.
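A short sketch of that Boost fallback (my addition; it assumes Boost.Math's <boost/math/special_functions/trunc.hpp> header is available):
#include <boost/math/special_functions/trunc.hpp>
#include <iostream>

int main()
{
    double a = 12.566789;
    double b = boost::math::trunc(a * 100) / 100.0; // keeps two decimal digits: 12.56
    std::cout << b << '\n';
    return 0;
}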
I've developed a very fast trunc-function:
double ftrunc( double d )
{
static_assert(sizeof(double) == sizeof(uint64_t), "sizeof(double) not equal to sizeof(uint64_t)");
static_assert(numeric_limits<double>::is_iec559, "double must be IEEE-754");
// assume size_t is our CPU's native register-width
static_assert(sizeof(size_t) == sizeof(uint64_t) || sizeof(size_t) == sizeof(uint32_t), "register-width must be 32 or 64 bit");
if constexpr( sizeof(size_t) == sizeof(uint64_t) )
// we have 64 bit registers
{
unsigned const MANTISSA_BITS = 52,
EXP_BIAS = 0x3FF,
INF_NAN_BASE = 0x7FF;
uint64_t const EXP_MASK = (uint64_t)0x7FF << MANTISSA_BITS,
SIGN_MASK = (uint64_t)0x800 << MANTISSA_BITS ,
MIN_INTEGRAL_DIGITS_EXP = (uint64_t) EXP_BIAS << MANTISSA_BITS,
MIN_INTEGRAL_ONLY_EXP = (uint64_t)(EXP_BIAS + MANTISSA_BITS) << MANTISSA_BITS,
INF_NAN_EXP = (uint64_t)INF_NAN_BASE << MANTISSA_BITS,
NEG_MANTISSA_MASK = 0x000FFFFFFFFFFFFFu;
union
{
double du;
uint64_t dx;
};
du = d;
uint64_t exp = dx & EXP_MASK;
if( exp >= MIN_INTEGRAL_DIGITS_EXP )
// value has integral digits
if( exp < MIN_INTEGRAL_ONLY_EXP )
{
// there are fraction-digits to mask out, mask them
unsigned shift = (unsigned)(exp >> MANTISSA_BITS) - EXP_BIAS;
dx &= ~(NEG_MANTISSA_MASK >> shift);
return du;
}
else
if( exp < INF_NAN_EXP )
// value is integral
return du;
else
// infinite, NaN, SNaN
// raise exception on SNaN if necessary
return du + du;
else
{
// below +/-1.0
// return +/-0.0
dx &= SIGN_MASK;
return du;
}
}
else if constexpr( sizeof(size_t) == sizeof(uint32_t) )
// we have 32 bit registers
{
unsigned const MANTISSA_BITS = 52,
HI_MANTISSA_BITS = 20,
EXP_BIAS = 0x3FF,
INF_NAN_BASE = 0x7FF;
uint32_t const EXP_MASK = (uint32_t)0x7FFu << HI_MANTISSA_BITS,
SIGN_MASK = (uint32_t)0x800u << HI_MANTISSA_BITS,
MIN_INTEGRAL_DIGITS_EXP = (uint32_t) EXP_BIAS << HI_MANTISSA_BITS,
MAX_INTEGRAL32_EXP = (uint32_t)(EXP_BIAS + HI_MANTISSA_BITS) << HI_MANTISSA_BITS,
MIN_INTEGRAL_ONLY_EXP = (uint32_t)(EXP_BIAS + MANTISSA_BITS) << HI_MANTISSA_BITS,
INF_NAN_EXP = (uint32_t)INF_NAN_BASE << HI_MANTISSA_BITS,
NEG_HI_MANTISSA_MASK = 0x000FFFFFu,
NEG_LO_MANTISSA_MASK = 0xFFFFFFFFu;
union
{
double du;
struct
{
uint32_t dxLo;
uint32_t dxHi;
};
};
du = d;
uint32_t exp = dxHi & EXP_MASK;
if( exp >= MIN_INTEGRAL_DIGITS_EXP )
// value has integral digits
if( exp < MIN_INTEGRAL_ONLY_EXP )
// there are fraction-digits to mask out
if( exp <= MAX_INTEGRAL32_EXP )
{
// the fraction digits are in the upper dword, mask them and zero the lower dword
unsigned shift = (unsigned)(exp >> HI_MANTISSA_BITS) - EXP_BIAS;
dxHi &= ~(NEG_HI_MANTISSA_MASK >> shift);
dxLo = 0;
return du;
}
else
{
// the fraction digits are in the lower dword, mask them
unsigned shift = (unsigned)(exp >> HI_MANTISSA_BITS) - EXP_BIAS - HI_MANTISSA_BITS;
dxLo &= ~(NEG_LO_MANTISSA_MASK >> shift);
return du;
}
else
if( exp < INF_NAN_EXP )
// value is integral
return du;
else
// infinite, NaN, SNaN
// raise exception on SNaN if necessary
return du + du;
else
{
// below +/-1.0
// return +/-0.0
dxHi &= SIGN_MASK;
dxLo = 0;
return du;
}
}
}
It's faster than most implementations. On my Ryzen 7 1800X the average execution time for values >= 2^0 and <= 2^54 is 12 clock cycles.
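As a quick sanity check (my addition, assuming ftrunc and the headers it needs are defined as above), the result can be compared against std::trunc for a few values:
#include <cassert>
#include <cmath>
#include <initializer_list>

int main()
{
    for (double v : { 0.0, -0.0, 0.5, -0.5, 3.7, -3.7, 1e18, -1e18 })
        assert(ftrunc(v) == std::trunc(v));
    return 0;
}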
Use ceil or floor from <cmath>.
I have an issue with my code below, which encodes a vector of long into a string by storing the differences of the sequence.
The encode/decode works fine as long as the value is at or below 2^30.
For any value above that, the logic fails. Note that sizeof(long) is 8 bytes.
static std::string encode(const std::vector<long>& path) {
long lastValue = 0L;
std::stringstream result;
for (long value : path) {
long delta = value - lastValue;
lastValue = value;
long var = 0;
// Shift the delta value left by 1 bit and encode each 5-bit chunk into a character
for (var = delta < 0 ? ~(delta << 1) : delta << 1; var >= 32L; var >>= 5) {
result << (char)((32L | var & 31L) + 63L); //char is getting written to result stringstream
}
// Encode the last 5-bit chunk into a character
result << (char)(var + 63L); // char is getting written to result stringstream
}
std::cout << std::endl;
return result.str();
}
static std::unique_ptr<std::vector<long>> decode(const std::string& encoded) {
auto decoded = std::make_unique<std::vector<long>>();
long last_val = 0;
int index = 0;
while (index < encoded.length()) {
int shift = 0;
long current = 1;
int c;
do {
c = encoded[index++] - 63 - 1;
current += c << shift;
shift += 5;
} while (c >= 31);
long v = ( (current & 1) == 0 ? current >> 1 : ~(current >> 1) );
last_val += v;
decoded->push_back(last_val);
}
return std::move(decoded);
}
Can someone please provide insight into what might be going wrong?
Inside the decode function, c needed to be declared as long rather than int.
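A minimal sketch of that fix (only the declaration of c changes; the rest of the loop is exactly as in the question):
long c; // was: int c
do {
    c = encoded[index++] - 63 - 1;
    // with int c, "c << shift" is evaluated in int and overflows once shift
    // grows past ~30, which is why deltas above 2^30 decoded incorrectly
    current += c << shift;
    shift += 5;
} while (c >= 31);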
How can I get the numerator and denominator from a fractional number? For example, from "1.375" I want to get "1375/1000" or "11/8" as a result. How can I do it in C++?
I have tried separating the digits before and after the decimal point, but that hasn't given me any idea how to get my desired output.
You didn't really specify whether you need to convert a floating point or a string to ratio, so I'm going to assume the former one.
Instead of trying string or arithmetic-based approaches, you can directly use properties of IEEE-754 encoding.
Floats (called binary32 by the standard) are encoded in memory like this:
S EEEEEEEE MMMMMMMMMMMMMMMMMMMMMMM
^ ^
bit 31 bit 0
where S is sign bit, Es are exponent bits (8 of them) Ms are mantissa bits (23 bits).
The number can be decoded like this:
value = (-1)^S * significand * 2^exponent
where:
significand = 1.MMMMMMMMMMMMMMMMMMMMMMM (as binary)
exponent = EEEEEEEE (as binary) - 127
(note: this is for so-called "normal numbers"; there are also zeroes, subnormals, infinities and NaNs - see the Wikipedia page I linked)
This can be used here. We can rewrite the equation above like this:
(-1)^S * significand * 2^exponent = (-1)^S * (significand * 2^23) * 2^(exponent - 23)
The point is that significand * 2^23 is an integer (it is 1.MMMMMMMMMMMMMMMMMMMMMMM in binary with the point moved 23 places to the right). 2^(exponent - 23) is a power of two, so it is an integer when exponent - 23 >= 0 and the reciprocal of an integer otherwise.
In other words: we can write the number as:
(significand * 2^23) / 2^(-(exponent - 23)) (when exponent - 23 < 0)
or
[(significand * 2^23) * 2^(exponent - 23)] / 1 (when exponent - 23 >= 0)
So we have both numerator and denominator - directly from binary representation of the number.
All of the above could be implemented like this in C++:
struct Ratio
{
int64_t numerator; // numerator includes sign
uint64_t denominator;
float toFloat() const
{
return static_cast<float>(numerator) / denominator;
}
static Ratio fromFloat(float v)
{
// First, obtain bitwise representation of the value
const uint32_t bitwiseRepr = *reinterpret_cast<uint32_t*>(&v);
// Extract sign, exponent and mantissa bits (as stored in memory) for convenience:
const uint32_t signBit = bitwiseRepr >> 31u;
const uint32_t expBits = (bitwiseRepr >> 23u) & 0xffu; // 8 bits set
const uint32_t mntsBits = bitwiseRepr & 0x7fffffu; // 23 bits set
// Handle some special cases:
if(expBits == 0 && mntsBits == 0)
{
// special case: +0 and -0
return {0, 1};
}
else if(expBits == 255u && mntsBits == 0)
{
// special case: +inf, -inf
// Let's agree that infinity is always represented as 1/0 in Ratio
return {signBit ? -1 : 1, 0};
}
else if(expBits == 255u)
{
// special case: nan
// Let's agree that if we get NaN, we return max int64_t over 0
return {std::numeric_limits<int64_t>::max(), 0};
}
// mask lowest 23 bits (mantissa)
uint32_t significand = (1u << 23u) | mntsBits;
const int64_t signFactor = signBit ? -1 : 1;
const int32_t exp = expBits - 127 - 23;
if(exp < 0)
{
return {signFactor * static_cast<int64_t>(significand), 1u << static_cast<uint32_t>(-exp)};
}
else
{
return {signFactor * static_cast<int64_t>(significand * (1u << static_cast<uint32_t>(exp))), 1};
}
}
};
(hopefully the comments and description above are understandable - let me know if there's something to improve)
I've omitted checks for out of range values for simplicity.
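The usage below streams a Ratio with operator<<, which isn't defined in the struct above; a minimal inserter (my addition, not part of the original answer) could look like this:
#include <ostream>

std::ostream& operator<<(std::ostream& os, const Ratio& r)
{
    // print as "numerator/denominator", e.g. 11534336/8388608
    return os << r.numerator << '/' << r.denominator;
}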
We can use it like this:
float fv = 1.375f;
Ratio rv = Ratio::fromFloat(fv);
std::cout << "fv = " << fv << ", rv = " << rv << ", rv.toFloat() = " << rv.toFloat() << "\n";
And the output is:
fv = 1.375, rv = 11534336/8388608, rv.toFloat() = 1.375
As you can see, exactly the same values on both ends.
The problem is that the numerators and denominators are big. This is because the code always multiplies the significand by 2^23, even when a smaller factor would be enough to make it an integer (this is equivalent to writing 0.2 as 2000000/10000000 instead of 2/10 - it's the same value, only written differently).
This can be solved by dividing the significand by its trailing power of two (and adjusting the exponent accordingly), like this (the ellipsis stands for the parts which are the same as above):
// counts number of subsequent least significant bits equal to 0
// example: for 1001000 (binary) returns 3
uint32_t countTrailingZeroes(uint32_t v)
{
uint32_t counter = 0;
while(counter < 32 && (v & 1u) == 0)
{
v >>= 1u;
++counter;
}
return counter;
}
struct Ratio
{
...
static Ratio fromFloat(float v)
{
...
uint32_t significand = (1u << 23u) | mntsBits;
const uint32_t nTrailingZeroes = countTrailingZeroes(significand);
significand >>= nTrailingZeroes;
const int64_t signFactor = signBit ? -1 : 1;
const int32_t exp = expBits - 127 - 23 + nTrailingZeroes;
if(exp < 0)
{
return {signFactor * static_cast<int64_t>(significand), 1u << static_cast<uint32_t>(-exp)};
}
else
{
return {signFactor * static_cast<int64_t>(significand * (1u << static_cast<uint32_t>(exp))), 1};
}
}
};
And now, for the following code:
float fv = 1.375f;
Ratio rv = Ratio::fromFloat(fv);
std::cout << "fv = " << fv << ", rv = " << rv << ", rv.toFloat() = " << rv.toFloat() << "\n";
We get:
fv = 1.375, rv = 11/8, rv.toFloat() = 1.375
In C++ you can use the Boost rational class, but you need to supply the numerator and denominator.
For this you need to find the number of digits in the input string after the decimal point. You can do this with string manipulation functions: read the input character by character and count the characters after the '.'.
char inputstr[30];
int noint=0, nodec=0;
char intstr[30], dec[30];
int decimalfound = 0;
int denominator = 1;
int numerator;
scanf("%29s", inputstr);
int len = strlen(inputstr);
for (int i=0; i<len; i++)
{
if (decimalfound ==0)
{
if (inputstr[i] == '.')
{
decimalfound = 1;
}
else
{
intstr[noint++] = inputstr[i];
}
}
else
{
dec[nodec++] = inputstr[i];
denominator *=10;
}
}
dec[nodec] = '\0';
intstr[noint] = '\0';
numerator = atoi(dec) + (atoi(intstr) * denominator);
// You can now use the numerator and denominator as the fraction,
// either in the Rational class or you can find gcd and divide by
// gcd.
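Following that comment, a short sketch of feeding the result into Boost.Rational (my addition; it assumes <boost/rational.hpp>, whose rational class reduces the fraction to lowest terms and can be streamed directly):
#include <boost/rational.hpp>
#include <iostream>

int main()
{
    int numerator = 1375, denominator = 1000;       // as computed above for "1.375"
    boost::rational<int> r(numerator, denominator); // stored in lowest terms: 11/8
    std::cout << r << '\n';                         // prints 11/8
    return 0;
}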
What about this simple code:
#include <cmath> // for std::fabs

double n = 1.375;
int num = 1, den = 1;
double frac = num * 1.0 / den;
double margin = 0.000001;
while (std::fabs(frac - n) > margin){
if (frac > n){
den++;
}
else{
num++;
}
frac = num * 1.0 / den;
}
I haven't really tested it much; it's only an idea.
I hope I'll be forgiven for posting an answer which uses "only the C language". I know you tagged the question with C++ - but I couldn't turn down the bait, sorry. This is still valid C++ at least (although it does, admittedly, use mainly C string-processing techniques).
int num_string_float_to_rat(char *input, long *num, long *den) {
char *tok = NULL, *end = NULL;
char buf[128] = {'\0'};
long a = 0, b = 0;
int den_power = 1;
strncpy(buf, input, sizeof(buf) - 1);
tok = strtok(buf, ".");
if (!tok) return 1;
a = strtol(tok, &end, 10);
if (*end != '\0') return 2;
tok = strtok(NULL, ".");
if (!tok) return 1;
den_power = strlen(tok); // Denominator power of 10
b = strtol(tok, &end, 10);
if (*end != '\0') return 2;
*den = static_cast<long>(pow(10.00, den_power));
*num = a * *den + b;
num_simple_fraction(num, den);
return 0;
}
Sample usage:
int rc = num_string_float_to_rat("0015.0235", &num, &den);
// Check return code -> should be 0!
printf("%ld/%ld\n", num, den);
Output:
30047/2000
Full example at http://codepad.org/CFQQEZkc .
Notes:
strtok() is used to parse the input in to tokens (no need to reinvent the wheel in that regard). strtok() modifies its input - so a temporary buffer is used for safety
it checks for invalid characters - and will return a non-zero return code if found
strtol() has been used instead of atoi() - as it can detect non-numeric characters in the input
scanf() has not been used to slurp the input - due to rounding issues with floating point numbers
the base for strtol() has been explicitly set to 10 to avoid problems with leading zeros (otherwise a leading zero will cause the number to be interpreted as octal)
it uses a num_simple_fraction() helper (not shown) - which in turn uses a gcd() helper (also not shown) - to convert the result to a simple fraction; a possible sketch of those helpers follows these notes
log10() of the numerator is determined by calculating the length of the token after the decimal point
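For completeness, a possible sketch of those two helpers (my own illustration; the original author's implementations are not shown):
// reduce *num / *den to lowest terms
static long gcd(long a, long b)
{
    while (b != 0) { long t = a % b; a = b; b = t; }
    return a < 0 ? -a : a;
}

static void num_simple_fraction(long *num, long *den)
{
    long g = gcd(*num, *den);
    if (g > 1) { *num /= g; *den /= g; }
}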
I'd do this in three steps.
1) find the decimal point, so that you know how large the denominator has to be.
2) get the numerator. That's just the original text with the decimal point removed.
3) get the denominator. If there was no decimal point, the denominator is 1. Otherwise, the denominator is 10^n, where n is the number of digits to the right of the (now-removed) decimal point.
struct fraction {
std::string num, den;
};
fraction parse(std::string input) {
    // 1:
    std::size_t dec_point = input.find('.');
    // 2:
    std::size_t digits_after = 0;
    if (dec_point != std::string::npos) {
        digits_after = input.length() - dec_point - 1;
        input.erase(input.begin() + dec_point); // remove the decimal point
    }
    // 3:
    int denom = 1;
    for (std::size_t i = 0; i < digits_after; ++i)
        denom *= 10;
    fraction result = { input, std::to_string(denom) };
    return result;
}
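A quick usage sketch (my addition) to show the expected result:
#include <iostream>

int main() {
    fraction f = parse("1.375");
    std::cout << f.num << '/' << f.den << '\n'; // prints 1375/1000
    return 0;
}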
I am stuck on an assignment for my modern numerical software development class.
Function prototype (assume x = 6.5):
//returns the IEEE fractional part of x as a decimal floating point number. You must convert binary to decimal.
inline double fraction(double x) {}
What I got:
inline double fraction(double x)
{
// Get the fraction
unsigned long long frac_mask = (1u << 52) - 1; // Get 52 1's
unsigned long long xint = *reinterpret_cast<long long*>(&x); // Interpret x's bits as an int
unsigned long long frac_num = xint & frac_mask; // Get the fraction as an int
double fraction = double(frac_num) / double(2u << 52); // Divide frac_num by 2^52
return fraction;
/* This code works, but is not what is specified:
double fraction = x / pow(2, exponent(x));
fraction = fmod(fraction, 1);
return fraction;
*/
}
I keep getting a NaN. The answer I am looking for is 0.625. I am kind of hopelessly lost. Any help is much appreciated.
I was able to successfully isolate the exponent of the double with the following function:
inline int exponent(double x) //returns the unbiased(true) binary exponent of x as a decimal integer. Remember that subnormals are a special case. Consider 0 to be a subnormal.
{
if (x == 0.0)
return -1022;
else if (isnan(x))
return 1024;
// Get the exponent
unsigned long long exp_mask = (1u << 11) - 1; // Get eleven 1's
exp_mask <<= 52; // Move into place
unsigned long long xint = *reinterpret_cast<long long*>(&x); // Interpret x's bits as an int
unsigned long long exp_bits = xint & exp_mask; // Get the exponent bits
unsigned long long exp = exp_bits >> 52; // Get the exponent as a number
return exp -1023;
}
I am confused why the exponent logic works, but the fraction won't.
You are mixing unsigned (presumably 32 bits) with values that need 64 bits.
For example, frac_num is only 32 bits; use a long or long long [or uint64_t, which is a more reliable way to get a 64-bit value].
inline double fraction(double x)
{
// Get the fraction
uint64_t frac_mask = (1ul << 52) - 1; // Get 52 1's
// uint64_t xint = *reinterpret_cast<uint64_t*>(&x); // Interpret x's bits as an int
uint64_t xint;
memcpy(&xint, &x, sizeof(xint)); // Interpret x's bits as an int
int64_t frac_num = xint & frac_mask; // Get the fraction as an int
frac_num += 1ul << 52; // Add hidden bit.
double fraction = double(frac_num) / double(2ul << 52); // Divide frac_num by 2^52
return fraction;
}
Note the addition of l to the 1u and 2u, to ensure they are long. You will need to include <cstdint> to get the sized integers.
Edit: that will of course just give you the mantissa in the form of a fraction. The binary point may be anywhere between exponent +1023 and -1023, meaning that only values between -1 and +1 will give the correct result.
A complete example using the code above [+ some printouts]
#include <cstdint>
#include <iostream>
#include <cstring>
inline double fraction(double x)
{
// Get the fraction
uint64_t frac_mask = (1ul << 52) - 1; // Get 52 1's
std::cout << "mask=" << std::hex << frac_mask << std::endl;
// uint64_t xint = *reinterpret_cast<uint64_t*>(&x); // Interpret x's bits as an int
uint64_t xint;
memcpy(&xint, &x, sizeof(xint)); // Interpret x's bits as an int
int64_t frac_num = xint & frac_mask; // Get the fraction as an int
frac_num += 1ul << 52; // Add hidden bit.
std::cout << "xint=" << std::hex << xint << " num=" << std::hex << frac_num << std::endl;
double fraction = double(frac_num) / double(2ul << 52); // Divide frac_num by 2^52
return fraction;
}
int main()
{
double a = 0.5;
double b = 0.75;
double d = 6.5;
double e = 4.711;
double fa = fraction(a);
double fb = fraction(b);
double fd = fraction(d);
double fe = fraction(e);
std::cout << "fa=" << std::fixed << fa << " fb=" << fb << std::endl;
std::cout << "fd=" << std::fixed << fd << " fe=" << fe << std::endl;
}
Result of running the above:
mask=fffffffffffff
xint=3fe0000000000000 num=10000000000000
mask=fffffffffffff
xint=3fe8000000000000 num=18000000000000
mask=fffffffffffff
xint=401a000000000000 num=1a000000000000
mask=fffffffffffff
xint=4012d810624dd2f2 num=12d810624dd2f2
fa=0.500000 fb=0.750000
fd=0.812500 fe=0.588875
Note that if you divide 4.711 by 2 a few times [3 times to be precise], you get 0.588875, and if you divide 6.5 by 8 (or by 2 three times over), you get 0.8125
I need to go to bed, but you basically have to take the exponent into account to figure out the fraction of a floating point number. Or simply convert to an integer, and subtract it - as long as it's within range.
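A tiny sketch of that convert-and-subtract idea (my addition; it is only valid while the value fits in the integer type used):
double frac_by_subtract(double x)
{
    // convert to an integer and subtract; only valid while x fits in long long
    return x - static_cast<double>(static_cast<long long>(x)); // 6.5 -> 0.5
}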
Code: Try It Online
// bit_cast, bit_width
#include <bit>
// assert
#include <cassert>
// uint64_t
#include <cstdint>
[[nodiscard]]
constexpr auto Frac(double x)
noexcept -> double
{
using Bits = std::uint64_t;
constexpr Bits s_sign_bit_count{ 1ull };
constexpr Bits s_exponent_bit_count{ 11ull };
constexpr Bits s_mantissa_bit_count{ 52ull };
constexpr Bits s_sign_max{ (1ull << s_sign_bit_count) - 1ull };
constexpr Bits s_exponent_max{ (1ull << s_exponent_bit_count) - 1ull };
constexpr Bits s_mantissa_max{ (1ull << s_mantissa_bit_count) - 1ull };
constexpr Bits s_sign_mask{ s_sign_max << (s_exponent_bit_count + s_mantissa_bit_count) };
constexpr Bits s_exponent_mask{ s_exponent_max << s_mantissa_bit_count };
constexpr Bits s_mantissa_mask{ s_mantissa_max };
constexpr Bits s_exponent_bias{ (1ull << (s_exponent_bit_count - 1ull)) - 1ull };
if ((-1.0 < x) and (x < 1.0))
{
// Includes: subnormal, +0, -0
// No integral part.
return x;
}
const Bits u = std::bit_cast< Bits >(x);
const Bits exponent_bits = (u & s_exponent_mask) >> s_mantissa_bit_count;
assert(s_exponent_bias <= exponent_bits);
const Bits exponent = exponent_bits - s_exponent_bias;
if (s_mantissa_bit_count <= exponent)
{
// Includes: +Inf, -Inf, NaN
// No fractional part.
return {};
}
const Bits fraction_bit_count = s_mantissa_bit_count - exponent;
const Bits fraction_mask = (1ull << fraction_bit_count) - 1ull;
const Bits fraction_bits = u & fraction_mask;
const Bits fraction_shift = s_mantissa_bit_count - std::bit_width(fraction_bits)
+ 1ull; // Implicit leading one
Bits fraction = u & s_sign_mask;
if (fraction_shift < exponent_bits)
{
const Bits fraction_exponent = exponent_bits - fraction_shift;
const Bits fraction_mantissa = (fraction_bits << fraction_shift)
// Remove implicit leading one.
& s_mantissa_mask;
fraction |= (fraction_exponent << s_mantissa_bit_count);
fraction |= fraction_mantissa;
}
return std::bit_cast< double >(fraction);
}
Depending on your own preference, you can return x as well in case of a NaN.
Test:
// setprecision
#include <iomanip>
// cout, endl, fixed
#include <iostream>
auto main() -> int
{
std::cout << std::fixed << std::setprecision(11);
{
constexpr double d = 7.99999952316;
constexpr double frac = Frac(d);
std::cout << "frac(" << d << ") = " << frac << std::endl;
}
{
constexpr double d = 0.5;
constexpr double frac = Frac(d);
std::cout << "frac(" << d << ") = " << frac << std::endl;
}
{
constexpr double d = 0.75;
constexpr double frac = Frac(d);
std::cout << "frac(" << d << ") = " << frac << std::endl;
}
{
constexpr double d = 6.5;
constexpr double frac = Frac(d);
std::cout << "frac(" << d << ") = " << frac << std::endl;
}
{
constexpr double d = 4.711;
constexpr double frac = Frac(d);
std::cout << "frac(" << d << ") = " << frac << std::endl;
}
return 0;
}
Output
frac(7.99999952316) = 0.99999952316
frac(0.50000000000) = 0.50000000000
frac(0.75000000000) = 0.75000000000
frac(6.50000000000) = 0.50000000000
frac(4.71100000000) = 0.71100000000
I am using Anthony Williams' fixed point library described in the Dr Dobb's article "Optimizing Math-Intensive Applications with Fixed-Point Arithmetic" to calculate the distance between two geographical points using the Rhumb Line method.
This works well enough when the distance between the points is significant (greater than a few kilometres), but is very poor at smaller distances. The worst case is when the two points are equal or nearly equal: the result is a distance of 194 metres, while I need precision of at least 1 metre at distances >= 1 metre.
By comparison with a double precision floating-point implementation, I have located the problem to the fixed::sqrt() function, which performs poorly at small values:
x std::sqrt(x) fixed::sqrt(x) error
----------------------------------------------------
0 0 3.05176e-005 3.05176e-005
1e-005 0.00316228 0.00316334 1.06005e-006
2e-005 0.00447214 0.00447226 1.19752e-007
3e-005 0.00547723 0.0054779 6.72248e-007
4e-005 0.00632456 0.00632477 2.12746e-007
5e-005 0.00707107 0.0070715 4.27244e-007
6e-005 0.00774597 0.0077467 7.2978e-007
7e-005 0.0083666 0.00836658 1.54875e-008
8e-005 0.00894427 0.00894427 1.085e-009
Correcting the result for fixed::sqrt(0) is trivial by treating it as a special case, but that will not solve the problem for small non-zero distances, where the error starts at 194 metres and converges toward zero with increasing distance. I probably need at least an order of magnitude improvement in precision toward zero.
The fixed::sqrt() algorithm is briefly explained on page 4 of the article linked above, but I am struggling to follow it, let alone determine whether it is possible to improve it. The code for the function is reproduced below:
fixed fixed::sqrt() const
{
unsigned const max_shift=62;
uint64_t a_squared=1LL<<max_shift;
unsigned b_shift=(max_shift+fixed_resolution_shift)/2;
uint64_t a=1LL<<b_shift;
uint64_t x=m_nVal;
while(b_shift && a_squared>x)
{
a>>=1;
a_squared>>=2;
--b_shift;
}
uint64_t remainder=x-a_squared;
--b_shift;
while(remainder && b_shift)
{
uint64_t b_squared=1LL<<(2*b_shift-fixed_resolution_shift);
int const two_a_b_shift=b_shift+1-fixed_resolution_shift;
uint64_t two_a_b=(two_a_b_shift>0)?(a<<two_a_b_shift):(a>>-two_a_b_shift);
while(b_shift && remainder<(b_squared+two_a_b))
{
b_squared>>=2;
two_a_b>>=1;
--b_shift;
}
uint64_t const delta=b_squared+two_a_b;
if((2*remainder)>delta)
{
a+=(1LL<<b_shift);
remainder-=delta;
if(b_shift)
{
--b_shift;
}
}
}
return fixed(internal(),a);
}
Note that m_nVal is the internal fixed-point representation value; it is an int64_t and the representation uses Q36.28 format (fixed_resolution_shift = 28). The representation itself has enough precision for at least 8 decimal places, and as a fraction of equatorial arc is good for distances of around 0.14 metres, so the limitation is not the fixed-point representation.
Use of the rhumb line method is a standards body recommendation for this application so cannot be changed, and in any case a more accurate square-root function is likely to be required elsewhere in the application or in future applications.
Question: Is it possible to improve the accuracy of the fixed::sqrt() algorithm for small non-zero values while still maintaining its bounded and deterministic convergence?
Additional Information
The test code used to generate the table above:
#include <cmath>
#include <iostream>
#include "fixed.hpp"
int main()
{
double error = 1.0 ;
for( double x = 0.0; error > 1e-8; x += 1e-5 )
{
double fixed_root = sqrt(fixed(x)).as_double() ;
double std_root = std::sqrt(x) ;
error = std::fabs(fixed_root - std_root) ;
std::cout << x << '\t' << std_root << '\t' << fixed_root << '\t' << error << std::endl ;
}
}
Conclusion
In the light of Justin Peel's solution and analysis, and comparison with the algorithm in "The Neglected Art of Fixed Point Arithmetic", I have adapted the latter as follows:
fixed fixed::sqrt() const
{
uint64_t a = 0 ; // root accumulator
uint64_t remHi = 0 ; // high part of partial remainder
uint64_t remLo = m_nVal ; // low part of partial remainder
uint64_t testDiv ;
int count = 31 + (fixed_resolution_shift >> 1); // Loop counter
do
{
// get 2 bits of arg
remHi = (remHi << 2) | (remLo >> 62); remLo <<= 2 ;
// Get ready for the next bit in the root
a <<= 1;
// Test radical
testDiv = (a << 1) + 1;
if (remHi >= testDiv)
{
remHi -= testDiv;
a += 1;
}
} while (count-- != 0);
return fixed(internal(),a);
}
While this gives far greater precision, the improvement I needed is not achieved. The Q36.28 format alone just about provides the precision I need, but it is not possible to perform a sqrt() without losing a few bits of precision. However, some lateral thinking provides a better solution: my application tests the calculated distance against a distance limit. The rather obvious solution in hindsight is to test the square of the distance against the square of the limit!
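For illustration, a minimal sketch of that comparison (the names are hypothetical; it only assumes the fixed type supports multiplication and comparison, and that the squared values stay within Q36.28 range):
// compare squared distance against squared limit - no sqrt needed
bool within_limit(fixed dx, fixed dy, fixed limit)
{
    fixed dist_squared  = dx * dx + dy * dy;
    fixed limit_squared = limit * limit;
    return dist_squared <= limit_squared;
}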
Given that sqrt(ab) = sqrt(a)sqrt(b), can't you just trap the case where your number is small, shift it up by a given (even) number of bits, compute the root, and shift that back down by half the number of bits to get the result?
I.e.
sqrt(n) = sqrt(n.2^k)/sqrt(2^k)
= sqrt(n.2^k).2^(-k/2)
E.g. Choose k = 28 for any n less than 2^8.
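A rough sketch of that idea (my addition, with hypothetical names; it assumes the fixed type can be constructed from a number and supports multiplication and division):
// scale small inputs up by 2^k (k even), take the root, scale back down by 2^(k/2)
fixed sqrt_small(fixed n)
{
    const int k = 28;                    // even shift; n * 2^k must stay within Q36.28 range
    fixed scaled = n * fixed(1LL << k);  // n * 2^k
    fixed root   = scaled.sqrt();        // sqrt(n * 2^k) = sqrt(n) * 2^(k/2)
    return root / fixed(1LL << (k / 2)); // undo the scaling
}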
The original implementation obviously has some problems. I became frustrated with trying to fix them all with the way the code is currently done and ended up going at it with a different approach. I could probably fix the original now, but I like my way better anyway.
I treat the input number as being in Q64 to start, which is the same as shifting by 28 and then shifting back by 14 afterwards (the sqrt halves it). However, if you just do that, then the accuracy is limited to 1/2^14 = 6.1035e-5 because the last 14 bits will be 0. To remedy this, I then shift a and remainder correctly, and to keep filling in digits I do the loop again. The code can be made more efficient and cleaner, but I'll leave that to someone else. The accuracy shown below is pretty much as good as you can get with Q36.28. If you compare the fixed-point sqrt with the floating-point sqrt of the input number after it has been truncated by fixed point (convert it to fixed point and back), then the errors are around 2e-9 (I didn't do this in the code below, but it requires a one-line change). This is right in line with the best accuracy for Q36.28, which is 1/2^28 = 3.7529e-9.
By the way, one big mistake in the original code is that the term where m = 0 is never considered so that bit can never be set. Anyway, here is the code. Enjoy!
#include <iostream>
#include <cmath>
#include <cstdint> // uint64_t; "unsigned long" is not 64 bits on every platform
uint64_t sqrt(uint64_t in_val)
{
const uint64_t fixed_resolution_shift = 28;
const unsigned max_shift=62;
uint64_t a_squared=1ULL<<max_shift;
unsigned b_shift=(max_shift>>1) + 1;
uint64_t a=1ULL<<(b_shift - 1);
uint64_t x=in_val;
while(b_shift && a_squared>x)
{
a>>=1;
a_squared>>=2;
--b_shift;
}
uint64_t remainder=x-a_squared;
--b_shift;
while(remainder && b_shift)
{
uint64_t b_squared=1ULL<<(2*(b_shift - 1));
uint64_t two_a_b=(a<<b_shift);
while(b_shift && remainder<(b_squared+two_a_b))
{
b_squared>>=2;
two_a_b>>=1;
--b_shift;
}
uint64_t const delta=b_squared+two_a_b;
if((remainder)>=delta && b_shift)
{
a+=(1ULL<<(b_shift - 1));
remainder-=delta;
--b_shift;
}
}
a <<= (fixed_resolution_shift/2);
b_shift = (fixed_resolution_shift/2) + 1;
remainder <<= (fixed_resolution_shift);
while(remainder && b_shift)
{
uint64_t b_squared=1ULL<<(2*(b_shift - 1));
uint64_t two_a_b=(a<<b_shift);
while(b_shift && remainder<(b_squared+two_a_b))
{
b_squared>>=2;
two_a_b>>=1;
--b_shift;
}
uint64_t const delta=b_squared+two_a_b;
if((remainder)>=delta && b_shift)
{
a+=(1ULL<<(b_shift - 1));
remainder-=delta;
--b_shift;
}
}
return a;
}
double fixed2float(uint64_t x)
{
return static_cast<double>(x) * pow(2.0, -28.0);
}
uint64_t float2fixed(double f)
{
return static_cast<uint64_t>(f * pow(2, 28.0));
}
void finderror(double num)
{
double root1 = fixed2float(sqrt(float2fixed(num)));
double root2 = pow(num, 0.5);
std::cout << "input: " << num << ", fixed sqrt: " << root1 << " " << ", float sqrt: " << root2 << ", finderror: " << root2 - root1 << std::endl;
}
int main()
{
finderror(0);
finderror(1e-5);
finderror(2e-5);
finderror(3e-5);
finderror(4e-5);
finderror(5e-5);
finderror(pow(2.0,1));
finderror(1ULL<<35);
}
with the output of the program being
input: 0, fixed sqrt: 0 , float sqrt: 0, finderror: 0
input: 1e-05, fixed sqrt: 0.00316207 , float sqrt: 0.00316228, finderror: 2.10277e-07
input: 2e-05, fixed sqrt: 0.00447184 , float sqrt: 0.00447214, finderror: 2.97481e-07
input: 3e-05, fixed sqrt: 0.0054772 , float sqrt: 0.00547723, finderror: 2.43815e-08
input: 4e-05, fixed sqrt: 0.00632443 , float sqrt: 0.00632456, finderror: 1.26255e-07
input: 5e-05, fixed sqrt: 0.00707086 , float sqrt: 0.00707107, finderror: 2.06055e-07
input: 2, fixed sqrt: 1.41421 , float sqrt: 1.41421, finderror: 1.85149e-09
input: 3.43597e+10, fixed sqrt: 185364 , float sqrt: 185364, finderror: 2.24099e-09
I'm not sure how you're getting the numbers from fixed::sqrt() shown in the table.
Here's what I do:
#include <stdio.h>
#include <math.h>
#define __int64 long long // gcc doesn't know __int64
typedef __int64 fixed;
#define FRACT 28
#define DBL2FIX(x) ((fixed)((double)(x) * (1LL << FRACT)))
#define FIX2DBL(x) ((double)(x) / (1LL << FRACT))
// De-++-ified code from
// http://www.justsoftwaresolutions.co.uk/news/optimizing-applications-with-fixed-point-arithmetic.html
fixed sqrtfix0(fixed num)
{
static unsigned const fixed_resolution_shift=FRACT;
unsigned const max_shift=62;
unsigned __int64 a_squared=1LL<<max_shift;
unsigned b_shift=(max_shift+fixed_resolution_shift)/2;
unsigned __int64 a=1LL<<b_shift;
unsigned __int64 x=num;
unsigned __int64 remainder;
while(b_shift && a_squared>x)
{
a>>=1;
a_squared>>=2;
--b_shift;
}
remainder=x-a_squared;
--b_shift;
while(remainder && b_shift)
{
unsigned __int64 b_squared=1LL<<(2*b_shift-fixed_resolution_shift);
int const two_a_b_shift=b_shift+1-fixed_resolution_shift;
unsigned __int64 two_a_b=(two_a_b_shift>0)?(a<<two_a_b_shift):(a>>-two_a_b_shift);
unsigned __int64 delta;
while(b_shift && remainder<(b_squared+two_a_b))
{
b_squared>>=2;
two_a_b>>=1;
--b_shift;
}
delta=b_squared+two_a_b;
if((2*remainder)>delta)
{
a+=(1LL<<b_shift);
remainder-=delta;
if(b_shift)
{
--b_shift;
}
}
}
return (fixed)a;
}
// Adapted code from
// http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Digit-by-digit_calculation
fixed sqrtfix1(fixed num)
{
fixed res = 0;
fixed bit = (fixed)1 << 62; // The second-to-top bit is set
int s = 0;
// Scale num up to get more significant digits
while (num && num < bit)
{
num <<= 1;
s++;
}
if (s & 1)
{
num >>= 1;
s--;
}
s = 14 - (s >> 1);
while (bit != 0)
{
if (num >= res + bit)
{
num -= res + bit;
res = (res >> 1) + bit;
}
else
{
res >>= 1;
}
bit >>= 2;
}
if (s >= 0) res <<= s;
else res >>= -s;
return res;
}
int main(void)
{
double testData[] =
{
0,
1e-005,
2e-005,
3e-005,
4e-005,
5e-005,
6e-005,
7e-005,
8e-005,
};
int i;
for (i = 0; i < sizeof(testData) / sizeof(testData[0]); i++)
{
double x = testData[i];
fixed xf = DBL2FIX(x);
fixed sqf0 = sqrtfix0(xf);
fixed sqf1 = sqrtfix1(xf);
double sq0 = FIX2DBL(sqf0);
double sq1 = FIX2DBL(sqf1);
printf("%10.8f: "
"sqrtfix0()=%10.8f / err=%e "
"sqrt()=%10.8f "
"sqrtfix1()=%10.8f / err=%e\n",
x,
sq0, fabs(sq0 - sqrt(x)),
sqrt(x),
sq1, fabs(sq1 - sqrt(x)));
}
printf("sizeof(double)=%d\n", (int)sizeof(double));
return 0;
}
And here's what I get (with gcc and Open Watcom):
0.00000000: sqrtfix0()=0.00003052 / err=3.051758e-05 sqrt()=0.00000000 sqrtfix1()=0.00000000 / err=0.000000e+00
0.00001000: sqrtfix0()=0.00311279 / err=4.948469e-05 sqrt()=0.00316228 sqrtfix1()=0.00316207 / err=2.102766e-07
0.00002000: sqrtfix0()=0.00445557 / err=1.656955e-05 sqrt()=0.00447214 sqrtfix1()=0.00447184 / err=2.974807e-07
0.00003000: sqrtfix0()=0.00543213 / err=4.509667e-05 sqrt()=0.00547723 sqrtfix1()=0.00547720 / err=2.438148e-08
0.00004000: sqrtfix0()=0.00628662 / err=3.793423e-05 sqrt()=0.00632456 sqrtfix1()=0.00632443 / err=1.262553e-07
0.00005000: sqrtfix0()=0.00701904 / err=5.202484e-05 sqrt()=0.00707107 sqrtfix1()=0.00707086 / err=2.060551e-07
0.00006000: sqrtfix0()=0.00772095 / err=2.501943e-05 sqrt()=0.00774597 sqrtfix1()=0.00774593 / err=3.390476e-08
0.00007000: sqrtfix0()=0.00836182 / err=4.783859e-06 sqrt()=0.00836660 sqrtfix1()=0.00836649 / err=1.086198e-07
0.00008000: sqrtfix0()=0.00894165 / err=2.621519e-06 sqrt()=0.00894427 sqrtfix1()=0.00894409 / err=1.777289e-07
sizeof(double)=8
EDIT:
I've missed the fact that the above sqrtfix1() won't work well with large arguments. It can be fixed by appending 28 zeroes to the argument and essentially calculating the exact integer square root of that. This comes at the expense of doing internal calculations in 128-bit arithmetic, but it's pretty straightforward:
fixed sqrtfix2(fixed num)
{
unsigned __int64 numl, numh;
unsigned __int64 resl = 0, resh = 0;
unsigned __int64 bitl = 0, bith = (unsigned __int64)1 << 26;
numl = num << 28;
numh = num >> (64 - 28);
while (bitl | bith)
{
unsigned __int64 tmpl = resl + bitl;
unsigned __int64 tmph = resh + bith + (tmpl < resl);
tmph = numh - tmph - (numl < tmpl);
tmpl = numl - tmpl;
if (tmph & 0x8000000000000000ULL)
{
resl >>= 1;
if (resh & 1) resl |= 0x8000000000000000ULL;
resh >>= 1;
}
else
{
numl = tmpl;
numh = tmph;
resl >>= 1;
if (resh & 1) resl |= 0x8000000000000000ULL;
resh >>= 1;
resh += bith + (resl + bitl < resl);
resl += bitl;
}
bitl >>= 2;
if (bith & 1) bitl |= 0x4000000000000000ULL;
if (bith & 2) bitl |= 0x8000000000000000ULL;
bith >>= 2;
}
return resl;
}
And it gives pretty much the same results as this answer (slightly better for 3.43597e+10):
0.00000000: sqrtfix0()=0.00003052 / err=3.051758e-05 sqrt()=0.00000000 sqrtfix2()=0.00000000 / err=0.000000e+00
0.00001000: sqrtfix0()=0.00311279 / err=4.948469e-05 sqrt()=0.00316228 sqrtfix2()=0.00316207 / err=2.102766e-07
0.00002000: sqrtfix0()=0.00445557 / err=1.656955e-05 sqrt()=0.00447214 sqrtfix2()=0.00447184 / err=2.974807e-07
0.00003000: sqrtfix0()=0.00543213 / err=4.509667e-05 sqrt()=0.00547723 sqrtfix2()=0.00547720 / err=2.438148e-08
0.00004000: sqrtfix0()=0.00628662 / err=3.793423e-05 sqrt()=0.00632456 sqrtfix2()=0.00632443 / err=1.262553e-07
0.00005000: sqrtfix0()=0.00701904 / err=5.202484e-05 sqrt()=0.00707107 sqrtfix2()=0.00707086 / err=2.060551e-07
0.00006000: sqrtfix0()=0.00772095 / err=2.501943e-05 sqrt()=0.00774597 sqrtfix2()=0.00774593 / err=3.390476e-08
0.00007000: sqrtfix0()=0.00836182 / err=4.783859e-06 sqrt()=0.00836660 sqrtfix2()=0.00836649 / err=1.086198e-07
0.00008000: sqrtfix0()=0.00894165 / err=2.621519e-06 sqrt()=0.00894427 sqrtfix2()=0.00894409 / err=1.777289e-07
2.00000000: sqrtfix0()=1.41419983 / err=1.373327e-05 sqrt()=1.41421356 sqrtfix2()=1.41421356 / err=1.851493e-09
34359700000.00000000: sqrtfix0()=185363.69654846 / err=5.097361e-06 sqrt()=185363.69655356 sqrtfix2()=185363.69655356 / err=1.164153e-09
Many many years ago I worked on a demo program for a small computer our outfit had built. The computer had a built-in square-root instruction, and we built a simple program to demonstrate the computer doing 16-bit add/subtract/multiply/divide/square-root on a TTY. Alas, it turned out that there was a serious bug in the square root instruction, but we had promised to demo the function. So we created an array of the squares of the values 1-255, then used a simple lookup to match the value typed in to one of the array values. The index was the square root.
Given the representation of Decimal I have (you can find it here, for instance), I tried to convert a double this way:
explicit Decimal(double n)
{
DoubleAsQWord doubleAsQWord;
doubleAsQWord.doubleValue = n;
uint64 val = doubleAsQWord.qWord;
const uint64 topBitMask = (int64)(0x1 << 31) << 32;
//grab the 63th bit
bool isNegative = (val & topBitMask) != 0;
//bias is 1023=2^(k-1)-1, where k is 11 for double
uint32 exponent = (((uint64)(val >> 31) >> 21) & 0x7FF) - 1023;
//exclude both sign and exponent (<<12, >>12) and normalize mantissa
uint64 mantissa = ((uint64)(0x1 << 31) << 21) | (val << 12) >> 12;
// normalized mantissa is 53 bits long,
// the exponent does not care about normalizing bit
uint8 scale = exponent + 11;
if (scale > 11)
scale = 11;
else if (scale < 0)
scale = 0;
lo_ = ((isNegative ? -1 : 1) * n) * std::pow(10., scale);
signScale_ = (isNegative ? 0x1 : 0x0) | (scale << 1);
// will always be 0 since we cannot reach
// a 128 bits precision with a 64 bits double
hi_ = 0;
}
The DoubleAsQWord type is used to "cast" from double to its uint64 representation:
union DoubleAsQWord
{
double doubleValue;
uint64 qWord;
};
My Decimal type has these fields:
uint64 lo_;
uint32 hi_;
int32 signScale_;
All this stuff is encapsulated in my Decimal class. You can notice I extract the mantissa even if I'm not using it. I'm still thinking of a way to guess the scale accurately.
This is purely practical, and seems to work in the case of a stress test:
BOOST_AUTO_TEST_CASE( convertion_random_stress )
{
const double EPSILON = 0.000001f;
srand(time(0));
for (int i = 0; i < 10000; ++i)
{
double d1 = ((rand() % 10) % 2 == 0 ? -1 : 1)
* (double)(rand() % 1000 + 1000.) / (double)(rand() % 42 + 2.);
Decimal d(d1);
double d2 = d.toDouble();
double absError = fabs(d1 - d2);
BOOST_CHECK_MESSAGE(
absError <= EPSILON,
"absError=" << absError << " with " << d1 << " - " << d2
);
}
}
Anyway, how would you convert from double to this decimal representation?
I think you guys will be interested in an implementation of a C++ wrapper to the Intel Decimal Floating-Point Math Library:
C++ Decimal Wrapper Class
Intel DFP
What about using the VarR8FromDec function?
EDIT: This function is declared on Windows systems only. However, an equivalent C implementation is available with WINE, here: http://source.winehq.org/source/dlls/oleaut32/vartype.c
Perhaps you are looking for System::Convert::ToDecimal()
http://msdn.microsoft.com/en-us/library/a69w9ca0%28v=vs.80%29.aspx
Alternatively you could try recasting the Double as a Decimal.
An example from the MSDN.
http://msdn.microsoft.com/en-us/library/aa326763%28v=vs.71%29.aspx
// Convert the double argument; catch exceptions that are thrown.
void DecimalFromDouble( double argument )
{
Object* decValue;
// Convert the double argument to a Decimal value.
try
{
decValue = __box( (Decimal)argument );
}
catch( Exception* ex )
{
decValue = GetExceptionType( ex );
}
Console::WriteLine( formatter, __box( argument ), decValue );
}
If you do not have access to the .Net routines then this is tricky. I have done this myself for my hex editor (so that users can display and edit C# Decimal values using the Properties dialog) - see http://www.hexedit.com for more information. Also the source for HexEdit is freely available - see my article at http://www.codeproject.com/KB/cpp/HexEdit.aspx.
Actually my routines convert between Decimal and strings, but you can of course use sprintf to convert the double to a string first. (Also, when you talk about double I assume you mean the IEEE 64-bit floating point format, which is what most compilers/systems use nowadays.)
Note that there are a few gotchas if you want to handle precisely all valid Decimal values and return an error for any value that cannot be converted, since the format is not well documented. (The Decimal format is awful really, e.g. the same number can have many representations.)
Here is my code that converts a string to a Decimal. Note that it uses the GNU Multiple Precision Arithmetic Library (functions that start with mpz_). The String2Decimal function obviously returns false if it fails for some reason, such as the value being too big. The parameter 'presult' must point to a buffer of at least 16 bytes, to store the result.
bool String2Decimal(const char *ss, void *presult)
{
bool retval = false;
// View the decimal (result) as four 32 bit integers
unsigned __int32 *dd = (unsigned __int32 *)presult;
mpz_t mant, max_mant;
mpz_inits(mant, max_mant, NULL);
int exp = 0; // Exponent
bool dpseen = false; // decimal point seen yet?
bool neg = false; // minus sign seen?
// Scan the characters of the value
const char *pp;
for (pp = ss; *pp != '\0'; ++pp)
{
if (*pp == '-')
{
if (pp != ss)
goto exit_func; // minus sign not at start
neg = true;
}
else if (isdigit(*pp))
{
mpz_mul_si(mant, mant, 10);
mpz_add_ui(mant, mant, unsigned(*pp - '0'));
if (dpseen) ++exp; // Keep track of digits after decimal pt
}
else if (*pp == '.')
{
if (dpseen)
goto exit_func; // more than one decimal point
dpseen = true;
}
else if (*pp == 'e' || *pp == 'E')
{
char *end;
exp -= strtol(pp+1, &end, 10);
pp = end;
break;
}
else
goto exit_func; // unexpected character
}
if (*pp != '\0')
goto exit_func; // extra characters after end
if (exp < -28 || exp > 28)
goto exit_func; // exponent outside valid range
// Adjust mantissa for -ve exponent
if (exp < 0)
{
mpz_t tmp;
mpz_init_set_si(tmp, 10);
mpz_pow_ui(tmp, tmp, -exp);
mpz_mul(mant, mant, tmp);
mpz_clear(tmp);
exp = 0;
}
// Get max_mant = size of largest mantissa (2^96 - 1)
//mpz_set_str(max_mant, "79228162514264337593543950335", 10); // 2^96 - 1
static unsigned __int32 ffs[3] = { 0xFFFFffffUL, 0xFFFFffffUL, 0xFFFFffffUL };
mpz_import(max_mant, 3, -1, sizeof(ffs[0]), 0, 0, ffs);
// Check for mantissa too big.
if (mpz_cmp(mant, max_mant) > 0)
goto exit_func; // value too big
else if (mpz_sgn(mant) == 0)
exp = 0; // if mantissa is zero make everything zero
// Set integer part
dd[2] = mpz_getlimbn(mant, 2);
dd[1] = mpz_getlimbn(mant, 1);
dd[0] = mpz_getlimbn(mant, 0);
// Set exponent and sign
dd[3] = exp << 16;
if (neg && mpz_sgn(mant) > 0)
dd[3] |= 0x80000000;
retval = true; // indicate success
exit_func:
mpz_clears(mant, max_mant, NULL);
return retval;
}
How about this:
1) sprintf number into s
2) find decimal point (strchr), store in idx
3) atoi = obtain integer part easily, use union to separate high/lo
4) use strlen - idx to obtain number of digits after point
sprintf may be slow, but you'll get the solution in under 2 minutes of typing... (a quick sketch of these steps follows)
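A minimal sketch of those steps (my own illustration; it skips the union trick and sign handling, and assumes %g doesn't switch to exponent notation for the given value):
#include <cstdio>
#include <cstring>
#include <cstdlib>

int main()
{
    double value = 1.375;
    char s[64];
    std::sprintf(s, "%g", value);                // 1) print the number into s
    const char *dot = std::strchr(s, '.');       // 2) find the decimal point
    long int_part = std::atol(s);                // 3) integer part (atol stops at the '.')
    long num = int_part, den = 1;
    if (dot) {
        std::size_t digits = std::strlen(dot + 1); // 4) number of digits after the point
        for (std::size_t i = 0; i < digits; ++i) den *= 10;
        num = int_part * den + std::atol(dot + 1);
    }
    std::printf("%ld/%ld\n", num, den);          // prints 1375/1000
    return 0;
}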