Adding positive and negative numbers in IEEE-754 format - c++

My problem seems to be pretty simple: I wrote a program that manually adds floating point numbers together. The program has certain restrictions (such as no iostream and no unary operators), which is why those things are absent. As for the problem: the program seems to function correctly when adding two positive floats (1.5 + 1.5 = 3.0, for example), but when a negative number is involved (10.0 + -5.0) I get very wacky numbers. Here is the code:
#include <cstdio>
#define BIAS32 127
struct Real
{
    //sign bit
    int sign;
    //UNBIASED exponent
    long exponent;
    //Fraction including implied 1. at bit index 23
    unsigned long fraction;
};
Real Decode(int float_value);
int Encode(Real real_value);
Real Normalize(Real value);
Real Add(Real left, Real right);
unsigned long Add(unsigned long leftop, unsigned long rightop);
unsigned long Multiply(unsigned long leftop, unsigned long rightop);
void alignExponents(Real* left, Real* right);
bool is_neg(Real real);
int Twos(int op);
int main(int argc, char* argv[])
{
    int left, right;
    char op;
    int value;
    Real rLeft, rRight, result;
    if (argc < 4) {
        printf("Usage: %s <left> <op> <right>\n", argv[0]);
        return -1;
    }
    sscanf(argv[1], "%f", (float*)&left);
    sscanf(argv[2], "%c", &op);
    sscanf(argv[3], "%f", (float*)&right);
    rLeft = Decode(left);
    rRight = Decode(right);
    if (op == '+') {
        result = Add(rLeft, rRight);
    }
    else {
        printf("Unknown operator '%c'\n", op);
        return -2;
    }
    value = Encode(result);
    printf("%.3f %c %.3f = %.3f (0x%08x)\n",
        *((float*)&left),
        op,
        *((float*)&right),
        *((float*)&value),
        value
    );
    return 0;
}
Real Decode(int float_value)
{
    Real result{
        float_value >> 31 & 1 ? 1 : 0,                // test sign bit of float_value
        (long)Add(float_value >> 23 & 0xFF, -BIAS32), // test exponent bits of float_value & apply bias
        (unsigned long)float_value & 0x7FFFFF         // test mantissa bits of float_value
    };
    return result;
}
int Encode(Real real_value)
{
    int x = 0;
    x |= real_value.fraction; // Set the fraction bits of x
    x |= real_value.sign << 31; // Set the sign bit of x
    x |= Add(real_value.exponent, BIAS32) << 23; // Set the exponent bits of x
    return x;
}
Real Normalize(Real value)
{
    if (is_neg(value))
    {
        value.fraction = Twos(value.fraction);
    }
    unsigned int i = 0;
    while (i < 9)
    {
        if ((value.fraction >> Add(23, i)) & 1) // If there are set bits past the mantissa section
        {
            value.fraction >>= 1; // shift mantissa right by 1
            value.exponent = Add(value.exponent, 1); // increment exponent to accommodate the shift
        }
        i = Add(i, 1);
    }
    return value;
}
Real Add(Real left, Real right)
{
    Real a = left, b = right;
    alignExponents(&a, &b); // Aligns exponents of both operands
    unsigned long sum = Add(a.fraction, b.fraction);
    Real result = Normalize({ a.sign, a.exponent, sum }); // Normalize result if need be
    return result;
}
unsigned long Add(unsigned long leftop, unsigned long rightop)
{
    unsigned long sum = 0, test = 1; // sum initialized to 0, test created to compare bits
    while (test) // while test is not 0
    {
        if (leftop & test) // if the digit being tested is 1
        {
            if (sum & test) sum ^= test << 1; // if the sum tests to 1, carry a bit over
            sum ^= test;
        }
        if (rightop & test)
        {
            if (sum & test) sum ^= test << 1;
            sum ^= test;
        }
        test <<= 1;
    }
    return sum;
}
void alignExponents(Real* a, Real* b)
{
    if (a->exponent != b->exponent) // If the exponents are not equal
    {
        if (a->exponent > b->exponent)
        {
            int disp = a->exponent - b->exponent; // number of shifts needed, based on the difference between the two exponents
            b->fraction |= 1 << 23; // sets the implicit bit for shifting
            b->exponent = a->exponent; // sets exponents equal to each other
            b->fraction >>= disp; // mantissa is shifted over to accommodate the increase in power
            return;
        }
        int disp = b->exponent - a->exponent;
        a->fraction |= 1 << 23;
        a->exponent = b->exponent;
        a->fraction >>= disp;
    }
}
bool is_neg(Real real)
{
    if (real.sign) return true;
    return false;
}
int Twos(int op)
{
    return Add(~op, -1); // NOT the operand and add 1 to it
}
On top of that, I just tested the values 10.5 + 5.5 and got 24.0, so there appears to be even more wrong with this than I initially thought. I've been working on this for days and would love some help/advice.

Here is some help/advice. Now that you have worked on some of the code, I suggest going back and reworking your data structure. The declaration of such a crucial data structure would benefit from a lot more comments, making sure you know exactly what each field means.
For example, the implicit bit is not always 1. It is zero if the exponent is zero. That should be dealt with in your Encode and Decode functions. For the rest of your code, it is just a significand bit and should not have any special handling.
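For illustration, here is a minimal sketch of mine (not the OP's code, and it ignores the no-unary-operator restriction) of a Decode that only sets the implicit bit when the stored exponent field is non-zero; Encode would then have to strip that bit again:

Real Decode(int float_value)
{
    Real r;
    r.sign = (float_value >> 31) & 1;
    long biased = (float_value >> 23) & 0xFF;
    r.fraction = (unsigned long)float_value & 0x7FFFFF;
    if (biased == 0) {
        r.exponent = 1 - BIAS32; // subnormal: fixed exponent, no implicit 1
    } else {
        r.exponent = biased - BIAS32;
        r.fraction |= 1ul << 23; // normal: prepend the implicit 1
    }
    return r;
}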
When you start thinking about rounding, you will find you often need more than 23 bits in an intermediate result.
Making the significand of negative numbers 2's complement will create the problem of storing the same information in two ways. You will have both a sign bit, as though doing sign-and-magnitude, and have the sign encoded in the signed integer significand. Keeping them consistent will be a mess. Whatever you decide about how Real will store negative numbers, document it and keep it consistent throughout.
If I were implementing this I would start by defining Real very, very carefully. I would then decide what operations I wanted to be able to do on Real, and write functions to do them. If you get those right each function will be relatively simple.
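As a concrete illustration (a sketch of my own, not a prescription), a more carefully documented Real might read:

struct Real
{
    // sign of the value (0 or 1); the significand below is always a
    // magnitude, never two's complement
    int sign;
    // UNBIASED exponent of 2: stored field minus 127 for normals,
    // fixed at -126 for subnormals
    long exponent;
    // significand as a magnitude: the 23 stored fraction bits, plus the
    // implicit 1 at bit 23 when (and only when) the number is normal
    unsigned long fraction;
};

Field comments like these make it much harder to end up storing inconsistent state.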

Related

How can I get the numerator and denominator from a fractional number?

How can I get the numerator and denominator from a fractional number? For example, from "1.375" I want to get "1375/1000" or "11/8" as a result. How can I do it in C++?
I have tried separating the digits before and after the decimal point, but that didn't get me any closer to the desired output.
You didn't really specify whether you need to convert a floating point value or a string to a ratio, so I'm going to assume the former.
Instead of trying string or arithmetic-based approaches, you can directly use properties of IEEE-754 encoding.
Floats (called binary32 by the standard) are encoded in memory like this:
S EEEEEEEE MMMMMMMMMMMMMMMMMMMMMMM
^                                 ^
bit 31                        bit 0
where S is the sign bit, the Es are the exponent bits (8 of them), and the Ms are the mantissa bits (23 of them).
The number can be decoded like this:
value = (-1)^S * significand * 2^exponent
where:
significand = 1.MMMMMMMMMMMMMMMMMMMMMMM (as binary)
exponent = EEEEEEEE (as binary) - 127
(note: this is for so called "normal numbers", there are also zeroes, subnormals, infinities and NaNs - see Wikipedia page I linked)
This can be used here. We can rewrite the equation above like this:
(-1)^S * significand * 2^exponent = (-1)^S * (significand * 2^23) * 2^(exponent - 23)
The point is that significand * 2^23 is an integer (it equals 1MMMMMMMMMMMMMMMMMMMMMMM in binary - multiplying by 2^23 moved the point 23 places to the right), and 2^(exponent - 23) is a power of two, so either it or its reciprocal is an integer.
In other words: we can write the number as:
(significand * 2^23) / 2^(-(exponent - 23)) (when exponent - 23 < 0)
or
[(significand * 2^23) * 2^(exponent - 23)] / 1 (when exponent - 23 >= 0)
So we have both numerator and denominator - directly from binary representation of the number.
All of the above could be implemented like this in C++:
#include <cstdint>
#include <limits>

struct Ratio
{
    int64_t numerator; // numerator includes sign
    uint64_t denominator;

    float toFloat() const
    {
        return static_cast<float>(numerator) / denominator;
    }

    static Ratio fromFloat(float v)
    {
        // First, obtain the bitwise representation of the value
        // (type punning via reinterpret_cast; memcpy would be the strictly portable way)
        const uint32_t bitwiseRepr = *reinterpret_cast<uint32_t*>(&v);
        // Extract sign, exponent and mantissa bits (as stored in memory) for convenience:
        const uint32_t signBit = bitwiseRepr >> 31u;
        const uint32_t expBits = (bitwiseRepr >> 23u) & 0xffu; // 8 bits set
        const uint32_t mntsBits = bitwiseRepr & 0x7fffffu; // 23 bits set
        // Handle some special cases:
        if(expBits == 0 && mntsBits == 0)
        {
            // special case: +0 and -0
            return {0, 1};
        }
        else if(expBits == 255u && mntsBits == 0)
        {
            // special case: +inf, -inf
            // Let's agree that infinity is always represented as 1/0 in Ratio
            return {signBit ? -1 : 1, 0};
        }
        else if(expBits == 255u)
        {
            // special case: NaN
            // Let's agree that for NaN we return max int64_t over 0
            return {std::numeric_limits<int64_t>::max(), 0};
        }
        // significand with the implicit leading 1 restored above the 23 mantissa bits
        uint32_t significand = (1u << 23u) | mntsBits;
        const int64_t signFactor = signBit ? -1 : 1;
        const int32_t exp = expBits - 127 - 23;
        if(exp < 0)
        {
            return {signFactor * static_cast<int64_t>(significand), 1u << static_cast<uint32_t>(-exp)};
        }
        else
        {
            return {signFactor * static_cast<int64_t>(significand * (1u << static_cast<uint32_t>(exp))), 1};
        }
    }
};
(hopefully the comments and the description above are understandable - let me know if there's something to improve)
I've omitted checks for out of range values for simplicity.
We can use it like this:
float fv = 1.375f;
Ratio rv = Ratio::fromFloat(fv);
std::cout << "fv = " << fv << ", rv = " << rv << ", rv.toFloat() = " << rv.toFloat() << "\n";
And the output is:
fv = 1.375, rv = 11534336/8388608, rv.toFloat() = 1.375
As you can see, exactly the same values on both ends.
The problem is that the numerators and denominators are big. This is because the code always multiplies the significand by 2^23, even when a smaller value would be enough to make it an integer (this is equivalent to writing 0.2 as 2000000/10000000 instead of 2/10 - the same value, written differently).
This can be solved by changing the code to multiply the significand (and adjust the exponent) by the minimum amount, like this (the ellipsis stands for parts which are the same as above):
// counts the number of consecutive least significant bits equal to 0
// example: for 1001000 (binary) returns 3
uint32_t countTrailingZeroes(uint32_t v)
{
    uint32_t counter = 0;
    while(counter < 32 && (v & 1u) == 0)
    {
        v >>= 1u;
        ++counter;
    }
    return counter;
}
struct Ratio
{
    ...

    static Ratio fromFloat(float v)
    {
        ...
        uint32_t significand = (1u << 23u) | mntsBits;
        const uint32_t nTrailingZeroes = countTrailingZeroes(significand);
        significand >>= nTrailingZeroes;
        const int64_t signFactor = signBit ? -1 : 1;
        const int32_t exp = expBits - 127 - 23 + nTrailingZeroes;
        if(exp < 0)
        {
            return {signFactor * static_cast<int64_t>(significand), 1u << static_cast<uint32_t>(-exp)};
        }
        else
        {
            return {signFactor * static_cast<int64_t>(significand * (1u << static_cast<uint32_t>(exp))), 1};
        }
    }
};
And now, for the following code:
float fv = 1.375f;
Ratio rv = Ratio::fromFloat(fv);
std::cout << "fv = " << fv << ", rv = " << rv << ", rv.toFloat() = " << rv.toFloat() << "\n";
We get:
fv = 1.375, rv = 11/8, rv.toFloat() = 1.375
In C++ you can use the Boost rational class. But you need to give it a numerator and denominator.
For that you need to find out the number of digits after the decimal point in the input string. You can do this with string manipulation: read the input character by character and count the characters after the '.'.
char inputstr[30];
int noint = 0, nodec = 0;
char intstr[30], dec[30];
int decimalfound = 0;
int denominator = 1;
int numerator;
scanf("%s", inputstr);
int len = strlen(inputstr);
for (int i = 0; i < len; i++)
{
    if (decimalfound == 0)
    {
        if (inputstr[i] == '.')
        {
            decimalfound = 1;
        }
        else
        {
            intstr[noint++] = inputstr[i];
        }
    }
    else
    {
        dec[nodec++] = inputstr[i];
        denominator *= 10;
    }
}
dec[nodec] = '\0';
intstr[noint] = '\0';
numerator = atoi(dec) + (atoi(intstr) * denominator);
// You can now use the numerator and denominator as the fraction,
// either in the Rational class, or you can find the gcd and divide
// by it.
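For reference, a minimal sketch of handing such a numerator/denominator pair to Boost (assuming Boost is installed; boost::rational reduces the fraction by gcd itself):

#include <boost/rational.hpp>
#include <iostream>

int main()
{
    // 1.375 parsed as above gives numerator 1375 and denominator 1000
    boost::rational<int> r(1375, 1000);
    // boost::rational normalizes on construction, so this prints 11/8
    std::cout << r.numerator() << "/" << r.denominator() << "\n";
    return 0;
}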
What about this simple code:
double n = 1.375;
int num = 1, den = 1;
double frac = (num * 1.0 / den);
double margin = 0.000001;
while (fabs(frac - n) > margin) {
    if (frac > n) {
        den++;
    }
    else {
        num++;
    }
    frac = (num * 1.0 / den);
}
I haven't really tested it much; it's only an idea.
I hope I'll be forgiven for posting an answer which uses "only the C language". I know you tagged the question with C++ - but I couldn't turn down the bait, sorry. This is still valid C++ at least (although it does, admittedly, use mainly C string-processing techniques).
int num_string_float_to_rat(char *input, long *num, long *den) {
    char *tok = NULL, *end = NULL;
    char buf[128] = {'\0'};
    long a = 0, b = 0;
    int den_power = 1;
    strncpy(buf, input, sizeof(buf) - 1);
    tok = strtok(buf, ".");
    if (!tok) return 1;
    a = strtol(tok, &end, 10);
    if (*end != '\0') return 2;
    tok = strtok(NULL, ".");
    if (!tok) return 1;
    den_power = strlen(tok); // Denominator power of 10
    b = strtol(tok, &end, 10);
    if (*end != '\0') return 2;
    *den = static_cast<long>(pow(10.00, den_power));
    *num = a * *den + b;
    num_simple_fraction(num, den);
    return 0;
}
Sample usage:
int rc = num_string_float_to_rat("0015.0235", &num, &den);
// Check return code -> should be 0!
printf("%ld/%ld\n", num, den);
Output:
30047/2000
Full example at http://codepad.org/CFQQEZkc .
Notes:
strtok() is used to parse the input into tokens (no need to reinvent the wheel in that regard). strtok() modifies its input - so a temporary buffer is used for safety
it checks for invalid characters - and will return a non-zero return code if found
strtol() has been used instead of atoi() - as it can detect non-numeric characters in the input
scanf() has not been used to slurp the input - due to rounding issues with floating point numbers
the base for strtol() has been explicitly set to 10 to avoid problems with leading zeros (otherwise a leading zero will cause the number to be interpreted as octal)
it uses a num_simple_fraction() helper (not shown) - which in turn uses a gcd() helper (also not shown) - to convert the result to a simple fraction; a sketch of both is given after these notes
log10() of the numerator is determined by calculating the length of the token after the decimal point
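Since num_simple_fraction() and gcd() are not shown in the answer, here is a minimal sketch of what they might look like (my assumption of their contracts, inferred from the description above):

// Greatest common divisor via Euclid's algorithm.
long gcd(long a, long b)
{
    while (b != 0) {
        long t = b;
        b = a % b;
        a = t;
    }
    return a < 0 ? -a : a;
}

// Reduce *num / *den to lowest terms in place.
void num_simple_fraction(long *num, long *den)
{
    long g = gcd(*num, *den);
    if (g != 0) {
        *num /= g;
        *den /= g;
    }
}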
I'd do this in three steps.
1) find the decimal point, so that you know how large the denominator has to be.
2) get the numerator. That's just the original text with the decimal point removed.
3) get the denominator. If there was no decimal point, the denominator is 1. Otherwise, the denominator is 10^n, where n is the number of digits to the right of the (now-removed) decimal point.
struct fraction {
    std::string num, den;
};
fraction parse(std::string input) {
    // 1: find the decimal point
    std::size_t dec_point = input.find('.');
    // 2: the numerator is the original text with the decimal point removed
    std::size_t digits_after = 0;
    if (dec_point != std::string::npos) {
        digits_after = input.length() - dec_point - 1;
        input.erase(input.begin() + dec_point);
    }
    // 3: the denominator is 10^n, n = digits right of the removed point
    long long denom = 1;
    for (std::size_t i = 0; i < digits_after; ++i)
        denom *= 10;
    fraction result = { input, std::to_string(denom) };
    return result;
}

Hexadecimal representation of double fractional part for SHA-256

I am trying to write a SHA-256 hash function for practice. Wikipedia says the initial hash values are given by the fractional parts of the square roots of the first 8 primes 2..19. Now I am trying to calculate them. Here is what I have done so far:
#include <vector>
#include <cstdint>
#include <cmath>
#include <cstdio>
// fill primes with all prime values between min and max value
int getPrimes(uint32_t min, uint32_t max, std::vector<uint32_t>* primes)
{
    if (min < 1) min = 1; // primes can only be >= 1
    if (min > max) return 0; // max has to be larger than min
    for (uint32_t value = min; value <= max; value++)
    {
        uint32_t tmp;
        for (tmp = 2; tmp <= sqrt(value); tmp++) // start to check with 2, because 1 is always going to work
        {
            if (value % tmp == 0)
            {
                break;
            }
        }
        if (tmp > sqrt(value)) primes->push_back(value); // if no other integer divisor is found, add number to vector
    }
    return 0;
}
int main()
{
    std::vector<uint32_t> primes;
    getPrimes(2, 20, &primes); // fills vector with all prime values between 2 and 20
    double tmp = sqrt(primes[0]); // get square root, returns double
    printf("value %f\n", tmp); // debug
    printf("size of double %i\n", sizeof(double)); // get representation byte size
    double * tmpOffset = &tmp; // get value offset
    unsigned char * tmpChar = (unsigned char*)tmpOffset; // convert to char pointer
    printf("address of variable %i\n", &tmp); // debug
    printf("raw values\n1:%X\n2:%X\n3:%X\n4:%X\n5:%X\n6:%X\n7:%X\n8:%X\n",
        (uint8_t)tmpChar[0], (uint8_t)tmpChar[1], (uint8_t)tmpChar[2], (uint8_t)tmpChar[3],
        (uint8_t)tmpChar[4], (uint8_t)tmpChar[5], (uint8_t)tmpChar[6], (uint8_t)tmpChar[7]);
    return 0;
}
This finds the first 8 primes, calculates the square root of 2, and fetches the actual byte values directly from the memory location where the double is stored:
value 1.414214
size of double 8
address of variable 6881016
raw values
1:CD
2:3B
3:7F
4:66
5:9E
6:A0
7:F6
8:3F
Compared to the value given in the Wikipedia article, 0x6a09e667, what I am doing here looks awfully wrong. Is there some remapping happening, or how exactly is a double represented in binary? Can someone point me in the right direction for correctly calculating the fractional part in hex?
Edit:
Thanks for your help! It is not pretty but does work for now.
printf("raw fractional part:\n0x%02X %02X %02X %02X %02X %02X %02X\n",
(uint8_t)(0xf & tmpChar[6]), (uint8_t)tmpChar[5], (uint8_t)tmpChar[4], (uint8_t)tmpChar[3],
(uint8_t)tmpChar[2], (uint8_t)tmpChar[1], (uint8_t)tmpChar[0]);
uint32_t fracPart = (0xf & tmpChar[6]);
fracPart <<= 8;
fracPart |= tmpChar[5];
fracPart <<= 8;
fracPart |= tmpChar[4] ;
fracPart <<= 8;
fracPart |= tmpChar[3];
fracPart <<= 4;
fracPart |= (0xf0 & tmpChar[2]) >> 4;
printf("fractional part: %X\n", fracPart);
Edit2
A little bit of a nicer implementation:
uint32_t fracPart2 = *(uint32_t*)((char*)&tmp + 3); // point to fractional part - 4 bit
fracPart2 <<= 4; // shift to correct value
fracPart2 |= (0xf0 & *((char*)&tmp + 2)) >> 4; // append last 4 bit
printf("beautiful fractional part: %X\n", fracPart2);
This solution is highly platform dependent, so in a second approach I am going for something like in the link in comment 2.
Edit3
So this is my final solution, which does not depend on the internal representation of a double and calculates the fraction using just math:
uint32_t getFractionalPart(double value)
{
    uint32_t retValue = 0;
    for (uint8_t i = 0; i < 8; i++)
    {
        value = value - floor(value);
        retValue <<= 4;
        value *= 16;
        retValue += floor(value);
    }
    return retValue;
}
One thing to keep in mind is that the double here is 64 bits.
If you look at the IEEE representation of doubles at the link below, it has 1 sign bit, 11 exponent bits, and the remaining 52 bits are the mantissa (precision) bits.
Now look at the output you've gotten, and take a look at the nibbles I've put in quotes - do they look familiar?
The reason the number appears backwards is endianness. The mantissa begins after the first 12 bits (1 sign bit + 11 exponent bits), and in memory it runs toward the low-order bytes.
1:CD
2:3B
3:'7'F
4:'66'
5:'9E'
6:'A0'
7:F'6'
8:3F
or
8:3F
7:F'6'
6:'A0'
5:'9E'
4:'66'
3:'7'F
2:3B
1:CD
https://en.wikipedia.org/wiki/Double-precision_floating-point_format
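To make the byte order and bit layout explicit, here is a small portable sketch (my own, not part of the answer above) that copies the double's bits into an integer and keeps the 32 mantissa bits that follow the sign and exponent. Note it yields the fractional part directly only for values in [1, 2), such as sqrt(2) and sqrt(3), where the exponent is zero; the general case needs the exponent taken into account (or the math-only approach from Edit3):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Extract the top 32 mantissa bits of a double without pointer casts.
uint32_t topMantissaBits(double d)
{
    uint64_t bits;
    std::memcpy(&bits, &d, sizeof bits); // bit-exact copy, no aliasing or endianness tricks
    // Layout, MSB first: 1 sign bit, 11 exponent bits, 52 mantissa bits.
    // Shifting out the low 20 mantissa bits leaves the top 32 of them.
    return (uint32_t)(bits >> 20);
}

int main()
{
    std::printf("%08x\n", topMantissaBits(std::sqrt(2.0))); // prints 6a09e667
    return 0;
}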

Binary-Decimal Negative bit set

How can I tell if a binary number is negative?
Currently I have the code below. It works fine converting to binary. When converting back to decimal, I need to know if the leftmost bit is 1 to tell whether the number is negative, but I cannot seem to figure out how to do that.
Also, instead of making my Bin2 function print 1's and 0's, how can I make it return an integer? I don't want to store it in a string and then convert to int.
EDIT: I'm using 8 bit numbers.
int Bin2(int value, int Padding = 8)
{
    for (int I = Padding; I > 0; --I)
    {
        if (value & (1 << (I - 1)))
            std::cout << '1';
        else
            std::cout << '0';
    }
    return 0;
}
int Dec2(int Value)
{
    //bool Negative = (Value & 10000000);
    int Dec = 0;
    for (int I = 0; Value > 0; ++I)
    {
        if (Value % 10 == 1)
        {
            Dec += (1 << I);
        }
        Value /= 10;
    }
    //if (Negative) (Dec -= (1 << 8));
    return Dec;
}
int main()
{
    Bin2(25);
    std::cout << "\n\n";
    std::cout << Dec2(11001);
}
You are checking for negative value incorrectly. Do the following instead:
bool Negative = (value & 0x80000000); //It will work for 32-bit platforms only
Or may be just compare it with 0.
bool Negative = (value < 0);
Why don't you just compare it to 0? That should work fine, and you almost certainly can't do it more efficiently than the compiler.
I am entirely unclear if this is what the OP is looking for, but it's worth a toss:
If you know you have a value in a signed int that is supposed to represent a signed 8-bit value, you can pull it apart, store it in a signed 8-bit variable, then promote it back to a native signed int like this:
#include <stdio.h>
int main(void)
{
    // signed integer, value is 245. 8-bit signed value is (-11)
    int num = 0xF5;
    // pull out the low 8 bits, storing them in a signed char.
    signed char ch = (signed char)(num & 0xFF);
    // now let the signed char promote to a signed int.
    int res = ch;
    // finally print both.
    printf("%d ==> %d\n", num, res);
    // do it again for an 8-bit positive value,
    // this time with just direct casts.
    num = 0x70;
    printf("%d ==> %d\n", num, (int)((signed char)(num & 0xFF)));
    return 0;
}
Output
245 ==> -11
112 ==> 112
Is that what you're trying to do? In short, the code above takes the 8 bits sitting at the bottom of num, treats them as a signed 8-bit value, then promotes them to a native signed int. The result is that you can now "know" not only whether the 8 bits were a negative number (res will be negative if they were), you also get the 8-bit signed number as a native int in the process.
On the other hand, if all you care about is whether the 8th bit is set in the input int, denoting a negative value, then why not just:
int IsEightBitNegative(int val)
{
    return (val & 0x80) != 0;
}
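For the second part of the question (making Bin2 return an integer instead of printing), a minimal sketch in the spirit of the OP's code (mine; it packs the bits as decimal digits, which is the form Dec2 already expects):

// Returns e.g. 11001 (as a decimal int) for value = 25.
int Bin2AsInt(int value, int Padding = 8)
{
    int result = 0;
    for (int I = Padding; I > 0; --I)
    {
        result *= 10; // make room for the next digit
        if (value & (1 << (I - 1)))
            result += 1; // append a '1' digit
    }
    return result;
}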

Going crazy, why are my variables changing on me?

Okay, I've had this happen to me before, where variables randomly change value because of memory corruption or wrong addressing, such as when you go out of bounds with an array. However, I'm not using arrays, pointers, or addresses, so I have no idea why, after "exponent" is set to 0, executing this loop suddenly leaves it equal to 288:
EDIT: It decides to break on specifically: 0x80800000.
This does not break in just one test; we have a "testing" client which iterates through several test cases, calling this function each time. Each time the function is called, the values should be reset to their original values.
/*
 * float_i2f - Return bit-level equivalent of expression (float) x
 *   Result is returned as unsigned int, but
 *   it is to be interpreted as the bit-level representation of a
 *   single-precision floating point value.
 *   Legal ops: Any integer/unsigned operations incl. ||, &&. also if, while
 *   Max ops: 30
 *   Rating: 4
 */
unsigned float_i2f(int x) {
    int sign = 0;
    int a = 0;
    int exponent = 0;
    int crash_test = 0;
    int exp = 0;
    int fraction = 0;
    int counter = 0;
    if (x == 0) return 0;
    if (!(x ^ (0x01 << 31)))
    {
        return 0xCF << 24;
    }
    if (x >> 31)
    {
        sign = 0xFF << 31;
        x = (~x) + 1;
    }
    else
    {
        sign = 0x00;
    }
    //printf(" After : %x ", x);
    a = 1;
    exponent = 0;
    crash_test = 0;
    while ((a*2) <= x)
    {
        if (a == 0) a = 1;
        if (a == 1) crash_test = exponent;
        /*
        if (exponent == 288)
        {
            exponent = 0;
            counter++;
            if (counter <= 2)
                printf("WENT OVERBOARD WTF %d ORIGINAL %d", a, crash_test);
        }
        */
        if (exponent > 300) break;
        exponent++;
        a *= 2;
    }
    exp = (exponent + 0x7F) << 23;
    fraction = (~(((0x01) << 31) >> 7)) & (x << (25 - (exponent + 1)));
    return sign | exp | fraction;
}
Use a debugger or IDE; set a watch/breakpoint/assert on the value of exponent (e.g. exponent > 100).
What was the offending value of x that float_i2f() was called with? Did exponent blow up for all x, or only for some range?
(Did you just say x = 0x80800000? Did you set a watch on exponent and step through in a debugger with that value? That should answer your question. Did you check that 0x807FFFFF works, for example?)
I tried it myself with Visual Studio, and an input of "10", and it seemed to work OK.
Q: Can you give me an input value of "x" where it fails?
Q: What compiler are you using? What platform are you running on?
You have a line that increments exponent at the end of your while loop.
while ((a*2) <= x)
{
    if (a == 0) a = 1;
    if (a == 1) crash_test = exponent;
    /*
    if (exponent == 288)
    {
        exponent = 0;
        counter++;
        if (counter <= 2)
            printf("WENT OVERBOARD WTF %d ORIGINAL %d", a, crash_test);
    }
    */
    if (exponent > 300) break;
    exponent++;
    a *= 2;
}
The variable exponent isn't doing anything mysterious. You are incrementing exponent each time through the loop, so it eventually hits any number you like. The real question is why doesn't your loop exit when you think it should?
Your loop condition depends on a. Try printing out the successive values of a as your loop repeats. Do you notice anything funny happening after a reaches 1073741824? Have you heard about integer overflow in your classes yet?
Just handle the case where "a" goes negative (or better, validate your input so it never goes negative in the first place), and you should be fine :)
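A minimal sketch of that fix (my own; it rewrites the loop condition so a is never doubled past the point of overflow):

// a*2 <= x rewritten as a <= x/2, which cannot overflow for positive x.
a = 1;
exponent = 0;
while (a <= x / 2)
{
    exponent++;
    a *= 2;
}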
There were many useless attempts at optimization in there; I've removed them so the code is easier to read. I also used <stdint.h> types as appropriate.
There was signed integer overflow in a *= 2 in the loop, but the main problem was the lack of named constants and the weird computation of magic numbers.
This still isn't exemplary, because the constants should all be named, but this seems to work reliably.
#include <stdio.h>
#include <stdint.h>
uint32_t float_i2f(int32_t x) {
    uint32_t sign = 0;
    uint32_t exponent = 0;
    uint32_t fraction = 0;
    if (x == 0) return 0;
    if (x == 0x80000000)
    {
        return 0xCF000000u;
    }
    if (x < 0)
    {
        sign = 0x80000000u;
        x = -x;
    }
    else
    {
        sign = 0;
    }
    /* Count order of magnitude; this will be excessive by 1. */
    for (exponent = 1; (1u << exponent) <= x; ++exponent) ;
    if (exponent < 24) {
        fraction = 0x007FFFFF & (x << (24 - exponent)); /* strip leading 1-bit */
    } else {
        fraction = 0x007FFFFF & (x >> (exponent - 24));
    }
    exponent = (exponent + 0x7E) << 23;
    return sign | exponent | fraction;
}
a overflows: a*2 == 0 when a == 1<<31, so every time exponent % 32 == 0 you get a == 0, and you loop until exponent == 300.
There are a few other issues as well:
Your fraction calculation is off when exponent>=24. Negative left shifts do not automatically turn into positive right shifts.
The mask used to generate the fraction is also slightly wrong. The leading bit is always assumed to be 1, and the mantissa is only 23 bits, so the fraction for x < 2^23 should be:
fraction = (~(((0x01)<< 31) >> 8)) & (x << (24 - (exponent + 1)));
The loop to calculate the exponent fails when abs(x)>=1<<31 (and incidentally results in precision loss if you don't round appropriately); a loop that takes the implicit 1 into account would be better here.
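For what that last point suggests, here is a sketch of a normalization loop that places the implicit 1 at bit 23 directly (my interpretation of the suggestion; rounding of the discarded low bits is still ignored):

// After the sign has been handled, ux holds |x| as an unsigned value.
unsigned ux = (unsigned)x;
int exponent = 23;
while (ux >= (1u << 24)) { // too wide: shift right (truncates low bits)
    ux >>= 1;
    ++exponent;
}
while (ux < (1u << 23)) { // too narrow: shift left
    ux <<= 1;
    --exponent;
}
// biased exponent field = exponent + 127; fraction = ux & 0x7FFFFF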

How to convert a double to a C# decimal in C++?

Given the representation of decimal I have --you can find it here for instance--, I tried to convert a double this way:
explicit Decimal(double n)
{
    DoubleAsQWord doubleAsQWord;
    doubleAsQWord.doubleValue = n;
    uint64 val = doubleAsQWord.qWord;
    const uint64 topBitMask = (int64)(0x1 << 31) << 32;
    //grab the 63rd bit
    bool isNegative = (val & topBitMask) != 0;
    //bias is 1023 = 2^(k-1) - 1, where k is 11 for double
    uint32 exponent = (((uint64)(val >> 31) >> 21) & 0x7FF) - 1023;
    //exclude both sign and exponent (<<12, >>12) and normalize mantissa
    uint64 mantissa = ((uint64)(0x1 << 31) << 21) | (val << 12) >> 12;
    // normalized mantissa is 53 bits long,
    // the exponent does not care about the normalizing bit
    uint8 scale = exponent + 11;
    if (scale > 11)
        scale = 11;
    else if (scale < 0)
        scale = 0;
    lo_ = ((isNegative ? -1 : 1) * n) * std::pow(10., scale);
    signScale_ = (isNegative ? 0x1 : 0x0) | (scale << 1);
    // will always be 0 since we cannot reach
    // 128-bit precision with a 64-bit double
    hi_ = 0;
}
The DoubleAsQWord type is used to "cast" from double to its uint64 representation:
union DoubleAsQWord
{
    double doubleValue;
    uint64 qWord;
};
My Decimal type has these fields:
uint64 lo_;
uint32 hi_;
int32 signScale_;
All this stuff is encapsulated in my Decimal class. You'll notice I extract the mantissa even though I'm not using it. I'm still thinking of a way to guess the scale accurately.
This is purely practical, and seems to work in the case of a stress test:
BOOST_AUTO_TEST_CASE( convertion_random_stress )
{
    const double EPSILON = 0.000001f;
    srand(time(0));
    for (int i = 0; i < 10000; ++i)
    {
        double d1 = ((rand() % 10) % 2 == 0 ? -1 : 1)
            * (double)(rand() % 1000 + 1000.) / (double)(rand() % 42 + 2.);
        Decimal d(d1);
        double d2 = d.toDouble();
        double absError = fabs(d1 - d2);
        BOOST_CHECK_MESSAGE(
            absError <= EPSILON,
            "absError=" << absError << " with " << d1 << " - " << d2
        );
    }
}
Anyway, how would you convert from double to this decimal representation?
I think you guys will be interested in an implementation of a C++ wrapper to the Intel Decimal Floating-Point Math Library:
C++ Decimal Wrapper Class
Intel DFP
What about using the VarR8FromDec function?
EDIT: This function is declared on Windows systems only. However, an equivalent C implementation is available in WINE, here: http://source.winehq.org/source/dlls/oleaut32/vartype.c
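For completeness, a short sketch of the pair in use (assuming the usual oleaut32 declarations from <oleauto.h>; VarDecFromR8 goes from double to DECIMAL, and VarR8FromDec back again):

#include <windows.h>
#include <oleauto.h>
#include <stdio.h>

int main()
{
    DECIMAL dec;
    double back = 0.0;
    // double -> DECIMAL, then DECIMAL -> double to check the round trip
    if (SUCCEEDED(VarDecFromR8(1.375, &dec)) &&
        SUCCEEDED(VarR8FromDec(&dec, &back)))
    {
        printf("round-trip: %f\n", back); // expected: 1.375000
    }
    return 0;
}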
Perhaps you are looking for System::Convert::ToDecimal()
http://msdn.microsoft.com/en-us/library/a69w9ca0%28v=vs.80%29.aspx
Alternatively you could try recasting the Double as a Decimal.
An example from the MSDN.
http://msdn.microsoft.com/en-us/library/aa326763%28v=vs.71%29.aspx
// Convert the double argument; catch exceptions that are thrown.
void DecimalFromDouble( double argument )
{
    Object* decValue;
    // Convert the double argument to a Decimal value.
    try
    {
        decValue = __box( (Decimal)argument );
    }
    catch( Exception* ex )
    {
        decValue = GetExceptionType( ex );
    }
    Console::WriteLine( formatter, __box( argument ), decValue );
}
If you do not have access to the .Net routines then this is tricky. I have done this myself for my hex editor (so that users can display and edit C# Decimal values using the Properties dialog) - see http://www.hexedit.com for more information. The source for HexEdit is also freely available - see my article at http://www.codeproject.com/KB/cpp/HexEdit.aspx.
Actually my routines convert between Decimal and strings, but you can of course use sprintf to convert the double to a string first. (Also, when you talk about double I assume you explicitly mean the IEEE 64-bit floating-point format, though that is what most compilers/systems use nowadays.)
Note that there are a few gotchas if you want to handle precisely all valid Decimal values and return an error for any value that cannot be converted, since the format is not well documented. (The Decimal format is awful really; e.g. the same number can have many representations.)
Here is my code that converts a string to a Decimal. Note that it uses the GNU Multiple Precision Arithmetic Library (functions that start with mpz_). The String2Decimal function obviously returns false if it fails for some reason, such as the value being too big. The parameter 'presult' must point to a buffer of at least 16 bytes, to store the result.
bool String2Decimal(const char *ss, void *presult)
{
    bool retval = false;
    // View the decimal (result) as four 32 bit integers
    unsigned __int32 *dd = (unsigned __int32 *)presult;
    mpz_t mant, max_mant;
    mpz_inits(mant, max_mant, NULL);
    int exp = 0;         // Exponent
    bool dpseen = false; // decimal point seen yet?
    bool neg = false;    // minus sign seen?
    // Scan the characters of the value
    const char *pp;
    for (pp = ss; *pp != '\0'; ++pp)
    {
        if (*pp == '-')
        {
            if (pp != ss)
                goto exit_func; // minus sign not at start
            neg = true;
        }
        else if (isdigit(*pp))
        {
            mpz_mul_si(mant, mant, 10);
            mpz_add_ui(mant, mant, unsigned(*pp - '0'));
            if (dpseen) ++exp; // Keep track of digits after decimal pt
        }
        else if (*pp == '.')
        {
            if (dpseen)
                goto exit_func; // more than one decimal point
            dpseen = true;
        }
        else if (*pp == 'e' || *pp == 'E')
        {
            char *end;
            exp -= strtol(pp+1, &end, 10);
            pp = end;
            break;
        }
        else
            goto exit_func; // unexpected character
    }
    if (*pp != '\0')
        goto exit_func; // extra characters after end
    if (exp < -28 || exp > 28)
        goto exit_func; // exponent outside valid range
    // Adjust mantissa for -ve exponent
    if (exp < 0)
    {
        mpz_t tmp;
        mpz_init_set_si(tmp, 10);
        mpz_pow_ui(tmp, tmp, -exp);
        mpz_mul(mant, mant, tmp);
        mpz_clear(tmp);
        exp = 0;
    }
    // Get max_mant = size of largest mantissa (2^96 - 1)
    //mpz_set_str(max_mant, "79228162514264337593543950335", 10); // 2^96 - 1
    static unsigned __int32 ffs[3] = { 0xFFFFffffUL, 0xFFFFffffUL, 0xFFFFffffUL };
    mpz_import(max_mant, 3, -1, sizeof(ffs[0]), 0, 0, ffs);
    // Check for mantissa too big.
    if (mpz_cmp(mant, max_mant) > 0)
        goto exit_func; // value too big
    else if (mpz_sgn(mant) == 0)
        exp = 0; // if mantissa is zero make everything zero
    // Set integer part
    dd[2] = mpz_getlimbn(mant, 2);
    dd[1] = mpz_getlimbn(mant, 1);
    dd[0] = mpz_getlimbn(mant, 0);
    // Set exponent and sign
    dd[3] = exp << 16;
    if (neg && mpz_sgn(mant) > 0)
        dd[3] |= 0x80000000;
    retval = true; // indicate success
exit_func:
    mpz_clears(mant, max_mant, NULL);
    return retval;
}
How about this:
1) sprintf the number into s
2) find the decimal point (strchr), store its index in idx
3) atoi: obtain the integer part easily; use a union to separate high/lo
4) use strlen - idx to obtain the number of digits after the point
sprintf may be slow, but you'll get the solution in under 2 minutes of typing...
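A rough sketch of those steps in code (my assumptions: long long parts, a fixed 9-digit print, and no overflow or locale handling):

#include <cstdio>
#include <cstdlib>
#include <cstring>

void doubleToNumDen(double d, long long* num, long long* den)
{
    char s[64];
    snprintf(s, sizeof s, "%.9f", d);     // 1) sprintf the number into s
    char* dot = strchr(s, '.');           // 2) find the decimal point
    size_t fracDigits = strlen(dot + 1);  // 4) digits after the point
    memmove(dot, dot + 1, strlen(dot));   // drop the '.' so the digits form one integer
    *num = atoll(s);                      // 3) numerator from the digit string
    *den = 1;
    for (size_t i = 0; i < fracDigits; ++i)
        *den *= 10;                       // denominator = 10^fracDigits
}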