Isolate the fractional part of a double using bitwise operations in c++ - c++

I am stuck on an assignment for my modern numerical software development class.
Function prototype (assume x = 6.5):
//returns the IEEE fractional part of x as a decimal floating point number. You must convert binary to decimal.
inline double fraction(double x) {}
What I got:
// Attempt at returning the IEEE-754 fraction field of x scaled down to a double.
// NOTE(review): both shift expressions below start from an `unsigned int` literal;
// where unsigned int is 32 bits wide, shifting it by 52 is undefined behaviour.
inline double fraction(double x)
{
// Get the fraction
unsigned long long frac_mask = (1u << 52) - 1; // Intended: 52 one-bits; the shift is evaluated in unsigned int before it is widened
unsigned long long xint = *reinterpret_cast<long long*>(&x); // Interpret x's bits as an int
unsigned long long frac_num = xint & frac_mask; // Get the fraction as an int
double fraction = double(frac_num) / double(2u << 52); // Intended: divide by 2^52; the shift is again done in unsigned int (and 2 << 52 would be 2^53)
return fraction;
/* This code works, but is not what is specified:
double fraction = x / pow(2, exponent(x));
fraction = fmod(fraction, 1);
return fraction;
*/
}
I keep getting a NaN. The answer I am looking for is 0.625. I am kind of hopelessly lost. Any help is much appreciated.
I was able to successfully isolate the exponent of the double with the following function:
// Returns the unbiased (true) binary exponent of x as a decimal integer.
// Zero is reported as -1022 and NaN as 1024; every other input is decoded
// from the 11-bit exponent field of the IEEE-754 encoding.
inline int exponent(double x)
{
    if (x == 0.0) {
        return -1022;
    }
    if (isnan(x)) {
        return 1024;
    }

    // View the bits of x as an integer.
    const unsigned long long raw = *reinterpret_cast<long long*>(&x);
    // Eleven 1's, applied after the exponent field has been moved down to bit 0.
    const unsigned long long field_mask = (1u << 11) - 1;
    const unsigned long long biased = (raw >> 52) & field_mask;

    // Remove the bias.
    return biased - 1023;
}
I am confused why the exponent logic works, but the fraction won't.

You are mixing unsigned (presumably 32-bits) with values that need 64 bits.
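For example, the literal 1u in (1u << 52) is only a 32-bit unsigned int, so the shift is carried out in 32 bits before the result is ever stored in the 64-bit variable. Use a long or long long literal [or uint64_t, which is a more reliable way to get a 64-bit value].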
// Returns the significand of x (hidden bit included) scaled into [0.5, 1),
// i.e. the value x would have if its binary exponent were forced to -1.
// Only meaningful for normal, finite x: the hidden bit is added unconditionally.
inline double fraction(double x)
{
// Get the fraction
// 1ull is at least 64 bits on every platform; 1ul is only 32 bits where unsigned long is 32 bits.
uint64_t frac_mask = (1ull << 52) - 1; // Get 52 1's
// uint64_t xint = *reinterpret_cast<uint64_t*>(&x); // Interpret x's bits as an int
uint64_t xint;
memcpy(&xint, &x, sizeof(xint)); // Interpret x's bits as an int (no aliasing violation)
int64_t frac_num = xint & frac_mask; // Get the fraction as an int
frac_num += 1ull << 52; // Add hidden bit, giving a value in [2^52, 2^53).
double fraction = double(frac_num) / double(2ull << 52); // Divide frac_num by 2^53
return fraction;
}
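Note the l suffix added to the 1u and 2u literals, so that the shifts are performed in a 64-bit type (use ull where unsigned long is only 32 bits, e.g. on 64-bit Windows). You will also need to include cstdint to get the sized integer types.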
Edit: that will of course just give you the mantissa in the form of a fraction. The decimal point may be anywhere between bit 1023 and -1023, meaning that only values between -1 and +1 will have the correct result.
A complete example using the code above [+ some printouts]
#include <cstdint>
#include <iostream>
#include <cstring>
// Returns the significand of x (hidden bit included) scaled into [0.5, 1),
// printing the intermediate bit patterns in hex along the way.
// Only meaningful for normal, finite x: the hidden bit is added unconditionally.
inline double fraction(double x)
{
// Get the fraction
// 1ull is at least 64 bits on every platform; 1ul is only 32 bits where unsigned long is 32 bits.
uint64_t frac_mask = (1ull << 52) - 1; // Get 52 1's
std::cout << "mask=" << std::hex << frac_mask << std::endl;
// uint64_t xint = *reinterpret_cast<uint64_t*>(&x); // Interpret x's bits as an int
uint64_t xint;
memcpy(&xint, &x, sizeof(xint)); // Interpret x's bits as an int (no aliasing violation)
int64_t frac_num = xint & frac_mask; // Get the fraction as an int
frac_num += 1ull << 52; // Add hidden bit, giving a value in [2^52, 2^53).
std::cout << "xint=" << std::hex << xint << " num=" << std::hex << frac_num << std::endl;
double fraction = double(frac_num) / double(2ull << 52); // Divide frac_num by 2^53
return fraction;
}
// Runs fraction() on four sample values and prints the results in fixed notation.
int main()
{
    const double inputs[4] = {0.5, 0.75, 6.5, 4.711};
    double results[4];
    // Evaluate in input order so the diagnostic output of fraction() keeps its sequence.
    for (int i = 0; i < 4; ++i)
    {
        results[i] = fraction(inputs[i]);
    }
    std::cout << "fa=" << std::fixed << results[0] << " fb=" << results[1] << std::endl;
    std::cout << "fd=" << std::fixed << results[2] << " fe=" << results[3] << std::endl;
}
Result of running the above:
mask=fffffffffffff
xint=3fe0000000000000 num=10000000000000
mask=fffffffffffff
xint=3fe8000000000000 num=18000000000000
mask=fffffffffffff
xint=401a000000000000 num=1a000000000000
mask=fffffffffffff
xint=4012d810624dd2f2 num=12d810624dd2f2
fa=0.500000 fb=0.750000
fd=0.812500 fe=0.588875
Note that if you divide 4.711 by 2 a few times [3 times to be precise], you get 0.588875, and if you divide 6.5 by 8 (or by 2 three times over), you get 0.8125
I need to go to bed, but you basically have to take the exponent into account to figure out the fraction of a floating point number. Or simply convert to an integer, and subtract it - as long as it's within range.

Code: Try It Online
// bit_cast, bit_width
#include <bit>
// assert
#include <cassert>
// uint64_t
#include <cstdint>
/// Returns the fractional part of x, computed directly on the IEEE-754
/// binary64 bit pattern.
///
/// - |x| < 1 (including subnormals and both zeros): x is returned unchanged.
/// - |x| >= 2^52, infinities and NaN: there are no fraction bits, +0.0 is returned.
/// - Otherwise the bits below the binary point are renormalised into a new
///   double carrying the sign of x; an integral x yields a zero with that sign.
[[nodiscard]]
constexpr auto Frac(double x)
noexcept -> double
{
    using Bits = std::uint64_t;
    constexpr Bits s_sign_bit_count{ 1ull };
    constexpr Bits s_exponent_bit_count{ 11ull };
    constexpr Bits s_mantissa_bit_count{ 52ull };
    constexpr Bits s_sign_max{ (1ull << s_sign_bit_count) - 1ull };
    constexpr Bits s_exponent_max{ (1ull << s_exponent_bit_count) - 1ull };
    constexpr Bits s_mantissa_max{ (1ull << s_mantissa_bit_count) - 1ull };
    constexpr Bits s_sign_mask{ s_sign_max << (s_exponent_bit_count + s_mantissa_bit_count) };
    constexpr Bits s_exponent_mask{ s_exponent_max << s_mantissa_bit_count };
    constexpr Bits s_mantissa_mask{ s_mantissa_max };
    constexpr Bits s_exponent_bias{ (1ull << (s_exponent_bit_count - 1ull)) - 1ull };

    if ((-1.0 < x) and (x < 1.0))
    {
        // Includes: subnormal, +0, -0
        // No integral part.
        return x;
    }

    const Bits u = std::bit_cast< Bits >(x);
    const Bits exponent_bits = (u & s_exponent_mask) >> s_mantissa_bit_count;
    // |x| >= 1 here, so the biased exponent cannot be below the bias.
    assert(s_exponent_bias <= exponent_bits);
    const Bits exponent = exponent_bits - s_exponent_bias;
    if (s_mantissa_bit_count <= exponent)
    {
        // Includes: +Inf, -Inf, NaN
        // No fractional part.
        return {};
    }

    // The lowest (52 - exponent) mantissa bits sit below the binary point.
    const Bits fraction_bit_count = s_mantissa_bit_count - exponent;
    const Bits fraction_mask = (1ull << fraction_bit_count) - 1ull;
    const Bits fraction_bits = u & fraction_mask;

    // Start from a zero that carries the sign of x.
    Bits fraction = u & s_sign_mask;
    if (fraction_bits == 0ull)
    {
        // x is integral. Without this guard bit_width(0) == 0 would give a
        // shift of 53 and produce the spurious value 2^(exponent - 53).
        return std::bit_cast< double >(fraction);
    }

    // Shift that moves the highest set fraction bit into the implicit-one
    // position (bit 52). It is at most 52, so it is always smaller than
    // exponent_bits (>= 1023) and the new biased exponent stays positive.
    const Bits fraction_shift = s_mantissa_bit_count - std::bit_width(fraction_bits)
                              + 1ull; // Implicit leading one
    const Bits fraction_exponent = exponent_bits - fraction_shift;
    const Bits fraction_mantissa = (fraction_bits << fraction_shift)
                                 // Remove implicit leading one.
                                 & s_mantissa_mask;
    fraction |= (fraction_exponent << s_mantissa_bit_count);
    fraction |= fraction_mantissa;
    return std::bit_cast< double >(fraction);
}
Depending on your own preference, you can return x as well in case of a NaN.
Test:
// setprecision
#include <iomanip>
// cout, endl, fixed
#include <iostream>
auto main() -> int
{
std::cout << std::fixed << std::setprecision(11);
{
constexpr double d = 7.99999952316;
constexpr double frac = Frac(d);
std::cout << "frac(" << d << ") = " << frac << std::endl;
}
{
constexpr double d = 0.5;
constexpr double frac = Frac(d);
std::cout << "frac(" << d << ") = " << frac << std::endl;
}
{
constexpr double d = 0.75;
constexpr double frac = Frac(d);
std::cout << "frac(" << d << ") = " << frac << std::endl;
}
{
constexpr double d = 6.5;
constexpr double frac = Frac(d);
std::cout << "frac(" << d << ") = " << frac << std::endl;
}
{
constexpr double d = 4.711;
constexpr double frac = Frac(d);
std::cout << "frac(" << d << ") = " << frac << std::endl;
}
return 0;
}
Output
frac(7.99999952316) = 0.99999952316
frac(0.50000000000) = 0.50000000000
frac(0.75000000000) = 0.75000000000
frac(6.50000000000) = 0.50000000000
frac(4.71100000000) = 0.71100000000

Related

How to store and use two 32-bit signed int in one 64-bit int?

First of all, I want to clarify that this question is different from the questions:
How to store a 64 bit integer in two 32 bit integers and convert back again
Is it possible to store 2 32-bit values in one long int variable?
How to combine two 32-bit integers into one 64-bit integer?
This question is about both storing and using the packed value, which means I want to be able to do this:
int64_t score = make_score(-15, 15);
score += make_score(-5, 5); //I can use (add, subtract) the score
int32_t a = get_a(score);
assert(a == -20); //-15 -5 = -20
int32_t b = get_b(score);
assert(b == 20);//15 + 5= 20
This is achievable for two 16-bit int in one 32-bit int (Stockfish did this):
/// Score enum stores a middlegame and an endgame value in a single integer (enum).
/// The least significant 16 bits are used to store the middlegame value and the
/// upper 16 bits are used to store the endgame value. We have to take care to
/// avoid left-shifting a signed int to avoid undefined behavior.
enum Score : int { SCORE_ZERO };

// Packs eg into the upper 16 bits and adds mg on top. The shift is done on
// an unsigned value so that a negative eg is never left-shifted as a signed int.
constexpr Score make_score(int mg, int eg) {
    return static_cast<Score>(
        static_cast<int>(static_cast<unsigned int>(eg) << 16) + mg);
}
/// Extracting the signed lower and upper 16 bits is not so trivial because
/// according to the standard a simple cast to short is implementation defined
/// and so is a right shift of a signed integer.
// Recovers the upper 16 bits of s as a signed value. 0x8000 is added first
// so that a negative lower half does not leave a borrow in the upper half.
inline Value eg_value(Score s) {
    const unsigned rounded = unsigned(s + 0x8000);
    union { uint16_t bits; int16_t as_signed; } upper = { uint16_t(rounded >> 16) };
    return Value(upper.as_signed);
}
// Recovers the lower 16 bits of s as a signed value.
inline Value mg_value(Score s) {
    const unsigned raw = unsigned(s);
    union { uint16_t bits; int16_t as_signed; } lower = { uint16_t(raw) };
    return Value(lower.as_signed);
}
I'm trying to upgrade mg and eg from int16_t to int32_t but I can't figure out how to do it, I always have trouble when ScoreA + ScoreB ruin the eg and mg inside the Score.
Here is what I tried and failed:
// 64-bit variant attempted in the question.
// NOTE(review): the (int) cast narrows the shifted 64-bit value back to int
// before mg is added, so the eg half placed in the upper 32 bits is not kept.
enum Score : int64_t { SCORE_ZERO };
constexpr Score make_score(int mg, int eg) {
return Score((int)((uint64_t)eg << 32) + mg);
}
// Intended to return the upper 32 bits of s as a signed value.
// NOTE(review): unsigned(...) converts the biased score to unsigned int before
// the shift; where unsigned int is 32 bits wide, shifting it by 32 is undefined.
inline Value eg_value(Score s) {
union { uint32_t u; int32_t s; } eg = { uint32_t(unsigned(s + 0x80000000) >> 32) };
return Value(eg.s);
}
// Returns the low 32 bits of s: the union is initialised through its unsigned
// member and read back through its signed member.
inline Value mg_value(Score s) {
union { uint32_t u; int32_t s; } mg = { uint32_t(unsigned(s)) };
return Value(mg.s);
}
Use memcpy.
As the comment in the original solution pointed out, this kind of bit manipulations are a minefield of potential undefined behavior. memcpy allows you to get rid of those and is well understood by modern compilers, so it will still result in efficient machine code.
enum Score : int64_t { SCORE_ZERO };
enum Value : int32_t { FORTYTWO };

// Packs two 32-bit values into one Score by copying their bytes:
// eg occupies the first four bytes of the result, mg the last four.
inline Score make_score(int32_t mg, int32_t eg) {
    int64_t packed;
    char* const bytes = reinterpret_cast<char*>(&packed);
    std::memcpy(bytes, &eg, sizeof(int32_t));
    std::memcpy(bytes + sizeof(int32_t), &mg, sizeof(int32_t));
    return static_cast<Score>(packed);
}

// Reads back the value stored in the first four bytes of s.
inline Value eg_value(Score s) {
    int32_t first_half;
    std::memcpy(&first_half, reinterpret_cast<const char*>(&s), sizeof(first_half));
    return static_cast<Value>(first_half);
}

// Reads back the value stored in the last four bytes of s.
inline Value mg_value(Score s) {
    int32_t second_half;
    std::memcpy(&second_half, reinterpret_cast<const char*>(&s) + sizeof(int32_t),
                sizeof(second_half));
    return static_cast<Value>(second_half);
}
Try it on godbolt.
The problem is that you still have some "int" and "unsigned" keywords left that still convert into the 32 bit version. So replace each "int" with "int64_t" and each "unsigned" with "uint64_t" and it should work as expected.
This can be different approach for this question
#include<iostream>
#include<cstdint>
#include<bitset>
using namespace std;
// Packs two 32-bit patterns into one 64-bit bitset (element 0 in bits 0..31,
// element 1 in bits 32..63), clears the 32-bit containers, unpacks the 64-bit
// value into them again and prints both recovered integers.
int main()
{
    bitset<32> bit32[2] ={ 45 ,-415152545 };
    bitset<64> bit64;

    // store in 64 bit variable
    for(int i=0;i<64;i++)
    {
        bit64[i] = bit32[i / 32][i % 32];
    }

    // reset both 32 bit containers (the array has only indices 0 and 1,
    // so assigning to bit32[2] would write past its end)
    bit32[0].reset();
    bit32[1].reset();

    // fetching data in 32 bit container from 64 bit and assign it into a and b .
    for(int i=0;i<64;i++)
    {
        bit32[i / 32][i % 32] = bit64[i];
    }
    const int32_t a = static_cast<int32_t>(bit32[0].to_ulong());
    const int32_t b = static_cast<int32_t>(bit32[1].to_ulong());
    cout<<a<<" "<<b;
}
You could use union as well:
#include <stdint.h>
#include <iostream>
// Two 32-bit components that share storage with one 64-bit value.
// Arithmetic is done per component, so a carry out of one half never
// spills into the other.
// NOTE(review): get() reads u64 even when the object was built through u32;
// that relies on union type punning being supported by the compiler.
union Score {
    int64_t u64;
    int32_t u32[2];

    Score() : u64(0) {}
    Score(int64_t v) : u64(v) {}
    Score(int32_t a, int32_t b): u32{a, b} {}

    // Member-wise copy of the storage, independent of which member is active.
    Score & operator=(Score const & original) = default;

    // Accessors are const so they can be used on const objects and temporaries.
    int32_t get_a() const {return u32[0];}
    int32_t get_b() const {return u32[1];}
    int64_t get() const {return u64;}

    // Component-wise sum; neither operand is modified.
    Score operator+(Score const & other) const {
        return Score(u32[0] + other.u32[0], u32[1] + other.u32[1]);
    }

    // Component-wise in-place sum.
    Score & operator+=(Score const & other) {
        u32[0] += other.u32[0];
        u32[1] += other.u32[1];
        return *this;
    }
};
int main() {
Score v(-15, 15);
std::cout << "The size is: " << sizeof(Score) << " Bytes" << std::endl;
std::cout << "A is: " << v.get_a() << std::endl;
std::cout << "B is: " << v.get_b() << std::endl;
std::cout << "adding -5, +5" << std::endl;
Score v1 = v + Score(-5, 5);
std::cout << "A is: " << v1.get_a() << std::endl;
std::cout << "B is: " << v1.get_b() << std::endl;
std::cout << "adding -10, +10" << std::endl;
v += Score(-10, 10);
std::cout << "A is: " << v.get_a() << std::endl;
std::cout << "B is: " << v.get_b() << std::endl;
return 0;
}
Output:
The size is: 8 Bytes
A is: -15
B is: 15
adding -5, +5
A is: -20
B is: 20
adding -10, +10
A is: -25
B is: 25
It's easy.
int64_t value;
int32_t* value1 = (int32_t*)&value;
int32_t* value2 = (int32_t*)&value + 1;
Example:
#include <cstdint>
// Demonstrates addressing one int64_t as two int32_t halves through pointer casts.
// NOTE(review): writing to an int64_t object through an int32_t* is not covered by
// the C++ aliasing rules, and which half comes first depends on byte order.
// Verify on the target compiler before relying on this.
int main() {
int64_t value;
int32_t* value1 = (int32_t*)&value; // first four bytes of value
int32_t* value2 = (int32_t*)&value + 1; // last four bytes of value
*value1 = 10; // Example
*value2 = 20;
return 0;
}

C++ divide decimals error

I am trying to extract 20 decimals from a variable, but there should be an error with the divide operation, as this program gives me a wrong result:
#include <iostream>
#include <cmath>
using namespace std;
// Returns the first digit after the decimal point of input: the value is
// scaled by ten, cut to an integer, and the last decimal digit is kept.
int fracpart(long double input)
{
    const long long scaled = input * 10;
    const long long last_digit = scaled % 10;
    return last_digit;
}
// Prints twenty successive decimal digits of n/m by repeatedly taking the
// first fractional digit and multiplying the running value by ten.
int main()
{
    const int n = 9;
    const int m = 450;
    long double S = (long double)n / m;
    int remaining = 20;
    while (remaining-- > 0) {
        cout << fracpart(S) << " ";
        S *= 10;
    }
    return 0;
}
What I get:
0 1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
What I should get:
0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
You can check the base used by the representation of floating-point types by inspecting the value of the macro constant FLT_RADIX (defined in header <cfloat>). As you probably already know, the binary system is used internally by the majority of modern computers, rather than the decimal one.
Now consider a rational number like 1/3. It can't be represented by a finite number of digits in base 10, you'll end up with some approximation, like 0.3333333 and an acceptable error. Note that the same number can be represented in a base 3 system with a finite number of digits (0.1).
The number you are trying to print, 9/450, has a "nice" base 10 representation, 0.02, but it can't be represented in base 2 with absolute precision, even if the division could be performed without adding any error. Don't be misled by that '2': consider that 0.02 = 2/100 = 1/50 = 1/(2 * 5^2), where 1/5 can only be approximated in base 2.
Anyways, there are methods to achieve what you want, for example using the output manipulators std::setprecision and std::fixed (defined in header <iomanip>) or even writing a (really ugly) custom function. Take a look at the output of this program:
#include <iostream>
#include <cmath>
#include <iomanip>
#include <vector>
#include <cstdint>
// splits a number into its integral and fractional (a vector of digits) parts
std::vector<uint8_t> to_digits (
long double x, uint8_t precision, long double &integral
);
// Reconstructs the approximated number
long double from_digits (
long double integral_part, std::vector<uint8_t> &fractional_part
);
// Prints several decimal renderings of 9/450 stored in a long double: the raw
// value at 70 significant digits, the literal 0.02L, the next representable
// value, a rounded 20-digit rendering, and the digits produced by to_digits()
// together with the value rebuilt from them by from_digits().
int main()
{
using std::cout;
int n = 9, m = 450;
long double S;
S = static_cast<long double>(n)/m;
cout << "\nBase 10 representation of calculated value:\n"
<< std::setprecision(70) << S << '\n';
// This ^^^^^^^^^^^^^^^^^^^^^ will change only how the value is
// printed, not its internal binary representation
cout << "\nBase 10 representation of literal:\n"
<< 0.02L << '\n';
// This ^^^^^ will print the exact same digits
// the next greater representable value is a worse approximation
cout << "\nNext representable value:\n"
<< std::nextafter(S, 1.0) << '\n';
// but you can try to obtain a "better" output
cout << "\nRounded representation printed using <iomanip> functions:\n"
<< std::setprecision(20) << std::fixed << S << '\n';
cout << "\nRounded fractional part printed using custom function:\n";
long double integral_part;
auto dd = to_digits(S, 20, integral_part);
for (auto const d : dd)
{
// cast so the digit is printed as a number rather than as a character
cout << static_cast<int>(d);
}
cout << '\n';
// Reversing the process...
cout << "\nApproximated value (using custom function):\n";
auto X = from_digits(integral_part, dd);
cout << std::setprecision(70) << std::fixed << X << '\n';
cout << std::setprecision(20) << std::fixed << X << '\n';
}
// Splits x into its integral part (written to `integral`) and the first
// `precision` decimal digits of its fractional part (returned). The last
// digit is rounded using whatever remains of the fraction; a carry that runs
// off the front of the digit list is added to `integral`.
std::vector<uint8_t> to_digits (
    long double x, uint8_t precision, long double &integral
)
{
    std::vector<uint8_t> digits;
    long double rest = std::modf(x, &integral);

    // Peel off one decimal digit per iteration.
    for ( uint8_t produced = 0; produced < precision; ++produced )
    {
        long double whole;
        rest = std::modf(rest * 10, &whole);
        digits.push_back(static_cast<uint8_t>(whole));
    }

    const bool round_up = !digits.empty() && std::round(rest) == 1.0L;
    if ( !round_up )
    {
        return digits;
    }

    // Propagate the carry from the last digit towards the first.
    for ( std::size_t pos = digits.size(); pos-- > 0; )
    {
        if ( digits[pos] < 9 )
        {
            ++digits[pos];
            break;
        }
        digits[pos] = 0;
        if ( pos == 0 )
        {
            integral += 1.0L;
        }
    }
    return digits;
}
// Rebuilds a value from an integral part and a list of decimal fraction
// digits: digit k (counting from 1) contributes digit / 10^k.
long double from_digits (
    long double integral_part, std::vector<uint8_t> &fractional_part
)
{
    long double scale = 1.0L;
    long double total = integral_part;
    for ( std::size_t k = 0; k < fractional_part.size(); ++k )
    {
        scale *= 10.0L;
        total += fractional_part[k] / scale;
    }
    return total;
}
I thought it was happening "because binary division is not perfectly convertible to decimal numbers", but Bob__ was right! The problem happens because the long long variable is kind of "problematic". So I just changed the code and used the round and fmod functions. This time I have tested the code, so I hope that it satisfies your needs.
PS1: Extract the function was really necessary.
PS2: Don't forget to include math.h library.
PS3: And, sorry for the delay in answer.
#include <iostream>
#include <cmath>
#include <math.h>
using namespace std;
// Prints twenty successive decimal digits of n/m; each digit is obtained by
// rounding the leading digit of the running value scaled by ten.
int main()
{
    const int n = 9;
    const int m = 450;
    long double S = (long double)n / m;
    int remaining = 20;
    while (remaining-- > 0) {
        const auto leading = fmod(S * 10, 10);
        const auto digit = fmod(round(leading), 10);
        cout << digit << " ";
        S *= 10;
    }
    return 0;
}
Here is some examples: http://www.cplusplus.com/reference/cmath/trunc/

c++ decimal to binary

I am a beginner starting out in c++ and I am trying to turn a decimal byte into a binary number. However, there is something wrong with my syntax or logic but I cannot figure it out. When trying to debug, I believe the error is around userValue5 but I'm not sure why. Any tips are appreciated and I am using VS2015.
#include "stdafx.h"
#include <iostream>
#include <stdint.h>
#include <cmath>
//finds where each column is a 0 or a 1
// Returns 1 when y is at least 2^power (i.e. the column for 2^power, or a
// higher one, is occupied), and 0 otherwise.
// The comparison is inclusive: y == 2^power must yield 1, which the previous
// floating-point test `y / pow(2, power) > 1` missed.
int binaryDigit(uint16_t y, uint16_t power)
{
    // A 16-bit value has no bits at or above position 16.
    if (power >= 16)
        return 0;
    return (y >> power) != 0 ? 1 : 0;
}
// Removes 2^power from y when the digit x for that column is 1;
// otherwise y is passed through unchanged.
int difference(uint16_t y, int x, uint16_t power)
{
    const bool column_set = (x == 1);
    if (!column_set)
    {
        return y;
    }
    return y - pow(2, power);
}
//takes a decimal byte and turns it into binary
// Reads a value in 0..255 and prints its eight binary digits, most
// significant first, as two groups of four.
int main()
{
    using namespace std;
    cout << "Please insert a number between 0 and 255 so that I can convert it to binary: ";
    uint16_t userValue(0);
    cin >> userValue;
    if (!cin || userValue > 255)
    {
        cout << "The value must be a whole number between 0 and 255." << endl;
        return 1;
    }

    // Walk the columns from 2^7 down to 2^0. Each digit is computed from what
    // is left of the value, and that same digit decides whether the column's
    // power of two is subtracted before moving on.
    int digits[8] = {};
    uint16_t remaining = userValue;
    for (int column = 0; column < 8; ++column)
    {
        const uint16_t power = static_cast<uint16_t>(7 - column);
        digits[column] = binaryDigit(remaining, power);
        remaining = static_cast<uint16_t>(difference(remaining, digits[column], power));
    }

    cout << "The number " << userValue << " in binary is ";
    cout << digits[0] << digits[1] << digits[2] << digits[3] << " "
         << digits[4] << digits[5] << digits[6] << digits[7] << endl;
    return 0;
}
bitset is your friend.
#include <bitset>
#include <climits>
#include <iostream>
using namespace std;
int main()
{
int userValue = 0;
cout << "Please insert a number between " << INT_MIN << " and " << INT_MAX << " so that I can convert it to binary: ";
cin >> userValue;
cout << bitset<32>(userValue) << endl;
return 0;
}
However, if you want to convert it to a string, you'll probably need stringstream:
stringstream ss;
ss << bitset<32>(userValue);
string str = ss.str();
I figured it out. When I copied and pasted the same code to save time, I forgot to edit the arguments so that's why it wasn't working properly. I fixed it and it works great. Thank you to everyone who commented and posted things like the table of exponents and the bit masking. I've learned a lot and cannot wait to write more programs.
bitset is the way to go, but in general, maybe this is more helpful...
// Returns the bit pattern of x as a string of '0'/'1' characters, most
// significant bit first, one character per bit of int.
std::string intToBinaryString(int x)
{
    const std::size_t bitCount = sizeof(int) * 8;
    // the characters are appended straight to the result string
    std::string bits;
    bits.reserve(bitCount);
    // work on the unsigned bit pattern so no shift ever touches a sign bit
    const unsigned int pattern = static_cast<unsigned int>(x);
    // create an integer that only has a 1 bit in its top position.
    unsigned int mask = 1u << (bitCount - 1);
    // for each bit in x, starting at the top, check to see if its non zero...
    for(std::size_t i = 0; i < bitCount; ++i)
    {
        // see if x's (top - i)'th bit is non zero
        const bool nonZeroBit = (pattern & mask) != 0;
        // append '1' for the (top - i)'th bit if non zero and '0' otherwise
        bits.push_back(nonZeroBit ? '1' : '0');
        // shift the mask down by 1.
        mask >>= 1;
    }
    // What we get on the first iteration is
    //
    // mask = 10000...0000
    // & x = 10101...0010
    // -----------------------------
    // nonZero = 10000...0000 = true
    //
    // the second iteration is
    //
    // mask = 01000...0000
    // & x = 10101...0010
    // -----------------------------
    // nonZero = 00000...0000 = false
    //
    // the third iteration is
    //
    // mask = 00100...0000
    // & x = 10101...0010
    // -----------------------------
    // nonZero = 00100...0000 = true
    //
    // ...
    //
    // each result is appended to bits as it is found, so the string is
    // already complete when the loop ends
    return bits;
}

Creating a NaN float value from the sign, exponent and mantissa parts in C++

I need to create a floating point variable in C++ that has a NaN value. I also need to be able to see which NaN has a larger value. To compare the NaNs you will need to look at the mantissa part of the float.
Creating a NaN using the standard
nanf("abc");
method results in NaNs with the same mantissa, even when different strings are passed to the nanf function. Creating the NaN from the basic parts of the bit pattern should provide different mantissas, so that simple sorting can be performed on the size of the mantissa.
Take a look at frexp() family of functions as well as ldexp(), which is kind of opposite to frexp()
Link: http://www.cplusplus.com/reference/cmath/ldexp/
Here's an example of type-punning floating-point values with unions and integer bitfields.
#include <iostream>
// Overlays a float with bit-fields for its three IEEE-754 single-precision
// parts, so the parts can be set or read individually.
// NOTE(review): the anonymous struct and the order in which bit-fields are
// allocated are compiler-specific; as declared, mantissa is expected to land
// in the low bits — confirm on the target compiler.
union floatPun {
struct {
unsigned int mantissa : 23; // fraction bits
unsigned int exponent : 8; // biased exponent; main() sets it to 0xff to build NaN/infinity
unsigned int sign : 1; // sign bit
};
float value; // the same storage viewed as a float
};
// Overlays a double with bit-fields for its three IEEE-754 double-precision
// parts (52 + 11 + 1 = 64 bits), so the parts can be set or read individually.
// NOTE(review): the anonymous struct and the order in which bit-fields are
// allocated are compiler-specific — confirm on the target compiler.
union doublePun {
    struct {
        unsigned long long mantissa : 52;  // fraction bits
        unsigned long long exponent : 11;  // biased exponent
        unsigned long long sign : 1;       // sign bit
    };
    // Must be a double: a float would overlay only 32 of the 64 bits described above.
    double value;
};
// Three-way comparison of the mantissa members of a and b:
// 1 when a's is larger, -1 when b's is larger, 0 when they are equal.
template <typename PunT>
static int compare_mantissas(const PunT& a, const PunT& b) {
    if (a.mantissa > b.mantissa) {
        return 1;
    }
    if (b.mantissa > a.mantissa) {
        return -1;
    }
    return 0;
}
int main() {
floatPun fa = {0}, fb = {0};
// make NaNs
fa.exponent = fb.exponent = 0xff;
fa.mantissa = 1;
fb.mantissa = 2;
std::cout << "fa: " << fa.value << " fb: " << fb.value << "\n";
// compare mantissas
static const char* const cmp_labels[] = {"less than", "equal to", "greater than"};
std::cout << "mantissa of fa is "
<< cmp_labels[1 + compare_mantissas(fa, fb)]
<< " mantissa of fb\n";
// change fa to +infinity
fa.mantissa = 0;
std::cout << "changed fa's mantissa to zero: " << fa.value << "\n";
}

Is there a trunc function in C++?

I searched around and couldn't find the trunc function for C++. I know I can do this:
// Prints a with everything beyond two decimals cut off: the value is scaled
// by 100, converted to int, and scaled back.
int main()
{
    const double a = 12.566789;
    const double truncated = (int)(a * 100) / 100.0;
    cout << setprecision(2) << fixed << truncated << endl;
    return 0;
}
but I'm not sure it's the best way to do this. Thank you.
If your C library is so old that it lacks a trunc function (specified in C99), you can easily implement one based on floor and ceil (specified in C89)
// Rounds d toward zero: positive values are floored, everything else goes through ceil.
double trunc(double d)
{
    if (d > 0)
    {
        return floor(d);
    }
    return ceil(d);
}
trunc is there, in <cmath>:
#include <iostream>
#include <cmath>
// Prints the result of trunc for a sample value.
int main() {
    const auto truncated = trunc(3.141516);
    std::cout << truncated << std::endl;
}
I suppose you're looking for something else?
There's a trunc function in C that you can use in C++
trunc(a*100)/100
Keep in mind that you still have to specify formatting requests, because floating point can't represent all real numbers exactly, and you could get output like 12.5600000001 or 12.55999999 if you don't tell the output code the precision you want.
TL;DR
Use the following for output:
cout << setprecision(2) << fixed << a<< endl;
And the following if you need a truncated result somewhere during a mathematical calculation:
trunc(a*100)/100
(Or better yet, use fixed-point math.)
Sure. Use the trunc() function from math.h. It's a C function, but it works as well in C++ as it does in C. If you want to keep a couple digits, you can always:
double a = 12.566789;
double b = trunc(a * 100) / 100.0;
If you're using an ancient C or C++ library that doesn't implement trunc, use boost::math::trunc.
I've developed a very fast trunc-function:
// Truncates d toward zero by clearing the fraction bits of its IEEE-754
// binary64 encoding directly.
// The implementation is chosen at compile time from sizeof(size_t): one path
// works on the whole encoding as a single 64-bit word, the other on two
// 32-bit halves.
// Returns: d unchanged when it is already integral, a signed zero when
// |d| < 1, d + d for infinities and NaNs, and otherwise d with its fraction
// bits cleared.
// NOTE(review): reading dx / dxLo / dxHi after writing du relies on union type
// punning, and the 32-bit path assumes the low half of the double is stored
// first — confirm both for the target compiler and platform.
double ftrunc( double d )
{
static_assert(sizeof(double) == sizeof(uint64_t), "sizeof(double) not equal to sizeof(uint64_t)");
static_assert(numeric_limits<double>::is_iec559, "double must be IEEE-754");
// assume size_t is our CPU's native register-width
static_assert(sizeof(size_t) == sizeof(uint64_t) || sizeof(size_t) == sizeof(uint32_t), "register-width must be 32 or 64 bit");
if constexpr( sizeof(size_t) == sizeof(uint64_t) )
// we have 64 bit registers
{
// All *_EXP constants hold a biased exponent already shifted into the
// exponent field, so they can be compared against the masked field directly.
unsigned const MANTISSA_BITS = 52,
EXP_BIAS = 0x3FF,
INF_NAN_BASE = 0x7FF;
uint64_t const EXP_MASK = (uint64_t)0x7FF << MANTISSA_BITS,
SIGN_MASK = (uint64_t)0x800 << MANTISSA_BITS ,
MIN_INTEGRAL_DIGITS_EXP = (uint64_t) EXP_BIAS << MANTISSA_BITS,
MIN_INTEGRAL_ONLY_EXP = (uint64_t)(EXP_BIAS + MANTISSA_BITS) << MANTISSA_BITS,
INF_NAN_EXP = (uint64_t)INF_NAN_BASE << MANTISSA_BITS,
NEG_MANTISSA_MASK = 0x000FFFFFFFFFFFFFu;
// du and dx share storage: du is the value, dx its bit pattern
union
{
double du;
uint64_t dx;
};
du = d;
uint64_t exp = dx & EXP_MASK;
if( exp >= MIN_INTEGRAL_DIGITS_EXP )
// value has integral digits
if( exp < MIN_INTEGRAL_ONLY_EXP )
{
// there are fraction-digits to mask out, mask them
// shift is the unbiased exponent: that many leading mantissa bits are integral and are kept
unsigned shift = (unsigned)(exp >> MANTISSA_BITS) - EXP_BIAS;
dx &= ~(NEG_MANTISSA_MASK >> shift);
return du;
}
else
if( exp < INF_NAN_EXP )
// value is integral
return du;
else
// infinite, NaN, SNaN
// raise exception on SNaN if necessary
return du + du;
else
{
// below +/-1.0
// return +/-0.0
dx &= SIGN_MASK;
return du;
}
}
else if constexpr( sizeof(size_t) == sizeof(uint32_t) )
// we have 32 bit registers
{
// Same scheme as above, with the exponent constants positioned for the upper
// 32-bit half, which holds the sign, the exponent and the top 20 mantissa bits.
unsigned const MANTISSA_BITS = 52,
HI_MANTISSA_BITS = 20,
EXP_BIAS = 0x3FF,
INF_NAN_BASE = 0x7FF;
uint32_t const EXP_MASK = (uint32_t)0x7FFu << HI_MANTISSA_BITS,
SIGN_MASK = (uint32_t)0x800u << HI_MANTISSA_BITS,
MIN_INTEGRAL_DIGITS_EXP = (uint32_t) EXP_BIAS << HI_MANTISSA_BITS,
MAX_INTEGRAL32_EXP = (uint32_t)(EXP_BIAS + HI_MANTISSA_BITS) << HI_MANTISSA_BITS,
MIN_INTEGRAL_ONLY_EXP = (uint32_t)(EXP_BIAS + MANTISSA_BITS) << HI_MANTISSA_BITS,
INF_NAN_EXP = (uint32_t)INF_NAN_BASE << HI_MANTISSA_BITS,
NEG_HI_MANTISSA_MASK = 0x000FFFFFu,
NEG_LO_MANTISSA_MASK = 0xFFFFFFFFu;
// du shares storage with the two 32-bit halves dxLo and dxHi
union
{
double du;
struct
{
uint32_t dxLo;
uint32_t dxHi;
};
};
du = d;
uint32_t exp = dxHi & EXP_MASK;
if( exp >= MIN_INTEGRAL_DIGITS_EXP )
// value has integral digits
if( exp < MIN_INTEGRAL_ONLY_EXP )
// there are fraction-digits to mask out
if( exp <= MAX_INTEGRAL32_EXP )
{
// the fraction digits are in the upper dword, mask them and zero the lower dword
unsigned shift = (unsigned)(exp >> HI_MANTISSA_BITS) - EXP_BIAS;
dxHi &= ~(NEG_HI_MANTISSA_MASK >> shift);
dxLo = 0;
return du;
}
else
{
// the fraction digits are in the lower dword, mask them
// shift counts the integral bits that extend into the lower dword
unsigned shift = (unsigned)(exp >> HI_MANTISSA_BITS) - EXP_BIAS - HI_MANTISSA_BITS;
dxLo &= ~(NEG_LO_MANTISSA_MASK >> shift);
return du;
}
else
if( exp < INF_NAN_EXP )
// value is integral
return du;
else
// infinite, NaN, SNaN
// raise exception on SNaN if necessary
return du + du;
else
{
// below +/-1.0
// return +/-0.0
dxHi &= SIGN_MASK;
dxLo = 0;
return du;
}
}
}
It's faster than most implementations. On my Ryzen 7 1800X the average execution time for values >= 2^0 and <= 2^54 is 12 clock cycles.
use ceil or floor from cmath