32-bit to 16-bit Floating Point Conversion - c++

I need a cross-platform library/algorithm that will convert between 32-bit and 16-bit floating point numbers. I don't need to perform math with the 16-bit numbers; I just need to decrease the size of the 32-bit floats so they can be sent over the network. I am working in C++.
I understand how much precision I would be losing, but that's OK for my application.
The IEEE 16-bit format would be great.

Complete conversion from single precision to half precision. This is a direct copy from my SSE version, so it's branch-less. It makes use of the fact that -true == ~0 to perform branchless selections (GCC converts if statements into an unholy mess of conditional jumps, while Clang just converts them to conditional moves).
Update (2019-11-04): reworked to support single and double precision values with fully correct rounding. I also put a corresponding if statement above each branchless select as a comment for clarity. All incoming NaNs are converted to the base quiet NaN for speed and sanity, as there is no way to reliably convert an embedded NaN message between formats.
#include <cstdint> // uint32_t, uint64_t, etc.
#include <cstring> // memcpy
#include <climits> // CHAR_BIT
#include <limits> // numeric_limits
#include <type_traits> // is_integral_v, is_floating_point_v
#include <utility> // forward
namespace std
{
template< typename T , typename U >
T bit_cast( U&& u ) {
static_assert( sizeof( T ) == sizeof( U ) );
union { T t; }; // prevent construction
std::memcpy( &t, &u, sizeof( t ) );
return t;
}
} // namespace std
template< typename T > struct native_float_bits;
template<> struct native_float_bits< float >{ using type = std::uint32_t; };
template<> struct native_float_bits< double >{ using type = std::uint64_t; };
template< typename T > using native_float_bits_t = typename native_float_bits< T >::type;
static_assert( sizeof( float ) == sizeof( native_float_bits_t< float > ) );
static_assert( sizeof( double ) == sizeof( native_float_bits_t< double > ) );
template< typename T, int SIG_BITS, int EXP_BITS >
struct raw_float_type_info {
using raw_type = T;
static constexpr int sig_bits = SIG_BITS;
static constexpr int exp_bits = EXP_BITS;
static constexpr int bits = sig_bits + exp_bits + 1;
static_assert( std::is_integral_v< raw_type > );
static_assert( sig_bits >= 0 );
static_assert( exp_bits >= 0 );
static_assert( bits <= sizeof( raw_type ) * CHAR_BIT );
static constexpr int exp_max = ( 1 << exp_bits ) - 1;
static constexpr int exp_bias = exp_max >> 1;
static constexpr raw_type sign = raw_type( 1 ) << ( bits - 1 );
static constexpr raw_type inf = raw_type( exp_max ) << sig_bits;
static constexpr raw_type qnan = inf | ( inf >> 1 );
static constexpr auto abs( raw_type v ) { return raw_type( v & ( sign - 1 ) ); }
static constexpr bool is_nan( raw_type v ) { return abs( v ) > inf; }
static constexpr bool is_inf( raw_type v ) { return abs( v ) == inf; }
static constexpr bool is_zero( raw_type v ) { return abs( v ) == 0; }
};
using raw_flt16_type_info = raw_float_type_info< std::uint16_t, 10, 5 >;
using raw_flt32_type_info = raw_float_type_info< std::uint32_t, 23, 8 >;
using raw_flt64_type_info = raw_float_type_info< std::uint64_t, 52, 11 >;
//using raw_flt128_type_info = raw_float_type_info< uint128_t, 112, 15 >;
template< typename T, int SIG_BITS = std::numeric_limits< T >::digits - 1,
int EXP_BITS = sizeof( T ) * CHAR_BIT - SIG_BITS - 1 >
struct float_type_info
: raw_float_type_info< native_float_bits_t< T >, SIG_BITS, EXP_BITS > {
using flt_type = T;
static_assert( std::is_floating_point_v< flt_type > );
};
template< typename E >
struct raw_float_encoder
{
using enc = E;
using enc_type = typename enc::raw_type;
template< bool DO_ROUNDING, typename F >
static auto encode( F value )
{
using flt = float_type_info< F >;
using raw_type = typename flt::raw_type;
static constexpr auto sig_diff = flt::sig_bits - enc::sig_bits;
static constexpr auto bit_diff = flt::bits - enc::bits;
static constexpr auto do_rounding = DO_ROUNDING && sig_diff > 0;
static constexpr auto bias_mul = raw_type( enc::exp_bias ) << flt::sig_bits;
if constexpr( !do_rounding ) { // fix exp bias
// when not rounding, fix exp first to avoid mixing float and binary ops
value *= std::bit_cast< F >( bias_mul );
}
auto bits = std::bit_cast< raw_type >( value );
auto sign = bits & flt::sign; // save sign
bits ^= sign; // clear sign
auto is_nan = flt::inf < bits; // compare before rounding!!
if constexpr( do_rounding ) {
static constexpr auto min_norm = raw_type( flt::exp_bias - enc::exp_bias + 1 ) << flt::sig_bits;
static constexpr auto sub_rnd = enc::exp_bias < sig_diff
? raw_type( 1 ) << ( flt::sig_bits - 1 + enc::exp_bias - sig_diff )
: raw_type( enc::exp_bias - sig_diff ) << flt::sig_bits;
static constexpr auto sub_mul = raw_type( flt::exp_bias + sig_diff ) << flt::sig_bits;
bool is_sub = bits < min_norm;
auto norm = std::bit_cast< F >( bits );
auto subn = norm;
subn *= std::bit_cast< F >( sub_rnd ); // round subnormals
subn *= std::bit_cast< F >( sub_mul ); // correct subnormal exp
norm *= std::bit_cast< F >( bias_mul ); // fix exp bias
bits = std::bit_cast< raw_type >( norm );
bits += ( bits >> sig_diff ) & 1; // add tie breaking bias
bits += ( raw_type( 1 ) << ( sig_diff - 1 ) ) - 1; // round up to half
//if( is_sub ) bits = std::bit_cast< raw_type >( subn );
bits ^= -is_sub & ( std::bit_cast< raw_type >( subn ) ^ bits );
}
bits >>= sig_diff; // truncate
//if( enc::inf < bits ) bits = enc::inf; // fix overflow
bits ^= -( enc::inf < bits ) & ( enc::inf ^ bits );
//if( is_nan ) bits = enc::qnan;
bits ^= -is_nan & ( enc::qnan ^ bits );
bits |= sign >> bit_diff; // restore sign
return enc_type( bits );
}
template< typename F >
static F decode( enc_type value )
{
using flt = float_type_info< F >;
using raw_type = typename flt::raw_type;
static constexpr auto sig_diff = flt::sig_bits - enc::sig_bits;
static constexpr auto bit_diff = flt::bits - enc::bits;
static constexpr auto bias_mul = raw_type( 2 * flt::exp_bias - enc::exp_bias ) << flt::sig_bits;
raw_type bits = value;
auto sign = bits & enc::sign; // save sign
bits ^= sign; // clear sign
auto is_norm = bits < enc::inf;
bits = ( sign << bit_diff ) | ( bits << sig_diff );
auto val = std::bit_cast< F >( bits ) * std::bit_cast< F >( bias_mul );
bits = std::bit_cast< raw_type >( val );
//if( !is_norm ) bits |= flt::inf;
bits |= -!is_norm & flt::inf;
return std::bit_cast< F >( bits );
}
};
using flt16_encoder = raw_float_encoder< raw_flt16_type_info >;
template< typename F >
auto quick_encode_flt16( F && value )
{ return flt16_encoder::encode< false >( std::forward< F >( value ) ); }
template< typename F >
auto encode_flt16( F && value )
{ return flt16_encoder::encode< true >( std::forward< F >( value ) ); }
template< typename F = float, typename X >
auto decode_flt16( X && value )
{ return flt16_encoder::decode< F >( std::forward< X >( value ) ); }
Of course full IEEE support isn't always needed. If your values don't require logarithmic resolution approaching zero, then linearizing them to a fixed point format is much faster, as was already mentioned.

Half to float:
uint32_t f_bits = ((uint32_t)(h&0x8000)<<16) | (((h&0x7c00)+0x1C000)<<13) | ((h&0x03FF)<<13);
float f; memcpy(&f, &f_bits, sizeof f); // reinterpret the bits; don't convert the integer value
Float to half:
uint32_t x; memcpy(&x, &f, sizeof x); // reinterpret the float's bits (avoids the aliasing cast)
uint16_t h = ((x>>16)&0x8000)|((((x&0x7f800000)-0x38000000)>>13)&0x7c00)|((x>>13)&0x03ff);

std::frexp extracts the significand and exponent from normal floats or doubles -- then you need to decide what to do with exponents that are too large to fit in a half-precision float (saturate...?), adjust accordingly, and put the half-precision number together. This article has C source code to show you how to perform the conversion.
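For illustration, here is a minimal sketch of that frexp-based approach (my own sketch, not the linked article's code): it truncates rather than rounds, saturates overflow to infinity, and drops subnormals and NaNs.
#include <cmath>
#include <cstdint>

std::uint16_t float_to_half_frexp(float f)
{
    std::uint16_t sign = std::signbit(f) ? 0x8000 : 0;
    f = std::fabs(f);
    if (f == 0.0f) return sign;
    int e;
    float m = std::frexp(f, &e);              // f == m * 2^e, m in [0.5, 1)
    int half_exp = e - 1 + 15;                // rebias: value is 1.xxx * 2^(e-1)
    if (half_exp >= 31) return sign | 0x7C00; // too large -> infinity (or clamp to 0x7BFF)
    if (half_exp <= 0)  return sign;          // too small -> zero (subnormals dropped)
    std::uint16_t mant = static_cast<std::uint16_t>((m * 2.0f - 1.0f) * 1024.0f); // 10-bit mantissa, truncated
    return sign | static_cast<std::uint16_t>(half_exp << 10) | mant;
}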

Given your needs (-1000, 1000), perhaps it would be better to use a fixed-point representation.
//change 20000 to SHRT_MAX if you don't mind whole numbers
//being turned into fractional ones
#include <climits> // SHRT_MAX
#include <cmath>   // round
const int compact_range = 20000;
short compactFloat(double input) {
return static_cast<short>(round(input * compact_range / 1000));
}
double expandToFloat(short input) {
return ((double)input) * 1000 / compact_range;
}
This will give you accuracy to the nearest 0.05. If you change 20000 to SHRT_MAX you'll get a bit more accuracy, but some whole numbers will end up as decimals on the other end.
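For example, a quick round-trip check of the helpers above (assuming they are defined in the same file):
#include <iostream>

short compactFloat(double input);    // defined above
double expandToFloat(short input);   // defined above

int main() {
    double v = 123.456;
    short wire = compactFloat(v);               // 16-bit value to send
    std::cout << expandToFloat(wire) << "\n";   // prints 123.45, within 0.05 of v
    return 0;
}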

Why so over-complicated? My implementation does not need any additional library, complies with the IEEE-754 FP16 format, manages both normalized and denormalized numbers, is branch-less, takes about 40-ish clock cycles for a back and forth conversion and ditches NaN or Inf for an extended range. That's the magical power of bit operations.
typedef unsigned short ushort;
typedef unsigned int uint;
uint as_uint(const float x) {
return *(uint*)&x;
}
float as_float(const uint x) {
return *(float*)&x;
}
float half_to_float(const ushort x) { // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
const uint e = (x&0x7C00)>>10; // exponent
const uint m = (x&0x03FF)<<13; // mantissa
const uint v = as_uint((float)m)>>23; // evil log2 bit hack to count leading zeros in denormalized format
return as_float((x&0x8000)<<16 | (e!=0)*((e+112)<<23|m) | ((e==0)&(m!=0))*((v-37)<<23|((m<<(150-v))&0x007FE000))); // sign : normalized : denormalized
}
ushort float_to_half(const float x) { // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
const uint b = as_uint(x)+0x00001000; // round-to-nearest-even: add last bit after truncated mantissa
const uint e = (b&0x7F800000)>>23; // exponent
const uint m = b&0x007FFFFF; // mantissa; in line below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding
return (b&0x80000000)>>16 | (e>112)*((((e-112)<<10)&0x7C00)|m>>13) | ((e<113)&(e>101))*((((0x007FF000+m)>>(125-e))+1)>>1) | (e>143)*0x7FFF; // sign : normalized : denormalized : saturate
}
Example for how to use it and to check that the conversion is correct:
#include <iostream>
using namespace std; // the example below uses cout/endl unqualified
void print_bits(const ushort x) {
for(int i=15; i>=0; i--) {
cout << ((x>>i)&1);
if(i==15||i==10) cout << " ";
if(i==10) cout << " ";
}
cout << endl;
}
void print_bits(const float x) {
uint b = *(uint*)&x;
for(int i=31; i>=0; i--) {
cout << ((b>>i)&1);
if(i==31||i==23) cout << " ";
if(i==23) cout << " ";
}
cout << endl;
}
int main() {
const float x = 1.0f;
const ushort x_compressed = float_to_half(x);
const float x_decompressed = half_to_float(x_compressed);
print_bits(x);
print_bits(x_compressed);
print_bits(x_decompressed);
return 0;
}
Output:
0 01111111 00000000000000000000000
0 01111 0000000000
0 01111111 00000000000000000000000
I have published an adapted version of this FP32<->FP16 conversion algorithm in this paper with detailed description on how the bit manipulation magic works. In this paper I also provide several ultra-fast conversion algorithms for different 16-bit Posit formats.

If you're sending a stream of information across, you could probably do better than this, especially if everything is in a consistent range, as your application seems to have.
Send a small header, that just consists of a float32 minimum and maximum, then you can send across your information as a 16 bit interpolation value between the two. As you also say that precision isn't much of an issue, you could even send 8bits at a time.
Your value would be something like, at reconstruction time:
float t = _t / numeric_limits<unsigned short>::max(); // With casting, naturally ;)
float val = h.min + t * (h.max - h.min);
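The matching encode step could be a sketch like this, where min and max are the hypothetical header fields described above:
#include <cstdint>
#include <limits>

// Quantize val into a 16-bit fraction of the [min, max] range from the header.
std::uint16_t encode(float val, float min, float max)
{
    float t = (val - min) / (max - min);   // 0..1 within the header range
    if (t < 0.0f) t = 0.0f;                // clamp values outside the range
    if (t > 1.0f) t = 1.0f;
    return static_cast<std::uint16_t>(t * std::numeric_limits<std::uint16_t>::max() + 0.5f);
}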
Hope that helps.
-Tom

This question is already a bit old, but for the sake of completeness, you might also take a look at this paper for half-to-float and float-to-half conversion.
They use a branchless table-driven approach with relatively small look-up tables. It is completely IEEE-conformant and even beats Phernost's IEEE-conformant branchless conversion routines in performance (at least on my machine). But of course his code is much better suited to SSE and is not that prone to memory latency effects.

This conversion for 16-to-32-bit floating point is quite fast for cases where you do not have to account for infinities or NaNs, and can accept denormals-as-zero (DAZ). I.e. it is suitable for performance-sensitive calculations, but you should beware of division by zero if you expect to encounter denormals.
Note that this is most suitable for x86 or other platforms that have conditional moves or "set if" equivalents.
Strip the sign bit off the input
Align the most significant bit of the mantissa to the 22nd bit
Adjust the exponent bias
Set bits to all-zero if the input exponent is zero
Re-insert sign bit
The reverse applies for single-to-half-precision, with some additions.
void float32(float* __restrict out, const uint16_t in) {
uint32_t t1;
uint32_t t2;
uint32_t t3;
t1 = in & 0x7fff; // Non-sign bits
t2 = in & 0x8000; // Sign bit
t3 = in & 0x7c00; // Exponent
t1 <<= 13; // Align mantissa on MSB
t2 <<= 16; // Shift sign bit into position
t1 += 0x38000000; // Adjust bias
t1 = (t3 == 0 ? 0 : t1); // Denormals-as-zero
t1 |= t2; // Re-insert sign bit
*((uint32_t*)out) = t1;
};
void float16(uint16_t* __restrict out, const float in) {
uint32_t inu = *((uint32_t*)&in);
uint32_t t1;
uint32_t t2;
uint32_t t3;
t1 = inu & 0x7fffffff; // Non-sign bits
t2 = inu & 0x80000000; // Sign bit
t3 = inu & 0x7f800000; // Exponent
t1 >>= 13; // Align mantissa on MSB
t2 >>= 16; // Shift sign bit into position
t1 -= 0x1c000; // Adjust bias
t1 = (t3 < 0x38800000) ? 0 : t1; // Flush-to-zero
t1 = (t3 > 0x47000000) ? 0x7bff : t1; // Clamp-to-max (float exponent field >= 143 would overflow half)
t1 = (t3 == 0 ? 0 : t1); // Denormals-as-zero
t1 |= t2; // Re-insert sign bit
*((uint16_t*)out) = t1;
};
Note that you can change the constant 0x7bff to 0x7c00 for it to overflow to infinity.
See GitHub for source code.

Most of the approaches described in the other answers here either do not round correctly on conversion from float to half, throw away subnormals which is a problem since 2**-14 becomes your smallest non-zero number, or do unfortunate things with Inf / NaN. Inf is also a problem because the largest finite number in half is a bit less than 2^16. OpenEXR was unnecessarily slow and complicated, last I looked at it. A fast correct approach will use the FPU to do the conversion, either as a direct instruction, or using the FPU rounding hardware to make the right thing happen. Any half to float conversion should be no slower than a 2^16 element lookup table.
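As an illustration of that last point, here is a minimal sketch of the 2^16-entry table; it assumes some correct scalar half_to_float() routine (for example one of the converters elsewhere in this thread) to fill the table once.
#include <cstdint>
#include <vector>

float half_to_float(unsigned short h);   // any correct scalar converter

struct HalfToFloatTable {
    std::vector<float> table;
    HalfToFloatTable() : table(1u << 16) {
        for (std::uint32_t h = 0; h < (1u << 16); ++h)
            table[h] = half_to_float(static_cast<unsigned short>(h));
    }
    float operator()(std::uint16_t h) const { return table[h]; } // one load per conversion
};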
The following are hard to beat:
On OS X / iOS, you can use vImageConvert_PlanarFtoPlanar16F and vImageConvert_Planar16FtoPlanarF. See Accelerate.framework.
Intel Ivy Bridge added instructions for this (the F16C extension). See f16cintrin.h.
Similar instructions were added to the ARM ISA for Neon. See vcvt_f32_f16 and vcvt_f16_f32 in arm_neon.h. On iOS you will need to use the arm64 or armv7s arch to get access to them.
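A minimal NEON sketch of that route, assuming a target with half-precision support enabled and an element count that is a multiple of 4 (the scalar tail is omitted):
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

void f32_to_f16(const float* src, std::uint16_t* dst, std::size_t n) {
    for (std::size_t i = 0; i < n; i += 4) {
        float16x4_t h = vcvt_f16_f32(vld1q_f32(src + i));   // 4 floats -> 4 halves
        vst1_u16(dst + i, vreinterpret_u16_f16(h));
    }
}

void f16_to_f32(const std::uint16_t* src, float* dst, std::size_t n) {
    for (std::size_t i = 0; i < n; i += 4) {
        float16x4_t h = vreinterpret_f16_u16(vld1_u16(src + i));
        vst1q_f32(dst + i, vcvt_f32_f16(h));                // 4 halves -> 4 floats
    }
}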

This code converts a 32-bit floating point number to 16-bits and back.
#include <x86intrin.h>
#include <iostream>
int main()
{
float f32;
unsigned short f16;
f32 = 3.14159265358979323846;
f16 = _cvtss_sh(f32, 0);
std::cout << f32 << std::endl;
f32 = _cvtsh_ss(f16);
std::cout << f32 << std::endl;
return 0;
}
I tested with the Intel icpc 16.0.2:
$ icpc a.cpp
g++ 7.3.0:
$ g++ -march=native a.cpp
and clang++ 6.0.0:
$ clang++ -march=native a.cpp
It prints:
$ ./a.out
3.14159
3.14062
Documentation about these intrinsics is available at:
https://software.intel.com/en-us/node/524287
https://clang.llvm.org/doxygen/f16cintrin_8h.html

The question is old and has already been answered, but I figured it would be worth mentioning an open source C++ library that can create 16bit IEEE compliant half precision floats and has a class that acts pretty much identically to the built in float type, but with 16 bits instead of 32. It is the "half" class of the OpenEXR library. The code is under a permissive BSD style license. I don't believe it has any dependencies outside of the standard library.

I had this same exact problem, and found this link very helpful. Just import the file "ieeehalfprecision.c" into your project and use it like this :
float myFloat = 1.24;
uint16_t resultInHalf;
singles2halfp(&resultInHalf, &myFloat, 1); // it accepts a series of floats, so use 1 to input 1 float
// an example to revert the half float back
float resultInSingle;
halfp2singles(&resultInSingle, &resultInHalf, 1);
I also changed some code (see the comment by the author, James Tursa, at the link):
#define INT16_TYPE int16_t
#define UINT16_TYPE uint16_t
#define INT32_TYPE int32_t
#define UINT32_TYPE uint32_t

I have found an implementation of conversion from half-float to single-float format and back that uses AVX/F16C intrinsics. It is much faster than a software implementation of these algorithms. I hope it will be useful.
32-bit float to 16-bit float conversion:
#include <immintrin.h>
#include <cassert> // assert
#include <cstdint> // uint16_t
#include <cstddef> // size_t
inline void Float32ToFloat16(const float * src, uint16_t * dst)
{
_mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(_mm256_loadu_ps(src), 0));
}
void Float32ToFloat16(const float * src, size_t size, uint16_t * dst)
{
assert(size >= 8);
size_t fullAlignedSize = size&~(32-1);
size_t partialAlignedSize = size&~(8-1);
size_t i = 0;
for (; i < fullAlignedSize; i += 32)
{
Float32ToFloat16(src + i + 0, dst + i + 0);
Float32ToFloat16(src + i + 8, dst + i + 8);
Float32ToFloat16(src + i + 16, dst + i + 16);
Float32ToFloat16(src + i + 24, dst + i + 24);
}
for (; i < partialAlignedSize; i += 8)
Float32ToFloat16(src + i, dst + i);
if(partialAlignedSize != size)
Float32ToFloat16(src + size - 8, dst + size - 8);
}
16-bit float to 32-bit float conversion:
#include <immintrin.h>
#include <cassert> // assert
#include <cstdint> // uint16_t
#include <cstddef> // size_t
inline void Float16ToFloat32(const uint16_t * src, float * dst)
{
_mm256_storeu_ps(dst, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src)));
}
void Float16ToFloat32(const uint16_t * src, size_t size, float * dst)
{
assert(size >= 8);
size_t fullAlignedSize = size&~(32-1);
size_t partialAlignedSize = size&~(8-1);
size_t i = 0;
for (; i < fullAlignedSize; i += 32)
{
Float16ToFloat32(src + i + 0, dst + i + 0);
Float16ToFloat32(src + i + 8, dst + i + 8);
Float16ToFloat32(src + i + 16, dst + i + 16);
Float16ToFloat32(src + i + 24, dst + i + 24);
}
for (; i < partialAlignedSize; i += 8)
Float16ToFloat32(src + i, dst + i);
if (partialAlignedSize != size)
Float16ToFloat32(src + size - 8, dst + size - 8);
}
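A short usage sketch for the bulk converters above (assuming a CPU and compiler flags with F16C support, e.g. -mf16c -mavx, and that the functions above are in the same file):
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Declarations of the bulk converters defined above.
void Float32ToFloat16(const float* src, size_t size, uint16_t* dst);
void Float16ToFloat32(const uint16_t* src, size_t size, float* dst);

int main()
{
    std::vector<float> src(64, 3.14159f), back(64);
    std::vector<uint16_t> half(64);
    Float32ToFloat16(src.data(), src.size(), half.data());
    Float16ToFloat32(half.data(), half.size(), back.data());
    std::cout << back[0] << "\n";   // ~3.1406, precision lost as expected
    return 0;
}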

Thanks for the code for decimal to single precision conversion.
We can try to adapt the same code to half precision; however, it is not possible with the GCC C compiler, so do the following:
sudo apt install clang
Then try the following code:
// A C code to convert Decimal value to IEEE 16-bit floating point Half precision
#include <stdio.h>
void printBinary(int n, int i)
{
int k;
for (k = i - 1; k >= 0; k--) {
if ((n >> k) & 1)
printf("1");
else
printf("0");
}
}
typedef union {
__fp16 f;
struct
{
unsigned int mantissa : 10;
unsigned int exponent : 5;
unsigned int sign : 1;
} raw;
} myfloat;
// Driver Code
int main()
{
myfloat var;
var.f = 11;
printf("%d | ", var.raw.sign);
printBinary(var.raw.exponent, 5);
printf(" | ");
printBinary(var.raw.mantissa, 10);
printf("\n");
return 0;
}
Compile the code in your terminal
clang code_name.c -o code_name
./code_name
Here, __fp16 is a 2-byte float data type supported by the Clang C compiler.

Related

How to quickly divide a big unsigned integer by a word?

I am currently developing a class to work with big unsigned integers. However, I only need partial functionality, namely:
bi_uint+=bi_uint - Already implemented. No complaints.
bi_uint*=std::uint_fast64_t - Already implemented. No complaints.
bi_uint/=std::uint_fast64_t - Implemented but works very slowly, also requires a type that is twice as wide as uint_fast64_t. In the test case, division was 35 times slower than multiplication
Next, I will give my implementation of division, which is based on a simple long division algorithm:
#include <climits>
#include <cstdint>
#include <limits>
#include <vector>
class bi_uint
{
public:
using u64_t = std::uint_fast64_t;
constexpr static std::size_t u64_bits = CHAR_BIT * sizeof(u64_t);
using u128_t = unsigned __int128;
static_assert(sizeof(u128_t) >= sizeof(u64_t) * 2);
//little-endian
std::vector<u64_t> data;
//User should guarantee data.size()>0 and val>0
void self_div(const u64_t val)
{
auto it = data.rbegin();
if(data.size() == 1) {
*it /= val;
return;
}
u128_t rem = 0;
if(*it < val) {
rem = *it++;
data.pop_back();
}
u128_t r = rem % val;
while(it != data.rend()) {
rem = (r << u64_bits) + *it;
const u128_t q = rem / val;
r = rem % val;
*it++ = static_cast<u64_t>(q);
}
}
};
You can see that the unsigned __int128 type was used; therefore, this option is not portable, is tied to a single compiler (GCC), and also requires an x64 platform.
After reading the page about division algorithms, I feel the appropriate algorithm would be "Newton-Raphson division". However, the "Newton–Raphson division" algorithm seems complicated to me. I guess there is a simpler algorithm for dividing the type "big_uint/uint" that would have almost the same performance.
Q: How to fast divide a bi_uint into a u64_t?
I have about 10^6 iterations, each iteration uses all the operations listed
If this is easily achievable, then I would like to have portability and not use unsigned __int128. Otherwise, I prefer to abandon portability in favor of an easier way.
EDIT1:
This is an academic project, I am not able to use third-party libraries.
Part 1 (See Part 2 below)
I managed to speed up your division code 5x on my old laptop (and even 7.5x on the GodBolt servers) using Barrett reduction, a technique that replaces a single division with several multiplications and additions. I implemented the whole code from scratch just today.
If you want, you can jump directly to the code at the end of my post without reading the long description, as the code is fully runnable without any extra knowledge or dependencies.
The code below is for Intel x64 only, because I used Intel-only instructions and only their 64-bit variants. It can certainly be rewritten for x32 and for other processors, because the Barrett algorithm is generic.
To explain the whole Barrett reduction in short pseudo-code, I'll write it in Python, as it is the simplest language for understandable pseudo-code:
# https://www.nayuki.io/page/barrett-reduction-algorithm
def BarrettR(n, s):
return (1 << s) // n
def BarrettDivMod(x, n, r, s):
q = (x * r) >> s
t = x - q * n
return (q, t) if t < n else (q + 1, t - n)
In the pseudo-code above, BarrettR() is computed only once per divisor (you use the same single-word divisor for the whole big-integer division). BarrettDivMod() is used each time you need a division or modulus operation: given input x and divisor n it returns the tuple (x / n, x % n), nothing else, but does so faster than a regular division instruction.
In the C++ code below I implement the same two Barrett functions, but apply some C++-specific optimizations to make them even faster. The optimizations are possible because the divisor n is always 64-bit and x is 128-bit, but the higher half of x is always smaller than n (the last assumption holds because in your big-integer division the higher half is always a remainder modulo n).
The Barrett algorithm works only with a divisor n that is NOT a power of 2, so divisors like 0, 1, 2, 4, 8, 16, ... are not allowed. You can cover this trivial case with a right bit-shift of the big integer, because dividing by a power of 2 is just a shift. Any other divisor is allowed, including even divisors that are not powers of 2.
It is also important to note that my BarrettDivMod() accepts ONLY a dividend x that is strictly smaller than divisor * 2^64; in other words, the higher half of the 128-bit dividend x must be smaller than the divisor. This is always true in your big-integer division function, as the higher half is always a remainder modulo the divisor. This precondition on x is your responsibility to check; in my BarrettDivMod() it is checked only as a DEBUG assertion that is removed in release builds.
You may notice that BarrettDivMod() has two big branches; these are two variants of the same algorithm: the first uses the Clang/GCC-only type unsigned __int128, the second uses only 64-bit instructions and is hence suitable for MSVC.
I targeted three compilers, Clang/GCC/MSVC, but somehow the MSVC version got only 2x faster with Barrett, while Clang/GCC got 5x faster. Maybe there is a bug in my MSVC code.
You can see that I used your class bi_uint to time both versions of the code, with regular divide and with Barrett. It is important to note that I changed your code quite significantly: first to avoid u128 (so that the MSVC version, which has no u128, compiles), and second to not modify the data vector, so it performs a read-only division and doesn't store the final result (this read-only behavior lets me run speed tests without copying the data vector on each test iteration). So your class is deliberately broken in my snippet and can't be copy-pasted for direct use; I only used it for speed measurement.
Barrett reduction is faster not only because division is slower than multiplication, but also because multiplication and addition are very well pipelined on modern CPUs: a modern CPU can execute several mul or add instructions within one cycle, but only if they don't depend on each other's results; in other words, the CPU can run several instructions in parallel within one cycle. As far as I know, division can't be run in parallel because there is only a single division unit in the CPU, but it is still somewhat pipelined, because after roughly 50% of the first division is done a second division can be started at the beginning of the pipeline.
On some machines you may notice that the regular Divide version is sometimes much slower; this happens because Clang/GCC fall back to a library-based soft implementation of division even for a 128-bit dividend. In that case my Barrett version may show even a 7-10x speedup, as it doesn't use library functions.
To overcome the soft-division issue described above, it is better to add assembly code that uses the DIV instruction directly, or to find an intrinsic that does this in your compiler (I think Clang/GCC have such an intrinsic). I can also write this assembly implementation if needed; just tell me in the comments.
Update. As promised, I implemented an assembly variant of 128-bit division for Clang/GCC, the function UDiv128Asm(). After this change it is used as the main implementation of 128-bit division for Clang/GCC instead of the regular u128(a) / b. You can switch back to the regular u128 implementation by replacing #if 0 with #if 1 inside the body of UDiv128().
Try it online!
#include <cstdint>
#include <bit>
#include <stdexcept>
#include <string>
#include <immintrin.h>
#if defined(_MSC_VER) && !defined(__clang__)
#define IS_MSVC 1
#else
#define IS_MSVC 0
#endif
#if IS_MSVC
#include <intrin.h>
#endif
#define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg: '" + std::string(msg) + "'."); }
#define ASSERT(cond) ASSERT_MSG(cond, "")
#ifdef _DEBUG
#define DASSERT_MSG(cond, msg) ASSERT_MSG(cond, msg)
#else
#define DASSERT_MSG(cond, msg)
#endif
#define DASSERT(cond) DASSERT_MSG(cond, "")
using u16 = uint16_t;
using u32 = uint32_t;
using i64 = int64_t;
using u64 = uint64_t;
using UllPtr = unsigned long long *;
inline int GetExp(double x) {
return int((std::bit_cast<uint64_t>(x) >> 52) & 0x7FF) - 1023;
}
inline size_t BitSizeWrong(uint64_t x) {
return x == 0 ? 0 : (GetExp(x) + 1);
}
inline size_t BitSize(u64 x) {
size_t r = 0;
if (x >= (u64(1) << 32)) {
x >>= 32;
r += 32;
}
while (x >= 0x100) {
x >>= 8;
r += 8;
}
while (x) {
x >>= 1;
++r;
}
return r;
}
#if !IS_MSVC
inline u64 UDiv128Asm(u64 h, u64 l, u64 d, u64 * r) {
u64 q;
asm (R"(
.intel_syntax
mov rdx, %V[h]
mov rax, %V[l]
div %V[d]
mov %V[r], rdx
mov %V[q], rax
)"
: [q] "=r" (q), [r] "=r" (*r)
: [h] "r" (h), [l] "r" (l), [d] "r" (d)
: "rax", "rdx"
);
return q;
}
#endif
inline std::pair<u64, u64> UDiv128(u64 hi, u64 lo, u64 d) {
#if IS_MSVC
u64 r, q = _udiv128(hi, lo, d, &r);
return std::make_pair(q, r);
#else
#if 0
using u128 = unsigned __int128;
auto const dnd = (u128(hi) << 64) | lo;
return std::make_pair(u64(dnd / d), u64(dnd % d));
#else
u64 r, q = UDiv128Asm(hi, lo, d, &r);
return std::make_pair(q, r);
#endif
#endif
}
inline std::pair<u64, u64> UMul128(u64 a, u64 b) {
#if IS_MSVC
u64 hi, lo = _umul128(a, b, &hi);
return std::make_pair(hi, lo);
#else
using u128 = unsigned __int128;
auto const x = u128(a) * b;
return std::make_pair(u64(x >> 64), u64(x));
#endif
}
inline std::pair<u64, u64> USub128(u64 a_hi, u64 a_lo, u64 b_hi, u64 b_lo) {
u64 r_hi, r_lo;
_subborrow_u64(_subborrow_u64(0, a_lo, b_lo, (UllPtr)&r_lo), a_hi, b_hi, (UllPtr)&r_hi);
return std::make_pair(r_hi, r_lo);
}
inline std::pair<u64, u64> UAdd128(u64 a_hi, u64 a_lo, u64 b_hi, u64 b_lo) {
u64 r_hi, r_lo;
_addcarry_u64(_addcarry_u64(0, a_lo, b_lo, (UllPtr)&r_lo), a_hi, b_hi, (UllPtr)&r_hi);
return std::make_pair(r_hi, r_lo);
}
inline int UCmp128(u64 a_hi, u64 a_lo, u64 b_hi, u64 b_lo) {
if (a_hi != b_hi)
return a_hi < b_hi ? -1 : 1;
return a_lo == b_lo ? 0 : a_lo < b_lo ? -1 : 1;
}
std::pair<u64, size_t> BarrettRS64(u64 n) {
// https://www.nayuki.io/page/barrett-reduction-algorithm
ASSERT_MSG(n >= 3 && (n & (n - 1)) != 0, "n " + std::to_string(n))
size_t const nbits = BitSize(n);
// 2^s = q * n + r; 2^s = (2^64 + q0) * n + r; 2^s - n * 2^64 = q0 * n + r
u64 const dnd_hi = (nbits >= 64 ? 0ULL : (u64(1) << nbits)) - n;
auto const q0 = UDiv128(dnd_hi, 0, n).first;
return std::make_pair(q0, nbits);
}
template <bool Use128 = true, bool Adjust = true>
std::pair<u64, u64> BarrettDivMod64(u64 x_hi, u64 x_lo, u64 n, u64 r, size_t s) {
// ((x_hi * 2^64 + x_lo) * (2^64 + r)) >> (64 + s)
DASSERT(x_hi < n);
#if !IS_MSVC
if constexpr(Use128) {
using u128 = unsigned __int128;
u128 const xf = (u128(x_hi) << 64) | x_lo;
u64 q = u64((u128(x_hi) * r + xf + u64((u128(x_lo) * r) >> 64)) >> s);
if (s < 64) {
u64 t = x_lo - q * n;
if constexpr(Adjust) {
u64 const mask = ~u64(i64(t - n) >> 63);
q += mask & 1;
t -= mask & n;
}
return std::make_pair(q, t);
} else {
u128 t = xf - u128(q) * n;
return t < n ? std::make_pair(q, u64(t)) : std::make_pair(q + 1, u64(t) - n);
}
} else
#endif
{
auto const w1a = UMul128(x_lo, r).first;
auto const [w2b, w1b] = UMul128(x_hi, r);
auto const w2c = x_hi, w1c = x_lo;
u64 w1, w2 = _addcarry_u64(0, w1a, w1b, (UllPtr)&w1);
w2 += _addcarry_u64(0, w1, w1c, (UllPtr)&w1);
w2 += w2b + w2c;
if (s < 64) {
u64 q = (w2 << (64 - s)) | (w1 >> s);
u64 t = x_lo - q * n;
if constexpr(Adjust) {
u64 const mask = ~u64(i64(t - n) >> 63);
q += mask & 1;
t -= mask & n;
}
return std::make_pair(q, t);
} else {
u64 const q = w2;
auto const [b_hi, b_lo] = UMul128(q, n);
auto const [t_hi, t_lo] = USub128(x_hi, x_lo, b_hi, b_lo);
return t_hi != 0 || t_lo >= n ? std::make_pair(q + 1, t_lo - n) : std::make_pair(q, t_lo);
}
}
}
#include <random>
#include <iomanip>
#include <iostream>
#include <chrono>
void TestBarrett() {
std::mt19937_64 rng{123}; //{std::random_device{}()};
for (size_t i = 0; i < (1 << 11); ++i) {
size_t const nbits = rng() % 63 + 2;
u64 n = 0;
do {
n = (u64(1) << (nbits - 1)) + rng() % (u64(1) << (nbits - 1));
} while (!(n >= 3 && (n & (n - 1)) != 0));
auto const [br, bs] = BarrettRS64(n);
for (size_t j = 0; j < (1 << 6); ++j) {
u64 const hi = rng() % n, lo = rng();
auto const [ref_q, ref_r] = UDiv128(hi, lo, n);
u64 bar_q = 0, bar_r = 0;
for (size_t k = 0; k < 2; ++k) {
bar_q = 0; bar_r = 0;
if (k == 0)
std::tie(bar_q, bar_r) = BarrettDivMod64<true>(hi, lo, n, br, bs);
else
std::tie(bar_q, bar_r) = BarrettDivMod64<false>(hi, lo, n, br, bs);
ASSERT_MSG(bar_q == ref_q && bar_r == ref_r, "i " + std::to_string(i) + ", j " + std::to_string(j) + ", k " + std::to_string(k) +
", nbits " + std::to_string(nbits) + ", n " + std::to_string(n) + ", bar_q " + std::to_string(bar_q) +
", ref_q " + std::to_string(ref_q) + ", bar_r " + std::to_string(bar_r) + ", ref_r " + std::to_string(ref_r));
}
}
}
}
class bi_uint
{
public:
using u64_t = std::uint64_t;
constexpr static std::size_t u64_bits = 8 * sizeof(u64_t);
//little-endian
std::vector<u64_t> data;
static auto constexpr DefPrep = [](auto n){
return std::make_pair(false, false);
};
static auto constexpr DefDivMod = [](auto dnd_hi, auto dnd_lo, auto dsr, auto const & prep){
return UDiv128(dnd_hi, dnd_lo, dsr);
};
//User should guarantee data.size()>0 and val>0
template <typename PrepT = decltype(DefPrep), typename DivModT = decltype(DefDivMod)>
void self_div(const u64_t val, PrepT const & Prep = DefPrep, DivModT const & DivMod = DefDivMod)
{
auto it = data.rbegin();
if(data.size() == 1) {
*it /= val;
return;
}
u64_t rem_hi = 0, rem_lo = 0;
if(*it < val) {
rem_lo = *it++;
//data.pop_back();
}
auto const prep = Prep(val);
u64_t r = rem_lo % val;
u64_t q = 0;
while(it != data.rend()) {
rem_hi = r;
rem_lo = *it;
std::tie(q, r) = DivMod(rem_hi, rem_lo, val, prep);
//*it++ = static_cast<u64_t>(q);
it++;
auto volatile out = static_cast<u64_t>(q);
}
}
};
void TestSpeed() {
auto Time = []{
static auto const gtb = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::duration<double>>(
std::chrono::high_resolution_clock::now() - gtb).count();
};
std::mt19937_64 rng{123};
std::vector<u64> limbs, divisors;
for (size_t i = 0; i < (1 << 17); ++i)
limbs.push_back(rng());
for (size_t i = 0; i < (1 << 8); ++i) {
size_t const nbits = rng() % 63 + 2;
u64 n = 0;
do {
n = (u64(1) << (nbits - 1)) + rng() % (u64(1) << (nbits - 1));
} while (!(n >= 3 && (n & (n - 1)) != 0));
divisors.push_back(n);
}
std::cout << std::fixed << std::setprecision(3);
double div_time = 0;
{
bi_uint x;
x.data = limbs;
auto const tim = Time();
for (auto dsr: divisors)
x.self_div(dsr);
div_time = Time() - tim;
std::cout << "Divide time " << div_time << " sec" << std::endl;
}
{
bi_uint x;
x.data = limbs;
for (size_t i = 0; i < 2; ++i) {
if (IS_MSVC && i == 0)
continue;
auto const tim = Time();
for (auto dsr: divisors)
x.self_div(dsr, [](auto n){ return BarrettRS64(n); },
[i](auto dnd_hi, auto dnd_lo, auto dsr, auto const & prep){
return i == 0 ? BarrettDivMod64<true>(dnd_hi, dnd_lo, dsr, prep.first, prep.second) :
BarrettDivMod64<false>(dnd_hi, dnd_lo, dsr, prep.first, prep.second);
});
double const bar_time = Time() - tim;
std::cout << "Barrett" << (i == 0 ? "128" : "64 ") << " time " << bar_time << " sec, boost " << div_time / bar_time << std::endl;
}
}
}
int main() {
try {
TestBarrett();
TestSpeed();
return 0;
} catch (std::exception const & ex) {
std::cout << "Exception: " << ex.what() << std::endl;
return -1;
}
}
Output:
Divide time 3.171 sec
Barrett128 time 0.675 sec, boost 4.695
Barrett64 time 0.642 sec, boost 4.937
Part 2
As this is a very interesting question, a few days after I first published this post I decided to implement all of the big-integer math from scratch.
The code below implements the math operations +, -, *, /, <<, >> for natural big numbers (positive integers), and +, -, *, / for floating-point big numbers. Both kinds of numbers can be of arbitrary size (even millions of bits). Besides those, as you requested, I fully implemented the Newton-Raphson (both square and cubic variants) and Goldschmidt fast division algorithms.
Here is the code snippet for the Newton-Raphson/Goldschmidt functions only; the remaining code, as it is very large, is linked below on an external server:
BigFloat & DivNewtonRaphsonSquare(BigFloat b) {
// https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
auto a = *this;
a.exp_ += b.SetScale(0);
if (b.sign_) {
a.sign_ = !a.sign_;
b.sign_ = false;
}
thread_local BigFloat two, c_48_17, c_32_17;
thread_local size_t static_prec = 0;
if (static_prec != BigFloat::prec_) {
two = 2;
c_48_17 = BigFloat(48) / BigFloat(17);
c_32_17 = BigFloat(32) / BigFloat(17);
static_prec = BigFloat::prec_;
}
BigFloat x = c_48_17 - c_32_17 * b;
for (size_t i = 0, num_iters = std::ceil(std::log2(double(static_prec + 1)
/ std::log2(17.0))) + 0.1; i < num_iters; ++i)
x = x * (two - b * x);
*this = a * x;
return BitNorm();
}
BigFloat & DivNewtonRaphsonCubic(BigFloat b) {
// https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
auto a = *this;
a.exp_ += b.SetScale(0);
if (b.sign_) {
a.sign_ = !a.sign_;
b.sign_ = false;
}
thread_local BigFloat one, c_140_33, c_m64_11, c_256_99;
thread_local size_t static_prec = 0;
if (static_prec != BigFloat::prec_) {
one = 1;
c_140_33 = BigFloat(140) / BigFloat(33);
c_m64_11 = BigFloat(-64) / BigFloat(11);
c_256_99 = BigFloat(256) / BigFloat(99);
static_prec = BigFloat::prec_;
}
BigFloat e, y, x = c_140_33 + b * (c_m64_11 + b * c_256_99);
for (size_t i = 0, num_iters = std::ceil(std::log2(double(static_prec + 1)
/ std::log2(99.0)) / std::log2(3.0)) + 0.1; i < num_iters; ++i) {
e = one - b * x;
y = x * e;
x = x + y + y * e;
}
*this = a * x;
return BitNorm();
}
BigFloat & DivGoldschmidt(BigFloat b) {
// https://en.wikipedia.org/wiki/Division_algorithm#Goldschmidt_division
auto a = *this;
a.exp_ += b.SetScale(0);
if (b.sign_) {
a.sign_ = !a.sign_;
b.sign_ = false;
}
BigFloat one = 1, two = 2, f;
for (size_t i = 0;; ++i) {
f = two - b;
a *= f;
b *= f;
if (i % 3 == 0 && (one - b).GetScale() < -i64(prec_) + i64(bit_sizeof(Word)))
break;
}
*this = a;
return BitNorm();
}
See the Output below: it shows that the Newton-Raphson and Goldschmidt methods are actually about 10x slower than the regular school-grade algorithm (called Reference in the output). The three advanced algorithms are about the same speed as each other. Raphson/Goldschmidt could probably be faster with a Fast Fourier Transform for multiplication, because multiplying two large numbers takes 95% of the time in these algorithms. In the code below, all results of the Raphson/Goldschmidt algorithms are not only timed but also checked for correctness against the school-grade (Reference) algorithm (see the diff 2^... values in the console output, which show how far each result differs from school-grade).
FULL SOURCE CODE HERE. The full code is so large that it didn't fit into this Stack Overflow post due to the 30,000-character-per-post limit, although I wrote it from scratch specifically for this post. That's why I'm providing an external download link (PasteBin); also, click the Try it online! link below, which is the same copy of the code running live on GodBolt's Linux servers:
Try it online!
Output:
========== 1 K bits ==========
Reference 0.000029 sec
Raphson2 0.000066 sec, boost 0.440x, diff 2^-8192
Raphson3 0.000092 sec, boost 0.317x, diff 2^-8192
Goldschmidt 0.000080 sec, boost 0.365x, diff 2^-1022
========== 2 K bits ==========
Reference 0.000071 sec
Raphson2 0.000177 sec, boost 0.400x, diff 2^-16384
Raphson3 0.000283 sec, boost 0.250x, diff 2^-16384
Goldschmidt 0.000388 sec, boost 0.182x, diff 2^-2046
========== 4 K bits ==========
Reference 0.000319 sec
Raphson2 0.000875 sec, boost 0.365x, diff 2^-4094
Raphson3 0.001122 sec, boost 0.285x, diff 2^-32768
Goldschmidt 0.000881 sec, boost 0.362x, diff 2^-32768
========== 8 K bits ==========
Reference 0.000484 sec
Raphson2 0.002281 sec, boost 0.212x, diff 2^-65536
Raphson3 0.002341 sec, boost 0.207x, diff 2^-65536
Goldschmidt 0.002432 sec, boost 0.199x, diff 2^-8189
========== 16 K bits ==========
Reference 0.001199 sec
Raphson2 0.009042 sec, boost 0.133x, diff 2^-16382
Raphson3 0.009519 sec, boost 0.126x, diff 2^-131072
Goldschmidt 0.009047 sec, boost 0.133x, diff 2^-16380
========== 32 K bits ==========
Reference 0.004311 sec
Raphson2 0.039151 sec, boost 0.110x, diff 2^-32766
Raphson3 0.041058 sec, boost 0.105x, diff 2^-262144
Goldschmidt 0.045517 sec, boost 0.095x, diff 2^-32764
========== 64 K bits ==========
Reference 0.016273 sec
Raphson2 0.165656 sec, boost 0.098x, diff 2^-524288
Raphson3 0.210301 sec, boost 0.077x, diff 2^-65535
Goldschmidt 0.208081 sec, boost 0.078x, diff 2^-65534
========== 128 K bits ==========
Reference 0.059469 sec
Raphson2 0.725865 sec, boost 0.082x, diff 2^-1048576
Raphson3 0.735530 sec, boost 0.081x, diff 2^-1048576
Goldschmidt 0.703991 sec, boost 0.084x, diff 2^-131069
========== 256 K bits ==========
Reference 0.326368 sec
Raphson2 3.007454 sec, boost 0.109x, diff 2^-2097152
Raphson3 2.977631 sec, boost 0.110x, diff 2^-2097152
Goldschmidt 3.363632 sec, boost 0.097x, diff 2^-262141
========== 512 K bits ==========
Reference 1.138663 sec
Raphson2 12.827783 sec, boost 0.089x, diff 2^-524287
Raphson3 13.799401 sec, boost 0.083x, diff 2^-524287
Goldschmidt 15.836072 sec, boost 0.072x, diff 2^-524286
On most modern CPUs, division is indeed much slower than multiplication.
Referring to
https://agner.org/optimize/instruction_tables.pdf
on Intel Skylake, MUL/IMUL has a latency of 3-4 cycles, while DIV/IDIV can take 26-90 cycles, which is 7-23 times slower than MUL, so your initial benchmark result isn't really a surprise.
If you happen to be on an x86 CPU and this is indeed the bottleneck, you could try to utilize AVX/SSE instructions, as shown in the answer below. Basically you'd need to rely on specialized instructions rather than a general one like DIV/IDIV.
How to divide a __m256i vector by an integer variable?

How can I subtract two IPv6 addresses (128bit numbers) in C/C++?

I'm storing the IP address in sockaddr_in6, which holds it as an array of four 32-bit values, addr[4]. Essentially a 128-bit number.
I'm trying to calculate the number of IPs in a given IPv6 range (how many IPs lie between two addresses). So it's a matter of subtracting one from the other using two arrays of length four.
The problem is that since there's no 128-bit data type, I can't convert to decimal.
Thanks a ton!
You could use some kind of big-int library (if you can tolerate LGPL, GMP is the choice). Fortunately, 128 bit subtraction is easy to simulate by hand if necessary. Here is a quick and dirty demonstration of computing the absolute value of (a-b), for 128 bit values:
#include <iostream>
#include <iomanip>
struct U128
{
unsigned long long hi;
unsigned long long lo;
};
bool subtract(U128& a, U128 b)
{
unsigned long long carry = b.lo > a.lo;
a.lo -= b.lo;
unsigned long long carry2 = b.hi > a.hi || a.hi == b.hi && carry;
a.hi -= carry;
a.hi -= b.hi;
return carry2 != 0;
}
int main()
{
U128 ipAddressA = { 45345, 345345 };
U128 ipAddressB = { 45345, 345346 };
bool carry = subtract(ipAddressA, ipAddressB);
// Carry being set means that we underflowed; that ipAddressB was > ipAddressA.
// Lets just compute 0 - ipAddressA as a means to calculate the negation
// (0-x) of our current value. This gives us the absolute value of the
// difference.
if (carry)
{
ipAddressB = ipAddressA;
ipAddressA = { 0, 0 };
subtract(ipAddressA, ipAddressB);
}
// Print gigantic hex string of the 128-bit value
std::cout.fill ('0');
std::cout << std::hex << std::setw(16) << ipAddressA.hi << std::setw(16) << ipAddressA.lo << std::endl;
}
This gives you the absolute value of the difference. If the range is not huge (64 bits or less), then ipAddressA.lo can be your answer as a simple unsigned long long.
If you have perf concerns, you can make use of compiler intrinsics for taking advantage of certain architectures, such as amd64 if you want it to be optimal on that processor. _subborrow_u64 is the amd64 intrinsic for the necessary subtraction work.
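For example, a sketch of the same subtraction using that intrinsic (x86-64; GCC/Clang expose it in <immintrin.h>, MSVC in <intrin.h>):
#include <immintrin.h>
#include <cstdint>

// Returns true on borrow (i.e. b > a), like subtract() above.
bool subtract_intrin(std::uint64_t& a_hi, std::uint64_t& a_lo,
                     std::uint64_t b_hi, std::uint64_t b_lo)
{
    unsigned long long lo, hi;
    unsigned char borrow = _subborrow_u64(0, a_lo, b_lo, &lo);  // low 64 bits, produces a borrow
    borrow = _subborrow_u64(borrow, a_hi, b_hi, &hi);           // high 64 bits consume the borrow
    a_lo = lo;
    a_hi = hi;
    return borrow != 0;
}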
The in6_addr structure stores the address in network byte order - or 'big endian' - with the most significant byte at s6_addr[0]. You can't count on the other union members being consistently named or defined. Even if you accessed the union through a (non-portable) uint32_t field, the values would have to be converted with ntohl. So a portable method of finding the difference needs some work.
You can convert the in6_addr to uint64_t[2]. Sticking with typical 'bignum' conventions, we use [0] for the low 64-bits and [1] for the high 64-bits:
static inline void
in6_to_u64 (uint64_t dst[2], const struct in6_addr *src)
{
uint64_t hi = 0, lo = 0;
for (unsigned int i = 0; i < 8; i++)
{
hi = (hi << 8) | src->s6_addr[i];
lo = (lo << 8) | src->s6_addr[i + 8];
}
dst[0] = lo, dst[1] = hi;
}
and the difference:
static inline unsigned int
u64_diff (uint64_t d[2], const uint64_t x[2], const uint64_t y[2])
{
unsigned int b = 0, bi;
for (unsigned int i = 0; i < 2; i++)
{
uint64_t di, xi, yi, tmp;
xi = x[i], yi = y[i];
tmp = xi - yi;
di = tmp - b, bi = tmp > xi;
d[i] = di, b = bi | (di > tmp);
}
return b; /* borrow flag = (x < y) */
}
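A short usage sketch, assuming the two helpers above are in the same translation unit and the addresses were filled in with inet_pton:
#include <arpa/inet.h>
#include <cinttypes>
#include <cstdio>

int main()
{
    in6_addr a, b;
    inet_pton(AF_INET6, "2001:db8::10", &a);
    inet_pton(AF_INET6, "2001:db8::1", &b);

    uint64_t x[2], y[2], d[2];
    in6_to_u64(x, &a);
    in6_to_u64(y, &b);
    unsigned int borrow = u64_diff(d, x, y);   // d = x - y; borrow set if x < y
    std::printf("borrow=%u diff=0x%016" PRIx64 "%016" PRIx64 "\n", borrow, d[1], d[0]);
    return 0;
}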

How to calculate (A*B)%C? [duplicate]

This question already has answers here:
Multiply two overflowing integers modulo a third
(2 answers)
Closed 9 years ago.
Can someone help me calculate (A*B)%C, where 1<=A,B,C<=10^18, in C++, without big-num, just with a mathematical approach?
Off the top of my head (not extensively tested)
typedef unsigned long long BIG;
BIG mod_multiply( BIG A, BIG B, BIG C )
{
BIG mod_product = 0;
A %= C;
while (A) {
B %= C;
if (A & 1) mod_product = (mod_product + B) % C;
A >>= 1;
B <<= 1;
}
return mod_product;
}
This has complexity O(log A) iterations. You can probably replace most of the % with a conditional subtraction, for a bit more performance.
typedef unsigned long long BIG;
BIG mod_multiply( BIG A, BIG B, BIG C )
{
BIG mod_product = 0;
// A %= C; may or may not help performance
B %= C;
while (A) {
if (A & 1) {
mod_product += B;
if (mod_product >= C) mod_product -= C; // >= so an exact multiple reduces to 0
}
A >>= 1;
B <<= 1;
if (B >= C) B -= C; // >= keeps B strictly below C
}
return mod_product;
}
This version has only one long integer modulo -- it may even be faster than the large-chunk method, depending on how your processor implements integer modulo.
Live demo: https://ideone.com/1pTldb -- same result as Yakk's.
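For reference, if your compiler provides a 128-bit integer type (GCC/Clang on 64-bit targets), the whole thing collapses to a sketch like this; this is an addition of mine, not part of the answers above:
typedef unsigned long long BIG;

BIG mod_multiply_128( BIG A, BIG B, BIG C )
{
    // The 128-bit intermediate product cannot overflow for 64-bit inputs.
    return (BIG)( (unsigned __int128)A * B % C );
}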
An implementation based on an earlier Stack Overflow answer:
#include <stdint.h>
#include <tuple>
#include <iostream>
typedef std::tuple< uint32_t, uint32_t > split_t;
split_t split( uint64_t a )
{
static const uint32_t mask = -1;
auto retval = std::make_tuple( mask&a, ( a >> 32 ) );
// std::cout << "(" << std::get<0>(retval) << "," << std::get<1>(retval) << ")\n";
return retval;
}
typedef std::tuple< uint64_t, uint64_t, uint64_t, uint64_t > cross_t;
template<typename Lambda>
cross_t cross( split_t lhs, split_t rhs, Lambda&& op )
{
return std::make_tuple(
op(std::get<0>(lhs), std::get<0>(rhs)),
op(std::get<1>(lhs), std::get<0>(rhs)),
op(std::get<0>(lhs), std::get<1>(rhs)),
op(std::get<1>(lhs), std::get<1>(rhs))
);
}
// c must have high bit unset:
uint64_t a_times_2_k_mod_c( uint64_t a, unsigned k, uint64_t c )
{
a %= c;
for (unsigned i = 0; i < k; ++i)
{
a <<= 1;
a %= c;
}
return a;
}
// c must have about 2 high bits unset:
uint64_t a_times_b_mod_c( uint64_t a, uint64_t b, uint64_t c )
{
// ensure a and b are < c:
a %= c;
b %= c;
auto Z = cross( split(a), split(b), [](uint32_t lhs, uint32_t rhs)->uint64_t {
return (uint64_t)lhs * (uint64_t)rhs;
} );
uint64_t to_the_0;
uint64_t to_the_32_a;
uint64_t to_the_32_b;
uint64_t to_the_64;
std::tie( to_the_0, to_the_32_a, to_the_32_b, to_the_64 ) = Z;
// std::cout << to_the_0 << "+ 2^32 *(" << to_the_32_a << "+" << to_the_32_b << ") + 2^64 * " << to_the_64 << "\n";
// this line is the one that requires 2 high bits in c to be clear
// if you just add 2 of them then do a %c, then add the third and do
// a %c, you can relax the requirement to "one high bit must be unset":
return
(to_the_0
+ a_times_2_k_mod_c(to_the_32_a+to_the_32_b, 32, c) // + will not overflow!
+ a_times_2_k_mod_c(to_the_64, 64, c) )
%c;
}
int main()
{
uint64_t retval = a_times_b_mod_c( 19010000000000000000, 1011000000000000, 1231231231231211 );
std::cout << retval << "\n";
}
The idea here is to split your 64-bit integer into a pair of 32-bit integers, which are safe to multiply in 64-bit land.
We express a*b as (a_high * 2^32 + a_low) * (b_high * 2^32 + b_low), do the 4-fold multiplication (keeping track of the 2^32 factors without storing them in our bits), then note that doing a * 2^k % c can be done via a series of k repeats of this pattern: ((a*2 %c) *2%c).... So we can take this 3 to 4 element polynomial of 64-bit integers in 2^32 and reduce it without having to worry about things.
The expensive part is the a_times_2_k_mod_c function (the only loop).
You can make it go many times faster if you know that c has more than one high bit clear.
You could instead replace the a %= c with subtraction a -= (a>=c)*c;
Doing both isn't all that practical.
Live example

How to get the smallest n such that 2^n >= x for a given integer x in O(1)?

For a given unsigned integer x, how do I find the smallest n such that 2^n ≥ x in O(1)? In other words, I want to find the index of the highest set bit in the binary representation of x (plus 1 if x is not a power of 2) in O(1) (not dependent on the size of the integer or the number of bits in a byte).
If you have no memory constraints, then you can use a lookup table (one entry for each possible value of x) to achieve O(1) time.
If you want a practical solution, most processors will have some kind of "find highest bit set" opcode. On x86, for instance, it's BSR. Most compilers will have a mechanism to write raw assembler.
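As a concrete sketch of that idea using a compiler builtin instead of raw assembler (GCC/Clang; __builtin_clz is undefined for 0, hence the early return):
#include <cstdint>

// Smallest n such that 2^n >= x, for 32-bit x.
unsigned ceil_log2(std::uint32_t x)
{
    if (x <= 1) return 0;                                      // 2^0 = 1 covers x = 0 and 1
    return 32u - static_cast<unsigned>(__builtin_clz(x - 1));  // position of highest bit of x-1, plus 1
}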
Ok, since so far nobody has posted a compile-time solution, here's mine. The precondition is that your input value is a compile-time constant. If you have that, it's all done at compile-time.
#include <iostream>
#include <iomanip>
// This should really come from a template meta lib, no need to reinvent it here,
// but I wanted this to compile as is.
namespace templ_meta {
// A run-of-the-mill compile-time if.
template<bool Cond, typename T, typename E> struct if_;
template< typename T, typename E> struct if_<true , T, E> {typedef T result_t;};
template< typename T, typename E> struct if_<false, T, E> {typedef E result_t;};
// This so we can use a compile-time if tailored for types, rather than integers.
template<int I>
struct int2type {
static const int result = I;
};
}
// This does the actual work.
template< int I, unsigned int Idx = 0>
struct index_of_high_bit {
static const unsigned int result =
templ_meta::if_< I==0
, templ_meta::int2type<Idx>
, index_of_high_bit<(I>>1),Idx+1>
>::result_t::result;
};
// just some testing
namespace {
template< int I >
void test()
{
const unsigned int result = index_of_high_bit<I>::result;
std::cout << std::setfill('0')
<< std::hex << std::setw(2) << std::uppercase << I << ": "
<< std::dec << std::setw(2) << result
<< '\n';
}
}
int main()
{
test<0>();
test<1>();
test<2>();
test<3>();
test<4>();
test<5>();
test<7>();
test<8>();
test<9>();
test<14>();
test<15>();
test<16>();
test<42>();
return 0;
}
'twas fun to do that.
In <cmath> there are logarithm functions that will perform this computation for you.
ceil(log(x) / log(2));
Some math to transform the expression:
int n = ceil(log(x)/log(2));
This is obviously O(1).
It's a question about finding the highest bit set (as lshtar and Oli Charlesworth pointed out). Bit Twiddling Hacks gives a solution which takes about 7 operations for 32 Bit Integers and about 9 operations for 64 Bit Integers.
You can use precalculated tables.
If your number is in [0,255] interval, simple table look up will work.
If it's bigger, then you may split it by bytes and check them from high to low.
Perhaps this link will help.
Warning : the code is not exactly straightforward and seems rather unmaintainable.
uint64_t v; // Input value to find position with rank r.
unsigned int r; // Input: bit's desired rank [1-64].
unsigned int s; // Output: Resulting position of bit with rank r [1-64]
uint64_t a, b, c, d; // Intermediate temporaries for bit count.
unsigned int t; // Bit count temporary.
// Do a normal parallel bit count for a 64-bit integer,
// but store all intermediate steps.
// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
a = v - ((v >> 1) & ~0UL/3);
// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
c = (b + (b >> 4)) & ~0UL/0x11;
// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
d = (c + (c >> 8)) & ~0UL/0x101;
t = (d >> 32) + (d >> 48);
// Now do branchless select!
s = 64;
// if (r > t) {s -= 32; r -= t;}
s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
t = (d >> (s - 16)) & 0xff;
// if (r > t) {s -= 16; r -= t;}
s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
t = (c >> (s - 8)) & 0xf;
// if (r > t) {s -= 8; r -= t;}
s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
t = (b >> (s - 4)) & 0x7;
// if (r > t) {s -= 4; r -= t;}
s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
t = (a >> (s - 2)) & 0x3;
// if (r > t) {s -= 2; r -= t;}
s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
t = (v >> (s - 1)) & 0x1;
// if (r > t) s--;
s -= ((t - r) & 256) >> 8;
s = 65 - s;
As has been mentioned, the n you're looking for is the length of the binary representation of x - 1; equivalently, it is the length of the binary representation of x itself, unless x is a power of two (10.....0 in binary), in which case it is one less.
I seriously doubt there exists a true solution in O(1), unless you consider translations to binary representation to be O(1).
For a 32 bit int, the following pseudocode will be O(1).
highestBit(x)
bit = 1
highest = 0
for i 1 to 32
if x & bit == 1
highest = i
bit = bit * 2
return highest + 1
It doesn't matter how big x is, it always checks all 32 bits. Thus constant time.
If the input can be of any size, say n digits long, then any solution that reads the input will read n digits and must be at least O(n). Unless someone comes up with a solution that doesn't read the input, it is impossible to find an O(1) solution.
After some searching on the internet I found these 2 versions for a 32-bit unsigned integer. I have tested them and they work. It is clear to me why the second one works, but I'm still thinking about the first one...
1.
unsigned int RoundUpToNextPowOf2(unsigned int v)
{
unsigned int r = 1;
if (v > 1)
{
float f = (float)v;
unsigned int const t = 1U << ((*(unsigned int *)&f >> 23) - 0x7f);
r = t << (t < v);
}
return r;
}
2.
unsigned int RoundUpToNextPowOf2(unsigned int v)
{
v--;
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
v++;
return v;
}
Edit: the first one is clear as well: converting v to float puts the position of its highest set bit into the exponent field, so t becomes the largest power of two not exceeding v, and t << (t < v) doubles it when v is not already a power of two.
An interesting question. What do you mean by not depending on the size
of int or the number of bits in a byte? To encounter a different number
of bits in a byte, you'll have to use a different machine, with
a different set of machine instructions, which may or may not affect the
answer.
Anyway, based sort of vaguely on the first solution proposed by Mihran,
I get:
int
topBit( unsigned x )
{
int r = 1;
if ( x > 1 ) {
if ( frexp( static_cast<double>( x ), &r ) != 0.5 ) {
++ r;
}
}
return r - 1;
}
This works within the constraint that the input value must be exactly
representable in a double; if the input is unsigned long long, this
might not be the case, and on some of the more exotic platforms, it
might not even be the case for unsigned.
The only other constant time (with respect to the number of bits) I can
think of is:
int
topBit( unsigned x )
{
return x == 0 ? 0.0 : ceil( log2( static_cast<double>( x ) ) );
}
, which has the same constraint with regards to x being exactly
representable in a double, and may also suffer from rounding errors
inherent in the floating point operations (although if log2 is
implemented correctly, I don't think that this should be the case). If
your compiler doesn't support log2 (a C++11 feature, but also present
in C90, so I would expect most compilers to already have implemented
it), then of course, log( x ) / log( 2 ) could be used, but I suspect
that this will increase the risk of a rounding error resulting in
a wrong result.
FWIW, I find the O(1) on the number of bits a bit illogical, for the
reasons I specified above: the number of bits is just one of the many
"constant factors" which depend on the machine on which you run.
Anyway, I came up with the following purely integer solution, which is
O(lg 1) for the number of bits, and O(1) for everything else:
template< int k >
struct TopBitImpl
{
static int const k2 = k / 2;
static unsigned const m = ~0U << k2;
int operator()( unsigned x ) const
{
unsigned r = ((x & m) != 0) ? k2 : 0;
return r + TopBitImpl<k2>()(r == 0 ? x : x >> k2);
}
};
template<>
struct TopBitImpl<1>
{
int operator()( unsigned x ) const
{
return 0;
}
};
int
topBit( unsigned x )
{
return TopBitImpl<std::numeric_limits<unsigned>::digits>()(x)
+ (((x & (x - 1)) != 0) ? 1 : 0);
}
A good compiler should be able to inline the recursive calls, resulting
in close to optimal code.

C/C++ - Convert 24-bit signed integer to float

I'm programming in C++. I need to convert a 24-bit signed integer (stored in a 3-byte array) to float (normalizing to [-1.0,1.0]).
The platform is MSVC++ on x86 (which means the input is little-endian).
I tried this:
float convert(const unsigned char* src)
{
int i = src[2];
i = (i << 8) | src[1];
i = (i << 8) | src[0];
const float Q = 2.0 / ((1 << 24) - 1.0);
return (i + 0.5) * Q;
}
I'm not entirely sure, but it seems the results I'm getting from this code are incorrect. So, is my code wrong and if so, why?
You are not sign extending the 24 bits into an integer; the upper bits will always be zero. This code will work no matter what your int size is:
if (i & 0x800000)
i |= ~0xffffff;
Edit: Problem 2 is your scaling constant. In simple terms, you want to multiply by the new maximum and divide by the old maximum, assuming that 0 remains at 0.0 after conversion.
const float Q = 1.0 / 0x7fffff;
Finally, why are you adding 0.5 in the final conversion? I could understand if you were trying to round to an integer value, but you're going the other direction.
Edit 2: The source you point to has a very detailed rationale for your choices. Not the way I would have chosen, but perfectly defensible nonetheless. My advice for the multiplier still holds, but the maximum is different because of the 0.5 added factor:
const float Q = 1.0 / (0x7fffff + 0.5);
Because the positive and negative magnitudes are the same after the addition, this should scale both directions correctly.
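Putting the sign extension and the adjusted scale together, a sketch of the full conversion might look like this (keeping the question's +0.5 offset; convert24 is a name of mine to avoid clashing with the question's function):
#include <cstdint>

float convert24(const unsigned char* src)
{
    std::int32_t i = src[0] | (src[1] << 8) | (src[2] << 16);
    if (i & 0x800000)                      // sign-extend from 24 to 32 bits
        i |= ~0xffffff;
    const float Q = 1.0f / (0x7fffff + 0.5f);
    return (i + 0.5f) * Q;                 // symmetric in both directions, as discussed above
}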
Since you are using a char array, it does not necessarily follow that the input is little endian by virtue of being x86; the char array makes the byte order architecture independent.
Your code is somewhat over complicated. A simple solution is to shift the 24 bit data to scale it to a 32bit value (so that the machine's natural signed arithmetic will work), and then use a simple ratio of the result with the maximum possible value (which is INT_MAX less 256 because of the vacant lower 8 bits).
#include <limits.h>
float convert(const unsigned char* src)
{
int i = src[2] << 24 | src[1] << 16 | src[0] << 8 ;
return i / (float)(INT_MAX - 256) ;
}
Test code:
unsigned char* makeS24( unsigned int i, unsigned char* s24 )
{
s24[2] = (unsigned char)(i >> 16) ;
s24[1] = (unsigned char)((i >> 8) & 0xff);
s24[0] = (unsigned char)(i & 0xff);
return s24 ;
}
#include <iostream>
int main()
{
unsigned char s24[3] ;
volatile int x = INT_MIN / 2 ;
std::cout << convert( makeS24( 0x800000, s24 )) << std::endl ; // -1.0
std::cout << convert( makeS24( 0x7fffff, s24 )) << std::endl ; // 1.0
std::cout << convert( makeS24( 0, s24 )) << std::endl ; // 0.0
std::cout << convert( makeS24( 0xc00000, s24 )) << std::endl ; // -0.5
std::cout << convert( makeS24( 0x400000, s24 )) << std::endl ; // 0.5
}
Since it's not symmetrical, this is probably the best compromise.
Maps -((2^23)-1) to -1.0 and ((2^23)-1) to 1.0.
(Note: this is the same conversion style used by 24 bit WAV files)
float convert( const unsigned char* src )
{
  int i = ( ( src[ 2 ] << 24 ) | ( src[ 1 ] << 16 ) | ( src[ 0 ] << 8 ) ) >> 8;
return ( ( float ) i ) / 8388607.0;
}
The solution that works for me:
/**
* Convert 24 byte that are saved into a char* and represent a float
* in little endian format to a C float number.
*/
float convert(const unsigned char* src)
{
float num_float;
// concatenate the chars (short integers) and
// save them to a long int
long int num_integer = (
((src[2] & 0xFF) << 16) |
((src[1] & 0xFF) << 8) |
(src[0] & 0xFF)
) & 0xFFFFFFFF;
// copy the bits from the long int variable
// to the float.
memcpy(&num_float, &num_integer, 4);
return num_float;
}
Works for me:
float convert(const char* stream)
{
int fromStream =
(0x00 << 24) +
(stream[2] << 16) +
(stream[1] << 8) +
stream[0];
return (float)fromStream;
}
Looks like you're treating it as a 24-bit unsigned integer. If the most significant bit is 1, you need to make i negative by setting the remaining 8 bits to 1 as well.
I'm not sure if it's good programming practice, but this seems to work (at least with g++ on 32-bit Linux, haven't tried it on anything else yet) and is certainly more elegant than extracting byte-by-byte from a char array, especially if it's not really a char array but rather a stream (in my case, it's a file stream) that you read from (if it is a char array, you can use memcpy instead of istream::read).
Just load the 24-bit variable into the less significant 3 bytes of a signed 32-bit (signed long). Then shift the long variable one byte to the left, so that the sign bit appears where it's meant to. Finally, just normalize the 32-bit variable, and you're all set.
union _24bit_LE{
char access;
signed long _long;
}_24bit_LE_buf;
float getnormalized24bitsample(){
std::ifstream::read(&_24bit_LE_buf.access+1, 3);
return (_24bit_LE_buf._long<<8) / (0x7fffffff + .5);
}
(Strangely, it doesn't seem to work when you just read into the 3 more significant bytes right away).
EDIT: it turns out this method seems to have some problems I don't fully understand yet. Better don't use it for the time being.
This one, got from here, works for me.
typedef union {
struct {
unsigned short lo;
unsigned short hi;
} u16;
unsigned int u32;
signed int i32;
float f;
}Versatype32;
//Bipolar version (-1.0 to ~1.0)
void fInt24_to_float(float* dest, const char* src, size_t length) {
Versatype32 xTemp;
while (length--) {
xTemp.u32 = *(int*)src;
//Check if Negative by right shifting 8
xTemp.u32 <<= 8; //(If it's a negative, we'll know) (Props to Norman Wong)
//Convert to float
xTemp.f = (float)xTemp.i32;
//Skip divide down if zero
if (xTemp.u32 != 0) {
//Divide by (1<<31 or 2^31)
xTemp.u16.hi -= 0x0F80; //BAM! Bitmagic!
}
*dest = xTemp.f;
//Move to next set
dest++;
src += 3;
} //Are we done yet?
//Yes!
return;
}