Is this constexpr sqrt function portable?

Is this constexpr sqrt function portable? - c++

I wrote this implementation of sqrt that is finite in complexity and precise up to the last digit when double is ieee754 double. The question is that is this portable on devices of various endian (assuming 0LL is still 64 bit)? get_fraction returns the 52bits plus the 1 bit at the begining. Small doubles are treated separately and ensured that they also have 1 in the 53rd bit. The c++ part numeric_limits nan can easily be replaced with a constant.
Code:
static inline constexpr int16_t get_exponent(double x)
{
uint64_t bits = *(uint64_t*)&x;
int16_t val = ((bits & 0x7FF0000000000000ULL) >> 52) - 1023;
if(val != -1023)
return val;
uint64_t temp_fractal= (bits & 0x000FFFFFFFFFFFFFULL);
for (int i = 51; i >= 0;--i) {
if(!(temp_fractal & (0x01ULL<<i))) --val;
else break;
}
return val;
}
static inline constexpr uint64_t get_fraction(double x)
{
uint64_t bits = *(uint64_t*)&x;
if (bits & 0x7FF0000000000000ULL)
return (bits & 0x000FFFFFFFFFFFFFULL) | 0x0010000000000000ULL;
uint64_t temp_fraction = bits & 0x000FFFFFFFFFFFFFULL;
for (int i = 51; i >= 0; --i) {
temp_fraction<<=1;
if(0x0010000000000000ULL & temp_fraction) break;
}
return temp_fraction;
}
static inline constexpr bool is_reserved(double x)
{
return get_exponent(x) == 1024;
}
static inline constexpr double my_abs(double x)
{
uint64_t bits = *(uint64_t*)&x;
bits &= 0x7FFFFFFFFFFFFFFFULL;
return *(double*)&bits;
}
constexpr double make_double(bool sign, int16_t exponent, uint64_t fractal)
{
uint64_t data = (fractal & 0x000FFFFFFFFFFFFFULL);
assert((fractal & 0xFFF0000000000000ULL) == 0x0010000000000000ULL);
if (exponent < -1023) {
fractal >>= (-1022 - exponent);
data = fractal;
exponent = -1023;
}
else if (exponent > 1023) {
return (1-2*sign)*std::numeric_limits<double>::infinity();
}
{
data |= ((uint64_t)((uint16_t)(exponent + 1023))) << 52;
if (sign)
data |= 0x8000000000000000ULL;
return *(double*)&data;
}
}
constexpr double my_sqrt(double x)
{
if(!x || is_reserved(x))
return x;
if(x < 0)
return -std::numeric_limits<double>::quiet_NaN();
uint64_t fraction = get_fraction(x);
int16_t exponent = get_exponent(x);
//C standard says it rounds to zero
int16_t half_exponent = ((exponent-1024)/2)+512;
uint64_t test_fraction = 0x0010000000000000ULL;
double test = make_double(0, half_exponent, test_fraction);
if (test * test > x) half_exponent -= 1;
//just to be safe
test = make_double(0, half_exponent, test_fraction);
if (test * test > x) half_exponent -= 1;
//find each bit except last one, binary search for result
for (int i = 51; i > 0; --i) {
test = make_double(0, half_exponent, test_fraction | (0x01ULL<<i));
if(test*test<x) test_fraction |= (0x01ULL << i);
}
double del1 = my_abs(x - test*test);
double temp = make_double(0, half_exponent, test_fraction | 0x01ULL);
double del2 = my_abs(x - temp * temp);
//see if the whole fraction needs to round up by one
if (x > temp * temp) {
test_fraction += 2;
//rounding up by one made the fraction too large
if (test_fraction >= 0x0020000000000000ULL) {
test_fraction >>= 1;
half_exponent -= 1;
}
double temp2 = make_double(0, half_exponent, test_fraction);
double del3 = my_abs(x - temp2 * temp2);
if(del3 <del2) return temp2;
else return temp;
}
else if(del2<del1) return temp;
else return make_double(0, half_exponent, test_fraction);
}
Edit: add some comments
Edit2: add missing functions

Related

Looking for nbit adder in c++

I was trying to build 17bit adder, when overflow occurs it should round off should appear just like int32.
eg: In int32 add, If a = 2^31 -1
int res = a+1
res= -2^31-1
Code I tried, this is not working & is there a better way. Do I need to convert decimal to binary & then perform 17bit operation
int addOvf(int32_t result, int32_t a, int32_t b)
{
int max = (-(0x01<<16))
int min = ((0x01<<16) -1)
int range_17bit = (0x01<<17);
if (a >= 0 && b >= 0 && (a > max - b)) {
printf("...OVERFLOW.........a=%0d b=%0d",a,b);
}
else if (a < 0 && b < 0 && (a < min - b)) {
printf("...UNDERFLOW.........a=%0d b=%0d",a,b);
}
result = a+b;
if(result<min) {
while(result<min){ result=result + range_17bit; }
}
else if(result>min){
while(result>max){ result=result - range_17bit; }
}
return result;
}
int main()
{
int32_t res,x,y;
x=-65536;
y=-1;
res =addOvf(res,x,y);
printf("Value of x=%0d y=%0d res=%0d",x,y,res);
return 0;
}

You have your constants for max/min int17 reversed and off by one. They should be
max_int17 = (1 << 16) - 1 = 65535
and
min_int17 = -(1 << 16) = -65536.
Then I believe that max_int_n + m == min_int_n + (m-1) and min_int_n - m == max_int_n - (m-1), where n is the bit count and m is some integer in [min_int_n, ... ,max_int_n]. So putting that all together the function to treat two int32's as though they are int17's and add them would be like
int32_t add_as_int17(int32_t a, int32_t b) {
static const int32_t max_int17 = (1 << 16) - 1;
static const int32_t min_int17 = -(1 << 16);
auto sum = a + b;
if (sum < min_int17) {
auto m = min_int17 - sum;
return max_int17 - (m - 1);
} else if (sum > max_int17) {
auto m = sum - max_int17;
return min_int17 + (m - 1);
}
return sum;
}
There is probably some more clever way to do that but I believe the above is correct, assuming I understand what you want.

C++ - How to count the length of the digits after the dot in a double / float?

How to count the length of the digits after the dot in a double / float?
Without std::string.
So the length after the dot of 1234.5678 is 4.
And how should you correctly use epsilon in such a situation?
I have something like this, one version with epsilon and another without, neither work fully.
// Version 1.
template <typename T> inline constexpr
unsigned int get_length(T x, const bool& include_dot = false) {
unsigned int len = 0;
x = abs(x);
auto c = x - floor(x);
T factor = 10;
T eps = epsilon() * c;
while (c > eps && c < 1 - eps) {
c = x * factor;
c = c - floor(c);
factor *= 10;
eps = epsilon() * x * factor;
++len;
}
if (include_dot && len > 0) { ++len; }
return len;
}
// Version 2.
template <typename T> inline constexpr
unsigned int get_length(const T& x, const bool& include_dot = false, const unsigned int& max_consequtive_zeros = 6) {
unsigned int len = 0;
unsigned int zero_count = 0;
short digit;
auto decimals = get_decimals(x);
while (decimals != 0 && len < 14) {
decimals *= 10;
digit = decimals;
if (digit == 0) {
if (zero_count >= max_consequtive_zeros) { break; }
++zero_count;
}
else { zero_count = 0; }
decimals -= digit;
++len;
// std::cout << len << ": " << decimals << " zeros: " << zero_count << "\n";
}
if (include_dot && len > 0) { ++len; }
return len - zero_count;
}

A floating point number doesn't store a value in decimal format. It stores it in binary.
For example, if you try to store 234.56 in a float, the closest value that can actually be stored is:
234.55999755859375
1234.5678 is 1234.5677490234375
(go play with https://www.h-schmidt.net/FloatConverter/IEEE754.html to see for yourself)
As such, the length (in decimal digits) of a number in a float is ill-defined. To keep your sanity, please define an epsilon based on the magnitude, not on the number of digits.

Convert C++ double to DEC double

I want to be able to take a user given double and write out in the DEC 64 dpfp format (http://www.wsmr.army.mil/RCCsite/Documents/106%20Previous%20Versions/106-07/appendixO.pdf). Having trouble getting this to line up correctly, anyone have experience or have written conversion functions for DEC types?

This seems pretty straight forward, let me take a shot at it. Note that I don't have any way of testing this for correctness.
std::vector<unsigned char> ToDEC64Float(double d)
{
uint64_t dec_bits = 0ULL;
if (d != 0.0)
{
assert(sizeof(double) == sizeof(uint64_t));
uint64_t bits = *reinterpret_cast<uint64_t*>(&d);
uint64_t fraction = bits & 0x000fffffffffffffULL;
int exp = (int)((bits >> 52) & 0x7ff) - 1023;
bool sign = (bool)(bits & 0x8000000000000000ULL);
// convert the individual values for the new format
fraction <<= 3;
exp += 1 + 128;
if (exp > 255)
throw std::overflow_error("overflow");
if (exp < 0 || (exp == 0 && fraction != 0))
throw std::underflow_error("underflow");
dec_bits = (uint64_t)sign << 63 | (uint64_t)exp << 55 | fraction;
}
std::vector<unsigned char> result;
for (int i = 0; i < 64; i+=8)
result.push_back((unsigned char)((dec_bits >> i) & 0xff));
return result;
}

double static const DECBytesToDouble(uint64_t value)
{
//DEC Byte Conversion Constants
static const float MANTISSA_CONSTANT = 0.5;
static const int32_t EXPONENT_BIAS = 128;
uint8_t * byte_array = (uint8_t*)&value;
uint8_t first = byte_array[0];
uint8_t second = byte_array[1];
uint8_t third = byte_array[2];
uint8_t fourth = byte_array[3];
uint8_t fifth = byte_array[4];
uint8_t sixth = byte_array[5];
uint8_t seventh = byte_array[6];
uint8_t eighth = byte_array[7];
// |second |first|fourth|third|sixth|fifth|eighth|seventh|
// |s|exponent|mantissa |
bool sign = second & 0x80;
std::cout<<"(DECBytesToDouble) Sign: "<<sign<<std::endl;
int32_t exponent = ((second & 0x7F) << 1) + ((first >> 7) & 0x1);
std::cout<<"(DECBytesToDouble) Exponent: "<<exponent<<std::endl;
int64_t mantissa = ((int64_t)(first & 0x7F) << 48) + ((int64_t)fourth << 40)
+ ((int64_t)third << 32) + ((int64_t)sixth << 24) + ((int64_t)fifth << 16)
+ ((int64_t)eighth << 8) + (int64_t) seventh;
std::cout<<"(DECBytesToDouble) Fraction: "<<mantissa<<std::endl;
double fraction = MANTISSA_CONSTANT;
for (int32_t i=0; i<55; i++)
{
fraction += ((mantissa >> i) & 0x1) * pow(2,i-56);
}//for
return pow(-1,sign)*fraction*pow(2,exponent-EXPONENT_BIAS);
}//DECBytesToDouble
uint64_t static const DoubleToDECBytes(double value)
{
static const int32_t EXPONENT_BIAS = 128;
uint64_t dec_bits = 0ULL;
if (value != 0.0)
{
uint64_t bits = *reinterpret_cast<uint64_t*>(&value);
uint64_t fraction = bits & 0x000fffffffffffffULL;
int exp = (int)((bits >> 52) & 0x7ff) - 1023;
bool sign = false;
if(value < 0)
{
sign = true;
}//if
std::cout<<"(DoubleToDECBytes) Sign: "<<sign<<std::endl;
// convert the individual values for the new format
fraction <<= 3;
exp += EXPONENT_BIAS + 1;
std::cout<<"(DoubleToDECBytes) Exponent: "<<exp<<std::endl;
std::cout<<"(DoubleToDECBytes) Fraction: "<<fraction<<std::endl;
if (exp > 255)
throw std::overflow_error("overflow");
if (exp < 0 || (exp == 0 && fraction != 0))
throw std::underflow_error("underflow");
dec_bits = (uint64_t)(sign << 63) | (uint64_t)(exp << 55) | fraction;
//|second |first|fourth|third|sixth|fifth|eighth|seventh|
uint8_t * byte_array = (uint8_t*)&dec_bits;
uint8_t first = byte_array[0];
uint8_t second = byte_array[1];
uint8_t third = byte_array[2];
uint8_t fourth = byte_array[3];
uint8_t fifth = byte_array[4];
uint8_t sixth = byte_array[5];
uint8_t seventh = byte_array[6];
uint8_t eighth = byte_array[7];
byte_array[7] = second;
byte_array[6] = first;
byte_array[5] = fourth;
byte_array[4] = third;
byte_array[3] = sixth;
byte_array[2] = fifth;
byte_array[1] = eighth;
byte_array[0] = seventh;
std::cout<<"(DoubleToDECBytes) Guess ="<<dec_bits<<std::endl;
}//if
/*std::vector<unsigned char> result;
for (int i = 0; i < 64; i+=8)
{
result.push_back((unsigned char)((dec_bits >> i) & 0xff));
}//for
uint64_t final_result = 0;
memcpy(&final_result, &result[0], sizeof(uint64_t));
std::cout<<"Final result: "<<final_result<<std::endl;*/
return dec_bits;
}//DoubleToDECBytes
Output:
input uint64_t value: 9707381994276473045
(DECBytesToDouble) Sign: 0
(DECBytesToDouble) Exponent: 145
(DECBytesToDouble) Fraction: 24184718387676855
output double value: 109527.7465
(DoubleToDECBytes) Sign: 0
(DoubleToDECBytes) Exponent: 145
(DoubleToDECBytes) Fraction: 24184718387676848
(DoubleToDECBytes) Guess =9705411669439479893
Converted double, uint64_t: 9705411669439479893
uint64_t difference: 1970324836993152
(DECBytesToDouble) Sign: 0
(DECBytesToDouble) Exponent: 0
(DECBytesToDouble) Fraction: 24184718387676848
output double value: 0.0000

I came to find that integrating libvaxdata C library into my C++ solution was the best way to go. In my use case situation all that was required was some byte flipping, however the routines work flawlessly.
I recommend the libvaxdata library when dealing with conversion to/from IEEE/DEC types.
http://pubs.usgs.gov/of/2005/1424/

Division not crossing over bytes

I'm trying to do division on a uint128_t that is made up of 2 uint64_ts. Weirdly enough, the function works for uint64_ts with only the lower value set and the upper value = 0. I don't understand why.
Here's the code for the division and bit shift
class uint128_t{
private:
uint64_t UPPER, LOWER;
public:
// lots of stuff
uint128_t operator<<(int shift){
uint128_t out;
if (shift >= 128)
out = uint128_t(0, 0);
else if ((128 > shift) && (shift >= 64))
out = uint128_t(LOWER << (64 - shift), 0);
else if (shift < 64)
out = uint128_t((UPPER << shift) + (LOWER >> (64 - shift)), LOWER << shift);
return out;
}
uint128_t operator<<=(int shift){
*this = *this << shift;
return *this;
}
uint128_t operator/(uint128_t rhs){
// copy of numerator = copyn
uint128_t copyn(*this), quotient = 0;// constructor: uint128_t(T), uint128_t(S, T), uint128_t(uint128_t), etc
while (copyn >= rhs){
// copy of denomiator = copyd
// temp is the current quotient bit being worked with
uint128_t copyd(rhs), temp(1);
// shift the divosr to the highest bit
while (copyn > (copyd << 1)){
copyd <<= 1;
temp <<= 1;
}
copyn -= copyd;
quotient += temp;
}
return quotient;
}
// more stuff
};
Please ignore my blatant disregard for memory management.

out = uint128_t(LOWER << (64 - shift), 0); is wrong - it should be shift - 64 instead.
As a style note, ALL_CAPITALS are usually reserved for constants only. Variables and members should use mostly lowercase.

try this:
// some bit operations stuff
const unsigned char de_brujin_bit_map_64 [] =
{
0,1,2,7,3,13,8,19,4,25,14,28,9,34,20,40,5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
63,6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
};
inline uint8_t trailing_zero_count(uint64_t x) { return x?de_brujin_bit_map_64[(lower_bit(x)*0x0218A392CD3D5DBFL) >> 58]:64; }
inline uint8_t leading_zero_count(uint64_t x) { return x?(63-de_brujin_bit_map_64[(upper_bit(x)*0x0218A392CD3D5DBFL) >> 58]):64; }
inline uint64_t lower_bit(uint64_t x) { return x & -(int64_t&)x; }
inline uint64_t upper_bit(uint64_t x)
{
if(!x) return 0;
x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; x |= x >> 32;
return (x >> 1) + 1;
}
inline uint128_t upper_bit(const uint128_t x)
{
if(x.upper()) return uint128_t(upper_bit(x.upper()), 0);
else return uint128_t(0, upper_bit(x.lower()));
}
inline uint128_t lower_bit(const uint128_t x)
{
if(x.lower()) return uint128_t(0, lower_bit(x.lower()));
else return uint128_t(lower_bit(x.upper()), 0);
}
inline uint8_t trailing_zero_count(const uint128_t& x) { return x.lower()?trailing_zero_count(x.lower()):(64+trailing_zero_count(x.upper())); }
inline uint8_t leading_zero_count(const uint128_t& x) { return x.upper()?leading_zero_count(x.upper()):(64+leading_zero_count(x.lower())); }
// division operator
uint128_t uint128_t::operator/(const uint128_t& rhs) const
{
if(rhs == 0) return uint128_t(0); // !!!! zero division
if(rhs == rhs) return uint128_t(1);
if(rhs > *this) return uint128_t(0);
if(rhs == 1) return *this;
if(!upper_ && !rhs.upper_) return uint128_t(0, lower_/rhs.lower_);
if(lower_bit(rhs) == rhs) return *this >> trailing_zero_count(rhs);
uint128_t result;
uint128_t bit_mask = upper_bit();
uint128_t denom = 1;
do
{
bit_mask >>= 1;
denom <<= 1;
if(*this & bit_mask) denom |= 1;
result <<= 1;
if(denom >= rhs) { denom -= rhs; result |= 1; }
}
while (bit_mask.lower_ != 1);
return result;
}

anyway, this version is a little bit faster :)
ensure, 4000 iterations against 127:
uint128_t divident = uint128_t(0xffffffffffffffffULL, 0xffffffffffffffffULL);
uint128_t divisor = 10;
{
uint32_t iter_count = 0;
uint128_t copyn(divident), quotient = 0;
while (copyn >= divisor)
{
++iter_count;
uint128_t copyd(divisor), temp(1);
while ((copyn >> 1) > copyd) { ++iter_count; copyd <<= 1; temp <<= 1; }
copyn -= copyd;
quotient += temp;
}
std::cout << "iterations: " << std::dec << iter_count << std::endl;
}
{
uint32_t iter_count = 0;
uint128_t bit_pos = dtl::bits::upper_bit(divident);
uint128_t tmp = 1, quotient = 0;
do
{
++iter_count;
bit_pos >>= 1;
tmp <<= 1;
if(divident & bit_pos) tmp |= 1;
quotient <<= 1;
if(tmp >= divisor) { tmp -= divisor; quotient |= 1; }
}
while (bit_pos != 1);
std::cout << "iterations: " << std::dec << iter_count << std::endl;
}

Implement division with bit-wise operator

How can I implement division using bit-wise operators (not just division by powers of 2)?
Describe it in detail.

The standard way to do division is by implementing binary long-division. This involves subtraction, so as long as you don't discount this as not a bit-wise operation, then this is what you should do. (Note that you can of course implement subtraction, very tediously, using bitwise logical operations.)
In essence, if you're doing Q = N/D:
Align the most-significant ones of N and D.
Compute t = (N - D);.
If (t >= 0), then set the least significant bit of Q to 1, and set N = t.
Left-shift N by 1.
Left-shift Q by 1.
Go to step 2.
Loop for as many output bits (including fractional) as you require, then apply a final shift to undo what you did in Step 1.

Division of two numbers using bitwise operators.
#include <stdio.h>
int remainder, divisor;
int division(int tempdividend, int tempdivisor) {
int quotient = 1;
if (tempdivisor == tempdividend) {
remainder = 0;
return 1;
} else if (tempdividend < tempdivisor) {
remainder = tempdividend;
return 0;
}
do{
tempdivisor = tempdivisor << 1;
quotient = quotient << 1;
} while (tempdivisor <= tempdividend);
/* Call division recursively */
quotient = quotient + division(tempdividend - tempdivisor, divisor);
return quotient;
}
int main() {
int dividend;
printf ("\nEnter the Dividend: ");
scanf("%d", &dividend);
printf("\nEnter the Divisor: ");
scanf("%d", &divisor);
printf("\n%d / %d: quotient = %d", dividend, divisor, division(dividend, divisor));
printf("\n%d / %d: remainder = %d", dividend, divisor, remainder);
getch();
}

int remainder =0;
int division(int dividend, int divisor)
{
int quotient = 1;
int neg = 1;
if ((dividend>0 &&divisor<0)||(dividend<0 && divisor>0))
neg = -1;
// Convert to positive
unsigned int tempdividend = (dividend < 0) ? -dividend : dividend;
unsigned int tempdivisor = (divisor < 0) ? -divisor : divisor;
if (tempdivisor == tempdividend) {
remainder = 0;
return 1*neg;
}
else if (tempdividend < tempdivisor) {
if (dividend < 0)
remainder = tempdividend*neg;
else
remainder = tempdividend;
return 0;
}
while (tempdivisor<<1 <= tempdividend)
{
tempdivisor = tempdivisor << 1;
quotient = quotient << 1;
}
// Call division recursively
if(dividend < 0)
quotient = quotient*neg + division(-(tempdividend-tempdivisor), divisor);
else
quotient = quotient*neg + division(tempdividend-tempdivisor, divisor);
return quotient;
}
void main()
{
int dividend,divisor;
char ch = 's';
while(ch != 'x')
{
printf ("\nEnter the Dividend: ");
scanf("%d", &dividend);
printf("\nEnter the Divisor: ");
scanf("%d", &divisor);
printf("\n%d / %d: quotient = %d", dividend, divisor, division(dividend, divisor));
printf("\n%d / %d: remainder = %d", dividend, divisor, remainder);
_getch();
}
}

I assume we are discussing division of integers.
Consider that I got two number 1502 and 30, and I wanted to calculate 1502/30. This is how we do this:
First we align 30 with 1501 at its most significant figure; 30 becomes 3000. And compare 1501 with 3000, 1501 contains 0 of 3000. Then we compare 1501 with 300, it contains 5 of 300, then compare (1501-5*300) with 30. At so at last we got 5*(10^1) = 50 as the result of this division.
Now convert both 1501 and 30 into binary digits. Then instead of multiplying 30 with (10^x) to align it with 1501, we multiplying (30) in 2 base with 2^n to align. And 2^n can be converted into left shift n positions.
Here is the code:
int divide(int a, int b){
if (b != 0)
return;
//To check if a or b are negative.
bool neg = false;
if ((a>0 && b<0)||(a<0 && b>0))
neg = true;
//Convert to positive
unsigned int new_a = (a < 0) ? -a : a;
unsigned int new_b = (b < 0) ? -b : b;
//Check the largest n such that b >= 2^n, and assign the n to n_pwr
int n_pwr = 0;
for (int i = 0; i < 32; i++)
{
if (((1 << i) & new_b) != 0)
n_pwr = i;
}
//So that 'a' could only contain 2^(31-n_pwr) many b's,
//start from here to try the result
unsigned int res = 0;
for (int i = 31 - n_pwr; i >= 0; i--){
if ((new_b << i) <= new_a){
res += (1 << i);
new_a -= (new_b << i);
}
}
return neg ? -res : res;
}
Didn't test it, but you get the idea.

This solution works perfectly.
#include <stdio.h>
int division(int dividend, int divisor, int origdiv, int * remainder)
{
int quotient = 1;
if (dividend == divisor)
{
*remainder = 0;
return 1;
}
else if (dividend < divisor)
{
*remainder = dividend;
return 0;
}
while (divisor <= dividend)
{
divisor = divisor << 1;
quotient = quotient << 1;
}
if (dividend < divisor)
{
divisor >>= 1;
quotient >>= 1;
}
quotient = quotient + division(dividend - divisor, origdiv, origdiv, remainder);
return quotient;
}
int main()
{
int n = 377;
int d = 7;
int rem = 0;
printf("Quotient : %d\n", division(n, d, d, &rem));
printf("Remainder: %d\n", rem);
return 0;
}

Implement division without divison operator:
You will need to include subtraction. But then it is just like you do it by hand (only in the basis of 2). The appended code provides a short function that does exactly this.
uint32_t udiv32(uint32_t n, uint32_t d) {
// n is dividend, d is divisor
// store the result in q: q = n / d
uint32_t q = 0;
// as long as the divisor fits into the remainder there is something to do
while (n >= d) {
uint32_t i = 0, d_t = d;
// determine to which power of two the divisor still fits the dividend
//
// i.e.: we intend to subtract the divisor multiplied by powers of two
// which in turn gives us a one in the binary representation
// of the result
while (n >= (d_t << 1) && ++i)
d_t <<= 1;
// set the corresponding bit in the result
q |= 1 << i;
// subtract the multiple of the divisor to be left with the remainder
n -= d_t;
// repeat until the divisor does not fit into the remainder anymore
}
return q;
}

The below method is the implementation of binary divide considering both numbers are positive. If subtraction is a concern we can implement that as well using binary operators.
Code
-(int)binaryDivide:(int)numerator with:(int)denominator
{
if (numerator == 0 || denominator == 1) {
return numerator;
}
if (denominator == 0) {
#ifdef DEBUG
NSAssert(denominator == 0, #"denominator should be greater then 0");
#endif
return INFINITY;
}
// if (numerator <0) {
// numerator = abs(numerator);
// }
int maxBitDenom = [self getMaxBit:denominator];
int maxBitNumerator = [self getMaxBit:numerator];
int msbNumber = [self getMSB:maxBitDenom ofNumber:numerator];
int qoutient = 0;
int subResult = 0;
int remainingBits = maxBitNumerator-maxBitDenom;
if (msbNumber >= denominator) {
qoutient |=1;
subResult = msbNumber - denominator;
}
else {
subResult = msbNumber;
}
while (remainingBits>0) {
int msbBit = (numerator & (1 << (remainingBits-1)))>0 ? 1 : 0;
subResult = (subResult << 1) |msbBit;
if (subResult >= denominator) {
subResult = subResult-denominator;
qoutient = (qoutient << 1) | 1;
}
else {
qoutient = qoutient << 1;
}
remainingBits--;
}
return qoutient;
}
-(int)getMaxBit:(int)inputNumber
{
int maxBit =0;
BOOL isMaxBitSet = NO;
for (int i=0; i<sizeof(inputNumber)*8; i++) {
if (inputNumber & (1 << i) ) {
maxBit = i;
isMaxBitSet=YES;
}
}
if (isMaxBitSet) {
maxBit += 1;
}
return maxBit;
}
-(int)getMSB:(int)bits ofNumber:(int)number
{
int numbeMaxBit = [self getMaxBit:number];
return number >> (numbeMaxBit -bits);
}

For integers:
public class Division {
public static void main(String[] args) {
System.out.println("Division: " + divide(100, 9));
}
public static int divide(int num, int divisor) {
int sign = 1;
if((num > 0 && divisor < 0) || (num < 0 && divisor > 0))
sign = -1;
return divide(Math.abs(num), Math.abs(divisor), Math.abs(divisor)) * sign;
}
public static int divide(int num, int divisor, int sum) {
if (sum > num) {
return 0;
}
return 1 + divide(num, divisor, sum + divisor);
}
}

With the usual caveats about C's behaviour with shifts, this ought to work for unsigned quantities regardless of the native size of an int...
static unsigned int udiv(unsigned int a, unsigned int b) {
unsigned int c = 1, result = 0;
if (b == 0) return (unsigned int)-1 /*infinity*/;
while (((int)b > 0) && (b < a)) { b = b<<1; c = c<<1; }
do {
if (a >= b) { a -= b; result += c; }
b = b>>1; c = c>>1;
} while (c);
return result;
}

This is my solution to implement division with only bitwise operations:
int align(int a, int b) {
while (b < a) b <<= 1;
return b;
}
int divide(int a, int b) {
int temp = b;
int result = 0;
b = align(a, b);
do {
result <<= 1;
if (a >= b) {
// sub(a,b) is a self-defined bitwise function for a minus b
a = sub(a,b);
result = result | 1;
}
b >>= 1;
} while (b >= temp);
return result;
}

Unsigned Long Division (JavaScript) - based on Wikipedia article: https://en.wikipedia.org/wiki/Division_algorithm:
"Long division is the standard algorithm used for pen-and-paper division of multi-digit numbers expressed in decimal notation. It shifts gradually from the left to the right end of the dividend, subtracting the largest possible multiple of the divisor (at the digit level) at each stage; the multiples then become the digits of the quotient, and the final difference is then the remainder.
When used with a binary radix, this method forms the basis for the (unsigned) integer division with remainder algorithm below."
Function divideWithoutDivision at the end wraps it to allow negative operands. I used it to solve leetcode problem "Product of Array Except Self"
function longDivision(N, D) {
let Q = 0; //quotient and remainder
let R = 0;
let n = mostSignificantBitIn(N);
for (let i = n; i >= 0; i--) {
R = R << 1;
R = setBit(R, 0, getBit(N, i));
if (R >= D) {
R = R - D;
Q = setBit(Q, i, 1);
}
}
//return [Q, R];
return Q;
}
function mostSignificantBitIn(N) {
for (let i = 31; i >= 0; i--) {
if (N & (1 << i))
return i ;
}
return 0;
}
function getBit(N, i) {
return (N & (1 << i)) >> i;
}
function setBit(N, i, value) {
return N | (value << i);
}
function divideWithoutDivision(dividend, divisor) {
let negativeResult = (dividend < 0) ^ (divisor < 0);
dividend = Math.abs(dividend);
divisor = Math.abs(divisor);
let quotient = longDivision(dividend, divisor);
return negativeResult ? -quotient : quotient;
}

All these solutions are too long. The base idea is to write the quotient (for example, 5=101) as 100 + 00 + 1 = 101.
public static Point divide(int a, int b) {
if (a < b)
return new Point(0,a);
if (a == b)
return new Point(1,0);
int q = b;
int c = 1;
while (q<<1 < a) {
q <<= 1;
c <<= 1;
}
Point r = divide(a-q, b);
return new Point(c + r.x, r.y);
}
public static class Point {
int x;
int y;
public Point(int x, int y) {
this.x = x;
this.y = y;
}
public int compare(Point b) {
if (b.x - x != 0) {
return x - b.x;
} else {
return y - b.y;
}
}
#Override
public String toString() {
return " (" + x + " " + y + ") ";
}
}

Since bit wise operations work on bits that are either 0 or 1, each bit represents a power of 2, so if I have the bits
1010
that value is 10.
Each bit is a power of two, so if we shift the bits to the right, we divide by 2
1010 --> 0101
0101 is 5
so, in general if you want to divide by some power of 2, you need to shift right by the exponent you raise two to, to get that value
so for instance, to divide by 16, you would shift by 4, as 2^^4 = 16.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Is this constexpr sqrt function portable? - c++

Related

Looking for nbit adder in c++

C++ - How to count the length of the digits after the dot in a double / float?

Convert C++ double to DEC double

Division not crossing over bytes

Implement division with bit-wise operator

Categories

Resources