I have the following question, which is actually from a coding test I recently took:
Question:
A function f(n) = a*n + b*n*(floor(log(n)/log(2))) + c*n*n*n exists.
At a particular value, let f(n) = k;
Given k, a, b, c, find n.
For a given value of k, if no n value exists, then return 0.
Limits:
1 <= n < 2^63-1
0 < a, b < 100
0 <= c < 100
0 < k < 2^63-1
The logic here is that since f(n) is purely increasing for a given a, b and c, I can find n by binary search.
The code I wrote was as follows:
#include<iostream>
#include<stdlib.h>
#include<math.h>
using namespace std;
unsigned long long logToBase2Floor(unsigned long long n){
return (unsigned long long)(double(log(n))/double(log(2)));
}
#define f(n, a, b, c) (a*n + b*n*(logToBase2Floor(n)) + c*n*n*n)
unsigned long long findNByBinarySearch(unsigned long long k, unsigned long long a, unsigned long long b, unsigned long long c){
unsigned long long low = 1;
unsigned long long high = (unsigned long long)(pow(2, 63)) - 1;
unsigned long long n;
while(low<=high){
n = (low+high)/2;
cout<<"\n\n k= "<<k;
cout<<"\n f(n,a,b,c)= "<<f(n,a,b,c)<<" low = "<<low<<" mid="<<n<<" high = "<<high;
if(f(n,a,b,c) == k)
return n;
else if(f(n,a,b,c) < k)
low = n+1;
else high = n-1;
}
return 0;
}
I then tried it with a few test cases:
int main(){
unsigned long long n, a, b, c;
n = (unsigned long long)pow(2,63)-1;
a = 99;
b = 99;
c = 99;
cout<<"\nn="<<n<<" a="<<a<<" b="<<b<<" c="<<c<<" k = "<<f(n, a, b, c);
cout<<"\nANSWER: "<<findNByBinarySearch(f(n, a, b, c), a, b, c)<<endl;
n = 1000;
cout<<"\nn="<<n<<" a="<<a<<" b="<<b<<" c="<<c<<" k = "<<f(n, a, b, c);
cout<<"\nANSWER: "<<findNByBinarySearch(f(n, a, b, c), a, b, c)<<endl;
return 0;
}
Then something weird happened.
The code works for the test case n = (unsigned long long)pow(2,63)-1;, correctly returning that value of n. But it did not work for n=1000. I printed the output and saw the following:
n=1000 a=99 b=99 c=99 k = 99000990000
k= 99000990000
f(n,a,b,c)= 4611686018427387904 low = 1 mid=4611686018427387904 high = 9223372036854775807
...
...
k= 99000990000
f(n,a,b,c)= 172738215936 low = 1 mid=67108864 high = 134217727
k= 99000990000
f(n,a,b,c)= 86369107968 low = 1 mid=33554432 high = 67108863
k= 99000990000
f(n,a,b,c)= 129553661952 low = 33554433 mid=50331648 high = 67108863**
...
...
k= 99000990000
f(n,a,b,c)= 423215328047139441 low = 37748737 mid=37748737 high = 37748737
ANSWER: 0
Something didn't seem right mathematically. How was it that the value of f(1000) was greater than the value of f(33554432)?
So I tried the same code in Python, and got the following values:
>>> f(1000, 99, 99, 99)
99000990000L
>>> f(33554432, 99, 99, 99)
3740114254432845378355200L
So, the value is definitely larger.
Questions:
What is happening exactly?
How should I solve it?
What is happening exactly?
The problem is here:
unsigned long long low = 1;
// Side note: This is simply (2ULL << 62) - 1
unsigned long long high = (unsigned long long)(pow(2, 63)) - 1;
unsigned long long n;
while (/* irrelevant */) {
n = (low + high) / 2;
// Some stuff that do not modify n...
f(n, a, b, c) // <-- Here!
}
In the first iteration, you have low = 1 and high = 2^63 - 1, which mean that n = 2^63 / 2 = 2^62. Now, let's look at f:
#define f(n, a, b, c) (/* I do not care about this... */ + c*n*n*n)
You have n^3 in f, so for n = 2^62, n^3 = 2^186, which is probably way too large for your unsigned long long (which is likely to be 64-bits long).
How should I solve it?
The main issue here is overflow when doing the binary search, so you should simply handle the overflowing case separatly.
Preamble: I am using ull_t because I am lazy, and you should avoid macro in C++, prefer using a function and let the compiler inline it. Also, I prefer a loop against using the log function to compute the log2 of an unsigned long long (see the bottom of this answer for the implementation of log2 and is_overflow).
using ull_t = unsigned long long;
constexpr auto f (ull_t n, ull_t a, ull_t b, ull_t c) {
if (n == 0ULL) { // Avoid log2(0)
return 0ULL;
}
if (is_overflow(n, a, b, c)) {
return 0ULL;
}
return a * n + b * n * log2(n) + c * n * n * n;
}
Here is slightly modified binary search version:
constexpr auto find_n (ull_t k, ull_t a, ull_t b, ull_t c) {
constexpr ull_t max = std::numeric_limits<ull_t>::max();
auto lb = 1ULL, ub = (1ULL << 63) - 1;
while (lb <= ub) {
if (ub > max - lb) {
// This should never happens since ub < 2^63 and lb <= ub so lb + ub < 2^64
return 0ULL;
}
// Compute middle point (no overflow guarantee).
auto tn = (lb + ub) / 2;
// If there is an overflow, then change the upper bound.
if (is_overflow(tn, a, b, c)) {
ub = tn - 1;
}
// Otherwize, do a standard binary search...
else {
auto val = f(tn, a, b, c);
if (val < k) {
lb = tn + 1;
}
else if (val > k) {
ub = tn - 1;
}
else {
return tn;
}
}
}
return 0ULL;
}
As you can see, there is only one test that is relevant here, which is is_overflow(tn, a, b, c) (the first test regarding lb + ub is irrelevant here since ub < 2^63 and lb <= ub < 2^63 so ub + lb < 2^64 which is ok for unsigned long long in our case).
Complete implementation:
#include <limits>
#include <type_traits>
using ull_t = unsigned long long;
template <typename T,
typename = std::enable_if_t<std::is_integral<T>::value>>
constexpr auto log2 (T n) {
T log = 0;
while (n >>= 1) ++log;
return log;
}
constexpr bool is_overflow (ull_t n, ull_t a, ull_t b, ull_t c) {
ull_t max = std::numeric_limits<ull_t>::max();
if (n > max / a) {
return true;
}
if (n > max / b) {
return true;
}
if (b * n > max / log2(n)) {
return true;
}
if (c != 0) {
if (n > max / c) return true;
if (c * n > max / n) return true;
if (c * n * n > max / n) return true;
}
if (a * n > max - c * n * n * n) {
return true;
}
if (a * n + c * n * n * n > max - b * n * log2(n)) {
return true;
}
return false;
}
constexpr auto f (ull_t n, ull_t a, ull_t b, ull_t c) {
if (n == 0ULL) {
return 0ULL;
}
if (is_overflow(n, a, b, c)) {
return 0ULL;
}
return a * n + b * n * log2(n) + c * n * n * n;
}
constexpr auto find_n (ull_t k, ull_t a, ull_t b, ull_t c) {
constexpr ull_t max = std::numeric_limits<ull_t>::max();
auto lb = 1ULL, ub = (1ULL << 63) - 1;
while (lb <= ub) {
if (ub > max - lb) {
return 0ULL; // Problem here
}
auto tn = (lb + ub) / 2;
if (is_overflow(tn, a, b, c)) {
ub = tn - 1;
}
else {
auto val = f(tn, a, b, c);
if (val < k) {
lb = tn + 1;
}
else if (val > k) {
ub = tn - 1;
}
else {
return tn;
}
}
}
return 0ULL;
}
Compile time check:
Below is a little piece of code that you can use to check if the above code at compile time (since everything is constexpr):
template <unsigned long long n, unsigned long long a,
unsigned long long b, unsigned long long c>
struct check: public std::true_type {
enum {
k = f(n, a, b, c)
};
static_assert(k != 0, "Value out of bound for (n, a, b, c).");
static_assert(n == find_n(k, a, b, c), "");
};
template <unsigned long long a,
unsigned long long b,
unsigned long long c>
struct check<0, a, b, c>: public std::true_type {
static_assert(a != a, "Ambiguous values for n when k = 0.");
};
template <unsigned long long n>
struct check<n, 0, 0, 0>: public std::true_type {
static_assert(n != n, "Ambiguous values for n when a = b = c = 0.");
};
#define test(n, a, b, c) static_assert(check<n, a, b, c>::value, "");
test(1000, 99, 99, 0);
test(1000, 99, 99, 99);
test(453333, 99, 99, 99);
test(495862, 99, 99, 9);
test(10000000, 1, 1, 0);
Note: The maximum value of k is about 2^63, so for a given triplet (a, b, c), the maximum value of n is the one such as f(n, a, b, c) < 2 ^ 63 and f(n + 1, a, b, c) >= 2 ^ 63. For a = b = c = 99, this maximum value is n = 453333 (empirically found), which is why I tested it above.
Related
The link to the particular question is as follows: https://leetcode.com/problems/nth-magical-number/. My code is showing "Time Limit Exceeded" and I cannot figure out where actually lies the error in my code. My code is as follows:
class Solution {
public:
typedef unsigned int ull;
int gcd(int A,int B)
{
if(B==0)
return A;
else
return gcd(B,A%B);
}
int nthMagicalNumber(int N, int A, int B) {
ull m;
if(A<B)
{
int t;
t=A;
A=B;
B=t;
}
ull lcm=(A*B)/gcd(A,B);
ull l=2;
ull h=1e9;
ull n;
while(l<=h)
{
m=l+(h-1)/2;
n=(m/A)+(m/B)-(m/lcm);
if(n==N)
break;
else if(n<N)
l=m+1;
else if(n>N)
h=m-1;
}
ull x=(1e9)+7;
return (int)(m%x);
}
};
Can someone let me know where I am wrong and how can I correct the error?
You might want to use a debugger for that. This solution is just very similar (using Binary Search with gcd) and would pass through:
// This block might trivially optimize the exec time;
// Can be removed;
const static auto __optimize__ = []() {
std::ios::sync_with_stdio(false);
std::cin.tie(NULL);
return 0;
}();
#include <cstdint>
#include <vector>
const static struct Solution {
using SizeType = std::uint_fast64_t;
static constexpr SizeType kMod = 1e9 + 7;
const SizeType nthMagicalNumber(
const SizeType N,
const SizeType A,
const SizeType B
) {
const SizeType lowest_common_multiple = A * B / getGreatestCommonDivisor(A, B);
SizeType lo = 2;
SizeType hi = 1e14;
while (lo < hi) {
SizeType mid = lo + (hi - lo) / 2;
if (mid / A + mid / B - mid / lowest_common_multiple < N) {
lo = mid + 1;
} else {
hi = mid;
}
}
return lo % kMod;
}
// __gcd(a, b)
static const SizeType getGreatestCommonDivisor(
const SizeType a,
const SizeType b
) {
if (not b) {
return a;
}
return getGreatestCommonDivisor(b, a % b);
}
};
Instead of ull, here we are using std::uint_fast64_t.
References
For additional details, please see the Discussion Board which you can find plenty of well-explained accepted solutions in there, with a variety of languages including efficient algorithms and asymptotic time/space complexity analysis1, 2.
You can try this approach in C++ for that question my solution got accepted:
class Solution {
public:
long long gcd(long long a, long long b) {
if(b == 0)
return a;
return gcd(b, a % b);
}
int nthMagicalNumber(long long n, long long a, long long b) {
long long end = max(a, b) * (n + 1);
long long min = 1;
long long lcm = (a / gcd(a, b)) * b;
long long curr, mid;
while(min <= end) {
mid = (end - min)/2 + min;
curr = (mid / a) + (mid / b) - (mid / lcm);
if(curr > n) {
end = mid - 1;
} else if(curr < n) {
min = mid + 1;
} else {
break;
}
}
while((mid % a != 0) && (mid % b != 0)) {
mid--;
}
return mid % (1000000007);
}
};
I'm a computer science student and i have a problem where i must use the fast modular exponentation.
With this code that i made, the corrector says me that the output in some cases is incorrect, but it shouldn't be.
unsigned long long int pow(int a, int n, int M)
{
if(n==0)
return 1;
if(n==1)
return a;
unsigned long long tmp=pow(a, n/2, M)%M;
if(n%2==0)
return ((tmp)*(tmp))%M;
return ((tmp*tmp)*(a%M))%M;
}
Instead with this other code i pass all the test cases.
unsigned long long int pow(int a, int n, int M)
{
if(n==0)
return 1;
if(n==1)
return a;
unsigned long long tmp;
if(n%2==0){
tmp=pow(a, n/2, M)%M;
return (tmp*tmp)%M;
}
tmp=pow(a, n-1, M)%M;
return (tmp*(a%M))%M;
}
So my question is why with the first code i don't pass all the test cases?
First, if n == 1, the return value should be a % M, not a. Second, the product (tmp * tmp) * (a % M) can overflow, and should be computed as ((tmp * tmp) % M) * (a % M).
The condition n == 1 doesn't need any special treatment, and the code can be simplified to:
unsigned long long int pow(unsigned int a, unsigned int n, unsigned int m) {
if (n == 0)
return 1;
auto tmp = pow(a, n / 2, m) % m;
tmp *= tmp;
if (n % 2)
tmp = (tmp % m) * (a % m);
return tmp % m;
}
Problem statement : Given two integers n and m, output Fn mod m (that is, the remainder of Fn when divided by m).
Input Format. The input consists of two integers n and m given on the same line (separated by a space).
Constraints. 1 ≤ n ≤ 10^18, 2 ≤ m ≤ 10^5
Output Format. Output Fn mod m.
I tried the following program and it didn't work. The method pi is returning the right Pisano period though for any number as per http://webspace.ship.edu/msrenault/fibonacci/fiblist.htm
#include <iostream>
long long pi(long long m) {
long long result = 2;
for (long long fn2 = 1, fn1 = 2 % m, fn = 3 % m;
fn1 != 1 || fn != 1;
fn2 = fn1, fn1 = fn, fn = (fn1 + fn2) % m
) {
result++;
}
return result;
}
long long get_fibonaccihuge(long long n, long long m) {
long long periodlength = pi(m);
int patternRemainder = n % periodlength;
long long *sum = new long long[patternRemainder];
sum[0] = 0;
sum[1] = 1;
for (int i = 2; i <= patternRemainder; ++i)
{
sum[i] = sum[i - 1] + sum[i - 2];
}
return sum[patternRemainder] % m;
}
int main() {
long long n, m;
std::cin >> n >> m;
std::cout << get_fibonaccihuge(n, m) << '\n';
}
The exact program/logic is working well in python as expected. What's wrong withthis cpp program ? Is it the data types ?
Performing 10^18 additions isn't going to be very practical. Even on a teraflop computer, 10^6 seconds is still 277 hours.
But 10^18 ~= 2^59.8 so there'll be up to 60 halving steps.
Calculate (a,b) --> (a^2 + b^2, 2ab + b^2) to go from (n-1,n)th to (2n-1,2n)th consecutive Fibonacci number pairs in one step.
At each step perform the modulus calculation for each operation. You'll need to accommodate integers up to 3*1010 ≤ 235 in magnitude (i.e. up to 35 bits).
(cf. a related older answer of mine).
This was my solution for this problem, it works well and succeeded in the submission test ...
i used a simpler way to get the pisoano period ( pisano period is the main tricky part in this problem ) ... i wish to be helpful
#include <iostream>
using namespace std;
unsigned long long get_fibonacci_huge_naive(unsigned long long n, unsigned long long m)
{
if (n <= 1)
return n;
unsigned long long previous = 0;
unsigned long long current = 1;
for (unsigned long long i = 0; i < n - 1; ++i)
{
unsigned long long tmp_previous = previous;
previous = current;
current = tmp_previous + current;
}
return current % m;
}
long long get_pisano_period(long long m)
{
long long a = 0, b = 1, c = a + b;
for (int i = 0; i < m * m; i++)
{
c = (a + b) % m;
a = b;
b = c;
if (a == 0 && b == 1)
{
return i + 1;
}
}
}
unsigned long long get_fibonacci_huge_faster(unsigned long long n, unsigned long long m)
{
n = n % get_pisano_period(m);
unsigned long long F[n + 1] = {};
F[0] = 0;
F[-1] = 1;
for (int i = 1; i <= n; i++)
{
F[i] = F[i - 1] + F[i - 2];
F[i] = F[i] % m;
}
return F[n];
}
int main()
{
unsigned long long n, m;
std::cin >> n >> m;
std::cout << get_fibonacci_huge_faster(n, m) << '\n';
}
I'm trying to solve a coding challenge on hacker rank which requires one to calculate binomial coefficients mod a prime, i.e.
nchoosek(n, k, p)
I'm using the code from this answer that works for the first three sets of inputs but begins failing on the 4th. I stepped through it in the debugger and determined that the issue arises when:
n % p == 0 || k % p == 0
I just need to know how to modify my current solution to handle the specific cases where n % p == 0 or k % p == 0. None of the answers I've found on stack exchange seem to address this specific case. Here's my code:
#include <iostream>
#include <fstream>
long long FactorialExponent(long long n, long long p)
{
long long ex = 0;
do
{
n /= p;
ex += n;
}while(n > 0);
return ex;
}
unsigned long long ModularMultiply(unsigned long long a, unsigned long long b, unsigned long p) {
unsigned long long a1 = (a >> 21), a2 = a & ((1ull << 21) - 1);
unsigned long long temp = (a1 * b) % p; // doesn't overflow under the assumptions
temp = (temp << 21) % p; // this neither
temp += (a2 * b) % p; // nor this
return temp % p;
}
unsigned long long ModularInverse(unsigned long long k, unsigned long m) {
if (m == 0) return (k == 1 || k == -1) ? k : 0;
if (m < 0) m = -m;
k %= m;
if (k < 0) k += m;
int neg = 1;
unsigned long long p1 = 1, p2 = 0, k1 = k, m1 = m, q, r, temp;
while(k1 > 0) {
q = m1 / k1;
r = m1 % k1;
temp = q*p1 + p2;
p2 = p1;
p1 = temp;
m1 = k1;
k1 = r;
neg = !neg;
}
return neg ? m - p2 : p2;
}
// Preconditions: 0 <= k <= min(n,p-1); p > 1 prime
unsigned long long ChooseModTwo(unsigned long long n, unsigned long long k, unsigned long p)
{
// reduce n modulo p
n %= p;
// Trivial checks
if (n < k) {
return 0;
}
if (k == 0 || k == n) {
return 1;
}
// Now 0 < k < n, save a bit of work if k > n/2
if (k > n/2) {
k = n-k;
}
// calculate numerator and denominator modulo p
unsigned long long num = n, den = 1;
for(n = n-1; k > 1; --n, --k)
{
num = ModularMultiply(num, n, p);
den = ModularMultiply(den, k, p);
}
den = ModularInverse(den,p);
return ModularMultiply(num, den, p);
}
// Preconditions: 0 <= k <= n; p > 1 prime
long long ChooseModOne(long long n, long long k, const unsigned long p)
{
// For small k, no recursion is necessary
if (k < p) return ChooseModTwo(n,k,p);
unsigned long long q_n, r_n, q_k, r_k, choose;
q_n = n / p;
r_n = n % p;
q_k = k / p;
r_k = k % p;
choose = ChooseModTwo(r_n, r_k, p);
// If the exponent of p in choose(n,k) isn't determined to be 0
// before the calculation gets serious, short-cut here:
// if (choose == 0) return 0;
return ModularMultiply(choose, ChooseModOne(q_n, q_k, p), p);
}
unsigned long long ModularBinomialCoefficient(unsigned long long n, unsigned long long k, const unsigned long p)
{
// We deal with the trivial cases first
if (k < 0 || n < k) return 0;
if (k == 0 || k == n) return 1;
// Now check whether choose(n,k) is divisible by p
if (FactorialExponent(n, p) > FactorialExponent(k, p) + FactorialExponent(n - k, p)) return 0;
// If it's not divisible, do the generic work
return ChooseModOne(n, k, p);
}
int main() {
//std::ifstream fin ("input03.txt");
std::ifstream fin ("test.in");
int kMod = 1000003;
int T;
fin >> T;
int N = T;
//std::cin >> T;
unsigned long long n, k;
unsigned long long a, b;
int result[N];
int index = 0;
while (T--) {
fin >> n >> k;
a = ModularBinomialCoefficient(n - 3, k, kMod);
b = ModularBinomialCoefficient(n + k, n - 1, kMod);
// (1 / (n + k) * nCk(n - 3, k) * nCk(n + k, n - 1)) % 1000003
unsigned long long x = ModularMultiply(a, b, kMod);
unsigned long long y = ModularMultiply(x, ModularInverse((n + k), kMod), kMod);
result[index] = y;
index++;
}
for(int i = 0; i < N; i++) {
std::cout << result[i] << "\n";
}
return 0;
}
Input:
6
90 13
65434244 16341234
23424244 12341234
424175 341198
7452123 23472
56000168 16000048
Output:
815483
715724
92308
903465
241972
0 <-- Incorrect, should be: 803478
Constraints:
4 <= N <= 10^9
1 <= K <= N
You can use Lucas' theorem to reduce the problem to ceil(log_P(N)) subproblems with k, n < p: Write n = n_m * p^m + ... + n_0 and k = k_m * p^m + ... + k_0 in base p (n_i, k_i < p are the digits), then we have
C(n,k) = PROD(i = 0 to m, C(n_i, k_i)) (mod p)
The subproblems are easy to solve, because every factor of k! has an inverse modulo p. You get an algorithm with runtime complexity O(p log(n)), which is better than that of Ivaylo's code in case of p << n, if I understand it correctly.
int powmod(int x, int e, int p) {
if (e == 0) return 1;
if (e & 1) return (long long)x * powmod(x, e - 1, p) % p;
long long rt = powmod(x, e / 2, p);
return rt * rt % p;
}
int binom_coeff_mod_prime(int n, int k, int p) {
long long res = 1;
while (n || k) {
int N = n % p, K = k % p;
for (int i = N - K + 1; i <= N; ++i)
res = res * i % p;
for (int i = 1; i <= K; ++i)
res = res * powmod(i, p - 2, p) % p;
n /= p;
k /= p;
}
return res;
}
I suggest you use factorization to compute the number of combinations without division. I've got code for doing so here, originally inspired by Fast computation of multi-category number of combinations (I still would like to post a proper answer to that, if some kind souls would reopen it).
My code stores the result as a table of factors, doing the modular multiplication to expand the result should be quite straightforward.
Probably not practical for n in the range of 10**9, though, since the sieve will be quite massive and take a while to construct.
Is there a way to build e.g. (853467 * 21660421200929) % 100000000000007 without BigInteger libraries (note that each number fits into a 64 bit integer but the multiplication result does not)?
This solution seems inefficient:
int64_t mulmod(int64_t a, int64_t b, int64_t m) {
if (b < a)
std::swap(a, b);
int64_t res = 0;
for (int64_t i = 0; i < a; i++) {
res += b;
res %= m;
}
return res;
}
You should use Russian Peasant multiplication. It uses repeated doubling to compute all the values (b*2^i)%m, and adds them in if the ith bit of a is set.
uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m) {
int64_t res = 0;
while (a != 0) {
if (a & 1) res = (res + b) % m;
a >>= 1;
b = (b << 1) % m;
}
return res;
}
It improves upon your algorithm because it takes O(log(a)) time, not O(a) time.
Caveats: unsigned, and works only if m is 63 bits or less.
Keith Randall's answer is good, but as he said, a caveat is that it works only if m is 63 bits or less.
Here is a modification which has two advantages:
It works even if m is 64 bits.
It doesn't need to use the modulo operation, which can be expensive on some processors.
(Note that the res -= m and temp_b -= m lines rely on 64-bit unsigned integer overflow in order to give the expected results. This should be fine since unsigned integer overflow is well-defined in C and C++. For this reason it's important to use unsigned integer types.)
uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m) {
uint64_t res = 0;
uint64_t temp_b;
/* Only needed if b may be >= m */
if (b >= m) {
if (m > UINT64_MAX / 2u)
b -= m;
else
b %= m;
}
while (a != 0) {
if (a & 1) {
/* Add b to res, modulo m, without overflow */
if (b >= m - res) /* Equiv to if (res + b >= m), without overflow */
res -= m;
res += b;
}
a >>= 1;
/* Double b, modulo m */
temp_b = b;
if (b >= m - b) /* Equiv to if (2 * b >= m), without overflow */
temp_b -= m;
b += temp_b;
}
return res;
}
Both methods work for me. The first one is the same as yours, but I changed your numbers to excplicit ULL. Second one uses assembler notation, which should work faster.
There are also algorithms used in cryptography (RSA and RSA based cryptography mostly I guess), like already mentioned Montgomery reduction as well, but I think it will take time to implement them.
#include <algorithm>
#include <iostream>
__uint64_t mulmod1(__uint64_t a, __uint64_t b, __uint64_t m) {
if (b < a)
std::swap(a, b);
__uint64_t res = 0;
for (__uint64_t i = 0; i < a; i++) {
res += b;
res %= m;
}
return res;
}
__uint64_t mulmod2(__uint64_t a, __uint64_t b, __uint64_t m) {
__uint64_t r;
__asm__
( "mulq %2\n\t"
"divq %3"
: "=&d" (r), "+%a" (a)
: "rm" (b), "rm" (m)
: "cc"
);
return r;
}
int main() {
using namespace std;
__uint64_t a = 853467ULL;
__uint64_t b = 21660421200929ULL;
__uint64_t c = 100000000000007ULL;
cout << mulmod1(a, b, c) << endl;
cout << mulmod2(a, b, c) << endl;
return 0;
}
An improvement to the repeating doubling algorithm is to check how many bits at once can be calculated without an overflow. An early exit check can be done for both arguments -- speeding up the (unlikely?) event of N not being prime.
e.g. 100000000000007 == 0x00005af3107a4007, which allows 16 (or 17) bits to be calculated per each iteration. The actual number of iterations will be 3 with the example.
// just a conceptual routine
int get_leading_zeroes(uint64_t n)
{
int a=0;
while ((n & 0x8000000000000000) == 0) { a++; n<<=1; }
return a;
}
uint64_t mulmod(uint64_t a, uint64_t b, uint64_t n)
{
uint64_t result = 0;
int N = get_leading_zeroes(n);
uint64_t mask = (1<<N) - 1;
a %= n;
b %= n; // Make sure all values are originally in the proper range?
// n is not necessarily a prime -- so both a & b can end up being zero
while (a>0 && b>0)
{
result = (result + (b & mask) * a) % n; // no overflow
b>>=N;
a = (a << N) % n;
}
return result;
}
You could try something that breaks the multiplication up into additions:
// compute (a * b) % m:
unsigned int multmod(unsigned int a, unsigned int b, unsigned int m)
{
unsigned int result = 0;
a %= m;
b %= m;
while (b)
{
if (b % 2 != 0)
{
result = (result + a) % m;
}
a = (a * 2) % m;
b /= 2;
}
return result;
}
a * b % m equals a * b - (a * b / m) * m
Use floating point arithmetic to approximate a * b / m. The approximation leaves a value small enough for normal 64 bit integer operations, for m up to 63 bits.
This method is limited by the significand of a double, which is usually 52 bits.
uint64_t mod_mul_52(uint64_t a, uint64_t b, uint64_t m) {
uint64_t c = (double)a * b / m - 1;
uint64_t d = a * b - c * m;
return d % m;
}
This method is limited by the significand of a long double, which is usually 64 bits or larger. The integer arithmetic is limited to 63 bits.
uint64_t mod_mul_63(uint64_t a, uint64_t b, uint64_t m) {
uint64_t c = (long double)a * b / m - 1;
uint64_t d = a * b - c * m;
return d % m;
}
These methods require that a and b be less than m. To handle arbitrary a and b, add these lines before c is computed.
a = a % m;
b = b % m;
In both methods, the final % operation could be made conditional.
return d >= m ? d % m : d;
I can suggest an improvement for your algorithm.
You actually calculate a * b iteratively by adding each time b, doing modulo after each iteration. It's better to add each time b * x, whereas x is determined so that b * x won't overflow.
int64_t mulmod(int64_t a, int64_t b, int64_t m)
{
a %= m;
b %= m;
int64_t x = 1;
int64_t bx = b;
while (x < a)
{
int64_t bb = bx * 2;
if (bb <= bx)
break; // overflow
x *= 2;
bx = bb;
}
int64_t ans = 0;
for (; x < a; a -= x)
ans = (ans + bx) % m;
return (ans + a*b) % m;
}