Calculating Catalan Numbers mod prime number - c++

The following is the problem description:
let c[n] be the catalan number for n and p be a large prime eg.1000000007
I need to calculate c[n] % p where n ranges from {1,2,3,...,1000}
The problem which I am having is that on a 32 bit machine you get overflow when you calculate catalan number for such large integer. I am familiar with modulo arithmetic. Also
(a.b) % p = ((a % p)(b % p)) % p
this formula helps me to get away with the overflow in numerator separately but I have no idea how to deal with denominators.

For a modulus of 1000000007, avoiding overflow with only 32-bit integers is cumbersome. But any decent C implementation provides 64-bit integers (and any decent C++ implementation does too), so that shouldn't be necessary.
Then to deal with the denominators, one possibility is, as KerrekSB said in his comment, to calculate the modular inverse of the denominators modulo the prime p = 1000000007. You can calculate the modular inverse with the extended Euclidean algorithm or, equivalently, the continued fraction expansion of k/p. Then instead of dividing by k in the calculation, you multiply by its modular inverse.
Another option is to use Segner's recurrence relation for the Catalan numbers, which gives a calculation without divisions:
C(0) = 1
n
C(n+1) = ∑ C(i)*C(n-i)
0
Since you only need the Catalan numbers C(k) for k <= 1000, you can precalculate them, or quickly calculate them at program startup and store them in a lookup table.
If contrary to expectation no 64-bit integer type is available, you can calculate the modular product by splitting the factors into low and high 16 bits,
a = a1 + (a2 << 16) // 0 <= a1, a2 < (1 << 16)
b = b1 + (b2 << 16) // 0 <= b1, b2 < (1 << 16)
a*b = a1*b1 + (a1*b2 << 16) + (a2*b1 << 16) + (a2*b2 << 32)
To calculate a*b (mod m) with m <= (1 << 31), reduce each of the four products modulo m,
p1 = (a1*b1) % m;
p2 = (a1*b2) % m;
p3 = (a2*b1) % m;
p4 = (a2*b2) % m;
and the simplest way to incorporate the shifts is
for(i = 0; i < 16; ++i) {
p2 *= 2;
if (p2 >= m) p2 -= m;
}
the same for p3 and with 32 iterations for p4. Then
s = p1+p2;
if (s >= m) s -= m;
s += p3;
if (s >= m) s -= m;
s += p4;
if (s >= m) s -= m;
return s;
That way is not very fast, but for the few multiplications needed here, it's fast enough. A small speedup should be obtained by reducing the number of shifts; first calculate (p4 << 16) % m,
for(i = 0; i < 16; ++i) {
p4 *= 2;
if (p4 >= m) p4 -= m;
}
then all of p2, p3 and the current value of p4 need to be multiplied with 216 modulo m,
p4 += p3;
if (p4 >= m) p4 -= m;
p4 += p2;
if (p4 >= m) p4 -= m;
for(i = 0; i < 16; ++i) {
p4 *= 2;
if (p4 >= m) p4 -= m;
}
s = p4+p1;
if (s >= m) s -= m;
return s;

what about if you store the results using dynamic programming and while populating the lookup table, you can use MODULO division at each step. It will take care of the overflow for the 1000 Catalans and also will be faster than BigDecimal/BigInteger.
My solution:
public class Catalan {
private static long [] catalan= new long[1001];
private static final int MOD=1000000007;
public static void main(String[] args) {
precalc();
for (int i=1;i<=1000;i++){
System.out.println("Catalan number for "+i+" is: "+catalan[i]);
}
}
private static void precalc(){
for (int i=0;i<=1000;i++){
if (i==0 || i==1){
catalan[i]=1;
}
else{
long sum =0;long left, right;
for (int k=1;k<=i;k++){
left = catalan[k-1] % MOD;
right= catalan[i-k] % MOD;
sum =(sum+ (left * right)%MOD)%MOD;
}
catalan[i]=sum;
}
}
}
}

What about using a library for big integers? Try googling for it...

#include <stdio.h>
#include <stdlib.h>
/*
C(n) = (2n)!/(n+1)!n!
= (2n)(2n-1)(2n-2)..(n+2)/n!
*/
int p = 1000000007;
int gcd(int x, int y){
while(y!=0){
int wk = x % y;
x = y;
y = wk;
}
return x;
}
int catalanMod(n){
long long c = 1LL;
int i;
int *list,*wk;
//make array [(2n),(2n-1),(2n-2)..(n+2)]
wk = list = (int*)malloc(sizeof(int)*(n-1));
for(i=n+2;i<=2*n;++i){
*wk++ = i;
}
wk=list;
//[(2n),(2n-1),(2n-2)..(n+2)] / [1,2,3,..n]
//E.g C(10)=[13,17,19,4]
for(i=2;i<=n;++i){
int j,k,w;
for(w=i,j=0;j<n-1;++j){
while(1!=(k = gcd(wk[j], w))){
wk[j] /= k;
w /= k;
}
if(w == 1) break;
}
}
wk=list;
//Multiplication and modulo reduce
for(i=0;i<n-1;++i){
if(wk[i]==1)continue;
c = c * wk[i] % p;
}
free(list);
return c;
}

Simply, use the property, (a * b) % mod = (a % mod) * (b % mod)

Related

Want to minimize my code so it consumes time less than 1 sec. It uses concept of modular exponentiation .Correct output but exceeding time limit

The below code is to calculate 2^n where n is equal to 1 <= n <= 10^5. So to calculate such large numbers I have used concept of modular exponentian. The code is giving correct output but due to large number of test cases it is exceeding the time limit. I am not getting a way to minimize the solution so it consumes less time. As the "algo" function is called as many times as the number of test cases. So I want to put the logic used in "algo" function in the main() function so it consumes time less than 1 sec and also gives the correct output. Here "t" represents number of test cases and it's value is 1 <= t <= 10^5.
Any suggestions from your side would be of great help!!
#include<iostream>
#include<math.h>
using namespace std;
int algo(int x, int y){
long m = 1000000007;
if(y == 0){
return 1;
}
int k = algo(x,y/2);
if (y % 2 == 1){
return ((((1ll * k * k) % m) * x) % m);
} else if (y % 2 == 0){
return ((1ll * k * k) % m);
}
}
int main(void)
{
int n, t, k;
cin>>t; //t = number of test cases
for ( k = 0; k < t; k++)
{
cin >> n; //power of 2
cout<<"the value after algo is: "<<algo(2,n)<<endl;
}
return 0;
}
You can make use of binary shifts to find powers of two
#include <iostream>
using namespace std;
int main()
{
unsigned long long u = 1, w = 2, n = 10, p = 1000000007, r;
//n -> power of two
while (n != 0)
{
if ((n & 0x1) != 0)
u = (u * w) % p;
if ((n >>= 1) != 0)
w = (w * w) % p;
}
r = (unsigned long)u;
cout << r;
return 0;
}
This is the function that I often use to calculate
Any integer X raised to power Y modulo M
C++ Function to calculate (X^Y) mod M
int power(int x, int y, const int mod = 1e9+7)
{
int result = 1;
x = x % mod;
if (x == 0)
return 0;
while (y > 0)
{
if (y & 1)
result = ( (result % mod) * (x % mod) ) % mod;
y = y >> 1; // y = y / 2
x = ( (x % mod) * (x % mod) ) % mod;
}
return result;
}
Remove the Mod if you don't want.
Time Complexity of this Function is O(log2(Y))
There can be a case of over flow so use int , long , long long etc as per your need.
Well your variables won't sustain the boundary test cases, introducing 2^10000, 1 <= n <= 10^5. RIP algorithms
19950631168807583848837421626835850838234968318861924548520089498529438830221946631919961684036194597899331129423209124271556491349413781117593785932096323957855730046793794526765246551266059895520550086918193311542508608460618104685509074866089624888090489894838009253941633257850621568309473902556912388065225096643874441046759871626985453222868538161694315775629640762836880760732228535091641476183956381458969463899410840960536267821064621427333394036525565649530603142680234969400335934316651459297773279665775606172582031407994198179607378245683762280037302885487251900834464581454650557929601414833921615734588139257095379769119277800826957735674444123062018757836325502728323789270710373802866393031428133241401624195671690574061419654342324638801248856147305207431992259611796250130992860241708340807605932320161268492288496255841312844061536738951487114256315111089745514203313820202931640957596464756010405845841566072044962867016515061920631004186422275908670900574606417856951911456055068251250406007519842261898059237118054444788072906395242548339221982707404473162376760846613033778706039803413197133493654622700563169937455508241780972810983291314403571877524768509857276937926433221599399876886660808368837838027643282775172273657572744784112294389733810861607423253291974813120197604178281965697475898164531258434135959862784130128185406283476649088690521047580882615823961985770122407044330583075869039319604603404973156583208672105913300903752823415539745394397715257455290510212310947321610753474825740775273986348298498340756937955646638621874569499279016572103701364433135817214311791398222983845847334440270964182851005072927748364550578634501100852987812389473928699540834346158807043959118985815145779177143619698728131459483783202081474982171858011389071228250905826817436220577475921417653715687725614904582904992461028630081535583308130101987675856234343538955409175623400844887526162643568648833519463720377293240094456246923254350400678027273837755376406726898636241037491410966718557050759098100246789880178271925953381282421954028302759408448955014676668389697996886241636313376393903373455801407636741877711055384225739499110186468219696581651485130494222369947714763069155468217682876200362777257723781365331611196811280792669481887201298643660768551639860534602297871557517947385246369446923087894265948217008051120322365496288169035739121368338393591756418733850510970271613915439590991598154654417336311656936031122249937969999226781732358023111862644575299135758175008199839236284615249881088960232244362173771618086357015468484058622329792853875623486556440536962622018963571028812361567512543338303270029097668650568557157505516727518899194129711337690149916181315171544007728650573189557450920330185304847113818315407324053319038462084036421763703911550639789000742853672196280903477974533320468368795868580237952218629120080742819551317948157624448298518461509704888027274721574688131594750409732115080498190455803416826949787141316063210686391511681774304792596709376
Fear not my friend, someone did tried to solve the problem https://www.quora.com/What-is-2-raised-to-the-power-of-50-000, you are looking for Piyush Michael's answer , here is his sample code
#include <stdio.h>
int main()
{
int ul=16,000;
int rs=50,000;
int s=0,carry[ul],i,j,k,ar[ul];
ar[0]=2;
for(i=1;i<ul;i++)ar[i]=0;
for(j=1;j<rs;j++)
{for(k=0;k<ul;k++)carry[k]=0;
for(i=0;i<ul;i++)
{ar[i]=ar[i]*2+carry[i];
if(ar[i]>9)
{carry[i+1]=ar[i]/10;
ar[i]=ar[i]%10;
}
}
}
for(j=ul-1;j>=0;j--)printf("%d",ar[j]);
for(i=0;i<ul-1;i++)s+=ar[i];
printf("\n\n%d",s);
}

Floating point exception with big numbers and inaccurate results when changing integers to longs

I was given this problem which gives you 2 integers n and m, n representing the number of figures of the second number, and the second number is a binary number. For example:
n= 2
m= 11
After figuring out the binary number, you have to get that number to be 0. You are only allowed to substract 1 from the binary or divide by two. For example, in this case it would be:
3->2->1 so the result is 2 since there were 2 operations made
My program does work and displays the binary number and the number of operations. But when given big numbers, it throws floating point exception. My theory is that it is because ints are too short. But when I change them to longs, it throws a completely inaccurate result.
#include <bits/stdc++.h>
using namespace std;
int main()
{
int n, m, g, l, f, x, k, w;
cin >> n >> m;
g = pow(10, n);
k = pow(2, n - 1);
for (int i = 1; i <= n; i++)
{
l = m % g;
f = l / (g / 10);
if (f == 1)
{
x += k;
}
k /= 2;
g /= 10;
}
cout << x << endl;
while (x != 1)
{
if (x % 2 == 0)
{
x /= 2;
}
else
{
x -= 1;
}
w++;
}
cout << w;
return 0;
}

Modulo of difference of terms

As an answer to a particular problem, I have to print n*k^n - (n-1)*k.
for(i=0;i<n;i++){
c=(c%p*k%p)%p;
c=(c%p*n%p)%p;
d=((n-1)%p*k%p)%p;
s=(c%p-d%p)%p;
cout<<s<<endl;
}
Initially c=1, p=1000000007 and s is my final answer.
I have to take the modulo of s with respect to p.
For large values of n, s becomes negative. This happens because the modulo value changes. So even if c>d, it is possible that c%p<d%p. For n=1000000000 and k=25, s=-727999801. I am not being able to think of a suitable workaround.
-2 % 7 = 5, because -2 = 7 * (-1) + 5, while c++ modulo operation would return -2, so to get positive number you just need to add p.
if (s < 0) s += p;
I advise you to rewrite your code in the following way:
for (int i = 0; i < n; i++) {
c = (c * (k % p)) % p;
}
c = (c * (n % p)) % p;
int d = ((n - 1) % p * (k % p)) % p;
int s = (c - d) % p;
if (s < 0) s += p;
cout << s << endl;
To check with some small inputs you can use the following line:
cout << (n * (int)pow(k, n) - (n-1)*k) % p << endl;
Try to run with this input:
const int n = 5;
const int p = 7;
const int k = 10;
int c = 1;
You will see that without if (s < 0) s += p; it is -1. This line fixes it to 6 - the right answer.

Ways to do modulo multiplication with primitive types

Is there a way to build e.g. (853467 * 21660421200929) % 100000000000007 without BigInteger libraries (note that each number fits into a 64 bit integer but the multiplication result does not)?
This solution seems inefficient:
int64_t mulmod(int64_t a, int64_t b, int64_t m) {
if (b < a)
std::swap(a, b);
int64_t res = 0;
for (int64_t i = 0; i < a; i++) {
res += b;
res %= m;
}
return res;
}
You should use Russian Peasant multiplication. It uses repeated doubling to compute all the values (b*2^i)%m, and adds them in if the ith bit of a is set.
uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m) {
int64_t res = 0;
while (a != 0) {
if (a & 1) res = (res + b) % m;
a >>= 1;
b = (b << 1) % m;
}
return res;
}
It improves upon your algorithm because it takes O(log(a)) time, not O(a) time.
Caveats: unsigned, and works only if m is 63 bits or less.
Keith Randall's answer is good, but as he said, a caveat is that it works only if m is 63 bits or less.
Here is a modification which has two advantages:
It works even if m is 64 bits.
It doesn't need to use the modulo operation, which can be expensive on some processors.
(Note that the res -= m and temp_b -= m lines rely on 64-bit unsigned integer overflow in order to give the expected results. This should be fine since unsigned integer overflow is well-defined in C and C++. For this reason it's important to use unsigned integer types.)
uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m) {
uint64_t res = 0;
uint64_t temp_b;
/* Only needed if b may be >= m */
if (b >= m) {
if (m > UINT64_MAX / 2u)
b -= m;
else
b %= m;
}
while (a != 0) {
if (a & 1) {
/* Add b to res, modulo m, without overflow */
if (b >= m - res) /* Equiv to if (res + b >= m), without overflow */
res -= m;
res += b;
}
a >>= 1;
/* Double b, modulo m */
temp_b = b;
if (b >= m - b) /* Equiv to if (2 * b >= m), without overflow */
temp_b -= m;
b += temp_b;
}
return res;
}
Both methods work for me. The first one is the same as yours, but I changed your numbers to excplicit ULL. Second one uses assembler notation, which should work faster.
There are also algorithms used in cryptography (RSA and RSA based cryptography mostly I guess), like already mentioned Montgomery reduction as well, but I think it will take time to implement them.
#include <algorithm>
#include <iostream>
__uint64_t mulmod1(__uint64_t a, __uint64_t b, __uint64_t m) {
if (b < a)
std::swap(a, b);
__uint64_t res = 0;
for (__uint64_t i = 0; i < a; i++) {
res += b;
res %= m;
}
return res;
}
__uint64_t mulmod2(__uint64_t a, __uint64_t b, __uint64_t m) {
__uint64_t r;
__asm__
( "mulq %2\n\t"
"divq %3"
: "=&d" (r), "+%a" (a)
: "rm" (b), "rm" (m)
: "cc"
);
return r;
}
int main() {
using namespace std;
__uint64_t a = 853467ULL;
__uint64_t b = 21660421200929ULL;
__uint64_t c = 100000000000007ULL;
cout << mulmod1(a, b, c) << endl;
cout << mulmod2(a, b, c) << endl;
return 0;
}
An improvement to the repeating doubling algorithm is to check how many bits at once can be calculated without an overflow. An early exit check can be done for both arguments -- speeding up the (unlikely?) event of N not being prime.
e.g. 100000000000007 == 0x00005af3107a4007, which allows 16 (or 17) bits to be calculated per each iteration. The actual number of iterations will be 3 with the example.
// just a conceptual routine
int get_leading_zeroes(uint64_t n)
{
int a=0;
while ((n & 0x8000000000000000) == 0) { a++; n<<=1; }
return a;
}
uint64_t mulmod(uint64_t a, uint64_t b, uint64_t n)
{
uint64_t result = 0;
int N = get_leading_zeroes(n);
uint64_t mask = (1<<N) - 1;
a %= n;
b %= n; // Make sure all values are originally in the proper range?
// n is not necessarily a prime -- so both a & b can end up being zero
while (a>0 && b>0)
{
result = (result + (b & mask) * a) % n; // no overflow
b>>=N;
a = (a << N) % n;
}
return result;
}
You could try something that breaks the multiplication up into additions:
// compute (a * b) % m:
unsigned int multmod(unsigned int a, unsigned int b, unsigned int m)
{
unsigned int result = 0;
a %= m;
b %= m;
while (b)
{
if (b % 2 != 0)
{
result = (result + a) % m;
}
a = (a * 2) % m;
b /= 2;
}
return result;
}
a * b % m equals a * b - (a * b / m) * m
Use floating point arithmetic to approximate a * b / m. The approximation leaves a value small enough for normal 64 bit integer operations, for m up to 63 bits.
This method is limited by the significand of a double, which is usually 52 bits.
uint64_t mod_mul_52(uint64_t a, uint64_t b, uint64_t m) {
uint64_t c = (double)a * b / m - 1;
uint64_t d = a * b - c * m;
return d % m;
}
This method is limited by the significand of a long double, which is usually 64 bits or larger. The integer arithmetic is limited to 63 bits.
uint64_t mod_mul_63(uint64_t a, uint64_t b, uint64_t m) {
uint64_t c = (long double)a * b / m - 1;
uint64_t d = a * b - c * m;
return d % m;
}
These methods require that a and b be less than m. To handle arbitrary a and b, add these lines before c is computed.
a = a % m;
b = b % m;
In both methods, the final % operation could be made conditional.
return d >= m ? d % m : d;
I can suggest an improvement for your algorithm.
You actually calculate a * b iteratively by adding each time b, doing modulo after each iteration. It's better to add each time b * x, whereas x is determined so that b * x won't overflow.
int64_t mulmod(int64_t a, int64_t b, int64_t m)
{
a %= m;
b %= m;
int64_t x = 1;
int64_t bx = b;
while (x < a)
{
int64_t bb = bx * 2;
if (bb <= bx)
break; // overflow
x *= 2;
bx = bb;
}
int64_t ans = 0;
for (; x < a; a -= x)
ans = (ans + bx) % m;
return (ans + a*b) % m;
}

Fast n choose k mod p for large n?

What I mean by "large n" is something in the millions. p is prime.
I've tried
http://apps.topcoder.com/wiki/display/tc/SRM+467
But the function seems to be incorrect (I tested it with 144 choose 6 mod 5 and it gives me 0 when it should give me 2)
I've tried
http://online-judge.uva.es/board/viewtopic.php?f=22&t=42690
But I don't understand it fully
I've also made a memoized recursive function that uses the logic (combinations(n-1, k-1, p)%p + combinations(n-1, k, p)%p) but it gives me stack overflow problems because n is large
I've tried Lucas Theorem but it appears to be either slow or inaccurate.
All I'm trying to do is create a fast/accurate n choose k mod p for large n. If anyone could help show me a good implementation for this I'd be very grateful. Thanks.
As requested, the memoized version that hits stack overflows for large n:
std::map<std::pair<long long, long long>, long long> memo;
long long combinations(long long n, long long k, long long p){
if (n < k) return 0;
if (0 == n) return 0;
if (0 == k) return 1;
if (n == k) return 1;
if (1 == k) return n;
map<std::pair<long long, long long>, long long>::iterator it;
if((it = memo.find(std::make_pair(n, k))) != memo.end()) {
return it->second;
}
else
{
long long value = (combinations(n-1, k-1,p)%p + combinations(n-1, k,p)%p)%p;
memo.insert(std::make_pair(std::make_pair(n, k), value));
return value;
}
}
So, here is how you can solve your problem.
Of course you know the formula:
comb(n,k) = n!/(k!*(n-k)!) = (n*(n-1)*...(n-k+1))/k!
(See http://en.wikipedia.org/wiki/Binomial_coefficient#Computing_the_value_of_binomial_coefficients)
You know how to compute the numerator:
long long res = 1;
for (long long i = n; i > n- k; --i) {
res = (res * i) % p;
}
Now, as p is prime the reciprocal of each integer that is coprime with p is well defined i.e. a-1 can be found. And this can be done using Fermat's theorem ap-1=1(mod p) => a*ap-2=1(mod p) and so a-1=ap-2.
Now all you need to do is to implement fast exponentiation(for example using the binary method):
long long degree(long long a, long long k, long long p) {
long long res = 1;
long long cur = a;
while (k) {
if (k % 2) {
res = (res * cur) % p;
}
k /= 2;
cur = (cur * cur) % p;
}
return res;
}
And now you can add the denominator to our result:
long long res = 1;
for (long long i = 1; i <= k; ++i) {
res = (res * degree(i, p- 2)) % p;
}
Please note I am using long long everywhere to avoid type overflow. Of course you don't need to do k exponentiations - you can compute k!(mod p) and then divide only once:
long long denom = 1;
for (long long i = 1; i <= k; ++i) {
denom = (denom * i) % p;
}
res = (res * degree(denom, p- 2)) % p;
EDIT: as per #dbaupp's comment if k >= p the k! will be equal to 0 modulo p and (k!)^-1 will not be defined. To avoid that first compute the degree with which p is in n*(n-1)...(n-k+1) and in k! and compare them:
int get_degree(long long n, long long p) { // returns the degree with which p is in n!
int degree_num = 0;
long long u = p;
long long temp = n;
while (u <= temp) {
degree_num += temp / u;
u *= p;
}
return degree_num;
}
long long combinations(int n, int k, long long p) {
int num_degree = get_degree(n, p) - get_degree(n - k, p);
int den_degree = get_degree(k, p);
if (num_degree > den_degree) {
return 0;
}
long long res = 1;
for (long long i = n; i > n - k; --i) {
long long ti = i;
while(ti % p == 0) {
ti /= p;
}
res = (res * ti) % p;
}
for (long long i = 1; i <= k; ++i) {
long long ti = i;
while(ti % p == 0) {
ti /= p;
}
res = (res * degree(ti, p-2, p)) % p;
}
return res;
}
EDIT: There is one more optimization that can be added to the solution above - instead of computing the inverse number of each multiple in k!, we can compute k!(mod p) and then compute the inverse of that number. Thus we have to pay the logarithm for the exponentiation only once. Of course again we have to discard the p divisors of each multiple. We only have to change the last loop with this:
long long denom = 1;
for (long long i = 1; i <= k; ++i) {
long long ti = i;
while(ti % p == 0) {
ti /= p;
}
denom = (denom * ti) % p;
}
res = (res * degree(denom, p-2, p)) % p;
For large k, we can reduce the work significantly by exploiting two fundamental facts:
If p is a prime, the exponent of p in the prime factorisation of n! is given by (n - s_p(n)) / (p-1), where s_p(n) is the sum of the digits of n in the base p representation (so for p = 2, it's popcount). Thus the exponent of p in the prime factorisation of choose(n,k) is (s_p(k) + s_p(n-k) - s_p(n)) / (p-1), in particular, it is zero if and only if the addition k + (n-k) has no carry when performed in base p (the exponent is the number of carries).
Wilson's theorem: p is a prime, if and only if (p-1)! ≡ (-1) (mod p).
The exponent of p in the factorisation of n! is usually calculated by
long long factorial_exponent(long long n, long long p)
{
long long ex = 0;
do
{
n /= p;
ex += n;
}while(n > 0);
return ex;
}
The check for divisibility of choose(n,k) by p is not strictly necessary, but it's reasonable to have that first, since it will often be the case, and then it's less work:
long long choose_mod(long long n, long long k, long long p)
{
// We deal with the trivial cases first
if (k < 0 || n < k) return 0;
if (k == 0 || k == n) return 1;
// Now check whether choose(n,k) is divisible by p
if (factorial_exponent(n) > factorial_exponent(k) + factorial_exponent(n-k)) return 0;
// If it's not divisible, do the generic work
return choose_mod_one(n,k,p);
}
Now let us take a closer look at n!. We separate the numbers ≤ n into the multiples of p and the numbers coprime to p. With
n = q*p + r, 0 ≤ r < p
The multiples of p contribute p^q * q!. The numbers coprime to p contribute the product of (j*p + k), 1 ≤ k < p for 0 ≤ j < q, and the product of (q*p + k), 1 ≤ k ≤ r.
For the numbers coprime to p we will only be interested in the contribution modulo p. Each of the full runs j*p + k, 1 ≤ k < p is congruent to (p-1)! modulo p, so altogether they produce a contribution of (-1)^q modulo p. The last (possibly) incomplete run produces r! modulo p.
So if we write
n = a*p + A
k = b*p + B
n-k = c*p + C
we get
choose(n,k) = p^a * a!/ (p^b * b! * p^c * c!) * cop(a,A) / (cop(b,B) * cop(c,C))
where cop(m,r) is the product of all numbers coprime to p which are ≤ m*p + r.
There are two possibilities, a = b + c and A = B + C, or a = b + c + 1 and A = B + C - p.
In our calculation, we have eliminated the second possibility beforehand, but that is not essential.
In the first case, the explicit powers of p cancel, and we are left with
choose(n,k) = a! / (b! * c!) * cop(a,A) / (cop(b,B) * cop(c,C))
= choose(a,b) * cop(a,A) / (cop(b,B) * cop(c,C))
Any powers of p dividing choose(n,k) come from choose(a,b) - in our case, there will be none, since we've eliminated these cases before - and, although cop(a,A) / (cop(b,B) * cop(c,C)) need not be an integer (consider e.g. choose(19,9) (mod 5)), when considering the expression modulo p, cop(m,r) reduces to (-1)^m * r!, so, since a = b + c, the (-1) cancel and we are left with
choose(n,k) ≡ choose(a,b) * choose(A,B) (mod p)
In the second case, we find
choose(n,k) = choose(a,b) * p * cop(a,A)/ (cop(b,B) * cop(c,C))
since a = b + c + 1. The carry in the last digit means that A < B, so modulo p
p * cop(a,A) / (cop(b,B) * cop(c,C)) ≡ 0 = choose(A,B)
(where we can either replace the division with a multiplication by the modular inverse, or view it as a congruence of rational numbers, meaning the numerator is divisible by p). Anyway, we again find
choose(n,k) ≡ choose(a,b) * choose(A,B) (mod p)
Now we can recur for the choose(a,b) part.
Example:
choose(144,6) (mod 5)
144 = 28 * 5 + 4
6 = 1 * 5 + 1
choose(144,6) ≡ choose(28,1) * choose(4,1) (mod 5)
≡ choose(3,1) * choose(4,1) (mod 5)
≡ 3 * 4 = 12 ≡ 2 (mod 5)
choose(12349,789) ≡ choose(2469,157) * choose(4,4)
≡ choose(493,31) * choose(4,2) * choose(4,4
≡ choose(98,6) * choose(3,1) * choose(4,2) * choose(4,4)
≡ choose(19,1) * choose(3,1) * choose(3,1) * choose(4,2) * choose(4,4)
≡ 4 * 3 * 3 * 1 * 1 = 36 ≡ 1 (mod 5)
Now the implementation:
// Preconditions: 0 <= k <= n; p > 1 prime
long long choose_mod_one(long long n, long long k, long long p)
{
// For small k, no recursion is necessary
if (k < p) return choose_mod_two(n,k,p);
long long q_n, r_n, q_k, r_k, choose;
q_n = n / p;
r_n = n % p;
q_k = k / p;
r_k = k % p;
choose = choose_mod_two(r_n, r_k, p);
// If the exponent of p in choose(n,k) isn't determined to be 0
// before the calculation gets serious, short-cut here:
/* if (choose == 0) return 0; */
choose *= choose_mod_one(q_n, q_k, p);
return choose % p;
}
// Preconditions: 0 <= k <= min(n,p-1); p > 1 prime
long long choose_mod_two(long long n, long long k, long long p)
{
// reduce n modulo p
n %= p;
// Trivial checks
if (n < k) return 0;
if (k == 0 || k == n) return 1;
// Now 0 < k < n, save a bit of work if k > n/2
if (k > n/2) k = n-k;
// calculate numerator and denominator modulo p
long long num = n, den = 1;
for(n = n-1; k > 1; --n, --k)
{
num = (num * n) % p;
den = (den * k) % p;
}
// Invert denominator modulo p
den = invert_mod(den,p);
return (num * den) % p;
}
To calculate the modular inverse, you can use Fermat's (so-called little) theorem
If p is prime and a not divisible by p, then a^(p-1) ≡ 1 (mod p).
and calculate the inverse as a^(p-2) (mod p), or use a method applicable to a wider range of arguments, the extended Euclidean algorithm or continued fraction expansion, which give you the modular inverse for any pair of coprime (positive) integers:
long long invert_mod(long long k, long long m)
{
if (m == 0) return (k == 1 || k == -1) ? k : 0;
if (m < 0) m = -m;
k %= m;
if (k < 0) k += m;
int neg = 1;
long long p1 = 1, p2 = 0, k1 = k, m1 = m, q, r, temp;
while(k1 > 0) {
q = m1 / k1;
r = m1 % k1;
temp = q*p1 + p2;
p2 = p1;
p1 = temp;
m1 = k1;
k1 = r;
neg = !neg;
}
return neg ? m - p2 : p2;
}
Like calculating a^(p-2) (mod p), this is an O(log p) algorithm, for some inputs it's significantly faster (it's actually O(min(log k, log p)), so for small k and large p, it's considerably faster), for others it's slower.
Overall, this way we need to calculate at most O(log_p k) binomial coefficients modulo p, where each binomial coefficient needs at most O(p) operations, yielding a total complexity of O(p*log_p k) operations.
When k is significantly larger than p, that is much better than the O(k) solution. For k <= p, it reduces to the O(k) solution with some overhead.
If you're calculating it more than once, there's another way that's faster. I'm going to post code in python because it'll probably be the easiest to convert into another language, although I'll put the C++ code at the end.
Calculating Once
Brute force:
def choose(n, k, m):
ans = 1
for i in range(k): ans *= (n-i)
for i in range(k): ans //= i
return ans % m
But the calculation can get into very big numbers, so we can use modular airthmetic tricks instead:
(a * b) mod m = (a mod m) * (b mod m) mod m
(a / (b*c)) mod m = (a mod m) / ((b mod m) * (c mod m) mod m)
(a / b) mod m = (a mod m) * (b mod m)^-1
Note the ^-1 at the end of the last equation. This is the multiplicative inverse of b mod m. It basically means that ((b mod m) * (b mod m)^-1) mod m = 1, just like how a * a^-1 = a * 1/a = 1 with (non-zero) integers.
This can be calculated in a few ways, one of which is the extended euclidean algorithm:
def multinv(n, m):
''' Multiplicative inverse of n mod m '''
if m == 1: return 0
m0, y, x = m, 0, 1
while n > 1:
y, x = x - n//m*y, y
m, n = n%m, m
return x+m0 if x < 0 else x
Note that another method, exponentiation, works only if m is prime. If it is, you can do this:
def powmod(b, e, m):
''' b^e mod m '''
# Note: If you use python, there's a built-in pow(b, e, m) that's probably faster
# But that's not in C++, so you can convert this instead:
P = 1
while e:
if e&1: P = P * b % m
e >>= 1; b = b * b % m
return P
def multinv(n, m):
''' Multiplicative inverse of n mod m, only if m is prime '''
return powmod(n, m-2, m)
But note that the Extended Euclidean Algorithm tends to still run faster, even though they technically have the same time complexity, O(log m), because it has a lower constant factor.
So now the full code:
def multinv(n, m):
''' Multiplicative inverse of n mod m in log(m) '''
if m == 1: return 0
m0, y, x = m, 0, 1
while n > 1:
y, x = x - n//m*y, y
m, n = n%m, m
return x+m0 if x < 0 else x
def choose(n, k, m):
num = den = 1
for i in range(k): num = num * (n-i) % m
for i in range(k): den = den * i % m
return num * multinv(den, m)
Querying Multiple Times
We can calculate the numerator and denominator separately, and then combine them. But notice that the product we're calculating for the numerator is n * (n-1) * (n-2) * (n-3) ... * (n-k+1). If you've ever learned about something called prefix sums, this is awfully similar. So let's apply it.
Precalculate fact[i] = i! mod m for i up to whatever the max value of n is, maybe 1e7 (ten million). Then, the numerator is (fact[n] * fact[n-k]^-1) mod m, and the denominator is fact[k]. So we can calculate choose(n, k, m) = fact[n] * multinv(fact[n-k], m) % m * multinv(fact[k], m) % m.
Python code:
MAXN = 1000 # Increase if necessary
MOD = 10**9+7 # A common mod that's used, change if necessary
fact = [1]
for i in range(1, MAXN+1):
fact.append(fact[-1] * i % MOD)
def multinv(n, m):
''' Multiplicative inverse of n mod m in log(m) '''
if m == 1: return 0
m0, y, x = m, 0, 1
while n > 1:
y, x = x - n//m*y, y
m, n = n%m, m
return x+m0 if x < 0 else x
def choose(n, k, m):
return fact[n] * multinv(fact[n-k] * fact[k] % m, m) % m
C++ code:
#include <iostream>
using namespace std;
const int MAXN = 1000; // Increase if necessary
const int MOD = 1e9+7; // A common mod that's used, change if necessary
int fact[MAXN+1];
int multinv(int n, int m) {
/* Multiplicative inverse of n mod m in log(m) */
if (m == 1) return 0;
int m0 = m, y = 0, x = 1, t;
while (n > 1) {
t = y;
y = x - n/m*y;
x = t;
t = m;
m = n%m;
n = t;
}
return x<0 ? x+m0 : x;
}
int choose(int n, int k, int m) {
return (long long) fact[n]
* multinv((long long) fact[n-k] * fact[k] % m, m) % m;
}
int main() {
fact[0] = 1;
for (int i = 1; i <= MAXN; i++) {
fact[i] = (long long) fact[i-1] * i % MOD;
}
cout << choose(4, 2, MOD) << '\n';
cout << choose(1e6, 1e3, MOD) << '\n';
}
Note that I'm casting to long long to avoid overflow.