CUDA not returning result - c++

I am trying to make a fraction calculator that calculates on a cuda devise, below is first the sequential version and then my try for a parallel version.
It runs without error, but for some reason do it not give the result back, I have been trying to get this to work for 2 weeks now, but can’t find the error!
Serilized version
int f(int x, int c, int n);
int gcd(unsigned int u, unsigned int v);
int main ()
{
clock_t start = clock();
srand ( time(NULL) );
int x = 1;
int y = 2;
int d = 1;
int c = rand() % 100;
int n = 323;
if(n % y == 0)
d = y;
while(d == 1)
{
x = f(x, c, n);
y = f(f(y, c, n), c, n);
int abs = x - y;
if(abs < 0)
abs = abs * -1;
d = gcd(abs, n);
if(d == n)
{
printf("\nd == n");
c = 0;
while(c == 0 || c == -2)
c = rand() % 100;
x = 2;
y = 2;
}
}
int d2 = n/d;
printf("\nTime elapsed: %f", ((double)clock() - start) / CLOCKS_PER_SEC);
printf("\nResult: %d", d);
printf("\nResult2: %d", d2);
int dummyReadForPause;
scanf_s("%d",&dummyReadForPause);
}
int f(int x, int c, int n)
{
return (int)(pow((float)x, 2) + c) % n;
}
int gcd(unsigned int u, unsigned int v){
int shift;
/ * GCD(0,x) := x * /
if (u == 0 || v == 0)
return u | v;
/ * Let shift := lg K, where K is the greatest power of 2
dividing both u and v. * /
for (shift = 0; ((u | v) & 1) == 0; ++shift) {
u >>= 1;
v >>= 1;
}
while ((u & 1) == 0)
u >>= 1;
/ * From here on, u is always odd. * /
do {
while ((v & 1) == 0) / * Loop X * /
v >>= 1;
/ * Now u and v are both odd, so diff(u, v) is even.
Let u = min(u, v), v = diff(u, v)/2. * /
if (u < v) {
v -= u;
} else {
int diff = u - v;
u = v;
v = diff;
}
v >>= 1;
} while (v != 0);
return u << shift;
}
parallel version
#define threads 512
#define MaxBlocks 65535
#define RunningTheads (512*100)
__device__ int gcd(unsigned int u, unsigned int v)
{
int shift;
if (u == 0 || v == 0)
return u | v;
for (shift = 0; ((u | v) & 1) == 0; ++shift) {
u >>= 1;
v >>= 1;
}
while ((u & 1) == 0)
u >>= 1;
do {
while ((v & 1) == 0)
v >>= 1;
if (u < v) {
v -= u;
} else {
int diff = u - v;
u = v;
v = diff;
}
v >>= 1;
} while (v != 0);
return u << shift;
}
__device__ bool cuda_found;
__global__ void cudaKernal(int *cArray, int n, int *outr)
{
int index = blockIdx.x * threads + threadIdx.x;
int x = 1;
int y = 2;
int d = 4;
int c = cArray[index];
while(d == 1 && !cuda_found)
{
x = (int)(pow((float)x, 2) + c) % n;
y = (int)(pow((float)y, 2) + c) % n;
y = (int)(pow((float)y, 2) + c) % n;
int abs = x - y;
if(abs < 0)
abs = abs * -1;
d = gcd(abs, n);
}
if(d != 1 && !cuda_found)
{
cuda_found = true;
outr = &d;
}
}
int main ()
{
int n = 323;
int cArray[RunningTheads];
cArray[0] = 1;
for(int i = 1; i < RunningTheads-1; i++)
{
cArray[i] = i+2;
}
int dresult = 0;
int *dev_cArray;
int *dev_result;
HANDLE_ERROR(cudaMalloc((void**)&dev_cArray, RunningTheads*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_result, sizeof(int)));
HANDLE_ERROR(cudaMemcpy(dev_cArray, cArray, RunningTheads*sizeof(int), cudaMemcpyHostToDevice));
int TotalBlocks = ceil((float)RunningTheads/(float)threads);
if(TotalBlocks > MaxBlocks)
TotalBlocks = MaxBlocks;
printf("Blocks: %d\n", TotalBlocks);
printf("Threads: %d\n\n", threads);
cudaKernal<<<TotalBlocks,threads>>>(dev_cArray, n, dev_result);
HANDLE_ERROR(cudaMemcpy(&dresult, dev_result, sizeof(int), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaFree(dev_cArray));
HANDLE_ERROR(cudaFree(dev_result));
if(dresult == 0)
dresult = 1;
int d2 = n/dresult;
printf("\nResult: %d", dresult);
printf("\nResult2: %d", d2);
int dummyReadForPause;
scanf_s("%d",&dummyReadForPause);
}

Lets have a look at your kernel code:
__global__ void cudaKernal(int *cArray, int n, int *outr)
{
int index = blockIdx.x * threads + threadIdx.x;
int x = 1;
int y = 2;
int d = 4;
int c = cArray[index];
while(d == 1 && !cuda_found) // always false because d is always 4
{
x = (int)(pow((float)x, 2) + c) % n;
y = (int)(pow((float)y, 2) + c) % n;
y = (int)(pow((float)y, 2) + c) % n;
int abs = x - y;
if(abs < 0)
abs = abs * -1;
d = gcd(abs, n); // never writes to d because the loop won't
// be executed
}
if(d != 1 && !cuda_found) // maybe true if cuda_found was initalized
// with false
{
cuda_found = true; // Memory race here.
outr = &d; // you are changing the adresse where outr
// points to; the host code does not see this
// change. your cudaMemcpy dev -> host will copy
// the exact values back from device that have
// been uploaded by cudaMemcpy host -> dev
// if you want to set outr to 4 than write:
// *outr = d;
}
}

One of the problems is you don't return the result. In your code you just change outr which has local scope in your kernel function (i.e. changes are not seen outside this function). You should write *outr = d; to change the value of memory you're pointing with outr.
and I'm not sure if CUDA initializes global variables with zero. I mean are you sure cuda_found is always initialized with false?

Related

Product of all the nodes on the path of a tree

I was learning MO's Algorithm. In that I found a question. In which we have to make a program to take input n for n nodes of a tree then n-1 pairs of u and v denoting the connection between node u and node v. After that giving the n node values.
Then we will ask q queries. For each query we take input of k and l which denote the two nodes of that tree. Now we have to find the product of all the nodes in the path of k and l (including k and l).
I want to use MO's algorithm. https://codeforces.com/blog/entry/43230
But I am unable to make the code. Can anybody help me out in this.
The basic code for that would be:
int n, q;
int nxt[ N ], to[ N ], hd[ N ];
struct Que{
int u, v, id;
} que[ N ];
void init() {
// read how many nodes and how many queries
cin >> n >> q;
// read the edge of tree
for ( int i = 1 ; i < n ; ++ i ) {
int u, v; cin >> u >> v;
// save the tree using adjacency list
nxt[ i << 1 | 0 ] = hd[ u ];
to[ i << 1 | 0 ] = v;
hd[ u ] = i << 1 | 0;
nxt[ i << 1 | 1 ] = hd[ v ];
to[ i << 1 | 1 ] = u;
hd[ v ] = i << 1 | 1;
}
for ( int i = 0 ; i < q ; ++ i ) {
// read queries
cin >> que[ i ].u >> que[ i ].v;
que[ i ].id = i;
}
}
int dfn[ N ], dfn_, block_id[ N ], block_;
int stk[ N ], stk_;
void dfs( int u, int f ) {
dfn[ u ] = dfn_++;
int saved_rbp = stk_;
for ( int v_ = hd[ u ] ; v_ ; v_ = nxt[ v_ ] ) {
if ( to[ v_ ] == f ) continue;
dfs( to[ v_ ], u );
if ( stk_ - saved_rbp < SQRT_N ) continue;
for ( ++ block_ ; stk_ != saved_rbp ; )
block_id[ stk[ -- stk_ ] ] = block_;
}
stk[ stk_ ++ ] = u;
}
bool inPath[ N ];
void SymmetricDifference( int u ) {
if ( inPath[ u ] ) {
// remove this edge
} else {
// add this edge
}
inPath[ u ] ^= 1;
}
void traverse( int& origin_u, int u ) {
for ( int g = lca( origin_u, u ) ; origin_u != g ; origin_u = parent_of[ origin_u ] )
SymmetricDifference( origin_u );
for ( int v = u ; v != origin_u ; v = parent_of[ v ] )
SymmetricDifference( v );
origin_u = u;
}
void solve() {
// construct blocks using dfs
dfs( 1, 1 );
while ( stk_ ) block_id[ stk[ -- stk_ ] ] = block_;
// re-order our queries
sort( que, que + q, [] ( const Que& x, const Que& y ) {
return tie( block_id[ x.u ], dfn[ x.v ] ) < tie( block_id[ y.u ], dfn[ y.v ] );
} );
// apply mo's algorithm on tree
int U = 1, V = 1;
for ( int i = 0 ; i < q ; ++ i ) {
pass( U, que[ i ].u );
pass( V, que[ i ].v );
// we could our answer of que[ i ].id
}
}
This problem is a slight modification of the blog that you have shared.
Problem Tags:- MO's Algorithm, Trees, LCA, Binary Lifting, Sieve, Precomputation, Prime Factors
Precomputations:- Just we need to do some precomputations with seiveOfErothenesis to store the highest prime factor of each element possible in input constraints. Then using this we will store all the prime factors and their powers for each element in input array in another matrix.
Observation:- with the constraints you can see the there can be very few such primes possible for each element. For an element (10^6) there can be a maximum of 7 prime factors possible.
Modify MO Algo Given in blog:- Now in our compute method we just need to maintain a map that will store the current count of the prime factor. While adding or subtracting each element in solving the queries we will iterate on the prime factors of that element and divide our result(storing total no. of factors) with the old count of that prime and then update the count of that prime and the multiple our result with the new count.(This will be O(7) max for each addition/subtraction).
Complexity:- O(T * ((N + Q) * sqrt(N) * F)) where F is 7 in our case. F is the complexity of your check method().
T - no of test cases in input file.
N - the size of your input array.
Q - No. of queries.
Below is an implementation of the above approach in JAVA. computePrimePowers() and check() are the methods you would be interested in.
import java.util.*;
import java.io.*;
public class Main {
static int BLOCK_SIZE;
static int ar[];
static ArrayList<Integer> graph[];
static StringBuffer sb = new StringBuffer();
static boolean notPrime[] = new boolean[1000001];
static int hpf[] = new int[1000001];
static void seive(){
notPrime[0] = true;
notPrime[1] = true;
for(int i = 2; i < 1000001; i++){
if(!notPrime[i]){
hpf[i] = i;
for(int j = 2 * i; j < 1000001; j += i){
notPrime[j] = true;
hpf[j] = i;
}
}
}
}
static long modI[] = new long[1000001];
static void computeModI() {
for(int i = 1; i < 1000001; i++) {
modI[i] = pow(i, 1000000005);
}
}
static long pow(long x, long y) {
if (y == 0)
return 1;
long p = pow(x, y / 2);
p = (p >= 1000000007) ? p % 1000000007 : p;
p = p * p;
p = (p >= 1000000007) ? p % 1000000007 : p;
if ((y & 1) == 0)
return p;
else {
long tt = x * p;
return (tt >= 1000000007) ? tt % 1000000007 : tt;
}
}
public static void main(String[] args) throws Exception {
Reader s = new Reader();
int test = s.nextInt();
seive();
computeModI();
for(int ii = 0; ii < test; ii++){
int n = s.nextInt();
lcaTable = new int[19][n + 1];
graph = new ArrayList[n + 1];
arrPrimes = new int[n + 1][7][2];
primeCnt = new int[1000001];
visited = new int[n + 1];
ar = new int[n + 1];
for(int i = 0; i < graph.length; i++) graph[i] = new ArrayList<>();
for(int i = 1; i < n; i++){
int u = s.nextInt(), v = s.nextInt();
graph[u].add(v);
graph[v].add(u);
}
int ip = 1; while(ip <= n) ar[ip++] = s.nextInt();
computePrimePowers();
int q = s.nextInt();
LVL = new int[n + 1];
dfsTime = 0;
dfs(1, -1);
BLOCK_SIZE = (int) Math.sqrt(dfsTime);
int Q[][] = new int[q][4];
int i = 0;
while(q-- > 0) {
int u = s.nextInt(), v = s.nextInt();
Q[i][0] = lca(u, v);
if (l[u] > l[v]) {
int temp = u; u = v; v = temp;
}
if (Q[i][0] == u) {
Q[i][1] = l[u];
Q[i][2] = l[v];
}
else {
Q[i][1] = r[u]; // left at col1 in query
Q[i][2] = l[v]; // right at col2
}
Q[i][3] = i;
i++;
}
Arrays.sort(Q, new Comparator<int[]>() {
#Override
public int compare(int[] x, int[] y) {
int block_x = (x[1] - 1) / (BLOCK_SIZE + 1);
int block_y = (y[1] - 1) / (BLOCK_SIZE + 1);
if(block_x != block_y)
return block_x - block_y;
return x[2] - y[2];
}
});
solveQueries(Q);
}
System.out.println(sb);
}
static long res;
private static void solveQueries(int [][] Q) {
int M = Q.length;
long results[] = new long[M];
res = 1;
int curL = Q[0][1], curR = Q[0][1] - 1;
int i = 0;
while(i < M){
while (curL < Q[i][1]) check(ID[curL++]);
while (curL > Q[i][1]) check(ID[--curL]);
while (curR < Q[i][2]) check(ID[++curR]);
while (curR > Q[i][2]) check(ID[curR--]);
int u = ID[curL], v = ID[curR];
if (Q[i][0] != u && Q[i][0] != v) check(Q[i][0]);
results[Q[i][3]] = res;
if (Q[i][0] != u && Q[i][0] != v) check(Q[i][0]);
i++;
}
i = 0;
while(i < M) sb.append(results[i++] + "\n");
}
static int visited[];
static int primeCnt[];
private static void check(int x) {
if(visited[x] == 1){
for(int i = 0; i < 7; i++) {
int c = arrPrimes[x][i][1];
int pp = arrPrimes[x][i][0];
if(pp == 0) break;
long tem = res * modI[primeCnt[pp] + 1];
res = (tem >= 1000000007) ? tem % 1000000007 : tem;
primeCnt[pp] -= c;
tem = res * (primeCnt[pp] + 1);
res = (tem >= 1000000007) ? tem % 1000000007 : tem;
}
}
else if(visited[x] == 0){
for(int i = 0; i < 7; i++) {
int c = arrPrimes[x][i][1];
int pp = arrPrimes[x][i][0];
if(pp == 0) break;
long tem = res * modI[primeCnt[pp] + 1];
res = (tem >= 1000000007) ? tem % 1000000007 : tem;
primeCnt[pp] += c;
tem = res * (primeCnt[pp] + 1);
res = (tem >= 1000000007) ? tem % 1000000007 : tem;
}
}
visited[x] ^= 1;
}
static int arrPrimes[][][];
static void computePrimePowers() {
int n = arrPrimes.length;
int i = 0;
while(i < n) {
int ele = ar[i];
int k = 0;
while(ele > 1) {
int c = 0;
int pp = hpf[ele];
while(hpf[ele] == pp) {
c++; ele /= pp;
}
arrPrimes[i][k][0] = pp;
arrPrimes[i][k][1] = c;
k++;
}
i++;
}
}
static int dfsTime;
static int l[] = new int[1000001], r[] = new int[1000001], ID[] = new int[1000001], LVL[], lcaTable[][];
static void dfs(int u, int p){
l[u] = ++dfsTime;
ID[dfsTime] = u;
int i = 1;
while(i < 19) {
lcaTable[i][u] = lcaTable[i - 1][lcaTable[i - 1][u]];
i++;
}
i = 0;
while(i < graph[u].size()){
int v = graph[u].get(i);
i++;
if (v == p) continue;
LVL[v] = LVL[u] + 1;
lcaTable[0][v] = u;
dfs(v, u);
}
r[u] = ++dfsTime;
ID[dfsTime] = u;
}
static int lca(int u, int v){
if (LVL[u] > LVL[v]) {
int temp = u;
u = v; v = temp;
}
int i = 18;
while(i >= 0) {
if (LVL[v] - (1 << i) >= LVL[u]) v = lcaTable[i][v];
i--;
}
if (u == v) return u;
i = 18;
while(i >= 0){
if (lcaTable[i][u] != lcaTable[i][v]){
u = lcaTable[i][u];
v = lcaTable[i][v];
}
i--;
}
return lcaTable[0][u];
}
}
// SIMILAR SOLUTION FOR FINDING NUMBER OF DISTINCT ELEMENTS FROM U TO V
// USING MO's ALGORITHM
#include <bits/stdc++.h>
using namespace std;
const int MAXN = 40005;
const int MAXM = 100005;
const int LN = 19;
int N, M, K, cur, A[MAXN], LVL[MAXN], DP[LN][MAXN];
int BL[MAXN << 1], ID[MAXN << 1], VAL[MAXN], ANS[MAXM];
int d[MAXN], l[MAXN], r[MAXN];
bool VIS[MAXN];
vector < int > adjList[MAXN];
struct query{
int id, l, r, lc;
bool operator < (const query& rhs){
return (BL[l] == BL[rhs.l]) ? (r < rhs.r) : (BL[l] < BL[rhs.l]);
}
}Q[MAXM];
// Set up Stuff
void dfs(int u, int par){
l[u] = ++cur;
ID[cur] = u;
for (int i = 1; i < LN; i++) DP[i][u] = DP[i - 1][DP[i - 1][u]];
for (int i = 0; i < adjList[u].size(); i++){
int v = adjList[u][i];
if (v == par) continue;
LVL[v] = LVL[u] + 1;
DP[0][v] = u;
dfs(v, u);
}
r[u] = ++cur; ID[cur] = u;
}
// Function returns lca of (u) and (v)
inline int lca(int u, int v){
if (LVL[u] > LVL[v]) swap(u, v);
for (int i = LN - 1; i >= 0; i--)
if (LVL[v] - (1 << i) >= LVL[u]) v = DP[i][v];
if (u == v) return u;
for (int i = LN - 1; i >= 0; i--){
if (DP[i][u] != DP[i][v]){
u = DP[i][u];
v = DP[i][v];
}
}
return DP[0][u];
}
inline void check(int x, int& res){
// If (x) occurs twice, then don't consider it's value
if ( (VIS[x]) and (--VAL[A[x]] == 0) ) res--;
else if ( (!VIS[x]) and (VAL[A[x]]++ == 0) ) res++;
VIS[x] ^= 1;
}
void compute(){
// Perform standard Mo's Algorithm
int curL = Q[0].l, curR = Q[0].l - 1, res = 0;
for (int i = 0; i < M; i++){
while (curL < Q[i].l) check(ID[curL++], res);
while (curL > Q[i].l) check(ID[--curL], res);
while (curR < Q[i].r) check(ID[++curR], res);
while (curR > Q[i].r) check(ID[curR--], res);
int u = ID[curL], v = ID[curR];
// Case 2
if (Q[i].lc != u and Q[i].lc != v) check(Q[i].lc, res);
ANS[Q[i].id] = res;
if (Q[i].lc != u and Q[i].lc != v) check(Q[i].lc, res);
}
for (int i = 0; i < M; i++) printf("%d\n", ANS[i]);
}
int main(){
int u, v, x;
while (scanf("%d %d", &N, &M) != EOF){
// Cleanup
cur = 0;
memset(VIS, 0, sizeof(VIS));
memset(VAL, 0, sizeof(VAL));
for (int i = 1; i <= N; i++) adjList[i].clear();
// Inputting Values
for (int i = 1; i <= N; i++) scanf("%d", &A[i]);
memcpy(d + 1, A + 1, sizeof(int) * N);
// Compressing Coordinates
sort(d + 1, d + N + 1);
K = unique(d + 1, d + N + 1) - d - 1;
for (int i = 1; i <= N; i++) A[i] = lower_bound(d + 1, d + K + 1, A[i]) - d;
// Inputting Tree
for (int i = 1; i < N; i++){
scanf("%d %d", &u, &v);
adjList[u].push_back(v);
adjList[v].push_back(u);
}
// Preprocess
DP[0][1] = 1;
dfs(1, -1);
int size = sqrt(cur);
for (int i = 1; i <= cur; i++) BL[i] = (i - 1) / size + 1;
for (int i = 0; i < M; i++){
scanf("%d %d", &u, &v);
Q[i].lc = lca(u, v);
if (l[u] > l[v]) swap(u, v);
if (Q[i].lc == u) Q[i].l = l[u], Q[i].r = l[v];
else Q[i].l = r[u], Q[i].r = l[v];
Q[i].id = i;
}
sort(Q, Q + M);
compute();
}
}
Demo

Square Root in C/C++

I am trying to implement my own square root function which gives square root's integral part only e.g. square root of 3 = 1.
I saw the method here and tried to implement the method
int mySqrt(int x)
{
int n = x;
x = pow(2, ceil(log(n) / log(2)) / 2);
int y=0;
while (y < x)
{
y = (x + n / x) / 2;
x = y;
}
return x;
}
The above method fails for input 8. Also, I don't get why it should work.
Also, I tried the method here
int mySqrt(int x)
{
if (x == 0) return 0;
int x0 = pow(2, (log(x) / log(2))/2) ;
int y = x0;
int diff = 10;
while (diff>0)
{
x0 = (x0 + x / x0) / 2; diff = y - x0;
y = x0;
if (diff<0) diff = diff * (-1);
}
return x0;
}
In this second way, for input 3 the loop continues ... indefinitely (x0 toggles between 1 and 2).
I am aware that both are essentially versions of Netwon's method but I can't figure out why they fail in certain cases and how could I make them work for all cases. I guess i have the correct logic in implementation. I debugged my code but still I can't find a way to make it work.
This one works for me:
uintmax_t zsqrt(uintmax_t x)
{
if(x==0) return 0;
uintmax_t yn = x; // The 'next' estimate
uintmax_t y = 0; // The result
uintmax_t yp; // The previous estimate
do{
yp = y;
y = yn;
yn = (y + x/y) >> 1; // Newton step
}while(yn ^ yp); // (yn != yp) shortcut for dumb compilers
return y;
}
returns floor(sqrt(x))
Instead of testing for 0 with a single estimate, test with 2 estimates.
When I was writing this, I noticed the result estimate would sometimes oscillate. This is because, if the exact result is a fraction, the algorithm could only jump between the two nearest values. So, terminating when the next estimate is the same as the previous will prevent an infinite loop.
Try this
int n,i;//n is the input number
i=0;
while(i<=n)
{
if((i*i)==n)
{
cout<<"The number has exact root : "<<i<<endl;
}
else if((i*i)>n)
{
cout<<"The integer part is "<<(i-1)<<endl;
}
i++;
}
Hope this helps.
You can try there C sqrt implementations :
// return the number that was multiplied by itself to reach N.
unsigned square_root_1(const unsigned num) {
unsigned a, b, c, d;
for (b = a = num, c = 1; a >>= 1; ++c);
for (c = 1 << (c & -2); c; c >>= 2) {
d = a + c;
a >>= 1;
if (b >= d)
b -= d, a += c;
}
return a;
}
// return the number that was multiplied by itself to reach N.
unsigned square_root_2(unsigned n){
unsigned a = n > 0, b;
if (n > 3)
for (a = n >> 1, b = (a + n / a) >> 1; b < a; a = b, b = (a + n / a) >> 1);
return a ;
}
Example of usage :
#include <assert.h>
int main(void){
unsigned num, res ;
num = 1847902954, res = square_root_1(num), assert(res == 42987);
num = 2, res = square_root_2(num), assert(res == 1);
num = 0, res = square_root_2(num), assert(res == 0);
}
Source

Tetris Game array issue

So I'm trying to make a Tetris game and I've come across something odd that I'm unsure about.
I have an array called bottom which stores the value of the lowest block - so, if there is no block in the first column, "bottom" will be 20.
If there's a square block occupying that first column, bottom would be 18. The weird thing is, when I set a breakpoint in my code to try to view the values for bottom, it says there is only one value in the array. In addition, my board, which is a 25 by 10 array, has the same problem, it only displays one dimension.
It seems the problem has to do with some kind of pointer issue, because it says (int (*)[10]) and (int *), where I think it should be a (int [25][10]) and (int [10]). I tried looking up array pointers and references, but the main thing I found was how to make an array of pointers and I'm not really quite sure how to word my searches.
If someone might know what's going wrong please let me know!
main.cpp
#include <chrono>
#include "makeboard.h"
int main() {
//declares and defines board
int board[24][10];
for (int y = 0; y < 24; y++) {
for (int x = 0; x < 10; x++) {
board[y][x] = 0;
}
}
makeboard(board);
}
tiles.h
#ifndef tiles_h
#define tiles_h
class O {
int board[24][10];
int x, y;
public:
void set_O (int[24][10], int, int);
};
void O::set_O (int board[24][10], int y, int x) {
board[y][x] = 1;
board[y][x+1] = 1;
board[y-1][x] = 1;
board[y-1][x+1] = 1;
}
class I {
int board[24][10];
int x, y, d;
public:
void set_I (int[24][10], int, int, int);
};
void I::set_I (int board[24][10], int d, int y, int x) {
if (d == 1 || d == 3) {
board[y-3][x] = 1;
board[y-2][x] = 1;
board[y-1][x] = 1;
board[y][x] = 1;
}
if (d == 2 || d == 4) {
board[y][x-1] = 1;
board[y][x] = 1;
board[y][x+1] = 1;
board[y][x+2] = 1;
}
}
class S {
int board[24][10];
int x, y, d;
public:
void set_S (int[24][10], int, int, int);
};
void S::set_S (int board[24][10], int d, int y, int x) {
if (d == 1 || d == 3) {
board[y-1][x] = 1;
board[y-1][x+1] = 1;
board[y][x] = 1;
board[y][x-1] = 1;
}
if (d == 2 || d == 4) {
board[y-2][x] = 1;
board[y-1][x] = 1;
board[y-1][x+1] = 1;
board[y][x+1] = 1;
}
}
class Z {
int board[24][10];
int x, y, d;
public:
void set_Z (int[24][10], int, int, int);
};
void Z::set_Z (int board[24][10], int d, int y, int x) {
if (d == 1 || d == 3) {
board[y][x] = 1;
board[y][x-1] = 1;
board[y+1][x] = 1;
board[y+1][x+1] = 1;
}
if (d == 2 || d == 4) {
board[y-1][x+1] = 1;
board[y][x+1] = 1;
board[y][x] = 1;
board[y+1][x] = 1;
}
}
class T {
int board[24][10];
int d, x, y;
public:
void set_T (int[24][10], int, int, int);
};
void T::set_T (int board[24][10], int d, int y, int x) {
if (d == 1 && (board[y+1][x-1] != 1 || board[y+1][x] != 1 || board[y+1][x+1] != 1)) {
board[y-1][x] = 1;
board[y][x-1] = 1;
board[y][x] = 1;
board[y][x+1] = 1;
}
if (d == 2 && (board[y+2][x] != 1 || board[y+1][x+1] != 1)) {
board[y-1][x] = 1;
board[y][x] = 1;
board[y][x+1] = 1;
board[y+1][x] = 1;
}
if (d == 3 && (board[y+1][x-1] != 1 || board[y+2][x] != 1 || board[y+1][x+1] != 1)) {
board[y][x-1] = 1;
board[y][x] = 1;
board[y][x+1] = 1;
board[y+1][x] = 1;
}
if (d == 4 && (board[y+1][x-1] != 1 || board[y+2][x] != 1)) {
board[y-1][x] = 1;
board[y][x-1] = 1;
board[y][x] = 1;
board[y+1][x] = 1;
}
}
class J {
int board[24][10];
int d, x, y;
public:
void set_J (int[24][10], int, int, int);
};
void J::set_J (int board[24][10], int d, int y, int x) {
if (d == 1) {
board[y-1][x-1] = 1;
board[y-1][x] = 1;
board[y-1][x+1] = 1;
board[y][x+1] = 1;
}
if (d == 2) {
board[y-2][x] = 1;
board[y-1][x] = 1;
board[y][x] = 1;
board[y][x-1] = 1;
}
if (d == 3) {
board[y][x-1] = 1;
board[y][x] = 1;
board[y][x+1] = 1;
board[y-1][x-1] = 1;
}
if (d == 4) {
board[y-2][x] = 1;
board[y-2][x+1] = 1;
board[y-1][x] = 1;
board[y][x] = 1;
}
}
class L {
int board[24][10];
int d, x, y;
public:
void set_L (int[24][10], int, int, int);
};
void L::set_L (int board[24][10], int d, int y, int x) {
if (d == 1) {
board[y-1][x-1] = 1;
board[y-1][x] = 1;
board[y-1][x+1] = 1;
board[y][x-1] = 1;
}
if (d == 2) {
board[y-2][x] = 1;
board[y-1][x] = 1;
board[y][x] = 1;
board[y][x-1] = 1;
}
if (d == 3) {
board[y-1][x-1] = 1;
board[y-1][x] = 1;
board[y-1][x+1] = 1;
board[y][x+1] = 1;
}
if (d == 4) {
board[y-2][x] = 1;
board[y-1][x] = 1;
board[y][x] = 1;
board[y][x+1] = 1;
}
}
#endif
makeboard.cpp
#include <iostream>
#include <limits>
#include <thread>
#include "makeboard.h"
#include "clearscreen.h"
#include "isBottom.h"
#include "isPressed.h"
#include "tiles.h"
using namespace std;
string icon[3] = { " ", " o ", " o " };
void makeboard(int board[24][10]) {
time_t srand( time(NULL) );
int block = srand % 7 ;
block = 3;
//declares pieces
O o;
I i;
S s;
Z z;
T t;
J j;
L l;
//declares and defines initial bottom
int bottom[10];
for (int i = 0; i < 10; i++) bottom[i] = 23;
//declares and defines initial block position
int y = 3;
int x = 4;
int d = 1;
while (!isBottom(board, block, y, x, d, bottom)) {
if (isPressed(0) && x > 0) {
x--;
}
if (isPressed(2) && x < 10) {
x++;
}
if (isPressed(1)) {
d += 1;
if (d == 4) {
d = 1;
}
}
//moves tile down
y++;
//clears screen
clearscreen();
//clears non set pieces
for (int i = 0; i < 24; i++) {
for (int j = 0; j < 10; j++) {
if (board[i][j] == 1) {
board[i][j] = 0;
}
}
}
//adds blocks to board
switch (block) {
case 1:
o.set_O(board, y, x);
break;
case 2:
i.set_I(board, d, y, x);
break;
case 3:
s.set_S(board, d, y, x);
break;
case 4:
z.set_Z(board, d, y, x);
break;
case 5:
t.set_T(board, d, y, x);
break;
case 6:
j.set_J(board, d, y, x);
break;
case 7:
l.set_L(board, d, y, x);
break;
}
//builds board
cout << "╔══════════════════════════════╗" << endl;
for (int i = 4; i < 24; i++) {
cout << "║";
for (int j = 0; j < 10; j++) {
cout << icon[board[i][j]] ;
}
cout << "║" << endl;
}
cout << "╚══════════════════════════════╝" << endl;
cout << " 0 1 2 3 4 5 6 7 8 9 " << endl;
//resets initial tile position
if (isBottom(board, block, y, x, d, bottom)) {
y = 2;
//block = srand % 7;
}
//ends game
if (isBottom(board, block, 3, x, d, bottom)) {
cout << "You lose!";
return;
}
//delay
this_thread::sleep_for (chrono::milliseconds(100));
}
return;
}
clearscreen.cpp
#include <unistd.h>
#include <term.h>
#include <stdlib.h>
#include "clearscreen.h"
void clearscreen()
{
if (!cur_term)
{
void *a;
int result;
setupterm( NULL, STDOUT_FILENO, &result );
a = malloc(sizeof(int) *result);
free (a);
if (result <= 0) free (a); return;
}
putp( tigetstr( "clear" ) );
}
isBottom.cpp
#include "isBottom.h"
bool isBottom(int board[24][10], int block, int y, int x, int d, int bottom[10]) {
switch (block) {
case 1:
if (y == bottom[x] || y == bottom[x+1]) {
board[y][x] = 2;
board[y][x+1] = 2;
board[y-1][x] = 2;
board[y-1][x+1] = 2;
bottom[x] -= 2;
bottom[x+1] -= 2;
return true;
}
return false;
break;
case 2:
if (d == 1 || d == 3) {
if (y == bottom[x]) {
board[y-3][x] = 2;
board[y-2][x] = 2;
board[y-1][x] = 2;
board[y][x] = 2;
bottom[x] -= 4;
return true;
}
return false;
break;
}
if (d == 2 || d == 4) {
if (y == bottom[x-1] || y == bottom[x] || y == bottom[x+1] || y == bottom[x+2]) {
board[y][x-1] = 2;
board[y][x] = 2;
board[y][x+1] = 2;
board[y][x+2] = 2;
bottom[x-1]--;
bottom[x]--;
bottom[x+1]--;
bottom[x+2]--;
return true;
}
return false;
break;
}
case 3:
if (d == 1 || d == 3) {
if (y == bottom[x-1] || y == bottom[x] || y == bottom[x+1]) {
board[y-1][x] = 2;
board[y-1][x+1] = 2;
board[y][x] = 2;
board[y][x-1] = 2;
bottom[x-1] = 23 - y;
bottom[x] -= 2;
bottom[x+1] -= 2;
return true;
break;
}
return false;
break;
}
if (d == 2 || d == 4) {
if (y == bottom[x-1] || y == bottom[x]) {
board[y-2][x] = 2;
board[y-1][x] = 2;
board[y-1][x+1] = 2;
board[y][x+1] = 2;
bottom[x-1]--;
bottom[x] -= 1;
return true;
break;
}
return false;
break;
}
/*
case 3:
s.set_S(board, d, y, x);
break;
case 4:
z.set_Z(board, d, y, x);
break;
case 5:
t.set_T(board, d, y, x);
break;
case 6:
j.set_J(board, d, y, x);
break;
case 7:
l.set_L(board, d, y, x);
break;
*/
}
return true;
}
isPressed.cpp
#include <Carbon/Carbon.h>
#include "isPressed.h"
bool isPressed( unsigned short inKeyCode )
{
unsigned char keyMap[16];
GetKeys((BigEndianUInt32*) &keyMap);
return (0 != ((keyMap[ inKeyCode >> 3] >> (inKeyCode & 7)) & 1));
}
It depends on the scope of your array. For example:
int GetBottom(int* bottom);
int GetBottom2(const int (&bottom)[20]);
int main()
{
int localArray1d[20] = {};
int localArray2d[10][25] = {};
// putting a breakpoint here will allow you to see the full dimensions of the array because this function KNOWS what the object is (e.g. a 1d and 2d array respectively)
int lastBrick = GetBottom(localArray1d);
// When the array is passed to GetBottom, it's passed just as a pointer. Although it IS an array, the function GetBottom doesn't know that. We could just as simply pass it a single int*
int n = 0;
GetBottom(&n); // here we are only passing a single int pointer. GetBottom has no idea that your object is an array, it only knows it has an int*
lastBrick = GetBottom2(localArray1d);
// GetBottom2 only takes an array of 20 elements, so inspecting the object in that function allows you to see all the elements.
return 0;
}
int GetBottom(int* bottom)
{
// Having a breakpoint here will not allow you to see all the elements in an array since this function doesn't even know bottom IS an array.
}
int GetBottom2(const int (&bottom)[20])
{
// A breakpoint here will allow you to fully inspect bottom.
}
It's a little tricky when you refer to arrays the way you do, but an array like int array[5] degrades to int* array when you branch outside the scope in which it is defined. It's because arrays are r-values and need to degrade into a reference or pointer l-value (which lacks that info about how many elements there are) to pass them around. The gotcha part here is that you can still write a function which accepts int parameter[5] and the compiler will accept it, but will silently treat it like int* parameter. The same goes for the debugger.
So depending on your debugger, there's different ways to look at all the elements through a pointer anyway. For example, with this code:
int* ptr = some_array;
... in MSVC, I can only see the first element pointed to by ptr in the watch window. However, if I know that some_array has 10 elements, I can type ptr,10 in the watch window and it'll show me all 10 elements.
Also, again this is debugger-specific, but some debuggers are conveniently programmed to show the contents of standard containers no matter what in a beautifully-readable format. So if you can use contaners like std::vector, it'll make your debugging life easier if you're using such a debugger.

nCk modulo p when n % p or k % p == 0

I'm trying to solve a coding challenge on hacker rank which requires one to calculate binomial coefficients mod a prime, i.e.
nchoosek(n, k, p)
I'm using the code from this answer that works for the first three sets of inputs but begins failing on the 4th. I stepped through it in the debugger and determined that the issue arises when:
n % p == 0 || k % p == 0
I just need to know how to modify my current solution to handle the specific cases where n % p == 0 or k % p == 0. None of the answers I've found on stack exchange seem to address this specific case. Here's my code:
#include <iostream>
#include <fstream>
long long FactorialExponent(long long n, long long p)
{
long long ex = 0;
do
{
n /= p;
ex += n;
}while(n > 0);
return ex;
}
unsigned long long ModularMultiply(unsigned long long a, unsigned long long b, unsigned long p) {
unsigned long long a1 = (a >> 21), a2 = a & ((1ull << 21) - 1);
unsigned long long temp = (a1 * b) % p; // doesn't overflow under the assumptions
temp = (temp << 21) % p; // this neither
temp += (a2 * b) % p; // nor this
return temp % p;
}
unsigned long long ModularInverse(unsigned long long k, unsigned long m) {
if (m == 0) return (k == 1 || k == -1) ? k : 0;
if (m < 0) m = -m;
k %= m;
if (k < 0) k += m;
int neg = 1;
unsigned long long p1 = 1, p2 = 0, k1 = k, m1 = m, q, r, temp;
while(k1 > 0) {
q = m1 / k1;
r = m1 % k1;
temp = q*p1 + p2;
p2 = p1;
p1 = temp;
m1 = k1;
k1 = r;
neg = !neg;
}
return neg ? m - p2 : p2;
}
// Preconditions: 0 <= k <= min(n,p-1); p > 1 prime
unsigned long long ChooseModTwo(unsigned long long n, unsigned long long k, unsigned long p)
{
// reduce n modulo p
n %= p;
// Trivial checks
if (n < k) {
return 0;
}
if (k == 0 || k == n) {
return 1;
}
// Now 0 < k < n, save a bit of work if k > n/2
if (k > n/2) {
k = n-k;
}
// calculate numerator and denominator modulo p
unsigned long long num = n, den = 1;
for(n = n-1; k > 1; --n, --k)
{
num = ModularMultiply(num, n, p);
den = ModularMultiply(den, k, p);
}
den = ModularInverse(den,p);
return ModularMultiply(num, den, p);
}
// Preconditions: 0 <= k <= n; p > 1 prime
long long ChooseModOne(long long n, long long k, const unsigned long p)
{
// For small k, no recursion is necessary
if (k < p) return ChooseModTwo(n,k,p);
unsigned long long q_n, r_n, q_k, r_k, choose;
q_n = n / p;
r_n = n % p;
q_k = k / p;
r_k = k % p;
choose = ChooseModTwo(r_n, r_k, p);
// If the exponent of p in choose(n,k) isn't determined to be 0
// before the calculation gets serious, short-cut here:
// if (choose == 0) return 0;
return ModularMultiply(choose, ChooseModOne(q_n, q_k, p), p);
}
unsigned long long ModularBinomialCoefficient(unsigned long long n, unsigned long long k, const unsigned long p)
{
// We deal with the trivial cases first
if (k < 0 || n < k) return 0;
if (k == 0 || k == n) return 1;
// Now check whether choose(n,k) is divisible by p
if (FactorialExponent(n, p) > FactorialExponent(k, p) + FactorialExponent(n - k, p)) return 0;
// If it's not divisible, do the generic work
return ChooseModOne(n, k, p);
}
int main() {
//std::ifstream fin ("input03.txt");
std::ifstream fin ("test.in");
int kMod = 1000003;
int T;
fin >> T;
int N = T;
//std::cin >> T;
unsigned long long n, k;
unsigned long long a, b;
int result[N];
int index = 0;
while (T--) {
fin >> n >> k;
a = ModularBinomialCoefficient(n - 3, k, kMod);
b = ModularBinomialCoefficient(n + k, n - 1, kMod);
// (1 / (n + k) * nCk(n - 3, k) * nCk(n + k, n - 1)) % 1000003
unsigned long long x = ModularMultiply(a, b, kMod);
unsigned long long y = ModularMultiply(x, ModularInverse((n + k), kMod), kMod);
result[index] = y;
index++;
}
for(int i = 0; i < N; i++) {
std::cout << result[i] << "\n";
}
return 0;
}
Input:
6
90 13
65434244 16341234
23424244 12341234
424175 341198
7452123 23472
56000168 16000048
Output:
815483
715724
92308
903465
241972
0 <-- Incorrect, should be: 803478
Constraints:
4 <= N <= 10^9
1 <= K <= N
You can use Lucas' theorem to reduce the problem to ceil(log_P(N)) subproblems with k, n < p: Write n = n_m * p^m + ... + n_0 and k = k_m * p^m + ... + k_0 in base p (n_i, k_i < p are the digits), then we have
C(n,k) = PROD(i = 0 to m, C(n_i, k_i)) (mod p)
The subproblems are easy to solve, because every factor of k! has an inverse modulo p. You get an algorithm with runtime complexity O(p log(n)), which is better than that of Ivaylo's code in case of p << n, if I understand it correctly.
int powmod(int x, int e, int p) {
if (e == 0) return 1;
if (e & 1) return (long long)x * powmod(x, e - 1, p) % p;
long long rt = powmod(x, e / 2, p);
return rt * rt % p;
}
int binom_coeff_mod_prime(int n, int k, int p) {
long long res = 1;
while (n || k) {
int N = n % p, K = k % p;
for (int i = N - K + 1; i <= N; ++i)
res = res * i % p;
for (int i = 1; i <= K; ++i)
res = res * powmod(i, p - 2, p) % p;
n /= p;
k /= p;
}
return res;
}
I suggest you use factorization to compute the number of combinations without division. I've got code for doing so here, originally inspired by Fast computation of multi-category number of combinations (I still would like to post a proper answer to that, if some kind souls would reopen it).
My code stores the result as a table of factors, doing the modular multiplication to expand the result should be quite straightforward.
Probably not practical for n in the range of 10**9, though, since the sieve will be quite massive and take a while to construct.

Generating a sequence 1,3,8,22,60,164 .How to make my code more effficient?

#include <vector>
#include<iostream>
#include<stdio.h>
#define REP(i,n) for (ll i = 1; i <= n; i++)
using namespace std;
typedef unsigned long long int ll;
typedef vector<vector<ll> > matrix;
ll MOD = 1000000007;
const ll K = 2;
// computes A * B
matrix mul(matrix A, matrix B)
{
matrix C(K+1, vector<ll>(K+1));
REP(i, K) REP(j, K) REP(k, K)
C[i][j] = (C[i][j] + A[i][k] * B[k][j]) % MOD;
return C;
}
// computes A ^ p
matrix pow(matrix A, ll p)
{
if (p == 1)
return A;
if (p & 1)
return mul(A, pow(A, p-1));
matrix X = pow(A, p>>1);
return mul(X, X);
}
// returns the N-th term of Fibonacci sequence
ll fib(ll N)
{
// create vector F1
vector<ll> F1(K+1);
F1[1] = 1;
F1[2] = 3;
// create matrix T
matrix T(K+1, vector<ll>(K+1));
T[1][1] = 0, T[1][2] = 1;
T[2][1] = 2, T[2][2] = 2;
// raise T to the (N-1)th power
if (N == 1)
return 1;
T = pow(T, N-1);
// the answer is the first row of T . F1
ll res = 0;
REP(i, K)
res = (res + ((T[1][i] )* (F1[i]))) %MOD;
return res;
}
ll fib2(ll n)
{
if(n==1)
return 1;
ll a=1;ll b=3;ll c;
for(ll i=3;i<=n;i++)
{
c=(2*a+2*b)%MOD;
a=b;
b=c;
}
return c;
}
int main()
{
ll t;
scanf("%llu",&t);
// t=10000;
ll n=1;
while(t--)
{
scanf("%llu",&n);
//n=1;
// n++;
// n=1000000000;
printf("%llu\n",fib(n));
}
return 0;
}
I am writing a code to generate 1,3,8,22,60,164 a[n]=2*(a[n-1]+a[n-2]) mod 10^9+7 .I am using modular exponentiation and the matrix multiplication method to generate this sequence.How can I improve its time from 2.3 seconds for worst case i.e. n=10^9 .
10000 times to around .5 to 1 second?
Please give me suggestions to improve speed of this code.
I suspect vector is the main culprit here - dynamic allocation and numerical stuff don't mix very well.
2x2 matrices are way too small for any fancy algorithm to have any impact. I have a hunch that they'd actually be worse due to fanciness overhead.
Have you tried unrolling the loops and ditching the dynamic allocation?
I hope this is correct:
void mul(ll A[][2], ll B[][2], ll C[][2])
{
C[0][0] = (A[0][0] * B[0][0]) % MOD;
C[0][0] = (C[0][0] + A[0][1] * B[1][0]) % MOD;
C[0][1] = (A[0][0] * B[0][1]) % MOD;
C[0][1] = (C[0][1] + A[0][1] * B[1][1]) % MOD;
C[1][0] = (A[1][0] * B[0][0]) % MOD;
C[1][0] = (C[1][0] + A[1][1] * B[1][0]) % MOD;
C[1][1] = (A[1][0] * B[0][1]) % MOD;
C[1][1] = (C[1][1] + A[1][1] * B[1][1]) % MOD;
}
void pow(ll A[][2], ll p, ll out[][2])
{
if (p == 1)
{
out[0][0] = A[0][0];
out[0][1] = A[0][1];
out[1][0] = A[1][0];
out[1][1] = A[1][1];
return;
}
if (p & 1)
{
ll B[2][2] = {{0}};
pow(A, p - 1, B);
mul(A, B, out);
}
else
{
ll X[2][2] = {{0}};
pow(A, p >> 1, X);
mul(X, X, out);
}
}
ll fibv(ll N)
{
ll T[2][2] =
{
{2, 2},
{1, 0}
};
if (N == 1)
return 1;
ll RM[2][2] = {{0}};
pow(T, N-1, RM);
ll res = RM[0][1] % MOD;
res = (res + RM[0][0] * 3) % MOD;
return res;
}
https://en.wikipedia.org/wiki/Strassen_algorithm
You can even find it in video lectures at MIT Opencourseware Intro to Algorithms course or Stanford's Algorithms course
You will get a significant speedup if you specialise to 2x2 matrices:
struct matrix {
ll a, b, c, d ;
void Square() ;
void Mul (const matrix& M) ;
} ;