CUDA parallel program flow for Dijkstra Algorithm - c++

the code here is a cuda code and is meant to find shortest pair path using Dijkstra's algorithm.
My code logic works perfectly in a c program, not in Cuda. I'm using 1 block with N threads, N being user entered.
First doubt, every thread has their own copy of variables except the shared variable temp. Correct ?
When i print the results I'm storing all values in array d and print its value which is zero for all. This is possible only if the flow of control does not enter loop after s = threadIdx.x.
Please help, have been debugging this since last 24 Hrs.
Given Input is:
Number of vertices: 4
enter the source,destination and cost of the edge\n Enter -1 to end
Input\n Edges start from Zero : 0 1 1
enter the source,destination and cost of the edge\n Enter -1 to end
Input\n Edges start from Zero : 0 2 5
enter the source,destination and cost of the edge\n Enter -1 to end
Input\n Edges start from Zero : 0 3 2
enter the source,destination and cost of the edge\n Enter -1 to end
Input\n Edges start from Zero : 1 3 4
enter the source,destination and cost of the edge\n Enter -1 to end
Input\n Edges start from Zero : 2 3 7
enter the source,destination and cost of the edge\n Enter -1 to end
Input\n Edges start from Zero : -1 -1 -1
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<sys/time.h>
#define nano 1000000L
__global__ void dijkstras(int *a, int *b, int *n)
{
int i;
int d[10],p[10],v[10];
// d stores distnce/cost of each path
// p stores path taken
// v stores the nodes already travelled to
int k,u,s;
int check =0;
// shared memory on cuda device
__shared__ int temp[20];
for(i=0; i < (*n)*(*n); i++)
{
temp[i] = a[i];
}
check = check + 1;
__syncthreads();
// were passing int s -- node from which distances are calculated
s = threadIdx.x;
for(i=0; i<(*n); i++)
{
d[i]=temp[s*(*n)+i];
if(d[i]!=9999)
p[i]=1;
else
p[i]=0;
v[i]=0;
}
p[s]=0;
v[s]=1;
for(i=0; i<((*n)-1); i++)
{
// findmin starts here
int i1,j1,min=0;
for(i1=0;i1<(*n);i1++)
{
if(v[i1]==0)
{
min=i1;
break;
}
}
for(j1=min+1;j1<(*n);j1++)
{
if((v[j1]==0) && (d[j1]<d[min]))
min=j1;
}
k = min;
// findmin ends here
v[k]=1;
for(u=0; u<(*n); u++)
{
if((v[u]==0) && (temp[k*(*n)+u]!=9999))
{
if(d[u]>d[k]+temp[k*(*n)+u])
{
d[u]=d[k]+temp[k*(*n)+u];
p[u]=k;
}
}
}
//storing output
int count = 0;
for(i = (s*(*n)); i< (s+1) * (*n); i++)
{
b[i] = d[count];
count++;
}
}
*n = check;
}
main()
{
int *a, *b, *n;
int *d_a, *d_b, *d_n;
int i,j,c;
int check = 0;
printf("enter the number of vertices.... : ");
n = (int*)malloc(sizeof(int));
scanf("%d",n);
int size = (*n) * (*n) * sizeof(int);
//allocating device memory
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_n, sizeof(int));
a = (int*)malloc(size);
b = (int*)malloc(size);
check = check +1;
for(i=0; i<(*n); i++)
for(j=0; j<=i; j++)
if(i==j)
a[(i*(*n) + j)]=0;
else
a[(i*(*n) + j)]=a[(j*(*n) + i)]=9999;
printf("\nInitial matrix is\n");
for(i=0;i<(*n);i++)
{
for(j=0;j<(*n);j++)
{
printf("%d ",a[i*(*n)+j]);
}
printf("\n");
}
while(1)
{
printf("\n enter the source,destination and cost of the edge\n Enter -1 to end Input\n Edges start from Zero : \n");
scanf("%d %d %d",&i,&j,&c);
if(i==-1)
break;
a[(i*(*n) + j)]=a[(j*(*n) + i)]=c;
}
printf("\nInput matrix is\n");
for(i=0;i<(*n);i++)
{
for(j=0;j<(*n);j++)
{
printf("%d ",a[i*(*n)+j]);
}
printf("\n");
}
check = check +1;
// copying input matrix to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_n, n, sizeof(int), cudaMemcpyHostToDevice);
check++;
struct timeval start,stop;
double time;
int N = *n;
gettimeofday(&start,NULL);
dijkstras<<<1,N>>>(d_a, d_b, d_n);
gettimeofday(&stop,NULL);
time=(double)(stop.tv_sec-start.tv_sec)+(double)(stop.tv_usec-start.tv_usec)/(double)nano;
printf("\n TIME TAKEN: %lf\n",time);
check++;
// copying result from device to host
cudaMemcpy(b, d_b, size, cudaMemcpyDeviceToHost);
cudaMemcpy(n, d_n, sizeof(int), cudaMemcpyDeviceToHost);
check++;
// printing result
printf("the shortest paths are....");
for(i=0; i<(N); i++)
{
for(j=0; j<(N); j++)
{
if(i != j)
printf("\n the cost of the path from %d to %d = %d\n",i,j,b[i*(N) + j]);
}
printf("\n\n");
}
printf("your debug value of check in main is %d\n",check); //5
printf("your debug value of check in device is %d\n",*n); // 1+ 7+ 10
free(a); free(b);free(n);
cudaFree(d_a); cudaFree(d_b);cudaFree(d_n);
}

The root cause of this problem was supplying an uninitialised device variable as a kernel argument. In this kernel call:
dijkstras<<<1,N>>>(d_a, d_b, d_n);
d_n had been allocated memory, but never assigned a value, resulting in undefined behaviour within the kernel.
I would contend this proved hard for the original poster to detect because of a poor design decision in the kernel itself. In this prototype:
__global__ void dijkstras(int *a, int *b, int *n)
n was being used as both an input and an output with two completely different meanings, which made it far harder to detect the problem with the call. If the prototype was:
__global__ void dijkstras(int *a, int *b, int n, *int check)
then the role of n and checkwould be far clearer, and likelihood of making a mistake when calling the kernel and missing it when debugging would be lessened.

Related

How to implement Dijkstra Algorithm for finding shortest path from 1 node to all other in an undirected graph in C and print the distances as well

There is an undirected graph. You need to store all edge weights in a two-dimensional array cost[][], and calculate the shortest distance from the source node 0 to all other nodes. Suppose there are at most 100 nodes. If there is no edge between two nodes, we set their weight to a very large number, MAX_DIS=999999, to denote these two nodes are not connected directly.
In this exercise, you are required to fulfill the following two tasks.
Initialize Cost Array
Initially, we have the array cost[100][100] to store the edge cost. We input the total nodes number n and edges number m, and the input all edges with <x,y,w> format, where w is the weight of edge (x,y). If there is no edge between two nodes, we set the cost MAX_DIS.
Calculate Shortest Distance.
With the cost array, we need to compute the shortest distance between node 0 and all other nodes. Also, we need to initialize the distance array distance[100] at first. Then in each loop, we first find the min distance distance[w] and update other distance distance[v] if node v is adjacent to w.
//Below is my code for this challenge, but it is not working properly for all the test cases. It works fine for some but I can't figure out where is the problem. I hope this is a good challenge to be solved and that is why I am posting it here. Can you guys help me debug this code...
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define MAX_NODES 100
#define MAX_DIS 999999
int cost[MAX_NODES][MAX_NODES];
int distance[MAX_NODES];
void initial(int m, int n);
void Dijkstra(int n);
void initial(int m, int n)
{
/*
let user input all edges and their weights and initialize cost[][].
note that if no edge between (x,y), set cost[x][y]=MAX_DIS
and cost[a][b]=cost[b][a] for the undirected graph.
Fill in your code here...
*/
int i,j;
for(i=0;i<n;i++){
for(j=0;j<n;j++){
cost[i][j] = MAX_DIS;
}
}
cost[0][0] = 0;
int weight,x,y;
for(i=0; i < m; i++){
scanf("%d %d %d", &x,&y,&weight);
cost[x][y] = weight;
cost[y][x] = weight;
}
}
void Dijsktra(int n)
{
/*
Fill in your code here...
calculate the distance from node 0 to all other nodes.
*/
int i;
int S[n];
S[0] = 1;
int all_visited = 0;
for(i=1;i<n;i++){
S[i] = -1;
}
for(i=0;i<n;i++){
distance[i] = cost[0][i];
}
while(all_visited != 1){
int temp = MAX_DIS;
int pos = -1;
for(i=1;i<n;i++){
if(S[i] == -1 && cost[0][i] <= temp){
temp = cost[0][i];
pos = i;
}
}
S[pos] = 1;
for(i=0;i<n;i++){
if(S[i] == -1)
break;
}
if(i==n)
all_visited = 1;
for(i=1; i<n; i++){
distance[i] = (int)fmin(distance[i], distance[pos] + cost[pos][i]);
}
}
}
int main()
{
int m,n;
printf("Input the number of nodes:\n");
scanf("%d",&n);
printf("Input the number of edges:\n");
scanf("%d",&m);
printf("Input these edges:\n");
initial(m,n);
Dijsktra(n);
for(int i=0;i<n;i++)
printf("%d ",distance[i]);
return 0;
}
This is test case for which my code is failing -
Input the number of nodes:
8
Input the number of edges:
10
Input these edges:
0 1 2,
1 2 9,
2 3 4,
3 5 7,
2 4 8,
5 6 10,
6 7 8,
7 5 1,
7 3 4,
0 4 10
Expected output - 0 2 11 15 10 20 27 19
My output - 0 2 11 15 10 999999 999999 999999
use a break statement in this loop.
for(i=1;i<n;i++){
if(S[i] == -1 && cost[0][i] <= temp){
temp = cost[0][i];
pos = i;
break; //here
}
}

Pascal's Triangle fails past row 14

I have to write a program that outputs Pascal's triangle for a computer science class, and everything is correct on the output until it gets past row 14, wherein it starts outputting odd irrational numbers. Here's my code
#include <iostream>
#include "myFunctions.h"
using namespace std;
int main() {
int rows;
cout << "Please Enter The Number of Rows: ";
cin >> rows;
cout << rows << endl;
for (int i = 0; i < rows; i++) {
for (int j = 1; j < (rows - i + 1); j++) {
cout << " ";
}
for (int k = 0; k <= i; k++) {
if (k == 0) {
cout << "1" << " ";
} else {
cout << combination(i, k) << " ";
}
}
cout << "\n";
}
return 0;
}
And here's my functions file:
#ifndef MYFUNCTIONS_CPP_INCLUDED
#define MYFUNCTIONS_CPP_INCLUDED
#include "myFunctions.h"
double factorial (int n) {
assert(n >= 0);
int v = 1;
while (n > 0) {
v *= n;
n--;
}
return v;
}
double combination (int a, int b) {
return (factorial(a) / (factorial(a - b) * factorial(b)));
}
#endif // MYFUNCTIONS_CPP_INCLUDED
And, finally, here's my header file.
#ifndef MYFUNCTIONS_H_INCLUDED
#define MYFUNCTIONS_H_INCLUDED
#include <iostream>
#include <cassert>
//*******************************************************
// description: finds factorial of value *
// return: double *
// precondition: that the value is valid and an integer *
// postcondition: returns the factorial of value *
//*******************************************************
double factorial( int n );
//********************************************************
// description: finds combination of value *
// return: double *
// precondition: both values are integers and valid *
// postcondition: returns the combination of two values *
//********************************************************
double combination( int a, int b );
#endif // MYFUNCTIONS_H_INCLUDED
I'm assuming that I did the equations within functions incorrect, or something specific is happening in main once it hits 14. Any help is appreciated.
What's going on
ints in C++ have a maximum size. As mentioned in comments, depends on your platform but for the sake of this question, I'll assume it's 2^31-1 which corresponds to a 32-bit signed integer and is what I most commonly see.
The issue comes in when you get to factorials. They grow very quickly. 14!=87178291200 which is a whole lot bigger than the maximum size of a 32 bit int. There's no feasible way to keep the whole factorial in memory for an arbitrary n! because of how large they can get.
It's not that your code is broken, it's simply running up against the physical bounds of computing.
How can we fix it?
First off, you could cancel out factorials. Basically, since we can guarantee that a>=b, we know that a!/b! is just multiplying the numbers between a and b. We can do that with a loop. Then it's just a matter of dividing by (a-b)!, which we already know how to do. This would look like
int combination(int a, int b)
{
int tmp = 1;
for(int ii = b;ii<=a;ii++)
tmp*=ii;
tmp /= factorial(b);
return tmp;
}
More efficiently, we can switch to a different algorithm. Wikipedia recommends using an iterative method for pascal's triangle. That is, each element can be calculated from two elements in the row above it. As #Damien mentions in comments, if you're looking for the kth element in row n, then you can calculate that by
int Combination(int n,int k)
{
if (k == 0 or k>n or n <= 1)
return 1;
return Combination(n-1,k) + Combination(n-1,k-1);
}

For loop only executes once, raising a number to another number

New to programming/coding, and couldn't get why my code doesn't work.
It's supposed to show you the result of raising a number to another number. But it only ends up looping once.
Example of error:
Input an integer: 3
Raise integer to what number: 4
3 raised to 4 is 9
My code:
#include <stdio.h>
int raiseToPow(int nNum1, int *nResult) {
*nResult = nNum1 * nNum1;
}
int main() {
int nNum1, nNum2, nResult, i;
printf("Input an integer: ");
scanf("%d", &nNum1);
printf("\n");
printf("Raise integer to what number: ");
scanf("%d", &nNum2);
printf("\n");
for (i = 0; i < nNum2; i++) {
raiseToPow(nNum1, &nResult);
}
printf("%d raised to %d is %d", nNum1, nNum2, nResult);
}
you should initialize nResult by 1 because your variable doesn't have anything inside. Also, replace *nResult = nNum1 * nNum1 by *nResult = *nResult * nNum1
It's looping the right number of times, you just need to put the print inside the loop. Also you should add a newline to the end of your print.
for(i = 0; i < nNum2; i++)
{
raiseToPow(nNum1, &nResult);
printf("%d raised to %d is %d\n", nNum1, nNum2, nResult);
}

MPI_Scatterv: segmentation fault 11 on process 0 only

I'm trying to scatter values among processes belonging to an hypercube group (quicksort project).
Depending on the amount of processes I either create a new communicator excluding excessive processes, or I duplicate MPI_COMM_WORLD if it fits exactly any hypercube (power of 2).
In both cases, processes other than 0 receive their data, but:
- On first scenario, process 0 throws a segmentation fault 11
- On second scenario, nothing faults, but process 0 received values are gibberish.
NOTE: If I try a regular MPI_Scatter everything works well.
//Input
vector<int> LoadFromFile();
int d; //dimension of hypercube
int p; //active processes
int idle; //idle processes
vector<int> values; //values loaded
int arraySize; //number of total values to distribute
int main(int argc, char* argv[])
{
int mpiWorldRank;
int mpiWorldSize;
int mpiRank;
int mpiSize;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &mpiWorldRank);
MPI_Comm_size(MPI_COMM_WORLD, &mpiWorldSize);
MPI_Comm MPI_COMM_HYPERCUBE;
d = log2(mpiWorldSize);
p = pow(2, d); //Number of processes belonging to the hypercube
idle = mpiWorldSize - p; //number of processes in excess
int toExclude[idle]; //array of idle processes to exclude from communicator
int sendCounts[p]; //array of values sizes to be sent to processes
//
int i = 0;
while (i < idle)
{
toExclude[i] = mpiWorldSize - 1 - i;
++i;
}
//CREATING HYPERCUBE GROUP: Group of size of power of 2 -----------------
MPI_Group world_group;
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
// Remove excessive processors if any from communicator
if (idle > 0)
{
MPI_Group newGroup;
MPI_Group_excl(world_group, 1, toExclude, &newGroup);
MPI_Comm_create(MPI_COMM_WORLD, newGroup, &MPI_COMM_HYPERCUBE);
//Abort any processor not part of the hypercube.
if (mpiWorldRank > p)
{
cout << "aborting: " << mpiWorldRank <<endl;
MPI_Finalize();
return 0;
}
}
else
{
MPI_Comm_dup(MPI_COMM_WORLD, &MPI_COMM_HYPERCUBE);
}
MPI_Comm_rank(MPI_COMM_HYPERCUBE, &mpiRank);
MPI_Comm_size(MPI_COMM_HYPERCUBE, &mpiSize);
//END OF: CREATING HYPERCUBE GROUP --------------------------
if (mpiRank == 0)
{
//STEP1: Read input
values = LoadFromFile();
arraySize = values.size();
}
//Transforming input vector into an array
int valuesArray[values.size()];
if(mpiRank == 0)
{
copy(values.begin(), values.end(), valuesArray);
}
//Broadcast input size to all processes
MPI_Bcast(&arraySize, 1, MPI_INT, 0, MPI_COMM_HYPERCUBE);
//MPI_Scatterv: determining size of arrays to be received and displacement
int nmin = arraySize / p;
int remainingData = arraySize % p;
int displs[p];
int recvCount;
int k = 0;
for (i=0; i<p; i++)
{
sendCounts[i] = i < remainingData
? nmin+1
: nmin;
displs[i] = k;
k += sendCounts[i];
}
recvCount = sendCounts[mpiRank];
int recvValues[recvCount];
//Following MPI_Scatter works well:
// MPI_Scatter(&valuesArray, 13, MPI_INT, recvValues , 13, MPI_INT, 0, MPI_COMM_HYPERCUBE);
MPI_Scatterv(&valuesArray, sendCounts, displs, MPI_INT, recvValues , recvCount, MPI_INT, 0, MPI_COMM_HYPERCUBE);
int j = 0;
while (j < recvCount)
{
cout << "rank " << mpiRank << " received: " << recvValues[j] << endl;
++j;
}
MPI_Finalize();
return 0;
}
First of all, you are supplying wrong arguments to MPI_Group_excl:
MPI_Group_excl(world_group, 1, toExclude, &newGroup);
// ^
The second argument specifies the number of entries in the exclusion list and should therefore be equal to idle. Since you are excluding a single rank only, the resulting group has mpiWorldSize-1 ranks and hence MPI_Scatterv expects that both sendCounts[] and displs[] have that many elements. Of those only p elements are properly initialised and and the rest are random, therefore MPI_Scatterv crashes in the root.
Another error is the code that aborts the idle processes: it should read if (mpiWorldRank >= p).
I would recommend that the entire exclusion code is replaced by a single call to MPI_Comm_split instead:
MPI_Comm comm_hypercube;
int colour = mpiWorldRank >= p ? MPI_UNDEFINED : 0;
MPI_Comm_split(MPI_COMM_WORLD, colour, mpiWorldRank, &comm_hypercube);
if (comm_hypercube == MPI_COMM_NULL)
{
MPI_Finalize();
return 0;
}
When no process supplies MPI_UNDEFINED as its colour, the call is equivalent to MPI_Comm_dup.
Note that you should avoid using in your code names starting with MPI_ as those could clash with symbols from the MPI implementation.
Additional note: std::vector<T> uses contiguous storage, therefore you could do without copying the elements into a regular array and simply provide the address of the first element in the call to MPI_Scatter(v):
MPI_Scatterv(&values[0], ...);

run time error - non zero exception

my programming teacher gave me this problem to code it in c :
given array of N integers A and a number K. During a turn the maximal value over all Ai is chosen, let's call it MAX. Then Ai =
MAX - Ai is done for every 1 <= i <= N. Help Roman to find out how will the array look like after K turns.
Input
The numbers N and K are given in the first line of an input. Then N integers are given in the second line which denote the array A.
Output
Output N numbers on a single line. It should be the array A after K turns.
Constraints
* 1 <= N <= 10^5
* 0 <= K <= 10^9
* Ai does not exceed 2 * 10^9 by it's absolute value.
Example
Input:
4 1
5 -1 7 0
Output:
2 8 0 7
and my code to this problem is :
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
long int Max(long int *arr, int low, int high)
{
long int max,i;
max = arr[low];
for(i=0;i<=high;i++)
{
if(max<=arr[i])
max = arr[i];
}
return max;
}
/* Driver program to test above function */
int main()
{
long int max,*arr;
long int n,k,c1,c2,c3,i,j;
c1 = (long int)pow(10,5);
c2 = (long int)pow(10,9);
c3 = 2*c2;
scanf("%ld %ld",&n,&k);
if(n<1||n>c1)
exit(1);
else if(k<0||k>c2)
exit(1);
else
{
arr = (long int *)malloc(sizeof(int)*n);
for(i=0;i<n;i++)
{
scanf("%ld",&arr[i]);
if(abs(arr[i])>c3)
exit(1);
}
if(k%2 == 0)
{
for(i=0;i<2;i++)
{
max = Max(arr, 0, n-1);
for(j=0;j<n;j++)
{
arr[j] = max-arr[j];
if(abs(arr[j])>c3)
exit(1);
}
}
}
else if(k%2 != 0)
{
max = Max(arr, 0, n-1);
for(j=0;j<n;j++)
{
arr[j] = max-arr[j];
/*if(abs(arr[j])>c3)
exit(1);*/
}
}
/* for(m=0;m<n;m++)
printf("%ld ",arr[m]);
printf("\n");*/
for(i=0;i<n;i++)
printf("%ld ",arr[i]);
printf("\n");
}
return 0;
}
i executed this code on gcc compiler in ubuntu, it is working perfectly with all the constraints satisfied but when I uploaded this code on my teacher's portal which has a compiler and executed the code, it said Runtime error -
nzec which means a non-zero exception which is used to signify that main() does not have "return 0;" statement or exception thrown by c++ compiler.
Please, can anyone help me what is wrong in my code as there is a return 0; statement in my code. Please Help.
Everyone has pointed out multiple use of exits ... Can I reduce them using any other way in place of exit()?
My guess is that it has to do with the various exit(1) statements you have for error conditions.
As pointed out by Dave Costa, exit(1) could be the cause
Another possible problem is the size of the allocated array:
arr = (long int *)malloc(sizeof(int)*n);
should be:
arr = malloc(sizeof(long int)*n);
And note that you don't need to use pow for constants:
c1 = (long int)pow(10,5);
c2 = (long int)pow(10,9);
could be replaced with:
c1 = 1e5L;
c2 = 1e9L;