two-dimensional arrays in CUDA - c++

I'm practicing with this simple code, which takes two two-dimensional arrays and sums them up with CUDA. In the end, the result in C is not what I am expecting. Also, I was wondering whether I can use vectors instead of C-style arrays.
#include <iostream>
using namespace std;
#define N 2
// Element-wise addition of two N x N matrices on the GPU: c = a + b.
//
// The matrices are passed as flattened, row-major double* arrays.
// The original double** version was broken: a static host array
// double[N][N] is one contiguous block of doubles, not an array of row
// pointers, so cudaMemcpy'ing it into a double** filled the device-side
// pointer table with garbage values that the kernel then dereferenced.
//
// Expects a single block of N x N threads (threadIdx.x = row,
// threadIdx.y = column).
__global__ void MatAdd(double* a, double* b, double* c)
{
    int i = threadIdx.x; // row
    int j = threadIdx.y; // column
    c[i * N + j] = a[i * N + j] + b[i * N + j];
}

int main()
{
    double a[N][N] = {{1.0, 2.0}, {3.0, 4.0}};
    double b[N][N] = {{1.0, 2.0}, {3.0, 4.0}};
    double c[N][N]; // it will be the result!

    double *a_d, *b_d, *c_d;       // flat device buffers
    size_t d_size = N * N * sizeof(double);
    int numBlocks = 1;
    dim3 threadsPerBlock(N, N);

    cudaMalloc(&a_d, d_size);
    cudaMalloc(&b_d, d_size);
    cudaMalloc(&c_d, d_size);

    // A static double[N][N] is contiguous, so one flat copy per matrix
    // suffices. Copying the uninitialized 'c' to the device (as the
    // original did) is unnecessary: the kernel overwrites every cell.
    cudaMemcpy(a_d, a, d_size, cudaMemcpyHostToDevice);
    cudaMemcpy(b_d, b, d_size, cudaMemcpyHostToDevice);

    MatAdd<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d);
    cudaError_t err = cudaGetLastError(); // catch launch-config errors
    if (err != cudaSuccess)
        cout << "kernel launch failed: " << cudaGetErrorString(err) << endl;

    // cudaMemcpy blocks until the kernel finishes, so no explicit
    // cudaDeviceSynchronize() is required before reading 'c'.
    cudaMemcpy(c, c_d, d_size, cudaMemcpyDeviceToHost);

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            cout << c[i][j] << endl;
        }
    }

    cudaFree(a_d);
    cudaFree(b_d);
    cudaFree(c_d);
    return 0;
}

You must not use the double** type in this case. Alternatively, you should use a flatten array that contains all the values of a given matrix in a double*-type variable.
The heart of the problem is located in the following line (and the similar next ones):
cudaMemcpy(a_d, a, d_size, cudaMemcpyHostToDevice);
Here you assume that a and a_d are compatible types, but they are not. A double**-typed variable is a pointer that refers to one or more pointers in memory (typically an array of pointers referencing many different double-typed arrays), while a double*-typed variable or a static 2D C array refers to a contiguous location in memory.
Note that you can access a given (i,j) cell of a matrix using matrix[N*i+j], where N is the number of columns, assuming matrix is a flattened matrix of type double* and uses a row-major ordering.

Related

I am getting zeros as a result of vector addition in CUDA and no errors

I am running a CUDA vector addition program and getting zeros as the output of its sum later. I have tried debugging but am not able to get to the problem at hand. It should be adding the numbers but is instead simply printing out zeros, which I am not able to understand why is happening.
I have tried doing everything to the code and still I am not getting any output.
using namespace std;
// Element-wise vector addition: C[id] = A[id] + B[id] for id < n.
// One thread per element; the id < n guard handles the grid tail when
// n is not a multiple of the block size.
__global__ void vecADDKernal(double *A, double *B, double *C, int n){
// flat global index from block and thread coordinates
int id = blockIdx.x*blockDim.x+threadIdx.x;
if(id<n) C[id] = A[id] + B[id];
}
// Host driver: fills two n-element vectors, adds them on the GPU with
// vecADDKernal, prints every result element and the accumulated sum.
int main( ){
    int n = 1048576;
    size_t size = n*sizeof(double);

    double *d_A, *d_B, *d_C;   // device buffers
    double *h_A, *h_B, *h_C;   // host buffers
    h_A = (double*)malloc(size);
    h_B = (double*)malloc(size);
    h_C = (double*)malloc(size);
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Initialize vectors on host
    for( int i = 0; i < n; i++ ) {
        h_A[i] = 2*i;
        h_B[i] = 3*i;
    }

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    int blockSize = 256;
    // Number of thread blocks in grid, via integer ceiling division.
    // The original ceil(n/blockSize) truncated n/blockSize *before*
    // rounding, silently dropping the tail whenever n is not a
    // multiple of blockSize.
    int gridSize = (n + blockSize - 1) / blockSize;
    vecADDKernal<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
    cudaError_t err = cudaGetLastError(); // catch launch-config errors
    if (err != cudaSuccess)
        cout << "kernel launch failed: " << cudaGetErrorString(err) << endl;

    // Blocking copy; also synchronizes with (and surfaces errors from)
    // the kernel above.
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    double sum = 0;
    for(int a = 0; a<n; a++) {
        sum += h_C[a];   // was '=', which kept only the last element
        cout<<h_C[a]<<endl;
    }
    cout<<"HI "<< sum <<endl;

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);   // host buffers were leaked in the original
    free(h_B);
    free(h_C);
    return 0;
}
I just ran your code as is (only adding the necessary includes) and I got non-zero output. Have you verified that your device can run the nvidia provided sample successfully?
The sample is exactly what you are trying to do with vector addition, but with proper error checking and result verification.
A few notes:
The first line in your for loop assigns (=) a value to sum, instead
of adding the value to sum (+=), so you will only have the last value
in sum instead of the accumulated value.
Proper error checking helps, even with trivial examples. The sample
provides an example as does the answer Robert linked.
Did you try opening up the memory debugger to see if your values were
infact 0 in memory? printing to console is another place things can
go wrong.
You can use vectors to store your host data. You can access the raw
array for memCpy operations with vector.data() and it gives you easy
access to all sorts of useful things like range based for as well as
things like accumulate and fill functions.

Find the index of an element in an array with cuda c++

I am new with cuda.
I have two arrays:
int* AA = new int[5]{1,2,3,4,5};
int* BB = new int[5]{ 2,2,2,4,4 };
and I want to find the index of every element in AA that is equal to each element in BB that in this case is
{1,1,1,3,3}
here is My code:
// For each element B[i], scans all 5 elements of A and stores in C[i]
// the index j of the *last* A[j] equal to B[i] (later matches overwrite
// earlier ones). If no match exists, C[i] is left untouched.
// Launched with one thread per element of B (i = threadIdx.x).
// NOTE(review): the array length 5 is hard-coded; it must match the
// allocation size and the launch configuration -- confirm before reuse.
__global__ void findIndex(int* A, int* B, int* C)
{
int i = threadIdx.x;
// linear search of A for the value B[i]
for (int j = 0; j < 5; j++)
{
if (B[i] == A[j])
{
C[i] = j;
}
}
}
// Host driver for findIndex: uploads A, B and the zero-initialized
// result array C, runs one thread per element of B, and prints the
// matched indices.
int main() {
    int* AA = new int[5]{1,2,3,4,5};
    int* BB = new int[5]{ 2,2,2,4,4 };
    int* CC = new int[5]{ 0,0,0,0,0 };

    int *ppA, *ppB, *ppC;   // device copies of AA, BB, CC
    cudaMalloc((void**)&ppA, (5) * sizeof(int));
    cudaMalloc((void**)&ppB, (5) * sizeof(int));
    cudaMalloc((void**)&ppC, (5) * sizeof(int));
    cudaMemcpy(ppA, AA, 5 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(ppB, BB, 5 * sizeof(int), cudaMemcpyHostToDevice);
    // CC is uploaded so unmatched slots keep their initial 0
    cudaMemcpy(ppC, CC, 5 * sizeof(int), cudaMemcpyHostToDevice);

    int numBlocks = 1;
    dim3 threadsPerBlock(5);
    findIndex << <numBlocks, threadsPerBlock >> > (ppA, ppB, ppC);
    cudaError_t err = cudaGetLastError(); // catch launch errors
    if (err != cudaSuccess)
        printf("kernel launch failed: %s\n", cudaGetErrorString(err));

    // blocking copy: synchronizes with the kernel before reading CC
    cudaMemcpy(CC, ppC, 5 * sizeof(int), cudaMemcpyDeviceToHost);
    for (int m = 0; m < 5; m++) {
        printf("%d ", CC[m]);
    }

    // the original leaked both the device and host allocations
    cudaFree(ppA);
    cudaFree(ppB);
    cudaFree(ppC);
    delete[] AA;
    delete[] BB;
    delete[] CC;
}
My output is:
{1,2,3,0,0}
Can anyone help?
Simplest non-stable single-gpu solution would be using atomics, something like this:
// Stream-compaction by atomics: every thread whose cell arr[id] equals
// 4 claims a unique slot in 'result' (via atomicAdd on the shared
// counter) and writes its own index there. The output order is
// nondeterministic because atomic ordering is not defined.
// NOTE(review): there is no bounds check on 'id' -- the launch grid
// must cover exactly the array length, or out-of-bounds reads occur.
__global__ void find(int * arr,int * counter, int * result)
{
int id = blockIdx.x*blockDim.x+threadIdx.x;
if(arr[id] == 4)
{
// atomicAdd returns the pre-increment value: a unique output slot
int ctr = atomicAdd(counter,1);
result[ctr] = id;
}
}
this way you can have an array of results in "result" array and if the wanted number is sparse it wouldn't slowdown much (like only few in whole source array). This is not an optimal way for multi-gpu systems, though. Requires host-side coordination between gpus, unless a special CUDA feature from newest toolkit is used (for system-level atomics).
If number of 4s leads to a "dense" arr array or if you have multiple gpus, then you should look for other solutions like stream compaction. First select the cells containing 4 as a mask. Then do the compaction. Some Nvidia blogs or tutorials have this algorithm.
For the "atomic" solution (especially on "shared" memory atomics), Maxwell (and onwards) architecture is much better than Kepler, just in case you still use a Kepler. Also using atomics is not exactly reproducible as the order of atomic operations can not be known. You will get a differently-ordered result array most of the time. But the stream compaction preserves the result order. This may save you from writing a sorting algorithm (like bitonic-sort, shear-sort, etc) on top of it.

Passing multidimensional arrays (without initially defining its dimensions) into a function in C++

I have been looking for code on how to create a matrix determinant calculator, and I found code from ( Matrix determinant algorithm C++ ).
I adopted the code to try it, but I realized that I did not know how to pass a multidimensional array into the function without defining its dimensions first (I got errors).
So, can you please show me how to pass a multidimensional array into a function without defining its dimensions first (i.e., how int a[MAX][MAX] can be an argument, and what 'MAX' is)?
I hope my question is clear, thank you for your help.
This is the (edited) code:
-sample input would be a square matrix and its size.
// Recursive Laplace (cofactor) expansion along row 0 of an n x n matrix.
// NOTE(review): as posted this is not standard C++ and will not compile:
// 'int oMat[][]' omits the required inner dimension, and 'int sMat[n][n]'
// is a variable-length array (a non-standard extension). The answers
// below give working alternatives (fixed inner dimension, a template
// deducing the size, or a flattened 1D array with manual indexing).
int determinant(int oMat[][], int n){
int sMat[n][n]; //Temporary matrix
int det = 0; //Initializing 'det'.
if(n==1){
det = oMat[0][0]; //Calculating.
return det;
}else if(n==2){
det = oMat[0][0]*oMat[1][1] - oMat[1][0]*oMat[0][1]; //Formula for 2x2.
return det;
}else{
for(int p=0; p<n; p++){ //Selecting 'oMat' row one.
int k=0; //'sMat' rows.
int m=0; //'sMat' columns.
//Creating the temporary matrix 'sMat' (the minor of row 0, column p).
for(int i=1; i<n; i++){ //for 'oMat' rows.
for(int j=0; j<n; j++){
if(j==p){
continue;
}
sMat[k][m] = oMat[i][j]; m++;
if(m==n-1){
k++; //Go to the next row.
m = 0; //Start at column one (index 0).
}
}
}
// NOTE(review): pow(-1,p) yields a double mixed into integer math;
// an integer sign flip ((p % 2) ? -1 : 1) would avoid any rounding.
det = det + oMat[0][p] * pow(-1,p) * determinant(sMat, n-1);
}
return det;
}
}
You can pass an array by reference and have the function figure out the size of the matrix. To do this use a template as follows
// Deduces the square-matrix dimension R at compile time by taking the
// array by reference. Works only for genuine int[R][R] arrays -- a
// pointer (decayed array) will not match this signature.
template <int R>
int determ(int (&a)[R][R]) {
//now R is the number of rows
std::cout << R << std::endl;
// rest of code here
// NOTE(review): declared to return int, but no return is shown;
// flowing off the end of a value-returning function is undefined
// behavior -- the omitted body must return the determinant.
}
If you're passing a 2-D array to function determ() as
int a[MAX][MAX];
determ(a, int n);
the function declaration should be:
determ(int a[][MAX], int n) { .. }
In the above declaration number of rows need not be specified as we are not allocating the memory for the array hence can be ignored. Number of columns is required for the dimension of the array.
There are two ways:
1) Declare your function as is:
int determ(int** a, int size, int n) {...} // 'size' aka 'MAX'
2) Use dynamic container such as std::vector:
int determ(const vector<vector<int>>& a, int n) {...}
You have a number of options:
Use a compile time size like MAX, and declare your array parameter in terms of that:
int determinant(int (*oMat)[MAX], int n) {
With that you are passing a pointer to the first element (= row) of your 2D array. The same can be done with the std::array<> template:
int determinant(array<array<int, MAX>, MAX> *oMat, int n) {
However, in both cases MAX must be a compile time constant. You may pass MAX in as a template parameter, allowing different code to work on matrices of different size, but the problem remains that MAX needs to be a compile time constant.
If you want to work with a dynamic size in C++, you have two choices: First you can use a vector of vector
int determinant(vector<vector<int> > *oMat, int n) {
This allows maximum flexibility, however, it does not enforce that the lines have the same size. oMat could be a ragged 2D array. The other way is to use a 1D array and calculate the indexes yourself:
int determinant(int size, int *oMat, int n) {
for(int i = 0; i < size; i++)
for(int j = 0; j < size; j++)
//Access the individual elements with
oMat[size*i + j];
The third approach is to drop back to C, because in contrast to C++, C allows for true dynamic 2D arrays:
int determinant(int size, int (*oMat)[size], int n) {
for(int i = 0; i < size; i++)
for(int j = 0; j < size; j++)
//Access the individual elements with
oMat[i][j];
You would create the array and call the function like this:
int n = /*whatever*/;
int (*matrix)[n] = malloc(n*sizeof(*matrix));
fillMatrix(n, matrix);
determinant(n, matrix, n);
free(matrix); //Don't forget the cleanup.

Assigning one 2D array to another

So in my program I have a function which passes into it a 2D array and I want to set another 2D array equal to that 2D array. I am coding this in C++ and can't quite get the syntax right. Right now I have the following:
// Question stub: intends to copy the 4x4 matrix 'two_d_array' into the
// local array. NOTE(review): the double-pointer parameter only works if
// the caller passes an array of row pointers; a float[4][4] does not
// convert to float** (see the answers below).
void MyFunction(float **two_d_array){
float newArray[4][4];
//Set new array equal to two_d_array
}
two_d_array will also always be 4x4 so the dimensions themselves aren't an issue.
I hope you are not passing a two-dimensional array as a double pointer to your function.
Anyways, you can just write
for (int i = 0; i < 4; i++)
for (int j = 0; j < 4; j++)
newArray[i][j] = two_d_array[i][j];
If you have another two-dimensional array (and not an array of pointers), then simply use memcpy():
// Copies the 4x4 matrix 'arr' into a local array, element by element.
// Both objects are contiguous float[4][4] blocks of identical layout,
// so this is equivalent to a single memcpy of sizeof(newArray) bytes.
void foo(float arr[][4])
{
    float newArray[4][4];
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
            newArray[r][c] = arr[r][c];
}
When you define a two dimentional array as
float a[4][4]
Its type is float [4][4].
if you want to pass float** to the function you can create your with float**
float** f = (float**) malloc(sizeof(float *)*4);
for(int i=0;i<4;i++){
f[i] = (float*) malloc(sizeof(float)*4);
}
//initialize
MyFunction(f);
And Myfunction will be similar
// Copies a 4x4 matrix, given as an array of four row pointers, into a
// local float[4][4]. Each row pointer is fetched once, then its four
// elements are copied in order.
void MyFunction(float **two_d_array){
    float newArray[4][4];
    for (int row = 0; row < 4; ++row) {
        const float *src = two_d_array[row];
        for (int col = 0; col < 4; ++col) {
            newArray[row][col] = src[col];
        }
    }
}
Yes, you must specify the second dimension, and pass the first dimension as an integer parameter; it will look like this:
void foo(float arr[][4], int dim);

Efficient way to copy strided data (to and from a CUDA Device)?

Is there a possibility to copy data strided by a constant (or even non-constant) value to and from the CUDA device efficiently?
I want to diagonalize a large symmetric matrix.
Using the jacobi algorithm there is a bunch of operations using two rows and two columns within each iteration.
Since the matrix itself is too big to be copied to the device entirely, I am looking for a way to copy the two rows and columns to the device.
It would be nice to use the triangular matrix form to store the data but additional downsides like
non-constant row-length [not that Kind of a Problem]
non-constant stride of the column values [the stride increases by 1 for each row.]
arise.
[edit: Even using triangular form it is still impossible to store the whole Matrix on the GPU.]
I looked at some timings and recognized that copying strided values one by one is very slow (synchronous as well as async.).
// edit: removed solution - added an answer
Thanks to Robert Crovella for giving the right hint to use cudamemcpy2d.
I'll append my test code to give everyone the possibility to comprehend...
If anyone Comes up with suggestions for solving the copy problem using row-major-ordered triangular matrices - feel free to write another answer please.
// Fills arr with 'value', one element per block (indexed by blockIdx.x).
// Callers below launch it as <<<len, 1>>>, so the grid size must equal
// the array length; there is no bounds guard.
__global__ void setValues (double *arr, double value)
{
arr[blockIdx.x] = value;
}
// Demonstrates copying one row (contiguous) and one column (strided) of
// a row-major R x C matrix to the device, overwriting both on the GPU,
// and copying them back -- using cudaMemcpy2D for the strided column.
int main( void )
{
    // define consts
    static size_t const R = 10, C = 10, RC = R*C;
    // create matrices and initialize
    double * matrix = (double*) malloc(RC*sizeof(double)),
    *final_matrix = (double*) malloc(RC*sizeof(double));
    for (size_t i=0; i<RC; ++i) matrix[i] = rand()%R+10;
    memcpy(final_matrix, matrix, RC*sizeof(double));
    // create vectors on the device
    double *dev_col, *dev_row,
    *h_row = (double*) malloc(C*sizeof(double)),
    *h_col = (double*) malloc(R*sizeof(double));
    cudaMalloc((void**)&dev_row, C * sizeof(double));
    cudaMalloc((void**)&dev_col, R * sizeof(double));
    // choose row / col to copy
    size_t selected_row = 7, selected_col = 3;
    // since we are in row-major order we can copy the row at once
    cudaMemcpy(dev_row, &matrix[selected_row*C],
    C * sizeof(double), cudaMemcpyHostToDevice);
    // the column needs cudaMemcpy2D: R "rows" of one element each,
    // with a source pitch of C*sizeof(double) between elements
    cudaMemcpy2D(dev_col, sizeof(double), &matrix[selected_col],
    C*sizeof(double), sizeof(double), R, cudaMemcpyHostToDevice);
    // copy back to host to check whether we got the right column and row
    cudaMemcpy(h_row, dev_row, C * sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_col, dev_col, R * sizeof(double), cudaMemcpyDeviceToHost);
    // change values to evaluate backcopy
    setValues<<<R, 1>>>(dev_col, 88.0); // column should be 88
    setValues<<<C, 1>>>(dev_row, 99.0); // row should be 99
    cudaError_t err = cudaGetLastError(); // catch kernel launch errors
    if (err != cudaSuccess)
        printf("kernel launch failed: %s\n", cudaGetErrorString(err));
    // backcopy (blocking copies, so they also synchronize with kernels)
    cudaMemcpy(&final_matrix[selected_row*C], dev_row,
    C * sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy2D(&final_matrix[selected_col], C*sizeof(double), dev_col,
    sizeof(double), sizeof(double), R, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    // output for checking functionality
    printf("Initial Matrix:\n");
    for (size_t i=0; i<R; ++i)
    {
        for (size_t j=0; j<C; ++j) printf(" %lf", matrix[i*C+j]);
        printf("\n");
    }
    // %zu is the correct conversion for size_t; the original "%u" is
    // undefined behavior on LP64 platforms (size_t is 64-bit there)
    printf("\nRow %zu values: ", selected_row);
    for (size_t i=0; i<C; ++i) printf(" %lf", h_row[i]);
    printf("\nCol %zu values: ", selected_col);
    for (size_t i=0; i<R; ++i) printf(" %lf", h_col[i]);
    printf("\n\n");
    printf("Final Matrix:\n");
    for (size_t i=0; i<R; ++i)
    {
        for (size_t j=0; j<C; ++j) printf(" %lf", final_matrix[i*C+j]);
        printf("\n");
    }
    cudaFree(dev_col);
    cudaFree(dev_row);
    free(matrix);
    free(final_matrix);
    free(h_row);
    free(h_col);
    cudaDeviceReset();
    return 0;
}