I was messing with a toy program for CUDA.
I declare a float array, transfer it to the GPU, add a number to each element of that array in a kernel, transfer it back to the host, and print the array. However, this is not working out and it gives me a segmentation fault.
Here's the code:
#include <iostream>
using namespace std;

__global__ void kern(float *a, float *C){
    for (int i = 0; i < 3; i++) C[i] = a[i] + i;
}

int main(){
    float *A = new float[3];
    for(int i = 0; i < 3; i++){
        A[i] = i;
    }
    float *d;
    float *C;
    cudaMalloc(&C, sizeof(float)*3);
    cudaMalloc(&d, sizeof(float)*3);
    cudaMemcpy(&d, A, sizeof(float)*3, cudaMemcpyHostToDevice);
    kern<<<1, 1>>>(d, C);
    cudaMemcpy(&A, C, sizeof(float)*3, cudaMemcpyDeviceToHost);
    cout << A[2];
}
Also, I am not familiar with malloc; most of my experience is with C++, so I am more comfortable with new datatype[]. Is there an equivalent for CUDA?
Change these lines:
cudaMemcpy(&d, A, sizeof(float)*3, cudaMemcpyHostToDevice);
cudaMemcpy(&A, C, sizeof(float)*3, cudaMemcpyDeviceToHost);
to these:
cudaMemcpy(d, A, sizeof(float)*3, cudaMemcpyHostToDevice);
cudaMemcpy(A, C, sizeof(float)*3, cudaMemcpyDeviceToHost);
Also, it's always better to store the return codes of CUDA calls; they will give you a better idea of what's going wrong.
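For example, a minimal sketch of what that checking could look like for the code above:
cudaError_t err = cudaMemcpy(d, A, sizeof(float) * 3, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
    cout << "cudaMemcpy failed: " << cudaGetErrorString(err) << endl;
    return 1;
}
kern<<<1, 1>>>(d, C);
err = cudaGetLastError();   // catches kernel launch errors
if (err != cudaSuccess) {
    cout << "kernel launch failed: " << cudaGetErrorString(err) << endl;
    return 1;
}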
Related
#include <stdio.h>
#define N 1024

int main(){
    int i, j;
    int a[N][N];
    int b[N][N];

    for (i = 0; i < N; i++){
        a[i][i] = i;
        b[i][i] = i;
    }

    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
        {
            printf("%d", a[i][j]);
            printf("%d", b[i][j]);
        }

    return 0;
}
This program causes a segmentation fault, but if I define N as 1023, the program works correctly. Why does this happen?
You are overflowing the stack. 2 * 1024 * 1024 * sizeof(int) (8 MB with a 4-byte int) is at or beyond the default stack limit on most systems.
The simplest solution would be to make the arrays static.
static int a[N][N];
static int b[N][N];
Other methods:
Make the arrays global (this is essentially the same as the above)
Use malloc in a loop, and of course remember to free:
int **a = malloc(N * sizeof *a);        /* array of N row pointers */
for (i = 0; i < N; i++)
    a[i] = malloc(N * sizeof *a[i]);    /* each row holds N ints */
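The matching cleanup, so nothing leaks, would be roughly:
for (i = 0; i < N; i++)
    free(a[i]);   /* free each row */
free(a);          /* then the array of row pointers */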
I have a sparse matrix in cuSPARSE and I want to extract the diagonal. I can't seem to find a way to do it other than copying it back to CPU memory into an Eigen SparseMatrix, using Eigen's .diagonal() to do it, and then copying the result back to the GPU. Obviously this is pretty inefficient, so I am wondering if there is a way to do it directly on the GPU. Please see the code below for reference:
void CuSparseTransposeToEigenSparse(
    const int *d_row,
    const int *d_col,
    const double *d_val,
    const int num_non0,
    const int mat_row,
    const int mat_col,
    Eigen::SparseMatrix<double> &mat){
  std::vector<int> outer(mat_col + 1);
  std::vector<int> inner(num_non0);
  std::vector<double> value(num_non0);

  cudaMemcpy(
      outer.data(), d_row, sizeof(int) * (mat_col + 1), cudaMemcpyDeviceToHost);
  cudaMemcpy(
      inner.data(), d_col, sizeof(int) * num_non0, cudaMemcpyDeviceToHost);
  cudaMemcpy(
      value.data(), d_val, sizeof(double) * num_non0, cudaMemcpyDeviceToHost);

  Eigen::Map<Eigen::SparseMatrix<double>> mat_map(
      mat_row, mat_col, num_non0, outer.data(), inner.data(), value.data());
  mat = mat_map.eval();
}

int main(){
  int *d_A_row;
  int *d_A_col;
  double *d_A_val;
  int A_len;
  int num_A_non0;
  double *d_A_diag;

  // these values are filled with some computation

  // current solution
  Eigen::SparseMatrix<double> A;
  CuSparseTransposeToEigenSparse(
      d_A_row, d_A_col, d_A_val, num_A_non0, A_len, A_len, A);
  Eigen::VectorXd A_diag = A.diagonal();
  cudaMemcpy(d_A_diag, A_diag.data(), sizeof(double) * A_len, cudaMemcpyHostToDevice);

  // is there a way to fill in d_A_diag without copying back to the CPU?
  return 0;
}
Just in case anyone is interested, I figured it out for the case of a CSR matrix. The custom kernel looks like this:
__global__ static void GetDiagFromSparseMat(const int *A_row,
                                            const int *A_col,
                                            const double *A_val,
                                            const int A_len,
                                            double *A_diag){
  // one thread per row of the CSR matrix
  const int x = blockIdx.x * blockDim.x + threadIdx.x;

  if (x < A_len){
    const int num_non0_row = A_row[x + 1] - A_row[x];
    A_diag[x] = 0.0;

    // scan the non-zeros of row x for the entry in column x
    for (int i = 0; i < num_non0_row; i++){
      if (A_col[i + A_row[x]] == x){
        A_diag[x] = A_val[i + A_row[x]];
        break;
      }
    }
  }
}
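For reference, a possible launch of that kernel (the block size of 256 is just an arbitrary choice on my part, not something the kernel requires) would be:
const int block_size = 256;                                    // arbitrary; one thread per matrix row
const int grid_size = (A_len + block_size - 1) / block_size;   // round up so every row is covered
GetDiagFromSparseMat<<<grid_size, block_size>>>(d_A_row, d_A_col, d_A_val, A_len, d_A_diag);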
This is my code:
#include "stdafx.h"
#include <iostream>
using namespace std;
#define n 10
__device__ int glMem[n];
__global__ void initVals()
{
for(int i=0;i<n;i++)
glMem[i] = 0;
}
__global__ void test(int *out)
{
for(int i=0;i<n;i++)
out[i] = 10;
}
int main()
{
const size_t sz = size_t(n)*sizeof(int);
initVals<<<1,1>>>();
int *devMem;
cudaMalloc((void **)&devMem, sz);
test<<<1, 1>>>(devMem);
int *hoMem=new int[n];
cudaMemcpy(hoMem, devMem,sz, cudaMemcpyDeviceToHost);
//print
for(int i=0;i<n;i++)
cout<<hoMem[i]<<endl;
return 0;
}
In this code I define glMem with size n. If I don't know the size in advance, how can I define it? For example, I need to define it like this:
__device__ int *glMem;
It doesn't work. Please give a code sample.
In that case you need to allocate the memory on the device.
// size of the data
unsigned int size_of_glMem = n * sizeof(int);
// allocate device memory for the result
int *glMem = NULL;
cudaMalloc((void**)&glMem, size_of_glMem);
Hope this helps.
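If you want to keep the __device__ int *glMem; declaration from the question, one possible sketch (assuming the pointer is only dereferenced inside kernels) is to allocate with cudaMalloc and then copy the resulting device pointer into the symbol with cudaMemcpyToSymbol:
__device__ int *glMem;                               // declared with no size

// host side, once the size n is known:
int *devPtr = NULL;
cudaMalloc((void **)&devPtr, n * sizeof(int));       // allocate the device buffer
cudaMemcpyToSymbol(glMem, &devPtr, sizeof(devPtr));  // make glMem point at it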
I've passed a 2D array from a C++ class to a CUDA function; however, once in the CUDA function the data in the matrix is gone. I'm still on the host, not the device, so I don't understand what I've done wrong, as this should be very straightforward.
Here is the C++ code:
int main()
{
    const int row = 8;
    const int column = 8;
    int rnum;

    srand(time(0));
    rnum = (rand() % 100) + 1;

    float table[row][column];
    for(int r = 0; r < row; r++){
        for(int c = 0; c < column; c++){
            table[row][column] = (rand() % 100) + 1.f;
            cout << table[row][column] << " ";
        }
        cout << "\n";
    }

    //CUDA
    handleMatrix(&table[0][0], 8);
}
Here is the CUDA code that is just printing out the matrix.
void handleMatrix(float *A, int size)
{
    printf("&A[0]=%i\n", &A);
    printf("A[0] is %f \n", A[0]);
    for(int j = 0; j < size; j++){
        for(int k = 0; k < size; k++){
            printf("%f ", A[j + size*k]); // << " ";
        }
        printf("\n");
    }
}
In the C++ file, the printout of the matrix has real numbers, but the CUDA function just prints out 0's for both the matrix and the address of A[0]. I don't know if this means I'm not passing the matrix correctly between the two, or if there is something I should do with the matrix once I get it into the CUDA function.
Ha, it took a while to find it. Check the indexing in your matrix randomization code. :) You're using the wrong variables, so you never initialize the float values.
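In other words, the randomization loop should index with the loop counters, roughly:
table[r][c] = (rand() % 100) + 1.f;   // r and c, not row and column
cout << table[r][c] << " ";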
float *A is a pointer in host space, not device space; use cudaMalloc + cudaMemcpy if you need the data on the device.
float *A doesn't pass the contents, only the address.
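If the goal is to work on the matrix in device code, a rough sketch using the names from the question would be:
float *d_table = NULL;
cudaMalloc((void **)&d_table, row * column * sizeof(float));                              // device buffer
cudaMemcpy(d_table, &table[0][0], row * column * sizeof(float), cudaMemcpyHostToDevice);  // copy host -> device
// ... launch a kernel that reads d_table ...
cudaFree(d_table);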