How to get the diagonal of a sparse matrix in cuSparse? - c++

I have a sparse matrix in cuSparse and I want to extract the diagonal. I can't seem to find a way to do it other than copying it back to CPU memory into an Eigen SparseMatrix, using the .diagonal() method provided by Eigen, and then copying the result back to the GPU. Obviously this is pretty inefficient, so I am wondering if there is a way to do it directly on the GPU. Please see the code below for reference:
void CuSparseTransposeToEigenSparse(
    const int *d_row,
    const int *d_col,
    const double *d_val,
    const int num_non0,
    const int mat_row,
    const int mat_col,
    Eigen::SparseMatrix<double> &mat){
    std::vector<int> outer(mat_col + 1);
    std::vector<int> inner(num_non0);
    std::vector<double> value(num_non0);

    cudaMemcpy(
        outer.data(), d_row, sizeof(int) * (mat_col + 1), cudaMemcpyDeviceToHost);
    cudaMemcpy(
        inner.data(), d_col, sizeof(int) * num_non0, cudaMemcpyDeviceToHost);
    cudaMemcpy(
        value.data(), d_val, sizeof(double) * num_non0, cudaMemcpyDeviceToHost);

    Eigen::Map<Eigen::SparseMatrix<double>> mat_map(
        mat_row, mat_col, num_non0, outer.data(), inner.data(), value.data());
    mat = mat_map.eval();
}

int main(){
    int *d_A_row;
    int *d_A_col;
    double *d_A_val;
    int A_len;
    int num_A_non0;
    double *d_A_diag;
    // these values are filled with some computation

    // current solution
    Eigen::SparseMatrix<double> A;
    CuSparseTransposeToEigenSparse(
        d_A_row, d_A_col, d_A_val, num_A_non0, A_len, A_len, A);
    Eigen::VectorXd A_diag = A.diagonal();
    cudaMemcpy(d_A_diag, A_diag.data(), sizeof(double) * A_len, cudaMemcpyHostToDevice);
    // is there a way to fill in d_A_diag without copying back to CPU?

    return 0;
}

Just in case anyone is interested: I figured it out for the case of a CSR matrix. The custom kernel to do it looks like this:
__global__ static void GetDiagFromSparseMat(const int *A_row,
                                            const int *A_col,
                                            const double *A_val,
                                            const int A_len,
                                            double *A_diag){
    const int x = blockIdx.x * blockDim.x + threadIdx.x;

    if (x < A_len){
        const int num_non0_row = A_row[x + 1] - A_row[x];

        A_diag[x] = 0.0;

        for (int i = 0; i < num_non0_row; i++){
            if (A_col[i + A_row[x]] == x){
                A_diag[x] = A_val[i + A_row[x]];
                break;
            }
        }
    }
}
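For completeness, a minimal launch sketch using the arrays from the question (the block size of 256 is an arbitrary choice, and d_A_diag is assumed to be already allocated on the device):

// Sketch only: one thread per matrix row.
const int block_size = 256;
const int grid_size = (A_len + block_size - 1) / block_size;
GetDiagFromSparseMat<<<grid_size, block_size>>>(
    d_A_row, d_A_col, d_A_val, A_len, d_A_diag);
cudaDeviceSynchronize();  // or check cudaGetLastError() before using d_A_diag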

Related

Segmentation fault when using cudaMemcpy

I'm trying to use cudaMemcpy to copy a std::vector's data to a device array for a kernel, and it gives a segmentation fault. The way I do it is:
cudaMemcpy(d_x, vx.data(), N*sizeof(float), cudaMemcpyHostToDevice);
where vx is a vector. The following is the complete example. Any hints on where the problem is would be appreciated.
#include <iostream>
#include <math.h>
#include <vector>
using namespace std;

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i < n) {
        y[i] = x[i] + y[i];
    }
}

int main(void)
{
    int N = 1<<10;
    float *d_x = NULL, *d_y = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*N);
    cudaMalloc((void **)&d_y, sizeof(float)*N);

    // Allocate Unified Memory – accessible from CPU or GPU
    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        vx.push_back(1.0f);
        vy.push_back(2.0f);
    }

    cudaMemcpy(d_x, vx.data(), N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), N*sizeof(float), cudaMemcpyHostToDevice);

    //
    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the
                     // maximum occupancy for a full device launch
    int gridSize;    // The actual grid size needed, based on input size

    cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, N);
    // Round up according to array size
    gridSize = (N + blockSize - 1) / blockSize;

    cout<<"blockSize: "<<blockSize<<" minGridSize: "<<minGridSize<<" gridSize: "<<gridSize<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
                      (float)(props.maxThreadsPerMultiProcessor /
                              props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
           blockSize, occupancy);

    // Run kernel on 1M elements on the GPU
    add<<<gridSize, blockSize>>>(N, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++) {
        maxError = fmax(maxError, fabs(d_y[i]-3.0f));
    }
    std::cout << "Max error: " << maxError << std::endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);

    return 0;
}
blockSize: 1024 minGridSize: 16 gridSize: 1
Launched blocks of size 1024. Theoretical occupancy: 1.000000
Segmentation fault (core dumped)
The problem is here:
for (int i = 0; i < N; i++) {
maxError = fmax(maxError, fabs(d_y[i]-3.0f));
^^^^^^
}
The reason is that you cannot dereference a device pointer on the host.
The solution is to copy the device memory back to the host, similar to what you did for the host-to-device transfers.
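For example, a minimal sketch of the fix (vy_result is just an illustrative name for a host-side buffer):

// Copy the results back from the device before reading them on the host.
std::vector<float> vy_result(N);
cudaMemcpy(vy_result.data(), d_y, N*sizeof(float), cudaMemcpyDeviceToHost);

float maxError = 0.0f;
for (int i = 0; i < N; i++) {
    maxError = fmax(maxError, fabs(vy_result[i]-3.0f));
}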

CUDA: Fill matrix with results of summation

I need to fill a matrix with values returned from the function below:
__device__ float calc(float *ar, int m, float sum, int i, int j)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < m)
    {
        ar[idx] = __powf(ar[idx], i + j);
        atomicAdd(&sum, ar[idx]);
    }
    return sum;
}
The matrix is set up as a one-dimensional array and is filled through this function:
__global__ void createMatrix(float *A, float *arr, int size)
{
    A[threadIdx.y*size + threadIdx.x] = /*some number*/;
}
In theory it should be something like this:
__global__ void createMatrix(float *A, float *arr, int size)
{
    float sum = 0;
    A[threadIdx.y*size + threadIdx.x] = calc(arr, size, sum, threadIdx.x, threadIdx.y);
}
but it doesn't work that way; calc always returns 0. Is there any way I can fill the matrix using a global function? Thanks in advance.
You're passing sum by value rather than by reference. So all of your atomicAdd()'s have no effect on the zero-initialized value in the kernel.
However, even if you were to pass it by reference, this would still be a poorly-designed kernel. You see, you don't need the atomics if you have a per-thread sum variable (which you do). Also, your calc() function only adds a value once to each sum value, while it seems you expect it to add more than once.
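As an illustration only, here is a minimal sketch of an atomic-free version, assuming the intent is that each matrix entry holds the sum of __powf(arr[k], i + j) over all k (if the intent is different, the same structure still applies):

// Sketch only: each thread accumulates its own local sum, so no atomics are needed.
__device__ float calc(const float *ar, int m, int i, int j)
{
    float sum = 0.0f;
    for (int k = 0; k < m; k++)
        sum += __powf(ar[k], i + j);
    return sum;
}

__global__ void createMatrix(float *A, const float *arr, int size)
{
    A[threadIdx.y*size + threadIdx.x] = calc(arr, size, threadIdx.x, threadIdx.y);
}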

Segmentation fault with vector addition in cuda

I was messing with a toy program for cuda.
I declare a float array, transfer it to the GPU, add a number to each element of that float array, transfer it back to the host, and print the array. However, this is not working and it gives me a segmentation fault.
Here's the code:
#include <iostream>
using namespace std;

__global__ void kern(float *a, float *C){
    for (int i = 0; i < 3; i++) C[i] = a[i] + i;
}

int main(){
    float *A = new float[3];
    for(int i = 0; i < 3; i++){
        A[i] = i;
    }
    float * d;
    float * C;
    cudaMalloc(&C, sizeof(float)*3);
    cudaMalloc(&d, sizeof(float)*3);
    cudaMemcpy(&d, A, sizeof(float)*3, cudaMemcpyHostToDevice);
    kern<<<1, 1>>>(d, C);
    cudaMemcpy(&A, C, sizeof(float)*3, cudaMemcpyDeviceToHost);
    cout << A[2];
}
Also, I am not familiar with malloc; most of my experience is with C++, so I am more comfortable with new datatype[]. Is there an equivalent for CUDA?
Change this:
cudaMemcpy(&d, A, sizeof(float)*3, cudaMemcpyHostToDevice);
cudaMemcpy(&A, C, sizeof(float)*3, cudaMemcpyDeviceToHost);
to this:
cudaMemcpy(d, A, sizeof(float)*3, cudaMemcpyHostToDevice);
cudaMemcpy(A, C, sizeof(float)*3, cudaMemcpyDeviceToHost);
Also, it's always better to check the return codes of CUDA calls; they will give you a better idea of what is going wrong.
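For example, a minimal sketch of such checking (a common error-check macro pattern, not something provided by the CUDA API itself):

#include <cstdio>
#include <cstdlib>

// Wraps a CUDA runtime call and aborts with a message on failure.
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err = (call);                                   \
        if (err != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

// Usage, e.g. inside main():
CUDA_CHECK(cudaMemcpy(d, A, sizeof(float)*3, cudaMemcpyHostToDevice));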

Combining texture memory Unified Memory in CUDA 6

I am writing a CUDA application for Jetson TK1 using CUDA 6. I have got the impression from Mark Harris in his blog post
Jetson TK1: Mobile Embedded Supercomputer Takes CUDA Everywhere
that the memory of the Tegra K1 is physically unified. I have also observed results indicating that cudaMallocManaged is significantly faster for global memory than ordinary cudaMemcpy. This is probably because the Unified Memory doesn't require any copying.
However, what do I do when I want to use texture memory for parts of my application? I have not found any support for textures using cudaMallocManaged, so I have assumed that I have to use the normal cudaMemcpyToArray and cudaBindTextureToArray?
Using the previously mentioned method often seems to work, but the variables managed by cudaMallocManaged sometimes give me weird segmentation faults. Is this the right way to use texture memory along with Unified Memory? The following code illustrates how I do it. This code works fine, but my question is whether this is the right way to go, or if it might create undefined behaviour that could cause e.g. segmentation faults.
#define width 16
#define height 16
texture<float, cudaTextureType2D, cudaReadModeElementType> input_tex;

__global__ void some_tex_kernel(float* output){
    int i = threadIdx.x;
    float x = i%width+0.5f;
    float y = i/width+0.5f;
    output[i] = tex2D(input_tex, x, y);
}

int main(){
    float* out;
    if(cudaMallocManaged(&out, width*height*sizeof(float)) != cudaSuccess)
        std::cout << "unified not working\n";

    for(int i = 0; i < width*height; ++i){
        out[i] = float(i);
    }

    const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaArray* input_t;
    cudaMallocArray(&input_t, &desc, width, height);
    cudaMemcpyToArrayAsync(input_t, 0, 0, out, width*height*sizeof(float), cudaMemcpyHostToDevice);

    input_tex.filterMode = cudaFilterModeLinear;
    cudaBindTextureToArray(input_tex, input_t, desc);

    some_tex_kernel<<<1, width*height>>>(out);
    cudaDeviceSynchronize();

    for(int i = 0; i < width*height; ++i)
        std::cout << out[i] << " ";

    cudaFree(out);
    cudaFreeArray(input_t);
}
Another thing that I find odd is that if I remove the cudaDeviceSynchronize() in the code I always get segmentation faults. I understand that the result might not be finished if I read it without a synchronization but should not the variable still be accessible?
Anyone have a clue?
Mattias
The only managed memory possibilities at this time are static allocations using __device__ __managed__ or dynamic allocations using cudaMallocManaged(). There is no direct support for textures, surfaces, constant memory, etc.
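For reference, a minimal illustrative sketch of those two forms (names are arbitrary):

// Static managed allocation, declared at file scope:
__device__ __managed__ float managed_scalar;

int main(){
    // Dynamic managed allocation:
    float *managed_buf = NULL;
    cudaMallocManaged(&managed_buf, 1024 * sizeof(float));

    managed_scalar = 1.0f;  // host can access both directly, subject to the sync rules below
    managed_buf[0] = 2.0f;

    cudaFree(managed_buf);
    return 0;
}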
Your usage of texturing is fine. The only overlap between texture usage and managed memory is in the following call:
cudaMemcpyToArrayAsync(input_t, 0, 0, out, width*height*sizeof(float), cudaMemcpyHostToDevice);
where managed memory is the source (i.e. host side) of the transfer. This is acceptable as long as the call is issued during a period when no kernels are executing (see below).
"Another thing that I find odd is that if I remove the cudaDeviceSynchronize() in the code I always get segmentation faults."
cudaDeviceSynchronize(); is necessary after a kernel call to make the managed memory visible to the host again. I suggest you read this section of the documentation carefully:
"In general, it is not permitted for the CPU to access any managed allocations or variables while the GPU is active. Concurrent CPU/GPU accesses, ... will cause a segmentation fault..."
As you've indicated, the code you posted works fine. If you have other code that has unpredictable seg faults while using managed memory, I would carefully inspect the code flow (especially if you are using streams i.e. concurrency) to make sure that the host is accessing managed data only after a cudaDeviceSynchronize(); has been issued, and before any subsequent kernel calls.
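Schematically, the safe ordering looks like this minimal sketch (touch is just a placeholder kernel for illustration):

#include <cstdio>

__global__ void touch(float *p) { p[0] += 1.0f; }

int main(){
    float *m = NULL;
    cudaMallocManaged(&m, sizeof(float));
    *m = 0.0f;                 // host access before any kernel launch: fine
    touch<<<1, 1>>>(m);        // GPU is now active; the host must not touch m
    cudaDeviceSynchronize();   // makes managed data safe for the host again
    printf("%f\n", *m);        // host access after the sync: fine
    cudaFree(m);
    return 0;
}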
Robert Crovella has already answered your question. However, in order to show you that cudaMallocManaged can be used in the framework of texture memory, I have dusted off my 1D linear interpolation code and converted it to use cudaMallocManaged. You will see that the code performs the 1D linear interpolation in four different ways:
CPU;
GPU;
GPU using tex1Dfetch;
GPU using tex1D filtering.
The code works without problems in all cases, and in particular in the latter two, on a Kepler K20c card.
// includes, system
#include <cstdlib>
#include <conio.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include <iomanip>

// includes, cuda
#include <cuda.h>
#include <cuda_runtime.h>

using namespace std;

texture<float, 1, cudaReadModeElementType> data_d_texture_filtering;
texture<float, 1> data_d_texture;

#define BLOCK_SIZE 256

/******************/
/* ERROR CHECKING */
/******************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getch(); exit(code); }
    }
}

/************/
/* LINSPACE */
/************/
// --- Generates N equally spaced, increasing points between a and b and stores them in x
void linspace(float* x, float a, float b, int N) {
    float delta_x=(b-a)/(float)N;
    x[0]=a;
    for(int k=1;k<N;k++) x[k]=x[k-1]+delta_x;
}

/*************/
/* RANDSPACE */
/*************/
// --- Generates N randomly spaced, increasing points between a and b and stores them in x
void randspace(float* x, float a, float b, int N) {
    float delta_x=(b-a)/(float)N;
    x[0]=a;
    for(int k=1;k<N;k++) x[k]=x[k-1]+delta_x+(((float)rand()/(float)RAND_MAX-0.5)*(1./(float)N));
}

/******************/
/* DATA GENERATOR */
/******************/
// --- Generates N complex random data points, with real and imaginary parts ranging in (0.f,1.f)
void Data_Generator(float* data, int N) {
    for(int k=0;k<N;k++) {
        data[k]=(float)rand()/(float)RAND_MAX;
    }
}

/*************************************/
/* LINEAR INTERPOLATION KERNEL - CPU */
/*************************************/
float linear_kernel_CPU(float in)
{
    float d_y;
    return 1.-abs(in);
}

/***************************************/
/* LINEAR INTERPOLATION FUNCTION - CPU */
/***************************************/
void linear_interpolation_function_CPU(float* result_GPU, float* data, float* x_in, float* x_out, int M, int N){
    float a;
    for(int j=0; j<N; j++){
        int k = floor(x_out[j]+M/2);
        a = x_out[j]+M/2-floor(x_out[j]+M/2);
        result_GPU[j] = a * data[k+1] + (-data[k] * a + data[k]);
    }
}

/*************************************/
/* LINEAR INTERPOLATION KERNEL - GPU */
/*************************************/
__device__ float linear_kernel_GPU(float in)
{
    float d_y;
    return 1.-abs(in);
}

/**************************************************************/
/* LINEAR INTERPOLATION KERNEL FUNCTION - GPU - GLOBAL MEMORY */
/**************************************************************/
__global__ void linear_interpolation_kernel_function_GPU(float* __restrict__ result_d, const float* __restrict__ data_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if(j<N)
    {
        float reg_x_out = x_out_d[j]+M/2;
        int k = __float2int_rz(reg_x_out);
        float a = reg_x_out - truncf(reg_x_out);
        float dk = data_d[k];
        float dkp1 = data_d[k+1];
        result_d[j] = a * dkp1 + (-dk * a + dk);
    }
}

/***************************************************************/
/* LINEAR INTERPOLATION KERNEL FUNCTION - GPU - TEXTURE MEMORY */
/***************************************************************/
__global__ void linear_interpolation_kernel_function_GPU_texture(float* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if(j<N)
    {
        float reg_x_out = x_out_d[j]+M/2;
        int k = __float2int_rz(reg_x_out);
        float a = reg_x_out - truncf(reg_x_out);
        float dk = tex1Dfetch(data_d_texture,k);
        float dkp1 = tex1Dfetch(data_d_texture,k+1);
        result_d[j] = a * dkp1 + (-dk * a + dk);
    }
}

/************************************************************************************/
/* LINEAR INTERPOLATION KERNEL FUNCTION - GPU - TEXTURE MEMORY - FILTERING FEATURES */
/************************************************************************************/
__global__ void linear_interpolation_kernel_function_GPU_texture_filtering(float* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if(j<N) result_d[j] = tex1D(data_d_texture_filtering,float(x_out_d[j]+M/2+0.5));
}

/***************************************/
/* LINEAR INTERPOLATION FUNCTION - GPU */
/***************************************/
void linear_interpolation_function_GPU(float* result_d, float* data_d, float* x_in_d, float* x_out_d, int M, int N){
    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
    linear_interpolation_kernel_function_GPU<<<dimGrid,dimBlock>>>(result_d, data_d, x_out_d, M, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
}

/********************************************************/
/* LINEAR INTERPOLATION FUNCTION - GPU - TEXTURE MEMORY */
/********************************************************/
void linear_interpolation_function_GPU_texture(float* result_d, float* data_d, float* x_in_d, float* x_out_d, int M, int N){
    cudaBindTexture(NULL, data_d_texture, data_d, M*sizeof(float));

    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
    linear_interpolation_kernel_function_GPU_texture<<<dimGrid,dimBlock>>>(result_d, x_out_d, M, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
}

/*****************************************************************************/
/* LINEAR INTERPOLATION FUNCTION - GPU - TEXTURE MEMORY - FILTERING FEATURES */
/*****************************************************************************/
void linear_interpolation_function_GPU_texture_filtering(float* result_d, float* data, float* x_in_d, float* x_out_d, int M, int N){
    cudaArray* data_d = NULL; gpuErrchk(cudaMallocArray(&data_d, &data_d_texture_filtering.channelDesc, M, 1));
    gpuErrchk(cudaMemcpyToArray(data_d, 0, 0, data, sizeof(float)*M, cudaMemcpyHostToDevice));
    gpuErrchk(cudaBindTextureToArray(data_d_texture_filtering, data_d));
    data_d_texture_filtering.normalized = false;
    data_d_texture_filtering.filterMode = cudaFilterModeLinear;

    dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
    linear_interpolation_kernel_function_GPU_texture_filtering<<<dimGrid,dimBlock>>>(result_d, x_out_d, M, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
}

/********/
/* MAIN */
/********/
int main()
{
    int M=1024;    // --- Number of input points
    int N=1024;    // --- Number of output points
    int Nit = 100; // --- Number of computations for time measurement

    // --- Input sampling
    float* x_in; gpuErrchk(cudaMallocManaged(&x_in,sizeof(float)*M));

    // --- Input data
    float *data; gpuErrchk(cudaMallocManaged(&data,(M+1)*sizeof(float))); Data_Generator(data,M); data[M]=0.;

    // --- Output sampling
    float* x_out; gpuErrchk(cudaMallocManaged((void**)&x_out,sizeof(float)*N)); randspace(x_out,-M/2.,M/2.,N);

    // --- Result allocation
    float *result_CPU; result_CPU=(float*)malloc(N*sizeof(float));
    float *result_d; gpuErrchk(cudaMallocManaged(&result_d,sizeof(float)*N));
    float *result_d_texture; gpuErrchk(cudaMallocManaged(&result_d_texture,sizeof(float)*N));
    float *result_d_texture_filtering; gpuErrchk(cudaMallocManaged(&result_d_texture_filtering,sizeof(float)*N));

    // --- Reference interpolation result as evaluated on the CPU
    linear_interpolation_function_CPU(result_CPU, data, x_in, x_out, M, N);

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    for (int k=0; k<Nit; k++) linear_interpolation_function_GPU(result_d, data, x_in, x_out, M, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cout << "GPU Global memory [ms]: " << setprecision (10) << time/Nit << endl;

    cudaEventRecord(start, 0);
    for (int k=0; k<Nit; k++) linear_interpolation_function_GPU_texture_filtering(result_d_texture_filtering, data, x_in, x_out, M, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cout << "GPU Texture filtering [ms]: " << setprecision (10) << time/Nit << endl;

    cudaEventRecord(start, 0);
    for (int k=0; k<Nit; k++) linear_interpolation_function_GPU_texture(result_d_texture, data, x_in, x_out, M, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cout << "GPU Texture [ms]: " << setprecision (10) << time/Nit << endl;

    float diff_norm=0.f, norm=0.f;
    for(int j=0; j<N; j++) {
        diff_norm = diff_norm + (result_CPU[j]-result_d[j])*(result_CPU[j]-result_d[j]);
        norm = norm + result_CPU[j]*result_CPU[j];
    }
    printf("Error GPU [percentage] = %f\n",100.*sqrt(diff_norm/norm));

    float diff_norm_texture_filtering=0.f;
    for(int j=0; j<N; j++) {
        diff_norm_texture_filtering = diff_norm_texture_filtering + (result_CPU[j]-result_d_texture_filtering[j])*(result_CPU[j]-result_d_texture_filtering[j]);
    }
    printf("Error texture filtering [percentage] = %f\n",100.*sqrt(diff_norm_texture_filtering/norm));

    float diff_norm_texture=0.f;
    for(int j=0; j<N; j++) {
        diff_norm_texture = diff_norm_texture + (result_CPU[j]-result_d_texture[j])*(result_CPU[j]-result_d_texture[j]);
    }
    printf("Error texture [percentage] = %f\n",100.*sqrt(diff_norm_texture/norm));

    cudaDeviceReset();

    return 0;
}

Type Qualifiers for a device class in CUDA

I'm currently attempting to write a piece of CUDA code with a class that will be used solely on the device side (i.e. the host doesn't need to know of its existence). However, I cannot work out the correct qualifiers for the class (deviceclass below):
__device__ float devicefunction (float *x) {return x[0]+x[1];}

class deviceclass {
private:
    float _a;
public:
    deviceclass(float *x) {_a = devicefunction(x);}
    float getvalue () {return _a;}
};

// Device code
__global__ void VecInit(float* A, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N) {
        deviceclass *test;
        test = new deviceclass(1.0, 2.0);
        A[i] = test->getvalue();
    }
}

// Standard CUDA guff below: Variables
float *h_A, *d_A;

// Host code
int main(int argc, char** argv)
{
    printf("Vector initialization...\n");
    int N = 10000;
    size_t size = N * sizeof(float);

    // Allocate
    h_A = (float*)malloc(size);
    cudaMalloc(&d_A, size);

    printf("Computing...\n");

    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);

    // Copy result from device memory to host memory
    cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);

    //...etc
}
Setting deviceclass as solely __device__ throws an error, as it's called from a global function; however, setting it as __device__ __host__ or __global__ seems unnecessary. Can someone point me in the right direction?
It turns out the qualifiers have to go on the member functions of the class; below is a fully working version:
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

using namespace std;

void Cleanup(void);

// Functions to be pointed to
__device__ float Plus (float a, float b) {return a+b;}

class deviceclass {
private:
    float test;
public:
    __device__ deviceclass(float a, float b) {
        test = Plus(a,b);
    }

    __device__ float getvalue() {return test;}
};

// Device code
__global__ void VecInit(float* A, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N) {
        deviceclass test(1.0, 2.0);
        A[i] = test.getvalue();
    }
}

// Standard CUDA guff below: Variables
float *h_A, *d_A;

// Host code
int main(int argc, char** argv)
{
    printf("Vector initialization...\n");
    int N = 10000;
    size_t size = N * sizeof(float);

    // Allocate
    h_A = (float*)malloc(size);
    cudaMalloc(&d_A, size);

    printf("Computing...\n");

    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);

    // Copy result from device memory to host memory
    cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);

    // Verify result
    int i;
    for (i = 0; i < N; ++i) {
        cout << endl << h_A[i];
    }
    cout << endl;

    Cleanup();
}

void Cleanup(void)
{
    // Free device memory
    if (d_A)
        cudaFree(d_A);

    // Free host memory
    if (h_A)
        free(h_A);

    cudaThreadExit();

    exit(0);
}
I take it that Node() is a typo.
From the CUDA C Programming Guide, Section 3.1.5:
However, only a subset of C++ is fully supported for the device code
and Appendix D.6:
Code compiled for devices with compute capability 2.x and higher may make use of C++ classes...
I think your code is using incompatible C++.