CUDA streams and concurrent kernel execution

CUDA streams and concurrent kernel execution - concurrency

I would like to use streams in order to parallelize the execution of kernels that work on separate device data arrays. Data were allocated on the device and filled from previous kernels.
I have written the following program that shows I can't reach my goal so far. Indeed, the kernels on two non-default streams execute sequentially in their respective streams.
The same behaviour is observed on 2 Intel machines with latest Debian linux version. One has a Tesla C2075 with CUDA 4.2 and the other has a Geforce 460GT with CUDA 5.0. The Visual Profiler shows sequential execution in both the 4.2 and also 5.0 CUDA version.
Here is the code:
#include <iostream>
#include <stdio.h>
#include <ctime>
#include <curand.h>
using namespace std;
// compile and run this way:
// nvcc cuStreamsBasics.cu -arch=sm_20 -o testCuStream -lcuda -lcufft -lcurand
// testCuStream 1024 512 512
/* -------------------------------------------------------------------------- */
// "useful" macros
/* -------------------------------------------------------------------------- */
#define MSG_ASSERT( CONDITION, MSG ) \
if (! (CONDITION)) \
{ \
std::cerr << std::endl << "Dynamic assertion `" #CONDITION "` failed in " << __FILE__ \
<< " line " << __LINE__ << ": <" << MSG << ">" << std::endl; \
exit( 1 ); \
} \
#define ASSERT( CONDITION ) \
MSG_ASSERT( CONDITION, " " )
// allocate data on the GPU memory, unpinned
#define CUDALLOC_GPU( _TAB, _DIM, _DATATYPE ) \
MSG_ASSERT( \
cudaMalloc( (void**) &_TAB, _DIM * sizeof( _DATATYPE) ) \
== cudaSuccess , "failed CUDALLOC" );
/* -------------------------------------------------------------------------- */
// the CUDA kernels
/* -------------------------------------------------------------------------- */
// finds index in 1D array from sequential blocks
#define CUDAINDEX_1D \
blockIdx.y * ( gridDim.x * blockDim.x ) + \
blockIdx.x * blockDim.x + \
threadIdx.x; \
__global__ void
kernel_diva(float* data, float value, int array_size)
{
int i = CUDAINDEX_1D
if (i < array_size)
data[i] /= value;
}
__global__ void
kernel_jokea(float* data, float value, int array_size)
{
int i = CUDAINDEX_1D
if (i < array_size)
data[i] *= value + sin( double(i)) * 1/ cos( double(i) );
}
/* -------------------------------------------------------------------------- */
// usage
/* -------------------------------------------------------------------------- */
static void
usage(int argc, char **argv)
{
if ((argc -1) != 3)
{
printf("Usage: %s <dimx> <dimy> <dimz> \n", argv[0]);
printf("do stuff\n");
exit(1);
}
}
/* -------------------------------------------------------------------------- */
// main program, finally!
/* -------------------------------------------------------------------------- */
int
main(int argc, char** argv)
{
usage(argc, argv);
size_t x_dim = atoi( argv[1] );
size_t y_dim = atoi( argv[2] );
size_t z_dim = atoi( argv[3] );
cudaStream_t stream1, stream2;
ASSERT( cudaStreamCreate( &stream1 ) == cudaSuccess );
ASSERT( cudaStreamCreate( &stream2 ) == cudaSuccess );
size_t size = x_dim * y_dim * z_dim;
float *data1, *data2;
CUDALLOC_GPU( data1, size, float);
CUDALLOC_GPU( data2, size, float);
curandGenerator_t gen;
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
/* Set seed */
curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);
/* Generate n floats on device */
curandGenerateUniform(gen, data1, size);
curandGenerateUniform(gen, data2, size);
dim3 dimBlock( z_dim, 1, 1);
dim3 dimGrid( x_dim, y_dim, 1);
clock_t start;
double diff;
cudaDeviceSynchronize();
start = clock();
kernel_diva <<< dimGrid, dimBlock>>>( data1, 5.55f, size);
kernel_jokea<<< dimGrid, dimBlock>>>( data1, 5.55f, size);
kernel_diva <<< dimGrid, dimBlock>>>( data2, 5.55f, size);
kernel_jokea<<< dimGrid, dimBlock>>>( data2, 5.55f, size);
cudaDeviceSynchronize();
diff = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;
cout << endl << "sequential: " << diff;
cudaDeviceSynchronize();
start = clock();
kernel_diva <<< dimGrid, dimBlock, 0, stream1 >>>( data1, 5.55f, size);
kernel_diva <<< dimGrid, dimBlock, 0, stream2 >>>( data2, 5.55f, size);
kernel_jokea<<< dimGrid, dimBlock, 0, stream1 >>>( data1, 5.55f, size);
kernel_jokea<<< dimGrid, dimBlock, 0, stream2 >>>( data2, 5.55f, size);
cudaDeviceSynchronize();
diff = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;
cout << endl << "parallel: " << diff;
cudaStreamDestroy( stream1 );
cudaStreamDestroy( stream2 );
return 0;
}
Typically, the dimension of the arrays is 512^3 single float. I usually just cut the array in blocks of (512,1,1) threads that I put on a grid of size (1<<15, (rest), 1).
Thank you in advance for any hint or comment.
Best regards.

I'm trying to provide an interpretation to why you do not see execution overlap of your two kernels. To this end, I have constructed the code reported below, which uses your two kernels and monitors which Streaming Multiprocessor (SM) each block runs on. I'm using CUDA 6.5 (Release Candidate) and I'm running on a GT540M card, which has only 2 SMs, so it provides a simple playground to work with. The blockSize choice is delegated to the new CUDA 6.5 cudaOccupancyMaxPotentialBlockSize facility.
THE CODE
#include <stdio.h>
#include <time.h>
//#define DEBUG_MODE
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**************************************************/
/* STREAMING MULTIPROCESSOR IDENTIFICATION NUMBER */
/**************************************************/
__device__ unsigned int get_smid(void) {
unsigned int ret;
asm("mov.u32 %0, %smid;" : "=r"(ret) );
return ret;
}
/************/
/* KERNEL 1 */
/************/
__global__ void kernel_1(float * __restrict__ data, const float value, int *sm, int N)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < N) {
data[i] = data[i] / value;
if (threadIdx.x==0) sm[blockIdx.x]=get_smid();
}
}
//__global__ void kernel_1(float* data, float value, int N)
//{
// int start = blockIdx.x * blockDim.x + threadIdx.x;
// for (int i = start; i < N; i += blockDim.x * gridDim.x)
// {
// data[i] = data[i] / value;
// }
//}
/************/
/* KERNEL 2 */
/************/
__global__ void kernel_2(float * __restrict__ data, const float value, int *sm, int N)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < N) {
data[i] = data[i] * (value + sin(double(i)) * 1./cos(double(i)));
if (threadIdx.x==0) sm[blockIdx.x]=get_smid();
}
}
//__global__ void kernel_2(float* data, float value, int N)
//{
// int start = blockIdx.x * blockDim.x + threadIdx.x;
// for (int i = start; i < N; i += blockDim.x * gridDim.x)
// {
// data[i] = data[i] * (value + sin(double(i)) * 1./cos(double(i)));
// }
//}
/********/
/* MAIN */
/********/
int main()
{
const int N = 10000;
const float value = 5.55f;
const int rep_num = 20;
// --- CPU memory allocations
float *h_data1 = (float*) malloc(N*sizeof(float));
float *h_data2 = (float*) malloc(N*sizeof(float));
float *h_data1_ref = (float*) malloc(N*sizeof(float));
float *h_data2_ref = (float*) malloc(N*sizeof(float));
// --- CPU data initializations
srand(time(NULL));
for (int i=0; i<N; i++) {
h_data1[i] = rand() / RAND_MAX;
h_data2[i] = rand() / RAND_MAX;
}
// --- GPU memory allocations
float *d_data1, *d_data2;
gpuErrchk(cudaMalloc((void**)&d_data1, N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_data2, N*sizeof(float)));
// --- CPU -> GPU memory transfers
gpuErrchk(cudaMemcpy(d_data1, h_data1, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_data2, h_data2, N*sizeof(float), cudaMemcpyHostToDevice));
// --- CPU data initializations
srand(time(NULL));
for (int i=0; i<N; i++) {
h_data1_ref[i] = h_data1[i] / value;
h_data2_ref[i] = h_data2[i] * (value + sin(double(i)) * 1./cos(double(i)));
}
// --- Stream creations
cudaStream_t stream1, stream2;
gpuErrchk(cudaStreamCreate(&stream1));
gpuErrchk(cudaStreamCreate(&stream2));
// --- Launch parameters configuration
int blockSize1, blockSize2, minGridSize1, minGridSize2, gridSize1, gridSize2;
cudaOccupancyMaxPotentialBlockSize(&minGridSize1, &blockSize1, kernel_1, 0, N);
cudaOccupancyMaxPotentialBlockSize(&minGridSize2, &blockSize2, kernel_2, 0, N);
gridSize1 = (N + blockSize1 - 1) / blockSize1;
gridSize2 = (N + blockSize2 - 1) / blockSize2;
// --- Allocating space for SM IDs
int *h_sm_11 = (int*) malloc(gridSize1*sizeof(int));
int *h_sm_12 = (int*) malloc(gridSize1*sizeof(int));
int *h_sm_21 = (int*) malloc(gridSize2*sizeof(int));
int *h_sm_22 = (int*) malloc(gridSize2*sizeof(int));
int *d_sm_11, *d_sm_12, *d_sm_21, *d_sm_22;
gpuErrchk(cudaMalloc((void**)&d_sm_11, gridSize1*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_sm_12, gridSize1*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_sm_21, gridSize2*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_sm_22, gridSize2*sizeof(int)));
// --- Timing individual kernels
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int i=0; i<rep_num; i++) kernel_1<<<gridSize1, blockSize1>>>(d_data1, value, d_sm_11, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel 1 - elapsed time: %3.3f ms \n", time/rep_num);
cudaEventRecord(start, 0);
for (int i=0; i<rep_num; i++) kernel_2<<<gridSize2, blockSize2>>>(d_data1, value, d_sm_21, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel 2 - elapsed time: %3.3f ms \n", time/rep_num);
// --- No stream case
cudaEventRecord(start, 0);
kernel_1<<<gridSize1, blockSize1>>>(d_data1, value, d_sm_11, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_data1, d_data1, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Results check
for (int i=0; i<N; i++) {
if (h_data1[i] != h_data1_ref[i]) {
printf("Kernel1 - Error at i = %i; Host = %f; Device = %f\n", i, h_data1_ref[i], h_data1[i]);
return;
}
}
#endif
kernel_2<<<gridSize2, blockSize2>>>(d_data1, value, d_sm_21, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_1<<<gridSize1, blockSize1>>>(d_data2, value, d_sm_12, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(d_data2, h_data2, N*sizeof(float), cudaMemcpyHostToDevice));
#endif
kernel_2<<<gridSize2, blockSize2>>>(d_data2, value, d_sm_22, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_data2, d_data2, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (h_data2[i] != h_data2_ref[i]) {
printf("Kernel2 - Error at i = %i; Host = %f; Device = %f\n", i, h_data2_ref[i], h_data2[i]);
return;
}
}
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("No stream - elapsed time: %3.3f ms \n", time);
// --- Stream case
cudaEventRecord(start, 0);
kernel_1<<<gridSize1, blockSize1, 0, stream1 >>>(d_data1, value, d_sm_11, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_1<<<gridSize1, blockSize1, 0, stream2 >>>(d_data2, value, d_sm_12, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_2<<<gridSize2, blockSize2, 0, stream1 >>>(d_data1, value, d_sm_21, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_2<<<gridSize2, blockSize2, 0, stream2 >>>(d_data2, value, d_sm_22, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Stream - elapsed time: %3.3f ms \n", time);
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
printf("Test passed!\n");
gpuErrchk(cudaMemcpy(h_sm_11, d_sm_11, gridSize1*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_sm_12, d_sm_12, gridSize1*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_sm_21, d_sm_21, gridSize2*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_sm_22, d_sm_22, gridSize2*sizeof(int), cudaMemcpyDeviceToHost));
printf("Kernel 1: gridSize = %i; blockSize = %i\n", gridSize1, blockSize1);
printf("Kernel 2: gridSize = %i; blockSize = %i\n", gridSize2, blockSize2);
for (int i=0; i<gridSize1; i++) {
printf("Kernel 1 - Data 1: blockNumber = %i; SMID = %d\n", i, h_sm_11[i]);
printf("Kernel 1 - Data 2: blockNumber = %i; SMID = %d\n", i, h_sm_12[i]);
}
for (int i=0; i<gridSize2; i++) {
printf("Kernel 2 - Data 1: blockNumber = %i; SMID = %d\n", i, h_sm_21[i]);
printf("Kernel 2 - Data 2: blockNumber = %i; SMID = %d\n", i, h_sm_22[i]);
}
cudaDeviceReset();
return 0;
}
KERNEL TIMINGS FOR N = 100 and N = 10000
N = 100
kernel_1 0.003ms
kernel_2 0.005ms
N = 10000
kernel_1 0.011ms
kernel_2 0.053ms
So, kernel 1 is more computationally expensive than kernel 2.
RESULTS FOR N = 100
Kernel 1: gridSize = 1; blockSize = 100
Kernel 2: gridSize = 1; blockSize = 100
Kernel 1 - Data 1: blockNumber = 0; SMID = 0
Kernel 1 - Data 2: blockNumber = 0; SMID = 1
Kernel 2 - Data 1: blockNumber = 0; SMID = 0
Kernel 2 - Data 2: blockNumber = 0; SMID = 1
In this case, each kernel is launched with only one block and this is the timeline.
As you can see, the overlap occurs. By looking at the above outcomes, the scheduler delivers the single blocks of the two calls to kernel 1 in parallel to the two available SMs and then does the same for kernel 2. This seems to be the main reason why overlap occurs.
RESULTS FOR N = 10000
Kernel 1: gridSize = 14; blockSize = 768
Kernel 2: gridSize = 10; blockSize = 1024
Kernel 1 - Data 1: blockNumber = 0; SMID = 0
Kernel 1 - Data 2: blockNumber = 0; SMID = 1
Kernel 1 - Data 1: blockNumber = 1; SMID = 1
Kernel 1 - Data 2: blockNumber = 1; SMID = 0
Kernel 1 - Data 1: blockNumber = 2; SMID = 0
Kernel 1 - Data 2: blockNumber = 2; SMID = 1
Kernel 1 - Data 1: blockNumber = 3; SMID = 1
Kernel 1 - Data 2: blockNumber = 3; SMID = 0
Kernel 1 - Data 1: blockNumber = 4; SMID = 0
Kernel 1 - Data 2: blockNumber = 4; SMID = 1
Kernel 1 - Data 1: blockNumber = 5; SMID = 1
Kernel 1 - Data 2: blockNumber = 5; SMID = 0
Kernel 1 - Data 1: blockNumber = 6; SMID = 0
Kernel 1 - Data 2: blockNumber = 6; SMID = 0
Kernel 1 - Data 1: blockNumber = 7; SMID = 1
Kernel 1 - Data 2: blockNumber = 7; SMID = 1
Kernel 1 - Data 1: blockNumber = 8; SMID = 0
Kernel 1 - Data 2: blockNumber = 8; SMID = 1
Kernel 1 - Data 1: blockNumber = 9; SMID = 1
Kernel 1 - Data 2: blockNumber = 9; SMID = 0
Kernel 1 - Data 1: blockNumber = 10; SMID = 0
Kernel 1 - Data 2: blockNumber = 10; SMID = 0
Kernel 1 - Data 1: blockNumber = 11; SMID = 1
Kernel 1 - Data 2: blockNumber = 11; SMID = 1
Kernel 1 - Data 1: blockNumber = 12; SMID = 0
Kernel 1 - Data 2: blockNumber = 12; SMID = 1
Kernel 1 - Data 1: blockNumber = 13; SMID = 1
Kernel 1 - Data 2: blockNumber = 13; SMID = 0
Kernel 2 - Data 1: blockNumber = 0; SMID = 0
Kernel 2 - Data 2: blockNumber = 0; SMID = 0
Kernel 2 - Data 1: blockNumber = 1; SMID = 1
Kernel 2 - Data 2: blockNumber = 1; SMID = 1
Kernel 2 - Data 1: blockNumber = 2; SMID = 1
Kernel 2 - Data 2: blockNumber = 2; SMID = 0
Kernel 2 - Data 1: blockNumber = 3; SMID = 0
Kernel 2 - Data 2: blockNumber = 3; SMID = 1
Kernel 2 - Data 1: blockNumber = 4; SMID = 1
Kernel 2 - Data 2: blockNumber = 4; SMID = 0
Kernel 2 - Data 1: blockNumber = 5; SMID = 0
Kernel 2 - Data 2: blockNumber = 5; SMID = 1
Kernel 2 - Data 1: blockNumber = 6; SMID = 1
Kernel 2 - Data 2: blockNumber = 6; SMID = 0
Kernel 2 - Data 1: blockNumber = 7; SMID = 0
Kernel 2 - Data 2: blockNumber = 7; SMID = 1
Kernel 2 - Data 1: blockNumber = 8; SMID = 1
Kernel 2 - Data 2: blockNumber = 8; SMID = 0
Kernel 2 - Data 1: blockNumber = 9; SMID = 0
Kernel 2 - Data 2: blockNumber = 9; SMID = 1
This is the timeline:
In this case, no overlap occurs. According to the above outcomes, this does not mean that the two SMs are not simultaneously exploited, but (I think) that, due to the larger number of blocks to be launched, assigning two blocks of different kernels or the two blocks of the same kernel does not make much difference in terms of performance and thus the scheduler chooses the second option.
I have tested that, considering more work done per thread, the behavior keeps the same.

Related

CUDA shuffle instruction reduction slower than shared memory reduction?

Shuffle instruction based warp reduction is expected to perform faster reduction than reduction using shared memory or global memory, as mentioned in -
https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
and
https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-kepler-shuffle/
In the following code, I tried to validate this:-
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
__inline__ __device__
float warpReduceSum(float val) {
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
}
__inline__ __device__
float blockReduceSum(float val) {
static __shared__ int shared[32];
int lane = threadIdx.x%32;
int wid = threadIdx.x / 32;
val = warpReduceSum(val);
//write reduced value to shared memory
if (lane == 0) shared[wid] = val;
__syncthreads();
//ensure we only grab a value from shared memory if that warp existed
val = (threadIdx.x<blockDim.x / 32) ? shared[lane] : int(0);
if (wid == 0) val = warpReduceSum(val);
return val;
}
__global__ void device_reduce_stable_kernel(float *in, float* out, int N) {
float sum = int(0);
//printf("value = %d ", blockDim.x*gridDim.x);
for (int i = blockIdx.x*blockDim.x + threadIdx.x; i<N; i += blockDim.x*gridDim.x) {
sum += in[i];
}
sum = blockReduceSum(sum);
if (threadIdx.x == 0)
out[blockIdx.x] = sum;
}
void device_reduce_stable(float *in, float* out, int N) {
//int threads = 512;
//int blocks = min((N + threads - 1) / threads, 1024);
const int maxThreadsPerBlock = 1024;
int threads = maxThreadsPerBlock;
int blocks = N / maxThreadsPerBlock;
device_reduce_stable_kernel << <blocks, threads >> >(in, out, N);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
device_reduce_stable_kernel << <1, 1024 >> >(out, out, blocks);
//cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
}
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
// do reduction in global mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (tid < s)
{
d_in[myId] += d_in[myId + s];
}
__syncthreads(); // make sure all adds at one stage are done!
}
// only thread 0 writes result for this block back to global mem
if (tid == 0)
{
d_out[blockIdx.x] = d_in[myId];
}
}
__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
// sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
extern __shared__ float sdata[];
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
// load shared mem from global mem
sdata[tid] = d_in[myId];
__syncthreads(); // make sure entire block is loaded!
// do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (tid < s)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads(); // make sure all adds at one stage are done!
}
// only thread 0 writes result for this block back to global mem
if (tid == 0)
{
d_out[blockIdx.x] = sdata[0];
}
}
void reduce(float * d_out, float * d_intermediate, float * d_in,
int size, bool usesSharedMemory)
{
// assumes that size is not greater than maxThreadsPerBlock^2
// and that size is a multiple of maxThreadsPerBlock
const int maxThreadsPerBlock = 1024;
int threads = maxThreadsPerBlock;
int blocks = size / maxThreadsPerBlock;
if (usesSharedMemory)
{
shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >
(d_intermediate, d_in);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
}
else
{
global_reduce_kernel << <blocks, threads >> >
(d_intermediate, d_in);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
}
// now we're down to one block left, so reduce it
threads = blocks; // launch one thread for each block in prev step
blocks = 1;
if (usesSharedMemory)
{
shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >
(d_out, d_intermediate);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
}
else
{
global_reduce_kernel << <blocks, threads >> >
(d_out, d_intermediate);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
}
}
int main()
{
/*int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (deviceCount == 0) {
fprintf(stderr, "error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
int dev = 0;
cudaSetDevice(dev);
cudaDeviceProp devProps;
if (cudaGetDeviceProperties(&devProps, dev) == 0)
{
printf("Using device %d:\n", dev);
printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
devProps.name, (int)devProps.totalGlobalMem,
(int)devProps.major, (int)devProps.minor,
(int)devProps.clockRate);
}
*/
const int ARRAY_SIZE = 2048;
const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
// generate the input array on the host
float h_in[ARRAY_SIZE];
float sum = 0.0f;
for (int i = 0; i < ARRAY_SIZE; i++) {
// generate random float in [-1.0f, 1.0f]
h_in[i] = i;
sum += h_in[i];
}
// declare GPU memory pointers
float * d_in, *d_intermediate, *d_out;
// allocate GPU memory
cudaMalloc((void **)&d_in, ARRAY_BYTES);
cudaMalloc((void **)&d_intermediate, ARRAY_BYTES); // overallocated
cudaMalloc((void **)&d_out, sizeof(float));
// transfer the input array to the GPU
cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
int whichKernel = 2;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// launch the kernel
cudaProfilerStart();
switch (whichKernel) {
case 0:
printf("Running global reduce\n");
cudaEventRecord(start, 0);
//for (int i = 0; i < 100; i++)
//{
reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
//}
cudaEventRecord(stop, 0);
break;
case 1:
printf("Running reduce with shared mem\n");
cudaEventRecord(start, 0);
//for (int i = 0; i < 100; i++)
//{
reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
//}
cudaEventRecord(stop, 0);
break;
case 2:
printf("Running reduce with shuffle instruction\n");
cudaEventRecord(start, 0);
/*for (int i = 0; i < 100; i++)
{*/
device_reduce_stable(d_in, d_out, ARRAY_SIZE);
//}
cudaEventRecord(stop, 0);
break;
default:
fprintf(stderr, "error: ran no kernel\n");
exit(EXIT_FAILURE);
}
cudaProfilerStop();
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
elapsedTime /= 100.0f; // 100 trials
// copy back the sum from GPU
float h_out;
cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
printf("average time elapsed: %f\n", elapsedTime);
// free GPU memory allocation
cudaFree(d_in);
cudaFree(d_intermediate);
cudaFree(d_out);
return 0;
}
The results showed that warp based reduction took nearly twice the time of shared memory based reduction. These results contradict the behavior expected.
The experiment was performed on Tesla K40c with Compute capability higher than 3.0.

I'm comparing the following two reduction kernels, one using only shared memory WITHOUT using warp shuffling for the last warp reduction stage (version4) and one using shared memory AND warp shuffling for the last warp reduction stage (version5).
version4
template <class T>
__global__ void version4(T *g_idata, T *g_odata, unsigned int N)
{
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x; // --- Local thread index
unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; // --- Global thread index - Fictitiously double the block dimension
// --- Performs the first level of reduction in registers when reading from global memory.
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
sdata[tid] = mySum;
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
{
if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Single warp reduction by loop unrolling. Assuming blockDim.x >64
if (tid < 32) {
sdata[tid] = mySum = mySum + sdata[tid + 32]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 16]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 8]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 4]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 2]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 1]; __syncthreads();
}
// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = mySum;
}
version5
template <class T>
__global__ void version5(T *g_idata, T *g_odata, unsigned int N)
{
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x; // --- Local thread index
unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; // --- Global thread index - Fictitiously double the block dimension
// --- Performs the first level of reduction in registers when reading from global memory.
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
sdata[tid] = mySum;
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
{
if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Single warp reduction by shuffle operations
if (tid < 32)
{
// --- Last iteration removed from the for loop, but needed for shuffle reduction
mySum += sdata[tid + 32];
// --- Reduce final warp using shuffle
//for (int offset = warpSize / 2; offset > 0; offset /= 2) mySum += __shfl_down_sync(0xffffffff, mySum, offset);
for (int offset=1; offset < warpSize; offset *= 2) mySum += __shfl_xor_sync(0xffffffff, mySum, i);
}
// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = mySum;
}
I confirm that there is no sensitive difference between the two. On my GTX920M card, the timing have been the following:
N = 33554432
version4 = 27.5ms
version5 = 27.095ms
So, I'm confirming Robert's comment above.

CUDA: cascaded summation of all vector elements

I have implemented a cascaded addition function for a large vector of float values on my GPU and my CPU. That simply means that all elements of this vector shell be summed up into one result. The CPU algorithm is quite trivial and works fine, but the GPU algorithm is always 35200 off the desired result.
The minimal working code for the algorithm and comparison to the CPU is below.
The output is always this:
CPU Time: 22.760059 ms, bandwidth: 3.514929 GB/s
GPU Time (improved): 12.077088 ms, bandwidth: 6.624114 GB/s
- CPU result does not match GPU result in improved atomic add.
CPU: 10000000.000000, GPU: 10035200.000000, diff:-35200.000000
I checked it with cuda-memcheck but no errors occured in that run. I have tried many many different things but none of themworked. It if not due to the inaccuracy of the float datatype because I changed all floats to ints and still got the exact same result.
This is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
void reductionWithCudaImproved(float *result, const float *input);
__global__ void reductionKernelImproved(float *result, const float *input);
void reductionCPU(float *result, const float *input);
#define SIZE 10000000
#define TILE 32
#define ILP 8
#define BLOCK_X_IMPR (TILE / ILP)
#define BLOCK_Y_IMPR 32
#define BLOCK_COUNT_X_IMPR 100
int main()
{
int i;
float *input;
float resultCPU, resultGPU;
double cpuTime, cpuBandwidth;
input = (float*)malloc(SIZE * sizeof(float));
resultCPU = 0.0;
resultGPU = 0.0;
srand((int)time(NULL));
auto start = std::chrono::high_resolution_clock::now();
auto end = std::chrono::high_resolution_clock::now();
for (i = 0; i < SIZE; i++)
input[i] = 1.0;
start = std::chrono::high_resolution_clock::now();
reductionCPU(&resultCPU, input);
end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
cpuTime = (diff.count() * 1000);
cpuBandwidth = (sizeof(float) * SIZE * 2) / (cpuTime * 1000000);
printf("CPU Time: %f ms, bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);
reductionWithCudaImproved(&resultGPU, input);
if (resultCPU != resultGPU)
printf("- CPU result does not match GPU result in improved atomic add. CPU: %f, GPU: %f, diff:%f\n\n", resultCPU, resultGPU, (resultCPU - resultGPU));
else
printf("+ CPU result matches GPU result in improved atomic add. CPU: %f, GPU: %f\n\n", resultCPU, resultGPU);
return 0;
}
void reductionCPU(float *result, const float *input)
{
for (int i = 0; i < SIZE; i++)
*result += input[i];
}
__global__ void reductionKernelImproved(float *result, const float *input)
{
int i;
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
__shared__ float interResult;
if (threadIdx.x == 0 && threadIdx.y == 0)
interResult = 0.0;
__syncthreads();
#pragma unroll ILP
for (i = 0; i < ILP; i++)
{
if (index < SIZE)
{
atomicAdd(&interResult, input[index]);
index++;
}
}
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
atomicAdd(result, interResult);
}
void reductionWithCudaImproved(float *result, const float *input)
{
dim3 dim_grid, dim_block;
float *dev_input = 0;
float *dev_result = 0;
cudaEvent_t start, stop;
float elapsed = 0;
double gpuBandwidth;
dim_block.x = BLOCK_X_IMPR;
dim_block.y = BLOCK_Y_IMPR;
dim_block.z = 1;
dim_grid.x = BLOCK_COUNT_X_IMPR;
dim_grid.y = (int)ceil((float)SIZE / (float)(TILE * dim_block.y* BLOCK_COUNT_X_IMPR));
dim_grid.z = 1;
cudaSetDevice(0);
cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
cudaMalloc((void**)&dev_result, sizeof(float));
cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
reductionKernelImproved << <dim_grid, dim_block >> >(dev_result, dev_input);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
gpuBandwidth = (sizeof(float) * SIZE * 2) / (elapsed * 1000000);
printf("GPU Time (improved): %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
cudaDeviceSynchronize();
cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dev_input);
cudaFree(dev_result);
return;
}

I think you have overlapping indices in your kernel call:
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
If I am not mistaken, your blockDim.x = 4 and BLOCK_COUNT_X_IMPR = 100, so each row will jump 400 indices.
However, your col can go as high as 400 * 8.
Consider:
blockIdx = (12, 0)
threadIdx = (3, 0)
=> col = (12*4 + 3) * 8 = 408
row = 0
index = 408
blockIdx = (0, 0)
threadIdx = (1, 1)
=> col = (0*4 + 1) * 8 = 8
row = 1
index = 1 * 400 + 8 = 408
So I guess you should rewrite your index
// gridDim.x = BLOCK_COUNT_X_IMPR
int index = row * blockDim.x * gridDim.x * ILP + col;

Solving sparse definite positive linear systems in CUDA

We are experiencing problems while using cuSOLVER's cusolverSpScsrlsvchol function, probably due to misunderstanding of the cuSOLVER library.
Motivation: we are solving the Poisson equation -divgrad x = b on a rectangular grid. In 2 dimensions with a 5-stencil (1, 1, -4, 1, 1), the Laplacian on the grid provides a (quite sparse) matrix A. Moreover, the charge distribution on the grid gives a (dense) vector b. A is positive definite and symmetric.
Now we solve A * x = b for x using nvidia's new cuSOLVER library that comes with CUDA 7.0 . It provides a function cusolverSpScsrlsvchol which should do the sparse Cholesky factorisation for floats.
Note: we are able to correctly solve the system with the alternative sparse QR factorisation function cusolverSpScsrlsvqr. For a 4 x 4 grid with all b entries on the edge being 1 and the rest 0, we get for x:
1 1 0.999999 1 1 1 0.999999 1 1 1 1 1 1 1 1 1
Our problems:
cusolverSpScsrlsvchol returns wrong results for x:
1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1
(solved, see answer below) Converting the CSR matrix A to a dense matrix and showing the output gives weird numbers (10^-44 and the like). The respective data from the CSR format are correct and validated with python numpy.
(solved, see answer below) The alternative sparse LU and partial pivoting with cusolverSpScsrlsvlu cannot even be found:
$ nvcc -std=c++11 cusparse_test3.cu -o cusparse_test3 -lcusparse -lcusolver
cusparse_test3.cu(208): error: identifier "cusolverSpScsrlsvlu" is undefined
What are we doing wrong? Thanks for your help!
Our C++ CUDA code:
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include <vector>
#include <cassert>
// create poisson matrix with Dirichlet bc. of a rectangular grid with
// dimension NxN
void assemble_poisson_matrix_coo(std::vector<float>& vals, std::vector<int>& row, std::vector<int>& col,
std::vector<float>& rhs, int Nrows, int Ncols) {
//nnz: 5 entries per row (node) for nodes in the interior
// 1 entry per row (node) for nodes on the boundary, since we set them explicitly to 1.
int nnz = 5*Nrows*Ncols - (2*(Ncols-1) + 2*(Nrows-1))*4;
vals.resize(nnz);
row.resize(nnz);
col.resize(nnz);
rhs.resize(Nrows*Ncols);
int counter = 0;
for(int i = 0; i < Nrows; ++i) {
for (int j = 0; j < Ncols; ++j) {
int idx = j + Ncols*i;
if (i == 0 || j == 0 || j == Ncols-1 || i == Nrows-1) {
vals[counter] = 1.;
row[counter] = idx;
col[counter] = idx;
counter++;
rhs[idx] = 1.;
// if (i == 0) {
// rhs[idx] = 3.;
// }
} else { // -laplace stencil
// above
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx-Ncols;
counter++;
// left
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx-1;
counter++;
// center
vals[counter] = 4.;
row[counter] = idx;
col[counter] = idx;
counter++;
// right
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx+1;
counter++;
// below
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx+Ncols;
counter++;
rhs[idx] = 0;
}
}
}
assert(counter == nnz);
}
int main() {
// --- create library handles:
cusolverSpHandle_t cusolver_handle;
cusolverStatus_t cusolver_status;
cusolver_status = cusolverSpCreate(&cusolver_handle);
std::cout << "status create cusolver handle: " << cusolver_status << std::endl;
cusparseHandle_t cusparse_handle;
cusparseStatus_t cusparse_status;
cusparse_status = cusparseCreate(&cusparse_handle);
std::cout << "status create cusparse handle: " << cusparse_status << std::endl;
// --- prepare matrix:
int Nrows = 4;
int Ncols = 4;
std::vector<float> csrVal;
std::vector<int> cooRow;
std::vector<int> csrColInd;
std::vector<float> b;
assemble_poisson_matrix_coo(csrVal, cooRow, csrColInd, b, Nrows, Ncols);
int nnz = csrVal.size();
int m = Nrows * Ncols;
std::vector<int> csrRowPtr(m+1);
// --- prepare solving and copy to GPU:
std::vector<float> x(m);
float tol = 1e-5;
int reorder = 0;
int singularity = 0;
float *db, *dcsrVal, *dx;
int *dcsrColInd, *dcsrRowPtr, *dcooRow;
cudaMalloc((void**)&db, m*sizeof(float));
cudaMalloc((void**)&dx, m*sizeof(float));
cudaMalloc((void**)&dcsrVal, nnz*sizeof(float));
cudaMalloc((void**)&dcsrColInd, nnz*sizeof(int));
cudaMalloc((void**)&dcsrRowPtr, (m+1)*sizeof(int));
cudaMalloc((void**)&dcooRow, nnz*sizeof(int));
cudaMemcpy(db, b.data(), b.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dcsrVal, csrVal.data(), csrVal.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dcsrColInd, csrColInd.data(), csrColInd.size()*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dcooRow, cooRow.data(), cooRow.size()*sizeof(int), cudaMemcpyHostToDevice);
cusparse_status = cusparseXcoo2csr(cusparse_handle, dcooRow, nnz, m,
dcsrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
std::cout << "status cusparse coo2csr conversion: " << cusparse_status << std::endl;
cudaDeviceSynchronize(); // matrix format conversion has to be finished!
// --- everything ready for computation:
cusparseMatDescr_t descrA;
cusparse_status = cusparseCreateMatDescr(&descrA);
std::cout << "status cusparse createMatDescr: " << cusparse_status << std::endl;
// optional: print dense matrix that has been allocated on GPU
std::vector<float> A(m*m, 0);
float *dA;
cudaMalloc((void**)&dA, A.size()*sizeof(float));
cusparseScsr2dense(cusparse_handle, m, m, descrA, dcsrVal,
dcsrRowPtr, dcsrColInd, dA, m);
cudaMemcpy(A.data(), dA, A.size()*sizeof(float), cudaMemcpyDeviceToHost);
std::cout << "A: \n";
for (int i = 0; i < m; ++i) {
for (int j = 0; j < m; ++j) {
std::cout << A[i*m + j] << " ";
}
std::cout << std::endl;
}
cudaFree(dA);
std::cout << "b: \n";
cudaMemcpy(b.data(), db, (m)*sizeof(int), cudaMemcpyDeviceToHost);
for (auto a : b) {
std::cout << a << ",";
}
std::cout << std::endl;
// --- solving!!!!
// cusolver_status = cusolverSpScsrlsvchol(cusolver_handle, m, nnz, descrA, dcsrVal,
// dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
// &singularity);
cusolver_status = cusolverSpScsrlsvqr(cusolver_handle, m, nnz, descrA, dcsrVal,
dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
&singularity);
cudaDeviceSynchronize();
std::cout << "singularity (should be -1): " << singularity << std::endl;
std::cout << "status cusolver solving (!): " << cusolver_status << std::endl;
cudaMemcpy(x.data(), dx, m*sizeof(float), cudaMemcpyDeviceToHost);
// relocated these 2 lines from above to solve (2):
cusparse_status = cusparseDestroy(cusparse_handle);
std::cout << "status destroy cusparse handle: " << cusparse_status << std::endl;
cusolver_status = cusolverSpDestroy(cusolver_handle);
std::cout << "status destroy cusolver handle: " << cusolver_status << std::endl;
for (auto a : x) {
std::cout << a << " ";
}
std::cout << std::endl;
cudaFree(db);
cudaFree(dx);
cudaFree(dcsrVal);
cudaFree(dcsrColInd);
cudaFree(dcsrRowPtr);
cudaFree(dcooRow);
return 0;
}

1.cusolverSpScsrlsvchol returns wrong results for x:
1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1
You said:
A is positive definite and symmetric.
No, it is not. It is not symmetric.
cusolverSpcsrlsvqr() has no requirement that the A matrix be symmetric.
cusolverSpcsrlsvchol() does have that requirement:
A is an m×m symmetric postive definite sparse matrix
This is the printout your code provides for the A matrix:
A:
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 -1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 4 -1 0 0 -1 0 0 0 0 0 0
0 0 0 0 0 -1 4 0 0 0 -1 0 0 0 0 0
0 0 0 0 0 0 -1 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 -1 0 0 0 0 0 0
0 0 0 0 0 -1 0 0 0 4 -1 0 0 0 0 0
0 0 0 0 0 0 -1 0 0 -1 4 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 -1 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 -1 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 -1 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
If that were symmetric, I would expect the second row:
0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0
to match the 2nd column:
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
By the way, a suggestion about Stack Overflow. If you answer your own question, my suggestion is that you intend it to be a complete answer. Some people might see an answered question and skip it. Probably better to edit such content into your question, thus focusing your question (I think) down to a single question. SO also doesn't work as well in my opinion when you ask multiple questions per question. That sort of behavior makes the question unnecessarily more difficult to answer, and I don't think it is serving you well here.

Although the matrix arising from Cartesian discretization of the Poisson equation is not positive definite, this question regards the inversion of sparse positive definite linear systems.
In the meanwhile cusolverSpScsrlsvchol becomes available for the device channel, I think it will be useful for potentially interested users to perform inversions of sparse positive definite linear systems using the cuSPARSE library. Here is a fully worked example:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if(CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int Nrows = 4; // --- Number of rows
const int Ncols = 4; // --- Number of columns
const int N = Nrows;
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));
// --- Column-major ordering
h_A_dense[0] = 0.4612f; h_A_dense[4] = -0.0006f; h_A_dense[8] = 0.3566f; h_A_dense[12] = 0.0f;
h_A_dense[1] = -0.0006f; h_A_dense[5] = 0.4640f; h_A_dense[9] = 0.0723f; h_A_dense[13] = 0.0f;
h_A_dense[2] = 0.3566f; h_A_dense[6] = 0.0723f; h_A_dense[10] = 0.7543f; h_A_dense[14] = 0.0f;
h_A_dense[3] = 0.f; h_A_dense[7] = 0.0f; h_A_dense[11] = 0.0f; h_A_dense[15] = 0.1f;
// --- Create device array and copy host array to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
int nnz = 0; // --- Number of nonzero elements in dense matrix
const int lda = Nrows; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row
int *d_nnzPerVector; gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Device side dense matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side dense matrix
double *h_A = (double *)malloc(nnz * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix in CSR format\n\n");
for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");
printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
// --- Allocating and defining dense host and device data vectors
double *h_x = (double *)malloc(Nrows * sizeof(double));
h_x[0] = 100.0; h_x[1] = 200.0; h_x[2] = 400.0; h_x[3] = 500.0;
double *d_x; gpuErrchk(cudaMalloc(&d_x, Nrows * sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, h_x, Nrows * sizeof(double), cudaMemcpyHostToDevice));
/******************************************/
/* STEP 1: CREATE DESCRIPTORS FOR L AND U */
/******************************************/
cusparseMatDescr_t descr_L = 0;
cusparseSafeCall(cusparseCreateMatDescr (&descr_L));
cusparseSafeCall(cusparseSetMatIndexBase (descr_L, CUSPARSE_INDEX_BASE_ONE));
cusparseSafeCall(cusparseSetMatType (descr_L, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatFillMode (descr_L, CUSPARSE_FILL_MODE_LOWER));
cusparseSafeCall(cusparseSetMatDiagType (descr_L, CUSPARSE_DIAG_TYPE_NON_UNIT));
/********************************************************************************************************/
/* STEP 2: QUERY HOW MUCH MEMORY USED IN CHOLESKY FACTORIZATION AND THE TWO FOLLOWING SYSTEM INVERSIONS */
/********************************************************************************************************/
csric02Info_t info_A = 0; cusparseSafeCall(cusparseCreateCsric02Info(&info_A));
csrsv2Info_t info_L = 0; cusparseSafeCall(cusparseCreateCsrsv2Info (&info_L));
csrsv2Info_t info_Lt = 0; cusparseSafeCall(cusparseCreateCsrsv2Info (&info_Lt));
int pBufferSize_M, pBufferSize_L, pBufferSize_Lt;
cusparseSafeCall(cusparseDcsric02_bufferSize(handle, N, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, info_A, &pBufferSize_M));
cusparseSafeCall(cusparseDcsrsv2_bufferSize (handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_L, &pBufferSize_L));
cusparseSafeCall(cusparseDcsrsv2_bufferSize (handle, CUSPARSE_OPERATION_TRANSPOSE, N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_Lt, &pBufferSize_Lt));
int pBufferSize = max(pBufferSize_M, max(pBufferSize_L, pBufferSize_Lt));
void *pBuffer = 0; gpuErrchk(cudaMalloc((void**)&pBuffer, pBufferSize));
/******************************************************************************************************/
/* STEP 3: ANALYZE THE THREE PROBLEMS: CHOLESKY FACTORIZATION AND THE TWO FOLLOWING SYSTEM INVERSIONS */
/******************************************************************************************************/
int structural_zero;
cusparseSafeCall(cusparseDcsric02_analysis(handle, N, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, info_A, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer));
cusparseStatus_t status = cusparseXcsric02_zeroPivot(handle, info_A, &structural_zero);
if (CUSPARSE_STATUS_ZERO_PIVOT == status){ printf("A(%d,%d) is missing\n", structural_zero, structural_zero); }
cusparseSafeCall(cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_L, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer));
cusparseSafeCall(cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE, N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_Lt, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer));
/*************************************/
/* STEP 4: FACTORIZATION: A = L * L' */
/*************************************/
int numerical_zero;
cusparseSafeCall(cusparseDcsric02(handle, N, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, info_A, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer));
status = cusparseXcsric02_zeroPivot(handle, info_A, &numerical_zero);
if (CUSPARSE_STATUS_ZERO_PIVOT == status){ printf("L(%d,%d) is zero\n", numerical_zero, numerical_zero); }
printf("\nNon-zero elements in Cholesky matrix\n\n");
gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost));
for (int k=0; k<nnz; k++) printf("%f\n", h_A[k]);
cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_A_dense, Nrows));
printf("\nCholesky matrix\n\n");
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << h_A_dense[i * Ncols + j] << " ";
std::cout << "]\n";
}
/*********************/
/* STEP 5: L * z = x */
/*********************/
// --- Allocating the intermediate result vector
double *d_z; gpuErrchk(cudaMalloc(&d_z, N * sizeof(double)));
const double alpha = 1.;
cusparseSafeCall(cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz, &alpha, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_L, d_x, d_z, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer));
/**********************/
/* STEP 5: L' * y = z */
/**********************/
// --- Allocating the host and device side result vector
double *h_y = (double *)malloc(Ncols * sizeof(double));
double *d_y; gpuErrchk(cudaMalloc(&d_y, Ncols * sizeof(double)));
cusparseSafeCall(cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, nnz, &alpha, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_Lt, d_z, d_y, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer));
cudaMemcpy(h_x, d_y, N * sizeof(double), cudaMemcpyDeviceToHost);
printf("\n\nFinal result\n");
for (int k=0; k<N; k++) printf("x[%i] = %f\n", k, h_x[k]);
}

Concerning 2: we have destroyed the cusparse handle too early (probably too much micro-tweaking to find the error sources....). Besides, the dense format is column-major which is why we need to transpose A to make it print properly!
Concerning 3: cusolverSpScsrlsvlu only exists on the host for the moment -- it's written in the documentation in a wonderfully obvious way under 6.2.1 remark 5.... http://docs.nvidia.com/cuda/cusolver/index.html#cusolver-lt-t-gt-csrlsvlu

Another possibility to solve a sparse, positive definite linear system is using the cuSOLVER library and, in particular, the cusolverSpDcsrlsvchol routine. It works very similar to the cuSOLVER routines used to Solving general sparse linear systems in CUDA, but uses a Cholesky factorization A = G * G^H, where G is the Cholesky factor, a lower triangular matrix.
As for the routines in Solving general sparse linear systems in CUDA and as of CUDA 10.0, only the host channel is at the moment available. Note that the reorder parameter has no effect and singularity is -1 if the matrix A is positive definite.
Below, a fully worked example:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cusparse.h>
#include <cusolverSp.h>
//https://www.physicsforums.com/threads/all-the-ways-to-build-positive-definite-matrices.561438/
//https://it.mathworks.com/matlabcentral/answers/101132-how-do-i-determine-if-a-matrix-is-positive-definite-using-matlab
/*******************/
/* iDivUp FUNCTION */
/*******************/
//extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__host__ __device__ int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cusolverGetErrorEnum(cusolverStatus_t error)
{
switch (error)
{
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
if (CUSOLVER_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSOLVE error in file '%s', line %d, error: %s \nterminating!\n", __FILE__, __LINE__, \
_cusolverGetErrorEnum(err)); \
assert(0); \
}
}
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if (CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs", __FILE__, __LINE__, err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int Nrows = 4; // --- Number of rows
const int Ncols = 4; // --- Number of columns
const int N = Nrows;
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));
// --- Column-major ordering
h_A_dense[0] = 1.78; h_A_dense[4] = 0.0; h_A_dense[8] = 0.1736; h_A_dense[12] = 0.0;
h_A_dense[1] = 0.00; h_A_dense[5] = 3.1; h_A_dense[9] = 0.0; h_A_dense[13] = 0.0;
h_A_dense[2] = 0.1736; h_A_dense[6] = 0.0; h_A_dense[10] = 5.0; h_A_dense[14] = 0.0;
h_A_dense[3] = 0.00; h_A_dense[7] = 0.0; h_A_dense[11] = 0.0; h_A_dense[15] = 2.349;
//create device array and copy host to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
int nnz = 0; // --- Number of nonzero elements in dense matrix
const int lda = Nrows; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row
int *d_nnzPerVector; gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Device side dense matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side dense matrix
double *h_A = (double *)malloc(nnz * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
// --- Allocating and defining dense host and device data vectors
double *h_y = (double *)malloc(Nrows * sizeof(double));
h_y[0] = 1.0; h_y[1] = 1.0; h_y[2] = 1.0; h_y[3] = 1.0;
double *d_y; gpuErrchk(cudaMalloc(&d_y, Nrows * sizeof(double)));
gpuErrchk(cudaMemcpy(d_y, h_y, Nrows * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating the host and device side result vector
double *h_x = (double *)malloc(Ncols * sizeof(double));
double *d_x; gpuErrchk(cudaMalloc(&d_x, Ncols * sizeof(double)));
// --- CUDA solver initialization
cusolverSpHandle_t solver_handle;
cusolverSpCreate(&solver_handle);
// --- Using Cholesky factorization
int singularity;
cusolveSafeCall(cusolverSpDcsrlsvcholHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
printf("Showing the results...\n");
for (int i = 0; i < N; i++) printf("%f\n", h_x[i]);
}

Why AddVector CUDA c++ is not working?

I am trying to add 2 arrays using CUDA , but it didn't work .
I did all that it should be done:
1) I parallelized the VectorAdd function
2) I allocated memory to the GPu and moved the data to the GPU
3) And last thing i modified the function VectorAdd to run on the GPU
This is the code :
#define SIZE 1024
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
int i = threadIdx.x ;
if(i < n)
c[i] = a[i] + b[i];
}
int main()
{
int *a , *b , *c;
int *d_a , *d_b , *d_c;
a = (int *)malloc(SIZE * sizeof(int));
b = (int *)malloc(SIZE * sizeof(int));
c = (int *)malloc(SIZE * sizeof(int));
cudaMalloc( &d_a , SIZE * sizeof(int) );
cudaMalloc( &d_b , SIZE * sizeof(int) );
cudaMalloc( &d_c , SIZE * sizeof(int) );
for ( int i = 0 ; i < SIZE ; ++i)
{
a[i] = i ;
b[i] = i ;
c[i] = 0 ;
}
cudaMemcpy(d_a, a, SIZE *sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, SIZE *sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, SIZE *sizeof(int), cudaMemcpyHostToDevice);
VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
cudaMemcpy(c, d_c, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0 ; i < 10 ; ++i)
{
printf("C[%d] = %d\n", i, c[i]);
}
free(a);
free(b);
free(c);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
The output on the console is this :
c[0] = 0 , c[1] = 0 , c[2] = 0 , c[3] = 0 , c[4] = 0 ....
Why is that it should be :
c[0] = 0 ; c[1] = 2 ; c[2] = 4 ....

In your case the problem depends on your used gpu. Your kernel is launched with 1024 threads per block. Since your gpu is of compute capability 1.x only 512 or 768 threads per block are supported. A detailed list can be found in the official programming guide.
Because you didn't use proper cuda error checking, you weren't possible to get the error returned by the cuda runtime api. A good guide for cuda error checking is given by #talonmies in this SO answer/question.

Copying from cuda 3D memory to linear memory: copied data is not where I expected

Here is my issue:
I have a 3D array of float3 on my device:
int size[3] = {416,464,512};
cudaExtent extent = make_cudaExtent(size[0]*sizeof(float3),size[1],size[2]);
cudaPitchedPtr renderedVolume;
int ret = cudaMalloc3D(&renderedVolume, extent);
size_t pitch = renderedVolume.pitch; //pitch = 5,120
size_t slicePitch = pitch * size[1]; //slicePitch = 2,375,680
Then I work with it and make it full of outstanding data.
After that I wish to copy it on a 1D linear memory on my host:
float *host_memory = (float*)malloc(size[0]*size[1]*size[2]*sizeof(float3));
cudaMemcpy3DParms p = {0};
p.srcPtr = renderedVolume;
p.dstPtr = make_cudaPitchedPtr(host_memory,size[0]*sizeof(float3),size[0],size[1]);
p.extent = make_cudaExtent(size[0]*sizeof(float3),size[1],size[2]);
p.srcPos = make_cudaPos(0,0,0);
p.dstPos = make_cudaPos(0,0,0);
p.kind=cudaMemcpyDeviceToHost;
cudaMemcpy3D(&p);
I am comparing the result in host_memory with the data I initially wrote tu renderedVolume (my_data) and with the data I read in my 3Dmemory, slice by slice:
float* test1 = (float*)malloc(size[0]*size[1]*sizeof(float3));
cudaMemcpy(test1, myData, size[0]*size[1]*sizeof(float3) , cudaMemcpyDeviceToHost);
float* test2 = (float*)malloc(size[0]*size[1]*sizeof(float3));
cudaMemcpy(test2,(char*)renderedVolume.ptr + slicePitch * i,size[0]*size[1]*sizeof(float3), cudaMemcpyDeviceToHost);
Problem:
The first slice (i=0) is ok, I have the same data in host_memory, test1 and test2.
In the second slice, I have the same data in test1 and test2. However, I should find this data in host_memory+579072 (=number of float per slice, also heigth*pitch of the destination pitched pointer) and I find it in host_memory+577504. It is off by 1568 bytes, which corresponds to nothing that I am aware of, and this is why I would very much appreciate if any of you have an idea of what the problem might be in my code ?

This is a late answer provided to remove this question from the unanswered list.
Below, I'm providing a full code showing how to allocate 3D memory by cudaMalloc3D, moving a host allocated 1D memory to 3D device memory by cudaMemcpy3D, performing some operations on the 3D device data by the test_kernel_3D __global__ function and moving the 3D result data back to 1D host memory, again by cudaMemcpy3D.
The __global__ function test_kernel_3D squares each element of the 3D device memory. In particular, each thread of a 2D grid takes care of performing a for loop along the "depth" dimension.
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<conio.h>
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16
#define N 128
#define M 64
#define W 16
/*****************/
/* CUDA MEMCHECK */
/*****************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getch(); exit(code); }
}
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/******************/
/* TEST KERNEL 3D */
/******************/
__global__ void test_kernel_3D(cudaPitchedPtr devPitchedPtr)
{
int tidx = blockIdx.x*blockDim.x+threadIdx.x;
int tidy = blockIdx.y*blockDim.y+threadIdx.y;
char* devPtr = (char*) devPitchedPtr.ptr;
size_t pitch = devPitchedPtr.pitch;
size_t slicePitch = pitch * N;
for (int w = 0; w < W; w++) {
char* slice = devPtr + w * slicePitch;
float* row = (float*)(slice + tidy * pitch);
row[tidx] = row[tidx] * row[tidx];
}
}
/********/
/* MAIN */
/********/
int main()
{
float a[N][M][W];
for (int i=0; i<N; i++)
for (int j=0; j<M; j++)
for (int w=0; w<W; w++) {
a[i][j][w] = 3.f;
//printf("row %i column %i depth %i value %f \n",i,j,w,a[i][j][w]);
}
// --- 3D pitched allocation and host->device memcopy
cudaExtent extent = make_cudaExtent(M * sizeof(float), N, W);
cudaPitchedPtr devPitchedPtr;
gpuErrchk(cudaMalloc3D(&devPitchedPtr, extent));
cudaMemcpy3DParms p = { 0 };
p.srcPtr.ptr = a;
p.srcPtr.pitch = M * sizeof(float);
p.srcPtr.xsize = M;
p.srcPtr.ysize = N;
p.dstPtr.ptr = devPitchedPtr.ptr;
p.dstPtr.pitch = devPitchedPtr.pitch;
p.dstPtr.xsize = M;
p.dstPtr.ysize = N;
p.extent.width = M * sizeof(float);
p.extent.height = N;
p.extent.depth = W;
p.kind = cudaMemcpyHostToDevice;
gpuErrchk(cudaMemcpy3D(&p));
dim3 GridSize(iDivUp(M,BLOCKSIZE_x),iDivUp(N,BLOCKSIZE_y));
dim3 BlockSize(BLOCKSIZE_y,BLOCKSIZE_x);
test_kernel_3D<<<GridSize,BlockSize>>>(devPitchedPtr);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
p.srcPtr.ptr = devPitchedPtr.ptr;
p.srcPtr.pitch = devPitchedPtr.pitch;
p.dstPtr.ptr = a;
p.dstPtr.pitch = M * sizeof(float);
p.kind = cudaMemcpyDeviceToHost;
gpuErrchk(cudaMemcpy3D(&p));
for (int i=0; i<N; i++)
for (int j=0; j<M; j++)
for (int w=0; w<W; w++)
printf("row %i column %i depth %i value %f\n",i,j,w,a[i][j][w]);
getch();
return 0;
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

CUDA streams and concurrent kernel execution - concurrency

Related

CUDA shuffle instruction reduction slower than shared memory reduction?

CUDA: cascaded summation of all vector elements

Solving sparse definite positive linear systems in CUDA

Why AddVector CUDA c++ is not working?

Copying from cuda 3D memory to linear memory: copied data is not where I expected

Categories

Resources