Finding maximum and minimum with CUBLAS - C++

I'm having trouble understanding why my function, which finds the maximum and minimum in a range of doubles using CUBLAS, doesn't work properly.
The code is as follows:
void findMaxAndMinGPU(double* values, int* max_idx, int* min_idx, int n)
{
    double* d_values;
    cublasHandle_t handle;
    cublasStatus_t stat;
    safecall( cudaMalloc((void**) &d_values, sizeof(double) * n), "cudaMalloc (d_values) in findMaxAndMinGPU");
    safecall( cudaMemcpy(d_values, values, sizeof(double) * n, cudaMemcpyHostToDevice), "cudaMemcpy (h_values > d_values) in findMaxAndMinGPU");
    cublasCreate(&handle);
    stat = cublasIdamax(handle, n, d_values, sizeof(double), max_idx);
    if (stat != CUBLAS_STATUS_SUCCESS)
        printf("Max failed\n");
    stat = cublasIdamin(handle, n, d_values, sizeof(double), min_idx);
    if (stat != CUBLAS_STATUS_SUCCESS)
        printf("min failed\n");
    cudaFree(d_values);
    cublasDestroy(handle);
}
where values holds the values to search, and max_idx and min_idx receive the indices of the found numbers in values.
The results from the CUBLAS calls seem rather random, and the returned indices are wrong.
Does anyone have a jolly good answer to my problem? I am a tad sad at the moment :(

One of the arguments to both your cublasIdamax and cublasIdamin calls is wrong. The incx argument in BLAS level 1 calls should always be the stride of the input in words (elements), not bytes. So I suspect that you want something more like:
stat = cublasIdamax(handle, n, d_values, 1, max_idx);
if (stat != CUBLAS_STATUS_SUCCESS)
    printf("Max failed\n");
stat = cublasIdamin(handle, n, d_values, 1, min_idx);
if (stat != CUBLAS_STATUS_SUCCESS)
    printf("min failed\n");
By using sizeof(double) you are telling the routines to use a stride of 8, which makes the calls overrun the allocated storage of the input array into uninitialised memory. I presume you actually intend a stride of 1 in d_values.
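As an aside (a sketch, not part of the original answer): a non-unit incx is only useful when the data is not contiguous, for example scanning one row of a column-major n x n matrix, where consecutive row elements are n values apart. The names d_matrix and row here are hypothetical:
// Hypothetical: index of the largest-magnitude element of row `row`
// of a column-major n x n matrix; the stride is n elements, not bytes.
int idx;
stat = cublasIdamax(handle, n, d_matrix + row, n, &idx);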
Edit: Here is a complete runnable example which works correctly. Note that I switched the code to single precision because I don't presently have access to double-precision-capable hardware:
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>

typedef float Real;

void findMaxAndMinGPU(Real* values, int* max_idx, int* min_idx, int n)
{
    Real* d_values;
    cublasHandle_t handle;
    cublasStatus_t stat;
    cudaMalloc((void**) &d_values, sizeof(Real) * n);
    cudaMemcpy(d_values, values, sizeof(Real) * n, cudaMemcpyHostToDevice);
    cublasCreate(&handle);
    stat = cublasIsamax(handle, n, d_values, 1, max_idx);
    if (stat != CUBLAS_STATUS_SUCCESS)
        printf("Max failed\n");
    stat = cublasIsamin(handle, n, d_values, 1, min_idx);
    if (stat != CUBLAS_STATUS_SUCCESS)
        printf("min failed\n");
    cudaFree(d_values);
    cublasDestroy(handle);
}

int main(void)
{
    const int vmax=1000, nvals=10000;
    float vals[nvals];
    srand(time(NULL));
    for(int j=0; j<nvals; j++) {
        vals[j] = float(rand() % vmax);
    }
    int minIdx, maxIdx;
    findMaxAndMinGPU(vals, &maxIdx, &minIdx, nvals);
    int cmin = 0, cmax=0;
    for(int i=1; i<nvals; i++) {
        cmin = (vals[i] < vals[cmin]) ? i : cmin;
        cmax = (vals[i] > vals[cmax]) ? i : cmax;
    }
    fprintf(stdout, "%d %d %d %d\n", minIdx, cmin, maxIdx, cmax);
    return 0;
}
which when compiled and run gives this:
$ g++ -I/usr/local/cuda/include -L/usr/local/cuda/lib cublastest.cc -lcudart -lcublas
$ ./a.out
273 272 85 84
Note that CUBLAS follows the FORTRAN convention and uses 1-based indexing rather than zero-based indexing, which is why there is a difference of 1 between the CUBLAS and CPU results.
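So to use the returned values as C array indices, simply subtract 1 (a trivial sketch):
int max_c_idx = maxIdx - 1; // 85 -> 84 in the output above
int min_c_idx = minIdx - 1; // 273 -> 272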

From the description in the documentation, these functions return the element of the maximum (respectively minimum) magnitude:
http://docs.nvidia.com/cuda/cublas/index.html#topic_6_1
So if you have { 1, 2, 3, -33, 22, 11 }, the result will be 4, not 5, because abs(-33) > 22.
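If you want the largest signed value rather than the largest magnitude, one alternative (a sketch using Thrust, not from the original answer) is thrust::max_element, which compares by signed value and returns a zero-based position:
#include <thrust/device_vector.h>
#include <thrust/extrema.h>

// Zero-based index of the largest (signed) value on the device.
int signedMaxIndex(const thrust::device_vector<double>& d_vals)
{
    return thrust::max_element(d_vals.begin(), d_vals.end()) - d_vals.begin();
}
For { 1, 2, 3, -33, 22, 11 } this returns 4 (zero-based), i.e. the element 22, whereas cublasIdamax returns 4 (one-based), i.e. the element -33.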

Related

cuda kernel gives incorrect results by grid size increase

I am testing a simple CUDA algorithm for timing purposes, and I came across a case where increasing the grid size of the kernel gives incorrect results:
#include <unistd.h>
#include <stdio.h>
#include <assert.h>

/* we need these includes for CUDA's random number stuff */
#include <curand.h>
#include <curand_kernel.h>

#define MAX 10

#ifdef GRID
#define REPEAT GRID
#else
#define REPEAT 65535
#endif

#ifdef VECSIZE
#define SIZE VECSIZE
#else
#define SIZE 1024
#endif

__global__ void random(int *result) {
    curandState_t state;
    curand_init(100, 0, threadIdx.x, &state);
    result[threadIdx.x] = curand(&state) % MAX;
    //printf("th %d random %d\n", threadIdx.x, *result);
}

__global__ void myadd(const int *in, int *sum) {
    sum[blockIdx.x] = 0;
    //printf("thread %d value %d\n",threadIdx.x, in[threadIdx.x]);
    atomicAdd_block(&sum[blockIdx.x], in[threadIdx.x]);
    //atomicAdd(sum, in[threadIdx.x]);
}

int main() {
    int check = 0;
    /* allocate the arrays on host and GPU */
    int *x = new int[SIZE];
    int *sum = new int[REPEAT];
    int *d_x, *d_sum;
    cudaMalloc(&d_x, sizeof(int) * SIZE);
    cudaMalloc(&d_sum, sizeof(int) * REPEAT);
    /* invoke the GPU to initialize all of the random values */
    random<<<1, SIZE>>>(d_x);
    myadd<<<REPEAT, SIZE>>>(d_x, d_sum);
    cudaDeviceSynchronize();
    /* copy the random numbers back */
    cudaMemcpy(x, d_x, sizeof(int) * SIZE, cudaMemcpyDeviceToHost);
    cudaMemcpy(sum, d_sum, sizeof(int) * REPEAT, cudaMemcpyDeviceToHost);
    for (int i = 0; i < SIZE; ++i) {
        check += x[i];
        //printf("Random[%d] = %d\n", i, x[i]);
    }
    cudaError_t err = cudaGetLastError(); // Get error code
    if (err != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(err));
        exit(-1);
    }
    for (int i = 0; i < REPEAT; ++i) {
        printf("i %d check %d sum[i] %d\n", i, check, sum[i]);
        assert(check == sum[i]);
    }
    /* free the memory we allocated */
    cudaFree(d_x);
    cudaFree(d_sum);
    delete[] x;
    delete[] sum;
    return 0;
}
My card is a V100 with compute capability 7.0. I can compile the above code with different grid and vector sizes, e.g. nvcc test.cu -arch=sm_70 -O3 -g -G -DGRID=1024 -DVECSIZE=512. For small vector and grid sizes everything looks good, but when I increase the grid size to the maximum (65535), the computed sum value is sometimes incorrect. For example:
...
i 511 check 2331 sum[i] 2331
i 512 check 2331 sum[i] 2331
i 513 check 2331 sum[i] 2188
a.out: test.cu:87: int main(): Assertion `check == sum[i]' failed.
There is a race condition in the kernel myadd: the sum must be set to 0 exactly once, and it must not be reset to 0 after other threads have already added their values to it.
__global__ void myadd(const int *in, int *sum) {
    if(threadIdx.x == 0){
        sum[blockIdx.x] = 0;
    }
    __syncthreads(); // all threads wait until sum is initialized with 0
    atomicAdd_block(&sum[blockIdx.x], in[threadIdx.x]);
}
If you want to time your code properly, you should remove the -G compiler flag.
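For reference, a minimal sketch of event-based timing around the kernel, using the names from the code above:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
myadd<<<REPEAT, SIZE>>>(d_x, d_sum);
cudaEventRecord(stop);
cudaEventSynchronize(stop); // wait for the kernel and the stop event to complete
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("myadd took %f ms\n", ms);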

CUDA Dynamic Parallelism, bad performance

We are having performance issues when using CUDA Dynamic Parallelism. At the moment, CDP is performing at least 3X slower than a traditional approach.
We made the simplest reproducible code to show this issue, which increments the value of all elements of an array by +1. i.e.,
a[0,0,0,0,0,0,0,.....,0] --> kernel +1 --> a[1,1,1,1,1,1,1,1,1]
The point of this simple example is just to see whether CDP can perform as well as the other approaches, or whether there are serious overheads.
The code is here:
#include <stdio.h>
#include <cuda.h>

#define BLOCKSIZE 512

__global__ void kernel_parent(int *a, int n, int N);
__global__ void kernel_simple(int *a, int n, int N, int offset);

// N is the total array size
// n is the worksize for a kernel (one third of N)
__global__ void kernel_parent(int *a, int n, int N){
    cudaStream_t s1, s2;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid == 0){
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
        kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
    }
    a[tid] += 1;
}

__global__ void kernel_simple(int *a, int n, int N, int offset){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int pos = tid + offset;
    if(pos < N){
        a[pos] += 1;
    }
}
int main(int argc, char **argv){
    if(argc != 3){
        fprintf(stderr, "run as ./prog n method\nn multiple of 32 eg: 1024, 1048576 (1024^2), 4194304 (2048^2), 16777216 (4096^2)\nmethod:\n0 (traditional) \n1 (dynamic parallelism)\n2 (three kernels using unique streams)\n");
        exit(EXIT_FAILURE);
    }
    int N = atoi(argv[1])*3;
    int method = atoi(argv[2]);
    // init array as 0
    int *ah, *ad;
    printf("genarray of 3*N = %i.......", N); fflush(stdout);
    ah = (int*)malloc(sizeof(int)*N);
    for(int i=0; i<N; ++i){
        ah[i] = 0;
    }
    printf("done\n"); fflush(stdout);
    // malloc and copy array to gpu
    printf("cudaMemcpy:Host->Device.........."); fflush(stdout);
    cudaMalloc(&ad, sizeof(int)*N);
    cudaMemcpy(ad, ah, sizeof(int)*N, cudaMemcpyHostToDevice);
    printf("done\n"); fflush(stdout);
    // kernel launch (timed)
    cudaStream_t s1, s2, s3;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s3, cudaStreamNonBlocking);
    cudaEvent_t start, stop;
    float rtime = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    printf("Kernel..........................."); fflush(stdout);
    if(method == 0){
        // CLASSIC KERNEL LAUNCH
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_simple<<< grid, block >>> (ad, N, N, 0);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    else if(method == 1){
        // DYNAMIC PARALLELISM
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_parent<<< grid, block, 0, s1 >>> (ad, N/3, N);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    else{
        // THREE CONCURRENT KERNEL LAUNCHES USING STREAMS
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_simple<<< grid, block, 0, s1 >>> (ad, N/3, N, 0);
        kernel_simple<<< grid, block, 0, s2 >>> (ad, N/3, N, N/3);
        kernel_simple<<< grid, block, 0, s3 >>> (ad, N/3, N, 2*(N/3));
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    printf("done\n"); fflush(stdout);
    printf("cudaMemcpy:Device->Host.........."); fflush(stdout);
    cudaMemcpy(ah, ad, sizeof(int)*N, cudaMemcpyDeviceToHost);
    printf("done\n"); fflush(stdout);
    printf("checking result.................."); fflush(stdout);
    for(int i=0; i<N; ++i){
        if(ah[i] != 1){
            fprintf(stderr, "bad element: a[%i] = %i\n", i, ah[i]);
            exit(EXIT_FAILURE);
        }
    }
    printf("done\n"); fflush(stdout);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&rtime, start, stop);
    printf("rtime: %f ms\n", rtime); fflush(stdout);
    return EXIT_SUCCESS;
}
Can be compiled with
nvcc -arch=sm_35 -rdc=true -lineinfo -lcudadevrt -use_fast_math main.cu -o prog
This example can compute the result with 3 methods:
Simple Kernel: just a single classic +1 kernel pass over the array.
Dynamic Parallelism: from main(), call a parent kernel which does +1 on the range [0, N/3) and also launches two child kernels. The first child does +1 on the range [N/3, 2*N/3), the second on [2*N/3, N). The children are launched on different streams so they can run concurrently.
Three Streams from Host: this one just launches three kernels on non-blocking streams from main(), one for each third of the array.
I get the following profile for method 0 (simple kernel):
The following for method 1 (dynamic parallelism):
And the following for method 2 (Three Streams from Host):
The running times are like this:
➜ simple-cdp git:(master) ✗ ./prog 16777216 0
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 1.140928 ms
➜ simple-cdp git:(master) ✗ ./prog 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 5.790048 ms
➜ simple-cdp git:(master) ✗ ./prog 16777216 2
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 1.011936 ms
The main problem, visible from the profiles, is that in the Dynamic Parallelism method the parent kernel takes an excessive amount of time to close after the two child kernels have finished, which is what makes it 3X to 4X slower. Even considering the worst case, where all three kernels (parent and two children) run serially, it should take much less: each kernel has N/3 of the work, so the whole parent kernel should take approximately as long as three child kernels, which is much less than what is measured. Is there a way to solve this problem?
EDIT: The serialization of the child kernels, as well as that of method 2, has been explained by Robert Crovella in the comments (many thanks). The fact that the kernels ran serially does not invalidate the problem described above (not for now, at least).
Calls into the device runtime are "expensive", just like calls into the host runtime are. In this case, it seems that you are calling into the device runtime to create streams in every thread, even though this code only requires them for thread 0.
By modifying your code to only request the stream creation for thread 0, we can produce timing parity between the case where we are using separate streams for the child kernel launch, and the case where we are not using separate streams for the child kernel launch:
$ cat t370.cu
#include <stdio.h>

#define BLOCKSIZE 512

__global__ void kernel_parent(int *a, int n, int N);
__global__ void kernel_simple(int *a, int n, int N, int offset);

// N is the total array size
// n is the worksize for a kernel (one third of N)
__global__ void kernel_parent(int *a, int n, int N){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid == 0){
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
#ifdef USE_STREAMS
        cudaStream_t s1, s2;
        cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
        cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
        kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
        kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
#else
        kernel_simple<<< grid, block >>> (a, n, N, n);
        kernel_simple<<< grid, block >>> (a, n, N, 2*n);
#endif
        // these next 2 lines add noticeably to the overall timing
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("oops1: %d\n", (int)err);
    }
    a[tid] += 1;
}

__global__ void kernel_simple(int *a, int n, int N, int offset){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int pos = tid + offset;
    if(pos < N){
        a[pos] += 1;
    }
}

int main(int argc, char **argv){
    if(argc != 3){
        fprintf(stderr, "run as ./prog n method\nn multiple of 32 eg: 1024, 1048576 (1024^2), 4194304 (2048^2), 16777216 (4096^2)\nmethod:\n0 (traditional) \n1 (dynamic parallelism)\n2 (three kernels using unique streams)\n");
        exit(EXIT_FAILURE);
    }
    int N = atoi(argv[1])*3;
    int method = atoi(argv[2]);
    // init array as 0
    int *ah, *ad;
    printf("genarray of 3*N = %i.......", N); fflush(stdout);
    ah = (int*)malloc(sizeof(int)*N);
    for(int i=0; i<N; ++i){
        ah[i] = 0;
    }
    printf("done\n"); fflush(stdout);
    // malloc and copy array to gpu
    printf("cudaMemcpy:Host->Device.........."); fflush(stdout);
    cudaMalloc(&ad, sizeof(int)*N);
    cudaMemcpy(ad, ah, sizeof(int)*N, cudaMemcpyHostToDevice);
    printf("done\n"); fflush(stdout);
    // kernel launch (timed)
    cudaStream_t s1, s2, s3;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s3, cudaStreamNonBlocking);
    cudaEvent_t start, stop;
    float rtime = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    printf("Kernel..........................."); fflush(stdout);
    if(method == 0){
        // CLASSIC KERNEL LAUNCH
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_simple<<< grid, block >>> (ad, N, N, 0);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    else if(method == 1){
        // DYNAMIC PARALLELISM
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_parent<<< grid, block, 0, s1 >>> (ad, N/3, N);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    else{
        // THREE CONCURRENT KERNEL LAUNCHES USING STREAMS
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_simple<<< grid, block, 0, s1 >>> (ad, N/3, N, 0);
        kernel_simple<<< grid, block, 0, s2 >>> (ad, N/3, N, N/3);
        kernel_simple<<< grid, block, 0, s3 >>> (ad, N/3, N, 2*(N/3));
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    printf("done\n"); fflush(stdout);
    printf("cudaMemcpy:Device->Host.........."); fflush(stdout);
    cudaMemcpy(ah, ad, sizeof(int)*N, cudaMemcpyDeviceToHost);
    printf("done\n"); fflush(stdout);
    printf("checking result.................."); fflush(stdout);
    for(int i=0; i<N; ++i){
        if(ah[i] != 1){
            fprintf(stderr, "bad element: a[%i] = %i\n", i, ah[i]);
            exit(EXIT_FAILURE);
        }
    }
    printf("done\n"); fflush(stdout);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&rtime, start, stop);
    printf("rtime: %f ms\n", rtime); fflush(stdout);
    return EXIT_SUCCESS;
}
$ nvcc -arch=sm_52 -rdc=true -lcudadevrt -o t370 t370.cu
$ ./t370 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 6.925632 ms
$ nvcc -arch=sm_52 -rdc=true -lcudadevrt -o t370 t370.cu -DUSE_STREAMS
$ ./t370 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 6.673568 ms
$
Although not included in the test output above, according to my testing, this also brings the CUDA dynamic parallelism (CDP) case (1) into "approximate parity" with the non-CDP cases (0, 2). Note that we can shave about 1 ms (!) off the above time by forgoing the call to cudaGetLastError() in the parent kernel (which I added to your code).
Here is a small standalone benchmark that measures the nested kernel launch overhead directly:
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

using thrust::host_vector;
using thrust::device_vector;

#define BLOCKSIZE 512

__global__ void child(int* a)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        a[0]++;
}

__global__ void parent(int* a)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        child<<<gridDim, blockDim>>>(a);
}

#define NBLOCKS 1024
#define NTHREADS 1024
#define BENCHCOUNT 1000

template<typename Lambda>
void runBench(Lambda arg, int* rp, const char* name)
{
    // "preheat" the GPU
    for (int i = 0; i < 100; i++)
        child<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp);
    cudaEvent_t start, stop;
    float rtime = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    for (int i = 0; i < BENCHCOUNT; i++)
        arg();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&rtime, start, stop);
    printf("=== %s ===\n", name);
    printf("time: %f ms\n", rtime/BENCHCOUNT); fflush(stdout);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaDeviceSynchronize();
}

int main(int argc, char **argv)
{
    host_vector<int> hv(1);
    hv[0] = 0xAABBCCDD;
    device_vector<int> dv(1);
    dv = hv;
    int* rp = thrust::raw_pointer_cast(&dv[0]);
    auto benchFun = [&](void) {
        child<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp); };
    runBench(benchFun, rp, "Single kernel launch");
    auto benchFun2 = [&](void) {
        for (int j = 0; j < 2; j++)
            child<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp);
    };
    runBench(benchFun2, rp, "2x sequential kernel launch");
    auto benchFunDP = [&](void) {
        parent<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp); };
    runBench(benchFunDP, rp, "Nested kernel launch");
}
To build/run:
Copy/paste code above to dpar.cu
nvcc -arch=sm_52 -rdc=true -std=c++11 -lcudadevrt -o dpar dpar.cu
./dpar
On my P5000 laptop it prints:
=== Single kernel launch ===
time: 0.014297 ms
=== 2x sequential kernel launch ===
time: 0.030468 ms
=== Nested kernel launch ===
time: 0.083820 ms
So the overhead is quite large; in my case it looks like about 43 microseconds.

Varying results from cuBlas

I have implemented the following CUDA code, but I am a little confused about its behavior.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <ctime>
#include <chrono>
#include <string>

#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))

void PrintMatrix(float* a, int n)
{
    int j, i;
    for (j = 1; j <= n; j++)
    {
        for (i = 1; i <= n; i++)
        {
            printf("%7.0f", a[IDX2F(i, j, n)]);
        }
        printf("\n");
    }
}

float* CreateMatrix(int n)
{
    float* matrix = static_cast<float *>(malloc(n * n * sizeof(float)));
    if (!matrix)
    {
        printf("host memory allocation failed");
        return nullptr;
    }
    for (int j = 1; j <= n; j++)
    {
        for (int i = 1; i <= n; i++)
        {
            matrix[IDX2F(i, j, n)] = 2;
        }
    }
    return matrix;
}

long CudaMatrixMultiply(float* matrix, int n)
{
    cudaError_t cudaStat;
    cublasStatus_t status;
    cublasHandle_t handle;
    float* deviceMatrix;
    cudaStat = cudaMalloc(reinterpret_cast<void**>(&deviceMatrix), n * n * sizeof(float));
    if (cudaStat != cudaSuccess)
    {
        printf("device memory allocation failed");
        return EXIT_FAILURE;
    }
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    }
    status = cublasSetMatrix(n, n, sizeof(float), matrix, n, deviceMatrix, n);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("data download failed");
        cudaFree(deviceMatrix);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }
    float alpha = 1;
    float beta = 0;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, deviceMatrix, n, deviceMatrix, n, &beta, deviceMatrix, n);
    status = cublasGetMatrix(n, n, sizeof(float), deviceMatrix, n, matrix, n);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("data upload failed");
        cudaFree(deviceMatrix);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }
    cudaFree(deviceMatrix);
    cublasDestroy(handle);
    return EXIT_SUCCESS;
}

float* CpuMatrixMultiply(float* matrix, int size)
{
    float* result = new float[size * size]();
    // Copied from https://msdn.microsoft.com/en-us/library/hh873134.aspx
    for (int row = 1; row <= size; row++)
    {
        for (int col = 1; col <= size; col++)
        {
            // Multiply the row of A by the column of B to get the row, column of product.
            for (int inner = 1; inner <= size; inner++)
            {
                // result[row][col] += matrix[row][inner] * matrix[inner][col];
                result[IDX2F(col, row, size)] += matrix[IDX2F(inner, row, size)] * matrix[IDX2F(col, inner, size)];
            }
        }
    }
    free(matrix);
    return result;
}

int main(void)
{
    // printf("Matrix * Matrix Test\n");
    int size = 1000;
    int runs = 10;
    for (int run = 0; run != runs; run++)
    {
        printf("=== Test %d (Matrix * Matrix, Size = %d) ===\n\n", run + 1, size);
        printf("RAM usage is: %f GB\n", size * size * sizeof(float) / 1000000000.0);
        float* cpuMatrix = CreateMatrix(size);
        cpuMatrix = CpuMatrixMultiply(cpuMatrix, size);
        PrintMatrix(cpuMatrix, 5);
        float* gpuMatrix = CreateMatrix(size);
        CudaMatrixMultiply(gpuMatrix, size);
        PrintMatrix(gpuMatrix, 5);
        free(cpuMatrix);
        free(gpuMatrix);
    }
    getchar();
    return EXIT_SUCCESS;
}
The output of the CPU version of the matrix multiplication is as expected:
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
but the GPU-computed result is sometimes the right one (see above) and sometimes a wrong, seemingly random one. The first time the loop is executed, the result is always right.
I am not able to find a mistake in my code, and it would be great if you could help me.
Additionally, if I set size (in the main method) to e.g. 16000, my driver crashes and I get an error message. I have written a bug report to NVIDIA because my PC crashed twice. But maybe it is a programming fault of mine?
Driver: 364.72 (newest one)
SDK: CUDA Toolkit 7.5
Graphics Card: NVidia GeForce GTX 960 (4GB)
Windows 10 64Bit
Driver Error
Display driver NVIDIA Windows kernel Mode Driver, Version 362.72 stopped responding and has successfully recovered.
Edit: With the help of the community I found out that this is a problem with the watchdog timer. See the answer below.
Regarding the second part of the question, following njuffa's remark, you may change the driver-behavior settings to avoid the error when increasing size: open NSIGHT Monitor and, under Options, General, Microsoft Display Driver, change the "WDDM TDR enabled" field to False.
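(To my knowledge, the same setting corresponds to the TDR registry values, e.g. TdrLevel and TdrDelay, under HKEY_LOCAL_MACHINE\System\CurrentControlSet\Control\GraphicsDrivers; a reboot is required for changes to take effect.)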
From the spec, the card's single-precision throughput should be around 2.4 TFLOPS, hence your operation on a 16000-sized matrix should take at minimum about 3.5 seconds. Hence the driver recovery after 2 seconds.
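For reference, the arithmetic behind that estimate (our own back-of-the-envelope): an n x n SGEMM performs roughly 2*n^3 floating-point operations, so n = 16000 gives 2 * 16000^3 ≈ 8.2e12 FLOPs; at 2.4 TFLOPS that is 8.2e12 / 2.4e12 ≈ 3.4 seconds, well over the default 2-second TDR limit.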

CUDA not giving correct answer when array size is larger than 1,000,000

I have written a simple sum-reduction code that seems to work just fine until I increase the array size to 1 million. What could be the problem?
#define BLOCK_SIZE 128
#define ARRAY_SIZE 10000

cudaError_t addWithCuda(const long *input, long *output, int totalBlocks, size_t size);

__global__ void sumKernel(const long *input, long *output)
{
    int tid = threadIdx.x;
    int bid = blockDim.x * blockIdx.x;
    __shared__ long data[BLOCK_SIZE];
    if(bid+tid < ARRAY_SIZE)
        data[tid] = input[bid+tid];
    else
        data[tid] = 0;
    __syncthreads();
    for(int i = BLOCK_SIZE/2; i >= 1; i >>= 1)
    {
        if(tid < i)
            data[tid] += data[tid + i];
        __syncthreads();
    }
    if(tid == 0)
        output[blockIdx.x] = data[0];
}
int main()
{
    int totalBlocks = ARRAY_SIZE/BLOCK_SIZE;
    if(ARRAY_SIZE % BLOCK_SIZE != 0)
        totalBlocks++;
    long *input = (long*) malloc(ARRAY_SIZE * sizeof(long));
    long *output = (long*) malloc(totalBlocks * sizeof(long));
    for(int i=0; i<ARRAY_SIZE; i++)
    {
        input[i] = i+1;
    }
    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(input, output, totalBlocks, ARRAY_SIZE);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    long ans = 0;
    for(int i = 0; i < totalBlocks; i++)
    {
        ans = ans + output[i];
    }
    printf("Final Ans : %ld", ans);
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    getchar();
    return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(const long *input, long *output, int totalBlocks, size_t size)
{
    long *dev_input = 0;
    long *dev_output = 0;
    cudaError_t cudaStatus;
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU buffers for two vectors (one input, one output).
    cudaStatus = cudaMalloc((void**)&dev_input, size * sizeof(long));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_output, totalBlocks * sizeof(long));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_input, input, size * sizeof(long), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_output, output, totalBlocks * sizeof(long), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    // Launch a kernel on the GPU with one thread for each element.
    sumKernel<<<totalBlocks, BLOCK_SIZE>>>(dev_input, dev_output);
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching sumKernel!\n", cudaStatus);
        goto Error;
    }
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(output, dev_output, totalBlocks * sizeof(long), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    cudaFree(dev_input);
    cudaFree(dev_output);
    return cudaStatus;
}
And just for reference, in case it has something to do with my GPU device: my GPU is a GTX 650 Ti.
Here is the info about the GPU:
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Maximum sizes of each dimension of a block: 1024 x 1024 x 64
Maximum sizes of each dimension of a grid: 2147483647 x 65535 x 65535
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Actually, the answer could not fit in a long either, so after switching the datatypes to long double this issue was resolved. Thanks all!
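For reference, the arithmetic: the sum 1 + 2 + ... + N is N(N+1)/2, so for N = 1,000,000 the result is 500,000,500,000, which overflows a 32-bit long (maximum 2,147,483,647; long is 32 bits on Windows, for example). Any 64-bit integer type, e.g. long long, would also hold it.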
One problem in your code is that your last cudaMemcpy is not set up correctly:
cudaMemcpy(output, dev_output, totalBlocks * sizeof(int), cudaMemcpyDeviceToHost);
All of your data is long data, so you should be copying using sizeof(long), not sizeof(int).
Another problem in your code is using the wrong printf format identifier for a long datatype:
printf("\n %d \n",output[i]);
use something like this instead:
printf("\n %ld \n",output[i]);
You may also have a problem with a large block count if you are not compiling for the sm_30 architecture. In that case, proper CUDA error checking would identify the problem.
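(For example, compiling with nvcc -arch=sm_30 test.cu, filename hypothetical, raises the grid.x limit from 65535 to 2147483647, matching the device info quoted above.)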
You don't check for errors after sumKernel<<<totalBlocks, BLOCK_SIZE>>>(dev_input, dev_output);. Normally, if you checked for the last occurred error, it would give the error invalid configuration argument. Try adding the following after the sumKernel line:
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "sumKernel failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}
See this question for more information about the error.
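For reuse, a sketch of an error-checking helper (the macro name is ours, not from the question):
#define CUDA_CHECK(call) \
    do { \
        cudaError_t e = (call); \
        if (e != cudaSuccess) { \
            fprintf(stderr, "CUDA error: %s at %s:%d\n", \
                    cudaGetErrorString(e), __FILE__, __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Usage: check both the launch itself and the subsequent synchronization.
sumKernel<<<totalBlocks, BLOCK_SIZE>>>(dev_input, dev_output);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());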

CUDA Matrix Addition Seg faults

I just have a question about my CUDA program that I wrote. It allows me to enter the size of the matrix (columns and rows). Say I enter ~1124: it computes fine. However, say I enter 1149: it seg faults AFTER computing on the device (I think it seg faults during the copy back). But say I enter 2000: it seg faults BEFORE computing on the device (I think it seg faults during the copy over). I think my issue is all with memory management. If you guys could point me in the right direction, I'd appreciate it.
I updated the code with how it is called. The new edit (at the bottom) contains: sumMatrix (a blank matrix of size eleCount1, which is the size of the entire matrix), matrixOne (the first matrix), matrixTwo (the second matrix, allocated the same way as matrixOne), and eleCount1 (the size of the entire matrix). Both matrixOne and matrixTwo are read in from a file.
Wasn't sure if someone needed to see this stuff about my GPU:
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per block: 1024
Maximum sizes of each dimension of a block: 1024 x 1024 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 65535
The code is:
__global__ void addKernel(float *c, float *a, float *b)
{
    int i = threadIdx.x;
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    c[idx] = a[idx] + b[idx];
}
cudaError_t addWithCuda(float *c, float *a, float *b, size_t size)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;
    blocksNeeded = (size/MAXTHREADS) + 1;
    int threadsPerBlock = MAXTHREADS/blocksNeeded + 1;
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    addKernel<<<blocksNeeded, size>>>(dev_c, dev_a, dev_b);
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}
// Edit: added how the matrices are allocated
float* matrixOne = (float*)malloc(sizeof(float)*file1size);
int matrixIndex = 0;
readFromFile(fd, byte, matrixOneWidth, matrixOneHeight, matrixOne);
//matrixOneHeight--;
eleCount1 = matrixOneHeight*matrixOneWidth;
matrixOne = (float*)realloc(matrixOne, eleCount1*sizeof(float));

// Edit: added how addWithCuda is called.
cudaStatus = addWithCuda(sumMatrix, matrixOne, matrixTwo, eleCount1);
// sumMatrix is created after we know how large the matrices are.
float sumMatrix[eleCount1];
You are not testing the bounds of your computation inside the kernel. If the total amount of work does not divide evenly by the block size, some threads will try to write to indices that are outside the output array. I suggest you also pass the size as a parameter to the kernel and introduce the check:
__global__ void addKernel(float *c, float *a, float *b, int size)
{
    int i = threadIdx.x;
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx < size) c[idx] = a[idx] + b[idx];
}
I see that you are indexing into arrays a, b and c in your kernel, but you do not check to make sure that the index is within the array bounds. You are therefore writing into memory that you do not own, causing seg faults in random places.
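Putting both fixes together, a sketch of a consistent launch configuration, reusing the question's variable names: pick a fixed block size, derive the grid size from it, and pass size so the kernel can guard its index:
int threadsPerBlock = MAXTHREADS; // e.g. 256 or 1024
int blocksNeeded = (size + threadsPerBlock - 1) / threadsPerBlock;
addKernel<<<blocksNeeded, threadsPerBlock>>>(dev_c, dev_a, dev_b, size);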