Error when accessing CUDA array from cudaMalloc - C++

I'm using VS2019 and have an NVIDIA GeForce GPU. I tried the code from this link: https://towardsdatascience.com/writing-lightning-fast-code-with-cuda-c18677dcdd5f
However, I want to try using cudaMalloc instead of managed memory with cudaMallocManaged.
I tried the code below:
__global__
void add(int n, float* x, float* y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
int main()
{
int N = 1 << 20;
float* x, * y;
cudaMalloc(&x, N * sizeof(float));
cudaMalloc(&y, N * sizeof(float));
cudaMemset(x,1.0, N * sizeof(float)); //want to set x as an array of 1.0s
cudaMemset(y,2.0, N * sizeof(float)); //want to set y as an array of 2.0s
int device = -1;
cudaGetDevice(&device);
int blockSize = 1024;
int numBlocks = (N + blockSize - 1) / blockSize;
auto t1 = std::chrono::high_resolution_clock::now();
add << <numBlocks, blockSize >> > (N, x, y);
cudaDeviceSynchronize();
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i] - 3.0f));
std::cout << "Max error: " << maxError << std::endl;
std::cout << "duration CUDA: "<<duration;
cudaFree(x);
cudaFree(y);
return 0;
}
But I'm getting an unhandled exception error at maxError = fmax(maxError, fabs(y[i] - 3.0f));. I'm guessing this is because I didn't use cudaMemset correctly? How should I modify it?

In no particular order:
Device memory (i.e. memory allocated with cudaMalloc) can't be accessed directly on the host, so your maxError calculation is illegal because y is a pointer to device memory. To perform the error check, you need to copy y back to a local host copy of the memory before running the loop.
cudaMemset sets bytes, not words (just like regular memset). The value argument is converted to an unsigned char and every byte of the range is set to it, so cudaMemset(x, 1.0, N * sizeof(float)) fills x with the byte pattern 0x01 (each float becomes roughly 2.4e-38), not with 1.0f values. You either need to set the values on the host and copy them to the device, initialize them in another kernel, or use something like thrust::fill_n (a sketch follows this list).
In the spirit of your previous question, there is typically setup latency in the first call of a kernel, so perform a warm-up call before timing.
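For the thrust::fill_n option mentioned above, a minimal sketch (an illustration added here, not part of the original answer):
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
// after cudaMalloc(&x, N * sizeof(float)):
thrust::device_ptr<float> xp(x); // wrap the raw device pointer
thrust::fill_n(xp, N, 1.0f);     // set N floats to 1.0f on the device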
Doing these three things gets me this:
int main()
{
int N = 1 << 20;
std::vector<float> xh(N, 1.0f);
std::vector<float> yh(N, 2.0f);
float* x, * y;
cudaMalloc(&x, N * sizeof(float));
cudaMemcpy(x, &xh[0], N * sizeof(float), cudaMemcpyHostToDevice);
cudaMalloc(&y, N * sizeof(float));
cudaMemcpy(y, &yh[0], N * sizeof(float), cudaMemcpyHostToDevice);
int blockSize, numBlocks;
cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, add);
for(int rep=0; rep<10; rep++) {
auto t1 = std::chrono::high_resolution_clock::now();
add << <numBlocks, blockSize >> > (N, x, y);
cudaDeviceSynchronize();
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
std::cout << rep << " duration CUDA: " << duration <<std::endl;
}
cudaMemcpy(&yh[0], y, N * sizeof(float), cudaMemcpyDeviceToHost);
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(yh[i] - 12.0f));
std::cout << "Max error: " << maxError << std::endl;
cudaFree(x);
cudaFree(y);
cudaDeviceReset();
return 0;
}
And compiling it and running it gets me this:
$ nvcc -arch=sm_52 -std=c++11 -o devmem devmem.cu
$ ./devmem
0 duration CUDA: 155
1 duration CUDA: 94
2 duration CUDA: 95
3 duration CUDA: 94
4 duration CUDA: 94
5 duration CUDA: 93
6 duration CUDA: 93
7 duration CUDA: 99
8 duration CUDA: 92
9 duration CUDA: 93
Max error: 0
Compared to the timings in my last answer to you, you can see that using device memory provides a speedup over managed memory on my system. As always, your results might vary.

Related

CUDA straight array implementation 3x faster than Struct of Arrays (SoA) AND Array of Structs (AoS)

I found this question about the speed of Structure of Arrays (SoA) and Array of Structs (AoS) implementations, and the top answer states that the speed of each implementation depends on access pattern. To test this, I've created three kernels: one for a raw array of doubles, another for the AoS pattern, and the last for the SoA pattern. Each method adds 1.0 to each respective "element" in some column vector, if you will. What I'm finding is that the straight array case is more than 3x faster than both the SoA and AoS implementations. I cannot understand why the straight array is this much faster when the access patterns are identical between all three methods? The CUDA profiler outputs the same message for each kernel, which is:
The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To further improve performance, work will likely need to be shifted from the most utilized to another unit. Start by analyzing workloads in the Compute Workload Analysis section.
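To make the three layouts concrete, this is roughly how the data ends up in device memory (sketched from the structs in the example below):
// Raw array: one flat buffer of n * nvars doubles:
//   | v0 v1 v2 v3 | v4 v5 v6 v7 | ...
// AoS: one struct per element, fields interleaved (sizeof(AoS) is 32 bytes):
//   | t0 x0 y0 z0 | t1 x1 y1 z1 | ...
// SoA: one contiguous array per field:
//   t: | t0 t1 t2 ... |   x: | x0 x1 x2 ... |   y: ...   z: ...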
Here is a minimal reproducible example:
#include <iostream>
#include <chrono>
#include <vector>
using namespace std::chrono;
constexpr int nvars = 4;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
struct AoS {
double t, x, y, z;
__device__ AoS& operator+=(const AoS &vec) {
t += vec.t;
x += vec.x;
y += vec.y;
z += vec.z;
return *this;
};
};
struct SoA {
size_t size;
bool host_allocated, device_allocated;
double* t;
double* x;
double* y;
double* z;
double* gpu_t;
double* gpu_x;
double* gpu_y;
double* gpu_z;
SoA* device_ptr;
SoA(int elements) : size(elements * sizeof(double)), host_allocated(false), device_allocated(false) {};
~SoA(){
if (host_allocated){
free(t);
free(x);
free(y);
free(z);
}
if (device_allocated) {
cudaFree(gpu_t);
cudaFree(gpu_x);
cudaFree(gpu_y);
cudaFree(gpu_z);
cudaFree(device_ptr);
}
}
void host_allocate() {
t = (double*)malloc(size);
x = (double*)malloc(size);
y = (double*)malloc(size);
z = (double*)malloc(size);
host_allocated = true;
};
void device_allocate() {
if (!host_allocated) {
host_allocate();
}
gpuErrchk(cudaMalloc((void**)&device_ptr, sizeof(SoA)));
gpuErrchk(cudaMalloc((void**)&gpu_t, size));
gpuErrchk(cudaMalloc((void**)&gpu_x, size));
gpuErrchk(cudaMalloc((void**)&gpu_y, size));
gpuErrchk(cudaMalloc((void**)&gpu_z, size));
device_allocated = true;
};
void copy_to_device() {
if (!device_allocated) {
device_allocate();
}
gpuErrchk(cudaMemcpy(gpu_t, t, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(gpu_x, x, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(gpu_y, y, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(gpu_z, z, size, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->t), &gpu_t, sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->x), &gpu_x, sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->y), &gpu_y, sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&(device_ptr->z), &gpu_z, sizeof(double *), cudaMemcpyHostToDevice));
}
void copy_to_host() {
if (!device_allocated) {
device_allocate();
}
gpuErrchk(cudaMemcpy(t, gpu_t, size, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(x, gpu_x, size, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(y, gpu_y, size, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(z, gpu_z, size, cudaMemcpyDeviceToHost));
}
SoA* get_ptr(){
if (device_allocated) {
return device_ptr;
}
return this;
}
};
__global__ void arr_add(double* vec, int n){
int ii = blockIdx.x * blockDim.x + threadIdx.x;
if (ii >= n)
return;
#pragma unroll
for (int var = 0; var < nvars; var++) {
vec[ii + var] = vec[ii + var] + 1.0;
}
};
__global__ void aos_add(AoS* vec, int n){
int ii = blockIdx.x * blockDim.x + threadIdx.x;
if (ii >= n)
return;
vec[ii] += AoS{1.0, 1.0, 1.0, 1.0};
};
__global__ void soa_add(SoA *vec, int n){
int ii = blockIdx.x * blockDim.x + threadIdx.x;
if (ii >= n)
return;
vec->t[ii] = vec->t[ii] + 1.0;
vec->x[ii] = vec->x[ii] + 1.0;
vec->y[ii] = vec->y[ii] + 1.0;
vec->z[ii] = vec->z[ii] + 1.0;
};
int main() {
constexpr int n = 1 << 25;
constexpr int block_size = 128;
high_resolution_clock::time_point t1, t2;
duration<double> dt1, dt2, dt3;
SoA hybrid_soa(n);
hybrid_soa.device_allocate();
hybrid_soa.copy_to_device();
std::vector<AoS> host_vec_aos(n);
std::vector<double> host_arr(n * nvars);
AoS *dev_aos;
double *dev_arr;
gpuErrchk(cudaMalloc((void**)&dev_aos, n * sizeof(AoS)));
gpuErrchk(cudaMalloc((void**)&dev_arr, n * nvars * sizeof(double)));
gpuErrchk(cudaMemcpy(dev_aos, host_vec_aos.data(), n * sizeof(AoS), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(dev_arr, host_arr.data(), n * nvars * sizeof(double), cudaMemcpyHostToDevice));
const int nblocks = (n + block_size - 1) / block_size;
std::cout << "Size of AoS struct is " << sizeof(AoS) << " bytes" << "\n";
std::cout << "Size of SoA struct is " << sizeof(SoA) << " bytes" << "\n";\
t1 = high_resolution_clock::now();
arr_add<<<nblocks, block_size>>>(dev_arr, n);
gpuErrchk(cudaDeviceSynchronize());
t2 = high_resolution_clock::now();
dt1 = t2 - t1;
std::cout << "SrA took: " << std::scientific << dt1.count() << " seconds" << "\n";
t1 = high_resolution_clock::now();
aos_add<<<nblocks, block_size>>>(dev_aos, n);
gpuErrchk(cudaDeviceSynchronize());
t2 = high_resolution_clock::now();
dt2 = t2 - t1;
std::cout << "AoS took: " << std::scientific << dt2.count() << " seconds" << "\n";
t1 = high_resolution_clock::now();
soa_add<<<nblocks, block_size>>>(hybrid_soa.get_ptr(), n);
gpuErrchk(cudaDeviceSynchronize());
t2 = high_resolution_clock::now();
dt3 = t2 - t1;
std::cout << "SoA took: " << std::scientific << dt3.count() << " seconds" << "\n";
gpuErrchk(cudaMemcpy(host_vec_aos.data(), dev_aos, n * sizeof(AoS), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(host_arr.data(), dev_arr, n * nvars * sizeof(double), cudaMemcpyDeviceToHost));
hybrid_soa.copy_to_host();
std::cout << "Straight array is: " << dt2.count() / dt1.count() << " times faster than AoS" << "\n";
std::cout << "Straight array is: " << dt3.count() / dt1.count() << " times faster than SoA" << "\n";
cudaFree(dev_aos);
cudaFree(dev_arr);
return 0;
}
I was expecting the speed to be near identical.

Trying to run a cusolverDnSSgels test case, however it is not working

I'm busy working on an LS method. I manually implemented a conjugate gradient solver, but after updating my CUDA version I saw that there is a new function (cusolverDnSSgels) which I assume is faster than my manual implementation. My first task was to try to run it on a test case (see below); I'd expect the result to be -6.5, 9.7 according to MATLAB. Unfortunately I cannot find what I did wrong, and I also cannot find an example because it is a relatively new function.
The output says that niter = -3, which according to the documentation would suggest too many iterations; however, this would not make sense, as it is a very small matrix which should be easily solvable.
#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "device_launch_parameters.h"
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cusolverDnHandle_t cusolverH;
cusolverStatus_t stat;
// create handle
stat = cusolverDnCreate(&cusolverH);
//params
const int C = 3;
const int M = 2;
long lda = C;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat, *gXmat;
//allocate mem
Amat = (float*)malloc(M * C * sizeof(float));
Ymat = (float*)malloc(C * sizeof(float));
Xmat = (float*)malloc(M * sizeof(float));
srand(100);
#if 0
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
#endif
Amat[0] = 6;
Amat[1] = 7;
Amat[2] = 6;
Amat[3] = 5;
Amat[4] = 5;
Amat[5] = 5;
Ymat[0] = 9;
Ymat[1] = 3;
Ymat[2] = 10;
//allocate mem
cudaMalloc(&gAmat, M * C * sizeof(float));
cudaMalloc(&gYmat, C * sizeof(float));
cudaMalloc(&gXmat, M * 1 * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float *gdwork;
size_t work_bytes;
stat = cusolverDnSSgels_bufferSize(cusolverH,C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
std::cout << "Status = " << stat << std::endl;
int niter = 0;
int dinfo = 0;
cudaMalloc(&gdwork, work_bytes * sizeof(float));
stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, &dinfo);
std::cout << "Status = " << stat << std::endl;
std::cout << "niter = " << niter << std::endl;
std::cout << "dinfo = " << dinfo << std::endl;
cudaDeviceSynchronize();
cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
//free memory
cudaFree(gdwork);
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gXmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destroy handle
cusolverDnDestroy(cusolverH);
return 0;
}
The results I get are:
Status = 0
Status = 0
niter = -3
dinfo = 0
-4.31602e+08, -4.31602e+08
Could someone point out what I am doing wrong?
You have a problem with your dinfo parameter usage. Referring to the documentation's parameter table for the cusolverDn<t1><t2>gels() functions, we see that:
dinfo (device memory, output): status of the IRS solver on return. If 0, the solve was successful. If dinfo = -i, then the i-th argument is not valid.
The dinfo parameter is expected to live in device memory. But you have it in host memory:
int dinfo = 0;
If I move the storage to the proper location, your code outputs the values you indicate as expected:
$ cat t143.cu
#include <iostream>
#include <cublas_v2.h>
#include <cusolverDn.h>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cusolverDnHandle_t cusolverH;
cusolverStatus_t stat;
// create handle
stat = cusolverDnCreate(&cusolverH);
//params
const int C = 3;
const int M = 2;
long lda = C;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat, *gXmat;
//allocate mem
Amat = (float*)malloc(M * C * sizeof(float));
Ymat = (float*)malloc(C * sizeof(float));
Xmat = (float*)malloc(M * sizeof(float));
srand(100);
#if 0
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
#endif
Amat[0] = 6;
Amat[1] = 7;
Amat[2] = 6;
Amat[3] = 5;
Amat[4] = 5;
Amat[5] = 5;
Ymat[0] = 9;
Ymat[1] = 3;
Ymat[2] = 10;
//allocate mem
cudaMalloc(&gAmat, M * C * sizeof(float));
cudaMalloc(&gYmat, C * sizeof(float));
cudaMalloc(&gXmat, M * 1 * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float *gdwork;
size_t work_bytes;
stat = cusolverDnSSgels_bufferSize(cusolverH,C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
std::cout << "Status = " << stat << std::endl;
int niter = 0;
int *dinfo, hinfo;
cudaMalloc(&gdwork, work_bytes * sizeof(float));
cudaMalloc(&dinfo, sizeof(int));
stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, dinfo);
cudaMemcpy(&hinfo, dinfo, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "Status = " << stat << std::endl;
std::cout << "niter = " << niter << std::endl;
std::cout << "dinfo = " << hinfo << std::endl;
cudaDeviceSynchronize();
cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
//free memory
cudaFree(gdwork);
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gXmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destroy handle
cusolverDnDestroy(cusolverH);
return 0;
}
$ nvcc -o t143 t143.cu -lcublas -lcusolver
$ cuda-memcheck ./t143
========= CUDA-MEMCHECK
Status = 0
Status = 0
niter = -51
dinfo = 0
-6.5, 9.7
========= ERROR SUMMARY: 0 errors
$
Notes:
I am using CUDA 11.3 for the above. If you are using an earlier version, I strongly recommend you move forward to CUDA 11.3 or newer for usage of this function.
You can get a hint as to the problem by running your code with cuda-memcheck
It was fairly quick to spot the problem by reviewing your parameter usage against the table of parameter locations (host/device) given in the documentation. Your previous question had a similar issue: there too, the problem could be found by reviewing your parameter locations (host/device) against the table given in the documentation. This may be a good thing to check to save yourself time in the future.
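As an aside, in the same spirit as the gpuErrchk macro from the earlier question, a minimal status-check helper for cuSOLVER calls might look like this (a sketch of my own, with a made-up macro name):
#include <cstdio>
#include <cstdlib>
#include <cusolverDn.h>
#define cusolverErrchk(ans) { cusolverAssert((ans), __FILE__, __LINE__); }
inline void cusolverAssert(cusolverStatus_t code, const char *file, int line)
{
    // CUSOLVER_STATUS_SUCCESS is 0; anything else signals a failure
    if (code != CUSOLVER_STATUS_SUCCESS)
    {
        fprintf(stderr, "cusolverAssert: status %d at %s:%d\n", (int)code, file, line);
        exit((int)code);
    }
}
Wrapping each call, e.g. cusolverErrchk(cusolverDnSSgels(...)), surfaces failures at the call site instead of leaving them to show up in the output.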

Not able to get the cublasSgelsBatched function to work

I'm currently trying to get cublasSgelsBatched (https://docs.nvidia.com/cuda/cublas/index.html) to work. I started by first making a small test case to see exactly what parameters are needed and how they need to be input. However, after much trial and error I still can't get it to work: I get a status return of 13, which corresponds to CUBLAS_STATUS_EXECUTION_FAILED, which is a very vague error. Other cuBLAS test cases I tried seem to be working fine. I also tested the input matrix in MATLAB, which does have an LS solution.
#include "stdafx.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cmath>
#include <Windows.h>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t stat;
// create handle
stat = cublasCreate(&m_cuBLAS);
//params
const int C = 3;
const int M = 2;
long lda = C;
long ldb = M;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necessary I think, but just to test)
cudaDeviceSynchronize();
//run cublas
cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
&gAmat,
lda, //or 1
&gYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destroy handle
cublasDestroy(m_cuBLAS);
return 0;
}
I'm on Windows 10, running in MSVS using CUDA 9.0.
I'd really appreciate some help.
As pointed out in the comments, you are not creating a proper array of pointers on the device. The batched function works with an array of pointers that lives in device memory, for the data parameters, for example:
Aarray (device, input/output): array of pointers to arrays, with each array of dimension m x n and lda >= max(1,m). Matrices Aarray[i] should not overlap; otherwise, undefined behavior is expected.
Passing for example &gAmat seems to satisfy the type requirement, but that pointer does not point to device memory.
The following modifications to your code focused on proper handling of gAmat and gYmat seem to run without error for me:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t stat;
// create handle
stat = cublasCreate(&m_cuBLAS);
//params
const int C = 3;
const int M = 2;
long lda = C;
long ldb = M;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float **ggAmat, **ggYmat;
cudaMalloc(&ggAmat, sizeof(float*));
cudaMalloc(&ggYmat, sizeof(float*));
cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necessary I think, but just to test)
cudaDeviceSynchronize();
//run cublas
cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
ggAmat,
lda, //or 1
ggYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destroy handle
cublasDestroy(m_cuBLAS);
return 0;
}
$ nvcc -o t130 t130.cu -lcublas
t130.cu(15): warning: variable "stat" was set but never used
t130.cu(24): warning: variable "ldb" was declared but never referenced
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
info = 0
devInfoArray = 0
-0.0226168, 0.514827, -4.29722
========= ERROR SUMMARY: 0 errors
$
Your code only shows a single array. If you had a batch of arrays, you would pass an actual array of device-allocated pointers, for each of A and Y.
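A minimal sketch of what that could look like, for a hypothetical batch of batchCount systems (the names here are my own, not from the question):
const int batchCount = 4;      // hypothetical batch size
float *hA[batchCount];         // host-side array of device pointers
for (int i = 0; i < batchCount; i++)
    cudaMalloc(&hA[i], M * C * sizeof(float)); // one device matrix per batch entry
float **dA;                    // device-side copy of that pointer array
cudaMalloc(&dA, batchCount * sizeof(float*));
cudaMemcpy(dA, hA, batchCount * sizeof(float*), cudaMemcpyHostToDevice);
// dA (and a dY built the same way) is what you would pass for the
// Aarray/Carray parameters of cublasSgelsBatched, with batchSize = batchCount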
Based on comments below, here is a version of the code using non-random input:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t status;
// create handle
status = cublasCreate(&m_cuBLAS);
std::cout << "status = " << status << std::endl;
//params
const int C = 3;
const int M = 2;
long lda = C;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
#if 0
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
#endif
Amat[0] = 6;
Amat[1] = 7;
Amat[2] = 6;
Amat[3] = 5;
Amat[4] = 5;
Amat[5] = 5;
Ymat[0] = 9;
Ymat[1] = 3;
Ymat[2] = 10;
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float **ggAmat, **ggYmat;
cudaMalloc(&ggAmat, sizeof(float*));
cudaMalloc(&ggYmat, sizeof(float*));
cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necessary I think, but just to test)
cudaDeviceSynchronize();
//run cublas
status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
ggAmat,
lda, //or 1
ggYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destroy handle
cublasDestroy(m_cuBLAS);
return 0;
}
$ nvcc -o t130 t130.cu -lcublas
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
status = 0
info = 0
devInfoArray = 0
-6.5, 9.7, 0.707106
========= ERROR SUMMARY: 0 errors
$

Why doesn't CUDA result in a speedup in C++ code?

I'm using VS2019 and have an NVIDIA GeForce GPU. I tried the code from this link: https://towardsdatascience.com/writing-lightning-fast-code-with-cuda-c18677dcdd5f
The author of that post claims to get a speedup when using CUDA. However, for me, the serial version takes around 7 milliseconds while the CUDA version takes around 28 milliseconds. Why is CUDA slower for this code? The code I used is below:
__global__
void add(int n, float* x, float* y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
void addSerial(int n, float* x, float* y)
{
for (int i = 0; i < n; i++)
y[i] = x[i] + y[i];
}
int main()
{
int NSerial = 1 << 20;
float* xSerial = new float[NSerial];
float* ySerial = new float[NSerial];
for (int i = 0; i < NSerial; i++) {
xSerial[i] = 1.0f;
ySerial[i] = 2.0f;
}
auto t1Serial = std::chrono::high_resolution_clock::now();
addSerial(NSerial, xSerial, ySerial);
auto t2Serial = std::chrono::high_resolution_clock::now();
auto durationSerial = std::chrono::duration_cast<std::chrono::milliseconds>(t2Serial - t1Serial).count();
float maxErrorSerial = 0.0f;
for (int i = 0; i < NSerial; i++)
maxErrorSerial = fmax(maxErrorSerial, fabs(ySerial[i] - 3.0f));
std::cout << "Max error Serial: " << maxErrorSerial << std::endl;
std::cout << "durationSerial: "<<durationSerial << std::endl;
delete[] xSerial;
delete[] ySerial;
int N = 1 << 20;
float* x, * y;
cudaMallocManaged(&x, N * sizeof(float));
cudaMallocManaged(&y, N * sizeof(float));
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
int device = -1;
cudaGetDevice(&device);
cudaMemPrefetchAsync(x, N * sizeof(float), device, NULL);
cudaMemPrefetchAsync(y, N * sizeof(float), device, NULL);
int blockSize = 1024;
int numBlocks = (N + blockSize - 1) / blockSize;
auto t1 = std::chrono::high_resolution_clock::now();
add << <numBlocks, blockSize >> > (N, x, y);
cudaDeviceSynchronize();
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i] - 3.0f));
std::cout << "Max error: " << maxError << std::endl;
std::cout << "duration CUDA: "<<duration;
cudaFree(x);
cudaFree(y);
return 0;
}
There are several observations to make here:
The first call of a CUDA kernel can accumulate a lot of one-time latency associated with setup on the GPU, so the normal approach is to include a "warm-up" call.
The kernel design in your question is a "resident" design, so optimal execution should occur when you launch only as many blocks as required to fully occupy your GPU. There is an API (cudaOccupancyMaxPotentialBlockSize) you can use to get this information for your GPU.
Perform timing in microseconds, not milliseconds
Build your code in release mode.
Doing all of this to your CUDA code gets me this:
int N = 1 << 20;
int device = -1;
cudaGetDevice(&device);
float* x, * y;
cudaMallocManaged(&x, N * sizeof(float));
cudaMallocManaged(&y, N * sizeof(float));
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemPrefetchAsync(x, N * sizeof(float), device, NULL);
cudaMemPrefetchAsync(y, N * sizeof(float), device, NULL);
int blockSize, numBlocks;
cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, add);
for(int rep=0; rep<10; rep++) {
auto t1 = std::chrono::high_resolution_clock::now();
add << <numBlocks, blockSize >> > (N, x, y);
cudaDeviceSynchronize();
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
std::cout << rep << " duration CUDA: " << duration <<std::endl;
}
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i] - 12.0f));
std::cout << "Max error: " << maxError << std::endl;
cudaFree(x);
cudaFree(y);
And building it and running it:
$ nvcc -arch=sm_52 -std=c++11 -o not_so_fast not_so_fast.cu
$ ./not_so_fast
Max error Serial: 0
durationSerial: 2762
0 duration CUDA: 1074
1 duration CUDA: 150
2 duration CUDA: 151
3 duration CUDA: 158
4 duration CUDA: 152
5 duration CUDA: 152
6 duration CUDA: 147
7 duration CUDA: 124
8 duration CUDA: 112
9 duration CUDA: 113
Max error: 0
On my system, the first GPU run is close to three times as fast as the serial loop. The second and subsequent runs are almost 10 times faster again. Your results can (and probably will) vary.

Why repeating a kernel inside a for-loop makes CUDA code significantly slower?

Suppose we have four float arrays to be used on the host side, as well as its four counterparts to be used on the device side:
float *x, *x2, *y, *y2;
float *d_x, *d_x2, *d_y, *d_y2;
x = new float[ARRAYS_SIZE];
x2 = new float[ARRAYS_SIZE];
y = new float[ARRAYS_SIZE];
y2 = new float[ARRAYS_SIZE];
Now assume that we have a very simple kernel, taken from one of the examples at NVIDIA's blog:
__global__
void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
y[i] = a*x[i] + y[i];
}
}
Such kernel is to be called by the host side inside a for-loop, like the following:
for (int r = 0; r < LOOP_N; r++)
{
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
And then I compare the execution time of such loop against its pure-CPU version:
for (int r = 0; r < LOOP_N; r++)
{
for (int i = 0; i < ARRAYS_SIZE; i++) {
y[i] = 2.0f*x[i] + y[i];
y2[i] = 2.0f*x2[i] + y2[i];
}
}
Now, what I don't understand is the following. For instance, with ARRAYS_SIZE = 1000000 and LOOP_N = 1000, when I run both loops in the versions shown above, I get a ratio between the execution time of the CPU version and the CUDA version that is around 6. That is, the CUDA version is approximately 6 times faster.
However, if I comment out one of the calls to saxpy inside the CUDA version of the loop and one of the calculations inside the CPU version of the loop, the ratio between CPU and CUDA becomes around 210. That is, the CUDA version is approximately 210 times faster.
What is the technical reason for such performance loss when merely repeating the call to a kernel, if no memory is being transferred to / from the device? Are there any workarounds to this?
A (hopefully) fully reproducible code example goes below:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Typedef and constant variables
typedef std::chrono::high_resolution_clock::time_point timers;
const int LOOP_N = 1000;
const int ARRAYS_SIZE = 1000000;
//Pretty simple kernel, from the example in Nvidia's blog
__global__
void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
y[i] = a*x[i] + y[i];
}
}
// Main loop
int main(void)
{
timers t0, t1, t2;
timers tfinal0, tfinal1, tfinal2;
float *x, *x2, *y, *y2;
float *d_x, *d_x2, *d_y, *d_y2;
x = new float[ARRAYS_SIZE];
x2 = new float[ARRAYS_SIZE];
y = new float[ARRAYS_SIZE];
y2 = new float[ARRAYS_SIZE];
//Initializing arrays at the host side:
for (int i = 0; i < ARRAYS_SIZE; i++) {
x[i] = 1.0f;
x2[i] = 1.0f;
y[i] = 2.0f;
y2[i] = 2.0f;
}
// GPU memory allocation:
cudaMalloc(&d_x, ARRAYS_SIZE * sizeof(float));
cudaMalloc(&d_x2, ARRAYS_SIZE * sizeof(float));
cudaMalloc(&d_y, ARRAYS_SIZE * sizeof(float));
cudaMalloc(&d_y2, ARRAYS_SIZE * sizeof(float));
// Transfering arrays from host to device:
cudaMemcpy(d_x, x, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
//////////////////
// CPU run //
//////////////////
t0 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < LOOP_N; r++)
{
for (int i = 0; i < ARRAYS_SIZE; i++) {
//comment one of the following out to see the point of my question:
y[i] = 2.0f*x[i] + y[i];
y2[i] = 2.0f*x2[i] + y2[i];
}
}
tfinal0 = std::chrono::high_resolution_clock::now();
auto time0 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal0 - t0).count();
std::cout << "CPU: " << (float)time0 << " microseconds" << std::endl;
//////////////////
// GPU-CUDA run //
//////////////////
// Perform SAXPY kernel on ARRAYS_SIZE elements, for LOOP_N times
t1 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < LOOP_N; r++)
{
//comment one of the following out to see the point of my question:
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
tfinal1 = std::chrono::high_resolution_clock::now();
auto time1 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal1 - t1).count();
std::cout << "CUDA: " << (float)time1 << " microseconds" << std::endl;
//Display performance ratio CPU / GPU-CUDA
std::cout << "Ratio CPU/CUDA: " << (float)time0 / (float)time1 << std::endl;
//Freeing memory used by arrays:
cudaFree(d_x);
cudaFree(d_x2);
cudaFree(d_y);
cudaFree(d_y2);
// arrays allocated with new[] must be released with delete[], not free()
delete[] x;
delete[] x2;
delete[] y;
delete[] y2;
return 0;
}
You are not waiting for the kernels to be finished. As all kernel launches are asynchronous, you need to explicitly call cudaDeviceSynchronize() before stopping your timer.
The differences you are observing with variants of your current code likely stem from the fact that the queue for kernel launches is finite, so at some point your code will start waiting for part of your kernels anyway.
On Windows, kernel batching also plays into this: up to some number of launches (or a timeout), the driver will not even start to launch kernels.
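A sketch of the corrected timing for the loop from the question (the only change is the added synchronization before reading the clock):
t1 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < LOOP_N; r++)
{
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
cudaDeviceSynchronize(); // wait for all queued kernels to finish before stopping the timer
tfinal1 = std::chrono::high_resolution_clock::now();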
A simple change solves the problem, but I would still very much appreciate learning the technical reasons for all this.
The solution is to merely change, in my toy example above, the kernel to:
__global__
void saxpy(int n, float a, float *x, float *y, float *x2, float *y2)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
y[i] = a*x[i] + y[i];
y2[i] = a*x2[i] + y2[i];
}
}
And then call it only once, like the following:
for (int r = 0; r < LOOP_N; r++)
{
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y, d_x2, d_y2);
}
Now the performance difference against the CPU implementation is just the same - which should be expected.
If someone can jump in with an answer as to why this makes a difference, please post it and I will favor it over mine.
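As a final aside, CUDA events are a common alternative for timing GPU work, since they are recorded in the stream itself rather than on the host; a minimal sketch using the merged kernel above:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for (int r = 0; r < LOOP_N; r++)
{
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y, d_x2, d_y2);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);             // block the host until the stop event completes
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // elapsed GPU time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);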