I'm busy working on a LS method, I manually implemented a conjugate gradient solver, but after updating my CUDA version, I saw that there is a new function (cusolverDnSSgels) which I assume is faster than my manual implementation. My first task was to try and run it on a test case (see below), I'd expect the result to be: -6.5, 9.7 according to MATlab. Unfortunately I cannot find what I did wrong, I also cannot find an example because it is a relatively new function.
The output says that niter= -3, which would suggest too many iterations according to the documentation, however this would not make sense, as it is a very small matrix which should be easily solvable.
#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "device_launch_parameters.h"
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cusolverDnHandle_t cusolverH;
cusolverStatus_t stat;
// create handle
stat = cusolverDnCreate(&cusolverH);
//params
const int C = 3;
const int M = 2;
long lda = C;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat, *gXmat;
//allocate mem
Amat = (float*)malloc(M * C * sizeof(float));
Ymat = (float*)malloc(C * sizeof(float));
Xmat = (float*)malloc(M * sizeof(float));
srand(100);
#if 0
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
#endif
Amat[0] = 6;
Amat[1] = 7;
Amat[2] = 6;
Amat[3] = 5;
Amat[4] = 5;
Amat[5] = 5;
Ymat[0] = 9;
Ymat[1] = 3;
Ymat[2] = 10;
//allocate mem
cudaMalloc(&gAmat, M * C * sizeof(float));
cudaMalloc(&gYmat, C * sizeof(float));
cudaMalloc(&gXmat, M * 1 * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float *gdwork;
size_t work_bytes;
stat = cusolverDnSSgels_bufferSize(cusolverH,C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
std::cout << "Status = " << stat << std::endl;
int niter = 0;
int dinfo = 0;
cudaMalloc(&gdwork, work_bytes * sizeof(float));
stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, &dinfo);
std::cout << "Status = " << stat << std::endl;
std::cout << "niter = " << niter << std::endl;
std::cout << "dinfo = " << dinfo << std::endl;
cudaDeviceSynchronize();
cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
//free memory
cudaFree(gdwork);
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gXmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cusolverDnDestroy(cusolverH);
return 0;
}
The results I get are:
Status = 0
Status = 0
niter = -3
dinfo = 0
-4.31602e+08, -4.31602e+08
Could someone point out what I am doing wrong?
You have a problem with your dinfo parameter usage. Referring to the documentation, we see that:
Parameters of cusolverDngels() functions
parameter Memory In/out Meaning
dinfo device output Status of the IRS solver on the return. If 0 - solve was successful. If dinfo = -i then i-th argument is not valid.
the dinfo parameter is expected to live in device memory. But you have it in host memory:
int dinfo = 0;
If I move the storage to the proper location, your code outputs the values you indicate as expected:
$ cat t143.cu
#include <iostream>
#include <cublas_v2.h>
#include <cusolverDn.h>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cusolverDnHandle_t cusolverH;
cusolverStatus_t stat;
// create handle
stat = cusolverDnCreate(&cusolverH);
//params
const int C = 3;
const int M = 2;
long lda = C;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat, *gXmat;
//allocate mem
Amat = (float*)malloc(M * C * sizeof(float));
Ymat = (float*)malloc(C * sizeof(float));
Xmat = (float*)malloc(M * sizeof(float));
srand(100);
#if 0
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
#endif
Amat[0] = 6;
Amat[1] = 7;
Amat[2] = 6;
Amat[3] = 5;
Amat[4] = 5;
Amat[5] = 5;
Ymat[0] = 9;
Ymat[1] = 3;
Ymat[2] = 10;
//allocate mem
cudaMalloc(&gAmat, M * C * sizeof(float));
cudaMalloc(&gYmat, C * sizeof(float));
cudaMalloc(&gXmat, M * 1 * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float *gdwork;
size_t work_bytes;
stat = cusolverDnSSgels_bufferSize(cusolverH,C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
std::cout << "Status = " << stat << std::endl;
int niter = 0;
int *dinfo, hinfo;
cudaMalloc(&gdwork, work_bytes * sizeof(float));
cudaMalloc(&dinfo, sizeof(int));
stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, dinfo);
cudaMemcpy(&hinfo, dinfo, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "Status = " << stat << std::endl;
std::cout << "niter = " << niter << std::endl;
std::cout << "dinfo = " << hinfo << std::endl;
cudaDeviceSynchronize();
cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
//free memory
cudaFree(gdwork);
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gXmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cusolverDnDestroy(cusolverH);
return 0;
}
$ nvcc -o t143 t143.cu -lcublas -lcusolver
$ cuda-memcheck ./t143
========= CUDA-MEMCHECK
Status = 0
Status = 0
niter = -51
dinfo = 0
-6.5, 9.7
========= ERROR SUMMARY: 0 errors
$
Notes:
I am using CUDA 11.3 for the above. If you are using an earlier version, I strongly recommend you move forward to CUDA 11.3 or newer for usage of this function.
You can get a hint as to the problem by running your code with cuda-memcheck
It was fairly quick to spot the problem by reviewing your parameter usage with the table of parameter locations (host/device) given in the documentation. You had a problem here which was similar in that you could focus in on the problem by reviewing your parameter locations (host/device) against the table given in the documentation. This may be a good thing to check to save yourself time in the future.
Related
I'm currently trying to get the cublasSgelsbatched (https://docs.nvidia.com/cuda/cublas/index.html) version to work. I started by first making a small test case to see what parameters are needed exactly and how they need to be inputted. However after much trial and error I still can't get it to work, I get a status return of 13, which corresponds to CUBLAS_STATUS_EXECUTION_FAILED which is a very vague error, also I tried some other cublas testcases and they seem to be working fine. I also tested the input matrix in MATlab, which does have a LS solution.
#include "stdafx.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cmath>
#include <Windows.h>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t stat;
// create handle
stat = cublasCreate(&m_cuBLAS);
//params
const int C = 3;
const int M = 2;
long lda = C;
long ldb = M;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necesarry I think, but just to test)
cudaDeviceSynchronize();
//run cublas
cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
&gAmat,
lda, //or 1
&gYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cublasDestroy(m_cuBLAS);
return 0;
}
I'm on Windows 10 running in MVS using CUDA 9.0
I'd really appreciate some help
As pointed out in the comments, you are not creating a proper array of pointers on the device. The batched function works with an array of pointers that lives in device memory, for the data parameters, for example:
Aarray device input/output array of pointers to array, with each array of dim. m x n with lda>=max(1,m). Matrices Aarray[i] should not overlap; otherwise, undefined behavior is expected.
Passing for example &gAmat seems to satisfy the type requirement, but that pointer does not point to device memory.
The following modifications to your code focused on proper handling of gAmat and gYmat seem to run without error for me:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t stat;
// create handle
stat = cublasCreate(&m_cuBLAS);
//params
const int C = 3;
const int M = 2;
long lda = C;
long ldb = M;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float **ggAmat, **ggYmat;
cudaMalloc(&ggAmat, sizeof(float*));
cudaMalloc(&ggYmat, sizeof(float*));
cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necesarry I think, but just to test)
cudaDeviceSynchronize();
//run cublas
cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
ggAmat,
lda, //or 1
ggYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cublasDestroy(m_cuBLAS);
return 0;
}
$ nvcc -o t130 t130.cu -lcublas
t130.cu(15): warning: variable "stat" was set but never used
t130.cu(24): warning: variable "ldb" was declared but never referenced
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
info = 0
devInfoArray = 0
-0.0226168, 0.514827, -4.29722
========= ERROR SUMMARY: 0 errors
$
Your code only shows a single array. If you had a batch of arrays, you would pass an actual array of device-allocated pointers, for each of A and Y.
Based on comments below, here is a version of the code using non-random input:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t status;
// create handle
status = cublasCreate(&m_cuBLAS);
std::cout << "status = " << status << std::endl;
//params
const int C = 3;
const int M = 2;
long lda = C;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
#if 0
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
#endif
Amat[0] = 6;
Amat[1] = 7;
Amat[2] = 6;
Amat[3] = 5;
Amat[4] = 5;
Amat[5] = 5;
Ymat[0] = 9;
Ymat[1] = 3;
Ymat[2] = 10;
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float **ggAmat, **ggYmat;
cudaMalloc(&ggAmat, sizeof(float*));
cudaMalloc(&ggYmat, sizeof(float*));
cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necesarry I think, but just to test)
cudaDeviceSynchronize();
//run cublas
status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
ggAmat,
lda, //or 1
ggYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cublasDestroy(m_cuBLAS);
return 0;
}
$ nvcc -o t130 t130.cu -lcublas
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
status = 0
info = 0
devInfoArray = 0
-6.5, 9.7, 0.707106
========= ERROR SUMMARY: 0 errors
$
Making Mandelbrot with MPI
So I've made a Mandelbrot generator and everything worked fine. Now I'm throwing in a speedup from MPI. Process 0 generates a file name mbrot.ppm and adds the appropriate metadata, then divides up the workload into chunks.
Each process receives the chunk's starting and ending positions and gets to work calculating its portion of the Mandelbrot set. To write to the mbrot.ppm file, each process saves its data in an array so it doesn't write to the file before the previous process finishes.
My Problem
Its a runtime error that says:
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node Lenovo exited on signal 11 (Segmentation fault).
I believe it comes from the line int data[3][xrange][yrange]; (line 120) since the print statement after this line never executes. Would there be an obvious reason I'm missing why this multi-dimensional array is causing me problems?
Full Code
#include <iostream>
#include <mpi.h>
#include <unistd.h>
#include <stdlib.h>
#include <math.h>
#include <fstream>
#define MCW MPI_COMM_WORLD
using namespace std;
struct Complex {
double r;
double i;
};
Complex operator + (Complex s, Complex t) {
Complex v;
v.r = s.r + t.r;
v.i = s.i + t.i;
return v;
};
Complex operator * (Complex s, Complex t) {
Complex v;
v.r = s.r * t.r - s.i * t.i;
v.i = s.r * t.i + s.i * t.r;
return v;
};
int rcolor(int iters) {
if (iters == 255) return 0;
return 32 * (iters % 8);
};
int gcolor(int iters) {
if (iters == 255) return 0;
return 32 * (iters % 8);
};
int bcolor(int iters) {
if (iters == 255) return 0;
return 32 * (iters % 8);
};
int mbrot(Complex c, int maxIters) {
int i = 0;
Complex z;
z = c;
while (i < maxIters && z.r * z.r + z.i * z.i < 4) {
z = z * z + c;
i++;
}
return i;
};
int main(int argc, char * argv[]) {
int rank, size;
MPI_Init( & argc, & argv);
MPI_Comm_rank(MCW, & rank);
MPI_Comm_size(MCW, & size);
if (size < 2) {
printf("Not an MPI process if only 1 process runs.\n");
exit(1);
}
if (size % 2 != 0) {
printf("Please use a even number\n");
exit(1);
}
Complex c1, c2, c;
char path[] = "brot.ppm";
int DIM;
int chunk[4];
c1.r = -1;
c1.i = -1;
c2.r = 1;
c2.i = 1;
if (rank == 0) { //start the file
ofstream fout;
fout.open(path);
DIM = 2000; // pixel dimensions
fout << "P3" << endl; // The file type .ppm
fout << DIM << " " << DIM << endl; // dimensions of the image
fout << "255" << endl; // color depth
fout.close();
// making dimesions marks
for (int i = 0; i < size; i++) {
chunk[0] = 0; // startX
chunk[1] = DIM; // endX
chunk[2] = (DIM / size) * i; // startY
chunk[3] = (DIM / size) * (i + 1); // endY
MPI_Send(chunk, 4, MPI_INT, i, 0, MCW);
};
};
MPI_Recv(chunk, 4, MPI_INT, 0, 0, MCW, MPI_STATUS_IGNORE);
printf("Process %d recieved chunk\n\t StartX: %d, EndX: %d\n\t StartY: %d, EndY: %d\n", rank, chunk[0], chunk[1], chunk[2], chunk[3]);
// do stuff save in array
// data[3 elements][Xs][Ys]
int xrange = chunk[1] - chunk[0];
int yrange = chunk[3] - chunk[2];
printf("Process %d, x: %d, y: %d\n", rank, xrange, yrange);
int data[3][xrange][yrange];
printf("done\n");
// generate data for mandlebrot
for (int j = chunk[2]; j < chunk[3]; ++j) {
for (int i = chunk[0]; i < chunk[1]; ++i) {
// calculate one pixel of the DIM x DIM image
c.r = (i * (c1.r - c2.r) / DIM) + c2.r;
c.i = (j * (c1.i - c2.i) / DIM) + c2.i;
int iters = mbrot(c, 255);
data[0][i][j] = rcolor(iters);
data[1][i][j] = gcolor(iters);
data[2][i][j] = bcolor(iters);
}
}
printf("here2\n");
// taking turns to write their data to file
for (int k = 0; k < size; k++) {
if (rank == k) {
ofstream fout;
fout.open(path, ios::app);
fout << rank << " was here" << endl;
for (int j = chunk[2]; j < chunk[3]; ++j) {
for (int i = chunk[0]; i < chunk[1]; ++i) {
fout << data[0][i][j] << " " << data[1][i][j] << " " << data[2][i][j] << " ";
}
fout << endl;
}
printf("Process %d done and waiting\n", rank);
} else {
MPI_Barrier(MCW);
}
}
MPI_Finalize();
};
How to Run
$ mpic++ -o mbrot.out mbrot.cpp
$ mpirun -np 4 mbrot.out
I'm using VS2019 and have an NVIDIA GeForce GPU. I tried the code from this link: https://towardsdatascience.com/writing-lightning-fast-code-with-cuda-c18677dcdd5f
The author of that post claims to get a speedup when using CUDA. However, for me, the serial version takes around 7 milliseconds while the CUDA version takes around 28 milliseconds. Why is CUDA slower for this code? The code I used is below:
__global__
void add(int n, float* x, float* y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
void addSerial(int n, float* x, float* y)
{
for (int i = 0; i < n; i++)
y[i] = x[i] + y[i];
}
int main()
{
int NSerial = 1 << 20;
float* xSerial = new float[NSerial];
float* ySerial = new float[NSerial];
for (int i = 0; i < NSerial; i++) {
xSerial[i] = 1.0f;
ySerial[i] = 2.0f;
}
auto t1Serial = std::chrono::high_resolution_clock::now();
addSerial(NSerial, xSerial, ySerial);
auto t2Serial = std::chrono::high_resolution_clock::now();
auto durationSerial = std::chrono::duration_cast<std::chrono::milliseconds>(t2Serial - t1Serial).count();
float maxErrorSerial = 0.0f;
for (int i = 0; i < NSerial; i++)
maxErrorSerial = fmax(maxErrorSerial, fabs(ySerial[i] - 3.0f));
std::cout << "Max error Serial: " << maxErrorSerial << std::endl;
std::cout << "durationSerial: "<<durationSerial << std::endl;
delete[] xSerial;
delete[] ySerial;
int N = 1 << 20;
float* x, * y;
cudaMallocManaged(&x, N * sizeof(float));
cudaMallocManaged(&y, N * sizeof(float));
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
int device = -1;
cudaGetDevice(&device);
cudaMemPrefetchAsync(x, N * sizeof(float), device, NULL);
cudaMemPrefetchAsync(y, N * sizeof(float), device, NULL);
int blockSize = 1024;
int numBlocks = (N + blockSize - 1) / blockSize;
auto t1 = std::chrono::high_resolution_clock::now();
add << <numBlocks, blockSize >> > (N, x, y);
cudaDeviceSynchronize();
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i] - 3.0f));
std::cout << "Max error: " << maxError << std::endl;
std::cout << "duration CUDA: "<<duration;
cudaFree(x);
cudaFree(y);
return 0;
}
There are several observations to make here:
The first call of a CUDA kernel can accumulate a lot of one time latency associated with setup on the GPU, so the normal approach is to include a "warm-up" call
The kernel design in your question is a "resident" design, so optimal execution should occur when you launch only as many blocks as required to fully occupy your GPU. There is an API you can use to get this information for your GPU.
Perform timing in microseconds, not milliseconds
Build your code in release mode.
Doing all of this to your CUDA code gets me this:
int N = 1 << 20;
int device = -1;
cudaGetDevice(&device);
float* x, * y;
cudaMallocManaged(&x, N * sizeof(float));
cudaMallocManaged(&y, N * sizeof(float));
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemPrefetchAsync(x, N * sizeof(float), device, NULL);
cudaMemPrefetchAsync(y, N * sizeof(float), device, NULL);
int blockSize, numBlocks;
cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, add);
for(int rep=0; rep<10; rep++) {
auto t1 = std::chrono::high_resolution_clock::now();
add << <numBlocks, blockSize >> > (N, x, y);
cudaDeviceSynchronize();
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
std::cout << rep << " duration CUDA: " << duration <<std::endl;
}
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i] - 12.0f));
std::cout << "Max error: " << maxError << std::endl;
cudaFree(x);
cudaFree(y);
And building it and running it:
$ nvcc -arch=sm_52 -std=c++11 -o not_so_fast not_so_fast.cu
$ ./not_so_fast
Max error Serial: 0
durationSerial: 2762
0 duration CUDA: 1074
1 duration CUDA: 150
2 duration CUDA: 151
3 duration CUDA: 158
4 duration CUDA: 152
5 duration CUDA: 152
6 duration CUDA: 147
7 duration CUDA: 124
8 duration CUDA: 112
9 duration CUDA: 113
Max error: 0
On my system, the first GPU run close to three times as fast as the serial loop. The second and subsequent runs are almost 10 times faster again. Your results can (and probably will) vary.
I am fairly new to CUDA, and I am trying to offload to the GPU some cumbersome computations I am doing for a performance-critical project. On my computer I have two NVS 510 Graphic cards, but I am currently experimenting with one only.
I have some big column-major matrix (1000-5000 rows x 1-5 M columns) to be filled. I was so far able to write the code to fill the matrix like it were an array, and it works well for matrices of relatively small size.
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, [other params],
int n_rows, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_rows;
int row = index % n_rows;
if (row > n_sim || column > num_cols) return;
d_matrix[index] = …something(row, column,[other params]);
}
The kernel is called:
fl_type *res;
cudaMalloc((void**)&res, n_columns*n_rows*fl_size);
int block_size = 1024;
int num_blocks = (n_rows* n_columns + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (res,[other params], n_rows,n_columns);
and everything works just fine.
If I change the kernel to work with 2D threads:
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, [other params],
int n_rows, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column* n_rows + row;
if (row > n_rows || column > num_cols) return;
d_matrix[index] = …something(row, column,[other params]);
}
and I invoke it
int block_size2 = 32; //each block will have block_size2*block_size2 threads
dim3 num_blocks2(block_size2, block_size2);
int x_grid = (n_columns + block_size2 - 1) / block_size2;
int y_grid = (n_rows + block_size2 - 1) / block_size2;
dim3 grid_size2(x_grid, y_grid);
interp_kernel2D <<< grid_size2, num_blocks2 >>> (res,[other params], n_rows,n_columns);
the results are all zero and CUDA returns unknown error. What am I missing? the actual code, which compiles without error with VS2015 and CUDA 8.0, can be found here: https://pastebin.com/XBCVC7VV
Here is the code from the pastebin link:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
typedef float fl_type;
typedef int pos_type;
typedef std::chrono::milliseconds ms;
//declaration of the cuda function
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim);
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, int& row, int& column, int& interp_dim, int& n_sim) {
int w_p = column*interp_dim;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_point] * n_sim + row];
}
return res;
}
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_sim;
int row = index % n_sim;
int w_p = column*interp_dim;
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column*n_sim + row;
int w_p = column*interp_dim;
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
}
d_matrix[index] = res;
}
void verify(fl_type *host, fl_type *device, int size) {
int count = 0;
int count_zero = 0;
for (int i = 0; i < size; i++) {
if (host[i] != device[i]) {
count++;
//std::cout <<"pos: " <<i<< " CPU:" <<h[i] << ", GPU: " << d[i] <<std::endl;
assert(host[i] == device[i]);
if (device[i] == 0.0)
count_zero++;
}
}
if (count) {
std::cout << "Non matching: " << count << "out of " << size << "(" << (float(count) / size * 100) << "%)" << std::endl;
std::cout << "Zeros returned from the device: " << count_zero <<"(" << (float(count_zero) / size * 100) << "%)" << std::endl;
}
else
std::cout << "Perfect match!" << std::endl;
}
int main() {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
int dim = 5; // range: 2-5
int number_nodes = 5500; // range: 10.000-500.000
int max_actions = 12; // range: 6-200
int n_sim = 1000; // range: 1.000-10.000
int interp_dim = std::pow(2, dim);
int grid_values_size = n_sim*number_nodes;
std::default_random_engine generator;
std::normal_distribution<fl_type> normal_dist(0.0, 1);
std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
double bit_allocated = 0;
fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
grid_values = (fl_type *)malloc(grid_values_size * fl_size);
bit_allocated += grid_values_size * fl_size;
for (int i = 0; i < grid_values_size; i++)
grid_values[i] = normal_dist(generator);
pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix regarding that done
pos_type * map_node2values_how_many; //vector that stores how many action we have per node
map_node2values_start = (pos_type *)malloc(number_nodes * pos_size);
map_node2values_how_many = (pos_type *)malloc(number_nodes * pos_size);
bit_allocated += 2 * (number_nodes * pos_size);
for (int i = 0; i < number_nodes; i++) {
//each node as simply max_actions
map_node2values_start[i] = max_actions*i;
map_node2values_how_many[i] = max_actions;
}
//total number of actions, which is amount of column of the results
int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
//vector that keep tracks of the columnt to grab, and their weight in the interpolation
fl_type* weights;
pos_type * node_map;
weights = (fl_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * fl_size;
node_map = (pos_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * pos_size;
//filling with random numbers
for (int i = 0; i < total_action_number*interp_dim; i++) {
node_map[i] = uniform_dist(generator); // picking random column
weights[i] = 1.0 / interp_dim; // uniform weights
}
std::cout << "done filling!" << std::endl;
std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
int result_size = n_sim*total_action_number;
fl_type *interp_value_cpu;
bit_allocated += result_size* fl_size;
interp_value_cpu = (fl_type *)malloc(result_size* fl_size);
auto start = std::chrono::steady_clock::now();
for (int row = 0; row < n_sim; row++) {
for (int column = 0; column < total_action_number; column++) {
auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
interp_value_cpu[column*n_sim + row] = zz;
}
}
auto elapsed_cpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
int * pp;
cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
fl_type *interp_value_gpu;
interp_value_gpu = (fl_type *)malloc(result_size* fl_size);
start = std::chrono::steady_clock::now();
cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
auto elapsed_gpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
int n_proc = 4;
std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
verify(interp_value_cpu, interp_value_gpu, result_size);
free(interp_value_cpu);
free(interp_value_gpu);
free(grid_values);
free(node_map);
free(weights);
}
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
auto start = std::chrono::steady_clock::now();
//device versions of the inputs
fl_type * grid_values_device;
fl_type* weights_device;
pos_type * node_map_device;
fl_type *interp_value_device;
int lenght_node_map = interp_dim*total_action_number;
std::cout << "size grid_values: " << grid_values_size <<std::endl;
std::cout << "size weights: " << lenght_node_map << std::endl;
std::cout << "size interp_value: " << result_size << std::endl;
//allocating and moving to the GPU the inputs
auto error_code=cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the grid_values" << std::endl;
}
error_code=cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the grid_values" << std::endl;
}
error_code=cudaMalloc((void**)&weights_device, lenght_node_map*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the weights" << std::endl;
}
error_code=cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the weights" << std::endl;
}
error_code=cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of node_map" << std::endl;
}
error_code=cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of node_map" << std::endl;
}
error_code=cudaMalloc((void**)&interp_value_device, result_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of interp_value_device " << std::endl;
}
auto elapsed_moving = std::chrono::steady_clock::now() - start;
float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
cudaDeviceSynchronize();
//1d
int block_size = 1024;
int num_blocks = (result_size + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
//2d
//int block_size2 = 32; //each block will have block_size2*block_size2 threads
//dim3 num_blocks2(block_size2, block_size2);
//int x_grid = (total_action_number + block_size2 - 1) / block_size2;
//int y_grid = (n_sim + block_size2 - 1) / block_size2;
//dim3 grid_size2(x_grid, y_grid);
//std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
//interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
std::cout << "Cuda kernel failed! " << cudaGetErrorString(err) <<std::endl;
}
start = std::chrono::steady_clock::now();
cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost);
auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
float ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
cudaFree(interp_value_device);
cudaFree(weights_device);
cudaFree(node_map_device);
cudaFree(grid_values_device);
}
Moreover, I would extremely grateful for any direction on how to improve the performance of the code.
Any time you are having trouble with a CUDA code, I recommend doing proper CUDA error checking (which you mostly seem to be doing), and also run your code with cuda-memcheck. This last utility is similar to "enabling the memory checker" in Nsight VSE, but not quite the same. However the Nsight VSE memory checker may have given you the same indication.
In C (or C++) indexing of arrays generally starts at 0. Therefore, to test for an out-of-bounds index, I must check to see if the generated index is equal to or greater than the size of the array. But in your case you are only testing for greater than:
if (row > n_sim || column > num_cols) return;
You make a similar error in both your 1D kernel and in your 2D kernel, and although you believe your 1D kernel is working correctly, it is actually making out-of-bounds accesses. You can verify this if you run with the aforementioned cuda-memcheck utility (or probably also with the memory checker that can be enabled in Nsight VSE).
When I modify your code in the pastebin link to use proper range/bounds checking, cuda-memcheck reports no errors, and your program reports the correct results. I've tested both cases, but the code below is modified from your pastebin link to uncomment the 2D case, and use that instead of the 1D case:
$ cat t375.cu | more
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
typedef float fl_type;
typedef int pos_type;
typedef std::chrono::milliseconds ms;
//declaration of the cuda function
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl
_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map,
int total_action_number, int interp_dim, int n_sim);
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, in
t& row, int& column, int& interp_dim, int& n_sim) {
int w_p = column*interp_dim;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_poi
nt] * n_sim + row];
}
return res;
}
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* no
de_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_sim;
int row = index % n_sim;
int w_p = column*interp_dim;
if (row >= n_sim || column >= num_cols) return; // modified
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + int
er_point] * n_sim];
}
d_matrix[index] = res;
}
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type*
node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column*n_sim + row;
int w_p = column*interp_dim;
if (row >= n_sim || column >= num_cols) return; // modified
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + int
er_point] * n_sim];
}
d_matrix[index] = res;
}
void verify(fl_type *host, fl_type *device, int size) {
int count = 0;
int count_zero = 0;
for (int i = 0; i < size; i++) {
if (host[i] != device[i]) {
count++;
//std::cout <<"pos: " <<i<< " CPU:" <<h[i] << ", GPU: " << d[
i] <<std::endl;
assert(host[i] == device[i]);
if (device[i] == 0.0)
count_zero++;
}
}
if (count) {
std::cout << "Non matching: " << count << "out of " << size << "(" << (f
loat(count) / size * 100) << "%)" << std::endl;
std::cout << "Zeros returned from the device: " << count_zero <<"(" << (
float(count_zero) / size * 100) << "%)" << std::endl;
}
else
std::cout << "Perfect match!" << std::endl;
}
int main() {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
int dim = 5; // range: 2-5
int number_nodes = 5500; // range: 10.000-500.000
int max_actions = 12; // range: 6-200
int n_sim = 1000; // range: 1.000-10.000
int interp_dim = std::pow(2, dim);
int grid_values_size = n_sim*number_nodes;
std::default_random_engine generator;
std::normal_distribution<fl_type> normal_dist(0.0, 1);
std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
double bit_allocated = 0;
fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
grid_values = (fl_type *)malloc(grid_values_size * fl_size);
bit_allocated += grid_values_size * fl_size;
for (int i = 0; i < grid_values_size; i++)
grid_values[i] = normal_dist(generator);
pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix regarding that done
pos_type * map_node2values_how_many; //vector that stores how many action we have per node
map_node2values_start = (pos_type *)malloc(number_nodes * pos_size);
map_node2values_how_many = (pos_type *)malloc(number_nodes * pos_size);
bit_allocated += 2 * (number_nodes * pos_size);
for (int i = 0; i < number_nodes; i++) {
//each node as simply max_actions
map_node2values_start[i] = max_actions*i;
map_node2values_how_many[i] = max_actions;
}
//total number of actions, which is amount of column of the results
int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
//vector that keep tracks of the columnt to grab, and their weight in the interpolation
fl_type* weights;
pos_type * node_map;
weights = (fl_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * fl_size;
node_map = (pos_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * pos_size;
//filling with random numbers
for (int i = 0; i < total_action_number*interp_dim; i++) {
node_map[i] = uniform_dist(generator); // picking random column
weights[i] = 1.0 / interp_dim; // uniform weights
}
std::cout << "done filling!" << std::endl;
std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
int result_size = n_sim*total_action_number;
fl_type *interp_value_cpu;
bit_allocated += result_size* fl_size;
interp_value_cpu = (fl_type *)malloc(result_size* fl_size);
auto start = std::chrono::steady_clock::now();
for (int row = 0; row < n_sim; row++) {
for (int column = 0; column < total_action_number; column++) {
auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
interp_value_cpu[column*n_sim + row] = zz;
}
}
auto elapsed_cpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
int * pp;
cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
fl_type *interp_value_gpu;
interp_value_gpu = (fl_type *)malloc(result_size* fl_size);
start = std::chrono::steady_clock::now();
cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
auto elapsed_gpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
int n_proc = 4;
std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
verify(interp_value_cpu, interp_value_gpu, result_size);
free(interp_value_cpu);
free(interp_value_gpu);
free(grid_values);
free(node_map);
free(weights);
}
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
auto start = std::chrono::steady_clock::now();
//device versions of the inputs
fl_type * grid_values_device;
fl_type* weights_device;
pos_type * node_map_device;
fl_type *interp_value_device;
int lenght_node_map = interp_dim*total_action_number;
std::cout << "size grid_values: " << grid_values_size <<std::endl;
std::cout << "size weights: " << lenght_node_map << std::endl;
std::cout << "size interp_value: " << result_size << std::endl;
//allocating and moving to the GPU the inputs
auto error_code=cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the grid_values" << std::endl;
}
error_code=cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the grid_values" << std::endl;
}
error_code=cudaMalloc((void**)&weights_device, lenght_node_map*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the weights" << std::endl;
}
error_code=cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the weights" << std::endl;
}
error_code=cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of node_map" << std::endl;
}
error_code=cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of node_map" << std::endl;
}
error_code=cudaMalloc((void**)&interp_value_device, result_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of interp_value_device " << std::endl;
}
auto elapsed_moving = std::chrono::steady_clock::now() - start;
float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
cudaDeviceSynchronize();
//1d
#if 0
int block_size = 1024;
int num_blocks = (result_size + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
#endif
//2d
int block_size2 = 32; //each block will have block_size2*block_size2 threads
dim3 num_blocks2(block_size2, block_size2);
int x_grid = (total_action_number + block_size2 - 1) / block_size2;
int y_grid = (n_sim + block_size2 - 1) / block_size2;
dim3 grid_size2(x_grid, y_grid);
std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
std::cout << "Cuda kernel failed! " << cudaGetErrorString(err) <<std::endl;
}
start = std::chrono::steady_clock::now();
cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost);
auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
float ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
cudaFree(interp_value_device);
cudaFree(weights_device);
cudaFree(node_map_device);
cudaFree(grid_values_device);
}
$ nvcc -arch=sm_52 -o t375 t375.cu -std=c++11
$ cuda-memcheck ./t375
========= CUDA-MEMCHECK
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.081s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:31 ms
Time spent moving the results back to the host: 335 ms
Crunching values on the GPU: 7.089s
Performance: -5.73452 % less time than parallel CPU!
Perfect match!
========= ERROR SUMMARY: 0 errors
$
Note that cuda-memcheck slows down the execution of your program on the GPU to do rigorous memory bounds checking. Therefore the performance may not match the ordinary case. This is what an "ordinary" run looks like:
$ ./t375
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.273s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:32 ms
Time spent moving the results back to the host: 332 ms
Crunching values on the GPU: 1.161s
Performance: -84.6596 % less time than parallel CPU!
Perfect match!
$
You are accessing memory beyond the allocated chunk. To check if row and column indices are within the range:
if (row >= n_rows || column >= num_cols) return; // Do this
if (row > n_rows || column > num_cols) return; // Instead of this
In flat version this int row = index % n_rows; makes row stay below the n_rows. You only access one column beyond the allocated memory, which for small matrix could still be withing the memory alignment. Python demo.
The second version does access an extra column plus and extra element, and one extra element for each row (the first element of the following row), as this:
int row = blockIdx.y * blockDim.y + threadIdx.y;
no longer keeps row index within the valid range. Python demo.
Looking at your pastebin, this is probably the place where it breaks:
44. fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
^^^
45. for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
46. res += weights[w_p + inter_point] * \
grid_values[row + node_map[w_p + inter_point] * n_sim];
^^^
47. }
I'm trying to implement a matrix-vector Multiplication on GPU (using CUDA).
In my C++ code (CPU), I load the matrix as a dense matrix, and then I perform the matrix-vector multiplication using CUDA. I'm also using shared memory to improve the performance.
How can I load the matrix in an efficient way, knowing that my matrix is a sparse matrix?
Below is my C++ function to load the matrix:
int readMatrix( char* filename, float* &matrix, unsigned int *dim = NULL, int majority = ROW_MAJOR )
{
unsigned int w, h, x, y, num_entries;
float val;
std::ifstream file( filename );
if ( file )
{
file >> h >> w >> num_entries;
cout << w << " " << h << " " << num_entries << "\n";
assert( w == h || w == 1 || h == 1 );
if( dim != NULL ) *dim = std::max( w, h );
matrix = new float[ w * h ];
unsigned int i;
for( i = 0; i < num_entries; i++ ){
if( file.eof() ) break;
file >> y >> x >> val;
if( majority == ROW_MAJOR ){
matrix[ w * y + x ] = val;
} else if( majority == COLUMN_MAJOR ){
matrix[ h * x + y ] = val;
}
}
file.close();
if( i == num_entries )
std::cout << "\nFile read successfully\n";
else
std::cout << "\nFile read successfully but seems defective:\n num entries read = " << i << ", entries epected = " << num_entries << "\n";
// print first few elements
if( w == h ){
for( unsigned int i = 0; i < w; i++ ){
printf("\n");
for( unsigned int j = 0; j < h; j++ ){
printf("%.2f ", matrix[ j + w * i ] );
}
}
}
else{
printf("\n");
for( unsigned int j = 0; j < h; j++ ){
printf("%.2f ", matrix[ j ] );
}
}
} else {
std::cout << "Unable to open file\n";
return false;
}
return true;
}
Below is my CUDA Kernel function that handles the matrix-vector multiplication:
__global__ void
_cl_matrix_vector_( float *A, float *b, float *x, int dim )
{
extern __shared__ float vec[];
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
float temp = 0.0;
int vOffs = 0;
//load vector into shared memory
for (int i = 0; i < (dim/blockDim.x) + 1 ; ++i, vOffs+= blockDim.x) {
vec[vOffs + threadIdx.x] = b[vOffs + threadIdx.x];
}
//make sure all threads are synchronized
__syncthreads();
if (idx < dim) {
temp = 0.0;
//dot product (multiplication)
for (int i = 0; i < dim; i++){
temp += A[idx * dim + i] * vec[i];
}
x[idx] = temp;
}
}
What are the necessary changes that I have to make on my CUDA code to take into account that my matrix is a sparse matrix?
I found out from a forum that we can also use padding to be able to optimize the performance, but this requires me to change the way I read the matrix / sort the matrix. Any ideas how to implement this padding in the way I read the matrix and perform the calculation?
This is a very old post and I want to highlight that cuSPARSE (since some time now) makes routines for the multiplication between sparse matrices or between a sparse matrix and a dense vector available.
For the csr format, the relevant routine for the multiplication between a sparse matrix and a dense vector is cusparse<t>csrmv. Below, a fully worked example showing its use.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include "Utilities.cuh"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int N = 4; // --- Number of rows and columns
// --- Host side dense matrices
double *h_A_dense = (double*)malloc(N * N * sizeof(double));
double *h_x_dense = (double*)malloc(N * sizeof(double));
double *h_y_dense = (double*)malloc(N * sizeof(double));
// --- Column-major ordering
h_A_dense[0] = 0.4612; h_A_dense[4] = -0.0006; h_A_dense[8] = 0.3566; h_A_dense[12] = 0.0;
h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640; h_A_dense[9] = 0.0723; h_A_dense[13] = 0.0;
h_A_dense[2] = 0.3566; h_A_dense[6] = 0.0723; h_A_dense[10] = 0.7543; h_A_dense[14] = 0.0;
h_A_dense[3] = 0.; h_A_dense[7] = 0.0; h_A_dense[11] = 0.0; h_A_dense[15] = 0.1;
// --- Initializing the data and result vectors
for (int k = 0; k < N; k++) {
h_x_dense[k] = 1.;
h_y_dense[k] = 0.;
}
// --- Create device arrays and copy host arrays to them
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(double)));
double *d_x_dense; gpuErrchk(cudaMalloc(&d_x_dense, N * sizeof(double)));
double *d_y_dense; gpuErrchk(cudaMalloc(&d_y_dense, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_x_dense, h_x_dense, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y_dense, h_y_dense, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
int nnzA = 0; // --- Number of nonzero elements in dense matrix A
const int lda = N; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row of matrix A
int *d_nnzPerVectorA; gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA));
// --- Host side number of nonzero elements per row of matrix A
int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA));
gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA);
for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]);
printf("\n");
// --- Device side sparse matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side sparse matrices
double *h_A = (double *)malloc(nnzA * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix A in CSR format\n\n");
for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]); printf("\n");
printf("\n");
for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
printf("\n");
for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
printf("\n");
for (int i = 0; i < N; ++i) printf("h_x[%i] = %f \n", i, h_x_dense[i]); printf("\n");
const double alpha = 1.;
const double beta = 0.;
cusparseSafeCall(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nnzA, &alpha, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_x_dense,
&beta, d_y_dense));
gpuErrchk(cudaMemcpy(h_y_dense, d_y_dense, N * sizeof(double), cudaMemcpyDeviceToHost));
printf("\nResult vector\n\n");
for (int i = 0; i < N; ++i) printf("h_y[%i] = %f ", i, h_y_dense[i]); printf("\n");
}
You might want to have a look at the very good CUSP library. They implement sparse matrices in a variety of formats (coo, csr, ellpack, diagonal and a hybrid between ellpack and coo). Each with their own advantages as described in the documentation. Most of them are "standard" sparse matrix formats about which you can find more information online. Not a complete answer to your question perhaps, but it should provide a starting point.