CUDA tiled 3D convolution implementations with shared memory - C++

Based on my study, there are two different strategies for implementing a tiled version of convolution in CUDA. I want to know more about this, and would like to see how they compare with each other, what the advantages and disadvantages of each strategy are, and how to choose between them. Below are the implementations of the two strategies.
Strategy 1: the block size matches the output tile size, so loading the (larger) input tile takes multiple steps.
#define MASK_WIDTH 3
#define MASK_RADIUS 1
#define TILE_WIDTH 8
#define SHAREDMEM_DIM (TILE_WIDTH + (MASK_RADIUS * 2))

__constant__ float deviceMask[MASK_WIDTH * MASK_WIDTH * MASK_WIDTH];

__global__ void conv3d(float *inputArray,
                       float *outputArray,
                       const int z_size,
                       const int y_size,
                       const int x_size) {
    // Input tile: the output tile plus a halo of MASK_RADIUS on every side.
    __shared__ float subTile[SHAREDMEM_DIM][SHAREDMEM_DIM][SHAREDMEM_DIM];

    int bx = blockIdx.x, tx = threadIdx.x;
    int by = blockIdx.y, ty = threadIdx.y;
    int bz = blockIdx.z, tz = threadIdx.z;

    // First loading step: linearize the thread index over the block
    // (TILE_WIDTH^3 = 512 threads) and map it onto the 10^3 = 1000-element
    // shared-memory tile.
    int destination = (tz * TILE_WIDTH * TILE_WIDTH) + (ty * TILE_WIDTH) + tx;
    int destTmp = destination;
    int dX = destTmp % SHAREDMEM_DIM;
    destTmp = destTmp / SHAREDMEM_DIM;
    int dY = destTmp % SHAREDMEM_DIM;
    destTmp = destTmp / SHAREDMEM_DIM;
    int dZ = destTmp;

    int inputZ = dZ + (bz * TILE_WIDTH) - MASK_RADIUS;
    int inputY = dY + (by * TILE_WIDTH) - MASK_RADIUS;
    int inputX = dX + (bx * TILE_WIDTH) - MASK_RADIUS;
    int input = (inputZ * y_size * x_size) + (inputY * x_size) + inputX;

    if (inputZ >= 0 && inputZ < z_size
        && inputY >= 0 && inputY < y_size
        && inputX >= 0 && inputX < x_size) {
        subTile[dZ][dY][dX] = inputArray[input];
    } else {
        subTile[dZ][dY][dX] = 0; // zero-pad outside the volume
    }

    // Second loading step: same mapping, offset by one full block of threads;
    // together the two steps cover the tile (512 + 512 >= 1000), and threads
    // mapping past the end of the tile are masked off via the dZ check below.
    destination = TILE_WIDTH * TILE_WIDTH * TILE_WIDTH
                  + (tz * TILE_WIDTH * TILE_WIDTH) + (ty * TILE_WIDTH) + tx;
    destTmp = destination;
    dX = destTmp % SHAREDMEM_DIM;
    destTmp = destTmp / SHAREDMEM_DIM;
    dY = destTmp % SHAREDMEM_DIM;
    destTmp = destTmp / SHAREDMEM_DIM;
    dZ = destTmp;

    inputZ = dZ + (bz * TILE_WIDTH) - MASK_RADIUS;
    inputY = dY + (by * TILE_WIDTH) - MASK_RADIUS;
    inputX = dX + (bx * TILE_WIDTH) - MASK_RADIUS;
    input = (inputZ * y_size * x_size) + (inputY * x_size) + inputX;

    if (dZ < SHAREDMEM_DIM) {
        if (inputZ >= 0 && inputZ < z_size
            && inputY >= 0 && inputY < y_size
            && inputX >= 0 && inputX < x_size) {
            subTile[dZ][dY][dX] = inputArray[input];
        } else {
            subTile[dZ][dY][dX] = 0;
        }
    }
    __syncthreads();

    // Every thread computes exactly one output element.
    float sum = 0;
    int z, y, x;
    for (z = 0; z < MASK_WIDTH; z++) {
        for (y = 0; y < MASK_WIDTH; y++) {
            for (x = 0; x < MASK_WIDTH; x++) {
                sum += subTile[tz + z][ty + y][tx + x]
                       * deviceMask[x + (y * MASK_WIDTH) + (z * MASK_WIDTH * MASK_WIDTH)];
            }
        }
    }
    z = tz + (bz * TILE_WIDTH);
    y = ty + (by * TILE_WIDTH);
    x = tx + (bx * TILE_WIDTH);
    if (z < z_size && y < y_size && x < x_size) {
        outputArray[x + (y * x_size) + (z * y_size * x_size)] = sum;
    }
}
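For reference, a host-side launch for strategy 1 might look like the following sketch (my sketch, not part of the original code; h_mask, d_in and d_out are assumed to be set up elsewhere). The block covers one output tile, so the grid is sized in units of TILE_WIDTH and each 512-thread block cooperatively stages the 10x10x10 input tile:
cudaMemcpyToSymbol(deviceMask, h_mask,
                   MASK_WIDTH * MASK_WIDTH * MASK_WIDTH * sizeof(float));
dim3 block(TILE_WIDTH, TILE_WIDTH, TILE_WIDTH);         // 8*8*8 = 512 threads
dim3 grid((x_size + TILE_WIDTH - 1) / TILE_WIDTH,
          (y_size + TILE_WIDTH - 1) / TILE_WIDTH,
          (z_size + TILE_WIDTH - 1) / TILE_WIDTH);
conv3d<<<grid, block>>>(d_in, d_out, z_size, y_size, x_size);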
The second strategy is to set the block size to match the input tile. When computing the output, some of the threads are turned off.
#define TILE_X 14
#define TILE_Y 6
#define TILE_Z 6
#define MASK_WIDTH 3
#define MASK_SIZE (MASK_WIDTH * MASK_WIDTH * MASK_WIDTH)

__constant__ float mask[MASK_WIDTH][MASK_WIDTH][MASK_WIDTH];

__global__ void conv3d(float *input, float *output,
                       const int z_size, const int y_size, const int x_size) {
    // The block is sized to the INPUT tile, halo included:
    // (TILE_X+2) x (TILE_Y+2) x (TILE_Z+2) = 16 x 8 x 8 = 1024 threads.
    __shared__ float inputTile[TILE_Z + MASK_WIDTH - 1]
                              [TILE_Y + MASK_WIDTH - 1]
                              [TILE_X + MASK_WIDTH - 1];
    int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z;
    int bx = blockIdx.x;  int by = blockIdx.y;  int bz = blockIdx.z;

    // Output coordinates for this thread, and the halo-shifted input coordinates.
    int x_o = bx * TILE_X + tx;
    int y_o = by * TILE_Y + ty;
    int z_o = bz * TILE_Z + tz;
    int x_i = x_o - MASK_WIDTH / 2;
    int y_i = y_o - MASK_WIDTH / 2;
    int z_i = z_o - MASK_WIDTH / 2;

    // Exactly one load per thread; zero-pad outside the volume.
    if (x_i >= 0 && y_i >= 0 && z_i >= 0 && x_i < x_size && y_i < y_size && z_i < z_size)
        inputTile[tz][ty][tx] = input[(z_i * y_size + y_i) * x_size + x_i];
    else
        inputTile[tz][ty][tx] = 0.0f;
    __syncthreads();

    float acc = 0.0f;
    // Only threads inside the output tile compute; the halo threads idle here.
    if (tz < TILE_Z && ty < TILE_Y && tx < TILE_X) {
        for (int z_mask = 0; z_mask < MASK_WIDTH; z_mask++) {
            for (int y_mask = 0; y_mask < MASK_WIDTH; y_mask++) {
                for (int x_mask = 0; x_mask < MASK_WIDTH; x_mask++) {
                    acc += mask[z_mask][y_mask][x_mask]
                           * inputTile[tz + z_mask][ty + y_mask][tx + x_mask];
                }
            }
        }
        if (z_o < z_size && y_o < y_size && x_o < x_size)
            output[(z_o * y_size + y_o) * x_size + x_o] = acc;
    }
}
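And a matching launch for strategy 2 (again my sketch, not part of the original code): here the block covers one input tile, halo included, so the block has (14+2)*(6+2)*(6+2) = 1024 threads, the per-block maximum on current GPUs, of which only 14*6*6 = 504 compute an output element:
dim3 block(TILE_X + MASK_WIDTH - 1, TILE_Y + MASK_WIDTH - 1, TILE_Z + MASK_WIDTH - 1);
dim3 grid((x_size + TILE_X - 1) / TILE_X,
          (y_size + TILE_Y - 1) / TILE_Y,
          (z_size + TILE_Z - 1) / TILE_Z);
conv3d<<<grid, block>>>(d_input, d_output, z_size, y_size, x_size);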
Any idea how to choose between these? In addition, which version is used more often in practice, like in deep learning? Also, if you have any comments on the code, please let me know!

The general answer whenever it comes to the question of "which is faster?" is always: measure how fast each approach runs in your application scenario to find out. In this case, I would say that the first approach would seem preferable most of the time (if you had to pick one of those two options for some reason). Unless you have some very tiny convolution kernels, the second approach would have lots of threads sitting idle in the part that does most of the actual work. Be sure to avoid bank conflicts within your tiles, and think about the memory access patterns your warps produce when moving data to and from global memory.
In the end, convolution is basically just computing sums over all possible combinations of kernel coefficients and input elements. Since the workload is essentially just repeatedly fetching these values in some order, convolution is almost necessarily going to be limited by bandwidth. Thus, doing convolution efficiently comes down to optimizing memory access and reducing bandwidth as much as possible.
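To put a rough number on that (my own back-of-envelope, assuming a 3x3x3 mask and perfect reuse of every input element): each output element takes 27 multiply-adds, i.e. 54 FLOPs, while you still move at least 8 bytes per output (one 4-byte input read plus one 4-byte output write). That is at most about 54 / 8 ≈ 6.8 FLOP/byte, well below the compute-to-bandwidth ratio of most modern GPUs, so the kernel stays bandwidth-bound even in the best case.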
[…] which version is used more often in practice, like in deep learning?
Neither. The naïve approach of throwing nested loops at it to brute-force convolution in the spatial domain is almost never an efficient way of computing convolutions. Convolution is such a fundamental operation for so many things that it has been studied extensively; there are literally hundreds, if not thousands, of papers and books you could read on the subject. In deep learning, the problem of convolution has commonly been formulated in terms of general matrix multiplications (GEMMs), since this approach leads to rather nice memory access patterns and many efficient GEMM implementations are available for the GPU. FFT-based approaches as well as other algorithms are also increasingly used, depending on the application.
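To make the GEMM formulation concrete, below is a rough single-channel, host-side sketch of the classic im2col lowering (my illustration under stated assumptions, not any particular library's implementation). Each output voxel's K^3-element receptive field becomes one column of a matrix, and the convolution is then a (1 x K^3) by (K^3 x Z*Y*X) matrix product with the flattened mask, which could be handed to a GEMM routine such as cublasSgemm:
#include <vector>

// Unroll each output voxel's receptive field into one column ("im2col"),
// zero-padding the border. K is the (odd) mask width.
void im2col3d(const std::vector<float>& in, std::vector<float>& cols,
              int z_size, int y_size, int x_size, int K) {
    const int R = K / 2;                                  // mask radius
    const size_t nOut = (size_t)z_size * y_size * x_size; // one column per output voxel
    cols.assign((size_t)K * K * K * nOut, 0.0f);          // zeros handle the padding
    for (int z = 0; z < z_size; ++z)
        for (int y = 0; y < y_size; ++y)
            for (int x = 0; x < x_size; ++x) {
                const size_t col = ((size_t)z * y_size + y) * x_size + x;
                int row = 0;
                for (int dz = -R; dz <= R; ++dz)
                    for (int dy = -R; dy <= R; ++dy)
                        for (int dx = -R; dx <= R; ++dx, ++row) {
                            const int zi = z + dz, yi = y + dy, xi = x + dx;
                            if (zi >= 0 && zi < z_size && yi >= 0 && yi < y_size
                                && xi >= 0 && xi < x_size)
                                cols[(size_t)row * nOut + col] =
                                    in[((size_t)zi * y_size + yi) * x_size + xi];
                        }
            }
}
// The convolution is then: output(1 x nOut) = flatMask(1 x K^3) * cols(K^3 x nOut).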

Related

Implementing the Lanczos algorithm into C++ for a quantum anharmonic oscillator

Firstly, I would like to mention that I am a complete beginner when it comes to coding, let alone C++, so bear with me, as I need complete guidance. My task is to implement the Lanczos algorithm for the case of a 1-D anharmonic oscillator in C++, with reference to the paper linked Analytical Lanczos method.
The paper offers a step by step guide for the implementation of the algorithm:
Step by step guide here
with the initial trial function being: Psi_1 = (1 + x^2) * exp(-x^2 - (1/4) * x^4).
The paper also contains code in MATHEMATICA for this particular case. Mathematica code
and thus, here is my attempt, which is greatly unfinished; however, I wanted to ensure I was going along the correct path with regard to the programming logic. There are still plenty of errors, etc. (Also, excuse the lack of fundamentals here; I am only a beginner. Thank you very much.)
#include <cmath>
#include <vector>

int main() {
    // Grid parameters: grid length and step size.
    const int Rmin = 1, Rmax = 31, nx = 300;
    double dx = (Rmax - Rmin) / (double)nx; // Delta x (the cast avoids integer division, which would give 0).
    // Storage, sized from the loops below (the vectors must be allocated before indexing).
    std::vector<double> x(nx + 2), psi_1(nx + 2), H(nx + 2);
    std::vector<double> a(64, 0.0), b(64, 0.0);
    std::vector<std::vector<double>> PSI(64, std::vector<double>(nx + 2, 0.0));
    for (int j = 1; j < 64; ++j) { // Corresponds to each successive Lanczos vector.
        for (int i = Rmin; i < nx + 1; i++) { // Defining the Hamiltonian on the grid.
            x[i] = (nx / 2) + i;
            // Trial wavefunction (note the minus sign and 1.0/4: 1/4 in integer arithmetic is 0).
            psi_1[i] = (1 + pow(x[i] * dx, 2)) * exp(-pow(x[i] * dx, 2) - (1.0 / 4) * pow(x[i] * dx, 4));
            // Hamiltonian. ****
            H[i] = ((PSI[j][i + 1] - 2 * PSI[j][i] + PSI[j][i - 1]) / pow(dx, 2))
                 + PSI[j][i] * (1.0 / 2) * pow(x[i] * dx, 2)
                 + PSI[j][i] * 2 * pow(x[i] * dx, 4)
                 + PSI[j][i] * (1.0 / 2) * pow(x[i], 6);
            // First Lanczos step.
            PSI[1][i] = psi_1[i];
        }
        // Normalisation of the wavefunction (b).
        b[j] = 0.0;
        for (int i = Rmin; i < nx + 1; i++) {
            PSI[1][i] = psi_1[i];
            b[j] += pow(PSI[j][i], 2); // |psi|^2 is already non-negative.
        }
        b[j] = b[j] * dx;
        for (int i = Rmin; i < nx + 1; i++) {
            PSI[j][i] = PSI[j][i] / sqrt(b[j]); // Normalise element by element.
        }
        // Expectation values (a). Main diagonal of the Hamiltonian matrix.
        a[j] = 0.0;
        for (int i = Rmin; i < nx + 1; i++) {
            a[j] += PSI[j][i] * H[i] * dx;
        }
        // Recursive expression (unfinished in the original):
        // PSI[j] = H[i] * PSI[j-1] - PSI[j-1] * a[j-1] - PSI[j-2] * b[j-1]
        // Lanczos matrix (unfinished in the original):
        // LanczosMatrix[R][C] = ...
        // for (int R = 1; R < 64; R++) { row[R] = ... }
    }
    return 0;
}
I have yet to finish the code, but some experienced guidance would be greatly appreciated! (also, the code has to be cleaned up greatly, but this was an attempt to get the general idea down first.)
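As a point of comparison, a minimal textbook-style sketch of the Lanczos recurrence on a grid might look like the following (my own illustration with an example anharmonic potential, not the paper's exact scheme):
#include <cmath>
#include <vector>

// Example Hamiltonian: -1/2 psi'' + V(x) psi on a uniform grid centred at 0.
std::vector<double> applyH(const std::vector<double>& psi, double dx) {
    int n = psi.size();
    std::vector<double> out(n, 0.0);
    for (int i = 1; i + 1 < n; ++i) {
        double xi = (i - n / 2) * dx;
        double V = 0.5 * xi * xi + xi * xi * xi * xi; // assumed example potential
        out[i] = -0.5 * (psi[i + 1] - 2 * psi[i] + psi[i - 1]) / (dx * dx) + V * psi[i];
    }
    return out;
}

// Three-term recurrence: b_j psi_{j+1} = H psi_j - a_j psi_j - b_{j-1} psi_{j-1}.
void lanczos(std::vector<double> psi, double dx, int m,
             std::vector<double>& a, std::vector<double>& b) {
    int n = psi.size();
    double norm = 0.0;                         // normalise the trial vector
    for (double v : psi) norm += v * v * dx;
    for (double& v : psi) v /= std::sqrt(norm);

    std::vector<double> prev(n, 0.0);
    a.assign(m, 0.0);
    b.assign(m, 0.0);
    for (int j = 0; j < m; ++j) {
        std::vector<double> w = applyH(psi, dx);
        for (int i = 0; i < n; ++i) a[j] += psi[i] * w[i] * dx; // a_j = <psi_j|H|psi_j>
        for (int i = 0; i < n; ++i)
            w[i] -= a[j] * psi[i] + (j > 0 ? b[j - 1] * prev[i] : 0.0);
        for (int i = 0; i < n; ++i) b[j] += w[i] * w[i] * dx;   // b_j = ||w||
        b[j] = std::sqrt(b[j]);
        prev = psi;
        for (int i = 0; i < n; ++i) psi[i] = w[i] / b[j];       // next Lanczos vector
    }
    // a[] (diagonal) and b[] (off-diagonal) now form the tridiagonal Lanczos matrix.
}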

How to transpose a huge arbitrary matrix in CUDA using shared memory?

I have a task to transpose a matrix in CUDA using shared memory with no bank conflicts. The limits are: width*height <= 10^8. The key test sizes are: 1x10^8, 10^4x10^4, and 10^8x1.
I have tried a solution provided here, Matrix Transpose (with shared Memory) with arbitrary size on Cuda C, but it did not help me, because my matrix size is too large and outside of CUDA dimension limits (65536 blocks and 32 threads per block).
I tried to create a loop that helps to work with a huge matrix:
const int BLOCK_DIM = 32;

__global__ void transposeMatrixFast(double* inputMatrix, double* outputMatrix,
                                    int width, int height)
{
    __shared__ double temp[BLOCK_DIM][BLOCK_DIM + 1];
    int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    int offsetx = gridDim.x * blockDim.x;
    int offsety = gridDim.y * blockDim.y;
    for (int y = yIndex; y < height; y += offsety)
    {
        for (int x = xIndex; x < width; x += offsetx)
        {
            if ((xIndex < width) && (yIndex < height))
            {
                int idx = y * width + x;
                temp[threadIdx.y][threadIdx.x] = inputMatrix[idx];
            }
            __syncthreads();
            if ((x < width) && (y < height))
            {
                int idx = x * height + y;
                outputMatrix[idx] = temp[threadIdx.y][threadIdx.x];
            }
        }
    }
}
Now I am getting a "time limit exceeded" error on a testing server. The reason is that I can't get the benefit of shared memory in this line:
outputMatrix[idx] = temp[threadIdx.x][threadIdx.y];
and my kernel slows down. I think there is another way to organise my loop, but I don't know how.
I found another way to organise my loop, and now I can transpose matrices of ANY size:
const int BLOCK_SIZE = 32;

__global__ void matrixTransposeSolveBankConflicts(const double *d_a, double *d_b,
                                                  const unsigned long rows,
                                                  const unsigned long cols) {
    // The +1 column of padding avoids shared-memory bank conflicts on the transposed read.
    __shared__ double mat[BLOCK_SIZE][BLOCK_SIZE + 1];
    unsigned long bh = ceil((double)rows / BLOCK_SIZE);
    unsigned long bw = ceil((double)cols / BLOCK_SIZE);
    // Block-stride loops: a fixed-size grid walks over all tiles of the matrix.
    for (unsigned long blocky = blockIdx.y; blocky < bh; blocky += gridDim.y) {
        for (unsigned long blockx = blockIdx.x; blockx < bw; blockx += gridDim.x) {
            unsigned long bx = blockx * BLOCK_SIZE;
            unsigned long by = blocky * BLOCK_SIZE;
            unsigned long i = by + threadIdx.y;
            unsigned long j = bx + threadIdx.x;
            if (i < rows && j < cols)
            {
                // Coalesced read: consecutive threadIdx.x hits consecutive d_a addresses.
                mat[threadIdx.x][threadIdx.y] = d_a[i * cols + j];
            }
            __syncthreads();
            unsigned long ti = bx + threadIdx.y;
            unsigned long tj = by + threadIdx.x;
            if (tj < rows && ti < cols)
            {
                // Coalesced write: the transpose happens through shared memory.
                d_b[ti * rows + tj] = mat[threadIdx.y][threadIdx.x];
            }
            __syncthreads(); // needed before the next iteration reuses the tile
        }
    }
}
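A possible launch for this version (my sketch, not part of the original code): cap the grid dimensions so even extreme shapes such as 1x10^8 stay within the grid limits, and let the block-stride loops cover the rest:
unsigned int bw = (unsigned int)((cols + BLOCK_SIZE - 1) / BLOCK_SIZE);
unsigned int bh = (unsigned int)((rows + BLOCK_SIZE - 1) / BLOCK_SIZE);
dim3 block(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(bw < 64u ? bw : 64u,   // cap the grid; the block-stride loops cover the rest
          bh < 64u ? bh : 64u);
matrixTransposeSolveBankConflicts<<<grid, block>>>(d_a, d_b, rows, cols);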

Why is CUDA shared memory slower than global memory in tiled matrix multiplication?

I have tiled matrix multiplication code with and without shared memory. Below is matrix multiplication using global memory:
__global__
void MatrixMulKernel(float* M, float* N, float* P, int Width)
{
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if ((Row < Width) && (Col < Width)) {
        float Pvalue = 0;
        for (int k = 0; k < Width; ++k)
        {
            Pvalue += M[Row * Width + k] * N[k * Width + Col];
        }
        P[Row * Width + Col] = Pvalue;
    }
}
Below is matrix multiplication using shared memory:
#define blockWidth 16 // assumed here; matches the launch configuration below

__global__
void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width)
{
    __shared__ float Mds[blockWidth][blockWidth];
    __shared__ float Nds[blockWidth][blockWidth];
    int tx = threadIdx.x; int ty = threadIdx.y;
    int bx = blockIdx.x;  int by = blockIdx.y;
    int row = by * blockWidth + ty;
    int col = bx * blockWidth + tx;
    float pvalue = 0;
    // March across the Width dimension one tile at a time
    // (assumes Width is divisible by blockWidth, as 1600/16 is).
    for (int m = 0; m < Width / blockWidth; ++m)
    {
        // Each thread stages one element of each input tile.
        Mds[ty][tx] = d_M[row * Width + m * blockWidth + tx];
        Nds[ty][tx] = d_N[(m * blockWidth + ty) * Width + col];
        __syncthreads();
        for (int k = 0; k < blockWidth; ++k)
        {
            pvalue += Mds[ty][k] * Nds[k][tx];
        }
        __syncthreads();
    }
    d_P[row * Width + col] = pvalue;
}
As far as I know, using shared memory should be faster, but in comparing these two codes I found that the code without shared memory runs about 2 seconds faster for 1600x1600 matrices. Is there any explanation for this speed difference, or does something go wrong with my code?
My teacher uses the "Programming Massively Parallel Processors" book as the main text resource; these two codes come from it.
EDIT:
Configuration for Kernel:
int NumBlocks = ceil(Width / (float)blockWidth); // blockWidth = 16 (the cast avoids integer division)
dim3 dimGrid(NumBlocks, NumBlocks, 1);           // Width = 1600
dim3 dimBlock(blockWidth, blockWidth, 1);
clock_t startGpuCalculation = clock();
MatrixMulKernel <<<dimGrid, dimBlock >>>(d_M, d_N, d_P, Width);
cudaThreadSynchronize();
clock_t endGpuCalculation = clock();
I was running the project in Debug mode (VS 2017 & CUDA 9). When I ran the code in Release mode, shared memory was indeed much faster than global memory. My bad.
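As a side note, CUDA events are usually a more reliable way to time kernels than clock(), and cudaThreadSynchronize() has been deprecated in favor of cudaDeviceSynchronize(). A minimal sketch:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);
cudaEventRecord(stop);
cudaEventSynchronize(stop);             // wait for the kernel and the stop event
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // elapsed GPU time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);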

Matrix multiplication in CUDA of variable matrix sizes and the use of shared memory [duplicate]

This question already has an answer here:
CUDA-Kernel supposed to be dynamic crashes depending upon block size
(1 answer)
Closed 2 years ago.
I want to implement a simple matrix multiplication in CUDA. The dimensions of the matrix are determined at runtime, and I also want to use shared memory in order to gain a performance boost. I have implemented such a function, but every time I run it, I get this error:
mulKernel launch failed: an illegal memory access was encountered
I am also not sure if I can use malloc to allocate shared memory. However, if I want to use something like this
__shared__ float matrM_sm[tile_width][tile_width];
the compiler complains that tile_width has to be known at compile time...
I have tried everything I can think of and tried various suggestions as well but none of them worked. This is the function (the full working file can be found HERE):
__global__ void mulKernelSM(float *matrR, const float *matrM, const float *matrN,
                            const int m_x, const int m_y, const int n_x, const int n_y,
                            const int tile_width)
{
    int i, j;
    // Dynamically sized shared memory: one buffer, split into the two tiles.
    extern __shared__ float shared[];
    float *matrM_sm = shared;
    float *matrN_sm = &shared[tile_width * tile_width];
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = by * tile_width + ty;
    int col = bx * tile_width + tx;
    float tmp = 0.0; // accumulator: initialise once, NOT reset inside the tile loop
    int limit = ceil(m_y / (float)tile_width);
    for (i = 0; i < limit; i++)
    {
        if (i * tile_width + tx < m_y && row < m_x)
            matrM_sm[ty * tile_width + tx] = matrM[row * m_y + (i * tile_width + tx)];
        else
            matrM_sm[ty * tile_width + tx] = 0.0;
        if (i * tile_width + ty < n_x && col < n_y)
            matrN_sm[ty * tile_width + tx] = matrN[col + (i * tile_width + ty) * n_y];
        else
            matrN_sm[ty * tile_width + tx] = 0.0;
        __syncthreads();
        for (j = 0; j < tile_width; j++)
            tmp += matrM_sm[ty * tile_width + j] * matrN_sm[j * tile_width + tx];
        __syncthreads();
    }
    if (row < m_x && col < n_y)
        matrR[row * n_y + col] = tmp;
}
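For what it's worth, extern __shared__ is indeed the right mechanism for runtime-sized shared memory (not malloc); the size in bytes is passed as the third launch-configuration parameter. A hedged sketch, where d_R, d_M and d_N are hypothetical device pointers:
dim3 block(tile_width, tile_width);
dim3 grid((n_y + tile_width - 1) / tile_width,   // columns of the result
          (m_x + tile_width - 1) / tile_width);  // rows of the result
size_t shmemBytes = 2 * tile_width * tile_width * sizeof(float); // the two tiles
mulKernelSM<<<grid, block, shmemBytes>>>(d_R, d_M, d_N, m_x, m_y, n_x, n_y, tile_width);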
The basic layout should work as I have also implemented a version without shared memory which works just fine. The function without shared memory is listed below:
__global__ void mulKernel(float *matrR, const float *matrM, const float *matrN,
                          const int m_x, const int m_y, const int n_x, const int n_y)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int i;
    if ((row < m_x) && (col < n_y))
    {
        float tmp = 0.0;
        for (i = 0; i < m_y; i++)
        {
            tmp += matrM[row * m_y + i] * matrN[col + n_y * i];
        }
        matrR[row * n_y + col] = tmp;
    }
}
If there is any information missing, I will provide it immediately.
You swapped row and col. Furthermore, I believe that to get the global thread index you should rather do this: int x_global = threadIdx.x + blockDim.x * threadIdx.y

Visualize un-distorted images received from the Leap motion cameras using OpenCV

I want to use OpenCV to visualize the undistorted images obtained after correcting the raw images taken from the Leap Motion cameras.
According to the documentation,
https://developer.leapmotion.com/documentation/cpp/devguide/Leap_Images.html
the following code should return corrected images: am I right?
unsigned char destination[320][120];
// Destination dimensions (inferred from the array above; not defined in the original snippet).
const int destinationWidth = 320;
const int destinationHeight = 120;

// Define needed variables outside the inner loop.
float calibrationX, calibrationY;
float weightX, weightY;
float dX, dX1, dX2, dX3, dX4;
float dY, dY1, dY2, dY3, dY4;
int x1, x2, y1, y2;
int denormalizedX, denormalizedY;
int i, j;

const unsigned char* raw = image.data();
const float* distortion_buffer = image.distortion();

// Local variables for values needed in loop.
const int distortionWidth = image.distortionWidth();
const int width = image.width();
const int height = image.height();

for (i = 0; i < destinationWidth; i++) {
    for (j = 0; j < destinationHeight; j++) {
        // Calculate the position in the calibration map (still with a fractional part).
        // The float arithmetic matters: integer division would truncate the fraction away.
        calibrationX = 63.0f * i / destinationWidth;
        calibrationY = 62.0f * (1.0f - (float)j / destinationHeight); // the y origin is at the bottom
        // Save the fractional part to use as the weight for interpolation.
        weightX = calibrationX - truncf(calibrationX);
        weightY = calibrationY - truncf(calibrationY);
        // Get the x,y coordinates of the closest calibration map points to the target pixel.
        x1 = calibrationX; // note truncation to int
        y1 = calibrationY;
        x2 = x1 + 1;
        y2 = y1 + 1;
        // Look up the x and y values for the 4 calibration map points around the target.
        dX1 = distortion_buffer[x1 * 2 + y1 * distortionWidth];
        dX2 = distortion_buffer[x2 * 2 + y1 * distortionWidth];
        dX3 = distortion_buffer[x1 * 2 + y2 * distortionWidth];
        dX4 = distortion_buffer[x2 * 2 + y2 * distortionWidth];
        dY1 = distortion_buffer[x1 * 2 + y1 * distortionWidth + 1];
        dY2 = distortion_buffer[x2 * 2 + y1 * distortionWidth + 1];
        dY3 = distortion_buffer[x1 * 2 + y2 * distortionWidth + 1];
        dY4 = distortion_buffer[x2 * 2 + y2 * distortionWidth + 1];
        // Bilinear interpolation of the looked-up values:
        // X value
        dX = dX1 * (1 - weightX) * (1 - weightY) +
             dX2 * weightX * (1 - weightY) +
             dX3 * (1 - weightX) * weightY +
             dX4 * weightX * weightY;
        // Y value
        dY = dY1 * (1 - weightX) * (1 - weightY) +
             dY2 * weightX * (1 - weightY) +
             dY3 * (1 - weightX) * weightY +
             dY4 * weightX * weightY;
        // Reject points outside the range [0..1].
        if ((dX >= 0) && (dX <= 1) && (dY >= 0) && (dY <= 1)) {
            // Denormalize from [0..1] to [0..width] or [0..height].
            denormalizedX = dX * width;
            denormalizedY = dY * height;
            // Look up the brightness value for the target pixel.
            destination[i][j] = raw[denormalizedX + denormalizedY * width];
        } else {
            destination[i][j] = -1; // wraps to 255 for unsigned char
        }
    }
}
Now, I'm using OpenCV to visualize the undistorted image:
Mat imgCorrected(120, 320, CV_8UC1);
for (int i = 0; i < 120; i++)
    for (int j = 0; j < 320; j++)
        // destination is indexed [x][y] while at(i, j) is (row, column), so the
        // indices must be swapped; destination[i][j] here would also read out of
        // bounds, since the second dimension is only 120.
        imgCorrected.at<unsigned char>(i, j) = destination[j][i];
imshow("ImgCorrected", imgCorrected);
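(As a side note, a small sketch assuming the index fix above: since destination is laid out as 320 rows of 120 bytes, the per-pixel copy can be replaced by wrapping the buffer in a Mat and transposing it.)
Mat wrapped(320, 120, CV_8UC1, destination); // no copy: just a view of the buffer
Mat corrected = wrapped.t();                 // 120 x 320, same pixels as the loop above
imshow("ImgCorrected", corrected);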
And this is the result:
Result
I really don't know what I'm doing wrong.
Thanks for any help.