2D tiled convolution taking more time than untiled version - c++

I'm writing code that performs a 2D convolution on a float matrix, in both a tiled and an untiled version. I'm assuming a tile width of BLOCK_SIZE - MASK_WIDTH + 1, using halo cells.
But for a 1024x1024 matrix and masks varying from 3 to 9, I get the untiled version performing better:
(timing screenshots: untiled version vs tiled)
Both the matrix and the mask are initialized deterministically, identically for the tiled and untiled versions; no random values or sizes are used.
I guess I'm making some wrong assumption about the tile size, but even after doing some research the implementation seems quite legitimate.
#define MATRIX_SIZE 1024
#define BLOCK_WIDTH 32
Here's the kernel code for the tiled version
__global__ void convolution_2D_tiled(float* in, const float* __restrict__ mask, float* out, size_t mask_width, size_t w, size_t h) {
    float outputPixel = 0; //minimize writes to global memory: stored in a register
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int tile_width = BLOCK_WIDTH - mask_width + 1; //since BLOCK_WIDTH = TILE_WIDTH + MASK_WIDTH - 1
    int col = blockIdx.x * tile_width + tx;
    int row = blockIdx.y * tile_width + ty;
    //pick the starting indexes of the input matrix inside the mask
    //(TOP-LEFT of the mask)
    int inputRow = row - (mask_width / 2);
    int inputCol = col - (mask_width / 2);
    __shared__ float tile[BLOCK_WIDTH][BLOCK_WIDTH];
    // Load tile elements
    if (inputRow >= 0 && inputRow < h && inputCol >= 0 && inputCol < w)
        tile[ty][tx] = in[inputRow * w + inputCol];
    else
        tile[ty][tx] = 0.0;
    // Wait until all tile elements are loaded
    __syncthreads();
    //some threads won't write any output; only tile_width x tile_width elements need to be calculated
    if (col < w && row < h && ty < tile_width && tx < tile_width) {
        //accumulate over the mask neighbourhood
        for (int i = 0; i < mask_width; ++i) {
            for (int j = 0; j < mask_width; ++j) { //(Mask_Width^2) accesses for each thread in the block -> (Mask_Width^2) * (Block_width^2) per block
                outputPixel += tile[i + ty][j + tx] * mask[i * mask_width + j];
            }
        }
        out[(row * w) + col] = (float)(outputPixel);
    }
}
The main, with the matrix generation and the size assumptions:
void errorCheck(unsigned int line){
    cudaError_t cudaError = cudaGetLastError();
    // if error code wasn't a code describing success
    if (cudaError != cudaSuccess)
    {
        // output that there has been a CUDA error in the line of the CUDA function call
        // and exit the program
        printf("CUDA error in line %u in file %s: %s\n", line - 1, __FILE__, cudaGetErrorString(cudaError));
        exit(EXIT_FAILURE);
    }
}
int main(int argc, char const* argv[]){
    for (size_t mask_width = 3; mask_width <= 9; mask_width += 2) {
        printf("Testing with mask size = %d\n\n", mask_width);
        float* a;
        float* b;
        float* c;
        cudaMallocManaged((void **) &a, sizeof(float)*MATRIX_SIZE*MATRIX_SIZE);
        cudaMallocManaged((void **) &b, sizeof(int)*mask_width*mask_width);
        cudaMallocManaged((void **) &c, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE);
        // initialize matrix A
        for (int i = 0; i < MATRIX_SIZE; ++i) {
            for (int j = 0; j < MATRIX_SIZE; ++j) {
                a[i * MATRIX_SIZE + j] = (float)(1 +(3 * j % 20));
            }
        }
        // initialize matrix B
        for (int i = 0; i < mask_width; ++i) {
            for (int j = 0; j < mask_width; ++j) {
                b[i * mask_width + j] = (float)(1 + (((2 * i) + j) % mask_width));
            }
        }
        float naive_gpu_elapsed_time_ms;
        // some events to count the execution time
        //clock_t st, end;
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        int tile_width = BLOCK_WIDTH - mask_width + 1;
        dim3 dimGrid(MATRIX_SIZE / tile_width, MATRIX_SIZE / tile_width);
        dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH);
        errorCheck(__LINE__);
        cudaEventRecord(start, 0);
        convolution_2D_tiled <<<dimGrid, dimBlock >>> (a, b, c, mask_width, MATRIX_SIZE, MATRIX_SIZE);
        errorCheck(__LINE__);
        cudaThreadSynchronize();
        //time counting terminate
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        //compute time elapsed on GPU computing
        cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop);
        printf("Time elapsed on naive GPU convolution 2d tiled ( %d ) block %f ms.\n\n", BLOCK_WIDTH, naive_gpu_elapsed_time_ms);
        //free memory
        cudaFree(a);
        cudaFree(b);
        cudaFree(c);
        printf("________________________________________________________________________\n\n");
    }
    return 0;
}
I'm using Google Colab with a Tesla T4 GPU, and no CUDA error is thrown.
I also tried bigger masks (11, 15, ...), but the comparison between tiled and untiled did not change.

You are making inefficient use of managed memory, as discussed here and here.
Nearly all of your ~2ms of execution time is used in inefficient demand-paged copying of data from host to device. As a result, your ability to resolve the difference in performance in the two cases due to the device code changes is almost completely obscured.
If you add these 3 lines of code immediately before float naive_gpu_elapsed_time_ms;, you will observe that your reported execution times decrease dramatically, and you should be able to better judge the performance difference between the shared memory tiled version and the non-tiled version:
cudaMemPrefetchAsync(a, sizeof(float)*MATRIX_SIZE*MATRIX_SIZE, 0);
cudaMemPrefetchAsync(b, sizeof(int)*mask_width*mask_width, 0);
cudaMemPrefetchAsync(c, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE, 0);
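The destination device is given as ordinal 0 above; if the code might run on a different device ordinal, the active device can be queried first (a small variation of the same idea, assuming the default stream):
int dev = 0;
cudaGetDevice(&dev);
cudaMemPrefetchAsync(a, sizeof(float)*MATRIX_SIZE*MATRIX_SIZE, dev);
cudaMemPrefetchAsync(b, sizeof(int)*mask_width*mask_width, dev);
cudaMemPrefetchAsync(c, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE, dev);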
You haven't shown your non-tiled code, so I can't demonstrate that for you. Here's an example profiling output using a non-tiled convolution code that I wrote, comparing to your tiled kernel, and including the cudaMemPrefetchAsync() statements:
$ nvprof ./t2140
Testing with mask size = 3
==13236== NVPROF is profiling process 13236, command: ./t2140
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.032832 ms.
________________________________________________________________________
Testing with mask size = 5
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.061120 ms.
________________________________________________________________________
Testing with mask size = 7
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.086080 ms.
________________________________________________________________________
Testing with mask size = 9
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.118688 ms.
________________________________________________________________________
==13236== Profiling application: ./t2140
==13236== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 52.59% 311.69us 4 77.922us 41.089us 119.08us convolution_2D(float*, float const *, float*, unsigned long, unsigned long, unsigned long)
47.41% 280.97us 4 70.241us 28.449us 114.28us convolution_2D_tiled(float*, float const *, float*, unsigned long, unsigned long, unsigned long)
API calls: 96.10% 365.32ms 12 30.443ms 12.906us 365.10ms cudaMallocManaged
1.32% 5.0301ms 4 1.2575ms 586.91us 3.2433ms cuDeviceTotalMem
0.66% 2.4917ms 404 6.1670us 320ns 268.82us cuDeviceGetAttribute
0.56% 2.1277ms 12 177.31us 8.3020us 578.90us cudaMemPrefetchAsync
0.50% 1.9035ms 4 475.88us 295.08us 549.01us cudaDeviceSynchronize
0.49% 1.8594ms 12 154.95us 75.533us 328.85us cudaFree
0.14% 526.53us 4 131.63us 42.014us 220.14us cudaEventSynchronize
0.11% 399.28us 4 99.820us 61.310us 210.74us cuDeviceGetName
0.09% 351.52us 8 43.940us 11.426us 116.52us cudaLaunchKernel
0.01% 45.911us 8 5.7380us 4.1870us 10.243us cudaEventRecord
0.01% 25.946us 8 3.2430us 935ns 10.182us cudaEventCreate
0.01% 21.643us 4 5.4100us 3.1450us 8.6700us cuDeviceGetPCIBusId
0.00% 10.304us 8 1.2880us 430ns 5.0980us cuDeviceGet
0.00% 9.6790us 4 2.4190us 1.9560us 3.7180us cudaEventElapsedTime
0.00% 3.3390us 3 1.1130us 617ns 1.6520us cuDeviceGetCount
0.00% 3.2480us 4 812ns 700ns 1.0470us cuDeviceGetUuid
0.00% 3.1420us 8 392ns 229ns 1.2110us cudaGetLastError
==13236== Unified Memory profiling result:
Device "Tesla V100-PCIE-32GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
12 1.3346MB 4.0000KB 2.0000MB 16.01563MB 1.405760ms Host To Device
Total CPU Page faults: 52
$
You can see that in each case, the tiled/shared memory kernel is faster.
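For reference, a non-tiled kernel of the kind used for the comparison above could look like this (a sketch with the same signature, one thread per output pixel reading its halo directly from global memory; the exact code used for the timings may differ in details):
__global__ void convolution_2D(float* in, const float* __restrict__ mask, float* out, size_t mask_width, size_t w, size_t h) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col < w && row < h) {
        float outputPixel = 0;
        int startRow = row - (mask_width / 2);
        int startCol = col - (mask_width / 2);
        for (int i = 0; i < mask_width; ++i) {
            for (int j = 0; j < mask_width; ++j) {
                int r = startRow + i;
                int c = startCol + j;
                float val = (r >= 0 && r < h && c >= 0 && c < w) ? in[r * w + c] : 0.0f;
                outputPixel += val * mask[i * mask_width + j];
            }
        }
        out[row * w + col] = outputPixel;
    }
}
// launched (for example) with:
// dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH);
// dim3 dimGrid((MATRIX_SIZE + BLOCK_WIDTH - 1) / BLOCK_WIDTH, (MATRIX_SIZE + BLOCK_WIDTH - 1) / BLOCK_WIDTH);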

Related

How to fix my block and grid layout to handle large data?

I'm trying to implement a naive closest-pair algorithm for points with 3D coordinates.
The input is two files, each line containing 3 floats.
I read the inputs into float3* variables:
float3* teamA;
float3* teamB;
float3* results;
handleFileInput(argv[1], argv[2], teamA, teamB, numPoints);
results = new float3[numPoints[0]];
After this, I allocated and copied the host data to the device like this:
#define CHECKERROR(val) { if (val != cudaSuccess) {fprintf(stderr, "Error %s at line %d in file %s\n", cudaGetErrorString(val), __LINE__, __FILE__); exit(1);} }
CHECKERROR(cudaMalloc(&d_tA, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMemset(d_tA, 0, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMalloc(&d_tB, sizeof(float3) * numPoints[1]));
CHECKERROR(cudaMemset(d_tB, 0, sizeof(float3) * numPoints[1]));
CHECKERROR(cudaMalloc(&d_results, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMemset(d_results, 0, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMemcpy(d_tA, teamA, sizeof(float3) * numPoints[0], cudaMemcpyHostToDevice));
CHECKERROR(cudaMemcpy(d_tB, teamB, sizeof(float3) * numPoints[1], cudaMemcpyHostToDevice));
I set up my block and grid like this:
dim3 block(512);
dim3 grid(ceil((float)numPoints[0] / 512);
naive_algorithm <<< block, grid >>> (d_tA, d_tB, d_results, numPoints[0], numPoints[1]);
My kernel code is simple, like this:
__global__ void naive_algorithm(float3* d_tA, float3* d_tB, float3* d_r, int a_size, int b_size)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < a_size)
    {
        float min_distance = -1;
        for (int y = 0; y < b_size; y++)
        {
            float i = MUL(SUB(d_tA[idx].x, d_tB[y].x), SUB(d_tA[idx].x, d_tB[y].x));
            float j = MUL(SUB(d_tA[idx].y, d_tB[y].y), SUB(d_tA[idx].y, d_tB[y].y));
            float k = MUL(SUB(d_tA[idx].z, d_tB[y].z), SUB(d_tA[idx].z, d_tB[y].z));
            float distance = SQRT(ADD(ADD(i, j), k));
            if (min_distance > distance || min_distance == -1)
            {
                d_r[idx].x = (float)idx;
                d_r[idx].y = (float)y;
                d_r[idx].z = distance;
                min_distance = distance;
            }
        }
        __syncthreads();
    }
}
Environment : RTX 2080Ti
There are five different data samples:
Team A - 1000000 points / Team B - 500000 points -> Test Failed
Team A - 700000 points / Team B - 500000 points -> Test Failed
Team A - 500000 points / Team B - 300000 points -> Test OK!
Team A - 500000 points / Team B - 100000 points -> Test OK!
Team A - 300000 points / Team B - 100000 points -> Test OK!
In my opinion this is caused by the thread layout.
Do I have to change the block/grid layout from 1D x 1D to 2D x 2D?
If so, how should I set my grid layout?
As Robert Crovella said, this was just a typo on my part: the launch configuration arguments are swapped, so <<< block, grid >>> should have been <<< grid, block >>>.
Because one block can contain at most 1024 threads (while a grid dimension can be much larger), passing numPoints[0] / BLOCK_SIZE as the block dimension makes the launch fail as soon as that value exceeds 1024, which is exactly what happens for the failing data samples.
Thanks a lot for checking my code!
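For reference, the corrected launch looks like this (grid first, then block, with the usual ceiling division):
dim3 block(512);
dim3 grid((numPoints[0] + block.x - 1) / block.x);  // ceiling division instead of ceil()
naive_algorithm<<<grid, block>>>(d_tA, d_tB, d_results, numPoints[0], numPoints[1]);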

Cuda: XOR single bitset with array of bitsets

I want to XOR a single bitset with a bunch of other bitsets (~100k) and count the set bits of every xor-result. The size of a single bitset is around 20k bits.
The bitsets are already converted to arrays of unsigned int to be able to use the intrinsic __popc()-function. The 'bunch' is already residing contiguously in device-memory.
My current kernel code looks like this:
// Grid/Blocks used for kernel invocation
dim3 block(32);
dim3 grid((bunch_size / 31) + 32);
__global__ void kernelXOR(uint * bitset, uint * bunch, int * set_bits, int bitset_size, int bunch_size) {
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    if (tid < bunch_size){ // 1 Thread for each bitset in the 'bunch'
        int sum = 0;
        uint xor_res = 0;
        for (int i = 0; i < bitset_size; ++i){ // Iterate through every uint-block of the bitsets
            xor_res = bitset[i] ^ bunch[bitset_size * tid + i];
            sum += __popc(xor_res);
        }
        set_bits[tid] = sum;
    }
}
However, compared to a parallelized C++/Boost version, I see no benefit from using CUDA.
Is there any potential in optimizing this kernel?
Is there any potential in optimizing this kernel?
I see 2 problems here (and they are the first two classical primary optimization objectives for any CUDA programmer):
You want to try to efficiently use global memory. Your accesses to bitset and bunch are not coalesced. (efficiently use the memory subsystems)
The use of 32 threads per block is generally not recommended and could limit your overall occupancy. One thread per bitset is also potentially problematic. (expose enough parallelism)
Whether addressing those issues will meet your definition of benefit is impossible to say without a comparison test case. Furthermore, simple memory-bound problems like this are rarely interesting in CUDA when considered by themselves. However, we can (probably) improve the performance of your kernel.
We'll use a laundry list of ideas:
have each block handle a bitset, rather than each thread, to enable coalescing
use shared memory to load the comparison bitset, and reuse it
use just enough blocks to saturate the GPU, along with striding loops
use const ... __restrict__ style decoration to possibly benefit from RO cache
Here's a worked example:
$ cat t1649.cu
#include <iostream>
#include <cstdlib>
const int my_bitset_size = 20000/(32);
const int my_bunch_size = 100000;
typedef unsigned uint;
//using one thread per bitset in the bunch
__global__ void kernelXOR(uint * bitset, uint * bunch, int * set_bits, int bitset_size, int bunch_size) {
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    if (tid < bunch_size){ // 1 Thread for each bitset in the 'bunch'
        int sum = 0;
        uint xor_res = 0;
        for (int i = 0; i < bitset_size; ++i){ // Iterate through every uint-block of the bitsets
            xor_res = bitset[i] ^ bunch[bitset_size * tid + i];
            sum += __popc(xor_res);
        }
        set_bits[tid] = sum;
    }
}
const int nTPB = 256;
// one block per bitset, multiple bitsets per block
__global__ void kernelXOR_imp(const uint * __restrict__ bitset, const uint * __restrict__ bunch, int * __restrict__ set_bits, int bitset_size, int bunch_size) {
    __shared__ uint sbitset[my_bitset_size]; // could also be dynamically allocated for varying bitset sizes
    __shared__ int ssum[nTPB];
    // load shared, block-stride loop
    for (int idx = threadIdx.x; idx < bitset_size; idx += blockDim.x) sbitset[idx] = bitset[idx];
    __syncthreads();
    // stride across all bitsets in bunch
    for (int bidx = blockIdx.x; bidx < bunch_size; bidx += gridDim.x){
        int my_sum = 0;
        for (int idx = threadIdx.x; idx < bitset_size; idx += blockDim.x) my_sum += __popc(sbitset[idx] ^ bunch[bidx*bitset_size + idx]);
        // block level parallel reduction
        ssum[threadIdx.x] = my_sum;
        for (int ridx = nTPB>>1; ridx > 0; ridx >>=1){
            __syncthreads();
            if (threadIdx.x < ridx) ssum[threadIdx.x] += ssum[threadIdx.x+ridx];}
        if (!threadIdx.x) set_bits[bidx] = ssum[0];}
}
int main(){
    // data setup
    uint *d_cbitset, *d_bitsets, *h_cbitset, *h_bitsets;
    int *d_r, *h_r, *h_ri;
    h_cbitset = new uint[my_bitset_size];
    h_bitsets = new uint[my_bitset_size*my_bunch_size];
    h_r = new int[my_bunch_size];
    h_ri = new int[my_bunch_size];
    for (int i = 0; i < my_bitset_size*my_bunch_size; i++){
        h_bitsets[i] = rand();
        if (i < my_bitset_size) h_cbitset[i] = rand();}
    cudaMalloc(&d_cbitset, my_bitset_size*sizeof(uint));
    cudaMalloc(&d_bitsets, my_bitset_size*my_bunch_size*sizeof(uint));
    cudaMalloc(&d_r, my_bunch_size*sizeof(int));
    cudaMemcpy(d_cbitset, h_cbitset, my_bitset_size*sizeof(uint), cudaMemcpyHostToDevice);
    cudaMemcpy(d_bitsets, h_bitsets, my_bitset_size*my_bunch_size*sizeof(uint), cudaMemcpyHostToDevice);
    // original
    // Grid/Blocks used for kernel invocation
    dim3 block(32);
    dim3 grid((my_bunch_size / 31) + 32);
    kernelXOR<<<grid, block>>>(d_cbitset, d_bitsets, d_r, my_bitset_size, my_bunch_size);
    cudaMemcpy(h_r, d_r, my_bunch_size*sizeof(int), cudaMemcpyDeviceToHost);
    // improved
    dim3 iblock(nTPB);
    dim3 igrid(640);
    kernelXOR_imp<<<igrid, iblock>>>(d_cbitset, d_bitsets, d_r, my_bitset_size, my_bunch_size);
    cudaMemcpy(h_ri, d_r, my_bunch_size*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < my_bunch_size; i++)
        if (h_r[i] != h_ri[i]) {std::cout << "mismatch at i: " << i << " was: " << h_ri[i] << " should be: " << h_r[i] << std::endl; return 0;}
    std::cout << "Results match." << std::endl;
    return 0;
}
$ nvcc -o t1649 t1649.cu
$ cuda-memcheck ./t1649
========= CUDA-MEMCHECK
Results match.
========= ERROR SUMMARY: 0 errors
$ nvprof ./t1649
==18868== NVPROF is profiling process 18868, command: ./t1649
Results match.
==18868== Profiling application: ./t1649
==18868== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 97.06% 71.113ms 2 35.557ms 2.3040us 71.111ms [CUDA memcpy HtoD]
2.26% 1.6563ms 1 1.6563ms 1.6563ms 1.6563ms kernelXOR(unsigned int*, unsigned int*, int*, int, int)
0.59% 432.68us 1 432.68us 432.68us 432.68us kernelXOR_imp(unsigned int const *, unsigned int const *, int*, int, int)
0.09% 64.770us 2 32.385us 31.873us 32.897us [CUDA memcpy DtoH]
API calls: 78.20% 305.44ms 3 101.81ms 11.373us 304.85ms cudaMalloc
18.99% 74.161ms 4 18.540ms 31.554us 71.403ms cudaMemcpy
1.39% 5.4121ms 4 1.3530ms 675.30us 3.3410ms cuDeviceTotalMem
1.26% 4.9393ms 388 12.730us 303ns 530.95us cuDeviceGetAttribute
0.11% 442.37us 4 110.59us 102.61us 125.59us cuDeviceGetName
0.03% 128.18us 2 64.088us 21.789us 106.39us cudaLaunchKernel
0.01% 35.764us 4 8.9410us 2.9670us 18.982us cuDeviceGetPCIBusId
0.00% 8.3090us 8 1.0380us 540ns 1.3870us cuDeviceGet
0.00% 5.9530us 3 1.9840us 310ns 3.9900us cuDeviceGetCount
0.00% 2.8800us 4 720ns 574ns 960ns cuDeviceGetUuid
$
In this case, on my Tesla V100, for your problem size, I witness about a 4x improvement in kernel performance. However the kernel performance here is tiny compared to the cost of data movement. So it's unlikely that these sort of optimizations would make a significant difference in your comparison test case, if this is the only thing you are doing on the GPU.
The code above uses striding-loops at the block level and at the grid level, which means it should behave correctly for almost any choice of threadblock size (multiple of 32 please) as well as grid size. That doesn't mean that any/all choices will perform equally. The choice of the threadblock size is to allow the possibility for nearly full occupancy (so don't choose 32). The choice of the grid size is the number of blocks to achieve full occupancy per SM, times the number of SMs. These should be nearly optimal choices, but according to my testing e.g. a larger number of blocks doesn't really reduce performance, and the performance should be roughly constant for nearly any threadblock size (except 32), assuming the number of blocks is calculated accordingly.
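If you want to derive that grid size programmatically instead of hard-coding igrid(640), the occupancy API can compute it (a sketch; kernelXOR_imp and nTPB are the names from the code above):
int dev = 0, numSMs = 0, blocksPerSM = 0;
cudaGetDevice(&dev);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, dev);
// how many resident blocks of nTPB threads this kernel allows per SM
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, kernelXOR_imp, nTPB, 0);
dim3 igrid(blocksPerSM * numSMs);  // "just enough blocks to saturate the GPU"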

Why does transposing a CUDA grid (but not its threadblocks) still slow down computation?

EDIT: It seems that, at least in this case, transposing the grid has a negative effect on the L2 Cache Bandwidth. This was obtained from the visual profiler. The reason why is not clear to me yet.
I have come across a GPU computing situation that requires transposing a CUDA grid. So, if block_{x,y} originally acted on data region d_{x,y}, it now acts on data region d_{y,x}, and therefore block_{y,x} acts on data region d_{x,y}. An example is presented in the following figure.
It is worth mentioning that threads are not transposed inside each block, that is, once the block is located, the threadIdx.x and threadIdx.y values are used in a normal way for their x and y offsets, respectively.
From what I know, in theory this design should do no harm in performance, as the memory coalescing pattern is still preserved, i.e., threads inside a block are not transposed, it is just the grid that re-arranged its blocks. However I found that when transposing the grid, the kernel runs approx. 2X slower than in the normal case. I made a toy example to illustrate the situation.
➜ transpose-grid ./prog 10000 10000 100 0
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
normal_kernel (100 rep).......done: 0.935132 ms
verifying correctness.........ok
➜ transpose-grid ./prog 10000 10000 100 1
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
transp_kernel (100 rep).......done: 1.980445 ms
verifying correctness.........ok
I would really appreciate any explanation for this issue. Here is the source code to reproduce the behavior.
// -----------------------------------
// can compile as nvcc main.cu -o prog
// -----------------------------------
#include <cuda.h>
#include <cstdio>
#define BSIZE2D 32
__global__ void normal_kernel(int *dmat, const int m, const int n){
    const int i = blockIdx.y*blockDim.y + threadIdx.y;
    const int j = blockIdx.x*blockDim.x + threadIdx.x;
    if(i < m && j < n){
        dmat[i*n + j] = 1;
    }
}
__global__ void transp_kernel(int *dmat, const int m, const int n){
    const int i = blockIdx.x*blockDim.x + threadIdx.y;
    const int j = blockIdx.y*blockDim.y + threadIdx.x;
    if(i < m && j < n){
        dmat[i*n + j] = 1;
    }
}
int verify(int *hmat, const int m, const int n){
    printf("verifying correctness........."); fflush(stdout);
    for(int i=0; i<m*n; ++i){
        if(hmat[i] != 1){
            fprintf(stderr, "Incorrect value at m[%i,%i] = %i\n", i/n, i%n, hmat[i]);
            return 0;
        }
    }
    printf("ok\n"); fflush(stdout);
    return 1;
}
int main(int argc, char **argv){
    if(argc != 5){
        printf("\nrun as ./prog m n r t\n\nr = number of repeats\nt = transpose (1 or 0)\n");
        exit(EXIT_FAILURE);
    }
    const int m = atoi(argv[1]);
    const int n = atoi(argv[2]);
    const int r = atoi(argv[3]);
    const int t = atoi(argv[4]);
    const unsigned int size = m*n;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float time;
    int *hmat, *dmat;
    printf("init data....................."); fflush(stdout);
    hmat = (int*)malloc(sizeof(int)*(size));
    for(int i=0; i<size; ++i){
        hmat[i] = 0;
    }
    printf("done: zero matrix of %i rows x %i cols\n", m, n);
    printf("copy data to GPU.............."); fflush(stdout);
    cudaMalloc(&dmat, sizeof(int)*(size));
    cudaMemcpy(dmat, hmat, sizeof(int)*(size), cudaMemcpyHostToDevice);
    printf("done\n");
    printf("preparing grid................"); fflush(stdout);
    dim3 block(BSIZE2D, BSIZE2D, 1);
    dim3 grid;
    // if transpose or not
    if(t){
        grid = dim3((m + BSIZE2D - 1)/BSIZE2D, (n + BSIZE2D - 1)/BSIZE2D, 1);
    }
    else{
        grid = dim3((n + BSIZE2D - 1)/BSIZE2D, (m + BSIZE2D - 1)/BSIZE2D, 1);
    }
    printf("done: block(%i, %i, %i), grid(%i, %i, %i)\n", block.x, block.y, block.z, grid.x, grid.y, grid.z);
    if(t){
        printf("transp_kernel (%3i rep).......", r); fflush(stdout);
        cudaEventRecord(start, 0);
        for(int i=0; i<r; ++i){
            transp_kernel<<<grid, block>>>(dmat, m, n);
            cudaDeviceSynchronize();
        }
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop); // that's our time!
        printf("done: %f ms\n", time/(float)r);
    }
    else{
        printf("normal_kernel (%3i rep).......", r); fflush(stdout);
        cudaEventRecord(start, 0);
        for(int i=0; i<r; ++i){
            normal_kernel<<<grid, block>>>(dmat, m, n);
            cudaDeviceSynchronize();
        }
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop); // that's our time!
        printf("done: %f ms\n", time/(float)r);
    }
    cudaMemcpy(hmat, dmat, sizeof(int)*size, cudaMemcpyDeviceToHost);
    verify(hmat, m, n);
    exit(EXIT_SUCCESS);
}
Since I could not find any literature on this topic, here is my guess at an explanation, based on experience (and on an old problem of mine with memory read speed).
As you wrote, your example preserves the memory coalescing pattern, but only at the warp level (consecutive 32 threads). To achieve full speed, however, there also needs to be locality at the inter-warp level, and it is not clear whether such coalescing is actually done there, or whether the cache and memory simply work better in this scenario (probably, as described here, we get better utilization of the memory burst mode).
So in your normal_kernel execution, not only is a single warp coalesced, but so are the warps from the neighbouring block(s).
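To make this concrete, with the 32x32 blocks from the question and an n-column int matrix, the two kernels map blocks with consecutive blockIdx.x as follows (a back-of-the-envelope illustration of the index arithmetic above):
// normal_kernel: block (bx, by) writes rows [by*32, by*32+31], columns [bx*32, bx*32+31]
//   -> blocks with consecutive bx fill adjacent 128-byte segments of the same 32 rows
// transp_kernel: block (bx, by) writes rows [bx*32, bx*32+31], columns [by*32, by*32+31]
//   -> blocks with consecutive bx fill the same column tile of rows 32 apart,
//      i.e. addresses roughly 32*n*sizeof(int) bytes apart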
To check this on your example, I modified your code to use different block sizes; here are my results on a 1080 Ti:
Block size (32, 32), same as yours:
~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(320, 320, 1)
normal_kernel (100 rep).......done: 1.020545 ms
verifying correctness.........ok
~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(320, 320, 1)
transp_kernel (100 rep).......done: 1.564084 ms
verifying correctness.........ok
Block size (64, 16) (unfortunately we cannot use (64, 64) because of the per-block thread limit):
~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(64, 16, 1), grid(160, 640, 1)
normal_kernel (100 rep).......done: 1.020420 ms
verifying correctness.........ok
~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(64, 16, 1), grid(160, 640, 1)
transp_kernel (100 rep).......done: 1.205506 ms
verifying correctness.........ok
Block size (128, 8):
~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(128, 8, 1), grid(80, 1280, 1)
normal_kernel (100 rep).......done: 1.019547 ms
verifying correctness.........ok
~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(128, 8, 1), grid(80, 1280, 1)
transp_kernel (100 rep).......done: 1.058236 ms
verifying correctness.........ok
I'm not sure if this helps in your particular problem but at least we have some more data to discuss.

CUDA - Optimize mean of matrix rows calculation using shared memory

I am trying to optimize the computation of the mean of each row in my 512w x 1024h image, and then subtract that mean from the row from which it was computed. The code I wrote does it in 1.86 ms, but I want to make it faster. It works fine, but it does not use shared memory and it relies on for loops; I want to do away with them.
__global__ void subtractMean (const float *__restrict__ img, float *lineImg, int height, int width) {
    // height = 1024, width = 512
    int tidy = threadIdx.x + blockDim.x * blockIdx.x;
    float sum = 0.0f;
    float sumDiv = 0.0f;
    if(tidy < height) {
        for(int c = 0; c < width; c++) {
            sum += img[tidy*width + c];
        }
        sumDiv = (sum/width)/2;
        //__syncthreads();
        for(int cc = 0; cc < width; cc++) {
            lineImg[tidy*width + cc] = img[tidy*width + cc] - sumDiv;
        }
    }
    __syncthreads();
}
I called the above kernel using:
subtractMean <<< 2, 512 >>> (originalImage, rowMajorImage, actualImHeight, actualImWidth);
However, the following code I wrote, which uses shared memory for the optimization, does not work as expected. Any thoughts on what the problem might be?
__global__ void subtractMean (const float *__restrict__ img, float *lineImg, int height, int width) {
    extern __shared__ float perRow[];
    int idx = threadIdx.x;    // set idx along x
    int stride = width/2;
    while(idx < width) {
        perRow[idx] = 0;
        idx += stride;
    }
    __syncthreads();
    int tidx = threadIdx.x;   // set idx along x
    int tidy = blockIdx.x;    // set idx along y
    if(tidy < height) {
        while(tidx < width) {
            perRow[tidx] = img[tidy*width + tidx];
            tidx += stride;
        }
    }
    __syncthreads();
    tidx = threadIdx.x;       // reset idx along x
    tidy = blockIdx.x;        // reset idx along y
    if(tidy < height) {
        float sumAllPixelsInRow = 0.0f;
        float sumDiv = 0.0f;
        while(tidx < width) {
            sumAllPixelsInRow += perRow[tidx];
            tidx += stride;
        }
        sumDiv = (sumAllPixelsInRow/width)/2;
        tidx = threadIdx.x;   // reset idx along x
        while(tidx < width) {
            lineImg[tidy*width + tidx] = img[tidy*width + tidx] - sumDiv;
            tidx += stride;
        }
    }
    __syncthreads();
}
The shared memory function was called using:
subtractMean <<< 1024, 256, sizeof(float)*512 >>> (originalImage, rowMajorImage, actualImHeight, actualImWidth);
Two blocks are hardly enough to saturate the GPU. You are heading in the right direction by using more blocks; however, since you are using Kepler, I would like to present an option that does not use shared memory at all.
Start with 32 threads in a block (this can be changed later using 2D blocks)
With those 32 threads you should do something along the lines of this:
int rowID = blockIdx.x;
int tid = threadIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
float sum = 0.0;
while(index < width){
    sum += img[width*rowID + index];
    index += blockDim.x;
}
At this point you will have 32 threads, each holding a partial sum. Next you need to add those together. You can do this without shared memory (since we are within a warp) by using a shuffle reduction. For details, see http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ ; what you want is the warp shuffle reduce, but you need to change it to use the full 32 threads.
Now that thread 0 in each warp has the sum of its row, you can divide that by the width cast to a float, and broadcast it to the rest of the warp with a shuffle, i.e. __shfl(average, 0); see http://docs.nvidia.com/cuda/cuda-c-programming-guide/#warp-description
With the average found and the warp synchronized implicitly and explicitly (with shfl), you can continue in a similar way with the subtraction, as sketched below.
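Putting those pieces together, a minimal sketch of such a kernel could look like the following (an illustration of the approach, not tested against your data; it uses the __shfl_*_sync intrinsics from current CUDA, whereas the Kepler-era post linked above uses the older __shfl forms):
__global__ void subtractMeanWarp(const float *__restrict__ img, float *lineImg, int height, int width) {
    int rowID = blockIdx.x;  // one 32-thread block (a single warp) per row
    int tid = threadIdx.x;
    if (rowID >= height) return;
    // each thread accumulates a partial sum over the row
    float sum = 0.0f;
    for (int c = tid; c < width; c += 32)
        sum += img[rowID * width + c];
    // warp shuffle reduction across the 32 lanes
    for (int offset = 16; offset > 0; offset >>= 1)
        sum += __shfl_down_sync(0xffffffff, sum, offset);
    // lane 0 now holds the full row sum; apply the same scaling as the original kernel
    float sumDiv = (sum / width) / 2;
    // broadcast lane 0's value to the whole warp
    sumDiv = __shfl_sync(0xffffffff, sumDiv, 0);
    // subtract the broadcast value from every element of the row
    for (int c = tid; c < width; c += 32)
        lineImg[rowID * width + c] = img[rowID * width + c] - sumDiv;
}
// launched as, for example: subtractMeanWarp<<<actualImHeight, 32>>>(originalImage, rowMajorImage, actualImHeight, actualImWidth);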
Possible further optimizations would be to include more than one warp in a block to improve occupancy, and to manually unroll the loops over the width to improve instruction level parallelism.
Good Luck.

How to efficiently repeat a vector to a matrix in cuda?

I want to repeat a vector to form a matrix in CUDA, avoiding too many memory copies. Both the vector and the matrix are allocated on the GPU.
For example:
I have a vector:
a = [1 2 3 4]
expand it to a matrix:
b = [1 2 3 4;
1 2 3 4;
.......
1 2 3 4]
What I have tried is assigning each element of b individually, but this involves a lot of GPU-to-GPU memory copies.
I know this is easy in MATLAB (using repmat), but how can I do it efficiently in CUDA? I didn't find any routine in CUBLAS.
EDIT: based on the comments, I've updated the code to a version that will handle either row-major or column-major underlying storage.
Something like this should be reasonably fast:
// for row_major, blocks*threads should be a multiple of vlen
// for column_major, blocks should be equal to vlen
template <typename T>
__global__ void expand_kernel(const T* vector, const unsigned vlen, T* matrix, const unsigned mdim, const unsigned col_major=0){
    if (col_major){
        int idx = threadIdx.x+blockIdx.x*mdim;
        T myval = vector[blockIdx.x];
        while (idx < ((blockIdx.x+1)*mdim)){
            matrix[idx] = myval;
            idx += blockDim.x;
        }
    }
    else{
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        T myval = vector[idx%vlen];
        while (idx < mdim*vlen){
            matrix[idx] = myval;
            idx += gridDim.x*blockDim.x;
        }
    }
}
This assumes your matrix is of dimensions mdim rows x vlen columns (seems to be what you have outlined in the question.)
You can tune the grid and block dimensions to find out what works fastest for your particular GPU. For the row-major case, start with 256 or 512 threads per block, and set the number of blocks equal to or greater than 4 times the number of SMs in your GPU. Choose the product of grid and block dimensions to be equal to an integer multiple of your vector length vlen. If this is difficult, choosing an arbitrary, but "large" threadblock size, such as 250 or 500, should not result in much lost efficiency.
For the column-major case, choose 256 or 512 threads per block, and choose the number of blocks equal to vlen, the vector length. If vlen > 65535, you will need to compile this for compute capability 3.0 or higher. If vlen is small, perhaps less than 32, the efficiency of this method may be significantly reduced. Some mitigation will be found if you increase the threads per block to the maximum for your GPU, either 512 or 1024. There may be other "expand" realizations that may be better suited to the column-major "narrow" matrix case. For example, a straightforward modification to the column-major code would allow two blocks per vector element, or four blocks per vector element, and the total launched blocks would then be 2*vlen or 4*vlen, for example.
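As a concrete illustration of that guidance (a sketch; d_vec and d_mat stand for the device vector and matrix, and the row-major launch assumes vlen <= 1024 so one vector fits in a block):
int dev = 0, numSMs = 0;
cudaGetDevice(&dev);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, dev);
// row-major: blockDim = vlen, so blocks*threads is automatically a multiple of vlen
expand_kernel<<<4*numSMs, vlen>>>(d_vec, vlen, d_mat, mdim);
// column-major: one block per vector element
expand_kernel<<<vlen, 256>>>(d_vec, vlen, d_mat, mdim, 1);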
Here's a fully worked example, along with a run of bandwidth test, to demonstrate that the above code achieves ~90% of the throughput indicated by bandwidthTest:
$ cat t546.cu
#include <stdio.h>
#define W 512
#define H (512*1024)
// for row_major, blocks*threads should be a multiple of vlen
// for column_major, blocks should be equal to vlen
template <typename T>
__global__ void expand_kernel(const T* vector, const unsigned vlen, T* matrix, const unsigned mdim, const unsigned col_major=0){
    if (col_major){
        int idx = threadIdx.x+blockIdx.x*mdim;
        T myval = vector[blockIdx.x];
        while (idx < ((blockIdx.x+1)*mdim)){
            matrix[idx] = myval;
            idx += blockDim.x;
        }
    }
    else{
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        T myval = vector[idx%vlen];
        while (idx < mdim*vlen){
            matrix[idx] = myval;
            idx += gridDim.x*blockDim.x;
        }
    }
}
template <typename T>
__global__ void check_kernel(const T* vector, const unsigned vlen, T* matrix, const unsigned mdim, const unsigned col_major=0){
    unsigned i = 0;
    while (i<(vlen*mdim)){
        unsigned idx = (col_major)?(i/mdim):(i%vlen);
        if (matrix[i] != vector[idx]) {printf("mismatch at offset %d\n",i); return;}
        i++;}
}
int main(){
    int *v, *m;
    cudaMalloc(&v, W*sizeof(int));
    cudaMalloc(&m, W*H*sizeof(int));
    int *h_v = (int *)malloc(W*sizeof(int));
    for (int i = 0; i < W; i++)
        h_v[i] = i;
    cudaMemcpy(v, h_v, W*sizeof(int), cudaMemcpyHostToDevice);
    // test row-major
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    expand_kernel<<<44, W>>>(v, W, m, H);
    cudaEventRecord(stop);
    float et;
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&et, start, stop);
    printf("row-majortime: %fms, bandwidth: %.0fMB/s\n", et, W*H*sizeof(int)/(1024*et));
    check_kernel<<<1,1>>>(v, W, m, H);
    cudaDeviceSynchronize();
    // test col-major
    cudaEventRecord(start);
    expand_kernel<<<W, 256>>>(v, W, m, H, 1);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&et, start, stop);
    printf("col-majortime: %fms, bandwidth: %.0fMB/s\n", et, W*H*sizeof(int)/(1024*et));
    check_kernel<<<1,1>>>(v, W, m, H, 1);
    cudaDeviceSynchronize();
    return 0;
}
$ nvcc -arch=sm_20 -o t546 t546.cu
$ ./t546
row-majortime: 13.066944ms, bandwidth: 80246MB/s
col-majortime: 12.806720ms, bandwidth: 81877MB/s
$ /usr/local/cuda/samples/bin/x86_64/linux/release/bandwidthTest
[CUDA Bandwidth Test] - Starting...
Running on...
Device 0: Quadro 5000
Quick Mode
Host to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 5864.2
Device to Host Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 6333.1
Device to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 88178.6
Result = PASS
$
CUDA 6.5, RHEL 5.5
This can also be implemented using a CUBLAS Rank-1 update function but it will be considerably slower than the above method.
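For completeness, here is a sketch of that CUBLAS alternative (my illustration, not a drop-in routine: the matrix is treated as a column-major mdim x vlen array that must be zeroed beforehand, and the helper name expand_with_cublas is just for the example):
#include <cublas_v2.h>
#include <vector>
// d_mat: mdim x vlen, column-major, assumed zero-initialized (e.g. via cudaMemset)
void expand_with_cublas(cublasHandle_t handle, const float *d_vec, unsigned vlen, float *d_mat, unsigned mdim){
    std::vector<float> h_ones(mdim, 1.0f);
    float *d_ones;
    cudaMalloc(&d_ones, mdim*sizeof(float));
    cudaMemcpy(d_ones, h_ones.data(), mdim*sizeof(float), cudaMemcpyHostToDevice);
    const float alpha = 1.0f;
    // rank-1 update: d_mat = alpha * ones * d_vec^T + d_mat
    cublasSger(handle, mdim, vlen, &alpha, d_ones, 1, d_vec, 1, d_mat, mdim);
    cudaFree(d_ones);
}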