Why does transposing a CUDA grid (but not its threadblocks) still slow down computation? - c++

EDIT: It seems that, at least in this case, transposing the grid has a negative effect on L2 cache bandwidth. This was observed with the Visual Profiler; the reason is not yet clear to me.
I have come across a GPU computing situation that requires transposing a CUDA grid. So, if block_{x,y} originally acted on data region d_{x,y}, it now acts on data region d_{y,x}; therefore, block_{y,x} would act on data region d_{x,y}. An example is presented in the following figure.
It is worth mentioning that threads are not transposed inside each block, that is, once the block is located, the threadIdx.x and threadIdx.y values are used in a normal way for their x and y offsets, respectively.
From what I know, in theory this design should do no harm to performance, as the memory coalescing pattern is still preserved, i.e., threads inside a block are not transposed; it is just the grid that rearranges its blocks. However, I found that when transposing the grid, the kernel runs approx. 2X slower than in the normal case. I made a toy example to illustrate the situation.
➜ transpose-grid ./prog 10000 10000 100 0
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
normal_kernel (100 rep).......done: 0.935132 ms
verifying correctness.........ok
➜ transpose-grid ./prog 10000 10000 100 1
init data.....................done: zero matrix of 10000 x 10000
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(313, 313, 1)
transp_kernel (100 rep).......done: 1.980445 ms
verifying correctness.........ok
I would really appreciate any explanation for this issue. Here is the source code to reproduce the behavior.
// -----------------------------------
// can compile as nvcc main.cu -o prog
// -----------------------------------
#include <cuda.h>
#include <cstdio>
#define BSIZE2D 32
__global__ void normal_kernel(int *dmat, const int m, const int n){
const int i = blockIdx.y*blockDim.y + threadIdx.y;
const int j = blockIdx.x*blockDim.x + threadIdx.x;
if(i < m && j < n){
dmat[i*n + j] = 1;
}
}
__global__ void transp_kernel(int *dmat, const int m, const int n){
const int i = blockIdx.x*blockDim.x + threadIdx.y;
const int j = blockIdx.y*blockDim.y + threadIdx.x;
if(i < m && j < n){
dmat[i*n + j] = 1;
}
}
int verify(int *hmat, const int m, const int n){
printf("verifying correctness........."); fflush(stdout);
for(int i=0; i<m*n; ++i){
if(hmat[i] != 1){
fprintf(stderr, "Incorrect value at m[%i,%i] = %i\n", i/n, i%n);
return 0;
}
}
printf("ok\n"); fflush(stdout);
return 1;
}
int main(int argc, char **argv){
if(argc != 5){
printf("\nrun as ./prog m n r t\n\nr = number of repeats\nt = transpose (1 or 0)\n");
exit(EXIT_FAILURE);
}
const int m = atoi(argv[1]);
const int n = atoi(argv[2]);
const int r = atoi(argv[3]);
const int t = atoi(argv[4]);
const unsigned int size = m*n;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float time;
int *hmat, *dmat;
printf("init data....................."); fflush(stdout);
hmat = (int*)malloc(sizeof(int)*(size));
for(int i=0; i<size; ++i){
hmat[i] = 0;
}
printf("done: zero matrix of %i rows x %i cols\n", m, n);
printf("copy data to GPU.............."); fflush(stdout);
cudaMalloc(&dmat, sizeof(int)*(size));
cudaMemcpy(dmat, hmat, sizeof(int)*(size), cudaMemcpyHostToDevice);
printf("done\n");
printf("preparing grid................"); fflush(stdout);
dim3 block(BSIZE2D, BSIZE2D, 1);
dim3 grid;
// if transpose or not
if(t){
grid = dim3((m + BSIZE2D - 1)/BSIZE2D, (n + BSIZE2D - 1)/BSIZE2D, 1);
}
else{
grid = dim3((n + BSIZE2D - 1)/BSIZE2D, (m + BSIZE2D - 1)/BSIZE2D, 1);
}
printf("done: block(%i, %i, %i), grid(%i, %i, %i)\n", block.x, block.y, block.z, grid.x, grid.y, grid.z);
if(t){
printf("transp_kernel (%3i rep).......", r); fflush(stdout);
cudaEventRecord(start, 0);
for(int i=0; i<r; ++i){
transp_kernel<<<grid, block>>>(dmat, m, n);
cudaDeviceSynchronize();
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop); // that's our time!
printf("done: %f ms\n", time/(float)r);
}
else{
printf("normal_kernel (%3i rep).......", r); fflush(stdout);
cudaEventRecord(start, 0);
for(int i=0; i<r; ++i){
normal_kernel<<<grid, block>>>(dmat, m, n);
cudaDeviceSynchronize();
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop); // that's our time!
printf("done: %f ms\n", time/(float)r);
}
cudaMemcpy(hmat, dmat, sizeof(int)*size, cudaMemcpyDeviceToHost);
verify(hmat, m, n);
exit(EXIT_SUCCESS);
}

Since I could not find any literature on this topic, here is my guess at an explanation, based on experience rather than references (an old problem of mine with memory read speed).
As you wrote, your example preserves the memory coalescing pattern, but only at the warp level (32 consecutive threads). To achieve full speed, however, coalescing at the inter-warp level also matters, and here it is not clear whether such coalescing actually happens or whether the cache and memory simply behave better in this scenario (probably, as described here, we get better utilization of the memory burst mode).
So in your normal_kernel execution, not only is each single warp coalesced, but so are the warps from the neighboring block(s).
To check this on your example, I modified your code to use different block sizes (a sketch of the change is shown at the end of this answer); here are my results on a 1080 Ti:
Block size (32, 32) same as yours:
~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(320, 320, 1)
normal_kernel (100 rep).......done: 1.020545 ms
verifying correctness.........ok
~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(32, 32, 1), grid(320, 320, 1)
transp_kernel (100 rep).......done: 1.564084 ms
verifying correctness.........ok
Block size (64, 16) - unfortunately we cannot create (64, 64) because of the limit on the number of threads per block:
~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(64, 16, 1), grid(160, 640, 1)
normal_kernel (100 rep).......done: 1.020420 ms
verifying correctness.........ok
~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(64, 16, 1), grid(160, 640, 1)
transp_kernel (100 rep).......done: 1.205506 ms
verifying correctness.........ok
Block size (128, 8):
~$ ./prog 10240 10240 100 0
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(128, 8, 1), grid(80, 1280, 1)
normal_kernel (100 rep).......done: 1.019547 ms
verifying correctness.........ok
~$ ./prog 10240 10240 100 1
init data.....................done: zero matrix of 10240 rows x 10240 cols
copy data to GPU..............done
preparing grid................done: block(128, 8, 1), grid(80, 1280, 1)
transp_kernel (100 rep).......done: 1.058236 ms
verifying correctness.........ok
I'm not sure if this helps in your particular problem but at least we have some more data to discuss.
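For reference, a minimal sketch of the block-size change used for the runs above, covering only the normal (non-transposed) case and assuming BSIZE2D is replaced by separate X/Y constants (the transposed kernel and its grid need the corresponding adjustment):
#define BSIZEX 128  // e.g. (64, 16) or (128, 8); BSIZEX*BSIZEY must not exceed 1024 threads
#define BSIZEY 8
dim3 block(BSIZEX, BSIZEY, 1);
dim3 grid((n + BSIZEX - 1)/BSIZEX, (m + BSIZEY - 1)/BSIZEY, 1);  // gives grid(80, 1280, 1) for 10240 x 10240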

Related

2D tiled convolution taking more time than untiled version

I'm writing code that performs a 2D convolution on a float matrix, in both a tiled and an untiled version. I'm taking the width of the tile as BLOCK_SIZE - MASK_WIDTH + 1, using halo cells.
But for a 1024 matrix and masks varying from 3 to 9, I find the untiled version performing better: [timing plots: untiled version vs. tiled]
Both the matrix and the mask are defined in a fixed manner, the same for the tiled and untiled versions; no random values or sizes are used.
I guess I'm making some wrong assumption about the tile size, but even after doing some research the implementation seems legitimate.
#define MATRIX_SIZE 1024
#define BLOCK_WIDTH 32
Here's the kernel code for the tiled version
__global__ void convolution_2D_tiled(float* in, const float* __restrict__ mask, float* out, size_t mask_width, size_t w, size_t h) {
float outputPixel = 0; //minimize write to global memory: stored in register
int tx = threadIdx.x;
int ty = threadIdx.y;
int tile_width = BLOCK_WIDTH - mask_width + 1; //since BLOCK_WIDTH = TILE_WIDTH + MASK_WIDTH - 1
int col = blockIdx.x * tile_width + tx;
int row = blockIdx.y * tile_width + ty;
//picking the starting indexes of input matrix inside the mask
//(TOP-LEFT of the mask)
int inputRow = row - (mask_width / 2);
int inputCol = col - (mask_width / 2);
__shared__ float tile[BLOCK_WIDTH][BLOCK_WIDTH];
// Load tile elements
if (inputRow >= 0 && inputRow < h && inputCol >= 0 && inputCol < w)
tile[ty][tx] = in[inputRow * w + inputCol];
else
tile[ty][tx] = 0.0;
// Wait until all tile elements are loaded
__syncthreads();
//some thread won't write any outputs, only need to calculate tile_width elements
if (col < w && row < h && ty < tile_width && tx < tile_width) {
//get the neighbour in the mask
for (int i = 0; i < mask_width; ++i) {
for (int j = 0; j < mask_width; ++j) { //(Mask_Width^2) access for each thread in block -> for each block (Mask_Width^2) * (Block_width^2)
outputPixel += tile[i + ty][j + tx] * mask[i * mask_width + j];
}
}
out[(row * w) + col] = (float)(outputPixel);
}
}
The main with the matrix generation and sizes assumptions:
void errorCheck(unsigned int line){
cudaError_t cudaError = cudaGetLastError();
// if error code wasn't a code describing success
if (cudaError != cudaSuccess)
{
// output that there has been a CUDA error in the line of the CUDA function call
// and exit the program
printf("CUDA error in line %u in file %s: %s\n", line - 1, __FILE__, cudaGetErrorString(cudaError));
exit(EXIT_FAILURE);
}
}
int main(int argc, char const* argv[]){
for (size_t mask_width = 3; mask_width <= 9; mask_width += 2) {
printf("Testing with mask size = %d\n\n", mask_width);
float* a;
float* b;
float* c;
cudaMallocManaged((void **) &a, sizeof(float)*MATRIX_SIZE*MATRIX_SIZE);
cudaMallocManaged((void **) &b, sizeof(int)*mask_width*mask_width);
cudaMallocManaged((void **) &c, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE);
// initialize matrix A
for (int i = 0; i < MATRIX_SIZE; ++i) {
for (int j = 0; j < MATRIX_SIZE; ++j) {
a[i * MATRIX_SIZE + j] = (float)(1 +(3 * j % 20));
}
}
// initialize matrix B
for (int i = 0; i < mask_width; ++i) {
for (int j = 0; j < mask_width; ++j) {
b[i * mask_width + j] = (float)(1 + (((2 * i) + j) % mask_width));
}
}
float naive_gpu_elapsed_time_ms;
// some events to count the execution time
//clock_t st, end;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int tile_width = BLOCK_WIDTH - mask_width + 1;
dim3 dimGrid(MATRIX_SIZE / tile_width, MATRIX_SIZE / tile_width);
dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH);
errorCheck(__LINE__);
cudaEventRecord(start, 0);
convolution_2D_tiled <<<dimGrid, dimBlock >>> (a, b, c, mask_width, MATRIX_SIZE, MATRIX_SIZE);
errorCheck(__LINE__);
cudaThreadSynchronize();
//time counting terminate
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
//compute time elapsed on GPU computing
cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop);
printf("Time elapsed on naive GPU convolution 2d tiled ( %d ) block %f ms.\n\n", BLOCK_WIDTH, naive_gpu_elapsed_time_ms);
//free memory
cudaFree(a);
cudaFree(b);
cudaFree(c);
printf("________________________________________________________________________\n\n");
}
return 0;
}
I'm using Google Colab with a Tesla T4 GPU, and no CUDA error is thrown.
I also tried using bigger masks (11, 15, ...), but there was no change in the comparison between tiled and untiled.
You are making inefficient use of managed memory, as discussed here and here.
Nearly all of your ~2ms of execution time is used in inefficient demand-paged copying of data from host to device. As a result, your ability to resolve the difference in performance in the two cases due to the device code changes is almost completely obscured.
If you add these 3 lines of code immediately before float naive_gpu_elapsed_time_ms;, you will observe that your reported execution times decrease dramatically, and you should be able to better judge the performance difference between the shared memory tiled version and the non-tiled version:
cudaMemPrefetchAsync(a, sizeof(float)*MATRIX_SIZE*MATRIX_SIZE, 0);
cudaMemPrefetchAsync(b, sizeof(int)*mask_width*mask_width, 0);
cudaMemPrefetchAsync(c, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE, 0);
You haven't shown your non-tiled code, so I can't demonstrate that for you. Here's an example profiling output using a non-tiled convolution code that I wrote, comparing to your tiled kernel, and including the cudaMemPrefetchAsync() statements:
$ nvprof ./t2140
Testing with mask size = 3
==13236== NVPROF is profiling process 13236, command: ./t2140
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.032832 ms.
________________________________________________________________________
Testing with mask size = 5
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.061120 ms.
________________________________________________________________________
Testing with mask size = 7
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.086080 ms.
________________________________________________________________________
Testing with mask size = 9
Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 0.118688 ms.
________________________________________________________________________
==13236== Profiling application: ./t2140
==13236== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 52.59% 311.69us 4 77.922us 41.089us 119.08us convolution_2D(float*, float const *, float*, unsigned long, unsigned long, unsigned long)
47.41% 280.97us 4 70.241us 28.449us 114.28us convolution_2D_tiled(float*, float const *, float*, unsigned long, unsigned long, unsigned long)
API calls: 96.10% 365.32ms 12 30.443ms 12.906us 365.10ms cudaMallocManaged
1.32% 5.0301ms 4 1.2575ms 586.91us 3.2433ms cuDeviceTotalMem
0.66% 2.4917ms 404 6.1670us 320ns 268.82us cuDeviceGetAttribute
0.56% 2.1277ms 12 177.31us 8.3020us 578.90us cudaMemPrefetchAsync
0.50% 1.9035ms 4 475.88us 295.08us 549.01us cudaDeviceSynchronize
0.49% 1.8594ms 12 154.95us 75.533us 328.85us cudaFree
0.14% 526.53us 4 131.63us 42.014us 220.14us cudaEventSynchronize
0.11% 399.28us 4 99.820us 61.310us 210.74us cuDeviceGetName
0.09% 351.52us 8 43.940us 11.426us 116.52us cudaLaunchKernel
0.01% 45.911us 8 5.7380us 4.1870us 10.243us cudaEventRecord
0.01% 25.946us 8 3.2430us 935ns 10.182us cudaEventCreate
0.01% 21.643us 4 5.4100us 3.1450us 8.6700us cuDeviceGetPCIBusId
0.00% 10.304us 8 1.2880us 430ns 5.0980us cuDeviceGet
0.00% 9.6790us 4 2.4190us 1.9560us 3.7180us cudaEventElapsedTime
0.00% 3.3390us 3 1.1130us 617ns 1.6520us cuDeviceGetCount
0.00% 3.2480us 4 812ns 700ns 1.0470us cuDeviceGetUuid
0.00% 3.1420us 8 392ns 229ns 1.2110us cudaGetLastError
==13236== Unified Memory profiling result:
Device "Tesla V100-PCIE-32GB (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
12 1.3346MB 4.0000KB 2.0000MB 16.01563MB 1.405760ms Host To Device
Total CPU Page faults: 52
$
You can see that in each case, the tiled/shared memory kernel is faster.
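For context, the non-tiled kernel being compared against might look roughly like the sketch below (an assumption on my part; the actual convolution_2D code used for the profile above was not shown). Each thread computes one output pixel directly from global memory:
__global__ void convolution_2D(float* in, const float* __restrict__ mask, float* out, size_t mask_width, size_t w, size_t h) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col < w && row < h) {
        float outputPixel = 0.0f;
        int half = (int)(mask_width / 2);
        // accumulate the mask window around (row, col), skipping out-of-range neighbours
        for (int i = 0; i < mask_width; ++i) {
            for (int j = 0; j < mask_width; ++j) {
                int r = row - half + i;
                int c = col - half + j;
                if (r >= 0 && r < h && c >= 0 && c < w)
                    outputPixel += in[r * w + c] * mask[i * mask_width + j];
            }
        }
        out[row * w + col] = outputPixel;
    }
}
// launched e.g. with dim3 block(32, 32) and dim3 grid((w + 31) / 32, (h + 31) / 32)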

How to fix my block and grid layout to handle large data?

I'm trying to implement the naive closest-pair algorithm on 3D coordinates.
The input is two files, each containing 3 floats per line.
I handle the input with float3* variables.
float3* teamA;
float3* teamB;
float3* results;
handleFileInput(argv[1], argv[2], teamA, teamB, numPoints);
results = new float3[numPoints[0]];
After this, I allocated device memory and copied the host data to the device like this:
#define CHECKERROR(val) { if (val != cudaSuccess) {fprintf(stderr, "Error %s at line %d in file %s\n", cudaGetErrorString(val), __LINE__, __FILE__); exit(1);} }
CHECKERROR(cudaMalloc(&d_tA, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMemset(d_tA, 0, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMalloc(&d_tB, sizeof(float3) * numPoints[1]));
CHECKERROR(cudaMemset(d_tB, 0, sizeof(float3) * numPoints[1]));
CHECKERROR(cudaMalloc(&d_results, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMemset(d_results, 0, sizeof(float3) * numPoints[0]));
CHECKERROR(cudaMemcpy(d_tA, teamA, sizeof(float3) * numPoints[0], cudaMemcpyHostToDevice));
CHECKERROR(cudaMemcpy(d_tB, teamB, sizeof(float3) * numPoints[1], cudaMemcpyHostToDevice));
I set my block and grid like this:
dim3 block(512);
dim3 grid(ceil((float)numPoints[0] / 512));
naive_algorithm <<< block, grid >>> (d_tA, d_tB, d_results, numPoints[0], numPoints[1]);
My kernel code is simple, like this:
__global__ void naive_algorithm(float3* d_tA, float3* d_tB, float3* d_r, int a_size, int b_size)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < a_size)
{
float min_distance = -1;
for (int y = 0; y < b_size; y++)
{
float i = MUL(SUB(d_tA[idx].x, d_tB[y].x), SUB(d_tA[idx].x, d_tB[y].x));
float j = MUL(SUB(d_tA[idx].y, d_tB[y].y), SUB(d_tA[idx].y, d_tB[y].y));
float k = MUL(SUB(d_tA[idx].z, d_tB[y].z), SUB(d_tA[idx].z, d_tB[y].z));
float distance = SQRT(ADD(ADD(i, j), k));
if (min_distance > distance || min_distance == -1)
{
d_r[idx].x = (float)idx;
d_r[idx].y = (float)y;
d_r[idx].z = distance;
min_distance = distance;
}
}
__syncthreads();
}
}
Environment : RTX 2080Ti
There are five different data samples:
Team A - 1000000 points / Team B - 500000 points -> Test Failed
Team A - 700000 points / Team B - 500000 points -> Test Failed
Team A - 500000 points / Team B - 300000 points -> Test OK!
Team A - 500000 points / Team B - 100000 points -> Test OK!
Team A - 300000 points / Team B - 100000 points -> Test OK!
In my opinion this is caused by the thread layout.
Do I have to change the block / grid layout 1D by 1D -> 2D by 2D?
Then how should I set my grid layout?
As Robert Crovella said, this was just a typo, my mistake: the grid and block arguments are swapped in the kernel launch.
Because one block can hold at most 1024 threads (while a grid dimension can be much larger - at least 65535), the launch fails whenever numPoints[0] / BLOCK_SIZE, which the swap was passing as the block size, is bigger than 1024.
Thanks a lot for checking my code!
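For the record, a minimal sketch of the corrected launch (grid first, block second), using the variable names from the question:
dim3 block(512);
dim3 grid((numPoints[0] + block.x - 1) / block.x);  // integer ceiling division
naive_algorithm<<<grid, block>>>(d_tA, d_tB, d_results, numPoints[0], numPoints[1]);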

Cuda: XOR single bitset with array of bitsets

I want to XOR a single bitset with a bunch of other bitsets (~100k of them) and count the set bits of every XOR result. The size of a single bitset is around 20k bits.
The bitsets have already been converted to arrays of unsigned int so that the intrinsic __popc() function can be used. The 'bunch' already resides contiguously in device memory.
My current kernel code looks like this:
// Grid/Blocks used for kernel invocation
dim3 block(32);
dim3 grid((bunch_size / 31) + 32);
__global__ void kernelXOR(uint * bitset, uint * bunch, int * set_bits, int bitset_size, int bunch_size) {
int tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid < bunch_size){ // 1 Thread for each bitset in the 'bunch'
int sum = 0;
uint xor_res = 0;
for (int i = 0; i < bitset_size; ++i){ // Iterate through every uint-block of the bitsets
xor_res = bitset[i] ^ bunch[bitset_size * tid + i];
sum += __popc(xor_res);
}
set_bits[tid] = sum;
}
}
However, compared to a parallelized C++/Boost version, I see no benefit from using CUDA.
Is there any potential in optimizing this kernel?
Is there any potential in optimizing this kernel?
I see 2 problems here (and they are the first two classical primary optimizations objectives for any CUDA programmer):
You want to try to efficiently use global memory. Your accesses to bitset and bunch are not coalesced. (efficiently use the memory subsystems)
The use of 32 threads per block is generally not recommended and could limit your overall occupancy. One thread per bitset is also potentially problematic. (expose enough parallelism)
Whether addressing those issues will meet your definition of benefit is impossible to say without a comparison test case. Furthermore, simple memory-bound problems like this are rarely interesting in CUDA when considered by themselves. However, we can (probably) improve the performance of your kernel.
We'll use a laundry list of ideas:
have each block handle a bitset, rather than each thread, to enable coalescing
use shared memory to load the comparison bitset, and reuse it
use just enough blocks to saturate the GPU, along with striding loops
use const ... __restrict__ style decoration to possibly benefit from RO cache
Here's a worked example:
$ cat t1649.cu
#include <iostream>
#include <cstdlib>
const int my_bitset_size = 20000/(32);
const int my_bunch_size = 100000;
typedef unsigned uint;
//using one thread per bitset in the bunch
__global__ void kernelXOR(uint * bitset, uint * bunch, int * set_bits, int bitset_size, int bunch_size) {
int tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid < bunch_size){ // 1 Thread for each bitset in the 'bunch'
int sum = 0;
uint xor_res = 0;
for (int i = 0; i < bitset_size; ++i){ // Iterate through every uint-block of the bitsets
xor_res = bitset[i] ^ bunch[bitset_size * tid + i];
sum += __popc(xor_res);
}
set_bits[tid] = sum;
}
}
const int nTPB = 256;
// one block per bitset, multiple bitsets per block
__global__ void kernelXOR_imp(const uint * __restrict__ bitset, const uint * __restrict__ bunch, int * __restrict__ set_bits, int bitset_size, int bunch_size) {
__shared__ uint sbitset[my_bitset_size]; // could also be dynamically allocated for varying bitset sizes
__shared__ int ssum[nTPB];
// load shared, block-stride loop
for (int idx = threadIdx.x; idx < bitset_size; idx += blockDim.x) sbitset[idx] = bitset[idx];
__syncthreads();
// stride across all bitsets in bunch
for (int bidx = blockIdx.x; bidx < bunch_size; bidx += gridDim.x){
int my_sum = 0;
for (int idx = threadIdx.x; idx < bitset_size; idx += blockDim.x) my_sum += __popc(sbitset[idx] ^ bunch[bidx*bitset_size + idx]);
// block level parallel reduction
ssum[threadIdx.x] = my_sum;
for (int ridx = nTPB>>1; ridx > 0; ridx >>=1){
__syncthreads();
if (threadIdx.x < ridx) ssum[threadIdx.x] += ssum[threadIdx.x+ridx];}
if (!threadIdx.x) set_bits[bidx] = ssum[0];}
}
int main(){
// data setup
uint *d_cbitset, *d_bitsets, *h_cbitset, *h_bitsets;
int *d_r, *h_r, *h_ri;
h_cbitset = new uint[my_bitset_size];
h_bitsets = new uint[my_bitset_size*my_bunch_size];
h_r = new int[my_bunch_size];
h_ri = new int[my_bunch_size];
for (int i = 0; i < my_bitset_size*my_bunch_size; i++){
h_bitsets[i] = rand();
if (i < my_bitset_size) h_cbitset[i] = rand();}
cudaMalloc(&d_cbitset, my_bitset_size*sizeof(uint));
cudaMalloc(&d_bitsets, my_bitset_size*my_bunch_size*sizeof(uint));
cudaMalloc(&d_r, my_bunch_size*sizeof(int));
cudaMemcpy(d_cbitset, h_cbitset, my_bitset_size*sizeof(uint), cudaMemcpyHostToDevice);
cudaMemcpy(d_bitsets, h_bitsets, my_bitset_size*my_bunch_size*sizeof(uint), cudaMemcpyHostToDevice);
// original
// Grid/Blocks used for kernel invocation
dim3 block(32);
dim3 grid((my_bunch_size / 31) + 32);
kernelXOR<<<grid, block>>>(d_cbitset, d_bitsets, d_r, my_bitset_size, my_bunch_size);
cudaMemcpy(h_r, d_r, my_bunch_size*sizeof(int), cudaMemcpyDeviceToHost);
// improved
dim3 iblock(nTPB);
dim3 igrid(640);
kernelXOR_imp<<<igrid, iblock>>>(d_cbitset, d_bitsets, d_r, my_bitset_size, my_bunch_size);
cudaMemcpy(h_ri, d_r, my_bunch_size*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < my_bunch_size; i++)
if (h_r[i] != h_ri[i]) {std::cout << "mismatch at i: " << i << " was: " << h_ri[i] << " should be: " << h_r[i] << std::endl; return 0;}
std::cout << "Results match." << std::endl;
return 0;
}
$ nvcc -o t1649 t1649.cu
$ cuda-memcheck ./t1649
========= CUDA-MEMCHECK
Results match.
========= ERROR SUMMARY: 0 errors
$ nvprof ./t1649
==18868== NVPROF is profiling process 18868, command: ./t1649
Results match.
==18868== Profiling application: ./t1649
==18868== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 97.06% 71.113ms 2 35.557ms 2.3040us 71.111ms [CUDA memcpy HtoD]
2.26% 1.6563ms 1 1.6563ms 1.6563ms 1.6563ms kernelXOR(unsigned int*, unsigned int*, int*, int, int)
0.59% 432.68us 1 432.68us 432.68us 432.68us kernelXOR_imp(unsigned int const *, unsigned int const *, int*, int, int)
0.09% 64.770us 2 32.385us 31.873us 32.897us [CUDA memcpy DtoH]
API calls: 78.20% 305.44ms 3 101.81ms 11.373us 304.85ms cudaMalloc
18.99% 74.161ms 4 18.540ms 31.554us 71.403ms cudaMemcpy
1.39% 5.4121ms 4 1.3530ms 675.30us 3.3410ms cuDeviceTotalMem
1.26% 4.9393ms 388 12.730us 303ns 530.95us cuDeviceGetAttribute
0.11% 442.37us 4 110.59us 102.61us 125.59us cuDeviceGetName
0.03% 128.18us 2 64.088us 21.789us 106.39us cudaLaunchKernel
0.01% 35.764us 4 8.9410us 2.9670us 18.982us cuDeviceGetPCIBusId
0.00% 8.3090us 8 1.0380us 540ns 1.3870us cuDeviceGet
0.00% 5.9530us 3 1.9840us 310ns 3.9900us cuDeviceGetCount
0.00% 2.8800us 4 720ns 574ns 960ns cuDeviceGetUuid
$
In this case, on my Tesla V100, for your problem size, I witness about a 4x improvement in kernel performance. However, the kernel performance here is tiny compared to the cost of data movement, so it's unlikely that this sort of optimization would make a significant difference in your comparison test case, if this is the only thing you are doing on the GPU.
The code above uses striding-loops at the block level and at the grid level, which means it should behave correctly for almost any choice of threadblock size (multiple of 32 please) as well as grid size. That doesn't mean that any/all choices will perform equally. The choice of the threadblock size is to allow the possibility for nearly full occupancy (so don't choose 32). The choice of the grid size is the number of blocks to achieve full occupancy per SM, times the number of SMs. These should be nearly optimal choices, but according to my testing e.g. a larger number of blocks doesn't really reduce performance, and the performance should be roughly constant for nearly any threadblock size (except 32), assuming the number of blocks is calculated accordingly.
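As a rough illustration (not part of the original answer), that grid-size choice can be computed at runtime with the occupancy API instead of being hard-coded:
int dev = 0, numSMs = 0, blocksPerSM = 0;
cudaGetDevice(&dev);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, dev);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, kernelXOR_imp, nTPB, 0);
dim3 igrid(blocksPerSM * numSMs);  // e.g. 8 blocks/SM x 80 SMs = 640 on a V100, matching the value used above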

How to efficiently repeat a vector to a matrix in cuda?

I want to repeat a vector to form a matrix in CUDA, avoiding too many memory copies. Both the vector and the matrix are allocated on the GPU.
For example:
I have a vector:
a = [1 2 3 4]
expand it to a matrix:
b = [1 2 3 4;
1 2 3 4;
.......
1 2 3 4]
What I have tried is assigning each element of b individually, but this involves a lot of GPU-memory-to-GPU-memory copying.
I know this is easy in MATLAB (using repmat), but how can I do it efficiently in CUDA? I didn't find any routine for it in cuBLAS.
EDIT: based on the comments, I've updated the code to a version that will handle either row-major or column-major underlying storage.
Something like this should be reasonably fast:
// for row_major, blocks*threads should be a multiple of vlen
// for column_major, blocks should be equal to vlen
template <typename T>
__global__ void expand_kernel(const T* vector, const unsigned vlen, T* matrix, const unsigned mdim, const unsigned col_major=0){
if (col_major){
int idx = threadIdx.x+blockIdx.x*mdim;
T myval = vector[blockIdx.x];
while (idx < ((blockIdx.x+1)*mdim)){
matrix[idx] = myval;
idx += blockDim.x;
}
}
else{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
T myval = vector[idx%vlen];
while (idx < mdim*vlen){
matrix[idx] = myval;
idx += gridDim.x*blockDim.x;
}
}
}
This assumes your matrix is of dimensions mdim rows x vlen columns (seems to be what you have outlined in the question.)
You can tune the grid and block dimensions to find out what works fastest for your particular GPU. For the row-major case, start with 256 or 512 threads per block, and set the number of blocks equal to or greater than 4 times the number of SMs in your GPU. Choose the product of grid and block dimensions to be equal to an integer multiple of your vector length vlen. If this is difficult, choosing an arbitrary, but "large" threadblock size, such as 250 or 500, should not result in much lost efficiency.
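As a sketch (not from the original answer), using the names from the worked example below, that row-major heuristic could be written as:
int dev = 0, numSMs = 0;
cudaGetDevice(&dev);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, dev);
int threads = W;           // 512 here, so blocks*threads is automatically a multiple of the vector length
int blocks  = 4 * numSMs;  // "equal to or greater than 4 times the number of SMs"
expand_kernel<<<blocks, threads>>>(v, W, m, H);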
For the column-major case, choose 256 or 512 threads per block, and choose the number of blocks equal to vlen, the vector length. If vlen > 65535, you will need to compile this for compute capability 3.0 or higher. If vlen is small, perhaps less than 32, the efficiency of this method may be significantly reduced. Some mitigation will be found if you increase the threads per block to the maximum for your GPU, either 512 or 1024. There may be other "expand" realizations that may be better suited to the column-major "narrow" matrix case. For example, a straightforward modification to the column-major code would allow two blocks per vector element, or four blocks per vector element, and the total launched blocks would then be 2*vlen or 4*vlen, for example.
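For illustration, here is a minimal sketch (not from the original answer) of the "two blocks per vector element" variant mentioned above: blocks 2k and 2k+1 both read vector[k] and each fills one half of column k, so 2*vlen blocks are launched in total.
template <typename T>
__global__ void expand_kernel_col2(const T* vector, const unsigned vlen, T* matrix, const unsigned mdim){
    const unsigned col  = blockIdx.x >> 1;  // which vector element / matrix column
    const unsigned half = blockIdx.x & 1;   // which half of that column this block fills
    if (col >= vlen) return;
    const unsigned begin = col*mdim + half*(mdim/2);
    const unsigned end   = col*mdim + (half ? mdim : mdim/2);
    T myval = vector[col];
    for (unsigned idx = begin + threadIdx.x; idx < end; idx += blockDim.x)
        matrix[idx] = myval;
}
// launched as: expand_kernel_col2<<<2*vlen, threads>>>(vector, vlen, matrix, mdim);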
Here's a fully worked example of the expand_kernel above, along with a run of bandwidthTest, to demonstrate that it achieves ~90% of the throughput indicated by bandwidthTest:
$ cat t546.cu
#include <stdio.h>
#define W 512
#define H (512*1024)
// for row_major, blocks*threads should be a multiple of vlen
// for column_major, blocks should be equal to vlen
template <typename T>
__global__ void expand_kernel(const T* vector, const unsigned vlen, T* matrix, const unsigned mdim, const unsigned col_major=0){
if (col_major){
int idx = threadIdx.x+blockIdx.x*mdim;
T myval = vector[blockIdx.x];
while (idx < ((blockIdx.x+1)*mdim)){
matrix[idx] = myval;
idx += blockDim.x;
}
}
else{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
T myval = vector[idx%vlen];
while (idx < mdim*vlen){
matrix[idx] = myval;
idx += gridDim.x*blockDim.x;
}
}
}
template <typename T>
__global__ void check_kernel(const T* vector, const unsigned vlen, T* matrix, const unsigned mdim, const unsigned col_major=0){
unsigned i = 0;
while (i<(vlen*mdim)){
unsigned idx = (col_major)?(i/mdim):(i%vlen);
if (matrix[i] != vector[idx]) {printf("mismatch at offset %d\n",i); return;}
i++;}
}
int main(){
int *v, *m;
cudaMalloc(&v, W*sizeof(int));
cudaMalloc(&m, W*H*sizeof(int));
int *h_v = (int *)malloc(W*sizeof(int));
for (int i = 0; i < W; i++)
h_v[i] = i;
cudaMemcpy(v, h_v, W*sizeof(int), cudaMemcpyHostToDevice);
// test row-major
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
expand_kernel<<<44, W>>>(v, W, m, H);
cudaEventRecord(stop);
float et;
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
printf("row-majortime: %fms, bandwidth: %.0fMB/s\n", et, W*H*sizeof(int)/(1024*et));
check_kernel<<<1,1>>>(v, W, m, H);
cudaDeviceSynchronize();
// test col-major
cudaEventRecord(start);
expand_kernel<<<W, 256>>>(v, W, m, H, 1);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
printf("col-majortime: %fms, bandwidth: %.0fMB/s\n", et, W*H*sizeof(int)/(1024*et));
check_kernel<<<1,1>>>(v, W, m, H, 1);
cudaDeviceSynchronize();
return 0;
}
$ nvcc -arch=sm_20 -o t546 t546.cu
$ ./t546
row-majortime: 13.066944ms, bandwidth: 80246MB/s
col-majortime: 12.806720ms, bandwidth: 81877MB/s
$ /usr/local/cuda/samples/bin/x86_64/linux/release/bandwidthTest
[CUDA Bandwidth Test] - Starting...
Running on...
Device 0: Quadro 5000
Quick Mode
Host to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 5864.2
Device to Host Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 6333.1
Device to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 88178.6
Result = PASS
$
CUDA 6.5, RHEL 5.5
This could also be implemented using a cuBLAS rank-1 update function, but it would be considerably slower than the above method.
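For completeness, a minimal sketch (not from the original answer) of what the cuBLAS rank-1 approach could look like for a column-major float matrix; it assumes d_ones is a device vector holding mdim values of 1.0f, d_a is the vector to repeat, and handle is an existing cublasHandle_t:
#include <cublas_v2.h>
#include <cuda_runtime.h>
void expand_with_cublas(cublasHandle_t handle, const float *d_a, const float *d_ones, float *d_B, int mdim, int vlen){
    const float alpha = 1.0f;
    cudaMemset(d_B, 0, sizeof(float)*(size_t)mdim*vlen);  // the rank-1 update adds into B, so clear it first
    cublasSger(handle, mdim, vlen, &alpha,
               d_ones, 1,   // x: column of ones, length mdim
               d_a, 1,      // y: vector to repeat, length vlen
               d_B, mdim);  // B = ones * a^T, stored column-major with leading dimension mdim
}
The extra pass to zero B and the read-modify-write that the update performs are part of why this approach would be slower than the dedicated expand kernel.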

CUDA kernel not updating the output data

OK, so the main idea of the task is to calculate the average of multiple images. I have it running the normal (CPU) way, so I thought I would give it a go using CUDA, but unfortunately what I receive in the output is the first image instead of the average. (Inside the kernel I also tried setting some pixels to 0 to make sure something is happening, but no luck.)
////My kernel:
//nImages - number of images in the memory
//nBytes - number of pixels*colors per image (also the size of dataOut)
//nImages*nBytes gives us the size of dataIn
//nBatch - dataIn has 1 million bytes per image; we run 6144 threads at a time, so we need 163 batches to compute the whole dataOut
__global__
void avg_arrays(unsigned char* cuDataIn, unsigned char* cuDataOut, int nImages, int nBytes, int nBatch)
{
//get the position of the correct byte
int j = threadIdx.x + nBatch;
//if we're outside of image then give up
if(j >= nBytes) return;
//proceed averaging
long lSum = 0;
for(int i=0; i < nImages; ++i)
lSum += cuDataIn[i*nBytes + j];
lSum = lSum / nImages;
cuDataOut[j] = lSum;
}
Memory allocation etc.
unsigned char* dataIn = 0;
unsigned char* dataOut= 0;
// Allocate and transfer memory to the device
gpuErrchk( cudaMalloc((void**)&dataIn, nPixelCountBGR * nNumberOfImages * sizeof(unsigned char))); //dataIn
gpuErrchk( cudaMalloc((void**)&dataOut, nPixelCountBGR * sizeof(unsigned char))); //dataOut
gpuErrchk( cudaMemcpy(dataIn, bmps, nPixelCountBGR * nNumberOfImages * sizeof(unsigned char), cudaMemcpyHostToDevice )); //dataIn
gpuErrchk( cudaMemcpy(dataOut, basePixels, nPixelCountBGR * sizeof(unsigned char), cudaMemcpyHostToDevice )); //dataOut
// Perform the array addition
dim3 dimBlock(N);
dim3 dimGrid(1);
//do it in batches unless it's possible to run more threads at once; N is the maximum number of threads
for(int i=0; i<nPixelCountBGR; i+=N){
cout << "Running with: nImg: "<< nNumberOfImages << ", nPixBGR " << nPixelCountBGR << ", and i = " << i << endl;
avg_arrays<<<dimGrid, dimBlock>>>(dataIn, dataOut, nNumberOfImages, nPixelCountBGR, 0);
}
// Copy the Contents from the GPU
gpuErrchk(cudaMemcpy(basePixels, dataOut, nPixelCountBGR * sizeof(unsigned char), cudaMemcpyDeviceToHost));
gpuErrchk(cudaFree(dataOut));
gpuErrchk(cudaFree(dataIn));
The error checks don't report anything; all the code runs smoothly, but all I get at the end is an exact copy of the first image.
Just in case someone needs it, here's some console output:
Running with: nImg: 29, nPixBGR 1228800, and i = 0
...
Running with: nImg: 29, nPixBGR 1228800, and i = 1210368
Running with: nImg: 29, nPixBGR 1228800, and i = 1216512
Running with: nImg: 29, nPixBGR 1228800, and i = 1222656
Time of averaging: 0.219
If N is greater than 512 or 1024 (depending on which GPU you are running on, which you don't mention), then this is invalid:
dim3 dimBlock(N);
because you can't launch a kernel with greater than 512 or 1024 threads per block:
avg_arrays<<<dimGrid, dimBlock>>>(...
^
|
this is limited to 512 or 1024
If you study proper CUDA error checking and apply it to your kernel launch, you'll trap this kind of error.
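For instance, a minimal sketch using the gpuErrchk macro already used in the question (launch arguments kept exactly as posted):
avg_arrays<<<dimGrid, dimBlock>>>(dataIn, dataOut, nNumberOfImages, nPixelCountBGR, 0);
gpuErrchk(cudaGetLastError());       // traps launch-configuration errors such as too many threads per block
gpuErrchk(cudaDeviceSynchronize());  // traps errors that occur while the kernel executes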