Measuring OpenCL kernel's memory throughput - profiling

I read about global memory optimization in OpenCL. In one of the slide shows, a very simple kernel (below) was used to demonstrate the importance of memory coalescing.
__kernel void measure(__global float* idata, __global float* odata, int offset) {
int xid = get_global_id(0) + offset;
odata[xid] = idata[xid];
}
Please see my code below, which measures the running time of the kernel:
ret = clFinish(command_queue);
size_t local_item_size = MAX_THREADS;
size_t global_item_size = INPUTSIZE;
struct timeval t0,t1;
gettimeofday(&t0, 0 );
//ret = clFinish(command_queue);
ret = clEnqueueNDRangeKernel(command_queue, measure, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
gettimeofday(&t1,0);
double elapsed = (t1.tv_sec-t0.tv_sec)*1000000 + (t1.tv_usec-t0.tv_usec);
printf("time taken = %lf microseconds\n", elapsed);
I transfer around 0.5 GB of data:
#define INPUTSIZE 1024 * 1024 * 128
int main (int argc, char *argv[])
{
int offset = atoi(argv[1]);
float* input = (float*) malloc(sizeof(float) * INPUTSIZE);
Now, the results are a bit random. With offset = 0, I get times as low as 21 microseconds. With offset = 1, I get times ranging from 53 microseconds to 24400 microseconds.
Can someone please tell me what is going on? I thought that offset = 0 would be the fastest, because all the threads would access consecutive locations, hence the minimum number of memory transactions would take place.

Bandwidth is a measure of how fast data can be transferred, and is typically measured in bytes/second in these situations (usually GB/s for GPU memory bandwidth).
To compute the bandwidth of a compute kernel, you just need to know how much data the kernel reads/writes from/to memory, and then divide that by the time your kernel took to execute.
Your example kernel has each work-item (or CUDA thread) read a single float, and write a single float. If you launch this kernel to copy 2^20 floats, then you will be reading 2^20 * sizeof(float) bytes and writing the same amount (so 8 MB in total). If this kernel takes 1 ms to execute, then you have achieved a bandwidth of 8 MB / 0.001 s = 8 GB/s.
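In code, that calculation is nothing more than the following (a sketch; the variable names are illustrative, not taken from your program):
size_t bytes_read    = num_elements * sizeof(float);   // one float read per work-item
size_t bytes_written = num_elements * sizeof(float);   // one float written per work-item
double bandwidth_GBs = (bytes_read + bytes_written) / (elapsed_seconds * 1.0e9);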
Your new code snippet that shows your kernel timing approach indicates that you are only timing the kernel enqueue, not the amount of time it actually takes to run the kernel. This is why you are getting very low kernel timings (0.5 GB / 0.007 ms ~= 71 TB/s!). You should add calls to clFinish() to obtain proper timing. I typically also take timings over several runs, to allow the device to warm up, which usually gives more consistent timing:
// Warm-up run (not timed)
clEnqueueNDRangeKernel(command_queue, ...);
clFinish(command_queue);
// start timing
start = ...
for (int i = 0; i < NUM_RUNS; i++)
{
clEnqueueNDRangeKernel(command_queue, ...);
}
clFinish(command_queue);
// stop timing
end = ...
// Compute time taken, bandwidth etc
average_time = (end-start)/NUM_RUNS;
...
Question from comment:
Why does offset=0 perform better than offset=1,4 or 6?
On NVIDIA GPUs, work-items are grouped into 'warps' of size 32, which execute in lockstep (other devices have similar approaches, just with different sizes). Memory transactions are aligned to multiples of the cacheline size (e.g. 64 bytes, 128 bytes, etc.). Consider what happens when each work-item in a warp attempts to read a single 4-byte value (assuming they are contiguous, as per your example), with a cacheline size of 64 bytes.
This warp is reading a total of 128 bytes of data. If the start of this 128-byte chunk is aligned to a 64-byte boundary (i.e. if offset=0), then this can be serviced in two 64-byte transactions. However, if this chunk is not aligned to a 64-byte boundary (offset=1, 4, 6, etc.), then this will require three memory transactions to fetch all of the data. This is where your performance difference comes from.
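A small sketch of that transaction-counting reasoning (assuming a 32-wide warp, 4-byte floats and 64-byte cachelines, as above):
int warp_bytes   = 32 * sizeof(float);                  // 128 bytes read by one warp
int first_byte   = offset * sizeof(float);              // where the warp's access starts
int first_line   = first_byte / 64;                     // first cacheline touched
int last_line    = (first_byte + warp_bytes - 1) / 64;  // last cacheline touched
int transactions = last_line - first_line + 1;          // 2 when aligned, 3 when misaligned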
If you set the offset to be a multiple of the cacheline size (e.g. 64), then you will likely get performance equivalent to offset=0.

Related

CUDA periodic execution time

I just started learning CUDA and I have trouble interpreting my experiment results. I wanted to compare CPU vs GPU in a simple program that adds two vectors together. The code is the following:
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>
#include <helper_timer.h> // CUDA samples helper; provides StopWatchInterface and the sdk*Timer functions
__global__ void add(int *a, int *b, int *c, long long n) {
long long tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
c[tid] = a[tid] + b[tid];
}
}
void add_cpu(int* a, int* b, int* c, long long n) {
for (long long i = 0; i < n; i++) {
c[i] = a[i] + b[i];
}
}
void check_results(int* gpu, int* cpu, long long n) {
for (long long i = 0; i < n; i++) {
if (gpu[i] != cpu[i]) {
printf("Different results!\n");
return;
}
}
}
int main(int argc, char* argv[]) {
long long n = atoll(argv[1]);
int num_of_blocks = atoi(argv[2]);
int num_of_threads = atoi(argv[3]);
int* a = new int[n];
int* b = new int[n];
int* c = new int[n];
int* c_cpu = new int[n];
int *dev_a, *dev_b, *dev_c;
cudaMalloc((void **) &dev_a, n * sizeof(int));
cudaMalloc((void **) &dev_b, n * sizeof(int));
cudaMalloc((void **) &dev_c, n * sizeof(int));
for (long long i = 0; i < n; i++) {
a[i] = i;
b[i] = i * 2;
}
cudaMemcpy(dev_a, a, n * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, n * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_c, c, n * sizeof(int), cudaMemcpyHostToDevice);
StopWatchInterface *timer=NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);
sdkStartTimer(&timer);
add <<<num_of_blocks, num_of_threads>>>(dev_a, dev_b, dev_c, n);
cudaDeviceSynchronize();
sdkStopTimer(&timer);
float time = sdkGetTimerValue(&timer);
sdkDeleteTimer(&timer);
cudaMemcpy(c, dev_c, n * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
clock_t start = clock();
add_cpu(a, b, c_cpu, n);
clock_t end = clock();
check_results(c, c_cpu, n);
printf("%f %f\n", (double)(end - start) * 1000 / CLOCKS_PER_SEC, time);
return 0;
}
I ran this code in a loop with a bash script:
for i in {1..2560}
do
n="$((1024 * i))"
out=`./vectors $n $i 1024`
echo "$i $out" >> "./vectors.txt"
done
Here 2560 is the maximum number of blocks that my GPU supports, and 1024 is the maximum number of threads per block. So I just ran it at the maximum block size, from 1 block up to the maximum problem size my GPU can handle, with a step of 1 block (1024 ints in the vector).
Here is my GPU info:
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "NVIDIA GeForce RTX 2070 SUPER"
CUDA Driver Version / Runtime Version 11.3 / 11.0
CUDA Capability Major/Minor version number: 7.5
Total amount of global memory: 8192 MBytes (8589934592 bytes)
(040) Multiprocessors, (064) CUDA Cores/MP: 2560 CUDA Cores
GPU Max Clock rate: 1785 MHz (1.78 GHz)
Memory Clock rate: 7001 Mhz
Memory Bus Width: 256-bit
L2 Cache Size: 4194304 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total shared memory per multiprocessor: 65536 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 1024
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Device supports Managed Memory: Yes
Device supports Compute Preemption: Yes
Supports Cooperative Kernel Launch: Yes
Supports MultiDevice Co-op Kernel Launch: Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 11.3, CUDA Runtime Version = 11.0, NumDevs = 1
Result = PASS
After running the experiment I gathered the results and plotted them:
What bothers me is this 256-block-wide period in the GPU execution time. I have no clue why this happens. Why is executing 512 blocks much slower than executing 513 blocks of threads?
I also checked this with a constant number of blocks (2560) as well as with different block sizes, and it always gives this period of 256 * 1024 in vector size (so for block size 512 it is every 512 blocks, not every 256 blocks). So maybe this is something with memory, but I can't figure out what.
I would appreciate any ideas on why this is happening.
This is by no means a complete or precise answer. However, I believe the periodic pattern you are observing is at least partly due to a one-time or first-time kernel launch overhead. Good benchmarking practice is usually to do something other than what you are doing: for example, run the kernel multiple times and take an average, or do some other kind of statistical measurement.
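For example, a minimal sketch of that kind of averaging with CUDA events (NUM_RUNS is a hypothetical repeat count; this is not the exact modification I show below):
cudaEvent_t ev_start, ev_stop;
cudaEventCreate(&ev_start);
cudaEventCreate(&ev_stop);
add<<<num_of_blocks, num_of_threads>>>(dev_a, dev_b, dev_c, n); // warm-up, not timed
const int NUM_RUNS = 100;
cudaEventRecord(ev_start);
for (int i = 0; i < NUM_RUNS; i++)
    add<<<num_of_blocks, num_of_threads>>>(dev_a, dev_b, dev_c, n);
cudaEventRecord(ev_stop);
cudaEventSynchronize(ev_stop);
float total_ms = 0.0f;
cudaEventElapsedTime(&total_ms, ev_start, ev_stop);
float avg_ms = total_ms / NUM_RUNS; // average kernel duration in milliseconds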
When I run your code using your script on a GTX 960 GPU, I get the following graph (only plotting the GPU data, vertical axis is in milliseconds):
When I modify your code as follows:
cudaMemcpy(dev_c, c, n * sizeof(int), cudaMemcpyHostToDevice);
// next two lines added:
add <<<num_of_blocks, num_of_threads>>>(dev_a, dev_b, dev_c, n);
cudaDeviceSynchronize();
StopWatchInterface *timer=NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);
sdkStartTimer(&timer);
add <<<num_of_blocks, num_of_threads>>>(dev_a, dev_b, dev_c, n);
cudaDeviceSynchronize();
Doing a "warm-up" run first, then timing the second run, I witness data like this:
So the data without the warm-up shows a periodicity. After the warm-up, the periodicity disappears. I conclude that the periodicity is due to some kind of one-time or first-time behavior. Some typical things that might be in this category are caching effects and CUDA "lazy" initialization effects (for example, the time taken to JIT-compile the GPU code, which is certainly happening in your case, or the time to load the GPU code into GPU memory). I won't be able to go farther with any explanation of what kind of first-time effect exactly is giving rise to the periodicity.
Another observation is that while my data shows an expected "average slope" to each graph, indicating that the kernel duration associated with 2560 blocks is approximately 5 times the kernel duration associated with 512 blocks, I don't see that kind of trend in your data. It ought to be there, however. Your GPU will "saturate" at about 40 blocks. Thereafter, the average kernel duration should increase in an approximately linear fashion, such that the kernel duration associated with 2560 blocks is 4-5x the kernel duration associated with 512 blocks. I can't explain your data in this respect at all; I suspect a graphing or data processing error, or else a characteristic of your environment (e.g. a GPU shared with other users, a broken CUDA install, etc.) that is not present in my environment and which I'm unable to guess at.
Finally, my conclusion is that GPU "expected" behavior is more evident in the presence of good benchmarking techniques.

OpenCL Memory Bandwidth/Coalescing

Summary:
I'm trying to write a memory bound OpenCL program that comes close to the advertised memory bandwidth on my GPU. In reality I'm off by a factor of ~50.
Setup:
I only have a relatively old Polaris card (RX580), so I can't use CUDA and have to settle for OpenCL for now. I know this is suboptimal, and I can't get any debugging/performance counters to work, but it's all I have.
I'm new to GPU computing and want to get a feel for the performance I can expect from a GPU vs a CPU. The first thing for me to work on is memory bandwidth.
I wrote a very small OpenCL kernel which reads from strided memory locations in such a way that all workers in a wavefront together perform contiguous memory accesses over a large memory segment, coalescing the accesses. All that the kernel then does with the loaded data is sum the values up and write the sum back to another memory location at the very end. The code (which for the most part I shamelessly copied together from various sources) is quite simply:
__kernel void ThroughputTestKernel(
__global float* vInMemory,
__global float* vOutMemory,
const int iNrOfIterations,
const int iNrOfWorkers
)
{
const int gtid = get_global_id(0);
__private float fAccumulator = 0.0;
for (int k = 0; k < iNrOfIterations; k++) {
fAccumulator += vInMemory[gtid + k * iNrOfWorkers];
}
vOutMemory[gtid] = fAccumulator;
}
I spawn iNrOfWorkers instances of this kernel and measure the time it takes them to finish processing. For my tests I set iNrOfWorkers = 1024 and iNrOfIterations = 64*1024. From the processing time and iMemorySize = iNrOfWorkers * iNrOfIterations * sizeof(float) I calculate a memory bandwidth of around 5 GByte/s.
Expectations:
My problem is that memory accesses seem to be one to two orders of magnitude slower than the 256 GByte/s that I was led to believe I have available.
The GCN ISA Manual [1] has me assuming that I have 36 CUs, each of which contains 4 SIMD units, each of which processes vectors of 16 elements. Therefore I should have 36 × 4 × 16 = 2304 processing elements available.
I spawn fewer than that, i.e. 1024, global work units ("threads"). The threads access memory locations in order, 1024 locations apart, so that in each iteration of the loop the entire wavefront accesses 1024 consecutive elements. Therefore I believe that the GPU should be able to produce consecutive memory address accesses with no breaks in between.
My guess is that, instead of 1024, it only spawns very few threads, one per CU maybe? That way it would have to re-read the data over and over again. I don't know how I would be able to verify that, though.
[1] http://developer.amd.com/wordpress/media/2013/12/AMD_GCN3_Instruction_Set_Architecture_rev1.1.pdf
A few issues with your approach:
You don't saturate the GPU. To get peak performance, you need to launch many more threads than your GPU has execution units; many more means > 10,000,000.
Your loop contains integer index computation (for array-of-structures coalesced access). Here this is probably not enough to get you into the compute limit, but it's generally better to unroll the small loop with #pragma unroll; then the compiler does all the index calculation already. You can also bake the constants iNrOfIterations and iNrOfWorkers right into the OpenCL code with #define iNrOfIterations 16 / #define iNrOfWorkers 15728640, via C++ string concatenation or by hardcoding.
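For instance, a sketch of baking those defines into the kernel source before building it (kernel_source here is a hypothetical std::string holding the OpenCL code, and context an already-created cl::Context):
std::string defines =
    "#define iNrOfIterations 16\n"
    "#define iNrOfWorkers 15728640\n";
std::string source = defines + kernel_source; // the constants are now compile-time literals
cl::Program program(context, source);
program.build();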
There are 4 different memory bandwidths depending on your access pattern: coalesced/misaligned reads/writes. Coalesced access is much faster than misaligned access, and the performance penalty for misaligned reads is smaller than for misaligned writes. Only coalesced memory access gets you anywhere near the advertised bandwidth. Your kernel measures iNrOfIterations coalesced reads and 1 coalesced write. To measure all four types separately, you can use this:
#define def_N 15728640
#define def_M 16
kernel void benchmark_1(global float* data) {
const uint n = get_global_id(0);
#pragma unroll
for(uint i=0; i<def_M; i++) data[i*def_N+n] = 0.0f; // M coalesced writes
}
kernel void benchmark_2(global float* data) {
const uint n = get_global_id(0);
float x = 0.0f;
#pragma unroll
for(uint i=0; i<def_M; i++) x += data[i*def_N+n]; // M coalesced reads
data[n] = x; // 1 coalesced write (to prevent compiler optimization)
}
kernel void benchmark_3(global float* data) {
const uint n = get_global_id(0);
#pragma unroll
for(uint i=0; i<def_M; i++) data[n*def_M+i] = 0.0f; // M misaligned writes
}
kernel void benchmark_4(global float* data) {
const uint n = get_global_id(0);
float x = 0.0f;
#pragma unroll
for(uint i=0; i<def_M; i++) x += data[n*def_M+i]; // M misaligned reads
data[n] = x; // 1 coalesced write (to prevent compiler optimization)
}
Here the data array has the size N*M and each kernel is executed across the range N. For bandwidth calculation, execute each kernel a few hundred times (better average) and get the average execution times time1, time2, time3 and time4. The bandwidths are then computed like this:
coalesced read bandwidth (GB/s) = 4.0E-9f*M*N/(time2-time1/M)
coalesced write bandwidth (GB/s) = 4.0E-9f*M*N/(time1)
misaligned read bandwidth (GB/s) = 4.0E-9f*M*N/(time4-time1/M)
misaligned write bandwidth (GB/s) = 4.0E-9f*M*N/(time3)
For reference, here are a few bandwidth values measured with this benchmark.
Edit: How to measure kernel execution time:
Clock
#include <chrono>
#include <thread>
using namespace std;
class Clock {
private:
typedef chrono::high_resolution_clock clock;
chrono::time_point<clock> t;
public:
Clock() { start(); }
void start() { t = clock::now(); }
double stop() const { return chrono::duration_cast<chrono::duration<double>>(clock::now()-t).count(); }
};
Time measurement of K executions of a kernel
const int K = 128; // execute kernel 128 times and average execution time
NDRange range_local = NDRange(256); // thread block size
NDRange range_global = NDRange(N); // N must be divisible by thread block size
Clock clock;
clock.start();
for(int k=0; k<K; k++) {
queue.enqueueNDRangeKernel(kernel_1, NullRange, range_global, range_local);
queue.finish();
}
const double time1 = clock.stop()/(double)K;
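Plugging such an averaged time into the formulas above, e.g. for the coalesced-write kernel benchmark_1 (a sketch; it assumes host-side constants M and N equal to def_M and def_N, and that time1 was measured as shown):
const double coalesced_write_GBs = 4.0E-9 * (double)M * (double)N / time1;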

Why did my CUDA program become slower after using more than 128 threads per block?

I have a simple cuda application with the following code:
#include <stdio.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdlib.h> // for malloc/free
#include <string.h> // for memcpy
__global__
void daxpy(int n, int a, int *x, int *y) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
y[i] = x[i];
int j;
for(j = 0; j < 1024*10000; ++j) {
y[i] += j%10;
}
}
// debug time
void calc_time(struct timeval *start, const char *msg) {
struct timeval end;
gettimeofday(&end, NULL);
uint64_t us = end.tv_sec * 1000000 + end.tv_usec - (start->tv_sec * 1000000 + start->tv_usec);
printf("%s cost us = %llu\n", msg, us);
memcpy(start, &end, sizeof(struct timeval));
}
void do_test() {
unsigned long n = 1536;
int *x, *y, a, *dx, *dy;
a = 2.0;
x = (int*)malloc(sizeof(int)*n);
y = (int*)malloc(sizeof(int)*n);
for(unsigned long i = 0; i < n; ++i) {
x[i] = i;
}
cudaMalloc((void**)&dx, n*sizeof(int));
cudaMalloc((void**)&dy, n*sizeof(int));
struct timeval start;
gettimeofday(&start, NULL);
cudaMemcpy(dx, x, n*sizeof(int), cudaMemcpyHostToDevice);
daxpy<<<1, 512>>>(n, a, dx, dy); // this line
cudaThreadSynchronize();
cudaMemcpy(y, dy, n*sizeof(int), cudaMemcpyDeviceToHost);
calc_time(&start, "do_test ");
cudaFree(dx);
cudaFree(dy);
free(x);
free(y);
}
int main() {
do_test();
return 0;
}
The GPU kernel call is daxpy<<<1, 512>>>(n, a, dx, dy) and I performed some tests using different block sizes:
daxpy<<<1, 32>>>(n, a, dx, dy)
daxpy<<<1, 64>>>(n, a, dx, dy)
daxpy<<<1, 128>>>(n, a, dx, dy)
daxpy<<<1, 129>>>(n, a, dx, dy)
daxpy<<<1, 512>>>(n, a, dx, dy)
... and made the following observations:
Execution time is the same for 32, 64, and 128 block sizes,
Execution time differs for block sizes 128 and 129, in particular:
For 128 the execution time is 280ms,
For 129 the execution time is 386ms.
I would like to ask what is causing the difference in execution time for block sizes 128 and 129.
My GPU is a Tesla K80:
CUDA Driver Version / Runtime Version 6.5 / 6.5
CUDA Capability Major/Minor version number: 3.7
Total amount of global memory: 11520 MBytes (12079136768 bytes)
(13) Multiprocessors, (192) CUDA Cores/MP: 2496 CUDA Cores
GPU Clock rate: 824 MHz (0.82 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 1572864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device PCI Bus ID / PCI location ID: 135 / 0
After you provided the exact time differences in one of the comments, i.e.:
280ms for up to 128 threads,
386ms for 129+ threads,
I think this indirectly supports my theory that the issue is related to warp scheduling. Look at the GK210 whitepaper (GK210 is the chip used in the K80):
The K80 SMX features a quad warp scheduler, see the section "Quad Warp Scheduler",
This means that the K80 SMX is able to schedule up to 128 threads at once (4 warps == 128 threads), which are then executed simultaneously,
Therefore, for 129 threads, scheduling cannot happen all at once, because the SMX has to schedule 5 warps, i.e. scheduling will happen in two steps.
If the above is true, then I would expect:
The execution time to be roughly the same for block sizes 1 - 128,
The execution time to be roughly the same for block sizes 129 - 192.
192 is the number of cores on the SMX (see the whitepaper). As a reminder, entire blocks are always scheduled on one SMX, so if you spawn more than 192 threads they certainly won't all be able to execute in parallel, and execution time should be higher for 193+ threads.
You can verify the above thesis by simplifying your kernel code to the point where it does almost nothing, so it should be more or less obvious whether the execution takes longer only due to scheduling (there will be no other limiting factors such as memory throughput).
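For example, a stripped-down kernel along these lines (a sketch, not code from the question) would make the scheduling effect easier to isolate:
__global__ void daxpy_minimal(int n, int a, int *x, int *y) {
    // Almost no arithmetic: any timing difference between <<<1, 128>>> and
    // <<<1, 129>>> should then come mostly from scheduling, not from compute.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i];
}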
Disclaimer: The above are just my assumptions, as I don't have access to a K80, nor any other GPU with a quad warp scheduler, so I cannot profile your code properly. But anyway, I believe that is a task for you - why not use nvprof and profile your code yourself? Then you should be able to see where the time difference lies.

It's slower to calculate an integral image using CUDA than with CPU code

I am implementing an integral image calculation module using CUDA to improve performance.
But it is slower than the CPU module.
Please let me know what I did wrong.
The CUDA kernels and host code follow.
There is also another problem: in the kernel SumH, using texture memory is slower than using global memory. imageTexture was defined as below.
texture<unsigned char, 1> imageTexture;
cudaBindTexture(0, imageTexture, pbImage);
// kernels to scan the image horizontally and vertically.
__global__ void SumH(unsigned char* pbImage, int* pnIntImage, __int64* pn64SqrIntImage, float rVSpan, int nWidth)
{
int nStartY, nEndY, nIdx;
if (!threadIdx.x)
{
nStartY = 1;
}
else
nStartY = (int)(threadIdx.x * rVSpan);
nEndY = (int)((threadIdx.x + 1) * rVSpan);
for (int i = nStartY; i < nEndY; i ++)
{
for (int j = 1; j < nWidth; j ++)
{
nIdx = i * nWidth + j;
pnIntImage[nIdx] = pnIntImage[nIdx - 1] + pbImage[nIdx - nWidth - i];
pn64SqrIntImage[nIdx] = pn64SqrIntImage[nIdx - 1] + pbImage[nIdx - nWidth - i] * pbImage[nIdx - nWidth - i];
//pnIntImage[nIdx] = pnIntImage[nIdx - 1] + tex1Dfetch(imageTexture, nIdx - nWidth - i);
//pn64SqrIntImage[nIdx] = pn64SqrIntImage[nIdx - 1] + tex1Dfetch(imageTexture, nIdx - nWidth - i) * tex1Dfetch(imageTexture, nIdx - nWidth - i);
}
}
}
__global__ void SumV(unsigned char* pbImage, int* pnIntImage, __int64* pn64SqrIntImage, float rHSpan, int nHeight, int nWidth)
{
int nStartX, nEndX, nIdx;
if (!threadIdx.x)
{
nStartX = 1;
}
else
nStartX = (int)(threadIdx.x * rHSpan);
nEndX = (int)((threadIdx.x + 1) * rHSpan);
for (int i = 1; i < nHeight; i ++)
{
for (int j = nStartX; j < nEndX; j ++)
{
nIdx = i * nWidth + j;
pnIntImage[nIdx] = pnIntImage[nIdx - nWidth] + pnIntImage[nIdx];
pn64SqrIntImage[nIdx] = pn64SqrIntImage[nIdx - nWidth] + pn64SqrIntImage[nIdx];
}
}
}
// host code
int nW = image_width;
int nH = image_height;
unsigned char* pbImage;
int* pnIntImage;
__int64* pn64SqrIntImage;
cudaMallocManaged(&pbImage, nH * nW);
// assign image gray values to pbimage
cudaMallocManaged(&pnIntImage, sizeof(int) * (nH + 1) * (nW + 1));
cudaMallocManaged(&pn64SqrIntImage, sizeof(__int64) * (nH + 1) * (nW + 1));
float rHSpan, rVSpan;
int nHThreadNum, nVThreadNum;
if (nW + 1 <= 1024)
{
rHSpan = 1;
nVThreadNum = nW + 1;
}
else
{
rHSpan = (float)(nW + 1) / 1024;
nVThreadNum = 1024;
}
if (nH + 1 <= 1024)
{
rVSpan = 1;
nHThreadNum = nH + 1;
}
else
{
rVSpan = (float)(nH + 1) / 1024;
nHThreadNum = 1024;
}
SumH<<<1, nHThreadNum>>>(pbImage, pnIntImage, pn64SqrIntImage, rVSpan, nW + 1);
cudaDeviceSynchronize();
SumV<<<1, nVThreadNum>>>(pbImage, pnIntImage, pn64SqrIntImage, rHSpan, nH + 1, nW + 1);
cudaDeviceSynchronize();
Regarding the code that is currently in the question, there are two things I'd like to mention: launch parameters and timing methodology.
1) Launch parameters
When you launch a kernel there are two main arguments that specify the number of threads you are launching. These are between the <<< and >>> sections, and are the number of blocks in the grid, and the number of threads per block, as follows:
foo <<< numBlocks, numThreadsPerBlock >>> (args);
For a single kernel to be efficient on a current GPU you can use the rule of thumb that numBlocks * numThreadsPerBlock should be at least 10,000. I.e. 10,000 pieces of work. This is a rule of thumb, so you may get good results with only 5,000 threads (it varies with GPU: cheaper GPUs can get away with fewer threads), but this is the order of magnitude you need to be looking at as a minimum. You are running 1024 threads. This is almost certainly not enough (hint: the loops inside your kernel look like scan primitives, and these can be done in parallel).
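For example (this is my assumption about one possible approach, not something taken from the question), a row prefix-sum can be delegated to a library scan such as Thrust's, which runs in parallel on the device:
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
// Sketch: parallel prefix-sum of row 'i' of the (already initialised) integral
// image buffer. A full integral image would scan every row, then every column.
void scanRow(int* pnIntImage, int nWidth, int i) {
    thrust::device_ptr<int> row(pnIntImage + i * nWidth);
    thrust::inclusive_scan(row, row + nWidth, row);
}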
Further to this there are a few other things to consider.
The number of blocks should be large in comparison to the number of SMs on your GPU. A Kepler K40 has 15 SMs, and to avoid a significant tail effect you'd probably want at least ~100 blocks on this GPU. Other GPUs have fewer SMs, but you haven't specified which you have, so I can't be more specific.
The number of threads per block should not be too small. You can only have so many blocks on each SM, so if your blocks are too small you will use the GPU suboptimally. Furthermore, on newer GPUs up to four warps can receive instructions on an SM simultaneously, and as such it is often a good idea to have block sizes that are multiples of 128.
2) Timing
I'm not going to go into so much depth here, but make sure your timing is sane. GPU code tends to have a one-time initialisation delay. If this is within your timing, you will see erroneously large runtimes for codes designed to represent a much larger code. Similarly, data transfer between the CPU and GPU takes time. In a real application you may only do this once for thousands of kernel calls, but in a test application you may do it once per kernel launch.
If you want to get accurate timings you must make your example more representative of the final code, or you must be sure that you are only timing the regions that will be repeated.
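As a sketch of the second option, timing only the repeated region with CUDA events after one untimed warm-up launch (parameters as in the host code from the question):
cudaEvent_t t_start, t_stop;
cudaEventCreate(&t_start);
cudaEventCreate(&t_stop);
SumH<<<1, nHThreadNum>>>(pbImage, pnIntImage, pn64SqrIntImage, rVSpan, nW + 1); // warm-up
cudaDeviceSynchronize();
cudaEventRecord(t_start);
SumH<<<1, nHThreadNum>>>(pbImage, pnIntImage, pn64SqrIntImage, rVSpan, nW + 1);
SumV<<<1, nVThreadNum>>>(pbImage, pnIntImage, pn64SqrIntImage, rHSpan, nH + 1, nW + 1);
cudaEventRecord(t_stop);
cudaEventSynchronize(t_stop);
float kernel_ms = 0.0f;
cudaEventElapsedTime(&kernel_ms, t_start, t_stop); // kernel-only time, excluding init and transfers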
The only way to be sure is to profile the code, but in this case we can probably make a reasonable guess.
You're basically just doing a single scan through some data, and doing extremely minimal processing on each item.
Given how little processing you're doing on each item, the bottleneck when you process the data with the CPU is probably just reading the data from memory.
When you do the processing on the GPU, the data still needs to be read from memory and copied into the GPU's memory. That means we still have to read all the data from main memory, just like if the CPU did the processing. Worse, it all has to be written to the GPU's memory, causing a further slowdown. By the time the GPU even gets to start doing real processing, you've already used up more time than it would have taken the CPU to finish the job.
For Cuda to make sense, you generally need to be doing a lot more processing on each individual data item. In this case, the CPU is probably already nearly idle most of the time, waiting for data from memory. In such a case, the GPU is unlikely to be of much help unless the input data was already in the GPU's memory so the GPU could do the processing without any extra copying.
When working with CUDA there are a few things you should keep in mind.
Copying from host memory to device memory is 'slow': when you copy some data from the host to the device you should do as many calculations as possible (do all the work) before you copy it back to the host.
On the device there are 3 types of memory: global, shared, and local. You can rank them in speed as global < shared < local (local = fastest).
Reading from consecutive memory blocks is faster than random access. When working with an array of structures, you would like to transpose it into a structure of arrays (see the sketch after these points).
You can always consult the CUDA Visual Profiler to show you the bottleneck of your program.
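As an illustration of the array-of-structures to structure-of-arrays transposition mentioned above (hypothetical types, not from the question):
// Array of structures: thread i reads p[i].x, so neighbouring threads touch
// addresses 12 bytes apart and the reads do not coalesce well.
struct ParticleAoS { float x, y, z; };
// Structure of arrays: thread i reads x[i], so neighbouring threads touch
// consecutive addresses and the reads coalesce into few transactions.
struct ParticlesSoA { float *x; float *y; float *z; };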
The above-mentioned GTX750 has 512 CUDA cores (these are the same as the shader units, just driven in a different mode).
http://www.nvidia.de/object/geforce-gtx-750-de.html#pdpContent=2
Computing an integral image is only partially parallelizable, because any result value in the results array depends on a large number of its predecessors. Furthermore, there is only a tiny amount of math per memory transfer, so the unavoidable memory transfers, rather than ALU power, are likely to be the bottleneck. Such an accelerator may provide some speed-up, but not a thrilling one, because the task itself does not allow it.
If you were to compute multiple variations of integral images on the same input data, you would be much more likely to see the "thrill", thanks to much greater parallelism and a higher number of math operations. But that would be a different task.
As a wild guess from a Google search, others have already worked on this: https://www.google.de/url?sa=t&rct=j&q=&esrc=s&source=web&cd=11&cad=rja&uact=8&ved=0CD8QFjAKahUKEwjjnoabw8bIAhXFvhQKHbUpA1Y&url=http%3A%2F%2Fdspace.mit.edu%2Fopenaccess-disseminate%2F1721.1%2F71883&usg=AFQjCNHBbOEB_OHAzLZI9__lXO_7FPqdqA

CUDA - no blocks, just threads for undefined dimensions

I have some matrices with unknown sizes varying from 10 to 20,000 in both directions.
I designed a CUDA kernel with (x;y) blocks and (x;y) threads.
Since the matrices' width/height aren't multiples of my block dimensions, it was a terrible pain to get things to work, and the code is becoming more and more complicated in order to get coalesced memory reads.
Besides all of that, the kernel is growing in size and using more and more registers to check for correctness... so I think this is not the approach I should adopt.
My question is: what if I totally eliminate blocks and just create a grid of x;y threads? Will an SM unit have problems without many blocks?
Can I eliminate blocks and use a large number of threads, or is the block subdivision necessary?
You can't really just make a "grid of threads", since you have to organize threads into blocks and you can have a maximum of 512 threads per block. However, you could effectively do this by using 1 thread per block, which will result in an X by Y grid of 1x1 blocks. Unfortunately, this will result in pretty terrible performance due to several factors:
According to the CUDA Programming Guide, a SM can handle a maximum of 8 blocks at any time. This will limit you to 8 threads per SM, which isn't enough to fill even a single warp. If you have, say, 48 CUDA cores, you will only be able to handle 384 threads at any given time.
With only 8 threads available on a SM, there will be too few warps to hide memory latencies. The GPU will spend most of its time waiting for memory accesses to complete, rather than doing any computations.
You will be unable to coalesce memory reads and writes, resulting in poor memory bandwidth usage.
You will be effectively unable to leverage shared memory, as this is a shared resource between threads in a block.
While having to ensure correctness for threads in a block is annoying, your performance will be vastly better than your "grid of threads" idea.
Here's the code I use to divide a given task requiring num_threads into a block and grid configuration. Yes, you might end up launching too many blocks (but only very few extra) and you will probably end up having more actual threads than required, but it's easy and efficient this way. See the second code example below for my simple in-kernel boundary check.
PS: I always have block_size == 128 because it has been a good tradeoff between multiprocessor occupancy, register usage, shared memory requirements and coalesced access for all of my kernels.
Code to calculate a good grid size (host):
#define GRID_SIZE 65535
//calculate grid size (store result in grid/block)
void kernelUtilCalcGridSize(unsigned int num_threads, unsigned int block_size, dim3* grid, dim3* block) {
//block
block->x = block_size;
block->y = 1;
block->z = 1;
//number of blocks
unsigned int num_blocks = kernelUtilCeilDiv(num_threads, block_size);
unsigned int total_threads = num_blocks * block_size;
assert(total_threads >= num_threads);
//calculate grid size
unsigned int gy = kernelUtilCeilDiv(num_blocks, GRID_SIZE);
unsigned int gx = kernelUtilCeilDiv(num_blocks, gy);
unsigned int total_blocks = gx * gy;
assert(total_blocks >= num_blocks);
//grid
grid->x = gx;
grid->y = gy;
grid->z = 1;
}
//ceil division (rounding up)
unsigned int kernelUtilCeilDiv(unsigned int numerator, unsigned int denominator) {
return (numerator + denominator - 1) / denominator;
}
Code to calculate the unique thread id and check boundaries (device):
//some kernel
__global__ void kernelFoo(unsigned int num_threads, ...) {
//calculate unique id
const unsigned int thread_id = threadIdx.x;
const unsigned int block_id = blockIdx.x + blockIdx.y * gridDim.x;
const unsigned int unique_id = thread_id + block_id * blockDim.x;
//check range
if (unique_id >= num_threads) return;
//do the actual work
...
}
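A hypothetical usage of the two helpers together (the remaining kernel arguments are elided, as in the snippet above):
dim3 grid, block;
kernelUtilCalcGridSize(num_threads, 128, &grid, &block);
kernelFoo<<<grid, block>>>(num_threads /*, other kernel arguments */);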
I don't think that's a lot of effort/registers/lines-of-code to check for correctness.