CUDA Speed Slower than expected - Image Processing

CUDA Speed Slower than expected - Image Processing - c++

I am new to CUDA development and wanted to write a simple benchmark to test some image processing feasibility. I have 32 images that are each 720x540, one byte per pixel greyscale.
I am running benchmarks for 10 seconds, and counting how many times they are able to process. There are three benchmarks I am running:
The first is just transferring the images into the GPU global memory, via cudaMemcpy
The second is transferring and processing the images.
The third is running the equivalent test on a CPU.
For a starting, simple test, the image processing is just counting the number of pixels above a certain greyscale value. I'm finding that accessing global memory on the GPU is very slow. I have my benchmark structured such that it creates one block per image, and one thread per row in each image. Each thread counts its pixels into a shared memory array, after which the first thread sums them up (See below).
The issue I am having is that this all runs very slowly - about 50fps. Much slower than a CPU version - about 230fps. If I comment out the pixel value comparison, resulting in just a count of all pixels, I get 6x the performance. I tried using texture memory but didn't see a performance gain. I am running a Quadro K2000. Also: the image copy only benchmark is able to copy at around 330fps, so that doesn't appear to be the issue.
Any help / pointers would be appreciated. Thank you.
// Counts, per image, the pixels whose value is greater than Threshold.
// Work split: the block index selects the image, the thread index selects one
// row of that image. Each thread walks its row of W bytes, stores its count in
// shared memory, and the thread with index 0 adds up H per-row counts.
// Requirements visible from the code: the dynamic shared array must hold at
// least H ints, and there is no bounds check on the row index, so the launch
// is assumed to use exactly H threads per block (TODO confirm at call site).
// Note on access pattern: neighbouring threads start W bytes apart, so at any
// loop iteration the threads of a block touch addresses with stride W.
__global__ void ThreadPerRowCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns)
{
extern __shared__ int row_counts[];//size is set by the third launch parameter of "<<<, ,>>>"
//see here for indexing https://blog.usejournal.com/cuda-thread-indexing-fb9910cba084
int myImage = blockIdx.y * gridDim.x + blockIdx.x;
int myStartRow = (threadIdx.y * blockDim.x + threadIdx.x);
unsigned char *imageStart = AllPixels[myImage];
unsigned char *pixelStart = imageStart + myStartRow * W;
unsigned char *pixelEnd = pixelStart + W;
unsigned char *pixelItr = pixelStart;
int row_count = 0;
// Sequential scan of this thread's row.
while(pixelItr < pixelEnd)
{
if (*pixelItr > Threshold) //REMOVING THIS LINE GIVES 6x PERFORMANCE
{
row_count++;
}
pixelItr++;
}
row_counts[myStartRow] = row_count;
// All per-row counts must be in shared memory before they are summed.
__syncthreads();
if (myStartRow == 0)
{//first thread sums up for the whole image (serial loop over H entries)
int image_count = 0;
for (int i = 0; i < H; i++)
{
image_count += row_counts[i];
}
AllReturns[myImage] = image_count;
}
}
// Host entry point: launches ThreadPerRowCounter over nImages images and
// blocks until the device has finished.
//   nImages    - number of images (one thread block each)
//   W, H       - image width and height in pixels
//   AllPixels  - device array of nImages device pointers, one per image
//   AllReturns - device array receiving one count per image
//   Threshold  - pixels strictly greater than this value are counted
extern "C" void cuda_Benchmark(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{
    // One thread per image row, plus one int of dynamic shared memory per row.
    ThreadPerRowCounter<<<nImages, H, sizeof(int)*H>>> (
        Threshold,
        W, H,
        AllPixels,
        AllReturns);
    // A rejected launch (for example H larger than the per-block thread limit)
    // is reported through cudaGetLastError(); without this check the kernel
    // would silently not run and AllReturns would keep stale contents.
    checkCudaErrors(cudaGetLastError());
    //wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}

Two changes to your kernel design can result in a significant speedup:
Perform the operations column-wise instead of row-wise. The general background for why this matters/helps is described here.
Replace your final operation with a canonical parallel reduction.
According to my testing, those 2 changes result in ~22x speedup in kernel performance:
$ cat t49.cu
#include <iostream>
#include <helper_cuda.h>
typedef unsigned char U8;
// Baseline kernel: counts, per image, the pixels whose value is greater than
// Threshold. The block index selects the image and the thread index selects
// one row; each thread scans its W-byte row, stores its count in shared
// memory, and the thread with index 0 serially adds up H per-row counts.
// The dynamic shared array must hold at least H ints. There is no bounds
// check on the row index, so exactly H threads per block are assumed
// (TODO confirm at call site).
// Neighbouring threads start W bytes apart, so at each loop iteration the
// threads of a block read addresses with stride W.
__global__ void ThreadPerRowCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns)
{
extern __shared__ int row_counts[];//size is set by the third launch parameter of "<<<, ,>>>"
//see here for indexing https://blog.usejournal.com/cuda-thread-indexing-fb9910cba084
int myImage = blockIdx.y * gridDim.x + blockIdx.x;
int myStartRow = (threadIdx.y * blockDim.x + threadIdx.x);
unsigned char *imageStart = AllPixels[myImage];
unsigned char *pixelStart = imageStart + myStartRow * W;
unsigned char *pixelEnd = pixelStart + W;
unsigned char *pixelItr = pixelStart;
int row_count = 0;
// Sequential scan of this thread's row.
while(pixelItr < pixelEnd)
{
if (*pixelItr > Threshold) //REMOVING THIS LINE GIVES 6x PERFORMANCE
{
row_count++;
}
pixelItr++;
}
row_counts[myStartRow] = row_count;
// All per-row counts must be in shared memory before they are summed.
__syncthreads();
if (myStartRow == 0)
{//first thread sums up for the whole image (serial loop over H entries)
int image_count = 0;
for (int i = 0; i < H; i++)
{
image_count += row_counts[i];
}
AllReturns[myImage] = image_count;
}
}
// Counts, per image, the pixels greater than Threshold using one thread per
// image column. The block index selects the image; each thread walks down its
// column (stepping W bytes per row), so neighbouring threads read neighbouring
// bytes. Per-column counts are then combined by a shared-memory tree
// reduction whose first stride is rsize.
// The dynamic shared array is indexed by threadIdx.x and read up to index
// W-1, so it must hold at least W ints.
__global__ void ThreadPerColCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns, int rsize)
{
    extern __shared__ int col_counts[];//size is set by the third launch parameter

    const int image = blockIdx.y * gridDim.x + blockIdx.x;
    const int column = threadIdx.y * blockDim.x + threadIdx.x;
    const unsigned char *pixels = AllPixels[image];

    // Walk down this thread's column.
    int above = 0;
    for (int row = 0; row < H; ++row)
    {
        if (pixels[column + row * W] > Threshold)
        {
            ++above;
        }
    }
    col_counts[threadIdx.x] = above;
    __syncthreads();

    // Tree reduction: halve the stride each pass; the "+ stride < W" test
    // skips partners that lie beyond the last column.
    for (int stride = rsize; stride > 0; stride >>= 1)
    {
        const bool active = (threadIdx.x < stride) && (threadIdx.x + stride < W);
        if (active)
        {
            col_counts[threadIdx.x] += col_counts[threadIdx.x + stride];
        }
        // Executed by every thread, outside the branch.
        __syncthreads();
    }

    if (threadIdx.x == 0)
    {
        AllReturns[image] = col_counts[0];
    }
}
// Launches the row-per-thread baseline (ThreadPerRowCounter) over nImages
// images and blocks until the device has finished.
//   AllPixels  - device array of nImages device pointers, one per image
//   AllReturns - device array receiving one count per image
void cuda_Benchmark(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{
    // One block per image, one thread per row, one shared int per row.
    ThreadPerRowCounter<<<nImages, H, sizeof(int)*H>>> (
        Threshold,
        W, H,
        AllPixels,
        AllReturns);
    // A rejected launch configuration is reported through cudaGetLastError();
    // check it so a kernel that never ran is not mistaken for success.
    checkCudaErrors(cudaGetLastError());
    //wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
// Rounds v up to the nearest power of two; a value that already is a power of
// two is returned unchanged. v == 0 yields 0 because the decrement and the
// final increment both wrap around.
unsigned next_power_of_2(unsigned v){
    // Copy the highest set bit of (v - 1) into every lower bit position
    // (shifts of 1, 2, 4, 8 and 16), then add one.
    unsigned smeared = v - 1;
    for (unsigned shift = 1; shift <= 16; shift <<= 1)
    {
        smeared |= smeared >> shift;
    }
    return smeared + 1;
}
// Launches the column-per-thread counter (ThreadPerColCounter) over nImages
// images and blocks until the device has finished.
//   AllPixels  - device array of nImages device pointers, one per image
//   AllReturns - device array receiving one count per image
void cuda_Benchmark1(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{
    // First stride of the in-kernel tree reduction: a power of two that is at
    // least ceil(W/2). Rounding W/2 down first (the previous W/2) lost the
    // last column whenever W == 2^k + 1: for W == 5 the stride started at 2,
    // so column 4 was never added to the total.
    int rsize = next_power_of_2((W + 1) / 2);
    // One block per image, one thread per column, one shared int per column.
    ThreadPerColCounter<<<nImages, W, sizeof(int)*W>>> (
        Threshold,
        W, H,
        AllPixels,
        AllReturns, rsize);
    // A rejected launch configuration is reported through cudaGetLastError().
    checkCudaErrors(cudaGetLastError());
    //wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
// Test driver: fills n_img images with pseudo-random values in [0, 20), runs
// the row-wise and the column-wise counters on the same device data, and
// compares the per-image results. Returns 0 when they agree, 1 otherwise.
int main(){
    const int my_W = 720;
    const int my_H = 540;
    const int n_img = 128;
    const int my_thresh = 10;
    const size_t img_pixels = (size_t)my_W*my_H;   // pixels per image
    const size_t n_pixels = img_pixels*n_img;      // total, computed in size_t
    U8 **img_p, **img_ph;
    U8 *img, *img_h;
    int *res, *res_h, *res_h1;
    // img_ph: host copy of the table of per-image device pointers.
    img_ph = (U8 **)malloc(n_img*sizeof(U8*));
    if (!img_ph) {std::cout << "host allocation failed" << std::endl; return 1;}
    // Every CUDA call is checked so a failed allocation or copy stops the run
    // instead of producing a meaningless comparison.
    checkCudaErrors(cudaMalloc(&img_p, n_img*sizeof(U8*)));
    checkCudaErrors(cudaMalloc(&img, n_pixels*sizeof(U8)));
    img_h = new U8[n_pixels];
    for (size_t i = 0; i < n_pixels; i++) img_h[i] = rand()%20;
    checkCudaErrors(cudaMemcpy(img, img_h, n_pixels*sizeof(U8), cudaMemcpyHostToDevice));
    // Images are stored back to back in one device allocation.
    for (int i = 0; i < n_img; i++) img_ph[i] = img+img_pixels*i;
    checkCudaErrors(cudaMemcpy(img_p, img_ph, n_img*sizeof(U8*), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMalloc(&res, n_img*sizeof(int)));
    // Reference result from the row-wise kernel.
    cuda_Benchmark(n_img, my_W, my_H, img_p, res, my_thresh);
    res_h = new int[n_img];
    checkCudaErrors(cudaMemcpy(res_h, res, n_img*sizeof(int), cudaMemcpyDeviceToHost));
    // Result under test from the column-wise kernel (reuses the same buffer).
    cuda_Benchmark1(n_img, my_W, my_H, img_p, res, my_thresh);
    res_h1 = new int[n_img];
    checkCudaErrors(cudaMemcpy(res_h1, res, n_img*sizeof(int), cudaMemcpyDeviceToHost));
    int status = 0;
    for (int i = 0; i < n_img; i++) if (res_h[i] != res_h1[i]) {std::cout << "mismatch at: " << i << " was: " << res_h1[i] << " should be: " << res_h[i] << std::endl; status = 1; break;}
    // Release device and host memory on every path.
    checkCudaErrors(cudaFree(res));
    checkCudaErrors(cudaFree(img));
    checkCudaErrors(cudaFree(img_p));
    delete[] res_h1;
    delete[] res_h;
    delete[] img_h;
    free(img_ph);
    return status;
}
$ nvcc -o t49 t49.cu -I/usr/local/cuda/samples/common/inc
$ cuda-memcheck ./t49
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t49
==1756== NVPROF is profiling process 1756, command: ./t49
==1756== Profiling application: ./t49
==1756== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 72.02% 54.325ms 1 54.325ms 54.325ms 54.325ms ThreadPerRowCounter(int, int, int, unsigned char**, int*)
24.71% 18.639ms 2 9.3195ms 1.2800us 18.638ms [CUDA memcpy HtoD]
3.26% 2.4586ms 1 2.4586ms 2.4586ms 2.4586ms ThreadPerColCounter(int, int, int, unsigned char**, int*, int)
0.00% 3.1040us 2 1.5520us 1.5360us 1.5680us [CUDA memcpy DtoH]
API calls: 43.63% 59.427ms 3 19.809ms 18.514us 59.159ms cudaMalloc
41.70% 56.789ms 2 28.394ms 2.4619ms 54.327ms cudaDeviceSynchronize
14.02% 19.100ms 4 4.7749ms 17.749us 18.985ms cudaMemcpy
0.52% 705.26us 96 7.3460us 203ns 327.21us cuDeviceGetAttribute
0.05% 69.268us 1 69.268us 69.268us 69.268us cuDeviceTotalMem
0.04% 50.688us 1 50.688us 50.688us 50.688us cuDeviceGetName
0.04% 47.683us 2 23.841us 14.352us 33.331us cudaLaunchKernel
0.00% 3.1770us 1 3.1770us 3.1770us 3.1770us cuDeviceGetPCIBusId
0.00% 1.5610us 3 520ns 249ns 824ns cuDeviceGetCount
0.00% 1.0550us 2 527ns 266ns 789ns cuDeviceGet
$
(Quadro K2000, CUDA 9.2.148, Fedora Core 27)
(The next_power_of_2 code is lifted from this answer)
I don't claim correctness for this code or any other code that I post. Anyone using any code I post does so at their own risk. I merely claim that I have attempted to address the questions in the original posting, and provide some explanation thereof. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.

Related

Memory copy by two CUDA kernels - why speed differs?

Can anyone help me understand performance difference between memCopy2dA and memCopy2dB kernels?
They are supposed to copy 2D data with size xLen,yLen from one place to the other but they are using different strategies:
When memCopy2dA is used, the blocks/threads cover the whole 2D space, since each thread of this kernel is supposed to copy only one data point.
When memCopy2dB is used, blocks/threads are created only for one whole X row, and each thread then loops over the Y direction to copy all the data.
According to profiler (nvvp) in both cases GPU access memory pattern is 100% and X dimension is big enough to saturate device for "B" kernel (Titan X, 24SM). Unfortunately "B" kernel is slower and on my machine result is:
GB/s: 270.715
GB/s: 224.405
Additional question: Is it even possible to be close to theoretical memory bandwidth limit which is 336.48 GB/s (3505MHz * 384 bits * 2 / 8)? At least my tests shows max always around 271-272 GB/s.
Test code:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <chrono>
// One-element-per-thread copy of an xLen-by-yLen array stored row by row.
// The 2D block/thread indices give the (x, y) position; threads that fall
// outside the array do nothing.
template<typename T>
__global__ void memCopy2dA(T *in, T *out, size_t xLen, size_t yLen) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= xLen || row >= yLen) {
        return;
    }
    // Linear offset of (col, row); evaluated in size_t because xLen is size_t.
    const size_t offset = row * xLen + col;
    out[offset] = in[offset];
}
// Column-walking copy of an xLen-by-yLen array stored row by row: each thread
// owns one x position (xi) and copies that column element by element,
// advancing the linear offset by xLen per row. Threads whose xi is not below
// xLen do nothing.
template<typename T>
__global__ void memCopy2dB(T *in, T *out, size_t xLen, size_t yLen) {
int xi = blockIdx.x * blockDim.x + threadIdx.x;
if (xi < xLen) {
size_t idx = xi; // linear offset of (xi, row 0)
for (int y = 0; y < yLen; ++y) {
out[idx] = in[idx];
idx += xLen; // same column, next row
}
}
}
// Blocks until the device is idle, then prints the most recent CUDA error,
// if any. The error is only reported; execution continues afterwards.
static void waitForCuda() {
    cudaDeviceSynchronize();
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return;
    }
    printf("Error: %s\n", cudaGetErrorString(status));
}
// Bandwidth benchmark: times numOfRepetitions launches of memCopy2dA and then
// of memCopy2dB on the same pair of device buffers and prints the achieved
// GB/s for each. The buffers are allocated but never initialised, and the
// results of cudaMalloc/cudaFree are not checked.
int main() {
typedef float T;
size_t xLen = 24 * 32 * 64; //49152
size_t yLen = 1024;
size_t dataSize = xLen * yLen * sizeof(T);
T *dInput;
cudaMalloc(&dInput, dataSize);
T *dOutput;
cudaMalloc(&dOutput, dataSize);
const int numOfRepetitions = 100;
double gigabyte = 1000 * 1000 * 1000; // decimal gigabyte (10^9 bytes)
{
// Variant A: 2D grid, one thread per element.
dim3 threadsPerBlock(64, 1);
dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x,
(yLen + threadsPerBlock.y - 1) / threadsPerBlock.y);
auto startTime = std::chrono::high_resolution_clock::now();
for (int i = 0; i < numOfRepetitions; ++i) {
memCopy2dA <<< numBlocks, threadsPerBlock >>> (dInput, dOutput, xLen, yLen);
waitForCuda(); // host-side wall-clock timing, so each launch is synchronised
}
auto stopTime = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = stopTime - startTime;
// 2 * dataSize: every element is read once and written once per repetition.
std::cout << "GB/s: " << (2 * dataSize * numOfRepetitions) / elapsed.count() / gigabyte << std::endl;
}
{
// Variant B: 1D grid, one thread per column, looping over all rows.
dim3 threadsPerBlock(64);
dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x);
auto startTime = std::chrono::high_resolution_clock::now();
for (int i = 0; i < numOfRepetitions; ++i) {
memCopy2dB <<< numBlocks, threadsPerBlock >>> (dInput, dOutput, xLen, yLen);
waitForCuda();
}
auto stopTime = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = stopTime - startTime;
std::cout << "GB/s: " << ((2 * dataSize * numOfRepetitions) / elapsed.count()) / gigabyte << std::endl;
}
cudaFree(dInput);
cudaFree(dOutput);
return 0;
}
compiled with:
nvcc -std=c++11 memTest.cu -o memTest

I found a way to speed up the memCopy2dB kernel. Here are the tests performed on a 1080Ti (the TITAN X is no longer available to me).
Code from question part yields following results:
GB/s: 365.423
GB/s: 296.678
more or less it is the same percentage difference as observed earlier on Titan X.
And now modified memCopy2dB kernel looks like:
// Column-walking copy with a block-wide barrier before every row, which keeps
// the warps of a block on the same row at the same time.
// The barrier is deliberately placed outside the bounds check: __syncthreads()
// must be reached by every thread of the block, so when xLen is not a multiple
// of the block size the out-of-range threads still run the loop (yLen is the
// same for all threads) and only skip the copy itself.
template<typename T>
__global__ void memCopy2dB(T *in, T *out, size_t xLen, size_t yLen) {
    int xi = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inRange = xi < xLen;
    size_t idx = xi; // linear offset of (xi, row 0)
    for (int y = 0; y < yLen; ++y) {
        __syncthreads(); // <------ this line added
        if (inRange) {
            out[idx] = in[idx];
        }
        idx += xLen; // same column, next row
    }
}
There is a lot of information about how important are coalesced memory operations on warp level when all threads in warp should access same aligned segments of memory.
But it seems that synchronizing warps in a block makes coalescing possible on inter-warp level probably utilizing better memory bus width on different GPUs <- this is just my "explanation" to this problem since I could not find any literature on that.
Anyway adding this one not needed line (since from code logic I do not need to sychronize warps) gives me following results for both kernels:
GB/s: 365.255
GB/s: 352.026
So even though the code execution is slowed down by the synchronization, we get much better results. I have tried this technique on some of my own code that processes data with the same access pattern as memCopy2dB, and it gave me a nice speedup.

Atomic Add on Cuda not working..

My problem is to find out the number of integer points in n dimensional sphere using CUDA. I dont understand what is wrong with the below code but it is giving 0 output all the time. CUDA compute capability is 2.0 and tool kit version is 3.10.
Thanks for all the help.
// Each thread takes one linear index i, writes its base-w digits into digit[],
// sums the squared offsets (digit[j]-h)^2 over ndim digits, and bumps *count
// when that sum is below radius^2.
// NOTE(review): several things here look unintended; they are flagged inline
// and left unchanged.
__global__ void count_in(int pow_rad, int ndim,int *digit,int w,unsigned int *count,double radius)
{
long int i,j;
int rem,idx,sq,num;
int iy=blockDim.y * blockIdx.y + threadIdx.y;
int ix=blockDim.x * blockIdx.x + threadIdx.x;
int width=gridDim.x*blockDim.x;
// NOTE(review): the host computes w as 2*floor(r)+1; an offset of 2*w+1 here
// looks too large for centring digits in [0, w) -- confirm the intended value.
int h=2*w+1;
i=iy*width+ix;
// NOTE(review): i == pow_rad is let through; if valid indices are
// [0, pow_rad) this should be >= -- confirm.
if(i>pow_rad) return;
sq=0;
idx=0;
num=i;
// NOTE(review): digit is one buffer shared by all threads, and every thread
// writes digit[0..ndim) here and below, so threads overwrite each other.
// Presumably per-thread scratch storage was intended.
for(j=0;j<ndim;j++)
{digit[j]=0;}
// Decompose the index into base-w digits, least significant first.
while(num!=0)
{
rem=num%w;
num/=w;
digit[idx]=rem;
idx++;
}
for(j=0;j<ndim;j++)
{sq+=(digit[j]-h)*(digit[j]-h);}
if(sq<(radius*radius))
// atomicInc's second argument is the wrap-around bound, not the increment:
// with a bound of 1 the counter can only hold 0 or 1.
atomicInc(count,(unsigned int)1);
// Threads that returned early above never reach this barrier.
__syncthreads();
}
// Runs ntrials random trials: picks a radius r and a dimension count nd,
// launches count_in over w^nd candidate points (w = 2*floor(r)+1) and prints
// the count read back from the device.
int main(int argc, char* argv[])
{
    const long ntrials = 5;
    for (int n = 0; n < ntrials; ++n) {
        int *digit;
        unsigned int *count;
        std::cout<<n<<std::endl;
        unsigned int num = 0;
        // Select radius and number of dimensions at random
        const double r = drand48() * (RMAX - RMIN) + RMIN;
        const int nd = lrand48() % (MAXDIM - 1) + 1;
        cudaMalloc((void**) &digit,sizeof(int)*nd);
        cudaMalloc((void**) &count,sizeof(unsigned int));
        cudaMemset(count,0,sizeof(unsigned int));
        int h=(int)floor(r);
        int w=2*h+1;
        std::cout << "###"<< r <<" "<< nd<< std::endl;
        // pow_rad = w^nd. It must start at 1: the original multiplied an
        // uninitialised variable, so the point count was garbage.
        // NOTE(review): w^nd can overflow int for large r or nd; the bounds
        // RMAX and MAXDIM are not visible here.
        int pow_rad = 1;
        for(int i=1;i<=nd;i++)
            pow_rad*=w;
        int width=(int)sqrt((double)pow_rad);
        // Call your function
        // The square grid of 32x32 blocks covers at least pow_rad threads.
        dim3 dimBlock(32,32);
        dim3 dimGrid((width/32)+1,(width/32)+1);
        count_in<<<dimGrid,dimBlock>>>(pow_rad, nd,digit,w,count,r);
        // Report a rejected launch instead of printing a stale zero.
        cudaError_t launchStatus = cudaGetLastError();
        if (launchStatus != cudaSuccess)
            std::cout << "launch failed: " << cudaGetErrorString(launchStatus) << std::endl;
        cudaMemcpy(&num,count,sizeof(unsigned int),cudaMemcpyDeviceToHost);
        std::cout << "-->"<<num << std::endl;
        // Free the per-trial device buffers; the original leaked them on
        // every iteration.
        cudaFree(digit);
        cudaFree(count);
    }
}

I didn't look at all of your code, but the lines
atomicInc(count,(unsigned int)1);
seems to show a common misunderstanding of the atomicInc function. The second argument is not the amount to increment by, but the wrap-around bound: atomicInc stores (old >= val) ? 0 : old + 1, so once the variable has reached that bound, the next call resets it to zero. With the value you specified (1), count can only ever alternate between 0 and 1, so it never accumulates the total you expect.
If you change atomicInc to atomicAdd, or if you change the modulus to something large enough that it will never be reached, it should work better.

Why is my CUDA implementation equally fast as my CPU implementation

I created some code to do a 2D convolution on a 1300x1300 grayscale image and a 15x15 kernel, in standard C++ and in CUDA. Both versions:
CPU:
#include <iostream>
#include <exception>
#define N 1300
#define K 15
#define K2 ((K - 1) / 2)
// Flattens the 2D position (x, y) into a linear offset, with x selecting a
// run of my consecutive elements and y the position inside it. The mx
// template argument does not take part in the computation.
template<int mx, int my>
inline int index(int x, int y)
{
    const int runStart = x * my;
    return runStart + y;
}
// CPU reference: direct 2D convolution of an N x N image with a K x K kernel,
// skipping taps that fall outside the image.
int main() {
    // The trailing "()" value-initialises the inputs to zero. The original
    // left them uninitialised, so the loop below read indeterminate values.
    double *image = new double[N * N]();
    double *kernel = new double[K * K]();
    double *result = new double[N * N];
    for (int x=0; x<N; ++x)
        for (int y=0; y<N; ++y)
        {
            double r = 0;
            for(int i=0; i<K; ++i)
                for(int j=0; j<K; ++j)
                {
                    // (x+i-K2, y+j-K2) is the image position under tap (i, j).
                    if (x + i - K2 >= 0 and
                        x + i - K2 < N and
                        y + j - K2 >= 0 and
                        y + j - K2 < N)
                    {
                        r += kernel[index<K,K>(i,j)] * image[index<N,N>(x+i-K2, y+j-K2)];
                    }
                }
            result[index<N,N>(x, y)] = r;
        }
    delete[] image;
    delete[] kernel;
    delete[] result;
}
GPU:
#include <iostream>
#include <exception>
// ignore, just for error handling
// Carries a source location (line and file name) so that an error detected
// later can be reported against the place where the call was written.
struct ErrorHandler {
    int d_line;          // source line
    char const *d_file;  // source file name; the pointer is stored, not copied

    ErrorHandler(int line, char const *file)
    {
        d_line = line;
        d_file = file;
    }
};
#define EH ErrorHandler(__LINE__, __FILE__)
// Checks a CUDA status code streamed into an ErrorHandler. On success the
// handler is returned unchanged; on failure the error text and the stored
// file/line are written to std::cerr and std::exception is thrown.
ErrorHandler operator<<(ErrorHandler eh, cudaError_t err)
{
    if (err == cudaSuccess)
    {
        return eh;
    }
    std::cerr << cudaGetErrorString( err ) << " in " << eh.d_file << " at line " << eh.d_line << '\n';
    throw std::exception();
}
// end.
#define N 1300
#define K 15
#define K2 ((K - 1) / 2)
// Device-side flattening of the 2D position (x, y): x selects a run of my
// consecutive elements and y the position inside it. The mx template argument
// does not take part in the computation.
template<int mx, int my>
__device__ inline int index(int x, int y)
{
    const int runStart = x * my;
    return runStart + y;
}
// Direct 2D convolution: each thread computes one output element
// result(x, y) as the sum of kernel(i, j) * image(x+i-K2, y+j-K2) over the
// K x K window, skipping taps that fall outside the N x N image.
// As written, x and y both come from blockIdx, so the kernel expects one
// block per output element; the thread index is not used.
__global__ void kernelkernel(double *image, double *kernel, double *result)
{
int x = blockIdx.x;
int y = blockIdx.y; // becomes: int y = threadIdx.x;
double r = 0;
for(int i=0; i<K; ++i)
for(int j=0; j<K; ++j)
{
// Bounds test for the image position under tap (i, j).
if (x + i - K2 >= 0 and
x + i - K2 < N and
y + j - K2 >= 0 and
y + j - K2 < N)
{
// Both operands are read from the pointer arguments on every tap.
r += kernel[index<K,K>(i,j)] * image[index<N,N>(x+i-K2, y+j-K2)];
}
}
result[index<N,N>(x, y)] = r;
}
// GPU driver: allocates host and device buffers, copies image and kernel to
// the device, runs kernelkernel, and copies the result back.
// The host arrays are allocated with new[] and never written before being
// copied, so the convolution runs on indeterminate data.
int main() {
double *image = new double[N * N];
double *kernel = new double[K * K];
double *result = new double[N * N];
double *image_cuda;
double *kernel_cuda;
double *result_cuda;
// EH << call: throws (see operator<<) when the call does not return cudaSuccess.
EH << cudaMalloc((void **) &image_cuda, N*N*sizeof(double));
EH << cudaMalloc((void **) &kernel_cuda, K*K*sizeof(double));
EH << cudaMalloc((void **) &result_cuda, N*N*sizeof(double));
EH << cudaMemcpy(image_cuda, image, N*N*sizeof(double), cudaMemcpyHostToDevice);
EH << cudaMemcpy(kernel_cuda, kernel, K*K*sizeof(double), cudaMemcpyHostToDevice);
// N x N grid of blocks with a single thread in each block.
dim3 grid ( N, N );
kernelkernel<<<grid, 1>>>(image_cuda, kernel_cuda, result_cuda);
// replace previous 2 statements with:
// kernelkernel<<<N, N>>>(image_cuda, kernel_cuda, result_cuda);
// The launch itself is not checked; a failure would surface in this copy.
EH << cudaMemcpy(result, result_cuda, N*N*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree( image_cuda );
cudaFree( kernel_cuda );
cudaFree( result_cuda );
delete[] image;
delete[] kernel;
delete[] result;
}
I would expect the cuda code to be a lot faster, however:
$ nvprof ./gpuversion
==17806== NVPROF is profiling process 17806, command: ./gpuversion
==17806== Profiling application: ./gpuversion
==17806== Profiling result:
Time(%) Time Calls Avg Min Max Name
99.89% 3.83149s 1 3.83149s 3.83149s 3.83149s kernelkernel(double*, double*, double*)
0.07% 2.6420ms 1 2.6420ms 2.6420ms 2.6420ms [CUDA memcpy DtoH]
0.04% 1.5111ms 2 755.54us 736ns 1.5103ms [CUDA memcpy HtoD]
And:
$ time ./cpuversion
real 0m3.382s
user 0m3.371s
sys 0m0.012s
Their difference is statistically insignificant. The CUDA-kernel takes approximately 3-4 seconds, why isn't it a lot faster? Is my code run in parallel?
PS: I'm new to CUDA, so I could be missing something trivial.
SOLUTION
What I found out, is that CUDA does not let you access memory willy-nilly from blocks. I guess the general strategy of CUDA programming is:
allocate and copy memory from RAM to cuda using cudaMalloc and cudaMemCpy
divide the workload among blocks and threads in such a way that the memory accessed by different blocks doesn't overlap much.
If there is overlap between the memory used by blocks, start each block by copying the memory inside a shared array. Notice that:
the size of this array must be known compile time
its size is limited
this memory is shared by each thread in ONE block, so __shared__ double foo[10] allocates 10 doubles for each BLOCK.
copy the memory needed by one block to the shared variables inside the kernel. Of course, you use the different threads to do this 'efficiently'
sync the threads, such that all data is there before it is used.
process the data, and write the result to the output array of the kernel
synch again, I'm not sure why, but everyone on the internet is doing it :S
copy the GPU memory back to RAM
clean up the GPU memory.
This gives the following code. It is mex-code, for Matlab for the structural similarity, which also works via a sliding kernel, but over 2 images and with a different aggregate than the dot-product.
// author: Herbert Kruitbosch, CC: be nice, include my name in documentation/papers/publications when used
#include <matrix.h>
#include <mex.h>
#include <cmath>
#include <iostream>
#include <fstream>
#include <iostream>
#include <stdio.h>
// Terminates the process when a CUDA call failed: prints the error text with
// the given file and line, then exits with EXIT_FAILURE. Does nothing when
// err is cudaSuccess.
static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess)
    {
        return;
    }
    printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
    exit( EXIT_FAILURE );
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define TILE_WIDTH 31
// Similarity ratio (c + 2*v0*v1) / (c + v1^2 + v0^2); c is added to both the
// numerator and the denominator.
__device__ inline double sim(double v0, double v1, double c)
{
    const double numerator = c + 2*v0*v1;
    const double denominator = c + v1*v1 + v0*v0;
    return numerator / denominator;
}
// Linear offset of element (row, col) in an array whose columns are stored
// contiguously, each rows elements long. The cols argument is not used.
__device__ inline int index(int rows, int cols, int row, int col)
{
    const int columnStart = col * rows;
    return columnStart + row;
}
// Computes one structural-similarity value per output pixel from a k x k
// weighted window over the test and reference images.
// Each block first stages a TILE_WIDTH x TILE_WIDTH patch of both images in
// shared memory (zero outside the image), then every thread evaluates the
// window anchored at its own (threadIdx.y, threadIdx.x) inside the tile.
// Images are addressed as col * rows + row, i.e. columns are contiguous.
// Assumes blockDim is (block_width, block_width) with
// block_width = TILE_WIDTH - k + 1 -- TODO confirm against the launch.
__global__ void ssimkernel(double *test, double *reference, const double * __restrict__ kernel, double *ssim, int k, int rows, int cols, int tile_batches_needed)
{
int radius = k / 2;
int block_width = TILE_WIDTH - k + 1;
__shared__ double tile_test [TILE_WIDTH][TILE_WIDTH];
__shared__ double tile_reference[TILE_WIDTH][TILE_WIDTH];
// Cooperative tile load: the block has block_width^2 threads but the tile has
// TILE_WIDTH^2 cells, so each thread loads up to tile_batches_needed cells.
for(int offset=0; offset < tile_batches_needed; ++offset)
{
// Linear cell number handled by this thread in this batch.
int dest = block_width*block_width*offset + threadIdx.y * block_width + threadIdx.x;
int destRow = dest / TILE_WIDTH;
int destCol = dest % TILE_WIDTH;
// Matching image position; the tile starts radius cells before the block's
// first output pixel.
int srcRow = blockIdx.y * block_width + destRow - radius;
int srcCol = blockIdx.x * block_width + destCol - radius;
int src = srcCol * rows + srcRow;
// The last batch can run past the end of the tile.
if (destRow < TILE_WIDTH)
{
if (srcRow >= 0 and srcRow < rows and
srcCol >= 0 and srcCol < cols)
{
tile_test [destRow][destCol] = test [src];
tile_reference[destRow][destCol] = reference[src];
}
else
{
// Positions outside the image are treated as zero.
tile_test [destRow][destCol] = 0;
tile_reference[destRow][destCol] = 0;
}
}
}
// The whole tile must be loaded before any thread reads it.
__syncthreads();
// Pass 1: weighted means of both windows.
double mean_test = 0;
double mean_reference = 0;
for(int i=0; i<k; ++i)
for(int j=0; j<k; ++j)
{
double w = kernel[i * k + j];
mean_test += w * tile_test [threadIdx.y+i][threadIdx.x+j];
mean_reference += w * tile_reference[threadIdx.y+i][threadIdx.x+j];
}
// Pass 2: weighted variances and cross term around those means.
double var_test = 0;
double var_reference = 0;
double correlation = 0;
for(int i=0; i<k; ++i)
for(int j=0; j<k; ++j)
{
double w = kernel[i * k + j];
double a = (tile_test [threadIdx.y+i][threadIdx.x+j] - mean_test );
double b = (tile_reference[threadIdx.y+i][threadIdx.x+j] - mean_reference);
var_test += w * a * a;
var_reference += w * b * b;
correlation += w * a * b;
}
// Output pixel of this thread; threads past the image edge write nothing.
int destRow = blockIdx.y * block_width + threadIdx.y;
int destCol = blockIdx.x * block_width + threadIdx.x;
if (destRow < rows and destCol < cols)
ssim[destCol * rows + destRow] = sim(mean_test, mean_reference, 0.01) * (0.03 + 2*correlation) / (0.03 + var_test + var_reference);
// No shared-memory access follows this barrier.
__syncthreads();
}
// Host-side, type-generic form of the similarity ratio
// (c + 2*v0*v1) / (c + v1^2 + v0^2), evaluated with T's own arithmetic.
// It is not referenced by the host code visible here.
template<typename T>
inline T sim(T v0, T v1, T c)
{
return (c + 2*v0*v1) / (c + v1*v1 + v0*v0);
}
// Integer division rounded up, computed as (a + b - 1) / b; this gives the
// ceiling of a / b for non-negative a and positive b.
inline int upperdiv(int a, int b) {
    const int biased = a + b - 1;
    return biased / b;
}
// MEX entry point: ssim = f(test, reference, kernel).
//   argin[0]  - test image, rows x cols (x channels), read as double
//   argin[1]  - reference image, read with the same dimensions as argin[0]
//   argin[2]  - weighting kernel; its first dimension k is used as the side
//   argout[0] - rows x cols x channels double array of per-pixel values
// Channels are processed one at a time through the same device buffers.
// NOTE(review): the inputs are read as double without a class check, and
// HANDLE_ERROR exits the whole process on a CUDA failure -- confirm both are
// acceptable for the callers.
void mexFunction(int nargout, mxArray *argout[], int nargin, const mxArray *argin[])
{
    // Validate before any device allocation: mexErrMsgIdAndTxt does not return.
    if (nargin < 3)
        mexErrMsgIdAndTxt("ssim:nargin", "Expected three inputs: test image, reference image, kernel.");
    mwSize rows = mxGetDimensions(argin[0])[0];
    mwSize cols = mxGetDimensions(argin[0])[1];
    mwSize k = mxGetDimensions(argin[2])[0];
    mwSize channels = mxGetNumberOfDimensions(argin[0]) <= 2 ? 1 : mxGetDimensions(argin[0])[2];
    // The kernel window must fit inside one shared-memory tile; otherwise
    // block_width below would be zero or negative.
    if (k == 0 || k > TILE_WIDTH)
        mexErrMsgIdAndTxt("ssim:kernelSize", "Kernel side must be between 1 and %d.", TILE_WIDTH);
    // mxCreateNumericArray takes mwSize dimensions; the original int array
    // has the wrong element type wherever mwSize is wider than int.
    mwSize dims[] = {rows, cols, channels};
    argout[0] = mxCreateNumericArray(3, dims, mxDOUBLE_CLASS, mxREAL);
    double *test = (double *)mxGetData(argin[0]);
    double *reference = (double *)mxGetData(argin[1]);
    double *gaussian = (double *)mxGetData(argin[2]);
    double *ssim = (double *)mxGetData(argout[0]);
    double *test_cuda;
    double *reference_cuda;
    double *gaussian_cuda;
    double *ssim_cuda;
    const size_t plane_bytes = rows*cols*sizeof(double);  // one channel
    HANDLE_ERROR( cudaMalloc((void **) &test_cuda, plane_bytes) );
    HANDLE_ERROR( cudaMalloc((void **) &reference_cuda, plane_bytes) );
    HANDLE_ERROR( cudaMalloc((void **) &gaussian_cuda, k*k*sizeof(double)) );
    HANDLE_ERROR( cudaMalloc((void **) &ssim_cuda, plane_bytes) );
    HANDLE_ERROR( cudaMemcpy(gaussian_cuda, gaussian, k*k*sizeof(double), cudaMemcpyHostToDevice) );
    // Each block produces block_width x block_width outputs from one tile.
    const int kernel_side = (int)k;
    int block_width = TILE_WIDTH - kernel_side + 1;
    // Number of load rounds needed for block_width^2 threads to fill the tile.
    int tile_batches_needed = upperdiv(TILE_WIDTH*TILE_WIDTH, block_width*block_width);
    for(mwSize c=0; c<channels; ++c)
    {
        HANDLE_ERROR( cudaMemcpy(test_cuda, test + rows*cols*c, plane_bytes, cudaMemcpyHostToDevice) );
        HANDLE_ERROR( cudaMemcpy(reference_cuda, reference + rows*cols*c, plane_bytes, cudaMemcpyHostToDevice) );
        dim3 dimGrid(upperdiv((int)cols, block_width), upperdiv((int)rows, block_width), 1);
        dim3 dimBlock(block_width, block_width, 1);
        ssimkernel<<<dimGrid, dimBlock>>>(test_cuda, reference_cuda, gaussian_cuda, ssim_cuda, kernel_side, (int)rows, (int)cols, tile_batches_needed);
        // A rejected launch is reported through cudaGetLastError().
        HANDLE_ERROR( cudaGetLastError() );
        HANDLE_ERROR( cudaMemcpy(ssim + rows*cols*c, ssim_cuda, plane_bytes, cudaMemcpyDeviceToHost) );
    }
    cudaFree( test_cuda );
    cudaFree( reference_cuda );
    cudaFree( gaussian_cuda );
    cudaFree( ssim_cuda );
}

kernelkernel<<<grid, 1>>>
This is a significant issue; threads on nVidia GPUs work in warps of 32 threads. However, you've only assigned a single thread to each block, which means 31 of those threads will sit idle while a single thread does work. And usually, for kernels where you have the flexibility, you'll usually want several warps per block rather than just one.
You could get an immediate speedup by using N blocks and N threads per block, rather than using N^2 blocks.
Actually, N might be too big, since there's an upper limit on the number of threads per block. Although you could choose a suitable M so that you use N/M threads per block, and N * M blocks.
In fact, you'll probably get the best results in this regard by picking some M (I'm guessing 256 will probably be near optimal) and launching with L=ceiling(N*N/M) blocks and M threads per block. Then each thread reconstructs an index in [0, M*L) based on its block and thread ID, and those whose index is in [0,N*N) will proceed to split that index into an x and y coordinate and do work.

Accessing global memory in a kernel is costly, because of its latency. A global memory request (both reading and writing) takes hundreds of clock cycles to complete. You want to minimise the amount of times global memory is accessed, and access it in contiguous blocks.
If each piece of data is accessed exactly once, there's nothing to do about the latency, but that's seldom the case. And definitely not the case in your code, where the kernel array is accessed by all threads in the same pattern, and a lot of image is accessed by multiple threads as well.
The solution for that is to start the kernel by fetching the data from the high-latency global memory into the low-latency shared memory. Shared memory is a block of memory on the multiprocessor, and its latency is comparable to that of registers. So most simple kernels follow a structure like this:
Each thread fetches data from global memory to shared memory. You want to fetch data in contiguous sequences if possible, as global memory is accessed through transactions. If there's not enough data for all threads to fetch, leave some of them idle.
Threads operate on the data in shared memory.
Data is written from shared memory back to global memory in the same pattern as it was fetched in step 1.
Shared memory is shared by all threads within a thread block. Which leads us to the second big issue in your code: you're not using thread blocks at all. Threads in one block run on one multiprocessor, share shared memory, can be synchronised with each other etc. You need to organise threads into blocks well to get the most out of them.
The grid of blocks is just a mechanism to be able to run more blocks at one invocation. All the goodies of parallel instruction execution and shared memory access are within a block. The grid of blocks is just "yeah, sorry, my data's so big a single block won't do, just run many of them."
You're doing the exact opposite: your blocks have one thread each, which means that in each step, only one thread from each warp runs on the multiprocessor (based on your device's compute capability and the number of warp schedulers available, this means something like 2–4 threads on one multiprocessor at most).
You'll have to re-structure your threads to mirror the data access patterns, and prefetch data into shared memory. This will give you the performance boost you expect.
The above is just a short summary. Refer to the CUDA programming guide for details on block organisation, shared memory, and global memory transactions.

If you're using global memory in CUDA, all the data access will be synchronized in something like queue, and you'll receive almost linear solution, not parallel.
Also, transfering a large dataset from your RAM memory to GPU memory also takes a lot of time (the speed of bus is limited).
So, i think you have to somehow parallel your data across computation units in your GPU (part them into shared memory).
Check this to see solution of how to improve your GPU memory usage in the case that similar to yours.

Example of increasing the work per thread in CUDA

Algorithm :
I'm writing a program with CUDA and the problem is the following:
Two matrices A (n * 128) and B (m * 128)
I take the first row of A, and I compute the distance between that vector and all the rows of B, one by one.
I write the result of each distance on a row of a matrix C, so the element C(i,j) of C contains the distance between row i of A and row j of B.
and I proceed with the next row of A.
I've implemented it this way: I've got a grid made by ( n * m ) blocks, and 128 threads per block. ( 1 * 128 ).
QUESTION: The program runs successfully with the expected results but the time execution is only around 5 to 10 times faster than the one-threaded CPU version of it. So I would like to know how to increase the work per thread before reduction in order to increase performance.
Kernel code (original : Not optimized)
// Squared Euclidean distance between one row of A and one row of B.
// blockIdx.x selects the row of A, blockIdx.y the row of B, and the SIZE
// threads of the block (indexed by threadIdx.y) each handle one coordinate.
// The per-coordinate squared differences are summed by a shared-memory tree
// reduction and thread 0 writes the sum to C[bx * m + by].
// Expects blockDim.y == SIZE; n is not used by the kernel.
__global__ void EuclideanDistances( float *A, float *B , float *C , int n , int m)
{
    // SIZE is equal to 128
    __shared__ float accumResult[SIZE];
    // MAPPING
    int bx = blockIdx.x; // n
    int by = blockIdx.y; // m
    int ty = threadIdx.y; // 128
    float sA = A [bx * SIZE + ty];
    float sB = B [by * SIZE + ty];
    accumResult[ty] = (sA - sB) * (sA - sB);
    __syncthreads();
    // Parallel tree-reduction
    for (int stride = SIZE/2 ; stride > 0 ; stride >>= 1)
    {
        if (ty < stride)
        {
            accumResult[ty] += accumResult [stride + ty];
        }
        // The barrier must be reached by every thread of the block, so it
        // sits outside the "ty < stride" branch. Inside the branch only part
        // of the block would reach it, which is undefined behaviour.
        __syncthreads();
    }
    // Writing results to output matrix
    if (ty == 0)
        C [bx * m + by] = accumResult[0];
}
UPDATE
Now, I'm using another mapping : Instead of taking a grid of n by m blocks and a block of 128 threads, I'm increasing the number of threads within a block in order to decrease the number of blocks.
New mapping:
Block of 128 by 8 threads (total of 1024 threads, which is the max size)
Grid of n/8 by m/8 blocks
Unfortunately, it's giving wrong results.
Optimized kernel code (to be updated)
// Attempted 8-pairs-per-block variant of the distance kernel (reported above
// as giving wrong results). threadIdx.y indexes the SIZE coordinates and
// threadIdx.x one of 8 slots; each slot gets its own column of the shared
// arrays and its own tree reduction.
// NOTE(review): the inline notes mark the places that look inconsistent with
// that intent; the code itself is left unchanged.
__global__ void EuclideanDistances( float *A, float *B , float *C, int n , int m)
{
__shared__ float accumResult[SIZE][8];
__shared__ float sA[SIZE][8];
__shared__ float sB[SIZE][8];
int bx = blockIdx.x; // n / 8
int by = blockIdx.y; // m / 8
int tx = threadIdx.x; // 8
int ty = threadIdx.y; // 128
// NOTE(review): the block index is multiplied by the thread index, so for
// tx == 0 every block reads elements 0..SIZE-1. Presumably the row was meant
// to be something like (bx * 8 + tx) -- confirm.
int i = bx * tx * SIZE + ty;
int j = by * tx * SIZE + ty;
sA[ty][tx] = A [i];
sB[ty][tx] = B[j];
__syncthreads();
accumResult[ty][tx] = (sA[ty][tx] - sB[ty][tx]) * (sA[ty][tx] - sB[ty][tx]);
__syncthreads();
// Reduction
for (int stride = SIZE/2 ; stride > 0 ; stride>>=1)
if (ty < stride)
{
accumResult[ty][tx] += accumResult [stride + ty][tx];
// This barrier is inside the "ty < stride" branch, so only part of the block
// reaches it.
__syncthreads();
}
// Every thread of the block executes this store, and all 8 tx slots target
// the same element C[bx * m + by].
C[bx * m + by] = accumResult[0][tx];
}
HOST CODE (allocations + kernel calls)
// Host driver: builds two random matrices, runs both distance kernels on
// them, and prints the first 50 results of each side by side.
// NOTE(review): the kernels launched below are named EuclideanDistances1 and
// EuclideanDistances2, while the listings above both define
// EuclideanDistances -- presumably renamed in the real file; confirm.
int main()
{
int m = 20000; //rows of matrixB (allocated below as m * SIZE floats)
int n = 4000; //rows of matrixA (allocated below as n * SIZE floats)
srand((unsigned)time(0));
// Host Allocations
float *matrixA = (float *) malloc (n * SIZE * sizeof(float));
for(int i=0; i < n * SIZE; i++)
matrixA[i] = (float) (rand()%100)+1;
float *matrixB = (float *) malloc (m * SIZE * sizeof(float));
for(int i=0; i < m * SIZE; i++)
matrixB[i] = (float) (rand()%100)+1;
// One result per (row of A, row of B) pair for each kernel.
float *results_kernel1 = (float *) malloc (n * m * sizeof(float));
float *results_kernel2 = (float *) malloc (n * m * sizeof(float));
//Device Allocation
// None of the CUDA calls below has its return value checked.
float *d_matrixA;
float *d_matrixB;
cudaMalloc((void **)&d_matrixA, n * SIZE * sizeof(float));
cudaMalloc((void **)&d_matrixB, m * SIZE * sizeof(float));
cudaMemcpy(d_matrixA , matrixA , n * SIZE * sizeof(float) , cudaMemcpyHostToDevice);
cudaMemcpy(d_matrixB , matrixB , m * SIZE * sizeof(float) , cudaMemcpyHostToDevice);
float *d_results_kernel1;
float *d_results_kernel2;
cudaMalloc((void **)&d_results_kernel1 , n * m * sizeof(float));
cudaMalloc((void **)&d_results_kernel2 , n * m * sizeof(float));
// Kernel 1: one block per (row of A, row of B) pair, 128 threads along y.
dim3 threads1 (1 , 128);
dim3 blocks1 (n , m);
EuclideanDistances1 <<<blocks1 , threads1>>> (d_matrixA , d_matrixB , d_results_kernel1 , n , m);
cudaDeviceSynchronize();
cudaMemcpy(results_kernel1 , d_results_kernel1 , n * m *sizeof(float) , cudaMemcpyDeviceToHost);
cudaFree(d_results_kernel1);
// Kernel 2: 8 x 128 threads per block and an (n/8) x (m/8) grid, rounded up.
dim3 threads2 (8 , 128); // 1024 threads per block (maximum)
dim3 blocks2 (ceil((float)n/8) , ceil((float)m/8));
EuclideanDistances2 <<<blocks2 , threads2>>> (d_matrixA , d_matrixB , d_results_kernel2 , n , m);
cudaDeviceSynchronize();
cudaMemcpy(results_kernel2 , d_results_kernel2 , n * m *sizeof(float) , cudaMemcpyDeviceToHost);
cudaFree(d_results_kernel2);
// Visualising and comparing results
for (int i = 0 ; i < 50 ; i++)
std::cout << "kernel1 : " << results_kernel1[i] << " | kernel2 : " << results_kernel2[i] << std::endl;
// d_matrixA and d_matrixB are not freed before returning.
free(matrixA);
free(matrixB);
free(results_kernel1);
free(results_kernel2);
return 0;
}
PS: I have CUDA 6.0 with a NVIDIA GTX 650 (compute capability 3.0)

It seems your question has 2 components:
why isn't my second kernel working?
how do I make my code run faster?
Why isn't my second kernel working?
You had several issues:
indexing problems in initial calculation of i, j as well as the index for storing the C value.
`__syncthreads()` was called inside a conditional block, which is not allowed unless every thread in the block takes the same branch
item 1 was the key element to get the code working.
How do I make my code run faster?
This is more involved. First of all, your attempt at "increasing work per thread" didn't do anything of the kind, it was merely an increase in the number of threads per block (from 128 to 8*128). Each thread was doing approximately the same amount of work. Furthermore, in the process of going to a 2D threadblock for this attempt, I believe a couple of bad things happened:
various coalescing and shared-memory-bank-conflict load and store patterns were broken.
effective occupancy went down, due to the amount of shared memory required per block.
The net effect of the second kernel was to approximately double the execution time. So that is not what we want.
However, increasing work per thread may be a good idea, along with using shared memory, as well as trying to preserve good (global, shared) memory access patterns, as well as allowing for increased occupancy.
What follows is a work-in-progress along those lines. The following code has your second kernel fixed, along with timing infrastructure, as well as full data verification, as well as 2 new kernels. The first new kernel (#3) is what I would call a "naive" kernel. It simply allocates one thread per output point, and each thread loops through the necessary vectors, computing its individual result. No usage of shared memory, or even much attention to coalescing or any other optimization. However, with a tweak to the threadblock configuration ((16,16) -> (8,32) threads), which I observed in @talonmies' answer (now deleted), this kernel performs significantly (3x) faster than your "fast" kernel. After further thought about the (8,32) observation, I concluded that the next attempt at optimization should focus on:
elimination of the usage of a parallel reduction to compute the vector distance (i.e. allow adjacent threads to use a straight for-loop to loop through the vectors)
maximization of benefit from the cache
efficient usage of shared memory
insist on perfect global coalescing/perfect usage of shared memory for all reads and writes
Item 4 prompted the question in the comments "may I transpose the matrices?" With this permission, it's possible to re-organize the data to facilitate item 4 above. Item 2 above is addressed in my "fast" kernel (#4) by loading the B vector into shared memory, while allowing the cache to mostly focus on caching the A vectors, hopefully reducing cache-thrashing (A is the smaller of the 2 vector arrays, at about 2MB - fermi L2 is 768K, Kepler L2 is 1.5MB). By delivering A in transposed form, and effectively "transposing" B on-chip from shared memory, it's possible to use a straight for-loop to compute the vector distance, while allowing adjacent threads to have perfectly coalesced reads and writes, as well as "efficient" use of shared memory (i.e. non-bank-conflicted loads, and broadcast reads).
For my particular timing, (Quadro5000 cc2.0 GPU, CUDA 6, RHEL 5.5) I see that your "fast" kernel requires about 2 seconds, my "naive" kernel requires about 0.7 seconds, and my "fast" kernel requires about 0.2 seconds, albeit with transposed (A,C) data.
EDIT: I've made one additional optimization, that is to have each block compute multiple (CHKSIZE) B vectors at one time. You can set CHKSIZE to 1 to see the previous result (~0.2sec). I found CHKSIZE of 4 gave good improvement. This is an attack at attempting to exploit the data re-use of A. With this additional optimization at CHKSIZE of 4, the kernel time for kernel 4 drops to about 0.1 second.
Following is the code and a sample run:
$ cat t460.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
// with the launch configurations used in main: N must be evenly divisible by 8, M must be evenly divisible by 32 and by CHKSIZE
#define SIZE 128
#define N 4000
#define M 20000
#define CHKSIZE 4
// Squared Euclidean distance between row blockIdx.x of A and row blockIdx.y
// of B, written to C[blockIdx.x * m + blockIdx.y].
// Launch layout: one block per output element, SIZE threads along threadIdx.y.
// Shared memory: SIZE floats. `n` is not read by the kernel.
__global__ void EuclideanDistances1( float *A, float *B , float *C , int n , int m)
{
    // one partial sum per vector component (SIZE is equal to 128)
    __shared__ float accumResult[SIZE];

    const int rowA = blockIdx.x;  // n
    const int rowB = blockIdx.y;  // m
    const int elem = threadIdx.y; // 128

    const float a = A[rowA * SIZE + elem];
    const float b = B[rowB * SIZE + elem];
    // Barrier kept from the original; only per-thread values were written so far.
    __syncthreads();

    const float diff = a - b;
    accumResult[elem] = diff * diff;
    __syncthreads();

    // Tree reduction: the active half shrinks by two on each pass, and the
    // barrier is executed by the whole block after each pass.
    int half = SIZE / 2;
    while (half > 0)
    {
        if (elem < half)
            accumResult[elem] += accumResult[half + elem];
        __syncthreads();
        half >>= 1;
    }

    // Thread 0 holds the complete sum and publishes it.
    if (elem == 0)
        C[rowA * m + rowB] = accumResult[elem];
    __syncthreads();
}
// Squared Euclidean distances for 8 rows of A against one row of B per block.
// Launch layout: block (8, SIZE); blockIdx.x selects a group of 8 A rows,
// blockIdx.y selects the B row. Result for A row r and B row blockIdx.y goes
// to C[r * m + blockIdx.y].
// Shared memory: three SIZE x 8 float arrays. `n` is not read by the kernel.
__global__ void EuclideanDistances2( float *A, float *B , float *C, int n , int m)
{
    __shared__ float accumResult[SIZE][8];
    __shared__ float sA[SIZE][8];
    __shared__ float sB[SIZE][8];

    const int lane = threadIdx.x;                // 8
    const int elem = threadIdx.y;                // 128
    const int rowA = (blockIdx.x * 8) + lane;    // blockIdx.x covers n / 8
    const int rowB = blockIdx.y;                 // m

    // Stage both operands in shared memory.
    sA[elem][lane] = A[rowA * SIZE + elem];
    sB[elem][lane] = B[rowB * SIZE + elem];
    __syncthreads();

    const float diff = sA[elem][lane] - sB[elem][lane];
    accumResult[elem][lane] = diff * diff;
    __syncthreads();

    // Reduction down the SIZE dimension, independently for each of the 8 lanes.
    int half = SIZE / 2;
    while (half > 0)
    {
        if (elem < half)
            accumResult[elem][lane] += accumResult[half + elem][lane];
        __syncthreads();
        half >>= 1;
    }

    // One thread per lane writes that lane's total.
    if (elem == 0)
        C[rowA * m + rowB] = accumResult[0][lane];
}
//naive kernel
// Naive kernel: one thread per output element. Thread (x, y) of the grid
// computes the squared distance between row x of A and row y of B in a plain
// loop and stores it in C[x * m + y]. No shared memory is used.
__global__ void EuclideanDistances3( float *A, float *B , float *C, int n , int m){
    const int rowA = threadIdx.x + blockDim.x * blockIdx.x;
    const int rowB = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads that fall outside the n x m output have nothing to do.
    if ((rowA >= n) || (rowB >= m))
        return;

    const float *vecA = A + (rowA * SIZE);
    const float *vecB = B + (rowB * SIZE);
    float sum = 0.0f;
    for (int k = 0; k < SIZE; k++){
        const float diff = vecA[k] - vecB[k];
        sum += diff * diff;
    }
    C[(rowA * m) + rowB] = sum;
}
//optimized kernel
// Optimized kernel.
//   A (n vectors, 4000 in the sample run) is expected column-major: A(SIZE, n)
//   B (m vectors, 20000 in the sample run) is expected row-major:   B(m, SIZE)
//   C is written column-major: C(m, n)
// The kernel assumes the number of threads per block equals SIZE.
// Each block stages CHKSIZE vectors of B in shared memory (CHKSIZE*SIZE floats)
// and every thread then walks across the vectors of A, producing CHKSIZE
// results per A vector.
__global__ void EuclideanDistances4( const float *A, const float *B , float *C, const int n , const int m){
    __shared__ float my_sB[CHKSIZE*SIZE]; // CHKSIZE vectors of B

    // One block per chunk of CHKSIZE rows of B. With the launch used in main
    // this loop body runs once per block; the stride lets a smaller grid cover
    // several chunks.
    for (int chunk = blockIdx.x; (chunk*CHKSIZE) < m; chunk += gridDim.x){
        const int lane = threadIdx.x;
        const int firstRowB = chunk*CHKSIZE;

        // load the chunk's B vectors into shared memory
        for (int v = 0; v < CHKSIZE; v++)
            my_sB[(v*SIZE)+lane] = B[((firstRowB+v)*SIZE)+lane];
        __syncthreads();

        // loop across all vectors in A, blockDim.x of them per pass
        for (int col = lane; col < n; col += blockDim.x){
            float acc[CHKSIZE];
            for (int v = 0; v < CHKSIZE; v++)
                acc[v] = 0.0f;

            for (int k = 0; k < SIZE; k++){
                const float aVal = A[(n*k)+col];
                // reuse this single read of A for all CHKSIZE B vectors
                for (int v = 0; v < CHKSIZE; v++){
                    const float diff = aVal - my_sB[k + (v*SIZE)];
                    acc[v] += diff * diff;
                }
            }

            // store CHKSIZE results
            for (int v = 0; v < CHKSIZE; v++)
                C[((v+firstRowB)*n)+col] = acc[v];
        }

        // keeps warps from overwriting my_sB for the next chunk while others
        // are still reading the current one
        __syncthreads();
    }
}
// Host reference: returns the sum over the first `size` elements of
// (rA[k] - rB[k])^2, i.e. the squared Euclidean distance of the two vectors.
// Returns 0 when `size` is not positive.
float comp_euclid_sq(const float *rA, const float *rB, const int size){
    float acc = 0.0f;
    int k = 0;
    while (k < size){
        const float diff = rA[k] - rB[k];
        acc += diff * diff;
        ++k;
    }
    return acc;
}
#ifndef CUDA_CHECK
// Reports file/line plus the CUDA error string and exits when a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

// Compares `count` GPU results element-wise with the CPU reference.
// Prints the first mismatch, tagged with `label`, and returns false;
// returns true when every element is identical.
static bool results_match(const float *gpu, const float *cpu, int count, const char *label)
{
    for (int i = 0; i < count; i++) {
        if (gpu[i] != cpu[i]) {
            printf("cpu/%s mismatch at %d, cpu: %f, %s: %f\n", label, i, cpu[i], label, gpu[i]);
            return false;
        }
    }
    return true;
}

// Benchmark driver: builds random A (n x SIZE) and B (m x SIZE), computes a CPU
// reference, then times and verifies kernels 1-4. Kernel 4 is fed a transposed
// copy of A and its output is checked as the transposed result C(m,n).
// Returns 0 on success and 1 on the first verification mismatch.
int main()
{
    float et[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    cudaEvent_t start[4], stop[4];
    for (int k = 0; k < 4; k++) {
        CUDA_CHECK(cudaEventCreate(&start[k]));
        CUDA_CHECK(cudaEventCreate(&stop[k]));
    }

    const int n = N; //MatrixA size : n * SIZE
    const int m = M; //MatrixB size : m * SIZE
    const size_t bytesA = (size_t)n * SIZE * sizeof(float);
    const size_t bytesB = (size_t)m * SIZE * sizeof(float);
    const size_t bytesC = (size_t)n * m * sizeof(float);
    srand((unsigned)time(0));

    // Host Allocations
    float *matrixA = (float *) malloc (bytesA);
    float *matrixB = (float *) malloc (bytesB);
    float *matrixA_T = (float *) malloc (bytesA);
    float *results_kernel = (float *) malloc (bytesC);
    float *cpu_results_kernel = (float *) malloc (bytesC);
    if (!matrixA || !matrixB || !matrixA_T || !results_kernel || !cpu_results_kernel) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    for (int i = 0; i < n * SIZE; i++)
        matrixA[i] = (float) (rand()%100)+1;
    for (int i = 0; i < m * SIZE; i++)
        matrixB[i] = (float) (rand()%100)+1;

    // CPU reference, row-major C(n,m)
    for (int i = 0; i < n*m; i++)
        cpu_results_kernel[i] = comp_euclid_sq(matrixA + ((i/m)*SIZE), matrixB + (i%m)*SIZE, SIZE);

    //Device Allocation
    float *d_matrixA;
    float *d_matrixB;
    float *d_results_kernel;
    CUDA_CHECK(cudaMalloc((void **)&d_matrixA, bytesA));
    CUDA_CHECK(cudaMalloc((void **)&d_matrixB, bytesB));
    CUDA_CHECK(cudaMalloc((void **)&d_results_kernel, bytesC));
    CUDA_CHECK(cudaMemcpy(d_matrixA, matrixA, bytesA, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_matrixB, matrixB, bytesB, cudaMemcpyHostToDevice));

    // ---- kernel 1: one block per output element ----
    dim3 threads1 (1 , SIZE);
    dim3 blocks1 (n , m);
    CUDA_CHECK(cudaEventRecord(start[0]));
    EuclideanDistances1 <<<blocks1 , threads1>>> (d_matrixA , d_matrixB , d_results_kernel , n , m);
    CUDA_CHECK(cudaEventRecord(stop[0]));
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(results_kernel, d_results_kernel, bytesC, cudaMemcpyDeviceToHost));
    if (!results_match(results_kernel, cpu_results_kernel, n*m, "kernel1")) return 1;
    // clear the output so a later kernel cannot pass on stale data
    CUDA_CHECK(cudaMemset(d_results_kernel, 0, bytesC));
    CUDA_CHECK(cudaEventSynchronize(stop[0]));
    CUDA_CHECK(cudaEventElapsedTime(&et[0], start[0], stop[0]));

    // ---- kernel 2: 8 A rows per block ----
    dim3 threads2 (8 , SIZE); // 1024 threads per block (maximum)
    dim3 blocks2 (n/8 , m);   // assumes n evenly divisible by 8
    CUDA_CHECK(cudaEventRecord(start[1]));
    EuclideanDistances2 <<<blocks2 , threads2>>> (d_matrixA , d_matrixB , d_results_kernel , n , m);
    CUDA_CHECK(cudaEventRecord(stop[1]));
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(results_kernel, d_results_kernel, bytesC, cudaMemcpyDeviceToHost));
    // the mismatch message used to label the GPU value "kernel1" here
    if (!results_match(results_kernel, cpu_results_kernel, n*m, "kernel2")) return 1;
    CUDA_CHECK(cudaMemset(d_results_kernel, 0, bytesC));
    CUDA_CHECK(cudaEventSynchronize(stop[1]));
    CUDA_CHECK(cudaEventElapsedTime(&et[1], start[1], stop[1]));

    // ---- kernel 3: naive, one thread per output element ----
    CUDA_CHECK(cudaFuncSetCacheConfig(EuclideanDistances3, cudaFuncCachePreferL1));
    dim3 threads3 (8, 32); // 256 threads per block
    dim3 blocks3 (n/threads3.x , m/threads3.y); // assumes evenly divisible
    CUDA_CHECK(cudaEventRecord(start[2]));
    EuclideanDistances3 <<<blocks3 , threads3>>> (d_matrixA , d_matrixB , d_results_kernel , n , m);
    CUDA_CHECK(cudaEventRecord(stop[2]));
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(results_kernel, d_results_kernel, bytesC, cudaMemcpyDeviceToHost));
    if (!results_match(results_kernel, cpu_results_kernel, n*m, "kernel3")) return 1;
    CUDA_CHECK(cudaMemset(d_results_kernel, 0, bytesC));
    CUDA_CHECK(cudaEventSynchronize(stop[2]));
    CUDA_CHECK(cudaEventElapsedTime(&et[2], start[2], stop[2]));

    // ---- kernel 4: transposed A, B chunks in shared memory ----
    // transpose matrix A
    for (int i = 0; i < n; i++)
        for (int j = 0; j < SIZE; j++)
            matrixA_T[(j*n)+i] = matrixA[(i*SIZE)+j];
    CUDA_CHECK(cudaMemcpy(d_matrixA, matrixA_T, bytesA, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaFuncSetCacheConfig(EuclideanDistances4, cudaFuncCachePreferL1));
    dim3 threads4(SIZE); // one thread per vector element
    dim3 blocks4(m/CHKSIZE);
    CUDA_CHECK(cudaEventRecord(start[3]));
    EuclideanDistances4 <<<blocks4 , threads4>>> (d_matrixA , d_matrixB , d_results_kernel , n , m);
    CUDA_CHECK(cudaEventRecord(stop[3]));
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(results_kernel, d_results_kernel, bytesC, cudaMemcpyDeviceToHost));
    // test for correct transposed result C(m,n)
    for (int i = 0; i < n; i++)
        for (int j = 0; j < m; j++)
            if (results_kernel[(j*n)+i] != cpu_results_kernel[(i*m)+j]) {
                printf("cpu/kernel4 mismatch at %d,%d, cpu: %f, kernel4: %f\n", i,j, cpu_results_kernel[(i*m)+j], results_kernel[(j*n)+i]);
                return 1;
            }
    CUDA_CHECK(cudaEventSynchronize(stop[3]));
    CUDA_CHECK(cudaEventElapsedTime(&et[3], start[3], stop[3]));

    printf("Success!\n");
    printf("kernel1 : %.fms, kernel2 : %.fms, kernel3 : %.fms, kernel4 : %.fms\n", et[0], et[1], et[2], et[3]);

    // Release everything that was allocated above.
    CUDA_CHECK(cudaFree(d_results_kernel));
    CUDA_CHECK(cudaFree(d_matrixA));
    CUDA_CHECK(cudaFree(d_matrixB));
    for (int k = 0; k < 4; k++) {
        CUDA_CHECK(cudaEventDestroy(start[k]));
        CUDA_CHECK(cudaEventDestroy(stop[k]));
    }
    free(matrixA);
    free(matrixA_T);
    free(matrixB);
    free(results_kernel);
    free(cpu_results_kernel);
    return 0;
}
$ nvcc -O3 -arch=sm_20 -o t460 t460.cu
$ ./t460
Success!
kernel1 : 2213ms, kernel2 : 4660ms, kernel3 : 691ms, kernel4 : 99ms
$
Hopefully that will get you going with more ideas of things to work on. You may get different timings of course on your cc3.0 device.
Are further optimizations possible? Probably. The first target I would look at would be to figure out how to take advantage of the data-reuse opportunities on vector A. (data re-use of vector B is already handled in the kernel 4 by loading it into shared memory. There may be ways to use some shared memory to store portions of A to make the code run even faster.)
I guess I should also mention that following the lead of the code you provided, this code is computing the square of the euclidean distance. A trivial modification to the kernels can make it compute the actual euclidean distance instead (C[...] = sqrtf(...);) The validation I have included, however, assumes the results are "in-range" for perfect storage of an integer quantity in a float. Your test case satisfies this requirement, but otherwise the validation code would need to be modified (if sqrtf were used).

count3's in cuda is very slow

I have written a small program in CUDA that counts how many 3's are in a C array and prints them.
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cstdlib>
// Counts elements equal to 3: each thread inspects one element of `a` and,
// on a match, atomically adds 1 to *count. Threads whose global index is
// N or larger do nothing.
__global__ void incrementArrayOnDevice(int *a, int N, int *count)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // shared-memory variant that was tried and left disabled:
    //__shared__ int s_a[512]; // one for each thread
    //s_a[threadIdx.x] = a[idx];
    if (idx >= N)
        return;
    //if( s_a[threadIdx.x] != 3 )
    if (a[idx] != 3)
        return;
    atomicAdd(count, 1);
}
#ifndef CUDA_CHECK
// Reports file/line plus the CUDA error string and exits when a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

// Fills a host array with 3 at every index divisible by three (1 elsewhere),
// copies it to the device, counts the 3's with incrementArrayOnDevice and
// prints the count. Every CUDA call is checked so that a failed allocation,
// copy or launch is reported instead of printing an uninitialized count.
int main(void)
{
    const int N = 16777216;
    const size_t bytes = sizeof(int) * N;

    // allocate array on host
    int *a_h = (int*)malloc(bytes); // host memory
    if (a_h == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; ++i)
        a_h[i] = (i % 3 == 0 ? 3 : 1);

    // allocate arrays on device
    int *a_d = NULL; // device memory
    CUDA_CHECK(cudaMalloc(&a_d, bytes));
    // copy data from host to device
    CUDA_CHECK(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice));

    // do calculation on device
    const int blockSize = 512;
    const int nBlocks = (N + blockSize - 1) / blockSize; // round up
    printf("number of blocks: %d\n", nBlocks);

    int count = 0;
    int *devCount = NULL;
    CUDA_CHECK(cudaMalloc(&devCount, sizeof(int)));
    CUDA_CHECK(cudaMemset(devCount, 0, sizeof(int)));
    incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
    CUDA_CHECK(cudaGetLastError()); // launch-configuration errors

    // retrieve result from device (blocking copy; also reports kernel faults)
    CUDA_CHECK(cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d\n", count);

    free(a_h);
    CUDA_CHECK(cudaFree(a_d));
    CUDA_CHECK(cudaFree(devCount));
    return 0;
}
The result I get is:
real 0m3.025s
user 0m2.989s
sys 0m0.029s
When I run it on the CPU with 4 threads I get:
real 0m0.101s
user 0m0.100s
sys 0m0.024s
Note that the GPU is an old one - I don't know the exact model because I do not have root access to it, but the OpenGL version it runs is 1.2 using the MESA driver.
Am I doing something wrong? What can I do to make it run faster?
Note: I have tried using buckets for each block (so the atomicAdd()s would be reduced for each one) but I get exactly the same performance.
I have also tried copying the 512 integers that are assigned to this block to a shared block of memory (you can see it in the comments) and the time is the same again.

This is in response to your question "What can I do to make it run faster?" As I mentioned in the comments, there are issues (probably) with the timing methodology, and the main suggestion I have for speed improvement is to use a "classical parallel reduction" algorithm. The following code implements a better (in my opinion) timing measurement, and also converts your kernel to a reduction style kernel:
#include <stdio.h>
#include <assert.h>
#include <cstdlib>
#define N (1<<24)
#define nTPB 512
#define NBLOCKS 32
// Reduction-style count of elements equal to 3.
// Each thread tallies its share of `a` (stepping by the total number of
// launched threads), the block combines the tallies in shared memory, and
// thread 0 of each block adds the block total to *count with one atomicAdd.
// Shared memory: nTPB ints, so blockDim.x must not exceed nTPB.
__global__ void incrementArrayOnDevice(int *a, int n, int *count)
{
    __shared__ int lcnt[nTPB];

    const int tid = threadIdx.x;
    const int step = gridDim.x * blockDim.x;

    // per-thread tally
    int mine = 0;
    for (int idx = blockIdx.x * blockDim.x + tid; idx < n; idx += step) {
        if (a[idx] == 3) mine++;
    }
    lcnt[tid] = mine;
    __syncthreads();

    // block-level tree reduction; assume blockDim.x is a power of 2
    for (int half = blockDim.x >> 1; half > 0; half >>= 1) {
        if (tid < half) lcnt[tid] += lcnt[tid + half];
        __syncthreads();
    }

    if (tid == 0) atomicAdd(count, lcnt[0]);
}
#ifndef CUDA_CHECK
// Reports file/line plus the CUDA error string and exits when a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

// Times the count-3 computation three ways using CUDA events:
//   GPU total   : host->device copy + memset + kernel + result copy
//   GPU compute : kernel only
//   CPU         : single-threaded host loop over the same data
// and prints both counts and the three times in seconds.
int main(void)
{
    cudaEvent_t gstart1, gstart2, gstop1, gstop2, cstart, cstop;
    float etg1 = 0.0f, etg2 = 0.0f, etc = 0.0f;
    CUDA_CHECK(cudaEventCreate(&gstart1));
    CUDA_CHECK(cudaEventCreate(&gstart2));
    CUDA_CHECK(cudaEventCreate(&gstop1));
    CUDA_CHECK(cudaEventCreate(&gstop2));
    CUDA_CHECK(cudaEventCreate(&cstart));
    CUDA_CHECK(cudaEventCreate(&cstop));

    // allocate array on host
    int *a_h = (int*)malloc(sizeof(int) * N); // host memory
    if (a_h == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; ++i)
        a_h[i] = (i % 3 == 0 ? 3 : 1);

    // allocate arrays on device
    int *a_d = NULL; // device memory
    CUDA_CHECK(cudaMalloc(&a_d, sizeof(int) * N));
    // the kernel's shared array holds nTPB ints, so the block size must not exceed nTPB
    const int blockSize = nTPB;
    const int nBlocks = NBLOCKS;
    printf("number of blocks: %d\n", nBlocks);
    int count = 0;
    int *devCount = NULL;
    CUDA_CHECK(cudaMalloc(&devCount, sizeof(int)));

    // copy data from host to device (start of the "GPU total" interval)
    CUDA_CHECK(cudaEventRecord(gstart1));
    CUDA_CHECK(cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice));
    // single zeroing of the counter; it is part of the timed GPU work
    CUDA_CHECK(cudaMemset(devCount, 0, sizeof(int)));

    // do calculation on device ("GPU compute" interval)
    CUDA_CHECK(cudaEventRecord(gstart2));
    incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
    CUDA_CHECK(cudaEventRecord(gstop2));
    CUDA_CHECK(cudaGetLastError()); // launch-configuration errors

    // retrieve result from device
    CUDA_CHECK(cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(gstop1));
    printf("GPU count = %d\n", count);

    // host reference count, bracketed by events as well
    int hostCount = 0;
    CUDA_CHECK(cudaEventRecord(cstart));
    for (int i = 0; i < N; i++)
        if (a_h[i] == 3) hostCount++;
    CUDA_CHECK(cudaEventRecord(cstop));
    printf("CPU count = %d\n", hostCount);

    // cstop is the last event recorded, so waiting for it covers the earlier ones
    CUDA_CHECK(cudaEventSynchronize(cstop));
    CUDA_CHECK(cudaEventElapsedTime(&etg1, gstart1, gstop1));
    CUDA_CHECK(cudaEventElapsedTime(&etg2, gstart2, gstop2));
    CUDA_CHECK(cudaEventElapsedTime(&etc, cstart, cstop));
    printf("GPU total time = %fs\n", (etg1/(float)1000) );
    printf("GPU compute time = %fs\n", (etg2/(float)1000));
    printf("CPU time = %fs\n", (etc/(float)1000));

    free(a_h);
    CUDA_CHECK(cudaFree(a_d));
    CUDA_CHECK(cudaFree(devCount));
    CUDA_CHECK(cudaEventDestroy(gstart1));
    CUDA_CHECK(cudaEventDestroy(gstart2));
    CUDA_CHECK(cudaEventDestroy(gstop1));
    CUDA_CHECK(cudaEventDestroy(gstop2));
    CUDA_CHECK(cudaEventDestroy(cstart));
    CUDA_CHECK(cudaEventDestroy(cstop));
    return 0;
}
When I run this on a reasonably fast GPU (a Quadro 5000, a little slower than a Tesla M2050) I get the following:
number of blocks: 32
GPU count = 5592406
CPU count = 5592406
GPU total time = 0.025714s
GPU compute time = 0.000793s
CPU time = 0.017332s
We see that the GPU is substantially faster than this (naive, single-threaded) CPU implementation for the compute portion. When we add in the cost to transfer the data, the GPU version is slower but is not 30x slower.
By way of comparison, when I timed your original algorithm, I got numbers like this:
GPU total time = 0.118131s
GPU compute time = 0.093213s
My system config for this was Xeon X5560 CPU, RHEL 5.5, CUDA 5.0, Quadro5000 GPU.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js