CUDA atomicAdd with floats [duplicate]

CUDA atomicAdd with floats [duplicate] - c++

This question already has answers here:
Define atomicAdd function doesn't work in CUDA
(1 answer)
CUDA atomicAdd() produces wrong result
(1 answer)
Closed 6 years ago.
I am trying to add all elements of a large vector on the CPU and the GPU and benchmark the result.
My CPU implementation looks like this
void reductionCPU(float *result, float *input)
{
int i;
for (i = 0; i < SIZE; i++)
{
*result += input[i];
}
}
And my GPU kernel like this:
__global__ void reductionKernel(float *result, float *input)
{
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * BLOCK_X_NAIVE * BLOCK_COUNT_X + col;
if (index < SIZE)
{
atomicAdd(result, input[index]);
}
}
(Entire minimal working example below)
Both are quite simple but behave a little strange. If I let the CPU and GPU add only numbers the results always match. The output is:
CPU Time: 22.596495 ms, bandwidth: 3.540372 GB/s
---block_x:32, block_y:32, dim_x:100, dim_y:98
GPU Time: 30.625248 ms, bandwidth: 2.612224 GB/s CPU result matches
GPU result in naive atomic add. CPU: 10000000.000000, GPU:
10000000.000000
If I want to add arbitrary floating points, however, the results never match.
CPU Time: 22.472712 ms, bandwidth: 3.559873 GB/s
---block_x:32, block_y:32, dim_x:100, dim_y:98
GPU Time: 30.625153 ms, bandwidth: 2.612232 GB/s CPU result does not
match GPU result in naive atomic add. CPU: 4996870656.000000, GPU:
4996921856.000000, diff:-51200.000000
Changing the amount of elements that are added to something like 50 results in correct calculation in some runs and in others to a false calculation. Increasing the size results in an increased number of incorrect calculations.
I assume that is has something to do with the precision of floating points but that is only a guess.
The same issue occurs if I add only tens or random whole floats between 0 and 10:
for (i = 0; i < SIZE; i++)
{
input[i] = floorf(((float)rand() / (float)(RAND_MAX)) * 10);
//input[i] = 10.0;
}
I develop using the latest version of Visual Studio on Windows 10. Further I found that code generation parameter could have an influence as well. I use compute_30,sm_30. Replacing the 30 with 60 does not work on my GPU, the result then is always 0.0.
If there is any information missing please let me know.
Here is the entire minimal working code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
cudaError_t reductionWithCuda(float *result, float *input);
__global__ void reductionKernel(float *result, float *input);
void reductionCPU(float *result, float *input);
#define SIZE 10000000
//#define SIZE 50
#define BLOCK_X_NAIVE 32
#define BLOCK_Y_NAIVE 32
#define BLOCK_COUNT_X 100
int main()
{
int i;
float *input;
float resultCPU, resultGPU;
double cpuTime, cpuBandwidth;
input = (float*)malloc(SIZE * sizeof(float));
resultCPU = 0;
resultGPU = 0;
srand((int)time(NULL));
auto start = std::chrono::high_resolution_clock::now();
auto end = std::chrono::high_resolution_clock::now();
for (i = 0; i < SIZE; i++)
{
input[i] = ((float)rand() / (float)(RAND_MAX)) * 1000; // random floats between 0 and 1000
//input[i] = 1.0;
}
start = std::chrono::high_resolution_clock::now();
reductionCPU(&resultCPU, input);
end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
cpuTime = (diff.count() * 1000);
cpuBandwidth = (sizeof(float) * SIZE * 2) / (cpuTime * 1000000);
printf("CPU Time: %f ms, bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);
reductionWithCuda(&resultGPU, input);
if (resultCPU != resultGPU)
printf("CPU result does not match GPU result in naive atomic add. CPU: %f, GPU: %f, diff:%f\n", resultCPU, resultGPU, (resultCPU - resultGPU));
else
printf("CPU result matches GPU result in naive atomic add. CPU: %f, GPU: %f\n", resultCPU, resultGPU);
cudaDeviceReset();
return 0;
}
void reductionCPU(float *result, float *input)
{
int i;
for (i = 0; i < SIZE; i++)
{
*result += input[i];
}
}
__global__ void reductionKernel(float *result, float *input)
{
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * BLOCK_X_NAIVE * BLOCK_COUNT_X + col;
if (index < SIZE)
{
atomicAdd(result, input[index]);
}
}
cudaError_t reductionWithCuda(float *result, float *input)
{
dim3 dim_grid, dim_block;
float *dev_input = 0;
float *dev_result = 0;
cudaError_t cudaStatus;
cudaEvent_t start, stop;
float elapsed = 0;
double gpuBandwidth;
dim_block.x = BLOCK_X_NAIVE;
dim_block.y = BLOCK_Y_NAIVE;
dim_block.z = 1;
dim_grid.x = BLOCK_COUNT_X;
dim_grid.y = (int)ceil((float)SIZE / (float)(BLOCK_X_NAIVE * BLOCK_Y_NAIVE * BLOCK_COUNT_X));
dim_grid.z = 1;
printf("\n---block_x:%d, block_y:%d, dim_x:%d, dim_y:%d\n", dim_block.x, dim_block.y, dim_grid.x, dim_grid.y);
cudaSetDevice(0);
cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
cudaMalloc((void**)&dev_result, sizeof(float));
cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
reductionKernel << <dim_grid, dim_block >> >(dev_result, dev_input);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
gpuBandwidth = (sizeof(float) * SIZE * 2) / (elapsed * 1000000);
printf("GPU Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
cudaDeviceSynchronize();
cudaStatus = cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dev_input);
cudaFree(dev_result);
return cudaStatus;
}

Related

How can i make GPU process much faster than CPU process with CUDA 10.0 in Visual Studio 2017?

Smart developer!
I am the beginner of CUDA programming and I have a big problem with my code.
Following code is a sample code from Nvidia and I changed a little bit for showing the GPU process much faster than from CPU process. However, after compiling this code, I got a unexpected result from that CPU process is much faster than GPU process.
This is my laptop gpu info.
This is my cuda code for Visual Studio 2017.
===========================================================================
#define N 10
This is add2 function() from GPU process
`___global____ void add2(int *a, int *b, int *c) {`
// GPU block from grid sector
//int tid = blockIdx.x; // checking the data of index = if you
insert min of N, you will get slow result from CPU. But if you put big number, this show much faster than CPU
// GPU thread
//int tid = threadIdx.x; // Same result as blockIdx.x
// GPU unexpected vector // Same result as above
int tid = threadIdx.x + blockIdx.x*blockDim.x;
if (tid < N) {
c[tid] = a[tid] + b[tid];
}
}
This is add function() from CPU process
`void add(int *a, int *b, int *c) {
int tid = 0;
while (tid < N) {
c[tid] = a[tid] + b[tid];
tid += 1;
}
}
This is Main function()
int main() {
// Values for time duration
LARGE_INTEGER tFreq, tStart, tEnd;
cudaEvent_t start, stop;
float tms, ms;
int a[N], b[N], c[N]; // CPU values
int *dev_a, *dev_b, *dev_c; // GPU values----------------------------------------------
// Creating alloc for GPU--------------------------------------------------------------
cudaMalloc((void**)&dev_a, N * sizeof(int));
cudaMalloc((void**)&dev_b, N * sizeof(int));
cudaMalloc((void**)&dev_c, N * sizeof(int));
// Fill 'a' and 'b' from CPU
for (int i = 0; i < N; i++) {
a[i] = -i;
b[i] = i * i;
}
// Copy values of CPU to GPU values----------------------------------------------------
cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
//////////////////////////////////////
QueryPerformanceFrequency(&tFreq); // Frequency set
QueryPerformanceCounter(&tStart); // Time count Start
// CPU operation
add(a, b, c);
//////////////////////////////////////
QueryPerformanceCounter(&tEnd); // TIme count End
tms = ((tEnd.QuadPart - tStart.QuadPart) / (float)tFreq.QuadPart) * 1000;
//////////////////////////////////////
// show result of CPU
cout << fixed;
cout.precision(10);
cout << "CPU Time=" << tms << endl << endl;
for (int i = 0; i < N; i++) {
printf("CPU calculate = %d + %d = %d\n", a[i], b[i], c[i]);
}
cout << endl;
///////////////////////////////////////
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// GPU operatinog---------------------------------------------------------------------
//add2 <<<N,1 >>> (dev_a, dev_b, dev_c); // block
//add2 << <1,N >> > (dev_a, dev_b, dev_c); // Thread
add2 << <N/32+1, 32 >> > (dev_a, dev_b, dev_c); // grid
///////////////////////////////////////
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
///////////////////////////////////////
// show result of GPU
cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
cout << fixed;
cout.precision(10);
cout << "GPU Time=" << ms << endl << endl;
for (int i = 0; i < N; i++) {
printf("GPU calculate = %d + %d = %d\n", a[i], b[i], c[i]);
}
//Free GPU values
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
This is result of compiling this code.
I want to make GPU process much faster than CPU process.

The GPU is generally actually slower than the CPU for running a single operation. Additionally it takes time to send data to the GPU and read it back again.
The advantage of the GPU is it can execute many operations in parallel.
As you have defined N to be 10 it probably takes longer to upload and download the data than to execute on the CPU. In order to see the advantage of the GPU increase your problem size to something much larger. Ideally you want to execute a minimum of a few operations on each GPU core before you start seeing some benefit. For example with your GPU's 1280 cores you would want to execute something like 4000 operations or more at once to get the benefit of the GPU.

Memory copy by two CUDA kernels - why speed differs?

Can anyone help me understand performance difference between memCopy2dA and memCopy2dB kernels?
They are supposed to copy 2D data with size xLen,yLen from one place to the other but they are using different strategies:
when memCopy2dA is used blocks/threads cover whole 2D space since this kernel is suppose to copy only one data point
when memCopy2dB is used blocks/threads are created only for one whole X row, and then each kernel is looping over Y direction to copy all data.
According to profiler (nvvp) in both cases GPU access memory pattern is 100% and X dimension is big enough to saturate device for "B" kernel (Titan X, 24SM). Unfortunately "B" kernel is slower and on my machine result is:
GB/s: 270.715
GB/s: 224.405
Additional question: Is it even possible to be close to theoretical memory bandwidth limit which is 336.48 GB/s (3505MHz * 384 bits * 2 / 8)? At least my tests shows max always around 271-272 GB/s.
Test code:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <chrono>
template<typename T>
__global__ void memCopy2dA(T *in, T *out, size_t xLen, size_t yLen) {
int xi = blockIdx.x * blockDim.x + threadIdx.x;
int yi = blockIdx.y * blockDim.y + threadIdx.y;
if (xi < xLen && yi < yLen) {
out[yi * xLen + xi] = in[yi * xLen + xi];
}
}
template<typename T>
__global__ void memCopy2dB(T *in, T *out, size_t xLen, size_t yLen) {
int xi = blockIdx.x * blockDim.x + threadIdx.x;
if (xi < xLen) {
size_t idx = xi;
for (int y = 0; y < yLen; ++y) {
out[idx] = in[idx];
idx += xLen;
}
}
}
static void waitForCuda() {
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(err));
}
int main() {
typedef float T;
size_t xLen = 24 * 32 * 64; //49152
size_t yLen = 1024;
size_t dataSize = xLen * yLen * sizeof(T);
T *dInput;
cudaMalloc(&dInput, dataSize);
T *dOutput;
cudaMalloc(&dOutput, dataSize);
const int numOfRepetitions = 100;
double gigabyte = 1000 * 1000 * 1000;
{
dim3 threadsPerBlock(64, 1);
dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x,
(yLen + threadsPerBlock.y - 1) / threadsPerBlock.y);
auto startTime = std::chrono::high_resolution_clock::now();
for (int i = 0; i < numOfRepetitions; ++i) {
memCopy2dA <<< numBlocks, threadsPerBlock >>> (dInput, dOutput, xLen, yLen);
waitForCuda();
}
auto stopTime = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = stopTime - startTime;
std::cout << "GB/s: " << (2 * dataSize * numOfRepetitions) / elapsed.count() / gigabyte << std::endl;
}
{
dim3 threadsPerBlock(64);
dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x);
auto startTime = std::chrono::high_resolution_clock::now();
for (int i = 0; i < numOfRepetitions; ++i) {
memCopy2dB <<< numBlocks, threadsPerBlock >>> (dInput, dOutput, xLen, yLen);
waitForCuda();
}
auto stopTime = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = stopTime - startTime;
std::cout << "GB/s: " << ((2 * dataSize * numOfRepetitions) / elapsed.count()) / gigabyte << std::endl;
}
cudaFree(dInput);
cudaFree(dOutput);
return 0;
}
compiled with:
nvcc -std=c++11 memTest.cu -o memTest

I found a solution how to speedup memCopy2dB kernel. Here are a tests performed on 1080Ti (TITAN X is not available to me anymore).
Code from question part yields following results:
GB/s: 365.423
GB/s: 296.678
more or less it is the same percentage difference as observed earlier on Titan X.
And now modified memCopy2dB kernel looks like:
template<typename T>
__global__ void memCopy2dB(T *in, T *out, size_t xLen, size_t yLen) {
int xi = blockIdx.x * blockDim.x + threadIdx.x;
if (xi < xLen) {
size_t idx = xi;
for (int y = 0; y < yLen; ++y) {
__syncthreads(); // <------ this line added
out[idx] = in[idx];
idx += xLen;
}
}
}
There is a lot of information about how important are coalesced memory operations on warp level when all threads in warp should access same aligned segments of memory.
But it seems that synchronizing warps in a block makes coalescing possible on inter-warp level probably utilizing better memory bus width on different GPUs <- this is just my "explanation" to this problem since I could not find any literature on that.
Anyway adding this one not needed line (since from code logic I do not need to sychronize warps) gives me following results for both kernels:
GB/s: 365.255
GB/s: 352.026
So even if the code execution is slow down by synchronization we get much better results. I have tried this technique on some of my code which was processing data in memCopy2dB access pattern manner and it gave me nice speedup.

Why is my CUDA implementation equally fast as my CPU implementation

I created some code to do a 2D convlution on a 1300x1300 grayscale image and a 15x15 kernel, in standard C++ and in CUDA. Both versions:
CPU:
#include <iostream>
#include <exception>
#define N 1300
#define K 15
#define K2 ((K - 1) / 2)
template<int mx, int my>
inline int index(int x, int y)
{
return x*my + y;
}
int main() {
double *image = new double[N * N];
double *kernel = new double[K * K];
double *result = new double[N * N];
for (int x=0; x<N; ++x)
for (int y=0; y<N; ++y)
{
double r = 0;
for(int i=0; i<K; ++i)
for(int j=0; j<K; ++j)
{
if (x + i - K2 >= 0 and
x + i - K2 < N and
y + j - K2 >= 0 and
y + j - K2 < N)
{
r += kernel[index<K,K>(i,j)] * image[index<N,N>(x+i-K2, y+j-K2)];
}
}
result[index<N,N>(x, y)] = r;
}
delete[] image;
delete[] kernel;
delete[] result;
}
GPU:
#include <iostream>
#include <exception>
// ignore, just for error handling
struct ErrorHandler {
int d_line;
char const *d_file;
ErrorHandler(int line, char const *file) : d_line(line), d_file(file) {};
};
#define EH ErrorHandler(__LINE__, __FILE__)
ErrorHandler operator<<(ErrorHandler eh, cudaError_t err)
{
if (err != cudaSuccess)
{
std::cerr << cudaGetErrorString( err ) << " in " << eh.d_file << " at line " << eh.d_line << '\n';
throw std::exception();
}
return eh;
}
// end.
#define N 1300
#define K 15
#define K2 ((K - 1) / 2)
template<int mx, int my>
__device__ inline int index(int x, int y)
{
return x*my + y;
}
__global__ void kernelkernel(double *image, double *kernel, double *result)
{
int x = blockIdx.x;
int y = blockIdx.y; // becomes: int y = threadIdx.x;
double r = 0;
for(int i=0; i<K; ++i)
for(int j=0; j<K; ++j)
{
if (x + i - K2 >= 0 and
x + i - K2 < N and
y + j - K2 >= 0 and
y + j - K2 < N)
{
r += kernel[index<K,K>(i,j)] * image[index<N,N>(x+i-K2, y+j-K2)];
}
}
result[index<N,N>(x, y)] = r;
}
int main() {
double *image = new double[N * N];
double *kernel = new double[K * K];
double *result = new double[N * N];
double *image_cuda;
double *kernel_cuda;
double *result_cuda;
EH << cudaMalloc((void **) &image_cuda, N*N*sizeof(double));
EH << cudaMalloc((void **) &kernel_cuda, K*K*sizeof(double));
EH << cudaMalloc((void **) &result_cuda, N*N*sizeof(double));
EH << cudaMemcpy(image_cuda, image, N*N*sizeof(double), cudaMemcpyHostToDevice);
EH << cudaMemcpy(kernel_cuda, kernel, K*K*sizeof(double), cudaMemcpyHostToDevice);
dim3 grid ( N, N );
kernelkernel<<<grid, 1>>>(image_cuda, kernel_cuda, result_cuda);
// replace previous 2 statements with:
// kernelkernel<<<N, N>>>(image_cuda, kernel_cuda, result_cuda);
EH << cudaMemcpy(result, result_cuda, N*N*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree( image_cuda );
cudaFree( kernel_cuda );
cudaFree( result_cuda );
delete[] image;
delete[] kernel;
delete[] result;
}
I would expect the cuda code to be a lot faster, however:
$ nvprof ./gpuversion
==17806== NVPROF is profiling process 17806, command: ./gpuversion
==17806== Profiling application: ./gpuversion
==17806== Profiling result:
Time(%) Time Calls Avg Min Max Name
99.89% 3.83149s 1 3.83149s 3.83149s 3.83149s kernelkernel(double*, double*, double*)
0.07% 2.6420ms 1 2.6420ms 2.6420ms 2.6420ms [CUDA memcpy DtoH]
0.04% 1.5111ms 2 755.54us 736ns 1.5103ms [CUDA memcpy HtoD]
And:
$ time ./cpuversion
real 0m3.382s
user 0m3.371s
sys 0m0.012s
Their difference is statistically insignificant. The CUDA-kernel takes approximately 3-4 seconds, why isn't it a lot faster? Is my code run in parallel?
PS: I'm new to CUDA, so I could be missing something trivial.
SOLUTION
What I found out, is that CUDA does not let you access memory willy-nilly from blocks. I guess the general strategy of CUDA programming is:
allocate and copy memory from RAM to cuda using cudaMalloc and cudaMemCpy
divide the workload among blocks and threads in such a way that the memory accessed by different blocks doesn't overlap much.
If there is overlap between the memory used by blocks, start each block by copying the memory inside a shared array. Notice that:
the size of this array must be known compile time
it's size is limited
this memory is shared by each thread in ONE block, so __shared double foo[10] allocates 10 doubles for each BLOCK.
copy the memory needed by one block to the shared variables inside the kernel. Of course, you use the different threads to do this 'efficiently'
sync the threads, such that all data is there before it is used.
process the data, and write the result. it to the output array of the kernel
synch again, I'm not sure why, but everyone on the internet is doing it :S
copy the GPU memory back to RAM
clean up the GPU memory.
This gives the following code. It is mex-code, for Matlab for the structural similarity, which also works via a sliding kernel, but over 2 images and with a different aggregate than the dot-product.
// author: Herbert Kruitbosch, CC: be nice, include my name in documentation/papers/publications when used
#include <matrix.h>
#include <mex.h>
#include <cmath>
#include <iostream>
#include <fstream>
#include <iostream>
#include <stdio.h>
static void HandleError(
cudaError_t err,
const char *file,
int line )
{
if (err != cudaSuccess)
{
printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define TILE_WIDTH 31
__device__ inline double sim(double v0, double v1, double c)
{
return (c + 2*v0*v1) / (c + v1*v1 + v0*v0);
}
__device__ inline int index(int rows, int cols, int row, int col)
{
return row + col*rows;
}
__global__ void ssimkernel(double *test, double *reference, const double * __restrict__ kernel, double *ssim, int k, int rows, int cols, int tile_batches_needed)
{
int radius = k / 2;
int block_width = TILE_WIDTH - k + 1;
__shared__ double tile_test [TILE_WIDTH][TILE_WIDTH];
__shared__ double tile_reference[TILE_WIDTH][TILE_WIDTH];
for(int offset=0; offset < tile_batches_needed; ++offset)
{
int dest = block_width*block_width*offset + threadIdx.y * block_width + threadIdx.x;
int destRow = dest / TILE_WIDTH;
int destCol = dest % TILE_WIDTH;
int srcRow = blockIdx.y * block_width + destRow - radius;
int srcCol = blockIdx.x * block_width + destCol - radius;
int src = srcCol * rows + srcRow;
if (destRow < TILE_WIDTH)
{
if (srcRow >= 0 and srcRow < rows and
srcCol >= 0 and srcCol < cols)
{
tile_test [destRow][destCol] = test [src];
tile_reference[destRow][destCol] = reference[src];
}
else
{
tile_test [destRow][destCol] = 0;
tile_reference[destRow][destCol] = 0;
}
}
}
__syncthreads();
double mean_test = 0;
double mean_reference = 0;
for(int i=0; i<k; ++i)
for(int j=0; j<k; ++j)
{
double w = kernel[i * k + j];
mean_test += w * tile_test [threadIdx.y+i][threadIdx.x+j];
mean_reference += w * tile_reference[threadIdx.y+i][threadIdx.x+j];
}
double var_test = 0;
double var_reference = 0;
double correlation = 0;
for(int i=0; i<k; ++i)
for(int j=0; j<k; ++j)
{
double w = kernel[i * k + j];
double a = (tile_test [threadIdx.y+i][threadIdx.x+j] - mean_test );
double b = (tile_reference[threadIdx.y+i][threadIdx.x+j] - mean_reference);
var_test += w * a * a;
var_reference += w * b * b;
correlation += w * a * b;
}
int destRow = blockIdx.y * block_width + threadIdx.y;
int destCol = blockIdx.x * block_width + threadIdx.x;
if (destRow < rows and destCol < cols)
ssim[destCol * rows + destRow] = sim(mean_test, mean_reference, 0.01) * (0.03 + 2*correlation) / (0.03 + var_test + var_reference);
__syncthreads();
}
template<typename T>
inline T sim(T v0, T v1, T c)
{
return (c + 2*v0*v1) / (c + v1*v1 + v0*v0);
}
inline int upperdiv(int a, int b) {
return (a + b - 1) / b;
}
void mexFunction(int nargout, mxArray *argout[], int nargin, const mxArray *argin[])
{
mwSize rows = mxGetDimensions(argin[0])[0];
mwSize cols = mxGetDimensions(argin[0])[1];
mwSize k = mxGetDimensions(argin[2])[0];
mwSize channels = mxGetNumberOfDimensions(argin[0]) <= 2 ? 1 : mxGetDimensions(argin[0])[2];
int dims[] = {rows, cols, channels};
argout[0] = mxCreateNumericArray(3, dims, mxDOUBLE_CLASS, mxREAL);
double *test = (double *)mxGetData(argin[0]);
double *reference = (double *)mxGetData(argin[1]);
double *gaussian = (double *)mxGetData(argin[2]);
double *ssim = (double *)mxGetData(argout[0]);
double *test_cuda;
double *reference_cuda;
double *gaussian_cuda;
double *ssim_cuda;
HANDLE_ERROR( cudaMalloc((void **) &test_cuda, rows*cols*sizeof(double)) );
HANDLE_ERROR( cudaMalloc((void **) &reference_cuda, rows*cols*sizeof(double)) );
HANDLE_ERROR( cudaMalloc((void **) &gaussian_cuda, k*k*sizeof(double)) );
HANDLE_ERROR( cudaMalloc((void **) &ssim_cuda, rows*cols*sizeof(double)) );
HANDLE_ERROR( cudaMemcpy(gaussian_cuda, gaussian, k*k*sizeof(double), cudaMemcpyHostToDevice) );
int block_width = TILE_WIDTH - k + 1;
int tile_batches_needed = upperdiv(TILE_WIDTH*TILE_WIDTH, block_width*block_width);
for(int c=0; c<channels; ++c)
{
HANDLE_ERROR( cudaMemcpy(test_cuda, test + rows*cols*c, rows*cols*sizeof(double), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(reference_cuda, reference + rows*cols*c, rows*cols*sizeof(double), cudaMemcpyHostToDevice) );
dim3 dimGrid(upperdiv(cols, block_width), upperdiv(rows, block_width), 1);
dim3 dimBlock(block_width, block_width, 1);
ssimkernel<<<dimGrid, dimBlock>>>(test_cuda, reference_cuda, gaussian_cuda, ssim_cuda, k, rows, cols, tile_batches_needed);
HANDLE_ERROR( cudaMemcpy(ssim + rows*cols*c, ssim_cuda, rows*cols*sizeof(double), cudaMemcpyDeviceToHost) );
}
cudaFree( test_cuda );
cudaFree( reference_cuda );
cudaFree( gaussian_cuda );
cudaFree( ssim_cuda );
}

kernelkernel<<<grid, 1>>>
This is a significant issue; threads on nVidia GPUs work in warps of 32 threads. However, you've only assigned a single thread to each block, which means 31 of those threads will sit idle while a single thread does work. And usually, for kernels where you have the flexibility, you'll usually want several warps per block rather than just one.
You could get an immediate speedup by using N blocks and N threads per block, rather than using N^2 blocks.
Actually, N might be too big, since there's an upper limit on the number of threads per block. Although you could choose a suitable M so that that you use N/M threads per block, and N * M blocks.
In fact, you'll probably get the best results in this regard by picking some M (I'm guessing 256 will probably be near optimal) and launching with L=ceiling(N*N/M) blocks and M blocks per thread. Then each thread figures reconstructs an index in [0, M*L) based on its block and thread ID, and then those whose index is in [0,N*N) will proceed to split that index into an x and y coordinate and do work.

Accessing global memory in a kernel is costly, because of its latency. A global memory request (both reading and writing) takes hundreds of clock cycles to complete. You want to minimise the amount of times global memory is accessed, and access it in contiguous blocks.
If each piece of data is accessed exactly once, there's nothing to do about the latency, but that's seldom the case. And definitely not the case in your code, where the kernel array is accessed by all threads in the same pattern, and a lot of image is accessed by multiple threads as well.
The solution for that is to start the kernel by fetching the data from the high-latency global memory into the low-latency shared memory. Shared memory is a block of memory on the multiprocessor, and its latency is comparable to that of registers. So most simple kernels follow a structure like this:
Each thread fetches data from global memory to shared memory. You want to fetch data in contiguous sequences if possible, as global memory is accessed through transactions. If there's not enough data for all threads to fetch, leave some of them idle.
Threads operate on the data in shared memory.
Data is written from shared memory back to global memory in the same pattern as it was fetched in step 1.
Shared memory is shared by all threads within a thread block. Which leads us to the second big issue in your code: you're not using thread blocks at all. Threads in one block run on one multiprocessor, share shared memory, can be synchronised with each other etc. You need to organise threads into blocks well to get the most out of them.
The grid of blocks is just a mechanism to be able to run more blocks at one invocation. All the goodies of parallel instruction execution and shared memory access are within a block. The grid of blocks is just "yeah, sorry, my data's so big a single block won't do, just run many of them."
You're doing the exact opposite: your blocks have one thread each, which means that in each step, only one thread from each warp runs on the multiprocessor (based on your device's compute capability and the number of warp schedulers available, this means something like 2–4 threads on one multiprocessor at most).
You'll have to re-structure your threads to mirror the data access patterns, and prefetch data into shared memory. This will give you the performance boost you expect.
The above is just a short summary. Refer to the CUDA programming guide for details on block organisation, shared memory, and global memory transactions.

If you're using global memory in CUDA, all the data access will be synchronized in something like queue, and you'll receive almost linear solution, not parallel.
Also, transfering a large dataset from your RAM memory to GPU memory also takes a lot of time (the speed of bus is limited).
So, i think you have to somehow parallel your data across computation units in your GPU (part them into shared memory).
Check this to see solution of how to improve your GPU memory usage in the case that similar to yours.

Numerical error in cuda/cublas simple kernel using particular input

I am working with cuda and cublas and I was trying to implement simple operations like matrix element-wise multiplication/division. I am using only float for my experiments. I know the most obvious way to do it is to write a kernel like this one:
__global__ void mul_elementwise(const unsigned int n, float* source, float* dest, const float value)
{
const unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int stride = blockDim.x * gridDim.x;
for (unsigned int i = offset; i < n; i += stride)
{
dest[i] = source[i] * value;
}
}
This kernel can work both for multiplication and division (just using 1/x as value). But this can be achieved using cublas library too: suppose we have a matrix A m x n stored in column-major style and a scalar x, then setting alpha = x or alpha = 1/x and d_ones as a vector of m*n 1s, we can invoke and obtain the same result
cublasSaxpy(cublas_handle, m * n, &alpha, d_ones, 1, A_dev, 1);
Both methods work just fine, but I am facing few problems with some particular matrix, for which both methods do no work. I isolated this big matrix and build a MCVE available here (you can compile it with nvcc mcve.cu -lcublas. As you can see the results in both cases are totally wrong: host result is totally different, I am trying to figure out what's going on. I do not see any error in code but maybe i should try to use double instead of float and see what happens.
Any opinions about this situation? Thanks in advance!
EDIT #1 I tried using doubles but nothing changes if I use cublasDaxpy meanwhile it works perfectly with the custom kernel. I think the values are too small so single floating point precision is not enough.

Interesting MCVE. Wouldn't it have been possible to shrink your vector down to just a few elements? Isn't it possible to show the calculation discrepancy based on just 1 vector element?
Anyway I see several problems.
Your kernel implements the following function: y=alpha*x. But SAXPY implements y=alpha*x+y. Now, if y started out as (all) zero, then these two would be the same. But that's not what you have:
CUBLAS Your Kernel
---------------------------
alpha: alpha alpha
x: 1 ahost (ahost is your huge data array)
y: ahost -
So your kernel is computing y=alpha * ahost, but your CUBLAS call is computing y = alpha*1 + ahost. I wouldn't expect the same result from these, in general.
Your analysis of error seems flawed in a few ways. First, you are computing the absolute error in a float variable (a number which will always be positive, since it's the absolute value), but then you're comparing it against a negative number:
float diff = abs(host[i]-dev[i]);
...
if (diff > (-1e12))
won't that if test always be true? Perhaps you meant 1e-12 although that would still be flawed. Looking for a fixed error threshold on a floating point comparison should be scaled to the size of the numbers being compared. float quantities only contain about 6-7 accurate decimal digits. (And summing these errors is also troublesome.)
Here is a complete code that has the above issues fixed, and produces zero sum error for all the comparisons (host<->kernel and host<->cublas):
static float array[] = {0x00000000,
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0xB58DA1CF,0xB50D2FEC,0x34A48536,0xB4A1D5BC,0x358E1345,0x35943AAC,0xB5983F40,0xB43628BB,0xB4A95348,0xB4DB751C,0xB50C8D1A,0xB3EFCBB5,0x3552B8CD,0x3538A167,0x358FDE0D,0xB4D54CE9,0xB5D29BB7,0xB4A234EE,0x346EF2F4,0x35B5D9F2,0xB40F1487,0x3554BC20,0x33FD9466,0xB536D37D,0xB3C2E594,0xB59DA581,0x3584FC87,0x34438F09,0x35D293CB,0xB4FBB002,0xB59F41E9};
#include <iostream>
#include <stdio.h>
#include <cublas_v2.h>
#include <assert.h>
#define TOL 0.0001
typedef unsigned int u32;
#define GET_STRIDE() u32(blockDim.x * gridDim.x)
#define GET_OFFSET() u32(blockIdx.x * blockDim.x + threadIdx.x)
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
__global__ void div_elementwise(const u32 n, float* source, float* dest, const float value)
{
for (u32 i = GET_OFFSET(); i < n; i += GET_STRIDE())
{
dest[i] = source[i] * value;
}
}
float check_eq(float* dev, float* host, u32 len)
{
float sum = 0.0f;
for (u32 i = 0; i < len; ++i)
{
if (dev[i]!=host[i])
{
//printf("diff %d %f %f\n", i, dev[i], host[i]);
//break;
float diff = abs((host[i]-dev[i])/host[i]);
sum += diff;
if (diff > (TOL))
printf("diff %d %f\n", i, diff);
}
}
printf("%f\n", sum);
return sum;
}
void div_host(float* a, float v, u32 len)
{
for (u32 i = 0; i < len; ++i)
{
a[i]=a[i]*v;
}
}
int main()
{
u32 len = sizeof(array)/sizeof(float);
printf("array len = %d\n", len);
for (int i =0; i < len; i++) if (isnan(array[i])) {printf("nan value at %d\n",i); return -1;}
float* adev, *adevcublas, *d_zero;
float* ahost = (float*) malloc(len * sizeof(float));
checkCuda(cudaMalloc(&adev, len * sizeof(float)));
checkCuda(cudaMalloc(&adevcublas, len * sizeof(float)));
checkCuda(cudaMalloc(&d_zero, len * sizeof(float)));
memcpy(ahost, &array[0], len * sizeof(float));
checkCuda(cudaMemcpy(adev, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(adevcublas, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemset(d_zero, 0, len*sizeof(float)));
float alpha = 1/2494.f;
printf("%f\n", alpha);
div_host(ahost, alpha, len);
u32 tb = 256;
div_elementwise<<<((len + tb - 1) / tb),tb>>>(len, adev, adev, alpha);
float* r = (float*) malloc(len * sizeof(float));
checkCuda(cudaMemcpy(r, adev, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r,ahost,len);
cublasHandle_t ch;
cublasCreate(&ch);
float* r0 = (float*) malloc(len * sizeof(float));
cublasStatus_t stat = cublasSaxpy(ch, len, &alpha, adevcublas, 1, d_zero, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
checkCuda(cudaMemcpy(r0, d_zero, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r0,ahost,len);
free(r);
free(r0);
free(ahost);
cudaFree(adev);
return 0;
}

count3's in cuda is very slow

I have written a small program in CUDA that counts how many 3's are in a C array and prints them.
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cstdlib>
__global__ void incrementArrayOnDevice(int *a, int N, int *count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
//__shared__ int s_a[512]; // one for each thread
//s_a[threadIdx.x] = a[id];
if( id < N )
{
//if( s_a[threadIdx.x] == 3 )
if( a[id] == 3 )
{
atomicAdd(count, 1);
}
}
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
int N = 16777216;
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
// copy data from host to device
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
// do calculation on device
int blockSize = 512;
int nBlocks = N / blockSize + (N % blockSize == 0 ? 0 : 1);
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", count);
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
The result I get is:
real 0m3.025s
user 0m2.989s
sys 0m0.029s
When I run it on the CPU with 4 threads I get:
real 0m0.101s
user 0m0.100s
sys 0m0.024s
Note that the GPU is an old one - I don't know the exact model because I do not have root access to it, but the OpenGL version it runs is 1.2 using the MESA driver.
Am I doing something wrong? What can I do to make it run faster?
Note: I have tried using buckets for each block (so the atomicAdd()s would be reduced for each one) but I get exactly the same performance.
I have also tried copying the 512 integers that are assigned to this block to a shared block of memory (you can see it in the comments) and the time is the same again.

This is in response to your question "What can I do to make it run faster?" As I mentioned in the comments, there are issues (probably) with the timing methodology, and the main suggestion I have for speed improvement is to use a "classical parallel reduction" algorithm. The following code implements a better (in my opinion) timing measurement, and also converts your kernel to a reduction style kernel:
#include <stdio.h>
#include <assert.h>
#include <cstdlib>
#define N (1<<24)
#define nTPB 512
#define NBLOCKS 32
__global__ void incrementArrayOnDevice(int *a, int n, int *count)
{
__shared__ int lcnt[nTPB];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int lcount = 0;
while (id < n) {
if (a[id] == 3) lcount++;
id += gridDim.x * blockDim.x;
}
lcnt[threadIdx.x] = lcount;
__syncthreads();
int stride = blockDim.x;
while(stride > 1) {
// assume blockDim.x is a power of 2
stride >>= 1;
if (threadIdx.x < stride) lcnt[threadIdx.x] += lcnt[threadIdx.x + stride];
__syncthreads();
}
if (threadIdx.x == 0) atomicAdd(count, lcnt[0]);
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
cudaEvent_t gstart1,gstart2,gstop1,gstop2,cstart,cstop;
float etg1, etg2, etc;
cudaEventCreate(&gstart1);
cudaEventCreate(&gstart2);
cudaEventCreate(&gstop1);
cudaEventCreate(&gstop2);
cudaEventCreate(&cstart);
cudaEventCreate(&cstop);
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
int blockSize = nTPB;
int nBlocks = NBLOCKS;
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
// copy data from host to device
cudaEventRecord(gstart1);
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMemset(devCount, 0, sizeof(int));
cudaEventRecord(gstart2);
// do calculation on device
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
cudaEventRecord(gstop2);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(gstop1);
printf("GPU count = %d\n", count);
int hostCount = 0;
cudaEventRecord(cstart);
for (int i=0; i < N; i++)
if (a_h[i] == 3) hostCount++;
cudaEventRecord(cstop);
printf("CPU count = %d\n", hostCount);
cudaEventSynchronize(cstop);
cudaEventElapsedTime(&etg1, gstart1, gstop1);
cudaEventElapsedTime(&etg2, gstart2, gstop2);
cudaEventElapsedTime(&etc, cstart, cstop);
printf("GPU total time = %fs\n", (etg1/(float)1000) );
printf("GPU compute time = %fs\n", (etg2/(float)1000));
printf("CPU time = %fs\n", (etc/(float)1000));
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
When I run this on a reasonably fast GPU (a Quadro 5000, a little slower than a Tesla M2050) I get the following:
number of blocks: 32
GPU count = 5592406
CPU count = 5592406
GPU total time = 0.025714s
GPU compute time = 0.000793s
CPU time = 0.017332s
We see that the GPU is substantially faster than this (naive, single-threaded) CPU implementation for the compute portion. When we add in the cost to transfer the data, the GPU version is slower but is not 30x slower.
By way of comparison, when I timed your original algorithm, I got numbers like this:
GPU total time = 0.118131s
GPU compute time = 0.093213s
My system config for this was Xeon X5560 CPU, RHEL 5.5, CUDA 5.0, Quadro5000 GPU.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js