Cuda matrix multiplication gives wrong answer

Cuda matrix multiplication gives wrong answer - c++

Update!
My current code doesn't check for out of bounds memory access. When I run the cuda memcheck, it says memory access is bad even for matrices of just 2 by 2! I'm accessing memory where I shouldn't somehow and that's the problem!
To check for out of bounds memory access, run cuda-memcheck ./(insert executable here)
Shown below is my code for the matrix multiplication itself:
// One 32x32 thread block per 32x32 tile of the output. The grid size is
// rounded up with a ceil-division, so whenever n is not a multiple of 32 the
// launch contains threads whose coordinates lie past the matrix edge.
dim3 block(32,32);
dim3 grid( (n+31)/32, (n+31)/32 );
matrixMul<<<grid,block>>>(d_C, d_A, d_B, n, k);
kA and kB are matrices with values in them (they're all 2's to make it easier).
m, n, k are all the same number for my square matrices
kC is the matrix to store the answer.
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
// Naive matrix product: every thread accumulates one element of kC from one
// row of kA and one column of kB.
//
// The thread coordinates assume 32x32 thread blocks. No bounds check is made
// on them, so every launched thread reads and writes memory.
//
//   kC  output matrix, written with row stride n
//   kA  left operand, read with row stride n
//   kB  right operand, read with row stride k
//   n   number of terms summed per output element
//   k   row stride of kB
__global__ void matrixMul(float *kC, float *kA, float *kB, int n, int k)
{
    const int col = blockIdx.x * 32 + threadIdx.x;
    const int row = blockIdx.y * 32 + threadIdx.y;

    float acc = 0;
    int step = 0;
    while (step < n)
    {
        const float a = kA[row * n + step];
        const float b = kB[step * k + col];
        acc += a * b;
        ++step;
    }

    kC[row * n + col] = acc;
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_

Based on how you are defining the grid of threads, you should add a thread check to the kernel code like this:
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
// Bounds-checked naive matrix product: each in-range thread computes one
// element of kC from a row of kA and a column of kB.
//
// Launch layout: a 2D grid of 2D blocks covering at least n x n threads.
// Threads whose coordinates fall outside the n x n output do nothing.
//
//   kC  output matrix, written with row stride n
//   kA  left operand, read with row stride n
//   kB  right operand, read with row stride k
//   n   matrix dimension (rows and columns of the output)
//   k   row stride of kB
//
// NOTE(review): kB is strided by k while kC is strided by n, so the indexing
// is only consistent when n == k (the question states the matrices are
// square) - confirm before using with rectangular operands.
__global__ void matrixMul(float *kC, float *kA, float *kB, int n, int k)
{
    // Derive the global coordinates from the real block shape instead of a
    // literal 32, so the kernel stays correct if the launch uses another
    // block size.
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
    if ((ty < n) && (tx < n)) { // add this line
        float value = 0;
        for (int i=0;i<n;i++)
        {
            float elementA=kA[ty*n+i];
            float elementB=kB[i*k+tx];
            value += elementA*elementB;
        }
        kC[ty*n+tx] = value;
    } // add this line
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_
Otherwise threads outside the valid array bounds will corrupt your results. Things work for multiples of 32x32 because there are no invalid threads. In that case you're launching exactly the required number of threads. But in other cases you are launching extra threads. These extra threads, if allowed to compute an invalid matrix position, will corrupt the results.

Related

CUDA Vector Reduction to handle vectors of length less than 512?

I'm working on parallel vector_reduction algorithm tutorial from NVIDIA to implement the algorithm using CUDA C++ API. I have implemented the algorithm but it only works for vector lengths that are fixed to 512. I am not able to figure out how to get it working for vectors less than 512? I want it to work for arbitrary sizes, i.e, 324, 123, 23.
#include <stdio.h>
#define NUM_ELEMENTS 512
// Tree reduction in shared memory. Each block loads its elements into
// partialSum, repeatedly folds the upper half of the active range onto the
// lower half, and thread 0 stores the block's result in g_data[blockIdx.x].
//
// The first stride is blockDim.x / 2 and is halved every step.
// Shared scratch: NUM_ELEMENTS floats, indexed by threadIdx.x.
//
//   g_data  input values; slot blockIdx.x is overwritten with the result
//   n       number of valid elements in g_data
__global__ void reduction(float *g_data, int n)
{
    __shared__ float partialSum[NUM_ELEMENTS];

    const int tid = threadIdx.x;
    const int gid = tid + blockIdx.x * blockDim.x;

    // Only threads that map onto real data fill their slot.
    if (gid < n)
        partialSum[tid] = g_data[gid];

    int stride = blockDim.x / 2;
    while (stride > 0) {
        // Barrier first: the previous step's writes must be visible.
        __syncthreads();
        if (tid < stride)
            partialSum[tid] += partialSum[tid + stride];
        stride >>= 1;
    }

    if (tid == 0)
        g_data[blockIdx.x] = partialSum[0];
}
// Sums num_elements floats from h_data on the GPU and returns the total.
//
// Copies the input to the device, launches `reduction` as a single block
// with one thread per element, and copies element 0 back.
//
// Returns 0.0f for empty input. If any CUDA call fails, the error is reported
// on stderr and 0.0f is returned instead of an indeterminate value.
//
// Note: this calls cudaDeviceReset() before returning, which tears down the
// whole CUDA context of the process.
float computeOnDevice(float* h_data, int num_elements)
{
    float result = 0.0f;
    if (h_data == NULL || num_elements <= 0)
        return result;  // nothing to sum; a 0-thread launch would be invalid

    float* d_data = NULL;
    const size_t bytes = sizeof(float) * num_elements;

    // Memory allocation on device side
    cudaError_t err = cudaMalloc((void**)&d_data, bytes);

    // Copy from host memory to device memory
    if (err == cudaSuccess)
        err = cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // Number of threads in each thread block
        dim3 blockSize(num_elements, 1, 1);
        // Number of thread blocks in grid
        dim3 gridSize(1, 1, 1);
        // Invoke the kernel
        reduction<<<gridSize, blockSize>>>(d_data, num_elements);
        // A bad launch configuration (e.g. too many threads per block) is
        // only reported here, not by the launch statement itself.
        err = cudaGetLastError();
    }

    // Copy from device memory back to host memory
    if (err == cudaSuccess)
        err = cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        fprintf(stderr, "computeOnDevice: %s\n", cudaGetErrorString(err));
        result = 0.0f;
    }

    cudaFree(d_data);  // freeing a null pointer is a no-op
    cudaDeviceReset();
    return result;
}
// Driver: sums NUM_ELEMENTS ones on the GPU and prints the result.
int main() {
    float *data = new float[NUM_ELEMENTS];
    for (int i = 0; i < NUM_ELEMENTS; i++) data[i] = 1;
    float r = computeOnDevice(data, NUM_ELEMENTS);
    printf(" result = %f\n" , r);
    delete[] data;  // the host buffer was previously leaked
    return 0;
}

Your code is correct as long as the vector length is a power of 2. The problem is that halving the stride with bit shifts never reaches the last part of an array whose length is not a power of 2. You can easily fix this by artificially extending the array to the next power of 2. This way your entire array will be reduced and the extra "elements" (they don't actually exist) are just ignored.
#include <math.h>
// Sum reduction in shared memory that tolerates lengths that are not a power
// of 2.
//
// Expected launch: blockDim.x <= NUM_ELEMENTS (the size of the shared scratch
// array), typically a single block with at least n threads. Each block
// reduces the elements covered by its own threads and thread 0 writes that
// sum to g_data[blockIdx.x].
//
//   g_data  input values; slot blockIdx.x is overwritten with the block sum
//   n       number of valid elements in g_data
__global__ void reduction(float *g_data, int n){
    __shared__ float partialSum[NUM_ELEMENTS];
    int tx = threadIdx.x;
    int i = tx + blockIdx.x * blockDim.x;
    // Threads past the end of the data contribute 0, so no shared slot is
    // ever read uninitialised.
    partialSum[tx] = (i < n) ? g_data[i] : 0.0f;
    // Smallest power of two >= blockDim.x, computed with integers only.
    // Going through log2f/powf is inexact on the device and the cast to int
    // can truncate to one less than the intended power of two.
    int size = 1;
    while (size < (int)blockDim.x) size <<= 1;
    for (int stride = size / 2; stride > 0; stride >>= 1){
        __syncthreads();
        // A partner index inside the imaginary padding beyond the block does
        // not exist; skipping it is equivalent to adding 0.
        if (tx < stride && (tx + stride) < (int)blockDim.x)
            partialSum[tx] += partialSum[tx + stride];
    }
    if (tx == 0){
        g_data[blockIdx.x] = partialSum[tx];
    }
}
Edit
Regarding your comment, this method of reduction will never work for an array that is being reduced in multiple blocks. So, for compute capability 1.0-1.3, the largest array you can reduce is 512 elements, for compute capability >1.3 you can do up to 1024 elements, this is the maximum number of threads per block.
This is because __shared__ memory is shared among threads not blocks. So, to reduce an array scattered over multiple blocks you'd need to partition the array such that each block reduces a chunk and then leverage __global__ memory to reduce the values from all blocks. However, __global__ memory is approximately 10-20 times slower than the (on-chip) __shared__ memory, so once you start using a lot of blocks, this will become very inefficient.
The alternative would be to have each thread process multiple indices, however, eventually your partialSum array won't fit into shared memory anymore and overflow into global memory anyway. This approach would also mean you can never use more than 512 (or 1024) threads, which defeats the purpose of using CUDA which depends on running a very large number of threads to hide latency and make the expensive memory transfer from host to device worth it.

Optimizing execution of a CUDA kernel for Triangular Matrix calculation

I am developing my first Cuda application, and I have a kernel with "below-expected throughput", which seems to be the biggest bottleneck at the moment.
The task of the kernel is to compute an N by N sized matrix (DD) containing squared distances between all elements on a data matrix. The data matrix (Y) is size N by D (to support multi dimensional data) and stored as row-major.
Source:
// Fills the N x N matrix DD with squared Euclidean distances between all
// pairs of rows of Y.
//
// Works with any 1D launch: each thread starts at its global index and
// advances by the total number of launched threads until all N*N output
// cells are covered.
//
//   Y   N x D input, row-major (point p occupies Y[D*p .. D*p + D-1])
//   DD  N x N output, row-major
//   N   number of points
//   D   number of coordinates per point
__global__ void computeSquaredEuclideanDistance(const float * __restrict__ Y, float * __restrict__ DD, const int N, const int D) {
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int hop = blockDim.x * gridDim.x;
    const int cells = N * N;

    for (int cell = first; cell < cells; cell += hop) {
        const int row = cell / N;
        const int col = cell % N;

        float acc = 0;
        for (int d = 0; d < D; ++d) {
            const float delta = Y[d + D * col] - Y[d + D * row];
            acc += delta * delta;
        }

        DD[col + N * row] = acc;
    }
}
This is being called with size_t blockSize = 256 and size_t numBlocks = (N*N + blockSize - 1)/blockSize.
How can I optimize this kernel? My initial thought is that the time-consuming part is reading data without exploiting some sort of shared memory, but can anyone give me pointers on how to approach this?
Remarks from the nvvp profiling tool:
Latency analysis:
Compute utilization at around 40%
Memory (L2 cache) utilization at around 35%
Occupancy is not an issue
Active Warps at 57.59 of a theoretical 64
Occupancy at 90% of a theoretical 100
For my application, typical values are:
5k < N < 30k
D is either 2 or 3

I typically disregard these types of optimization questions because they are on the verge of off-topic, in my opinion. Worse still, you provide no MCVE, so anyone trying to answer would have to write all their own support code to compile and benchmark your kernel. And this sort of work does require benchmarking and code analysis. But because your problem is basically a linear algebra problem (and I like linear algebra), I answered it rather than close-voting it as too broad.
With that off my chest, there are a couple of things which immediately jump out in the code which could be improved and which would probably have a material effect on the run time.
The first is that the trip count of the inner loop is known a priori. Anytime you have a situation like that, let the compiler know. Loop unrolling and code reordering is a very powerful compiler optimization and the NVIDIA compiler is extremely good at it. If you move D into a template parameter, you can do something like this:
// Squared Euclidean distance between two D-component points.
// D is a template parameter so the trip count is known at compile time and
// the loop can be fully unrolled.
//
//   x, y  pointers to D consecutive floats each
//   returns the sum over d of (x[d] - y[d])^2
template<int D>
__device__ float esum(const float *x, const float *y)
{
    float acc = 0.f;
#pragma unroll
    for (int d = 0; d < D; ++d) {
        const float delta = x[d] - y[d];
        acc += delta * delta;
    }
    return acc;
}
// Full N x N squared-distance matrix with the point dimension D fixed at
// compile time.
//
// Works with any 1D launch: each thread starts at its global index and
// advances by the total number of launched threads.
//
//   Y   N x D input, row-major
//   DD  N x N output, row-major
//   N   number of points
template<int D>
__global__
void vdistance0(const float * __restrict__ Y, float * __restrict__ DD, const int N)
{
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int hop = blockDim.x * gridDim.x;
    const int cells = N * N;

    for (int cell = first; cell < cells; cell += hop) {
        const int row = cell / N;
        const int col = cell % N;
        DD[col + N * row] = esum<D>(Y + D * col, Y + D * row);
    }
}
template __global__ void vdistance0<2>(const float *, float *, const int);
template __global__ void vdistance0<3>(const float *, float *, const int);
The compiler will inline esum and unroll the inner loop and it can then use its reordering heuristics to better interleave loads and flops to improve throughput. The resulting code has a lower register footprint too. When I run this for N=10000 and D=2, I get about 35% speed up (7.1ms versus 4.5ms on a GTX 970 with CUDA 9.1).
But there is an even more glaringly obvious optimization than this. The calculation you are performing will produce a symmetric output matrix. You only need to do (N*N)/2 operations to compute the full matrix, rather than the N*N you are doing in your code [technically N(N-1)/2 because the diagonal entries are zero, but let's forget the diagonal for the purposes of this discussion].
So taking a different approach and using one block to calculate each row of the upper triangular output matrix, then you can do something like this:
// Addressing helper for an m x m upper-triangular matrix stored packed,
// row by row: row i holds its m - i entries (columns i .. m-1) contiguously,
// immediately after row i-1.
struct udiag
{
    float *p;  // start of the packed storage
    int m;     // matrix dimension
    __device__ __host__ udiag(float *_p, int _m) : p(_p), m(_m) {}

    // Returns a pointer to the first stored entry (the diagonal element) of
    // row i, for 0 <= i <= m.
    //
    // Rows 0 .. i-1 hold m, m-1, ..., m-i+1 entries, i.e. i*(2m - i + 1)/2
    // in total. The plain triangle number i*(i+1)/2 is the offset for
    // *lower*-triangular packing and makes upper-triangular rows overlap.
    __device__ __host__ float* get_row(int i) {
        const size_t r = (size_t)i;
        const size_t n = (size_t)m;
        return p + (r * (2 * n - r + 1)) / 2;
    }
};
// Squared distances for the upper triangle only, written to packed storage.
//
// Launch layout: blocks walk over rows (block b handles rows b, b+gridDim.x,
// ...) and the threads of a block walk over that row's N - row entries.
// Entry c of row `row` is the squared distance between points `row` and
// `row + c`; it is stored at the address udiag::get_row(row) + c.
//
//   Y   N x D input, row-major
//   DD  packed output buffer addressed through udiag
//   N   number of points
template<int D>
__global__
void vdistance2(const float * __restrict__ Y, float * __restrict__ DD, const int N)
{
    udiag tri(DD, N);

    for (int row = blockIdx.x; row < N; row += gridDim.x) {
        float* out = tri.get_row(row);
        const float* base = Y + D * row;
        const int width = N - row;

        for (int c = threadIdx.x; c < width; c += blockDim.x) {
            out[c] = esum<D>(base, base + D * c);
        }
    }
}
template __global__ void vdistance2<2>(const float *, float *, const int);
template __global__ void vdistance2<3>(const float *, float *, const int);
This uses a little helper class to encapsulate the triangle numbers needed for the addressing scheme for the upper triangular output matrix. Doing this saves an enormous amount of memory and memory bandwidth as well as reducing the total FLOP count for the calculation. If you need to do other things afterwards BLAS (and CUBLAS) supports computations on upper or lower triangular matrices. Use them. When I run this I get about 75% speedup (7.1ms versus 1.6ms on the same GTX 970).
Huge disclaimer: All the code you see here was written during a 45 minute lunch break and has been very lightly tested. I make absolutely no claims that anything in this answer is actually correct. I have confirmed that it compiles and doesn't produce a runtime error when I run it to get profiling data. That is it. Caveat emptor and all that.

Why is my CUDA implementation equally fast as my CPU implementation

I created some code to do a 2D convolution on a 1300x1300 grayscale image and a 15x15 kernel, in standard C++ and in CUDA. Both versions:
CPU:
#include <iostream>
#include <exception>
#define N 1300
#define K 15
#define K2 ((K - 1) / 2)
// Flattens a 2D coordinate into a linear offset for an mx-by-my array stored
// with y as the fastest-varying index. mx is not used in the computation.
template<int mx, int my>
inline int index(int x, int y)
{
    const int rowStart = x * my;
    return rowStart + y;
}
// CPU reference: convolves an N x N image with a K x K kernel, treating
// samples outside the image as absent (they are skipped).
int main() {
    // Value-initialise the inputs: the benchmark never fills them, and
    // reading indeterminate doubles is undefined behaviour.
    double *image = new double[N * N]();
    double *kernel = new double[K * K]();
    double *result = new double[N * N];
    for (int x=0; x<N; ++x)
        for (int y=0; y<N; ++y)
        {
            double r = 0;
            for(int i=0; i<K; ++i)
                for(int j=0; j<K; ++j)
                {
                    // Skip kernel taps that fall outside the image.
                    if (x + i - K2 >= 0 and
                        x + i - K2 < N and
                        y + j - K2 >= 0 and
                        y + j - K2 < N)
                    {
                        r += kernel[index<K,K>(i,j)] * image[index<N,N>(x+i-K2, y+j-K2)];
                    }
                }
            result[index<N,N>(x, y)] = r;
        }
    delete[] image;
    delete[] kernel;
    delete[] result;
}
GPU:
#include <iostream>
#include <exception>
// ignore, just for error handling
// Carries the source location of a CUDA call so that a failure can be
// reported with the file and line where the call was made.
struct ErrorHandler {
    int d_line;          // line number of the call site
    char const *d_file;  // file name of the call site

    ErrorHandler(int line, char const *file)
        : d_line(line),
          d_file(file)
    {
    }
};
#define EH ErrorHandler(__LINE__, __FILE__)
// Checks the result of a CUDA runtime call streamed into an ErrorHandler.
// On success the handler is returned unchanged; on failure the error string
// and the recorded file/line are written to std::cerr and std::exception is
// thrown.
ErrorHandler operator<<(ErrorHandler eh, cudaError_t err)
{
    if (err == cudaSuccess)
        return eh;

    std::cerr << cudaGetErrorString( err ) << " in " << eh.d_file << " at line " << eh.d_line << '\n';
    throw std::exception();
}
// end.
#define N 1300
#define K 15
#define K2 ((K - 1) / 2)
// Device-side flattening of a 2D coordinate into a linear offset for an
// mx-by-my array with y as the fastest-varying index. mx is not used.
template<int mx, int my>
__device__ inline int index(int x, int y)
{
    const int rowStart = x * my;
    return rowStart + y;
}
// 2D convolution of an N x N image with a K x K kernel; every thread
// produces one output pixel. Kernel taps that fall outside the image are
// skipped.
//
// The pixel coordinates come from blockIdx.x / blockIdx.y, i.e. the kernel
// expects one block per pixel on a 2D grid (see the inline note for the
// one-block-per-row variant).
//
//   image   N x N input
//   kernel  K x K weights
//   result  N x N output
__global__ void kernelkernel(double *image, double *kernel, double *result)
{
    const int x = blockIdx.x;
    const int y = blockIdx.y; // becomes: int y = threadIdx.x;

    double acc = 0;
    for (int i = 0; i < K; ++i)
    {
        const int srcX = x + i - K2;
        if (srcX < 0 || srcX >= N)
            continue;  // whole kernel row lies outside the image

        for (int j = 0; j < K; ++j)
        {
            const int srcY = y + j - K2;
            if (srcY < 0 || srcY >= N)
                continue;

            acc += kernel[index<K,K>(i,j)] * image[index<N,N>(srcX, srcY)];
        }
    }

    result[index<N,N>(x, y)] = acc;
}
// GPU driver: uploads an N x N image and a K x K kernel, runs the
// convolution kernel and downloads the result. Every CUDA call is checked
// through EH, which throws on failure.
int main() {
    // Value-initialise the inputs: the benchmark never fills them, and
    // copying indeterminate doubles to the device reads uninitialised memory.
    double *image = new double[N * N]();
    double *kernel = new double[K * K]();
    double *result = new double[N * N];
    double *image_cuda;
    double *kernel_cuda;
    double *result_cuda;
    EH << cudaMalloc((void **) &image_cuda, N*N*sizeof(double));
    EH << cudaMalloc((void **) &kernel_cuda, K*K*sizeof(double));
    EH << cudaMalloc((void **) &result_cuda, N*N*sizeof(double));
    EH << cudaMemcpy(image_cuda, image, N*N*sizeof(double), cudaMemcpyHostToDevice);
    EH << cudaMemcpy(kernel_cuda, kernel, K*K*sizeof(double), cudaMemcpyHostToDevice);
    dim3 grid ( N, N );
    kernelkernel<<<grid, 1>>>(image_cuda, kernel_cuda, result_cuda);
    // replace previous 2 statements with:
    // kernelkernel<<<N, N>>>(image_cuda, kernel_cuda, result_cuda);
    // A launch statement reports nothing by itself; an invalid configuration
    // only shows up through cudaGetLastError().
    EH << cudaGetLastError();
    EH << cudaMemcpy(result, result_cuda, N*N*sizeof(double), cudaMemcpyDeviceToHost);
    EH << cudaFree( image_cuda );
    EH << cudaFree( kernel_cuda );
    EH << cudaFree( result_cuda );
    delete[] image;
    delete[] kernel;
    delete[] result;
}
I would expect the cuda code to be a lot faster, however:
$ nvprof ./gpuversion
==17806== NVPROF is profiling process 17806, command: ./gpuversion
==17806== Profiling application: ./gpuversion
==17806== Profiling result:
Time(%) Time Calls Avg Min Max Name
99.89% 3.83149s 1 3.83149s 3.83149s 3.83149s kernelkernel(double*, double*, double*)
0.07% 2.6420ms 1 2.6420ms 2.6420ms 2.6420ms [CUDA memcpy DtoH]
0.04% 1.5111ms 2 755.54us 736ns 1.5103ms [CUDA memcpy HtoD]
And:
$ time ./cpuversion
real 0m3.382s
user 0m3.371s
sys 0m0.012s
Their difference is statistically insignificant. The CUDA-kernel takes approximately 3-4 seconds, why isn't it a lot faster? Is my code run in parallel?
PS: I'm new to CUDA, so I could be missing something trivial.
SOLUTION
What I found out, is that CUDA does not let you access memory willy-nilly from blocks. I guess the general strategy of CUDA programming is:
allocate and copy memory from RAM to the GPU using cudaMalloc and cudaMemcpy
divide the workload among blocks and threads in such a way that the memory accessed by different blocks doesn't overlap much.
If there is overlap between the memory used by blocks, start each block by copying the memory inside a shared array. Notice that:
the size of this array must be known compile time
its size is limited
this memory is shared by each thread in ONE block, so __shared__ double foo[10] allocates 10 doubles for each BLOCK.
copy the memory needed by one block to the shared variables inside the kernel. Of course, you use the different threads to do this 'efficiently'
sync the threads, such that all data is there before it is used.
process the data, and write the result to the output array of the kernel
synch again, I'm not sure why, but everyone on the internet is doing it :S
copy the GPU memory back to RAM
clean up the GPU memory.
This gives the following code. It is mex-code, for Matlab for the structural similarity, which also works via a sliding kernel, but over 2 images and with a different aggregate than the dot-product.
// author: Herbert Kruitbosch, CC: be nice, include my name in documentation/papers/publications when used
#include <matrix.h>
#include <mex.h>
#include <cmath>
#include <iostream>
#include <fstream>
#include <iostream>
#include <stdio.h>
// Reports a failed CUDA runtime call and terminates the process.
// Does nothing when err is cudaSuccess; otherwise prints the error string
// with the given file and line and calls exit(EXIT_FAILURE).
static void HandleError(
    cudaError_t err,
    const char *file,
    int line )
{
    if (err == cudaSuccess)
        return;

    printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
    exit( EXIT_FAILURE );
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define TILE_WIDTH 31
// Similarity ratio (c + 2*v0*v1) / (c + v1^2 + v0^2) of two values, with c
// added to both numerator and denominator.
__device__ inline double sim(double v0, double v1, double c)
{
    const double numerator = c + 2*v0*v1;
    const double denominator = c + v1*v1 + v0*v0;
    return numerator / denominator;
}
// Linear offset of element (row, col) in a matrix with `rows` rows whose
// columns are stored one after another. `cols` is not used.
__device__ inline int index(int rows, int cols, int row, int col)
{
    const int columnStart = col * rows;
    return row + columnStart;
}
// Computes one SSIM value per pixel for one image channel, using a k x k
// window weighted by `kernel`.
//
// Each block stages a TILE_WIDTH x TILE_WIDTH patch of both images in shared
// memory, then every thread evaluates the window whose top-left tile element
// is (threadIdx.y, threadIdx.x).
//
//   test, reference      input images, addressed as col * rows + row
//   kernel               k x k window weights, addressed as i * k + j
//   ssim                 output, addressed as col * rows + row
//   k                    window side length
//   rows, cols           image dimensions
//   tile_batches_needed  number of passes the block makes to fill the tile
//
// NOTE(review): the flat thread index below uses block_width as the row
// pitch, so the kernel presumably expects blockDim == (block_width,
// block_width) - confirm against the launch site.
__global__ void ssimkernel(double *test, double *reference, const double * __restrict__ kernel, double *ssim, int k, int rows, int cols, int tile_batches_needed)
{
int radius = k / 2;
// Number of output pixels per tile side: the tile minus the window overhang.
int block_width = TILE_WIDTH - k + 1;
__shared__ double tile_test [TILE_WIDTH][TILE_WIDTH];
__shared__ double tile_reference[TILE_WIDTH][TILE_WIDTH];
// Cooperative tile load: in pass `offset` each thread fills the tile element
// with flat index `dest`.
for(int offset=0; offset < tile_batches_needed; ++offset)
{
int dest = block_width*block_width*offset + threadIdx.y * block_width + threadIdx.x;
int destRow = dest / TILE_WIDTH;
int destCol = dest % TILE_WIDTH;
// Image coordinates of that tile element; the tile starts `radius` pixels
// before the block's first output pixel.
int srcRow = blockIdx.y * block_width + destRow - radius;
int srcCol = blockIdx.x * block_width + destCol - radius;
int src = srcCol * rows + srcRow;
// The last pass can run past the end of the tile; those threads load nothing.
if (destRow < TILE_WIDTH)
{
if (srcRow >= 0 and srcRow < rows and
srcCol >= 0 and srcCol < cols)
{
tile_test [destRow][destCol] = test [src];
tile_reference[destRow][destCol] = reference[src];
}
else
{
// Outside the image: pad with zeros.
tile_test [destRow][destCol] = 0;
tile_reference[destRow][destCol] = 0;
}
}
}
// The whole tile must be written before any thread reads from it.
__syncthreads();
// First pass over the window: weighted means of both images.
double mean_test = 0;
double mean_reference = 0;
for(int i=0; i<k; ++i)
for(int j=0; j<k; ++j)
{
double w = kernel[i * k + j];
mean_test += w * tile_test [threadIdx.y+i][threadIdx.x+j];
mean_reference += w * tile_reference[threadIdx.y+i][threadIdx.x+j];
}
// Second pass: weighted variances and the weighted cross term.
double var_test = 0;
double var_reference = 0;
double correlation = 0;
for(int i=0; i<k; ++i)
for(int j=0; j<k; ++j)
{
double w = kernel[i * k + j];
double a = (tile_test [threadIdx.y+i][threadIdx.x+j] - mean_test );
double b = (tile_reference[threadIdx.y+i][threadIdx.x+j] - mean_reference);
var_test += w * a * a;
var_reference += w * b * b;
correlation += w * a * b;
}
// Output pixel handled by this thread; threads past the image edge write nothing.
int destRow = blockIdx.y * block_width + threadIdx.y;
int destCol = blockIdx.x * block_width + threadIdx.x;
if (destRow < rows and destCol < cols)
ssim[destCol * rows + destRow] = sim(mean_test, mean_reference, 0.01) * (0.03 + 2*correlation) / (0.03 + var_test + var_reference);
// No shared-memory access follows this barrier.
__syncthreads();
}
// Host-side similarity ratio (c + 2*v0*v1) / (c + v1^2 + v0^2) for any
// arithmetic type T.
template<typename T>
inline T sim(T v0, T v1, T c)
{
    const T numerator = c + 2*v0*v1;
    const T denominator = c + v1*v1 + v0*v0;
    return numerator / denominator;
}
// Integer division of a by b rounded up, computed as (a + b - 1) / b.
inline int upperdiv(int a, int b) {
    const int padded = a + b - 1;
    return padded / b;
}
// MEX entry point: ssim = f(test, reference, kernel).
//
//   argin[0]  test image, rows x cols [x channels], double
//   argin[1]  reference image, same size as argin[0], double
//   argin[2]  k x k window weights, double
//   argout[0] rows x cols x channels array of per-pixel SSIM values
//
// Each channel is uploaded, processed by ssimkernel and downloaded in turn.
// Invalid arguments are reported through mexErrMsgTxt before any device
// memory is allocated.
void mexFunction(int nargout, mxArray *argout[], int nargin, const mxArray *argin[])
{
    if (nargin < 3)
        mexErrMsgTxt("Three inputs required: test image, reference image, kernel.");

    mwSize rows = mxGetDimensions(argin[0])[0];
    mwSize cols = mxGetDimensions(argin[0])[1];
    mwSize k = mxGetDimensions(argin[2])[0];
    mwSize channels = mxGetNumberOfDimensions(argin[0]) <= 2 ? 1 : mxGetDimensions(argin[0])[2];

    // The tile has TILE_WIDTH elements per side, so a larger window would
    // make block_width zero or negative.
    if (k < 1 || k > TILE_WIDTH)
        mexErrMsgTxt("Kernel size must be between 1 and TILE_WIDTH.");

    // mxCreateNumericArray takes an array of mwSize, not int.
    mwSize dims[] = {rows, cols, channels};
    argout[0] = mxCreateNumericArray(3, dims, mxDOUBLE_CLASS, mxREAL);

    // Nothing to compute for an empty image; a zero-sized grid is an invalid
    // launch configuration.
    if (rows == 0 || cols == 0)
        return;

    double *test = (double *)mxGetData(argin[0]);
    double *reference = (double *)mxGetData(argin[1]);
    double *gaussian = (double *)mxGetData(argin[2]);
    double *ssim = (double *)mxGetData(argout[0]);
    double *test_cuda;
    double *reference_cuda;
    double *gaussian_cuda;
    double *ssim_cuda;
    HANDLE_ERROR( cudaMalloc((void **) &test_cuda, rows*cols*sizeof(double)) );
    HANDLE_ERROR( cudaMalloc((void **) &reference_cuda, rows*cols*sizeof(double)) );
    HANDLE_ERROR( cudaMalloc((void **) &gaussian_cuda, k*k*sizeof(double)) );
    HANDLE_ERROR( cudaMalloc((void **) &ssim_cuda, rows*cols*sizeof(double)) );
    HANDLE_ERROR( cudaMemcpy(gaussian_cuda, gaussian, k*k*sizeof(double), cudaMemcpyHostToDevice) );

    const int kernel_width = static_cast<int>(k);
    const int image_rows = static_cast<int>(rows);
    const int image_cols = static_cast<int>(cols);
    int block_width = TILE_WIDTH - kernel_width + 1;
    // Passes each block needs to fill its tile with block_width^2 threads.
    int tile_batches_needed = upperdiv(TILE_WIDTH*TILE_WIDTH, block_width*block_width);

    for(mwSize c=0; c<channels; ++c)
    {
        HANDLE_ERROR( cudaMemcpy(test_cuda, test + rows*cols*c, rows*cols*sizeof(double), cudaMemcpyHostToDevice) );
        HANDLE_ERROR( cudaMemcpy(reference_cuda, reference + rows*cols*c, rows*cols*sizeof(double), cudaMemcpyHostToDevice) );
        dim3 dimGrid(upperdiv(image_cols, block_width), upperdiv(image_rows, block_width), 1);
        dim3 dimBlock(block_width, block_width, 1);
        ssimkernel<<<dimGrid, dimBlock>>>(test_cuda, reference_cuda, gaussian_cuda, ssim_cuda, kernel_width, image_rows, image_cols, tile_batches_needed);
        // A failed launch is only visible through cudaGetLastError().
        HANDLE_ERROR( cudaGetLastError() );
        HANDLE_ERROR( cudaMemcpy(ssim + rows*cols*c, ssim_cuda, rows*cols*sizeof(double), cudaMemcpyDeviceToHost) );
    }
    cudaFree( test_cuda );
    cudaFree( reference_cuda );
    cudaFree( gaussian_cuda );
    cudaFree( ssim_cuda );
}

kernelkernel<<<grid, 1>>>
This is a significant issue; threads on nVidia GPUs work in warps of 32 threads. However, you've only assigned a single thread to each block, which means 31 of those threads will sit idle while a single thread does work. And usually, for kernels where you have the flexibility, you'll usually want several warps per block rather than just one.
You could get an immediate speedup by using N blocks and N threads per block, rather than using N^2 blocks.
Actually, N might be too big, since there's an upper limit on the number of threads per block. Although you could choose a suitable M so that you use N/M threads per block, and N * M blocks.
In fact, you'll probably get the best results in this regard by picking some M (I'm guessing 256 will probably be near optimal) and launching with L=ceiling(N*N/M) blocks and M threads per block. Then each thread reconstructs an index in [0, M*L) based on its block and thread ID, and then those whose index is in [0,N*N) will proceed to split that index into an x and y coordinate and do work.

Accessing global memory in a kernel is costly, because of its latency. A global memory request (both reading and writing) takes hundreds of clock cycles to complete. You want to minimise the amount of times global memory is accessed, and access it in contiguous blocks.
If each piece of data is accessed exactly once, there's nothing to do about the latency, but that's seldom the case. And definitely not the case in your code, where the kernel array is accessed by all threads in the same pattern, and a lot of image is accessed by multiple threads as well.
The solution for that is to start the kernel by fetching the data from the high-latency global memory into the low-latency shared memory. Shared memory is a block of memory on the multiprocessor, and its latency is comparable to that of registers. So most simple kernels follow a structure like this:
Each thread fetches data from global memory to shared memory. You want to fetch data in contiguous sequences if possible, as global memory is accessed through transactions. If there's not enough data for all threads to fetch, leave some of them idle.
Threads operate on the data in shared memory.
Data is written from shared memory back to global memory in the same pattern as it was fetched in step 1.
Shared memory is shared by all threads within a thread block. Which leads us to the second big issue in your code: you're not using thread blocks at all. Threads in one block run on one multiprocessor, share shared memory, can be synchronised with each other etc. You need to organise threads into blocks well to get the most out of them.
The grid of blocks is just a mechanism to be able to run more blocks at one invocation. All the goodies of parallel instruction execution and shared memory access are within a block. The grid of blocks is just "yeah, sorry, my data's so big a single block won't do, just run many of them."
You're doing the exact opposite: your blocks have one thread each, which means that in each step, only one thread from each warp runs on the multiprocessor (based on your device's compute capability and the number of warp schedulers available, this means something like 2–4 threads on one multiprocessor at most).
You'll have to re-structure your threads to mirror the data access patterns, and prefetch data into shared memory. This will give you the performance boost you expect.
The above is just a short summary. Refer to the CUDA programming guide for details on block organisation, shared memory, and global memory transactions.

If you're using global memory in CUDA, all the data access will be synchronized in something like queue, and you'll receive almost linear solution, not parallel.
Also, transferring a large dataset from your RAM to GPU memory takes a lot of time (the speed of the bus is limited).
So, I think you have to somehow partition your data across the computation units in your GPU (split it into shared memory).
Check this to see solution of how to improve your GPU memory usage in the case that similar to yours.

Numerical error in cuda/cublas simple kernel using particular input

I am working with cuda and cublas and I was trying to implement simple operations like matrix element-wise multiplication/division. I am using only float for my experiments. I know the most obvious way to do it is to write a kernel like this one:
// Writes dest[i] = source[i] * value for every i in [0, n).
//
// Works with any 1D launch: each thread starts at its global index and
// advances by the total number of launched threads.
//
//   n       number of elements
//   source  input values
//   dest    output values (may be the same buffer as source)
//   value   scalar factor
__global__ void mul_elementwise(const unsigned int n, float* source, float* dest, const float value)
{
    const unsigned int step = blockDim.x * gridDim.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n)
    {
        dest[idx] = source[idx] * value;
        idx += step;
    }
}
This kernel can work both for multiplication and division (just using 1/x as value). But this can be achieved using cublas library too: suppose we have a matrix A m x n stored in column-major style and a scalar x, then setting alpha = x or alpha = 1/x and d_ones as a vector of m*n 1s, we can invoke and obtain the same result
cublasSaxpy(cublas_handle, m * n, &alpha, d_ones, 1, A_dev, 1);
Both methods work just fine, but I am facing a few problems with one particular matrix, for which both methods do not work. I isolated this big matrix and built an MCVE available here (you can compile it with nvcc mcve.cu -lcublas). As you can see, the results in both cases are totally wrong: the host result is totally different. I am trying to figure out what's going on. I do not see any error in the code, but maybe I should try to use double instead of float and see what happens.
Any opinions about this situation? Thanks in advance!
EDIT #1 I tried using doubles but nothing changes if I use cublasDaxpy meanwhile it works perfectly with the custom kernel. I think the values are too small so single floating point precision is not enough.

Interesting MCVE. Wouldn't it have been possible to shrink your vector down to just a few elements? Isn't it possible to show the calculation discrepancy based on just 1 vector element?
Anyway I see several problems.
Your kernel implements the following function: y=alpha*x. But SAXPY implements y=alpha*x+y. Now, if y started out as (all) zero, then these two would be the same. But that's not what you have:
CUBLAS Your Kernel
---------------------------
alpha: alpha alpha
x: 1 ahost (ahost is your huge data array)
y: ahost -
So your kernel is computing y=alpha * ahost, but your CUBLAS call is computing y = alpha*1 + ahost. I wouldn't expect the same result from these, in general.
Your analysis of error seems flawed in a few ways. First, you are computing the absolute error in a float variable (a number which will always be positive, since it's the absolute value), but then you're comparing it against a negative number:
float diff = abs(host[i]-dev[i]);
...
if (diff > (-1e12))
won't that if test always be true? Perhaps you meant 1e-12 although that would still be flawed. Looking for a fixed error threshold on a floating point comparison should be scaled to the size of the numbers being compared. float quantities only contain about 6-7 accurate decimal digits. (And summing these errors is also troublesome.)
Here is a complete code that has the above issues fixed, and produces zero sum error for all the comparisons (host<->kernel and host<->cublas):
// Test data for the element-wise scaling comparison.
// The initialisers are integer hex literals, so each one is converted to
// float by value (0xB58DA1CF becomes roughly 3.0e9f); it is not reinterpreted
// as an IEEE-754 bit pattern.
// NOTE(review): the non-zero entries look like raw bit patterns of small
// floats - confirm which interpretation is intended.
static float array[] = {0x00000000,
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0xB58DA1CF,0xB50D2FEC,0x34A48536,0xB4A1D5BC,0x358E1345,0x35943AAC,0xB5983F40,0xB43628BB,0xB4A95348,0xB4DB751C,0xB50C8D1A,0xB3EFCBB5,0x3552B8CD,0x3538A167,0x358FDE0D,0xB4D54CE9,0xB5D29BB7,0xB4A234EE,0x346EF2F4,0x35B5D9F2,0xB40F1487,0x3554BC20,0x33FD9466,0xB536D37D,0xB3C2E594,0xB59DA581,0x3584FC87,0x34438F09,0x35D293CB,0xB4FBB002,0xB59F41E9};
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <iostream>
#include <cublas_v2.h>
#define TOL 0.0001
typedef unsigned int u32;
#define GET_STRIDE() u32(blockDim.x * gridDim.x)
#define GET_OFFSET() u32(blockIdx.x * blockDim.x + threadIdx.x)
// Pass-through check for CUDA runtime results.
// In builds that define DEBUG or _DEBUG, a failing result is printed to
// stderr and then trips an assert; in all other builds nothing is checked.
// The result is always returned unchanged.
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
    const bool failed = (result != cudaSuccess);
    if (failed) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
#endif
    return result;
}
// Writes dest[i] = source[i] * value for every i in [0, n); dividing by x is
// done by passing value = 1/x.
//
// Works with any 1D launch: each thread starts at GET_OFFSET() and advances
// by GET_STRIDE().
//
//   n       number of elements
//   source  input values
//   dest    output values (the caller in this file passes the same buffer)
//   value   scalar factor
__global__ void div_elementwise(const u32 n, float* source, float* dest, const float value)
{
    const u32 step = GET_STRIDE();
    u32 idx = GET_OFFSET();

    while (idx < n)
    {
        dest[idx] = source[idx] * value;
        idx += step;
    }
}
// Compares a device result with the host reference and returns the summed
// error over all elements that differ.
//
// For each differing element the error is relative to the host value,
// |host - dev| / |host|. When the host value is exactly zero the absolute
// error is used instead, so the result never becomes inf from a division by
// zero. Elements whose error exceeds TOL are printed, and the total is
// printed at the end.
//
//   dev   values copied back from the device
//   host  reference values computed on the host
//   len   number of elements in both arrays
float check_eq(float* dev, float* host, u32 len)
{
    float sum = 0.0f;
    for (u32 i = 0; i < len; ++i)
    {
        if (dev[i] == host[i])
            continue;

        // fabsf keeps the computation in float; plain abs can resolve to the
        // integer overload and truncate the error to zero.
        const float err = fabsf(host[i] - dev[i]);
        const float scale = fabsf(host[i]);
        const float diff = (scale > 0.0f) ? err / scale : err;

        sum += diff;
        if (diff > (TOL))
            printf("diff %u %f\n", i, diff);
    }
    printf("%f\n", sum);
    return sum;
}
// Host reference: scales the first len elements of a by v in place.
void div_host(float* a, float v, u32 len)
{
    u32 i = 0;
    while (i < len)
    {
        a[i] *= v;
        ++i;
    }
}
// Scales the test array by alpha three ways - on the host, with the custom
// kernel, and with cublasSaxpy onto a zeroed vector - and prints the error of
// each device result against the host reference.
//
// Returns 0 on success, -1 if the input contains a NaN, 1 on a cuBLAS error.
int main()
{
    const u32 len = sizeof(array)/sizeof(float);
    printf("array len = %u\n", len);
    for (u32 i = 0; i < len; i++) if (isnan(array[i])) {printf("nan value at %u\n",i); return -1;}
    float* adev, *adevcublas, *d_zero;
    float* ahost = (float*) malloc(len * sizeof(float));
    checkCuda(cudaMalloc(&adev, len * sizeof(float)));
    checkCuda(cudaMalloc(&adevcublas, len * sizeof(float)));
    checkCuda(cudaMalloc(&d_zero, len * sizeof(float)));
    memcpy(ahost, &array[0], len * sizeof(float));
    checkCuda(cudaMemcpy(adev, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
    checkCuda(cudaMemcpy(adevcublas, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
    // SAXPY computes y = alpha*x + y, so y must start as all zeros for the
    // result to equal alpha*x.
    checkCuda(cudaMemset(d_zero, 0, len*sizeof(float)));
    float alpha = 1/2494.f;
    printf("%f\n", alpha);
    div_host(ahost, alpha, len);

    // Custom kernel, in place on adev.
    u32 tb = 256;
    div_elementwise<<<((len + tb - 1) / tb),tb>>>(len, adev, adev, alpha);
    checkCuda(cudaGetLastError());  // launch errors are not reported otherwise
    float* r = (float*) malloc(len * sizeof(float));
    checkCuda(cudaMemcpy(r, adev, len * sizeof(float), cudaMemcpyDeviceToHost));
    check_eq(r,ahost,len);

    // cuBLAS path. Failures fall through to the shared cleanup below instead
    // of returning early and leaking every buffer.
    int rc = 0;
    float* r0 = (float*) malloc(len * sizeof(float));
    cublasHandle_t ch;
    cublasStatus_t stat = cublasCreate(&ch);
    if (stat == CUBLAS_STATUS_SUCCESS) {
        stat = cublasSaxpy(ch, len, &alpha, adevcublas, 1, d_zero, 1);
        if (stat == CUBLAS_STATUS_SUCCESS) {
            checkCuda(cudaMemcpy(r0, d_zero, len * sizeof(float), cudaMemcpyDeviceToHost));
            check_eq(r0,ahost,len);
        }
        cublasDestroy(ch);
    }
    if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; rc = 1;}

    free(r);
    free(r0);
    free(ahost);
    cudaFree(adev);
    cudaFree(adevcublas);  // previously leaked
    cudaFree(d_zero);      // previously leaked
    return rc;
}

Cuda, calculate distance matrix between 3d objects

I have a "string"(molecule) of connected N objects(atoms) in 3D (each atom has a coordinates). And I need to calculate a distance between each pair of atoms in a molecule (see pseudo code below ). How could it be done with CUDA? Should I pass to a kernel function 2 3D Arrays? Or 3 arrays with coordinates: X[N], Y[N], Z[N]? Thanks.
// One atom of the molecule, described by its 3D coordinates.
struct atom
{
    double x, y, z;
};
int main()
{
//N number of atoms in a molecule
double DistanceMatrix[N][N];
double d;
atom Atoms[N];
for (int i = 0; i < N; i ++)
for (int j = 0; j < N; j++)
DistanceMatrix[i][j] = (atoms[i].x -atoms[j].x)*(atoms[i].x -atoms[j].x) +
(atoms[i].y -atoms[j].y)* (atoms[i].y -atoms[j].y) + (atoms[i].z -atoms[j].z)* (atoms[i].z -atoms[j].z;
}

Unless you're working with very large molecules, there probably won't be enough work to keep the GPU busy, so calculations will be faster with the CPU.
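If you meant to calculate the Euclidean distance, your calculation is not correct: it yields the squared distance. You need the 3D version of the Pythagorean theorem, i.e. the square root of the sum of the squared coordinate differences.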
I would use a SoA for storing the coordinates.
You want to generate a memory access pattern with as many coalesced reads and writes as possible. To do that, arrange for addresses or indexes generated by the 32 threads in each warp to be as close to each other as possible (a bit simplified).
threadIdx designates thread indexes within a block and blockIdx designates block indexes within the grid. blockIdx is always the same for all threads in a warp. Only threadIdx varies among the threads in a block. To visualize how the 3 dimensions of threadIdx are assigned to threads, think of them as nested loops where x is the inner loop and z is the outer loop. So, threads with adjacent x values are the most likely to be within the same warp and, if blockDim.x is divisible by 32, only threads that share the same threadIdx.x / 32 value (and the same y and z values) are within the same warp.
I have included a complete example for your algorithm below. In the example, the i index is derived from threadIdx.x so, to check that warps would generate coalesced reads and writes, I would go over the code while inserting a few consecutive values such as 0, 1 and 2 for i and checking that the generated indexes would also be consecutive.
Addresses generated from the j index are less important as j is derived from threadIdx.y and so is less likely to vary within a warp (and will never vary if blockDim.x is divisible by 32).
#include "cuda_runtime.h"
#include <iostream>
using namespace std;
const int N(20);
#define check(ans) { _check((ans), __FILE__, __LINE__); }
inline void _check(cudaError_t code, char *file, int line)
{
if (code != cudaSuccess) {
fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(code);
}
}
// Integer division of a by b that bumps the quotient by one whenever the
// division leaves a remainder (ceiling division for non-negative operands).
int div_up(int a, int b) {
    int quotient = a / b;
    if (a % b != 0) {
        ++quotient;
    }
    return quotient;
}
// Forward declaration of the distance kernel so main() can launch it; the
// definition follows main(). Takes the output matrix and the x, y and z
// coordinate arrays as separate pointers.
__global__ void calc_distances(double* distances,
double* atoms_x, double* atoms_y, double* atoms_z);
int main(int argc, char **argv)
{
double* atoms_x_h;
check(cudaMallocHost(&atoms_x_h, N * sizeof(double)));
double* atoms_y_h;
check(cudaMallocHost(&atoms_y_h, N * sizeof(double)));
double* atoms_z_h;
check(cudaMallocHost(&atoms_z_h, N * sizeof(double)));
for (int i(0); i < N; ++i) {
atoms_x_h[i] = i;
atoms_y_h[i] = i;
atoms_z_h[i] = i;
}
double* atoms_x_d;
check(cudaMalloc(&atoms_x_d, N * sizeof(double)));
double* atoms_y_d;
check(cudaMalloc(&atoms_y_d, N * sizeof(double)));
double* atoms_z_d;
check(cudaMalloc(&atoms_z_d, N * sizeof(double)));
check(cudaMemcpy(atoms_x_d, atoms_x_h, N * sizeof(double), cudaMemcpyHostToDevice));
check(cudaMemcpy(atoms_y_d, atoms_y_h, N * sizeof(double), cudaMemcpyHostToDevice));
check(cudaMemcpy(atoms_z_d, atoms_z_h, N * sizeof(double), cudaMemcpyHostToDevice));
double* distances_d;
check(cudaMalloc(&distances_d, N * N * sizeof(double)));
const int threads_per_block(256);
dim3 n_blocks(div_up(N, threads_per_block));
calc_distances<<<n_blocks, threads_per_block>>>(distances_d, atoms_x_d, atoms_y_d, atoms_z_d);
check(cudaPeekAtLastError());
check(cudaDeviceSynchronize());
double* distances_h;
check(cudaMallocHost(&distances_h, N * N * sizeof(double)));
check(cudaMemcpy(distances_h, distances_d, N * N * sizeof(double), cudaMemcpyDeviceToHost));
for (int i(0); i < N; ++i) {
for (int j(0); j < N; ++j) {
cout << "(" << i << "," << j << "): " << distances_h[i + N * j] << endl;
}
}
check(cudaFree(distances_d));
check(cudaFreeHost(distances_h));
check(cudaFree(atoms_x_d));
check(cudaFreeHost(atoms_x_h));
check(cudaFree(atoms_y_d));
check(cudaFreeHost(atoms_y_h));
check(cudaFree(atoms_z_d));
check(cudaFreeHost(atoms_z_h));
return 0;
}
// Writes the squared distance between every pair of atoms (i, j), with
// i, j in [0, N), to distances[i + N * j]. No square root is taken.
//
// Launch layout: i is derived from the x dimension of the grid and j from
// the y dimension. Both dimensions are walked with a stride equal to the
// total number of launched threads in that dimension, so the full N x N
// matrix is filled for any grid/block shape, including a 1-D launch.
// Uses no shared memory. `distances` must hold N * N doubles and each
// coordinate array N doubles.
__global__ void calc_distances(double* distances,
                               double* atoms_x, double* atoms_y, double* atoms_z)
{
    const int stride_i = blockDim.x * gridDim.x;
    const int stride_j = blockDim.y * gridDim.y;

    for (int j = threadIdx.y + blockIdx.y * blockDim.y; j < N; j += stride_j) {
        for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += stride_i) {
            const double dx = atoms_x[i] - atoms_x[j];
            const double dy = atoms_y[i] - atoms_y[j];
            const double dz = atoms_z[i] - atoms_z[j];
            // Threads adjacent in x write adjacent elements of the output.
            distances[i + N * j] = dx * dx + dy * dy + dz * dz;
        }
    }
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js