I just started learning CUDA and I have been looking at examples on NVIDIA's website. Specifically, I have implemented the non-shared version of the matrix multiply (the first sample is the non-shared version even though it is in the shared memory section):
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory
I am having a problem with the output when I change the block sizes. NVIDIA's code has a default block size of 16 and this gives me the correct output when I multiply two matrices. However, if I change the block size to anything above 16 (while still being a multiple of 16), I get an output of zero for all elements in the matrix. I tested this on my laptop too and noticed the same results for anything over 32 rather than 16. Could someone explain what is happening? I have two 9800GTX+ video cards in SLI and so I should have a maximum block size of (512,512,1). Why can I only do 16?
Also, I am noticing the same behavior in the shared version of the matrix multiplication (also on the NVIDIA page).
I didn't post my code because I get the same problem if I directly copy the code from the NVIDIA site.
I would really appreciate any help with this or with resources to learn more about these kinds of CUDA details.
Thank you!
I have attached the code as requested:
#include "stdio.h"
#include <cuda.h>
#include <assert.h>
#include <time.h>
#include <math.h>
// This is an example CUDA program that compares the timings of a matrix multiplication.
// The comparisons are between the CPU, GPU, and the GPU with shared memory.
#define BLOCK_SIZE 32
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
typedef void (*FuncPtr)(Matrix& A, Matrix& B, Matrix& C);
void multiplyMatrix(Matrix& A, Matrix& B, Matrix& C);
// Helper declarations
void initializeMatrix(Matrix& A, int rows, int cols, float val);
void copyMatrix(Matrix& dest, Matrix& src);
void freeMatrix(Matrix& A);
void printError(cudaError_t err);
void printMat(Matrix& A);
void setVal(Matrix& A, float val);
double applyMultFunc(FuncPtr func, Matrix& A, Matrix& B, Matrix& C, int numOfIters);
// CUDA declarations
__global__ void cudaMultMat(Matrix A, Matrix B, Matrix C);
int main() {
printf("Beginning Matrix Multiplication Comparison\n");
// Initialize matrix
Matrix A, B, C;
int rowsA = 32;
int colsA = 32;
int colsB = 32;
initializeMatrix(A, rowsA, colsA, 5.0f);
initializeMatrix(B, colsA, colsB, 2.0f);
initializeMatrix(C, rowsA, colsB, 0.0f);
// C = A * B using CPU, GPU, and GPU with shared memory
FuncPtr gpuMatMult = &multiplyMatrix;
int numOfIterations = 100;
double multTime = applyMultFunc(gpuMatMult, A, B, C, numOfIterations);
printMat(C);
// Update user
printf("Normal Mat Mult Time: %f\n", multTime);
// Cleanup
freeMatrix(A);
freeMatrix(B);
freeMatrix(C);
printf("\nPress Enter to continue...\n");
getchar();
return 0;
}
void multiplyMatrix(Matrix& A, Matrix& B, Matrix& C) {
// Initialize device matrices
Matrix deviceA, deviceB, deviceC;
copyMatrix(deviceA, A);
copyMatrix(deviceB, B);
copyMatrix(deviceC, C);
// Initialize number of blocks and threads
dim3 numOfThreadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
int xSize = (C.width + numOfThreadsPerBlock.x - 1) / numOfThreadsPerBlock.x;
int ySize = (C.height + numOfThreadsPerBlock.y - 1) / numOfThreadsPerBlock.y;
dim3 numOfBlocks(xSize, ySize);
// Call CUDA kernel
cudaMultMat<<<numOfBlocks, numOfThreadsPerBlock>>>(deviceA, deviceB, deviceC);
printError(cudaThreadSynchronize());
printError(cudaMemcpy(C.elements, deviceC.elements, C.height * C.width * sizeof(float), cudaMemcpyDeviceToHost));
// Free cuda memory
printError(cudaFree(deviceA.elements));
printError(cudaFree(deviceB.elements));
printError(cudaFree(deviceC.elements));
}
// CUDA definitions
// GPU matrix multiplication (non-shared memory)
__global__ void cudaMultMat(Matrix A, Matrix B, Matrix C) {
// If the matrices are of the wrong size then return
if(A.width != B.height) {
return;
}
// Initialize the indexes into the grid
int col = (blockDim.x * blockIdx.x) + threadIdx.x;
int row = (blockDim.y * blockIdx.y) + threadIdx.y;
// Initialize the result
float cVal = 0.0f;
// Find the result for the dot product of a row of A and a column of B
for(int i = 0; i < A.width; i++) {
cVal += A.elements[row * A.width + i] * B.elements[i * B.width + col];
}
// If we are in bounds then save the result
if(row < C.height && col < C.width) {
C.elements[row * C.width + col] = cVal;
}
}
// Helper functions
void initializeMatrix(Matrix& A, int rows, int cols, float val) {
A.width = cols;
A.height = rows;
A.stride = A.width;
int numOfElements = A.width * A.height;
A.elements = (float*) malloc(numOfElements * sizeof(float));
for(int i = 0; i < numOfElements; i++) {
A.elements[i] = val;
}
}
void copyMatrix(Matrix& dest, Matrix& src) {
dest.width = src.width;
dest.height = src.height;
dest.stride = src.stride;
int size = src.width * src.height * sizeof(float);
printError(cudaMalloc(&dest.elements, size));
printError(cudaMemcpy(dest.elements, src.elements, size, cudaMemcpyHostToDevice));
}
void freeMatrix(Matrix& A) {
free(A.elements);
}
void printError(cudaError_t err) {
if(err != 0) {
printf("CUDA ERROR: %s\n", cudaGetErrorString(err));
getchar();
}
}
void printMat(Matrix& A) {
printf("*********************************\n");
for(int i = 0; i < A.height; i++) {
for(int j = 0; j < A.width; j++) {
int index = i * A.width + j;
printf("%2.1f, ", A.elements[index]);
}
printf("\n");
}
}
void setVal(Matrix& A, float val) {
for(int i = 0; i < A.width * A.height; i++) {
A.elements[i] = val;
}
}
double applyMultFunc(FuncPtr func, Matrix& A, Matrix& B, Matrix& C, int numOfIters) {
clock_t startTime = clock();
for(int i = 0; i < numOfIters; i++) {
func(A, B, C);
}
clock_t endTime = clock();
return (double) (endTime - startTime) / CLOCKS_PER_SEC;
}
You're exceeding the threads per block specification of your GPU when you increase the block sizes.
The 9800GTX has a limit of 512 threads per block, regardless of how you create the block. 16*16 = 256 which is OK. 32 x 32 = 1024 which is not OK. In this case the kernel fails to run and so the output is not correct.
Your laptop probably has a newer GPU which supports 1024 threads per block, so 32 x 32 is OK but anything larger is not.
If you add proper cuda error checking to the code you can confirm this. Note that this code appears to have cuda error checking, but the checking implemented on the kernel call is incoomplete. Study the link I gave and you will see the difference. If you modify the code with complete error checking, you will see the error.
if your GPU's compute capability is 1.0/1.1, you can have at most 512 threads per block. But in new GPU device, every block can have at most 1024 threads.
Related
I created some code to do a 2D convlution on a 1300x1300 grayscale image and a 15x15 kernel, in standard C++ and in CUDA. Both versions:
CPU:
#include <iostream>
#include <exception>
#define N 1300
#define K 15
#define K2 ((K - 1) / 2)
template<int mx, int my>
inline int index(int x, int y)
{
return x*my + y;
}
int main() {
double *image = new double[N * N];
double *kernel = new double[K * K];
double *result = new double[N * N];
for (int x=0; x<N; ++x)
for (int y=0; y<N; ++y)
{
double r = 0;
for(int i=0; i<K; ++i)
for(int j=0; j<K; ++j)
{
if (x + i - K2 >= 0 and
x + i - K2 < N and
y + j - K2 >= 0 and
y + j - K2 < N)
{
r += kernel[index<K,K>(i,j)] * image[index<N,N>(x+i-K2, y+j-K2)];
}
}
result[index<N,N>(x, y)] = r;
}
delete[] image;
delete[] kernel;
delete[] result;
}
GPU:
#include <iostream>
#include <exception>
// ignore, just for error handling
struct ErrorHandler {
int d_line;
char const *d_file;
ErrorHandler(int line, char const *file) : d_line(line), d_file(file) {};
};
#define EH ErrorHandler(__LINE__, __FILE__)
ErrorHandler operator<<(ErrorHandler eh, cudaError_t err)
{
if (err != cudaSuccess)
{
std::cerr << cudaGetErrorString( err ) << " in " << eh.d_file << " at line " << eh.d_line << '\n';
throw std::exception();
}
return eh;
}
// end.
#define N 1300
#define K 15
#define K2 ((K - 1) / 2)
template<int mx, int my>
__device__ inline int index(int x, int y)
{
return x*my + y;
}
__global__ void kernelkernel(double *image, double *kernel, double *result)
{
int x = blockIdx.x;
int y = blockIdx.y; // becomes: int y = threadIdx.x;
double r = 0;
for(int i=0; i<K; ++i)
for(int j=0; j<K; ++j)
{
if (x + i - K2 >= 0 and
x + i - K2 < N and
y + j - K2 >= 0 and
y + j - K2 < N)
{
r += kernel[index<K,K>(i,j)] * image[index<N,N>(x+i-K2, y+j-K2)];
}
}
result[index<N,N>(x, y)] = r;
}
int main() {
double *image = new double[N * N];
double *kernel = new double[K * K];
double *result = new double[N * N];
double *image_cuda;
double *kernel_cuda;
double *result_cuda;
EH << cudaMalloc((void **) &image_cuda, N*N*sizeof(double));
EH << cudaMalloc((void **) &kernel_cuda, K*K*sizeof(double));
EH << cudaMalloc((void **) &result_cuda, N*N*sizeof(double));
EH << cudaMemcpy(image_cuda, image, N*N*sizeof(double), cudaMemcpyHostToDevice);
EH << cudaMemcpy(kernel_cuda, kernel, K*K*sizeof(double), cudaMemcpyHostToDevice);
dim3 grid ( N, N );
kernelkernel<<<grid, 1>>>(image_cuda, kernel_cuda, result_cuda);
// replace previous 2 statements with:
// kernelkernel<<<N, N>>>(image_cuda, kernel_cuda, result_cuda);
EH << cudaMemcpy(result, result_cuda, N*N*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree( image_cuda );
cudaFree( kernel_cuda );
cudaFree( result_cuda );
delete[] image;
delete[] kernel;
delete[] result;
}
I would expect the cuda code to be a lot faster, however:
$ nvprof ./gpuversion
==17806== NVPROF is profiling process 17806, command: ./gpuversion
==17806== Profiling application: ./gpuversion
==17806== Profiling result:
Time(%) Time Calls Avg Min Max Name
99.89% 3.83149s 1 3.83149s 3.83149s 3.83149s kernelkernel(double*, double*, double*)
0.07% 2.6420ms 1 2.6420ms 2.6420ms 2.6420ms [CUDA memcpy DtoH]
0.04% 1.5111ms 2 755.54us 736ns 1.5103ms [CUDA memcpy HtoD]
And:
$ time ./cpuversion
real 0m3.382s
user 0m3.371s
sys 0m0.012s
Their difference is statistically insignificant. The CUDA-kernel takes approximately 3-4 seconds, why isn't it a lot faster? Is my code run in parallel?
PS: I'm new to CUDA, so I could be missing something trivial.
SOLUTION
What I found out, is that CUDA does not let you access memory willy-nilly from blocks. I guess the general strategy of CUDA programming is:
allocate and copy memory from RAM to cuda using cudaMalloc and cudaMemCpy
divide the workload among blocks and threads in such a way that the memory accessed by different blocks doesn't overlap much.
If there is overlap between the memory used by blocks, start each block by copying the memory inside a shared array. Notice that:
the size of this array must be known compile time
it's size is limited
this memory is shared by each thread in ONE block, so __shared double foo[10] allocates 10 doubles for each BLOCK.
copy the memory needed by one block to the shared variables inside the kernel. Of course, you use the different threads to do this 'efficiently'
sync the threads, such that all data is there before it is used.
process the data, and write the result. it to the output array of the kernel
synch again, I'm not sure why, but everyone on the internet is doing it :S
copy the GPU memory back to RAM
clean up the GPU memory.
This gives the following code. It is mex-code, for Matlab for the structural similarity, which also works via a sliding kernel, but over 2 images and with a different aggregate than the dot-product.
// author: Herbert Kruitbosch, CC: be nice, include my name in documentation/papers/publications when used
#include <matrix.h>
#include <mex.h>
#include <cmath>
#include <iostream>
#include <fstream>
#include <iostream>
#include <stdio.h>
static void HandleError(
cudaError_t err,
const char *file,
int line )
{
if (err != cudaSuccess)
{
printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define TILE_WIDTH 31
__device__ inline double sim(double v0, double v1, double c)
{
return (c + 2*v0*v1) / (c + v1*v1 + v0*v0);
}
__device__ inline int index(int rows, int cols, int row, int col)
{
return row + col*rows;
}
__global__ void ssimkernel(double *test, double *reference, const double * __restrict__ kernel, double *ssim, int k, int rows, int cols, int tile_batches_needed)
{
int radius = k / 2;
int block_width = TILE_WIDTH - k + 1;
__shared__ double tile_test [TILE_WIDTH][TILE_WIDTH];
__shared__ double tile_reference[TILE_WIDTH][TILE_WIDTH];
for(int offset=0; offset < tile_batches_needed; ++offset)
{
int dest = block_width*block_width*offset + threadIdx.y * block_width + threadIdx.x;
int destRow = dest / TILE_WIDTH;
int destCol = dest % TILE_WIDTH;
int srcRow = blockIdx.y * block_width + destRow - radius;
int srcCol = blockIdx.x * block_width + destCol - radius;
int src = srcCol * rows + srcRow;
if (destRow < TILE_WIDTH)
{
if (srcRow >= 0 and srcRow < rows and
srcCol >= 0 and srcCol < cols)
{
tile_test [destRow][destCol] = test [src];
tile_reference[destRow][destCol] = reference[src];
}
else
{
tile_test [destRow][destCol] = 0;
tile_reference[destRow][destCol] = 0;
}
}
}
__syncthreads();
double mean_test = 0;
double mean_reference = 0;
for(int i=0; i<k; ++i)
for(int j=0; j<k; ++j)
{
double w = kernel[i * k + j];
mean_test += w * tile_test [threadIdx.y+i][threadIdx.x+j];
mean_reference += w * tile_reference[threadIdx.y+i][threadIdx.x+j];
}
double var_test = 0;
double var_reference = 0;
double correlation = 0;
for(int i=0; i<k; ++i)
for(int j=0; j<k; ++j)
{
double w = kernel[i * k + j];
double a = (tile_test [threadIdx.y+i][threadIdx.x+j] - mean_test );
double b = (tile_reference[threadIdx.y+i][threadIdx.x+j] - mean_reference);
var_test += w * a * a;
var_reference += w * b * b;
correlation += w * a * b;
}
int destRow = blockIdx.y * block_width + threadIdx.y;
int destCol = blockIdx.x * block_width + threadIdx.x;
if (destRow < rows and destCol < cols)
ssim[destCol * rows + destRow] = sim(mean_test, mean_reference, 0.01) * (0.03 + 2*correlation) / (0.03 + var_test + var_reference);
__syncthreads();
}
template<typename T>
inline T sim(T v0, T v1, T c)
{
return (c + 2*v0*v1) / (c + v1*v1 + v0*v0);
}
inline int upperdiv(int a, int b) {
return (a + b - 1) / b;
}
void mexFunction(int nargout, mxArray *argout[], int nargin, const mxArray *argin[])
{
mwSize rows = mxGetDimensions(argin[0])[0];
mwSize cols = mxGetDimensions(argin[0])[1];
mwSize k = mxGetDimensions(argin[2])[0];
mwSize channels = mxGetNumberOfDimensions(argin[0]) <= 2 ? 1 : mxGetDimensions(argin[0])[2];
int dims[] = {rows, cols, channels};
argout[0] = mxCreateNumericArray(3, dims, mxDOUBLE_CLASS, mxREAL);
double *test = (double *)mxGetData(argin[0]);
double *reference = (double *)mxGetData(argin[1]);
double *gaussian = (double *)mxGetData(argin[2]);
double *ssim = (double *)mxGetData(argout[0]);
double *test_cuda;
double *reference_cuda;
double *gaussian_cuda;
double *ssim_cuda;
HANDLE_ERROR( cudaMalloc((void **) &test_cuda, rows*cols*sizeof(double)) );
HANDLE_ERROR( cudaMalloc((void **) &reference_cuda, rows*cols*sizeof(double)) );
HANDLE_ERROR( cudaMalloc((void **) &gaussian_cuda, k*k*sizeof(double)) );
HANDLE_ERROR( cudaMalloc((void **) &ssim_cuda, rows*cols*sizeof(double)) );
HANDLE_ERROR( cudaMemcpy(gaussian_cuda, gaussian, k*k*sizeof(double), cudaMemcpyHostToDevice) );
int block_width = TILE_WIDTH - k + 1;
int tile_batches_needed = upperdiv(TILE_WIDTH*TILE_WIDTH, block_width*block_width);
for(int c=0; c<channels; ++c)
{
HANDLE_ERROR( cudaMemcpy(test_cuda, test + rows*cols*c, rows*cols*sizeof(double), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(reference_cuda, reference + rows*cols*c, rows*cols*sizeof(double), cudaMemcpyHostToDevice) );
dim3 dimGrid(upperdiv(cols, block_width), upperdiv(rows, block_width), 1);
dim3 dimBlock(block_width, block_width, 1);
ssimkernel<<<dimGrid, dimBlock>>>(test_cuda, reference_cuda, gaussian_cuda, ssim_cuda, k, rows, cols, tile_batches_needed);
HANDLE_ERROR( cudaMemcpy(ssim + rows*cols*c, ssim_cuda, rows*cols*sizeof(double), cudaMemcpyDeviceToHost) );
}
cudaFree( test_cuda );
cudaFree( reference_cuda );
cudaFree( gaussian_cuda );
cudaFree( ssim_cuda );
}
kernelkernel<<<grid, 1>>>
This is a significant issue; threads on nVidia GPUs work in warps of 32 threads. However, you've only assigned a single thread to each block, which means 31 of those threads will sit idle while a single thread does work. And usually, for kernels where you have the flexibility, you'll usually want several warps per block rather than just one.
You could get an immediate speedup by using N blocks and N threads per block, rather than using N^2 blocks.
Actually, N might be too big, since there's an upper limit on the number of threads per block. Although you could choose a suitable M so that that you use N/M threads per block, and N * M blocks.
In fact, you'll probably get the best results in this regard by picking some M (I'm guessing 256 will probably be near optimal) and launching with L=ceiling(N*N/M) blocks and M blocks per thread. Then each thread figures reconstructs an index in [0, M*L) based on its block and thread ID, and then those whose index is in [0,N*N) will proceed to split that index into an x and y coordinate and do work.
Accessing global memory in a kernel is costly, because of its latency. A global memory request (both reading and writing) takes hundreds of clock cycles to complete. You want to minimise the amount of times global memory is accessed, and access it in contiguous blocks.
If each piece of data is accessed exactly once, there's nothing to do about the latency, but that's seldom the case. And definitely not the case in your code, where the kernel array is accessed by all threads in the same pattern, and a lot of image is accessed by multiple threads as well.
The solution for that is to start the kernel by fetching the data from the high-latency global memory into the low-latency shared memory. Shared memory is a block of memory on the multiprocessor, and its latency is comparable to that of registers. So most simple kernels follow a structure like this:
Each thread fetches data from global memory to shared memory. You want to fetch data in contiguous sequences if possible, as global memory is accessed through transactions. If there's not enough data for all threads to fetch, leave some of them idle.
Threads operate on the data in shared memory.
Data is written from shared memory back to global memory in the same pattern as it was fetched in step 1.
Shared memory is shared by all threads within a thread block. Which leads us to the second big issue in your code: you're not using thread blocks at all. Threads in one block run on one multiprocessor, share shared memory, can be synchronised with each other etc. You need to organise threads into blocks well to get the most out of them.
The grid of blocks is just a mechanism to be able to run more blocks at one invocation. All the goodies of parallel instruction execution and shared memory access are within a block. The grid of blocks is just "yeah, sorry, my data's so big a single block won't do, just run many of them."
You're doing the exact opposite: your blocks have one thread each, which means that in each step, only one thread from each warp runs on the multiprocessor (based on your device's compute capability and the number of warp schedulers available, this means something like 2–4 threads on one multiprocessor at most).
You'll have to re-structure your threads to mirror the data access patterns, and prefetch data into shared memory. This will give you the performance boost you expect.
The above is just a short summary. Refer to the CUDA programming guide for details on block organisation, shared memory, and global memory transactions.
If you're using global memory in CUDA, all the data access will be synchronized in something like queue, and you'll receive almost linear solution, not parallel.
Also, transfering a large dataset from your RAM memory to GPU memory also takes a lot of time (the speed of bus is limited).
So, i think you have to somehow parallel your data across computation units in your GPU (part them into shared memory).
Check this to see solution of how to improve your GPU memory usage in the case that similar to yours.
I am working with cuda and cublas and I was trying to implement simple operations like matrix element-wise multiplication/division. I am using only float for my experiments. I know the most obvious way to do it is to write a kernel like this one:
__global__ void mul_elementwise(const unsigned int n, float* source, float* dest, const float value)
{
const unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int stride = blockDim.x * gridDim.x;
for (unsigned int i = offset; i < n; i += stride)
{
dest[i] = source[i] * value;
}
}
This kernel can work both for multiplication and division (just using 1/x as value). But this can be achieved using cublas library too: suppose we have a matrix A m x n stored in column-major style and a scalar x, then setting alpha = x or alpha = 1/x and d_ones as a vector of m*n 1s, we can invoke and obtain the same result
cublasSaxpy(cublas_handle, m * n, &alpha, d_ones, 1, A_dev, 1);
Both methods work just fine, but I am facing few problems with some particular matrix, for which both methods do no work. I isolated this big matrix and build a MCVE available here (you can compile it with nvcc mcve.cu -lcublas. As you can see the results in both cases are totally wrong: host result is totally different, I am trying to figure out what's going on. I do not see any error in code but maybe i should try to use double instead of float and see what happens.
Any opinions about this situation? Thanks in advance!
EDIT #1 I tried using doubles but nothing changes if I use cublasDaxpy meanwhile it works perfectly with the custom kernel. I think the values are too small so single floating point precision is not enough.
Interesting MCVE. Wouldn't it have been possible to shrink your vector down to just a few elements? Isn't it possible to show the calculation discrepancy based on just 1 vector element?
Anyway I see several problems.
Your kernel implements the following function: y=alpha*x. But SAXPY implements y=alpha*x+y. Now, if y started out as (all) zero, then these two would be the same. But that's not what you have:
CUBLAS Your Kernel
---------------------------
alpha: alpha alpha
x: 1 ahost (ahost is your huge data array)
y: ahost -
So your kernel is computing y=alpha * ahost, but your CUBLAS call is computing y = alpha*1 + ahost. I wouldn't expect the same result from these, in general.
Your analysis of error seems flawed in a few ways. First, you are computing the absolute error in a float variable (a number which will always be positive, since it's the absolute value), but then you're comparing it against a negative number:
float diff = abs(host[i]-dev[i]);
...
if (diff > (-1e12))
won't that if test always be true? Perhaps you meant 1e-12 although that would still be flawed. Looking for a fixed error threshold on a floating point comparison should be scaled to the size of the numbers being compared. float quantities only contain about 6-7 accurate decimal digits. (And summing these errors is also troublesome.)
Here is a complete code that has the above issues fixed, and produces zero sum error for all the comparisons (host<->kernel and host<->cublas):
static float array[] = {0x00000000,
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0xB58DA1CF,0xB50D2FEC,0x34A48536,0xB4A1D5BC,0x358E1345,0x35943AAC,0xB5983F40,0xB43628BB,0xB4A95348,0xB4DB751C,0xB50C8D1A,0xB3EFCBB5,0x3552B8CD,0x3538A167,0x358FDE0D,0xB4D54CE9,0xB5D29BB7,0xB4A234EE,0x346EF2F4,0x35B5D9F2,0xB40F1487,0x3554BC20,0x33FD9466,0xB536D37D,0xB3C2E594,0xB59DA581,0x3584FC87,0x34438F09,0x35D293CB,0xB4FBB002,0xB59F41E9};
#include <iostream>
#include <stdio.h>
#include <cublas_v2.h>
#include <assert.h>
#define TOL 0.0001
typedef unsigned int u32;
#define GET_STRIDE() u32(blockDim.x * gridDim.x)
#define GET_OFFSET() u32(blockIdx.x * blockDim.x + threadIdx.x)
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
__global__ void div_elementwise(const u32 n, float* source, float* dest, const float value)
{
for (u32 i = GET_OFFSET(); i < n; i += GET_STRIDE())
{
dest[i] = source[i] * value;
}
}
float check_eq(float* dev, float* host, u32 len)
{
float sum = 0.0f;
for (u32 i = 0; i < len; ++i)
{
if (dev[i]!=host[i])
{
//printf("diff %d %f %f\n", i, dev[i], host[i]);
//break;
float diff = abs((host[i]-dev[i])/host[i]);
sum += diff;
if (diff > (TOL))
printf("diff %d %f\n", i, diff);
}
}
printf("%f\n", sum);
return sum;
}
void div_host(float* a, float v, u32 len)
{
for (u32 i = 0; i < len; ++i)
{
a[i]=a[i]*v;
}
}
int main()
{
u32 len = sizeof(array)/sizeof(float);
printf("array len = %d\n", len);
for (int i =0; i < len; i++) if (isnan(array[i])) {printf("nan value at %d\n",i); return -1;}
float* adev, *adevcublas, *d_zero;
float* ahost = (float*) malloc(len * sizeof(float));
checkCuda(cudaMalloc(&adev, len * sizeof(float)));
checkCuda(cudaMalloc(&adevcublas, len * sizeof(float)));
checkCuda(cudaMalloc(&d_zero, len * sizeof(float)));
memcpy(ahost, &array[0], len * sizeof(float));
checkCuda(cudaMemcpy(adev, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(adevcublas, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemset(d_zero, 0, len*sizeof(float)));
float alpha = 1/2494.f;
printf("%f\n", alpha);
div_host(ahost, alpha, len);
u32 tb = 256;
div_elementwise<<<((len + tb - 1) / tb),tb>>>(len, adev, adev, alpha);
float* r = (float*) malloc(len * sizeof(float));
checkCuda(cudaMemcpy(r, adev, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r,ahost,len);
cublasHandle_t ch;
cublasCreate(&ch);
float* r0 = (float*) malloc(len * sizeof(float));
cublasStatus_t stat = cublasSaxpy(ch, len, &alpha, adevcublas, 1, d_zero, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
checkCuda(cudaMemcpy(r0, d_zero, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r0,ahost,len);
free(r);
free(r0);
free(ahost);
cudaFree(adev);
return 0;
}
I'm trying to do a parallel reduction to sum an array in CUDA. Currently i pass an array in which to store the sum of the elements in each block. This is my code:
#include <cstdlib>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <helper_cuda.h>
#include <host_config.h>
#define THREADS_PER_BLOCK 256
#define CUDA_ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
using namespace std;
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
struct double3c {
double x;
double y;
double z;
__host__ __device__ double3c() : x(0), y(0), z(0) {}
__host__ __device__ double3c(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
__host__ __device__ double3c& operator+=(const double3c& rhs) { x += rhs.x; y += rhs.y; z += rhs.z;}
__host__ __device__ double3c& operator/=(const double& rhs) { x /= rhs; y /= rhs; z /= rhs;}
};
class VectorField {
public:
double3c *data;
int size_x, size_y, size_z;
bool is_copy;
__host__ VectorField () {}
__host__ VectorField (int x, int y, int z) {
size_x = x; size_y = y; size_z = z;
is_copy = false;
CUDA_ERROR_CHECK (cudaMalloc(&data, x * y * z * sizeof(double3c)));
}
__host__ VectorField (const VectorField& other) {
size_x = other.size_x; size_y = other.size_y; size_z = other.size_z;
this->data = other.data;
is_copy = true;
}
__host__ ~VectorField() {
if (!is_copy) CUDA_ERROR_CHECK (cudaFree(data));
}
};
__global__ void KernelCalculateMeanFieldBlock (VectorField m, double3c* result) {
__shared__ double3c blockmean[THREADS_PER_BLOCK];
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);
else blockmean[threadIdx.x] = double3c(0,0,0);
__syncthreads();
for(int s = THREADS_PER_BLOCK / 2; s > 0; s /= 2) {
if (threadIdx.x < s) blockmean[threadIdx.x] += blockmean[threadIdx.x + s];
__syncthreads();
}
if(threadIdx.x == 0) result[blockIdx.x] = blockmean[0];
}
double3c CalculateMeanField (VectorField& m) {
int blocknum = (m.size_x * m.size_y * m.size_z - 1) / THREADS_PER_BLOCK + 1;
double3c *mean = new double3c[blocknum]();
double3c *cu_mean;
CUDA_ERROR_CHECK (cudaMalloc(&cu_mean, sizeof(double3c) * blocknum));
CUDA_ERROR_CHECK (cudaMemset (cu_mean, 0, sizeof(double3c) * blocknum));
KernelCalculateMeanFieldBlock <<<blocknum, THREADS_PER_BLOCK>>> (m, cu_mean);
CUDA_ERROR_CHECK (cudaPeekAtLastError());
CUDA_ERROR_CHECK (cudaDeviceSynchronize());
CUDA_ERROR_CHECK (cudaMemcpy(mean, cu_mean, sizeof(double3c) * blocknum, cudaMemcpyDeviceToHost));
CUDA_ERROR_CHECK (cudaFree(cu_mean));
for (int i = 1; i < blocknum; i++) {mean[0] += mean[i];}
mean[0] /= m.size_x * m.size_y * m.size_z;
double3c aux = mean[0];
delete[] mean;
return aux;
}
int main() {
VectorField m(100,100,100);
double3c sum = CalculateMeanField (m);
cout << sum.x << '\t' << sum.y << '\t' <<sum.z;
return 0;
}
EDIT
Posted a functional code. Constructing a VectorField with 10x10x10 elements works fine and gives mean 1, but constructing it with 100x100x100 elements gives mean ~0.97 (it varies from run to run). Is this a right way to do a parallel reduction, or should I stick to one kernel launch per block?
When I compile the code you have now on linux, I get the following warning:
t614.cu(55): warning: __shared__ memory variable with non-empty constructor or destructor (potential race between threads)
This type of warning should not be ignored. It is associated with this line of code:
__shared__ double3c blockmean[THREADS_PER_BLOCK];
Since the initialization of these objects stored in shared memory (by the constructor) will happen in some arbitrary order, and you have no barrier between that and the subsequent code that will also set these values, unpredictable things (*) can happen.
If I insert a __syncthreads() in the code to isolate the constructor activity from the subsequent code, I get expected results:
__shared__ double3c blockmean[THREADS_PER_BLOCK];
int index = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads(); // add this line
if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);
else blockmean[threadIdx.x] = double3c(0,0,0);
__syncthreads();
This still leaves us with the warning, however. A modification to fix this and make the warning go away would be to allocate the necessary __shared__ size dynamically. Change your shared memory declaration to this:
extern __shared__ double3c blockmean[];
and modify your kernel call:
KernelCalculateMeanFieldBlock <<<blocknum, THREADS_PER_BLOCK, THREADS_PER_BLOCK*sizeof(double3c)>>> (m, cu_mean);
This will eliminate the warning, produce the correct result, and avoid the unnecessary constructor traffic on the shared memory variable. (And the additional __syncthreads() described above is no longer necessary.)
*regarding "unpredictable things", if you look under the hood by inspecting either the generated SASS (cuobjdump -sass ...) or the PTX (**) (nvcc -ptx ...), you will see that each thread initializes the entire __shared__ array of objects to zero (the behavior of the default constructor). As a result of this, some of the threads (i.e. warps) can race ahead and begin populating the shared memory area according to this line:
if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);
Then, when other warps begin executing, those threads will clear out the entire shared memory array again. This racing behavior leads to unpredictable results.
** I don't normally suggest judging code behavior by inspecting the PTX, but in this case it is equally instructive. The final compile stages will not optimize away the constructor behavior.
this is my third post and attempt to solve this problem, which first
showed up using numpy.dot(A, A.T) where A is large, 150,000 x 265 elements.
With numpy, I got back an array with many missing values, that were just zeros.
I've tried to call BLAS thru CBLAS. I'm getting a segmentation fault error
with large arrays.
I'm running this on a machine with about 250 GB free memory.
Thanks for reading...
#include <stdio.h> /* I/O lib ISOC */
#include <stdlib.h> /* Standard Lib ISOC */
#include <cblas.h> /* C BLAS BLAS */
#include "blaio.h"
int main(int argc, char **argv) {
int row = 100000;
int col = 265;
float *a, *b, *c;
a = (float *) malloc(row * col * sizeof(float));
b = (float *) malloc(row * col * sizeof(float));
c = (float *) malloc(row * row * sizeof(float));
int i, end;
end = row * col;
for(i=0; i<end; i++)
{
a[i] = 1.0;
b[i] = 1.0;
}
for(i=0; i<(row*row); i++)
c[i] = 2.0;
// row_order transform transform rowsA colsB K alpha a lda b ldb beta c ldc
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, row, row, col, 1.0f, a, col, b, row, 0.0f, c, row);
int num_bad = 0;
for(i=0; i<(row*row); i++)
{
if (c[i] != col)
{
printf("Bad value found: %f, at index: %i\n", c[i], i );
num_bad += 1;
}
}
printf("Number of bad values found: %i \n\n", num_bad);
//printMatrix(CblasRowMajor, row, row, c, 8, 3, NULL, NULL, NULL, NULL, NULL, "c = ");
return 0;
} /* end func main */
UPDATE:
Ray has expertly noticed that the blas I'm using via cblas, must be 32 bit and not able to access the array indices. Therefore, I've installed blas64.x86_64 and blas64-devel.x86_64.
Then, rewrote a few lines of the code above to use the direct call to sgemm without cblas.
#include <stdio.h> /* I/O lib ISOC */
#include <stdlib.h> /* Standard Lib ISOC */
int main(int argc, char **argv) {
int row = 100000;
int col = 265;
float *a, *b, *c;
a = (float *) malloc(row * col * sizeof(float));
b = (float *) malloc(row * col * sizeof(float));
c = (float *) malloc(row * row * sizeof(float));
int i, end;
end = row * col;
for(i=0; i<end; i++)
{
a[i] = 1.0;
b[i] = 1.0;
}
for(i=0; i<(row*row); i++)
c[i] = 2.0;
float alpha = 1.0, beta = 1.0;
sgemm_('N','N', &row, &row, &col, &alpha, &a[0], &col, &b[0], &row, &beta, &c[0], &row);
I compiled with:
gcc sgemm_test_fortran.c -o test -L /usr/lib64 -lblas64
The code compiled and I think it might run.. :)
The problem is that the size of your output matrix (100,000x100,000 = 1e10 elements) can't be stored in an int (2.14e9). You can fix this in your C++ code by switching the types to size_t, but you're going to run into the same problem inside the BLAS library.
What you need to to do is use a BLAS library that is compiled to use 8-byte integers; most BLAS libraries are compiled with 4-byte integers. You don't mention what BLAS library you're linking to, so it's hard to guess what the correct library name is (if it even exists) on your system.
I have a "string"(molecule) of connected N objects(atoms) in 3D (each atom has a coordinates). And I need to calculate a distance between each pair of atoms in a molecule (see pseudo code below ). How could it be done with CUDA? Should I pass to a kernel function 2 3D Arrays? Or 3 arrays with coordinates: X[N], Y[N], Z[N]? Thanks.
struct atom
{
double x,y,z;
}
int main()
{
//N number of atoms in a molecule
double DistanceMatrix[N][N];
double d;
atom Atoms[N];
for (int i = 0; i < N; i ++)
for (int j = 0; j < N; j++)
DistanceMatrix[i][j] = (atoms[i].x -atoms[j].x)*(atoms[i].x -atoms[j].x) +
(atoms[i].y -atoms[j].y)* (atoms[i].y -atoms[j].y) + (atoms[i].z -atoms[j].z)* (atoms[i].z -atoms[j].z;
}
Unless you're working with very large molecules, there probably won't be enough work to keep the GPU busy, so calculations will be faster with the CPU.
If you meant to calculate the Euclidean distance, your calculation is not correct. You need the 3D version of the Pythagorean theorem.
I would use a SoA for storing the coordinates.
You want to generate a memory access pattern with as many coalesced reads and writes as possible. To do that, arrange for addresses or indexes generated by the 32 threads in each warp to be as close to each other as possible (a bit simplified).
threadIdx designates thread indexes within a block and blockIdx designates block indexes within the grid. blockIdx is always the same for all threads in a warp. Only threadIdx varies within the threads in a block. To visualize how the 3 dimensions of threadIdx are assigned to threads, think of them as nested loops where x is the inner loop and z is the outer loop. So, threads with adjacent x values are the most likely to be within the same warp and, if x is divisible by 32, only threads sharing the same x / 32 value are within the same warp.
I have included a complete example for your algorithm below. In the example, the i index is derived from threadIdx.x so, to check that warps would generate coalesced reads and writes, I would go over the code while inserting a few consecutive values such as 0, 1 and 2 for i and checking that the generated indexes would also be consecutive.
Addresses generated from the j index are less important as j is derived from threadIdx.y and so is less likely to vary within a warp (and will never vary if threadIdx.x is divisible by 32).
#include "cuda_runtime.h"
#include <iostream>
using namespace std;
const int N(20);
#define check(ans) { _check((ans), __FILE__, __LINE__); }
inline void _check(cudaError_t code, char *file, int line)
{
if (code != cudaSuccess) {
fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(code);
}
}
int div_up(int a, int b) {
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
__global__ void calc_distances(double* distances,
double* atoms_x, double* atoms_y, double* atoms_z);
int main(int argc, char **argv)
{
double* atoms_x_h;
check(cudaMallocHost(&atoms_x_h, N * sizeof(double)));
double* atoms_y_h;
check(cudaMallocHost(&atoms_y_h, N * sizeof(double)));
double* atoms_z_h;
check(cudaMallocHost(&atoms_z_h, N * sizeof(double)));
for (int i(0); i < N; ++i) {
atoms_x_h[i] = i;
atoms_y_h[i] = i;
atoms_z_h[i] = i;
}
double* atoms_x_d;
check(cudaMalloc(&atoms_x_d, N * sizeof(double)));
double* atoms_y_d;
check(cudaMalloc(&atoms_y_d, N * sizeof(double)));
double* atoms_z_d;
check(cudaMalloc(&atoms_z_d, N * sizeof(double)));
check(cudaMemcpy(atoms_x_d, atoms_x_h, N * sizeof(double), cudaMemcpyHostToDevice));
check(cudaMemcpy(atoms_y_d, atoms_y_h, N * sizeof(double), cudaMemcpyHostToDevice));
check(cudaMemcpy(atoms_z_d, atoms_z_h, N * sizeof(double), cudaMemcpyHostToDevice));
double* distances_d;
check(cudaMalloc(&distances_d, N * N * sizeof(double)));
const int threads_per_block(256);
dim3 n_blocks(div_up(N, threads_per_block));
calc_distances<<<n_blocks, threads_per_block>>>(distances_d, atoms_x_d, atoms_y_d, atoms_z_d);
check(cudaPeekAtLastError());
check(cudaDeviceSynchronize());
double* distances_h;
check(cudaMallocHost(&distances_h, N * N * sizeof(double)));
check(cudaMemcpy(distances_h, distances_d, N * N * sizeof(double), cudaMemcpyDeviceToHost));
for (int i(0); i < N; ++i) {
for (int j(0); j < N; ++j) {
cout << "(" << i << "," << j << "): " << distances_h[i + N * j] << endl;
}
}
check(cudaFree(distances_d));
check(cudaFreeHost(distances_h));
check(cudaFree(atoms_x_d));
check(cudaFreeHost(atoms_x_h));
check(cudaFree(atoms_y_d));
check(cudaFreeHost(atoms_y_h));
check(cudaFree(atoms_z_d));
check(cudaFreeHost(atoms_z_h));
return 0;
}
__global__ void calc_distances(double* distances,
double* atoms_x, double* atoms_y, double* atoms_z)
{
int i(threadIdx.x + blockIdx.x * blockDim.x);
int j(threadIdx.y + blockIdx.y * blockDim.y);
if (i >= N || j >= N) {
return;
}
distances[i + N * j] =
(atoms_x[i] - atoms_x[j]) * (atoms_x[i] - atoms_x[j]) +
(atoms_y[i] - atoms_y[j]) * (atoms_y[i] - atoms_y[j]) +
(atoms_z[i] - atoms_z[j]) * (atoms_z[i] - atoms_z[j]);
}