I have implemented the following CUDA code, but I am a little confused about its behavior.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <ctime>
#include <chrono>
#include <string>
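// IDX2F maps 1-based (row i, column j) indices to a 0-based column-major offset with leading dimension ld, matching the Fortran-style layout cuBLAS expects.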
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
void PrintMatrix(float* a, int n)
{
int j, i;
for (j = 1; j <= n; j++)
{
for (i = 1; i <= n; i++)
{
printf("%7.0f", a[IDX2F(i, j, n)]);
}
printf("\n");
}
}
float* CreateMatrix(int n)
{
float* matrix = static_cast<float *>(malloc(n * n * sizeof(float)));
if (!matrix)
{
printf("host memory allocation failed");
return nullptr;
}
for (int j = 1; j <= n; j++)
{
for (int i = 1; i <= n; i++)
{
matrix[IDX2F(i, j, n)] = 2;
}
}
return matrix;
}
long CudaMatrixMultiply(float* matrix, int n)
{
cudaError_t cudaStat;
cublasStatus_t status;
cublasHandle_t handle;
float* deviceMatrix;
cudaStat = cudaMalloc(reinterpret_cast<void**>(&deviceMatrix), n * n * sizeof(float));
if (cudaStat != cudaSuccess)
{
printf("device memory allocation failed");
return EXIT_FAILURE;
}
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("CUBLAS initialization failed\n");
return EXIT_FAILURE;
}
status = cublasSetMatrix(n, n, sizeof(float), matrix, n, deviceMatrix, n);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("data download failed");
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_FAILURE;
}
float alpha = 1;
float beta = 0;
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, deviceMatrix, n, deviceMatrix, n, &beta, deviceMatrix, n);
status = cublasGetMatrix(n, n, sizeof(float), deviceMatrix, n, matrix, n);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("data upload failed");
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_FAILURE;
}
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_SUCCESS;
}
float* CpuMatrixMultiply(float* matrix, int size)
{
float* result = static_cast<float*>(calloc(size * size, sizeof(float))); // zero-initialized; allocated with calloc so the free() in main matches the allocator
// Copied from https://msdn.microsoft.com/en-us/library/hh873134.aspx
for (int row = 1; row <= size; row++)
{
for (int col = 1; col <= size; col++)
{
// Multiply the row of A by the column of B to get the row, column of product.
for (int inner = 1; inner <= size; inner++)
{
// result[row][col] += matrix[row][inner] * matrix[inner][col];
result[IDX2F(col, row, size)] += matrix[IDX2F(inner, row, size)] * matrix[IDX2F(col, inner, size)];
}
}
}
free(matrix);
return result;
}
int main(void)
{
// printf("Matrix * Matrix Test\n");
int size = 1000;
int runs = 10;
for (int run = 0; run != runs; run++)
{
printf("=== Test %d (Matrix * Matrix, Size = %d) ===\n\n", run + 1, size);
printf("RAM usage is: %f GB\n", size * size * sizeof(float) / 1000000000.0);
float* cpuMatrix = CreateMatrix(size);
cpuMatrix = CpuMatrixMultiply(cpuMatrix, size);
PrintMatrix(cpuMatrix, 5);
float* gpuMatrix = CreateMatrix(size);
CudaMatrixMultiply(gpuMatrix, size);
PrintMatrix(gpuMatrix, 5);
free(cpuMatrix);
free(gpuMatrix);
}
getchar();
return EXIT_SUCCESS;
}
The output of the CPU version of the matrix multiplication is as expected:
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
but the result computed on the GPU is sometimes the right one (see above) and sometimes a wrong, seemingly random one. On the first iteration of the loop the result was always correct.
I am not able to find a mistake in my code, and it would be great if you could help me.
Additionally, if I set size (in the main method) to e.g. 16000, my driver crashes and I get an error message. I have filed a bug report with NVIDIA for this because my PC crashed twice. But maybe it is a programming fault on my part?
Driver: 364.72 (newest one)
SDK: CUDA Toolkit 7.5
Graphics Card: NVidia GeForce GTX 960 (4GB)
Windows 10 64Bit
Driver Error
Display driver NVIDIA Windows kernel Mode Driver, Version 362.72 stopped responding and has successfully recovered.
Edit: With the help of the community I found out that this is a problem with the watchdog timer. See the answer below.
Regarding the second part of the question, following njuffa's remark, you may change the driver behavior settings to avoid the error when increasing size. Open NSIGHT Monitor and, under Options > General > Microsoft Display Driver, set the "WDDM TDR enabled" field to False.
From the spec, the card's single-precision throughput is around 2.4 TFLOPS, so multiplying two 16000 x 16000 matrices should take at least about 3.4 seconds even at peak rate. Hence the driver recovery after 2 seconds.
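A rough back-of-the-envelope check of that estimate (assuming the ~2.4 TFLOPS peak figure above; sustained throughput will be lower):
// SGEMM on n x n matrices performs roughly 2*n^3 floating-point operations
double n = 16000.0;
double flops = 2.0 * n * n * n;       // about 8.2e12 FLOPs
double seconds = flops / 2.4e12;      // about 3.4 s even at peak -- well over the 2 s WDDM TDR limit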
I am testing a simple CUDA algorithm for timing, and I came across a case where increasing the grid size of the kernel gives incorrect results:
#include <unistd.h>
#include <stdio.h>
#include <assert.h>
/* we need these includes for CUDA's random number stuff */
#include <curand.h>
#include <curand_kernel.h>
#define MAX 10
#ifdef GRID
#define REPEAT GRID
#else
#define REPEAT 65535
#endif
#ifdef VECSIZE
#define SIZE VECSIZE
#else
#define SIZE 1024
#endif
__global__ void random(int *result) {
curandState_t state;
curand_init(100, 0, threadIdx.x, &state);
result[threadIdx.x] = curand(&state) % MAX;
//printf("th %d random %d\n", threadIdx.x, *result);
}
__global__ void myadd(const int *in, int *sum) {
sum[blockIdx.x] = 0;
//printf("thread %d value %d\n",threadIdx.x, in[threadIdx.x]);
atomicAdd_block(&sum[blockIdx.x], in[threadIdx.x]);
//atomicAdd(sum, in[threadIdx.x]);
}
int main() {
int check = 0;
/* allocate an int on the GPU */
int *x = new int[SIZE];
int *sum = new int[REPEAT];
int *d_x, *d_sum;
cudaMalloc(&d_x, sizeof(int) * SIZE);
cudaMalloc(&d_sum, sizeof(int) * REPEAT);
/* invoke the GPU to initialize all of the random states */
random<<<1, SIZE>>>(d_x);
myadd<<<REPEAT, SIZE>>>(d_x, d_sum);
cudaDeviceSynchronize();
/* copy the random number back */
cudaMemcpy(x, d_x, sizeof(int) * SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy(sum, d_sum, sizeof(int)* REPEAT, cudaMemcpyDeviceToHost);
for (int i = 0; i < SIZE; ++i) {
check += x[i];
//printf("Random[%d] = %d\n", i, x[i]);
}
cudaError_t err = cudaGetLastError(); // Get error code
if (err != cudaSuccess) {
printf("CUDA Error: %s\n", cudaGetErrorString(err));
exit(-1);
}
for (int i = 0; i < REPEAT; ++i) {
printf("i %d check %d sum[i] %d\n", i, check, sum[i]);
assert(check == sum[i]);
}
/* free the memory we allocated */
cudaFree(d_x);
cudaFree(d_sum);
delete[] x;
delete[] sum;
return 0;
}
My card is a V100 with compute capability 7.0. As you can see, I can compile the above code with different grid and vector sizes, e.g. nvcc test.cu -arch=sm_70 -O3 -g -G -DGRID=1024 -DVECSIZE=512. For small vector and grid sizes everything looks good, but when I increase the grid size to the maximum (65535), the computed sum value is sometimes incorrect. For example:
.
.
.
i 511 check 2331 sum[i] 2331
i 512 check 2331 sum[i] 2331
i 513 check 2331 sum[i] 2188
a.out: test.cu:87: int main(): Assertion `check == sum[i]' failed.
There is a race condition in the kernel myadd: sum[blockIdx.x] must be set to 0 exactly once, and it must not be set to 0 after other threads have already added their values to it.
__global__ void myadd(const int *in, int *sum) {
if(threadIdx.x == 0){
sum[blockIdx.x] = 0;
}
__syncthreads(); // all threads wait until sum is initialized with 0
atomicAdd_block(&sum[blockIdx.x], in[threadIdx.x]);
}
If you want to time your code properly, you should remove the -G compiler flag.
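For example, a release-style build (simply dropping -g -G from the command line shown in the question) might look like:
nvcc test.cu -arch=sm_70 -O3 -DGRID=65535 -DVECSIZE=1024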
I want to run the following simple code on two GPUs simultaneously. Here I have an array A[i] = [0 1 2 3 4 5 6 7 8 9] and want to calculate C[i] = A[i+1] + A[i] + A[i-1]. This is the result my code produces: C[i] = [1 3 6 9 7 11 18 21 24 17]. The values 7 and 11 (C[4] and C[5], at the boundary between the two devices) are wrong. For two devices, C[4] from device 1 needs access to A[5] from device 2. How can I do this in the simplest way?
Programming is not my area of expertise, and I am supposed to use multiple GPUs to solve a PDE. So I really appreciate any help in modifying this code for my problem.
Thank you.
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include<time.h>
__global__ void iKernel(float *A, float *C, const int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) C[i] = A[i-1] + A[i] + A[i+1];
}
int main(int argc, char **argv)
{
int ngpus;
printf("> starting %s", argv[0]);
cudaGetDeviceCount(&ngpus);
printf(" CUDA-capable devices: %i\n", ngpus);
ngpus = 2;
int size = 10;
int iSize = size / ngpus;
size_t iBytes = iSize * sizeof(float);
printf("> total array size %d M, using %d devices with each device "
"handling %d M\n", size / 1024 / 1024, ngpus, iSize / 1024 / 1024);
// allocate device memory
float **d_A = (float **)malloc(sizeof(float *) * ngpus);
float **d_C = (float **)malloc(sizeof(float *) * ngpus);
float **h_A = (float **)malloc(sizeof(float *) * ngpus);
float **gpuRef = (float **)malloc(sizeof(float *) * ngpus);
cudaStream_t *stream = (cudaStream_t *)malloc(sizeof(cudaStream_t) * ngpus);
for (int i = 0; i < ngpus; i++){
// set current device
cudaSetDevice(i);
// allocate device memory
cudaMalloc((void **)&d_A[i], iBytes);
cudaMalloc((void **)&d_C[i], iBytes);
// allocate page locked host memory for asynchronous data transfer
cudaMallocHost((void **)&h_A[i], iBytes);
cudaMallocHost((void **)&gpuRef[i], iBytes);
// create streams for timing and synchronizing
cudaStreamCreate(&stream[i]);
}
dim3 block(512);
dim3 grid((iSize + block.x - 1) / block.x);
//h_A[ngpus][index]
for (int i = 0; i < ngpus; i++){
cudaSetDevice(i);
for (int j = 0; j < iSize; j++){
h_A[i][j] = j + i*iSize;
printf("%d %d %d %0.8f \n", i,j,iSize, h_A[i][j]);
}
}
// record start time
double iStart = clock();
// distributing the workload across multiple devices
for (int i = 0; i < ngpus; i++){
cudaSetDevice(i);
cudaMemcpyAsync(d_A[i], h_A[i], iBytes, cudaMemcpyHostToDevice, stream[i]);
iKernel << <grid, block, 0, stream[i] >> >(d_A[i], d_C[i], iSize);
cudaMemcpyAsync(gpuRef[i], d_C[i], iBytes, cudaMemcpyDeviceToHost,
stream[i]);
}
// synchronize streams
for (int i = 0; i < ngpus; i++){
cudaSetDevice(i);
cudaStreamSynchronize(stream[i]);
}
for (int i = 0; i < ngpus; i++){
for (int j = 0; j < iSize; j++){
printf("%d %d %0.8f \n", i,j,gpuRef[i][j]);
}
}
return EXIT_SUCCESS;
}
You have to upload the overlap regions to both devices. You can't (easily) read values from another device, so you have to duplicate and pad at least some of the input values as required. iSize elements are clearly not enough input when each device accesses iSize + 2 different input values.
If this were a multi pass algorithm, you would need to explicitly perform a copy of relevant regions in between passes.
Try modeling data dependencies formally on paper when attempting to target multi GPU systems.
Both GPUs can access memory allocated with cudaMallocHost, but it's usually not advisable to use that memory type as performance over PCIe bus is pretty bad compared to device local memory. There is also driver managed memory, but that isn't suited for two GPUs sharing the same active working set either.
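As a minimal sketch of the overlap idea (hypothetical changes to the question's own variables; each device receives iSize + 2 padded input elements so that A[i-1] and A[i+1] are always resident locally):
// pad each device's input with one halo element on each side (hypothetical sketch)
size_t padBytes = (iSize + 2) * sizeof(float);
for (int i = 0; i < ngpus; i++){
    cudaSetDevice(i);
    cudaMalloc((void **)&d_A[i], padBytes);
    cudaMemset(d_A[i], 0, padBytes);   // zero halos cover the global boundaries
}
for (int i = 0; i < ngpus; i++){
    cudaSetDevice(i);
    // interior values go one element past the start, leaving room for the left halo
    cudaMemcpyAsync(d_A[i] + 1, h_A[i], iBytes, cudaMemcpyHostToDevice, stream[i]);
    if (i > 0)           // left halo: last element of the previous device's chunk
        cudaMemcpyAsync(d_A[i], &h_A[i - 1][iSize - 1], sizeof(float), cudaMemcpyHostToDevice, stream[i]);
    if (i < ngpus - 1)   // right halo: first element of the next device's chunk
        cudaMemcpyAsync(d_A[i] + iSize + 1, &h_A[i + 1][0], sizeof(float), cudaMemcpyHostToDevice, stream[i]);
    // pass the shifted pointer so that A[i-1] and A[i+1] stay inside the padded allocation
    iKernel<<<grid, block, 0, stream[i]>>>(d_A[i] + 1, d_C[i], iSize);
    cudaMemcpyAsync(gpuRef[i], d_C[i], iBytes, cudaMemcpyDeviceToHost, stream[i]);
}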
I am a beginner at CUDA programming and I have a big problem with my code.
The following code is a sample from NVIDIA that I changed a little, expecting the GPU version to be much faster than the CPU version. However, after compiling and running this code, I got an unexpected result: the CPU version is much faster than the GPU version.
This is my laptop's GPU info.
This is my CUDA code for Visual Studio 2017.
===========================================================================
// headers needed for the fragments below (printf/cout, QueryPerformanceCounter, CUDA runtime)
#include <cstdio>
#include <iostream>
#include <windows.h>
#include <cuda_runtime.h>
using namespace std;
#define N 10
This is add2 function() from GPU process
__global__ void add2(int *a, int *b, int *c) {
// GPU block from grid sector
//int tid = blockIdx.x; // indexing by block: with a small N this is slower than the CPU, but with a large N it is much faster than the CPU
// GPU thread
//int tid = threadIdx.x; // Same result as blockIdx.x
// GPU unexpected vector // Same result as above
int tid = threadIdx.x + blockIdx.x*blockDim.x;
if (tid < N) {
c[tid] = a[tid] + b[tid];
}
}
This is add function() from CPU process
void add(int *a, int *b, int *c) {
int tid = 0;
while (tid < N) {
c[tid] = a[tid] + b[tid];
tid += 1;
}
}
This is Main function()
int main() {
// Values for time duration
LARGE_INTEGER tFreq, tStart, tEnd;
cudaEvent_t start, stop;
float tms, ms;
int a[N], b[N], c[N]; // CPU values
int *dev_a, *dev_b, *dev_c; // GPU values----------------------------------------------
// Creating alloc for GPU--------------------------------------------------------------
cudaMalloc((void**)&dev_a, N * sizeof(int));
cudaMalloc((void**)&dev_b, N * sizeof(int));
cudaMalloc((void**)&dev_c, N * sizeof(int));
// Fill 'a' and 'b' from CPU
for (int i = 0; i < N; i++) {
a[i] = -i;
b[i] = i * i;
}
// Copy values of CPU to GPU values----------------------------------------------------
cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
//////////////////////////////////////
QueryPerformanceFrequency(&tFreq); // Frequency set
QueryPerformanceCounter(&tStart); // Time count Start
// CPU operation
add(a, b, c);
//////////////////////////////////////
QueryPerformanceCounter(&tEnd); // TIme count End
tms = ((tEnd.QuadPart - tStart.QuadPart) / (float)tFreq.QuadPart) * 1000;
//////////////////////////////////////
// show result of CPU
cout << fixed;
cout.precision(10);
cout << "CPU Time=" << tms << endl << endl;
for (int i = 0; i < N; i++) {
printf("CPU calculate = %d + %d = %d\n", a[i], b[i], c[i]);
}
cout << endl;
///////////////////////////////////////
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// GPU operatinog---------------------------------------------------------------------
//add2<<<N, 1>>>(dev_a, dev_b, dev_c); // block
//add2<<<1, N>>>(dev_a, dev_b, dev_c); // Thread
add2<<<N / 32 + 1, 32>>>(dev_a, dev_b, dev_c); // grid
///////////////////////////////////////
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
///////////////////////////////////////
// show result of GPU
cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
cout << fixed;
cout.precision(10);
cout << "GPU Time=" << ms << endl << endl;
for (int i = 0; i < N; i++) {
printf("GPU calculate = %d + %d = %d\n", a[i], b[i], c[i]);
}
//Free GPU values
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
This is the result of compiling and running this code.
I want to make the GPU process much faster than the CPU process.
The GPU is generally slower than the CPU for running a single operation. Additionally, it takes time to send data to the GPU and read it back again.
The advantage of the GPU is it can execute many operations in parallel.
As you have defined N to be 10 it probably takes longer to upload and download the data than to execute on the CPU. In order to see the advantage of the GPU increase your problem size to something much larger. Ideally you want to execute a minimum of a few operations on each GPU core before you start seeing some benefit. For example with your GPU's 1280 cores you would want to execute something like 4000 operations or more at once to get the benefit of the GPU.
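As a minimal, self-contained sketch (assumed names and values, not the question's exact code) of what a problem size large enough to benefit the GPU might look like:
#include <cstdio>
#include <cuda_runtime.h>

#define N (1 << 20)   // about a million elements instead of 10

__global__ void add2(const int *a, const int *b, int *c) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) c[tid] = a[tid] + b[tid];
}

int main() {
    // heap-allocate the host arrays; int a[N] on the stack would overflow for large N
    int *a = (int*)malloc(N * sizeof(int));
    int *b = (int*)malloc(N * sizeof(int));
    int *c = (int*)malloc(N * sizeof(int));
    for (int i = 0; i < N; i++) { a[i] = i; b[i] = 2 * i; }

    int *dev_a, *dev_b, *dev_c;
    cudaMalloc(&dev_a, N * sizeof(int));
    cudaMalloc(&dev_b, N * sizeof(int));
    cudaMalloc(&dev_c, N * sizeof(int));
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);

    // enough blocks of 256 threads to cover all N elements
    add2<<<(N + 255) / 256, 256>>>(dev_a, dev_b, dev_c);
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    printf("c[123] = %d (expected %d)\n", c[123], 3 * 123);
    cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
    free(a); free(b); free(c);
    return 0;
}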
I just started learning CUDA and I have been looking at examples on NVIDIA's website. Specifically, I have implemented the non-shared version of the matrix multiply (the first sample is the non-shared version even though it is in the shared memory section):
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory
I am having a problem with the output when I change the block sizes. NVIDIA's code has a default block size of 16 and this gives me the correct output when I multiply two matrices. However, if I change the block size to anything above 16 (while still being a multiple of 16), I get an output of zero for all elements in the matrix. I tested this on my laptop too and noticed the same results for anything over 32 rather than 16. Could someone explain what is happening? I have two 9800GTX+ video cards in SLI and so I should have a maximum block size of (512,512,1). Why can I only do 16?
Also, I am noticing the same behavior in the shared version of the matrix multiplication (also on the NVIDIA page).
I didn't post my code because I get the same problem if I directly copy the code from the NVIDIA site.
I would really appreciate any help with this or with resources to learn more about these kinds of CUDA details.
Thank you!
I have attached the code as requested:
#include "stdio.h"
#include <cuda.h>
#include <assert.h>
#include <time.h>
#include <math.h>
// This is an example CUDA program that compares the timings of a matrix multiplication.
// The comparisons are between the CPU, GPU, and the GPU with shared memory.
#define BLOCK_SIZE 32
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
typedef void (*FuncPtr)(Matrix& A, Matrix& B, Matrix& C);
void multiplyMatrix(Matrix& A, Matrix& B, Matrix& C);
// Helper declarations
void initializeMatrix(Matrix& A, int rows, int cols, float val);
void copyMatrix(Matrix& dest, Matrix& src);
void freeMatrix(Matrix& A);
void printError(cudaError_t err);
void printMat(Matrix& A);
void setVal(Matrix& A, float val);
double applyMultFunc(FuncPtr func, Matrix& A, Matrix& B, Matrix& C, int numOfIters);
// CUDA declarations
__global__ void cudaMultMat(Matrix A, Matrix B, Matrix C);
int main() {
printf("Beginning Matrix Multiplication Comparison\n");
// Initialize matrix
Matrix A, B, C;
int rowsA = 32;
int colsA = 32;
int colsB = 32;
initializeMatrix(A, rowsA, colsA, 5.0f);
initializeMatrix(B, colsA, colsB, 2.0f);
initializeMatrix(C, rowsA, colsB, 0.0f);
// C = A * B using CPU, GPU, and GPU with shared memory
FuncPtr gpuMatMult = &multiplyMatrix;
int numOfIterations = 100;
double multTime = applyMultFunc(gpuMatMult, A, B, C, numOfIterations);
printMat(C);
// Update user
printf("Normal Mat Mult Time: %f\n", multTime);
// Cleanup
freeMatrix(A);
freeMatrix(B);
freeMatrix(C);
printf("\nPress Enter to continue...\n");
getchar();
return 0;
}
void multiplyMatrix(Matrix& A, Matrix& B, Matrix& C) {
// Initialize device matrices
Matrix deviceA, deviceB, deviceC;
copyMatrix(deviceA, A);
copyMatrix(deviceB, B);
copyMatrix(deviceC, C);
// Initialize number of blocks and threads
dim3 numOfThreadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
int xSize = (C.width + numOfThreadsPerBlock.x - 1) / numOfThreadsPerBlock.x;
int ySize = (C.height + numOfThreadsPerBlock.y - 1) / numOfThreadsPerBlock.y;
dim3 numOfBlocks(xSize, ySize);
// Call CUDA kernel
cudaMultMat<<<numOfBlocks, numOfThreadsPerBlock>>>(deviceA, deviceB, deviceC);
printError(cudaThreadSynchronize());
printError(cudaMemcpy(C.elements, deviceC.elements, C.height * C.width * sizeof(float), cudaMemcpyDeviceToHost));
// Free cuda memory
printError(cudaFree(deviceA.elements));
printError(cudaFree(deviceB.elements));
printError(cudaFree(deviceC.elements));
}
// CUDA definitions
// GPU matrix multiplication (non-shared memory)
__global__ void cudaMultMat(Matrix A, Matrix B, Matrix C) {
// If the matrices are of the wrong size then return
if(A.width != B.height) {
return;
}
// Initialize the indexes into the grid
int col = (blockDim.x * blockIdx.x) + threadIdx.x;
int row = (blockDim.y * blockIdx.y) + threadIdx.y;
// Initialize the result
float cVal = 0.0f;
// Find the result for the dot product of a row of A and a column of B
for(int i = 0; i < A.width; i++) {
cVal += A.elements[row * A.width + i] * B.elements[i * B.width + col];
}
// If we are in bounds then save the result
if(row < C.height && col < C.width) {
C.elements[row * C.width + col] = cVal;
}
}
// Helper functions
void initializeMatrix(Matrix& A, int rows, int cols, float val) {
A.width = cols;
A.height = rows;
A.stride = A.width;
int numOfElements = A.width * A.height;
A.elements = (float*) malloc(numOfElements * sizeof(float));
for(int i = 0; i < numOfElements; i++) {
A.elements[i] = val;
}
}
void copyMatrix(Matrix& dest, Matrix& src) {
dest.width = src.width;
dest.height = src.height;
dest.stride = src.stride;
int size = src.width * src.height * sizeof(float);
printError(cudaMalloc(&dest.elements, size));
printError(cudaMemcpy(dest.elements, src.elements, size, cudaMemcpyHostToDevice));
}
void freeMatrix(Matrix& A) {
free(A.elements);
}
void printError(cudaError_t err) {
if(err != 0) {
printf("CUDA ERROR: %s\n", cudaGetErrorString(err));
getchar();
}
}
void printMat(Matrix& A) {
printf("*********************************\n");
for(int i = 0; i < A.height; i++) {
for(int j = 0; j < A.width; j++) {
int index = i * A.width + j;
printf("%2.1f, ", A.elements[index]);
}
printf("\n");
}
}
void setVal(Matrix& A, float val) {
for(int i = 0; i < A.width * A.height; i++) {
A.elements[i] = val;
}
}
double applyMultFunc(FuncPtr func, Matrix& A, Matrix& B, Matrix& C, int numOfIters) {
clock_t startTime = clock();
for(int i = 0; i < numOfIters; i++) {
func(A, B, C);
}
clock_t endTime = clock();
return (double) (endTime - startTime) / CLOCKS_PER_SEC;
}
You're exceeding the threads per block specification of your GPU when you increase the block sizes.
The 9800GTX has a limit of 512 threads per block, regardless of how you create the block. 16*16 = 256 which is OK. 32 x 32 = 1024 which is not OK. In this case the kernel fails to run and so the output is not correct.
Your laptop probably has a newer GPU which supports 1024 threads per block, so 32 x 32 is OK but anything larger is not.
If you add proper CUDA error checking to the code you can confirm this. Note that this code appears to have CUDA error checking, but the checking implemented on the kernel call is incomplete. Study the link I gave and you will see the difference. If you modify the code with complete error checking, you will see the error.
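For example, a sketch of the more complete pattern, reusing the question's own printError helper around the kernel launch:
cudaMultMat<<<numOfBlocks, numOfThreadsPerBlock>>>(deviceA, deviceB, deviceC);
printError(cudaGetLastError());        // catches launch failures such as an invalid block configuration
printError(cudaDeviceSynchronize());   // catches errors that occur while the kernel executes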
If your GPU's compute capability is 1.x, you can have at most 512 threads per block. On newer devices (compute capability 2.0 and above), every block can have at most 1024 threads.
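If in doubt, you can query the limit at run time with the runtime API, for example:
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);                              // properties of device 0
printf("max threads per block: %d\n", prop.maxThreadsPerBlock);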
I have written a small program in CUDA that counts how many 3's there are in a C array and prints the count.
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cstdlib>
__global__ void incrementArrayOnDevice(int *a, int N, int *count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
//__shared__ int s_a[512]; // one for each thread
//s_a[threadIdx.x] = a[id];
if( id < N )
{
//if( s_a[threadIdx.x] == 3 )
if( a[id] == 3 )
{
atomicAdd(count, 1);
}
}
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
int N = 16777216;
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
// copy data from host to device
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
// do calculation on device
int blockSize = 512;
int nBlocks = N / blockSize + (N % blockSize == 0 ? 0 : 1);
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", count);
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
The result I get is:
real 0m3.025s
user 0m2.989s
sys 0m0.029s
When I run it on the CPU with 4 threads I get:
real 0m0.101s
user 0m0.100s
sys 0m0.024s
Note that the GPU is an old one - I don't know the exact model because I do not have root access to it, but the OpenGL version it runs is 1.2 using the MESA driver.
Am I doing something wrong? What can I do to make it run faster?
Note: I have tried using buckets for each block (so the atomicAdd()s would be reduced for each one) but I get exactly the same performance.
I have also tried copying the 512 integers assigned to each block into shared memory (you can see it in the comments), and the time is again the same.
This is in response to your question "What can I do to make it run faster?" As I mentioned in the comments, there are issues (probably) with the timing methodology, and the main suggestion I have for speed improvement is to use a "classical parallel reduction" algorithm. The following code implements a better (in my opinion) timing measurement, and also converts your kernel to a reduction style kernel:
#include <stdio.h>
#include <assert.h>
#include <cstdlib>
#define N (1<<24)
#define nTPB 512
#define NBLOCKS 32
__global__ void incrementArrayOnDevice(int *a, int n, int *count)
{
__shared__ int lcnt[nTPB];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int lcount = 0;
while (id < n) {
if (a[id] == 3) lcount++;
id += gridDim.x * blockDim.x;
}
lcnt[threadIdx.x] = lcount;
__syncthreads();
int stride = blockDim.x;
while(stride > 1) {
// assume blockDim.x is a power of 2
stride >>= 1;
if (threadIdx.x < stride) lcnt[threadIdx.x] += lcnt[threadIdx.x + stride];
__syncthreads();
}
if (threadIdx.x == 0) atomicAdd(count, lcnt[0]);
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
cudaEvent_t gstart1,gstart2,gstop1,gstop2,cstart,cstop;
float etg1, etg2, etc;
cudaEventCreate(&gstart1);
cudaEventCreate(&gstart2);
cudaEventCreate(&gstop1);
cudaEventCreate(&gstop2);
cudaEventCreate(&cstart);
cudaEventCreate(&cstop);
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
int blockSize = nTPB;
int nBlocks = NBLOCKS;
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
// copy data from host to device
cudaEventRecord(gstart1);
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMemset(devCount, 0, sizeof(int));
cudaEventRecord(gstart2);
// do calculation on device
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
cudaEventRecord(gstop2);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(gstop1);
printf("GPU count = %d\n", count);
int hostCount = 0;
cudaEventRecord(cstart);
for (int i=0; i < N; i++)
if (a_h[i] == 3) hostCount++;
cudaEventRecord(cstop);
printf("CPU count = %d\n", hostCount);
cudaEventSynchronize(cstop);
cudaEventElapsedTime(&etg1, gstart1, gstop1);
cudaEventElapsedTime(&etg2, gstart2, gstop2);
cudaEventElapsedTime(&etc, cstart, cstop);
printf("GPU total time = %fs\n", (etg1/(float)1000) );
printf("GPU compute time = %fs\n", (etg2/(float)1000));
printf("CPU time = %fs\n", (etc/(float)1000));
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
When I run this on a reasonably fast GPU (a Quadro 5000, a little slower than a Tesla M2050) I get the following:
number of blocks: 32
GPU count = 5592406
CPU count = 5592406
GPU total time = 0.025714s
GPU compute time = 0.000793s
CPU time = 0.017332s
We see that the GPU is substantially faster than this (naive, single-threaded) CPU implementation for the compute portion. When we add in the cost to transfer the data, the GPU version is slower but is not 30x slower.
By way of comparison, when I timed your original algorithm, I got numbers like this:
GPU total time = 0.118131s
GPU compute time = 0.093213s
My system config for this was Xeon X5560 CPU, RHEL 5.5, CUDA 5.0, Quadro5000 GPU.