Related
I have like 3 different size of matrices and want to transpose them parallel.
Firstly I put these in a 2D array using malloc and then use cudaMalloc to transfer array from host(h_B) to device (d_B).
Using threadIdx to find each address of matrix in the array. The cublas function is used.
Here are my code.
The code can be compiled but I cannot get result. It seems that in global function float *A = new float[m*n] is not a good way.
Dose anyone have ideas of this?
Thanks so much!
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include<iostream>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
/* Includes, cuda helper functions */
#include <helper_cuda.h>
__global__ void transposeCublasSgeam(int *M_A, int *N_A, float *ptrA, float *ptrC, const int N, int *address)
{
cublasHandle_t cnpHandle;
cublasStatus_t status = cublasCreate(&cnpHandle);
if (status != CUBLAS_STATUS_SUCCESS)
{
return;
}
const float d_alpha = 1.0f;
const float d_beta = 0.0f;
int idx = threadIdx.x;
if(idx<N){
int m = M_A[idx]; //A_row
int n = N_A[idx]; //A_col
float *A = new float[m*n];
float *C = new float[m*n];
A = ptrA+address[idx];
C = ptrC+address[idx];
cublasSgeam(cnpHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &d_alpha, (const float*)A, n, &d_beta, (const float *)A, n, C, m);
delete[] A;
delete[] C;
}
cublasDestroy(cnpHandle);
}
int main()
{
const int N = 3;
int M_B[N] = { 2,3,2 }; //row number of matrices
int N_B[N] = { 3,2,4 }; //col number of matrices
float a[6] = { 1,2,3,
4,5,6 };
float b[6] = { 1,2,
3,4,
5,6};
float c[8] = { 1,2,3,1,
2,3,4,5 };
float **h_B = (float**)malloc(N * sizeof(float*));
float **h_BT = (float**)malloc(N * sizeof(float*));
h_B[0] = a, h_BT[0] = a;
h_B[1] = b, h_BT[1] = b;
h_B[2] = c, h_BT[2] = c;
int NUM_B = 20; // total number of elements
int address[] = {0,6,12};
float *d_B, *d_BT;
checkCudaErrors(cudaMalloc((void **)&d_B, NUM_B * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_BT, NUM_B * sizeof(float)));
checkCudaErrors(cudaMemcpy(d_B, h_B, NUM_B * sizeof(float), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_BT, h_BT, NUM_B * sizeof(float), cudaMemcpyHostToDevice));
transposeCublasSgeam<<<1,N>>>(M_B, N_B, d_B,d_BT, N,address);
checkCudaErrors(cudaMemcpy(h_BT, d_BT, NUM_B * sizeof(float), cudaMemcpyDeviceToHost));
cudaFree(d_B);
cudaFree(d_BT);
delete[] h_B;
delete[] h_BT;
return 0;
}
There were a number of errors in your code. I will probably miss some in my description.
Note that this cublas-in-device-code functionality is no longer available in newer CUDA versions.
Every pointer that is passed to device code needs an allocation with cudaMalloc. You had done cudaMalloc for a few pointers, but not all of them.
You're confused about pointers and arrays of pointers. I won't be able to sort all of that out for you. Your kernel design really doesn't need the complexity of using arrays of pointers. So I've removed all that.
In CUDA dynamic parallelism (CDP), pointers to the local address space cannot be passed to child kernels. You can't use alpha and beta in the local address space, and pass pointers to those to CUBLAS in CDP.
To do a pure transpose, study the CUBLAS Sgeam documentation for the recommended parameters to use.
I believe there were other things I fixed. Please study this example:
$ cat t1433.cu
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include<iostream>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
/* Includes, cuda helper functions */
#include <helper_cuda.h>
__global__ void transposeCublasSgeam(int *M_A, int *N_A, float *ptrA, float *ptrC, const int N, int *address)
{
cublasHandle_t cnpHandle;
cublasStatus_t status = cublasCreate(&cnpHandle);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("thread: %d, error1: %d\n", threadIdx.x, (int)status);
return;
}
float *d_alpha = new float; // a pointer to device-heap, not local memory
*d_alpha = 1.0f;
float *d_beta = new float;
*d_beta = 0.0f;
int idx = threadIdx.x;
if(idx<N){
int m = M_A[idx]; //A_row
int n = N_A[idx]; //A_col
status = cublasSgeam(cnpHandle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, d_alpha, ptrA+address[idx], n, d_beta, ptrC+address[idx], m, ptrC+address[idx], m);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("thread: %d, error2: %d\n", threadIdx.x, (int)status);
return;
}
}
cublasDestroy(cnpHandle);
}
int main()
{
const int N = 3;
int M_B[N] = { 2,3,2 }; //row number of matrices
int N_B[N] = { 3,2,4 }; //col number of matrices
float a[6] = { 1,2,3,
4,5,6 };
float b[6] = { 1,2,
3,4,
5,6};
float c[8] = { 1,2,3,1,
2,3,4,5 };
float *h_Bdata = (float *)malloc(sizeof(a)+sizeof(b)+sizeof(c));
float *h_BTdata = (float *)malloc(sizeof(a)+sizeof(b)+sizeof(c));
memcpy(h_Bdata, a, sizeof(a));
memcpy(h_Bdata+(sizeof(a)/sizeof(a[0])), b, sizeof(b));
memcpy(h_Bdata+(sizeof(a)/sizeof(a[0]))+(sizeof(b)/sizeof(b[0])), c, sizeof(c));
int NUM_B = 20; // total number of elements
int address[] = {0,6,12};
int *d_address;
cudaMalloc(&d_address, sizeof(address));
cudaMemcpy(d_address, address, sizeof(address), cudaMemcpyHostToDevice);
int *d_M_B, *d_N_B;
cudaMalloc(&d_M_B, sizeof(M_B));
cudaMalloc(&d_N_B, sizeof(N_B));
cudaMemcpy(d_M_B, M_B, sizeof(M_B), cudaMemcpyHostToDevice);
cudaMemcpy(d_N_B, N_B, sizeof(N_B), cudaMemcpyHostToDevice);
float *d_B, *d_BT;
checkCudaErrors(cudaMalloc((void **)&d_B, NUM_B * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_BT, NUM_B * sizeof(float)));
checkCudaErrors(cudaMemcpy(d_B, h_Bdata, NUM_B * sizeof(float), cudaMemcpyHostToDevice));
transposeCublasSgeam<<<1,N>>>(d_M_B, d_N_B, d_B,d_BT, N,d_address);
checkCudaErrors(cudaMemcpy(h_BTdata, d_BT, NUM_B * sizeof(float), cudaMemcpyDeviceToHost));
std::cout << "B , BT" << std::endl;
for (int i = 0; i < NUM_B; i++){
std::cout << h_Bdata[i] << " , " << h_BTdata[i] << std::endl;}
cudaFree(d_B);
cudaFree(d_BT);
return 0;
}
$ /usr/local/cuda-8.0/bin/nvcc -I/usr/local/cuda-8.0/samples/common/inc t1433.cu -rdc=true -lcublas_device -lcudadevrt -arch=sm_35 -o t1433
$ LD_LIBRARY_PATH=/usr/local/cuda-8.0/lib64 CUDA_VISIBLE_DEVICES="3" cuda-memcheck ./t1433
========= CUDA-MEMCHECK
B , BT
1 , 1
2 , 4
3 , 2
4 , 5
5 , 3
6 , 6
1 , 1
2 , 3
3 , 5
4 , 2
5 , 4
6 , 6
1 , 1
2 , 2
3 , 2
1 , 3
2 , 3
3 , 4
4 , 1
5 , 5
========= ERROR SUMMARY: 0 errors
$
Are there any way to suppress "<<< >>>" error with vscode-cpptools.
I associate "*.cu" with "cpp" in setting.json.
// use normal c++ syntax highlighting for CUDA files
"files.associations": {"*.cu": "cpp"},
and work fine except of one problem, kernel execution configuration parameters surrounded by <<< and >>> mistaked as error expected an expression
dim3 dimGrid(2, 2, 1);
dim3 dimBlock(width / 2, width / 2, 1);
MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, width);
Any suggestion
googling for a few hours, find no perfect solution but some workaround.
I summarize here:
use normal c++ syntax highlighting for CUDA files by edittingsetting.json
include necessary header of CUDA in program
include dummy header to workaround INTELLISENSE
Bellow is a concrete example
setting.json
"files.associations": {
"*.cu": "cpp",
"*.cuh": "cpp"
}
cudaDmy.cuh
#pragma once
#ifdef __INTELLISENSE__
void __syncthreads(); // workaround __syncthreads warning
#define KERNEL_ARG2(grid, block)
#define KERNEL_ARG3(grid, block, sh_mem)
#define KERNEL_ARG4(grid, block, sh_mem, stream)
#else
#define KERNEL_ARG2(grid, block) <<< grid, block >>>
#define KERNEL_ARG3(grid, block, sh_mem) <<< grid, block, sh_mem >>>
#define KERNEL_ARG4(grid, block, sh_mem, stream) <<< grid, block, sh_mem,
stream >>>
#endif
matrixMul.cu
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <device_functions.h>
#include <cuda_runtime_api.h>
#include "cudaDmy.cuh"
__global__ void MatrixMulKernel(float *M, float *N, float *P, int width)
{
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if (Row < width && Col < width)
{
float Pvalue = 0;
for (int i = 0; i < width; ++i)
{
Pvalue += M[Row * width + i] * N[width * i + Col];
}
P[Row * width + Col] = Pvalue;
}
}
void MatMul(float *M, float *N, float *P, int width)
{
float *d_M;
float *d_N;
float *d_P;
int size = width * width * sizeof(float);
cudaMalloc((void **)&d_M, size);
cudaMemcpy(d_M, M, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_N, size);
cudaMemcpy(d_N, N, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_P, size);
dim3 dimGrid(2, 2, 1);
dim3 dimBlock(width / 2, width / 2, 1);
// <<<>>> will replace macro KERNEL_ARG2 when compiling
MatrixMulKernel KERNEL_ARG2(dimGrid,dimBlock) (d_M, d_M, d_P, width);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
cudaFree(d_M);
cudaFree(d_N);
cudaFree(d_P);
}
int main()
{
int elem = 100;
float *M = new float[elem];
float *N = new float[elem];
float *P = new float[elem];
for (int i = 0; i < elem; ++i)
M[i] = i;
for (int i = 0; i < elem; ++i)
N[i] = i + elem;
time_t t1 = time(NULL);
MatMul(M, N, P, sqrt(elem));
time_t t2 = time(NULL);
double seconds = difftime(t2,t1);
printf ("%.3f seconds total time\n", seconds);
for (int i = 0; i < elem/1000000; ++i)
printf("%.1f\t", P[i]);
printf("\n");
delete[] M;
delete[] N;
delete[] P;
return 0;
}
Let's compile it with NVCC
nvcc matrixMul.cu -Xcudafe "--diag_suppress=unrecognized_pragma" -o runcuda
useful links:
https://devtalk.nvidia.com/default/topic/513485/cuda-programming-and-performance/__syncthreads-is-undefined-need-a-help/post/5189004/#5189004
https://stackoverflow.com/a/6182137/8037585
https://stackoverflow.com/a/27992604/8037585
https://gist.github.com/ruofeidu/df95ba27dfc6b77121b27fd4a6483426
You can just download the vscode-cudacpp extention and than in your workspace(<>.workspace) or user settings(.vscode/settings.json) enable this option:
"settings": {
"files.associations": {
"*.cu": "cuda",
"*.cuh": "cuda"
}
}
As sonulohani pointed out the cuda-cpp extension. It is good and it is the only extension available for CUDA. if you want autocomplete then try the CUDA-C++ package in sublime text editor. That provides excellent autocomplete features.
There is an official extension by NVIDIA named Nsight Visual Studio Code Edition
You could try and install it in your vscode.
I am working with cuda and cublas and I was trying to implement simple operations like matrix element-wise multiplication/division. I am using only float for my experiments. I know the most obvious way to do it is to write a kernel like this one:
__global__ void mul_elementwise(const unsigned int n, float* source, float* dest, const float value)
{
const unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int stride = blockDim.x * gridDim.x;
for (unsigned int i = offset; i < n; i += stride)
{
dest[i] = source[i] * value;
}
}
This kernel can work both for multiplication and division (just using 1/x as value). But this can be achieved using cublas library too: suppose we have a matrix A m x n stored in column-major style and a scalar x, then setting alpha = x or alpha = 1/x and d_ones as a vector of m*n 1s, we can invoke and obtain the same result
cublasSaxpy(cublas_handle, m * n, &alpha, d_ones, 1, A_dev, 1);
Both methods work just fine, but I am facing few problems with some particular matrix, for which both methods do no work. I isolated this big matrix and build a MCVE available here (you can compile it with nvcc mcve.cu -lcublas. As you can see the results in both cases are totally wrong: host result is totally different, I am trying to figure out what's going on. I do not see any error in code but maybe i should try to use double instead of float and see what happens.
Any opinions about this situation? Thanks in advance!
EDIT #1 I tried using doubles but nothing changes if I use cublasDaxpy meanwhile it works perfectly with the custom kernel. I think the values are too small so single floating point precision is not enough.
Interesting MCVE. Wouldn't it have been possible to shrink your vector down to just a few elements? Isn't it possible to show the calculation discrepancy based on just 1 vector element?
Anyway I see several problems.
Your kernel implements the following function: y=alpha*x. But SAXPY implements y=alpha*x+y. Now, if y started out as (all) zero, then these two would be the same. But that's not what you have:
CUBLAS Your Kernel
---------------------------
alpha: alpha alpha
x: 1 ahost (ahost is your huge data array)
y: ahost -
So your kernel is computing y=alpha * ahost, but your CUBLAS call is computing y = alpha*1 + ahost. I wouldn't expect the same result from these, in general.
Your analysis of error seems flawed in a few ways. First, you are computing the absolute error in a float variable (a number which will always be positive, since it's the absolute value), but then you're comparing it against a negative number:
float diff = abs(host[i]-dev[i]);
...
if (diff > (-1e12))
won't that if test always be true? Perhaps you meant 1e-12 although that would still be flawed. Looking for a fixed error threshold on a floating point comparison should be scaled to the size of the numbers being compared. float quantities only contain about 6-7 accurate decimal digits. (And summing these errors is also troublesome.)
Here is a complete code that has the above issues fixed, and produces zero sum error for all the comparisons (host<->kernel and host<->cublas):
static float array[] = {0x00000000,
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0xB58DA1CF,0xB50D2FEC,0x34A48536,0xB4A1D5BC,0x358E1345,0x35943AAC,0xB5983F40,0xB43628BB,0xB4A95348,0xB4DB751C,0xB50C8D1A,0xB3EFCBB5,0x3552B8CD,0x3538A167,0x358FDE0D,0xB4D54CE9,0xB5D29BB7,0xB4A234EE,0x346EF2F4,0x35B5D9F2,0xB40F1487,0x3554BC20,0x33FD9466,0xB536D37D,0xB3C2E594,0xB59DA581,0x3584FC87,0x34438F09,0x35D293CB,0xB4FBB002,0xB59F41E9};
#include <iostream>
#include <stdio.h>
#include <cublas_v2.h>
#include <assert.h>
#define TOL 0.0001
typedef unsigned int u32;
#define GET_STRIDE() u32(blockDim.x * gridDim.x)
#define GET_OFFSET() u32(blockIdx.x * blockDim.x + threadIdx.x)
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
__global__ void div_elementwise(const u32 n, float* source, float* dest, const float value)
{
for (u32 i = GET_OFFSET(); i < n; i += GET_STRIDE())
{
dest[i] = source[i] * value;
}
}
float check_eq(float* dev, float* host, u32 len)
{
float sum = 0.0f;
for (u32 i = 0; i < len; ++i)
{
if (dev[i]!=host[i])
{
//printf("diff %d %f %f\n", i, dev[i], host[i]);
//break;
float diff = abs((host[i]-dev[i])/host[i]);
sum += diff;
if (diff > (TOL))
printf("diff %d %f\n", i, diff);
}
}
printf("%f\n", sum);
return sum;
}
void div_host(float* a, float v, u32 len)
{
for (u32 i = 0; i < len; ++i)
{
a[i]=a[i]*v;
}
}
int main()
{
u32 len = sizeof(array)/sizeof(float);
printf("array len = %d\n", len);
for (int i =0; i < len; i++) if (isnan(array[i])) {printf("nan value at %d\n",i); return -1;}
float* adev, *adevcublas, *d_zero;
float* ahost = (float*) malloc(len * sizeof(float));
checkCuda(cudaMalloc(&adev, len * sizeof(float)));
checkCuda(cudaMalloc(&adevcublas, len * sizeof(float)));
checkCuda(cudaMalloc(&d_zero, len * sizeof(float)));
memcpy(ahost, &array[0], len * sizeof(float));
checkCuda(cudaMemcpy(adev, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(adevcublas, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemset(d_zero, 0, len*sizeof(float)));
float alpha = 1/2494.f;
printf("%f\n", alpha);
div_host(ahost, alpha, len);
u32 tb = 256;
div_elementwise<<<((len + tb - 1) / tb),tb>>>(len, adev, adev, alpha);
float* r = (float*) malloc(len * sizeof(float));
checkCuda(cudaMemcpy(r, adev, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r,ahost,len);
cublasHandle_t ch;
cublasCreate(&ch);
float* r0 = (float*) malloc(len * sizeof(float));
cublasStatus_t stat = cublasSaxpy(ch, len, &alpha, adevcublas, 1, d_zero, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
checkCuda(cudaMemcpy(r0, d_zero, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r0,ahost,len);
free(r);
free(r0);
free(ahost);
cudaFree(adev);
return 0;
}
Update!
My current code doesn't check for out of bounds memory access. When I run the cuda memcheck, it says memory access is bad even for matrices of just 2 by 2! I'm accessing memory where I shouldn't somehow and that's the problem!
To check for out of bounds memory access, run cuda-memcheck ./(insert executable here)
Shown below is my code for the matrix multiplication itself:
dim3 block(32,32);
dim3 grid( (n+31)/32, (n+31)/32 );
matrixMul<<<grid,block>>>(d_C, d_A, d_B, n, k);
kA and kB are matrices with values in them (they're all 2's to make it easier).
m, n, k are all the same number for my square matrices
kC is the matrix to store the answer.
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
__global__ void matrixMul(float *kC, float *kA, float *kB, int n, int k)
{
int tx = blockIdx.x * 32 + threadIdx.x;
int ty = blockIdx.y * 32 + threadIdx.y;
float value = 0;
for (int i=0;i<n;i++)
{
float elementA=kA[ty*n+i];
float elementB=kB[i*k+tx];
value += elementA*elementB;
}
kC[ty*n+tx] = value;
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_
Based on how you are defining the grid of threads, you should add a thread check to the kernel code like this:
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
__global__ void matrixMul(float *kC, float *kA, float *kB, int n, int k)
{
int tx = blockIdx.x * 32 + threadIdx.x;
int ty = blockIdx.y * 32 + threadIdx.y;
if ((ty < n) && (tx < n)) { // add this line
float value = 0;
for (int i=0;i<n;i++)
{
float elementA=kA[ty*n+i];
float elementB=kB[i*k+tx];
value += elementA*elementB;
}
kC[ty*n+tx] = value;
} // add this line
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_
Otherwise threads outside the valid array array will corrupt your results. Things work for multiples of 32x32 because there are no invalid threads. In that case you're launching exactly the required number of threads. But in other cases you are launching extra threads. These extra threads, if allowed to compute an invalid matrix position, will corrupt the results.
I have written a small program in CUDA that counts how many 3's are in a C array and prints them.
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cstdlib>
__global__ void incrementArrayOnDevice(int *a, int N, int *count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
//__shared__ int s_a[512]; // one for each thread
//s_a[threadIdx.x] = a[id];
if( id < N )
{
//if( s_a[threadIdx.x] == 3 )
if( a[id] == 3 )
{
atomicAdd(count, 1);
}
}
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
int N = 16777216;
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
// copy data from host to device
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
// do calculation on device
int blockSize = 512;
int nBlocks = N / blockSize + (N % blockSize == 0 ? 0 : 1);
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", count);
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
The result I get is:
real 0m3.025s
user 0m2.989s
sys 0m0.029s
When I run it on the CPU with 4 threads I get:
real 0m0.101s
user 0m0.100s
sys 0m0.024s
Note that the GPU is an old one - I don't know the exact model because I do not have root access to it, but the OpenGL version it runs is 1.2 using the MESA driver.
Am I doing something wrong? What can I do to make it run faster?
Note: I have tried using buckets for each block (so the atomicAdd()s would be reduced for each one) but I get exactly the same performance.
I have also tried copying the 512 integers that are assigned to this block to a shared block of memory (you can see it in the comments) and the time is the same again.
This is in response to your question "What can I do to make it run faster?" As I mentioned in the comments, there are issues (probably) with the timing methodology, and the main suggestion I have for speed improvement is to use a "classical parallel reduction" algorithm. The following code implements a better (in my opinion) timing measurement, and also converts your kernel to a reduction style kernel:
#include <stdio.h>
#include <assert.h>
#include <cstdlib>
#define N (1<<24)
#define nTPB 512
#define NBLOCKS 32
__global__ void incrementArrayOnDevice(int *a, int n, int *count)
{
__shared__ int lcnt[nTPB];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int lcount = 0;
while (id < n) {
if (a[id] == 3) lcount++;
id += gridDim.x * blockDim.x;
}
lcnt[threadIdx.x] = lcount;
__syncthreads();
int stride = blockDim.x;
while(stride > 1) {
// assume blockDim.x is a power of 2
stride >>= 1;
if (threadIdx.x < stride) lcnt[threadIdx.x] += lcnt[threadIdx.x + stride];
__syncthreads();
}
if (threadIdx.x == 0) atomicAdd(count, lcnt[0]);
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
cudaEvent_t gstart1,gstart2,gstop1,gstop2,cstart,cstop;
float etg1, etg2, etc;
cudaEventCreate(&gstart1);
cudaEventCreate(&gstart2);
cudaEventCreate(&gstop1);
cudaEventCreate(&gstop2);
cudaEventCreate(&cstart);
cudaEventCreate(&cstop);
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
int blockSize = nTPB;
int nBlocks = NBLOCKS;
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
// copy data from host to device
cudaEventRecord(gstart1);
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMemset(devCount, 0, sizeof(int));
cudaEventRecord(gstart2);
// do calculation on device
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
cudaEventRecord(gstop2);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(gstop1);
printf("GPU count = %d\n", count);
int hostCount = 0;
cudaEventRecord(cstart);
for (int i=0; i < N; i++)
if (a_h[i] == 3) hostCount++;
cudaEventRecord(cstop);
printf("CPU count = %d\n", hostCount);
cudaEventSynchronize(cstop);
cudaEventElapsedTime(&etg1, gstart1, gstop1);
cudaEventElapsedTime(&etg2, gstart2, gstop2);
cudaEventElapsedTime(&etc, cstart, cstop);
printf("GPU total time = %fs\n", (etg1/(float)1000) );
printf("GPU compute time = %fs\n", (etg2/(float)1000));
printf("CPU time = %fs\n", (etc/(float)1000));
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
When I run this on a reasonably fast GPU (a Quadro 5000, a little slower than a Tesla M2050) I get the following:
number of blocks: 32
GPU count = 5592406
CPU count = 5592406
GPU total time = 0.025714s
GPU compute time = 0.000793s
CPU time = 0.017332s
We see that the GPU is substantially faster than this (naive, single-threaded) CPU implementation for the compute portion. When we add in the cost to transfer the data, the GPU version is slower but is not 30x slower.
By way of comparison, when I timed your original algorithm, I got numbers like this:
GPU total time = 0.118131s
GPU compute time = 0.093213s
My system config for this was Xeon X5560 CPU, RHEL 5.5, CUDA 5.0, Quadro5000 GPU.