I am new to GPU programming, specifically CUDA/C++. I have written a simple program that just uses atomicAdd to increase every element of an array by 1.
But the result shows only the first element of the array increased; the others stay the same. My code is as follows.
Thanks for any help in advance.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <fstream>
using namespace std;
__global__ void Histcount( int *a)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
{
atomicAdd(&a[i], 1);
}
}
int main()
{
int * hostarray = new int[20];
int * devarray;
cudaError_t error;
error=cudaMalloc(&devarray, sizeof(int) * 20);
for (int i = 0; i < 20; i++)
{
hostarray[i] = i ;
}
cudaMemcpy((int *)devarray, (int *)hostarray, sizeof(int) * 20, cudaMemcpyHostToDevice);
dim3 gs = (1, 1);
dim3 bs = (20, 1, 1);
Histcount <<<gs, bs >>> (devarray);
cudaMemcpy((int *)hostarray, (int *)devarray, sizeof(int) * 20, cudaMemcpyDeviceToHost);
for (int i = 0; i < 20; i++)
{
cout << hostarray[i]<<endl;
}
}
This is not a valid way to specify dim3 variables:
dim3 gs = (1, 1);
dim3 bs = (20, 1, 1);
In fact, the compiler may be throwing warnings on those lines, and if so, you should not ignore them.
You should do either:
dim3 gs = dim3(1, 1);
dim3 bs = dim3(20, 1, 1);
or:
dim3 gs(1, 1);
dim3 bs(20, 1, 1);
The problem with your implementation is that the compiler doesn't know your actual intent with an expression such as:
(20, 1, 1)
Written by itself as you have it, that expression is evaluated using the comma operator (and the compiler may issue a warning about it), so it yields 1, which is then assigned as a scalar to your dim3 variable. You therefore end up with a block size of 1 and a grid size of 1 (which was not your intent), and your code ran only 1 thread overall.
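As a quick illustration of the comma-operator behaviour (a minimal sketch, not taken from your program):

// The parenthesized list is evaluated with the comma operator and yields its last element.
int x = (20, 1, 1);      // x == 1
dim3 bad = (20, 1, 1);   // same as dim3 bad = 1; -> a 1x1x1 block
dim3 good(20, 1, 1);     // constructor call -> a 20x1x1 block, as intended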
I've built a fairly simple C program that reads a PGM image, splits it into sections and sends them to various cores to process.
To account for some processing margins (each core has to read a larger area of the image than the one it needs to write to), I can't simply split the image; I first have to create an array to which I add the aforementioned margins.
As a quick example: if an image is 1600x1200 (width x height), I have 2 cores, I want to access a 3x3 area centered on each pixel, and I'm splitting the image horizontal line by horizontal line, then the subdivision would be: the first core gets the pixels from 0 to 601*1600, the second core gets the pixels from 599*1600 to 1200*1600.
Now, I believe there is nothing wrong with how I implemented this in my program, yet I still get this error:
[ct1pt-tnode003:22389:0:22389] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x7ffe7f60ead8)
==== backtrace (tid: 22389) ====
0 0x000000000004ee05 ucs_debug_print_backtrace() ???:0
1 0x0000000000402624 main() ???:0
2 0x0000000000022505 __libc_start_main() ???:0
3 0x0000000000400d99 _start() ???:0
This is my code:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <math.h>
#include <time.h>
#include "testlibscatter.h"
#include <mpi.h>
#define MSGLEN 2048
int main(int argc, char *argv[]){
MPI_Init(&argc, &argv);
int m = atoi(argv[1]), n = atoi(argv[2]), kern_type = atoi(argv[3]);
double kernel[m*n];
int i_rank, ranks;
int param, symm;
MPI_Comm_rank( MPI_COMM_WORLD, &i_rank);
MPI_Comm_size( MPI_COMM_WORLD, &ranks);
int xsize, ysize, maxval;
xsize = 0;
ysize = 0;
maxval = 0;
void * ptr;
switch (kern_type){
case 1:
meankernel(m, n, kernel);
break;
case 2:
weightkernel(m, n, param, kernel);
break;
case 3:
gaussiankernel(m, n, param, symm, kernel);
break;
}
if (i_rank == 0){
read_pgm_image(&ptr, &maxval, &xsize, &ysize, "check_me2.pgm");
}
MPI_Bcast(&xsize, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&ysize, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&maxval, 1, MPI_INT, 0, MPI_COMM_WORLD);
int flo, start, end, i;
flo = floor(ysize/ranks);
int first, last;
first = start - (m - 1)/2;
last = end + (m - 1)/2;
if (start == 0){
first = 0;
}
if (end == ysize){
last = ysize;
}
int sendcounts[ranks];
int displs[ranks];
int first2[ranks];
int last2[ranks];
int c_start2[ranks];
int c_end2[ranks];
int num;
num = (ranks - 1) * (m-1);
printf("num is %d\n", num);
unsigned short int bigpic[xsize*(ysize + num)];
if (i_rank == 0){
for(i = 0; i < ranks; i++){
c_start2[i] = i * flo;
c_end2[i] = (i + 1) * flo;
if ( i == ranks - 1){
c_end2[i] = ysize;
}
first2[i] = c_start2[i] - (m - 1)/2;
last2[i] = c_end2[i] + (m - 1)/2;
if (c_start2[i] == 0){
first2[i] = 0;
}
if (c_end2[i] == ysize){
last2[i] = ysize;
}
sendcounts[i] = (last2[i] - first2[i]) * xsize;
}
int i, j, k, index, index_disp = 0;
index = 0;
displs[0] = 0;
for (k = 0; k < ranks; k++){
for (i = first2[k]*xsize; i < last2[k]*xsize; i++){
bigpic[index] = ((unsigned short int *)ptr)[i];
index++;
}
printf("%d\n", displs[index_disp]);
index_disp++;
displs[index_disp] = index;
}
}
MPI_Bcast(displs, ranks, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(sendcounts, ranks, MPI_INT, 0, MPI_COMM_WORLD);
unsigned short int minipic[xsize*(last-first)];
MPI_Barrier(MPI_COMM_WORLD);
MPI_Scatterv(&bigpic[0], sendcounts, displs, MPI_UNSIGNED_SHORT, minipic, (last-first)*xsize, MPI_UNSIGNED_SHORT, 0, MPI_COMM_WORLD);
MPI_Finalize();
}
The kernel functions simply fill an array of m*n doubles used to edit the image, while read_pgm_image returns, through a void pointer, the values of the image that was read.
I've tried printing the values of bigpic and they show no problem.
In the code shown here, start and end are used uninitialised in the computations of first and last:
int flo, start, end, i;
~~~~~~~~~~
flo = floor(ysize/ranks);
int first, last;
first = start - (m - 1)/2; // <---- start has a random value here
last = end + (m - 1)/2; // <---- end has a random value here
If the values are very large, the size of minipic may become larger than the stack size:
unsigned short int minipic[xsize*(last-first)];
^^^^^^^^^^ random (possibly large) value
A strong indication that this is indeed the cause is the fact that the address of the fault 0x7ffe7f60ead8 is very close to the end of the positive part of the virtual address space, which is where most 64-bit OSes allocate the stack area of the main thread.
Always compile with -Wall in order to get back as many diagnostic messages from the compiler as possible.
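A minimal sketch of one possible fix (assumed, not the original code): give start and end defined per-rank values before deriving first and last from them, and put the large receive buffer on the heap rather than the stack:

/* assumed fix, using the variables already declared in the question's main() */
flo   = ysize / ranks;                      /* integer division already floors */
start = i_rank * flo;
end   = (i_rank == ranks - 1) ? ysize : start + flo;
first = (start == 0)   ? 0     : start - (m - 1) / 2;
last  = (end == ysize) ? ysize : end + (m - 1) / 2;

/* allocate the per-rank buffer dynamically to avoid overflowing the stack */
unsigned short int *minipic = malloc((size_t)xsize * (last - first) * sizeof *minipic);
if (minipic == NULL)
    MPI_Abort(MPI_COMM_WORLD, 1);
/* ... MPI_Scatterv into minipic as before, then free(minipic); ... */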
Is there any way to suppress the "<<< >>>" error with vscode-cpptools?
I associate "*.cu" with "cpp" in settings.json:
// use normal c++ syntax highlighting for CUDA files
"files.associations": {"*.cu": "cpp"},
and it works fine except for one problem: kernel execution configuration parameters surrounded by <<< and >>> are mistakenly flagged with the error expected an expression:
dim3 dimGrid(2, 2, 1);
dim3 dimBlock(width / 2, width / 2, 1);
MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, width);
Any suggestions?
After googling for a few hours, I found no perfect solution, but there are some workarounds.
I summarize them here:
use normal C++ syntax highlighting for CUDA files by editing settings.json
include the necessary CUDA headers in the program
include a dummy header to work around IntelliSense
Below is a concrete example.
settings.json
"files.associations": {
"*.cu": "cpp",
"*.cuh": "cpp"
}
cudaDmy.cuh
#pragma once
#ifdef __INTELLISENSE__
void __syncthreads(); // workaround __syncthreads warning
#define KERNEL_ARG2(grid, block)
#define KERNEL_ARG3(grid, block, sh_mem)
#define KERNEL_ARG4(grid, block, sh_mem, stream)
#else
#define KERNEL_ARG2(grid, block) <<< grid, block >>>
#define KERNEL_ARG3(grid, block, sh_mem) <<< grid, block, sh_mem >>>
#define KERNEL_ARG4(grid, block, sh_mem, stream) <<< grid, block, sh_mem, stream >>>
#endif
matrixMul.cu
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <device_functions.h>
#include <cuda_runtime_api.h>
#include "cudaDmy.cuh"
__global__ void MatrixMulKernel(float *M, float *N, float *P, int width)
{
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if (Row < width && Col < width)
{
float Pvalue = 0;
for (int i = 0; i < width; ++i)
{
Pvalue += M[Row * width + i] * N[width * i + Col];
}
P[Row * width + Col] = Pvalue;
}
}
void MatMul(float *M, float *N, float *P, int width)
{
float *d_M;
float *d_N;
float *d_P;
int size = width * width * sizeof(float);
cudaMalloc((void **)&d_M, size);
cudaMemcpy(d_M, M, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_N, size);
cudaMemcpy(d_N, N, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_P, size);
dim3 dimGrid(2, 2, 1);
dim3 dimBlock(width / 2, width / 2, 1);
// the KERNEL_ARG2 macro expands to <<<...>>> when compiling with nvcc
MatrixMulKernel KERNEL_ARG2(dimGrid, dimBlock) (d_M, d_N, d_P, width);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
cudaFree(d_M);
cudaFree(d_N);
cudaFree(d_P);
}
int main()
{
int elem = 100;
float *M = new float[elem];
float *N = new float[elem];
float *P = new float[elem];
for (int i = 0; i < elem; ++i)
M[i] = i;
for (int i = 0; i < elem; ++i)
N[i] = i + elem;
time_t t1 = time(NULL);
MatMul(M, N, P, sqrt(elem));
time_t t2 = time(NULL);
double seconds = difftime(t2,t1);
printf ("%.3f seconds total time\n", seconds);
for (int i = 0; i < elem/1000000; ++i)
printf("%.1f\t", P[i]);
printf("\n");
delete[] M;
delete[] N;
delete[] P;
return 0;
}
Let's compile it with NVCC
nvcc matrixMul.cu -Xcudafe "--diag_suppress=unrecognized_pragma" -o runcuda
useful links:
https://devtalk.nvidia.com/default/topic/513485/cuda-programming-and-performance/__syncthreads-is-undefined-need-a-help/post/5189004/#5189004
https://stackoverflow.com/a/6182137/8037585
https://stackoverflow.com/a/27992604/8037585
https://gist.github.com/ruofeidu/df95ba27dfc6b77121b27fd4a6483426
You can just download the vscode-cudacpp extension and then, in your workspace (<>.workspace) or user settings (.vscode/settings.json), enable this option:
"settings": {
"files.associations": {
"*.cu": "cuda",
"*.cuh": "cuda"
}
}
As sonulohani pointed out, the cuda-cpp extension is good and is the only extension available for CUDA. If you want autocomplete, then try the CUDA-C++ package in the Sublime Text editor. It provides excellent autocomplete features.
There is an official extension by NVIDIA named Nsight Visual Studio Code Edition.
You could try installing it in your VS Code.
I am trying to implement a parallel reduction sum in CUDA 7.5. I have been trying to follow the NVIDIA PDF that walks you through the initial algorithm and then steadily more optimised versions. I am currently making an array that is filled with 1 in every position so that I can check the output is correct, but I am getting a value of -842159451 for an array of size 64. I expect the kernel code is correct, as I have followed the exact code from NVIDIA, but here is my kernel:
__global__ void reduce0(int *input, int *output) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = input[i];
__syncthreads();
for (unsigned int s = 1; s < blockDim.x; s *= 2) {
if (tid % (2 * s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) output[blockIdx.x] = sdata[0];
}
Here is my code calling the kernel, which is where I expect my problem to be:
int main()
{
int numThreadsPerBlock = 1024;
int *hostInput;
int *hostOutput;
int *deviceInput;
int *deviceOutput;
int numInputElements = 64;
int numOutputElements; // number of elements in the output list, initialised below
numOutputElements = numInputElements / (numThreadsPerBlock / 2);
if (numInputElements % (numThreadsPerBlock / 2)) {
numOutputElements++;
}
hostInput = (int *)malloc(numInputElements * sizeof(int));
hostOutput = (int *)malloc(numOutputElements * sizeof(int));
for (int i = 0; i < numInputElements; ++i) {
hostInput[i] = 1;
}
const dim3 blockSize(numThreadsPerBlock, 1, 1);
const dim3 gridSize(numOutputElements, 1, 1);
cudaMalloc((void **)&deviceInput, numInputElements * sizeof(int));
cudaMalloc((void **)&deviceOutput, numOutputElements * sizeof(int));
cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(int), cudaMemcpyHostToDevice);
reduce0<<<gridSize, blockSize>>>(deviceInput, deviceOutput);
cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(int), cudaMemcpyDeviceToHost);
for (int ii = 1; ii < numOutputElements; ii++) {
hostOutput[0] += hostOutput[ii]; //accumulates the sum in the first element
}
int sumGPU = hostOutput[0];
printf("GPU Result: %d\n", sumGPU);
std::string wait;
std::cin >> wait;
return 0;
}
I have also tried bigger and smaller array sizes for the input and I get the same result of a very large negative value no matter the size of the array.
It seems you are using a dynamically allocated shared array:
extern __shared__ int sdata[];
but you are not allocating it in the kernel invocation:
reduce0 <<<gridSize, blockSize >>>(deviceInput, deviceOutput);
You have two options:
Option 1
Allocate the shared memory statically in the kernel, e.g.
constexpr int threadsPerBlock = 1024;
__shared__ int sdata[threadsPerBlock];
More often than not I find this the cleanest approach, as it works without a problem when you have multiple arrays in shared memory. The drawback is that while the size usually depends on the number of threads in the block, you need the size to be known at compile-time.
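For instance, a minimal sketch (kernel name and buffers assumed, not from the question) of Option 1 with two statically sized shared arrays:

constexpr int threadsPerBlock = 1024;

__global__ void staticSharedExample(const int *in, float *out)
{
    // both sizes are fixed at compile time, so nothing extra is needed at launch
    __shared__ int   sdata[threadsPerBlock];
    __shared__ float sweights[threadsPerBlock];
    unsigned int tid = threadIdx.x;
    unsigned int i   = blockIdx.x * blockDim.x + tid;
    sdata[tid]    = in[i];
    sweights[tid] = 0.5f;
    __syncthreads();
    out[i] = sdata[tid] * sweights[tid];
}

(This assumes the kernel is launched with threadsPerBlock threads per block and buffers of matching size.)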
Option 2
Specify the amount of dynamically allocated shared memory in the kernel invocation.
reduce0 <<<gridSize, blockSize, numThreadsPerBlock*sizeof(int) >>>(deviceInput, deviceOutput);
This will work for any value of numThreadsPerBlock (provided it is within the allowed range, of course). The drawback is that if you have multiple extern shared arrays, you need to figure out how to lay them out in memory yourself, so that one does not overwrite the other.
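For example, a sketch (names assumed, not from the question) of partitioning a single dynamically allocated shared buffer between two arrays:

__global__ void dynamicSharedExample(int nInts, int nFloats)
{
    extern __shared__ unsigned char smem[];
    // carve the single dynamic allocation into two arrays by hand
    int   *ints   = reinterpret_cast<int *>(smem);
    float *floats = reinterpret_cast<float *>(smem + nInts * sizeof(int));
    int t = threadIdx.x;
    if (t < nInts)   ints[t]   = 0;
    if (t < nFloats) floats[t] = 0.0f;
    __syncthreads();
    // ... use ints[0..nInts) and floats[0..nFloats) ...
}
// launched as: dynamicSharedExample<<<grid, block, nInts*sizeof(int) + nFloats*sizeof(float)>>>(nInts, nFloats);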
Note, there may be other problems in your code. I didn't test it. This is something I spotted immediately upon glancing over your code.
So here is my almost complete code:
The first kernel, which is a normal global histogram, works correctly. But I get the error "an illegal memory access was encountered (77)" at the final memcpy after calculating the shared histogram. I don't know what is wrong with the code; it seems like the shared histogram changes the size of d_hist2. I also checked whether bin_count is changed, but it isn't. So is my shared_histog kernel wrong, or am I making a mistake in the memcpy?
Note: w * h * nc is the size of my input image.
__global__ void histog( int *img, int *hist, int bin_count, int n)
{
int x = threadIdx.x + blockDim.x *blockIdx.x;
if(x>=n) return;
unsigned char value = img[x];
int bin = value % bin_count;
atomicAdd(&hist[bin],1);
}
__global__ void shared_histog( int *img, int *hist, int n)
{
int x = threadIdx.x + blockDim.x *blockIdx.x;
int indx = threadIdx.x;
if(x>n) return;
__shared__ int shHist[256];
if (indx < 256)
shHist[indx] =0;
__syncthreads();
unsigned char value = img[x];
__syncthreads();
atomicAdd( (int*)&shHist[value], 1);
__syncthreads();
atomicAdd( (int*)&(hist[indx]), shHist[indx] );
}
int main(int argc, char **argv)
{
cudaDeviceSynchronize(); CUDA_CHECK;
int *imgval = new int[(size_t)w*h*nc];
for (int i =0; i<w*h*nc; i++)
imgval[i] = (imgIn[i])*256 + 1;
int bin_count = 256;
int *Histogram = new int[bin_count];
int *Histogram2 = new int[bin_count];
for (int i =0; i <bin_count; i++)
Histogram2[i] = 0;
Timer timer; timer.start();
for (int i =0; i <bin_count; i++)
Histogram[i] = 0;
for (int i =0; i<w*h*nc; i++)
Histogram[(imgval[i])]++;
showHistogram256("CPU_Histo", Histogram, 100 + w + 40, 100);
timer.end(); float t = timer.get(); // elapsed time in seconds
cout << "CPU time: " << t*1000 << " ms" << endl;
int *d_img = NULL;
int nbytes = w * h * nc * sizeof(int);
cudaMalloc(&d_img, nbytes); CUDA_CHECK;
cudaMemcpy(d_img, imgval, nbytes, cudaMemcpyHostToDevice); CUDA_CHECK;
int *d_hist = NULL;
cudaMalloc(&d_hist, bin_count * sizeof(int)); CUDA_CHECK;
cudaMemset(d_hist, 0, bin_count * sizeof(int)); CUDA_CHECK;
int *d_hist2 = NULL;
cudaMalloc(&d_hist2, bin_count * sizeof(int)); CUDA_CHECK;
cudaMemset(d_hist2, 0, bin_count * sizeof(int)); CUDA_CHECK;
dim3 block = dim3(1024,1,1);
dim3 grid = dim3 ((w*h*nc+block.x-1)/block.x, 1, 1);
Timer timer2; timer2.start();
histog <<<grid, block>>> (d_img, d_hist, bin_count, nbytes); CUDA_CHECK;
timer2.end(); float t2 = timer2.get(); // elapsed time in seconds
cout << "GPU time: " << t2*1000 << " ms" << endl;
cudaMemcpy(Histogram, d_hist,bin_count * sizeof(int), cudaMemcpyDeviceToHost); CUDA_CHECK;
showHistogram256("GPU_Histo", Histogram, 100 + w + 40, 100 + h/2 + 10);
Timer timer3; timer3.start();
shared_histog <<<grid, block>>> (d_img, d_hist2, nbytes); CUDA_CHECK;
timer3.end(); float t3 = timer3.get(); // elapsed time in seconds
cout << "Shared time: " << t3*1000 << " ms" << endl;
/* here comes the error */
cudaMemcpy(Histogram2, d_hist2, 256 * sizeof(int), cudaMemcpyDeviceToHost); CUDA_CHECK;
showHistogram256("GPU_Histo_Shared", Histogram2, 100 + w + 40, 100 + h +10);
return 0;
}
You're using __syncthreads() after a conditional statement:
if(x>n) return;
that may prevent all threads in the block from reaching it. That is not correct usage:
__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.
But it is probably not connected to the illegal memory access.
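For reference, a minimal sketch (assumed, not your original kernel) of restructuring that part so that every thread in the block reaches the barriers:

if (x < n) {                      // guard the work instead of returning early
    unsigned char value = img[x];
    atomicAdd(&shHist[value], 1);
}
__syncthreads();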
You are launching this kernel with 1024 threads per block:
dim3 block = dim3(1024,1,1);
which means in the kernel, your indx variable:
int indx = threadIdx.x;
will go from 0..1023 depending on the thread, which means that this line:
atomicAdd( (int*)&(hist[indx]), shHist[indx] );
^^^^ ^^^^
will attempt to index both hist and shHist out of bounds for threads whose indx value is greater than 255, since both hist and shHist are only allocated with 256 elements.
You can probably fix this by adding a conditional statement:
if (indx < 256)
atomicAdd( (int*)&(hist[indx]), shHist[indx] );
If you compile with -lineinfo and use cuda-memcheck, you can actually have cuda-memcheck pinpoint the line of source code that is generating the out-of-bounds access.
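For example (assuming the source file is named histo.cu; the name is hypothetical):

nvcc -lineinfo -o histo histo.cu
cuda-memcheck ./histo

cuda-memcheck will then report, for each out-of-bounds access, the offending source line together with the block and thread indices.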
I have written a small program in CUDA that counts how many 3s there are in a C array and prints the count.
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cstdlib>
__global__ void incrementArrayOnDevice(int *a, int N, int *count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
//__shared__ int s_a[512]; // one for each thread
//s_a[threadIdx.x] = a[id];
if( id < N )
{
//if( s_a[threadIdx.x] == 3 )
if( a[id] == 3 )
{
atomicAdd(count, 1);
}
}
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
int N = 16777216;
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
// copy data from host to device
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
// do calculation on device
int blockSize = 512;
int nBlocks = N / blockSize + (N % blockSize == 0 ? 0 : 1);
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", count);
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
The result I get is:
real 0m3.025s
user 0m2.989s
sys 0m0.029s
When I run it on the CPU with 4 threads I get:
real 0m0.101s
user 0m0.100s
sys 0m0.024s
Note that the GPU is an old one - I don't know the exact model because I do not have root access to it, but the OpenGL version it runs is 1.2 using the MESA driver.
Am I doing something wrong? What can I do to make it run faster?
Note: I have tried using buckets for each block (so the atomicAdd()s would be reduced for each one) but I get exactly the same performance.
I have also tried copying the 512 integers that are assigned to this block to a shared block of memory (you can see it in the comments) and the time is the same again.
This is in response to your question "What can I do to make it run faster?" As I mentioned in the comments, there are (probably) issues with the timing methodology, and the main suggestion I have for a speed improvement is to use a classical parallel reduction algorithm. The following code implements a better (in my opinion) timing measurement, and also converts your kernel to a reduction-style kernel:
#include <stdio.h>
#include <assert.h>
#include <cstdlib>
#define N (1<<24)
#define nTPB 512
#define NBLOCKS 32
__global__ void incrementArrayOnDevice(int *a, int n, int *count)
{
__shared__ int lcnt[nTPB];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int lcount = 0;
while (id < n) {
if (a[id] == 3) lcount++;
id += gridDim.x * blockDim.x;
}
lcnt[threadIdx.x] = lcount;
__syncthreads();
int stride = blockDim.x;
while(stride > 1) {
// assume blockDim.x is a power of 2
stride >>= 1;
if (threadIdx.x < stride) lcnt[threadIdx.x] += lcnt[threadIdx.x + stride];
__syncthreads();
}
if (threadIdx.x == 0) atomicAdd(count, lcnt[0]);
}
int main(void)
{
int *a_h; // host memory
int *a_d; // device memory
cudaEvent_t gstart1,gstart2,gstop1,gstop2,cstart,cstop;
float etg1, etg2, etc;
cudaEventCreate(&gstart1);
cudaEventCreate(&gstart2);
cudaEventCreate(&gstop1);
cudaEventCreate(&gstop2);
cudaEventCreate(&cstart);
cudaEventCreate(&cstop);
// allocate array on host
a_h = (int*)malloc(sizeof(int) * N);
for(int i = 0; i < N; ++i)
a_h[i] = (i % 3 == 0 ? 3 : 1);
// allocate arrays on device
cudaMalloc(&a_d, sizeof(int) * N);
int blockSize = nTPB;
int nBlocks = NBLOCKS;
printf("number of blocks: %d\n", nBlocks);
int count;
int *devCount;
cudaMalloc(&devCount, sizeof(int));
cudaMemset(devCount, 0, sizeof(int));
// copy data from host to device
cudaEventRecord(gstart1);
cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMemset(devCount, 0, sizeof(int));
cudaEventRecord(gstart2);
// do calculation on device
incrementArrayOnDevice<<<nBlocks, blockSize>>> (a_d, N, devCount);
cudaEventRecord(gstop2);
// retrieve result from device
cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(gstop1);
printf("GPU count = %d\n", count);
int hostCount = 0;
cudaEventRecord(cstart);
for (int i=0; i < N; i++)
if (a_h[i] == 3) hostCount++;
cudaEventRecord(cstop);
printf("CPU count = %d\n", hostCount);
cudaEventSynchronize(cstop);
cudaEventElapsedTime(&etg1, gstart1, gstop1);
cudaEventElapsedTime(&etg2, gstart2, gstop2);
cudaEventElapsedTime(&etc, cstart, cstop);
printf("GPU total time = %fs\n", (etg1/(float)1000) );
printf("GPU compute time = %fs\n", (etg2/(float)1000));
printf("CPU time = %fs\n", (etc/(float)1000));
free(a_h);
cudaFree(a_d);
cudaFree(devCount);
}
When I run this on a reasonably fast GPU (a Quadro 5000, a little slower than a Tesla M2050) I get the following:
number of blocks: 32
GPU count = 5592406
CPU count = 5592406
GPU total time = 0.025714s
GPU compute time = 0.000793s
CPU time = 0.017332s
We see that the GPU is substantially faster than this (naive, single-threaded) CPU implementation for the compute portion. When we add in the cost to transfer the data, the GPU version is slower but is not 30x slower.
By way of comparison, when I timed your original algorithm, I got numbers like this:
GPU total time = 0.118131s
GPU compute time = 0.093213s
My system config for this was Xeon X5560 CPU, RHEL 5.5, CUDA 5.0, Quadro5000 GPU.