Getting unknown error when launching large kernel sizes - c++

I am running into a problem launching a simple kernel once my array size grows beyond 591 by 591. At a size of 591x591 the array is returned without any error, but as soon as I launch the kernel with grid dimensions of 38x38 blocks of 16x16 threads each, the kernel fails to launch and returns an "unknown error".
The following code shows the kernel and the host code that calls it:
#include <cstdio>
#include <cstdlib>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>

#define BLOCKSIZE 16

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void IdentityMatrixKernel(float* identity, int size)
{
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    // map the two 2D indices to a single linear, 1D index
    int grid_width = gridDim.x * blockDim.x;
    int index = index_y * grid_width + index_x;
    // map the two 2D block indices to a single linear, 1D block index
    //int result = blockIdx.y * gridDim.x + blockIdx.x;
    if (index % (size+1))
    {
        identity[index] = 0;
    }
    else
    {
        identity[index] = 1;
    }
}

void foo(float *aArray, int size)
{
    float* d_I;
    int size2 = size*size*sizeof(float);
    gpuErrchk(cudaMalloc(&d_I, size2));
    dim3 block_size;
    block_size.x = BLOCKSIZE;
    block_size.y = BLOCKSIZE;
    dim3 grid_size;
    grid_size.x = size / block_size.x + 1;
    grid_size.y = size / block_size.y + 1;
    IdentityMatrixKernel<<<grid_size,block_size>>>(d_I, size);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(aArray, d_I, size2, cudaMemcpyDeviceToHost));
    cudaFree(d_I);
}

int main()
{
    int size = 591;
    float *aArray = (float*)malloc(size*size*sizeof(float));
    foo(aArray, size);
    return 0;
}
For size = 591 no error shows up and the 591x591 identity matrix is returned correctly, but for any larger size the program prints an "unknown error" to the console.

One problem seems to be that you are launching a grid of threads that is larger than your actual matrix:
grid_size.x = size / block_size.x + 1;
grid_size.y = size / block_size.y + 1;
But you are not checking for any out-of-bounds accesses in your kernel. You need to add a thread check such as:
if ((index_x >= size) || (index_y >= size)) return;
near the beginning of your kernel. But that's not enough. Another problem is that your index calculation is not correct:
int index = index_y * grid_width + index_x;
On the surface it appears to be correct, but since your thread grid is (potentially) larger than your data array, grid_width does not match the width of your matrix, which gives incorrect indexing. Since you're passing size to the kernel anyway, change it to something like this:
int index = index_y * size + index_x;
And you should be able to eliminate the out-of-bounds accesses.
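Putting both fixes together, a minimal sketch of the corrected kernel might look like this (the diagonal test is rewritten in terms of the 2D indices, which is equivalent once the indexing is based on the matrix width; this is just one way to write it):
__global__ void IdentityMatrixKernel(float* identity, int size)
{
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    // discard the extra threads outside the matrix
    if ((index_x >= size) || (index_y >= size)) return;
    // index by the matrix width, not the grid width
    int index = index_y * size + index_x;
    identity[index] = (index_x == index_y) ? 1.0f : 0.0f;
}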

To extend Robert Crovella's answer: if you define block_size.{x, y} with a big number (in your case 16), then you won't be able to work with arrays of a smaller size, e.g., 4x4. What you could do is define a small block size:
/* create thread blocks */
dim3 block_size;
block_size.x = 4;
block_size.y = 4;

/* create n x n block grids */
dim3 grid_size;
grid_size.x = size / block_size.x;
grid_size.y = size / block_size.y;

/* in case of partial sizes make grid_size 1 x 1 */
if (size % block_size.x)
    grid_size.x = 1, grid_size.y = 1;

How to keep track of a global index within an MPI_Scatter(...) function?

I am working on an MPI program in which process 0 reads a .raw file into an array and distributes it to the other processes so they can each do work. I am currently distributing sections of the array using MPI_Scatter, which seems to be working.
The problem:
The calculations done on each sub-array require knowing its global index; currently I only know how to iterate over its local index. Below is the code:
#include <iostream>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <mpi.h>

#define MCW MPI_COMM_WORLD

const int WIDTH = 1300;
const int HEIGHT = 600;

// work function that requires the *global* x,y coordinate of the field
float calc_vorticity(int x, int y, int width, int height, float* vector_field) {
    float d_x = 0.01;
    float d_y = 0.01;
    uint32_t index = y * width + x;
    int start_x = (x == 0) ? 0 : x - 1;
    int end_x = (x == width - 1) ? x : x + 1;
    int start_y = (y == 0) ? 0 : y - 1;
    int end_y = (y == height - 1) ? y : y + 1;
    uint32_t duidx = (start_y * width + end_x) * 2;
    uint32_t dvidx = (end_y * width + start_x) * 2;
    std::pair<double, double> fdu(vector_field[duidx], vector_field[duidx + 1]);
    std::pair<double, double> fdv(vector_field[dvidx], vector_field[dvidx + 1]);
    std::pair<double, double> vec0(vector_field[index * 2], vector_field[index * 2 + 1]);
    float duy = (fdu.second - vec0.second) / (d_x * (end_x - start_x));
    float dvx = (fdv.first - vec0.first) / (d_y * (end_y - start_y));
    return duy - dvx;
}

int main(int argc, char** argv) {
    int rank, size;
    MPI_Init(&argc, &argv);    // initialize MPI
    MPI_Comm_rank(MCW, &rank); // assign each process a rank
    MPI_Comm_size(MCW, &size); // assign the size
    // allocate the space
    float* vector_field = new float[WIDTH * HEIGHT * 2];
    // have rank 0 read in the image
    if (!rank) {
        FILE* file = fopen("../data/cyl2d_1300x600_float32[2].raw", "r");
        fread(vector_field, sizeof(float), WIDTH * HEIGHT * 2, file);
        fclose(file);
    }
    // create buffer of data & catch all processes up to this point
    float* buffer = new float[(WIDTH * HEIGHT * 2) / size];
    MPI_Barrier(MCW);
    // scatter the array to the other ranks
    MPI_Scatter(
        vector_field,                // array being sent out; resides on the root process
        (WIDTH * HEIGHT * 2) / size, // number of items to send to each process
        MPI_FLOAT,                   // type of items sent
        buffer,                      // buffer that holds each process's received items
        (WIDTH * HEIGHT * 2) / size, // number of items to be received
        MPI_FLOAT,                   // type of items received
        0,                           // originating process
        MCW                          // communicator
    );
    // allocate space for results
    // note that we only need half the size of buffer, as a vorticity is a single float
    float* vorticities = new float[(WIDTH * HEIGHT) / size];
    MPI_Barrier(MCW);
    // calculate value for each cell
    // !!! Problem here !!!
    /* Call calc_vorticity(...) passing the current **global** x/y coordinate */
    /*
    The serial implementation call looks like this:
    for (int i = 0; i < WIDTH; i++) {
        for (int j = 0; j < HEIGHT; j++) {
            vorticities[j * WIDTH + i] = calc_vorticity(i, j, WIDTH, HEIGHT, vector_field);
        }
    }
    */
    // ensure all processes are caught up
    MPI_Barrier(MCW);
    // have process 0 write to the output file
    if (!rank) {
        FILE* output = fopen("../data/vorticities_dist.raw", "w");
        fwrite(vorticities, sizeof(float), WIDTH * HEIGHT, output);
        fclose(output);
    }
    // free used memory
    delete[] vector_field;
    delete[] buffer;
    delete[] vorticities;
    // finalize & exit
    MPI_Finalize();
    return 0;
}
I've looked at a lot of MPI_Gather docs, and I don't think Scatter/Gather alone is the answer, as you cannot also send the global indexes along with that array. I was also tossing around the idea of creating my own type with two elements, num & global_index. That way I could scatter an array of objects that carry their global index with them, though I don't know how to scatter custom data types.
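One observation worth sketching (untested, and assuming WIDTH * HEIGHT divides evenly by the number of ranks): because MPI_Scatter hands out contiguous, equally sized chunks in rank order, each rank can reconstruct the global coordinates of its local cells from its rank alone, so no custom datatype is needed:
// Sketch: recover global coordinates after MPI_Scatter.
// Chunks are contiguous and handed out in rank order, so the global
// linear cell index of local cell `local` on rank `rank` is:
int cells_per_rank = (WIDTH * HEIGHT) / size; // assumes divisibility
int global_offset = rank * cells_per_rank;
for (int local = 0; local < cells_per_rank; ++local) {
    int g = global_offset + local;
    int x = g % WIDTH; // global column
    int y = g / WIDTH; // global row
    // vorticities[local] = calc_vorticity(x, y, WIDTH, HEIGHT, /* field */);
    // Caveat: calc_vorticity also reads neighbouring cells, which may live on
    // an adjacent rank; a halo exchange (or scattering overlapping rows)
    // would be needed before calling it with only the local buffer.
}
The per-rank results could then be combined with MPI_Gather into vorticities on rank 0 before writing the output file.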

CUDA kernel stops working when using cooperative groups grid sync() function [duplicate]

I’m trying to write a kernel whose threads iteratively process items in a work queue. My understanding is that I should be able to do this by using atomic operations to manipulate the work queue (i.e., grab work items from the queue and insert new work items into the queue), and using grid synchronization via cooperative groups to ensure all threads are at the same iteration (I ensure the number of thread blocks doesn’t exceed the device capacity for the kernel). However, sometimes I observe that work items are skipped or processed multiple times during an iteration.
The following code is a working example to show this. In this example, an array with the size of input_len is created, which holds work items 0 to input_len - 1. The processWorkItems kernel processes these items for max_iter iterations. Each work item can put itself and its previous and next work items in the work queue, but marked array is used to ensure that during an iteration, each work item is added to the work queue at most once. What should happen in the end is that the sum of values in histogram be equal to input_len * max_iter, and no value in histogram be greater than 1. But I observe that occasionally both of these criteria are violated in the output, which implies that I’m not getting atomic operations and/or proper synchronization. I would appreciate it if someone could point out the flaws in my reasoning and/or implementation. My OS is Ubuntu 18.04, CUDA version is 10.1, and I’ve run experiments on P100, V100, and RTX 2080 Ti GPUs, and observed similar behavior.
The command I use for compiling for RTX 2080 Ti:
nvcc -O3 -o atomicsync atomicsync.cu --gpu-architecture=compute_75 -rdc=true
Some inputs and outputs of runs on RTX 2080 Ti:
./atomicsync 50 1000 1000
Skipped 0.01% of items. 5 extra item processing.
./atomicsync 500 1000 1000
Skipped 0.00% of items. 6 extra item processing.
./atomicsync 5000 1000 1000
Skipped 0.00% of items. 14 extra item processing.
atomicsync.cu:
#include <stdio.h>
#include <cooperative_groups.h>

#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

template <typename T>
void check(T result, char const *const func, const char *const file, int const line)
{
    if (result)
    {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast<unsigned int>(result), cudaGetErrorString(result), func);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}

__device__ inline void addWorkItem(int input_len, int item, int item_adder, int iter, int *queue, int *queue_size, int *marked) {
    int already_marked = atomicExch(&marked[item], 1);
    if(already_marked == 0) {
        int idx = atomicAdd(&queue_size[iter + 1], 1);
        queue[(iter + 1) * input_len + idx] = item;
    }
}

__global__ void processWorkItems(int input_len, int max_iter, int *histogram, int *queue, int *queue_size, int *marked) {
    auto grid = cooperative_groups::this_grid();
    const int items_per_block = (input_len + gridDim.x - 1) / gridDim.x;
    for(int iter = 0; iter < max_iter; ++iter) {
        while(true) {
            // Grab work item to process
            int idx = atomicSub(&queue_size[iter], 1);
            --idx;
            if(idx < 0) {
                break;
            }
            int item = queue[iter * input_len + idx];
            // Keep track of processed work items
            ++histogram[iter * input_len + item];
            // Add previous, self, and next work items to work queue
            if(item > 0) {
                addWorkItem(input_len, item - 1, item, iter, queue, queue_size, marked);
            }
            addWorkItem(input_len, item, item, iter, queue, queue_size, marked);
            if(item + 1 < input_len) {
                addWorkItem(input_len, item + 1, item, iter, queue, queue_size, marked);
            }
        }
        __threadfence_system();
        grid.sync();
        // Reset marked array for next iteration
        for(int i = 0; i < items_per_block; ++i) {
            if(blockIdx.x * items_per_block + i < input_len) {
                marked[blockIdx.x * items_per_block + i] = 0;
            }
        }
        __threadfence_system();
        grid.sync();
    }
}

int main(int argc, char* argv[])
{
    int input_len = atoi(argv[1]);
    int max_iter = atoi(argv[2]);
    int num_blocks = atoi(argv[3]);
    // A histogram to keep track of work items that have been processed in each iteration
    int histogram_host[input_len * max_iter];
    memset(histogram_host, 0, sizeof(int) * input_len * max_iter);
    int *histogram_device;
    checkCudaErrors(cudaMalloc(&histogram_device, sizeof(int) * input_len * max_iter));
    checkCudaErrors(cudaMemcpy(histogram_device, histogram_host, sizeof(int) * input_len * max_iter, cudaMemcpyHostToDevice));
    // Size of the work queue for each iteration
    int queue_size_host[max_iter + 1];
    queue_size_host[0] = input_len;
    memset(&queue_size_host[1], 0, sizeof(int) * max_iter);
    int *queue_size_device;
    checkCudaErrors(cudaMalloc(&queue_size_device, sizeof(int) * (max_iter + 1)));
    checkCudaErrors(cudaMemcpy(queue_size_device, queue_size_host, sizeof(int) * (max_iter + 1), cudaMemcpyHostToDevice));
    // Work queue
    int queue_host[input_len * (max_iter + 1)];
    for(int i = 0; i < input_len; ++i) {
        queue_host[i] = i;
    }
    memset(&queue_host[input_len], 0, sizeof(int) * input_len * max_iter);
    int *queue_device;
    checkCudaErrors(cudaMalloc(&queue_device, sizeof(int) * input_len * (max_iter + 1)));
    checkCudaErrors(cudaMemcpy(queue_device, queue_host, sizeof(int) * input_len * (max_iter + 1), cudaMemcpyHostToDevice));
    // An array used to keep track of work items already added to the work queue to
    // avoid multiple additions of a work item in the same iteration
    int marked_host[input_len];
    memset(marked_host, 0, sizeof(int) * input_len);
    int *marked_device;
    checkCudaErrors(cudaMalloc(&marked_device, sizeof(int) * input_len));
    checkCudaErrors(cudaMemcpy(marked_device, marked_host, sizeof(int) * input_len, cudaMemcpyHostToDevice));
    const dim3 threads(1, 1, 1);
    const dim3 blocks(num_blocks, 1, 1);
    processWorkItems<<<blocks, threads>>>(input_len, max_iter, histogram_device, queue_device, queue_size_device, marked_device);
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaMemcpy(histogram_host, histogram_device, sizeof(int) * input_len * max_iter, cudaMemcpyDeviceToHost));
    int extra = 0;
    double deficit = 0;
    for(int i = 0; i < input_len; ++i) {
        int cnt = 0;
        for(int iter = 0; iter < max_iter; ++iter) {
            if(histogram_host[iter * input_len + i] > 1) {
                ++extra;
            }
            cnt += histogram_host[iter * input_len + i];
        }
        deficit += max_iter - cnt;
    }
    printf("Skipped %.2f%% of items. %d extra item processing.\n", deficit / (input_len * max_iter) * 100, extra);
    checkCudaErrors(cudaFree(histogram_device));
    checkCudaErrors(cudaFree(queue_device));
    checkCudaErrors(cudaFree(queue_size_device));
    checkCudaErrors(cudaFree(marked_device));
    return 0;
}
You may wish to read how to do a cooperative grid kernel launch in the programming guide, or study any of the CUDA sample codes (e.g. reductionMultiBlockCG, and there are others) that use a grid sync.
You're doing it incorrectly. You cannot launch a cooperative grid with the ordinary <<<...>>> launch syntax. Because of that, there is no reason to assume that the grid.sync() in your kernel is working correctly.
It's easy to see that the grid sync is not working in your code by running it under cuda-memcheck. When you do that, the results get drastically worse.
When I modify your code to do a proper cooperative launch, I have no issues on Tesla V100:
$ cat t1811.cu
#include <stdio.h>
#include <cooperative_groups.h>

#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

template <typename T>
void check(T result, char const *const func, const char *const file, int const line)
{
    if (result)
    {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast<unsigned int>(result), cudaGetErrorString(result), func);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}

__device__ inline void addWorkItem(int input_len, int item, int item_adder, int iter, int *queue, int *queue_size, int *marked) {
    int already_marked = atomicExch(&marked[item], 1);
    if(already_marked == 0) {
        int idx = atomicAdd(&queue_size[iter + 1], 1);
        queue[(iter + 1) * input_len + idx] = item;
    }
}

__global__ void processWorkItems(int input_len, int max_iter, int *histogram, int *queue, int *queue_size, int *marked) {
    auto grid = cooperative_groups::this_grid();
    const int items_per_block = (input_len + gridDim.x - 1) / gridDim.x;
    for(int iter = 0; iter < max_iter; ++iter) {
        while(true) {
            // Grab work item to process
            int idx = atomicSub(&queue_size[iter], 1);
            --idx;
            if(idx < 0) {
                break;
            }
            int item = queue[iter * input_len + idx];
            // Keep track of processed work items
            ++histogram[iter * input_len + item];
            // Add previous, self, and next work items to work queue
            if(item > 0) {
                addWorkItem(input_len, item - 1, item, iter, queue, queue_size, marked);
            }
            addWorkItem(input_len, item, item, iter, queue, queue_size, marked);
            if(item + 1 < input_len) {
                addWorkItem(input_len, item + 1, item, iter, queue, queue_size, marked);
            }
        }
        __threadfence_system();
        grid.sync();
        // Reset marked array for next iteration
        for(int i = 0; i < items_per_block; ++i) {
            if(blockIdx.x * items_per_block + i < input_len) {
                marked[blockIdx.x * items_per_block + i] = 0;
            }
        }
        __threadfence_system();
        grid.sync();
    }
}

int main(int argc, char* argv[])
{
    int input_len = atoi(argv[1]);
    int max_iter = atoi(argv[2]);
    int num_blocks = atoi(argv[3]);
    // A histogram to keep track of work items that have been processed in each iteration
    int *histogram_host = new int[input_len * max_iter];
    memset(histogram_host, 0, sizeof(int) * input_len * max_iter);
    int *histogram_device;
    checkCudaErrors(cudaMalloc(&histogram_device, sizeof(int) * input_len * max_iter));
    checkCudaErrors(cudaMemcpy(histogram_device, histogram_host, sizeof(int) * input_len * max_iter, cudaMemcpyHostToDevice));
    // Size of the work queue for each iteration
    int queue_size_host[max_iter + 1];
    queue_size_host[0] = input_len;
    memset(&queue_size_host[1], 0, sizeof(int) * max_iter);
    int *queue_size_device;
    checkCudaErrors(cudaMalloc(&queue_size_device, sizeof(int) * (max_iter + 1)));
    checkCudaErrors(cudaMemcpy(queue_size_device, queue_size_host, sizeof(int) * (max_iter + 1), cudaMemcpyHostToDevice));
    // Work queue
    int *queue_host = new int[input_len * (max_iter + 1)];
    for(int i = 0; i < input_len; ++i) {
        queue_host[i] = i;
    }
    memset(&queue_host[input_len], 0, sizeof(int) * input_len * max_iter);
    int *queue_device;
    checkCudaErrors(cudaMalloc(&queue_device, sizeof(int) * input_len * (max_iter + 1)));
    checkCudaErrors(cudaMemcpy(queue_device, queue_host, sizeof(int) * input_len * (max_iter + 1), cudaMemcpyHostToDevice));
    // An array used to keep track of work items already added to the work queue to
    // avoid multiple additions of a work item in the same iteration
    int marked_host[input_len];
    memset(marked_host, 0, sizeof(int) * input_len);
    int *marked_device;
    checkCudaErrors(cudaMalloc(&marked_device, sizeof(int) * input_len));
    checkCudaErrors(cudaMemcpy(marked_device, marked_host, sizeof(int) * input_len, cudaMemcpyHostToDevice));
    const dim3 threads(1, 1, 1);
    const dim3 blocks(num_blocks, 1, 1);
    int dev = 0;
    int supportsCoopLaunch = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev));
    if (!supportsCoopLaunch) {printf("Cooperative Launch is not supported on this machine configuration. Exiting."); return 0;}
    /// This will launch a grid that can maximally fill the GPU, on the default stream with kernel arguments
    int numBlocksPerSm = 0;
    // Number of threads my_kernel will be launched with
    int numThreads = threads.x;
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, processWorkItems, numThreads, 0));
    // launch
    void *kernelArgs[] = { &input_len, &max_iter, &histogram_device, &queue_device, &queue_size_device, &marked_device};
    dim3 dimBlock = dim3(numThreads, 1, 1);
    num_blocks = min(num_blocks, deviceProp.multiProcessorCount*numBlocksPerSm);
    dim3 dimGrid(num_blocks, 1, 1);
    printf("launching %d blocks\n", dimGrid.x);
    checkCudaErrors(cudaLaunchCooperativeKernel((void*)processWorkItems, dimGrid, dimBlock, kernelArgs));
    // processWorkItems<<<blocks, threads>>>(input_len, max_iter, histogram_device, queue_device, queue_size_device, marked_device);
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaMemcpy(histogram_host, histogram_device, sizeof(int) * input_len * max_iter, cudaMemcpyDeviceToHost));
    int extra = 0;
    double deficit = 0;
    for(int i = 0; i < input_len; ++i) {
        int cnt = 0;
        for(int iter = 0; iter < max_iter; ++iter) {
            if(histogram_host[iter * input_len + i] > 1) {
                ++extra;
            }
            cnt += histogram_host[iter * input_len + i];
        }
        deficit += max_iter - cnt;
    }
    printf("Skipped %.2f%% of items. %d extra item processing.\n", deficit / (input_len * max_iter) * 100, extra);
    checkCudaErrors(cudaFree(histogram_device));
    checkCudaErrors(cudaFree(queue_device));
    checkCudaErrors(cudaFree(queue_size_device));
    checkCudaErrors(cudaFree(marked_device));
    return 0;
}
$ nvcc -o t1811 t1811.cu -arch=sm_70 -std=c++11 -rdc=true
$ cuda-memcheck ./t1811 50 1000 5000
========= CUDA-MEMCHECK
launching 2560 blocks
Skipped 0.00% of items. 0 extra item processing.
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./t1811 50 1000 1000
========= CUDA-MEMCHECK
launching 1000 blocks
Skipped 0.00% of items. 0 extra item processing.
========= ERROR SUMMARY: 0 errors
$ ./t1811 50 1000 5000
launching 2560 blocks
Skipped 0.00% of items. 0 extra item processing.
$ ./t1811 50 1000 1000
launching 1000 blocks
Skipped 0.00% of items. 0 extra item processing.
$ ./t1811 50 1000 1000
launching 1000 blocks
Skipped 0.00% of items. 0 extra item processing.
$
I'm not suggesting the above code is defect free or suitable for any particular purpose. It is mostly your code. I've modified it just to demonstrate the concepts mentioned.
As an aside, I changed a few of your large stack-based memory allocations to heap-based allocations. I don't recommend trying to create large stack-based arrays such as this:
int histogram_host[input_len * max_iter];
In my opinion it's better to do:
int *histogram_host = new int[input_len * max_iter];
As your input command-line parameters become larger, this may become an issue depending on the machine characteristics. This doesn't have much to do with CUDA, however. I've not tried to address every instance of this pattern in your code.
Although not relevant to this particular question, grid sync has other requirements for successful use as well. These are covered in the programming guide and include, but are not limited to:
platform support (e.g. OS, GPU, etc.)
kernel sizing requirements (total number of threads or threadblocks launched)
The programming guide contains convenient boilerplate code that can be used to satisfy these requirements.

CUDA Vector Reduction to handle vectors of length less than 512?

I'm working on the parallel vector_reduction algorithm tutorial from NVIDIA to implement the algorithm using the CUDA C++ API. I have implemented the algorithm, but it only works for vector lengths fixed at 512. I am not able to figure out how to get it working for vectors of fewer than 512 elements. I want it to work for arbitrary sizes, e.g., 324, 123, 23.
#include <stdio.h>

#define NUM_ELEMENTS 512

__global__ void reduction(float *g_data, int n)
{
    __shared__ float partialSum[NUM_ELEMENTS];
    int tx = threadIdx.x;
    int i = tx + blockIdx.x * blockDim.x;
    if (i < n) {
        partialSum[tx] = g_data[i];
    }
    int stride;
    for (stride = blockDim.x/2; stride > 0; stride >>= 1) {
        __syncthreads();
        if (tx < stride) {
            partialSum[tx] += partialSum[tx + stride];
        }
    }
    if (tx == 0) {
        g_data[blockIdx.x] = partialSum[tx];
    }
}

float computeOnDevice(float* h_data, int num_elements)
{
    float* d_data = NULL;
    float result;
    // Memory allocation on device side
    cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
    // Copy from host memory to device memory
    cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice);
    dim3 blockSize, gridSize;
    // Number of threads in each thread block
    blockSize = dim3(num_elements, 1, 1);
    // Number of thread blocks in grid
    gridSize = dim3(1, 1, 1);
    // Invoke the kernel
    reduction<<<gridSize, blockSize>>>(d_data, num_elements);
    // Copy from device memory back to host memory
    cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_data);
    cudaDeviceReset();
    return result;
}

int main() {
    float *data = new float[NUM_ELEMENTS];
    for (int i = 0; i < NUM_ELEMENTS; i++) data[i] = 1;
    float r = computeOnDevice(data, NUM_ELEMENTS);
    printf(" result = %f\n", r);
}
Your code is 100% correct. The problem is that your bit shifts don't account for the last part of your array when its length is not a power of 2. You can easily fix this by artificially extending the array to the next power of 2. This way your entire array will be reduced, and the extra "elements" (they don't actually exist) are simply ignored.
#include <math.h>

__global__ void reduction(float *g_data, int n){
    // figure out exponent of next larger power of 2
    int exponent = ceilf(log2f(n));
    // calculate next larger power of 2
    int size = (int)powf(2, exponent);
    __shared__ float partialSum[NUM_ELEMENTS];
    int tx = threadIdx.x;
    int i = tx + blockIdx.x * blockDim.x;
    if (i < n){
        partialSum[tx] = g_data[i];
    }
    for (int stride = size / 2; stride > 0; stride >>= 1){
        __syncthreads();
        if (tx < stride) {
            // all threads that run out of bounds do nothing,
            // equivalent to adding 0
            if((tx + stride) < n)
                partialSum[tx] += partialSum[tx + stride];
        }
    }
    if (tx == 0){
        g_data[blockIdx.x] = partialSum[tx];
    }
}
Edit
Regarding your comment: this method of reduction will never work for an array that is being reduced across multiple blocks. For compute capability 1.0-1.3 the largest array you can reduce is 512 elements, and for compute capability >1.3 you can do up to 1024 elements, which is the maximum number of threads per block.
This is because __shared__ memory is shared among the threads of a block, not between blocks. So, to reduce an array scattered over multiple blocks, you'd need to partition the array such that each block reduces a chunk, and then leverage __global__ memory to reduce the values from all blocks. However, __global__ memory is approximately 10-20 times slower than the (on-chip) __shared__ memory, so once you start using a lot of blocks this becomes very inefficient.
The alternative would be to have each thread process multiple indices; however, eventually your partialSum array won't fit into shared memory anymore and will overflow into global memory anyway. That approach would also mean you can never use more than 512 (or 1024) threads, which defeats the purpose of using CUDA, which depends on running a very large number of threads to hide latency and make the expensive host-to-device memory transfer worthwhile.
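For completeness, here is a rough sketch (untested, and not part of the tutorial) of the multi-block, two-stage idea described above: each launch reduces chunks of the array into one partial sum per block, and the launches repeat until a single value remains. Note that the kernel's bounds check would then have to be made per block (against the number of valid elements that block sees) rather than against the total length n:
// Host-side driver: repeatedly launch a block-level reduction until one
// value remains. Assumes the kernel sums up to blockDim.x elements per
// block and writes each block's partial sum to g_data[blockIdx.x].
int remaining = num_elements;
while (remaining > 1) {
    int threads = 512;                                // power-of-2 block size
    int blocks = (remaining + threads - 1) / threads; // ceiling division
    reduction<<<blocks, threads>>>(d_data, remaining);
    remaining = blocks;                               // one partial sum per block
}
// the final sum is now in d_data[0]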

CUDA - Parallel Reduction Sum

I am trying to implement a parallel reduction sum in CUDA 7.5. I have been trying to follow the NVIDIA PDF that walks you through the initial algorithm and then steadily more optimised versions. I am currently filling an array with 1 in every position so that I can check that the output is correct, but I am getting a value of -842159451 for an array of size 64. I expect the kernel code to be correct, as I have followed the exact code from NVIDIA for it, but here is my kernel:
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

__global__ void reduce0(int *input, int *output) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = input[i];
    __syncthreads();
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2 * s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) output[blockIdx.x] = sdata[0];
}
Here is my code calling the kernel, which is where I expect my problem to be:
int main()
{
    int numThreadsPerBlock = 1024;
    int *hostInput;
    int *hostOutput;
    int *deviceInput;
    int *deviceOutput;
    int numInputElements = 64;
    int numOutputElements; // number of elements in the output list, initialised below
    numOutputElements = numInputElements / (numThreadsPerBlock / 2);
    if (numInputElements % (numThreadsPerBlock / 2)) {
        numOutputElements++;
    }
    hostInput = (int *)malloc(numInputElements * sizeof(int));
    hostOutput = (int *)malloc(numOutputElements * sizeof(int));
    for (int i = 0; i < numInputElements; ++i) {
        hostInput[i] = 1;
    }
    const dim3 blockSize(numThreadsPerBlock, 1, 1);
    const dim3 gridSize(numOutputElements, 1, 1);
    cudaMalloc((void **)&deviceInput, numInputElements * sizeof(int));
    cudaMalloc((void **)&deviceOutput, numOutputElements * sizeof(int));
    cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(int), cudaMemcpyHostToDevice);
    reduce0<<<gridSize, blockSize>>>(deviceInput, deviceOutput);
    cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(int), cudaMemcpyDeviceToHost);
    for (int ii = 1; ii < numOutputElements; ii++) {
        hostOutput[0] += hostOutput[ii]; // accumulates the sum in the first element
    }
    int sumGPU = hostOutput[0];
    printf("GPU Result: %d\n", sumGPU);
    std::string wait;
    std::cin >> wait;
    return 0;
}
I have also tried bigger and smaller array sizes for the input and I get the same result of a very large negative value no matter the size of the array.
It seems you are using a dynamically allocated shared array:
extern __shared__ int sdata[];
but you are not allocating it in the kernel invocation:
reduce0<<<gridSize, blockSize>>>(deviceInput, deviceOutput);
You have two options:
Option 1
Allocate the shared memory statically in the kernel, e.g.
constexpr int threadsPerBlock = 1024;
__shared__ int sdata[threadsPerBlock];
More often than not I find this the cleanest approach, as it works without a problem when you have multiple arrays in shared memory. The drawback is that, while the size usually depends on the number of threads in the block, it needs to be known at compile time.
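As a side note, one common way to keep the compile-time requirement while still varying the block size is to make the size a template parameter (a sketch, not from the original answer):
template <int BLOCK_SIZE>
__global__ void reduce0(int *input, int *output) {
    __shared__ int sdata[BLOCK_SIZE]; // size fixed per template instantiation
    // ... body as before ...
}

// host side: pick the instantiation matching the launch configuration, e.g.
// reduce0<1024><<<gridSize, 1024>>>(deviceInput, deviceOutput);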
Option 2
Specify the amount of dynamically allocated shared memory in the kernel invocation.
reduce0<<<gridSize, blockSize, numThreadsPerBlock * sizeof(int)>>>(deviceInput, deviceOutput);
This will work for any value of numThreadsPerBlock (provided it is within the allowed range, of course). The drawback is that if you have multiple extern shared arrays, you need to figure out how to lay them out in memory yourself so that one does not overwrite the other.
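For reference, the usual pattern for that case (a sketch; the names and sizes here are illustrative, not from the question) is to declare a single extern block and carve it up manually:
__global__ void kernelWithTwoArrays(int n_ints, int n_floats)
{
    extern __shared__ unsigned char smem[];
    // Carve the single dynamic allocation into two arrays.
    // Mind alignment when mixing types of different sizes.
    int *ibuf = reinterpret_cast<int *>(smem);
    float *fbuf = reinterpret_cast<float *>(smem + n_ints * sizeof(int));
    // ... use ibuf[0..n_ints) and fbuf[0..n_floats) ...
}

// launched with both sizes added up:
// kernelWithTwoArrays<<<grid, block, n_ints * sizeof(int) + n_floats * sizeof(float)>>>(n_ints, n_floats);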
Note that there may be other problems in your code; I didn't test it. This is simply something I spotted immediately upon glancing over your code.

count3's in cuda is very slow

I have written a small program in CUDA that counts how many 3s are in a C array and prints the count.
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cstdlib>

__global__ void incrementArrayOnDevice(int *a, int N, int *count)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    //__shared__ int s_a[512]; // one for each thread
    //s_a[threadIdx.x] = a[id];
    if (id < N)
    {
        //if (s_a[threadIdx.x] == 3)
        if (a[id] == 3)
        {
            atomicAdd(count, 1);
        }
    }
}

int main(void)
{
    int *a_h; // host memory
    int *a_d; // device memory
    int N = 16777216;
    // allocate array on host
    a_h = (int*)malloc(sizeof(int) * N);
    for (int i = 0; i < N; ++i)
        a_h[i] = (i % 3 == 0 ? 3 : 1);
    // allocate arrays on device
    cudaMalloc(&a_d, sizeof(int) * N);
    // copy data from host to device
    cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
    // do calculation on device
    int blockSize = 512;
    int nBlocks = N / blockSize + (N % blockSize == 0 ? 0 : 1);
    printf("number of blocks: %d\n", nBlocks);
    int count;
    int *devCount;
    cudaMalloc(&devCount, sizeof(int));
    cudaMemset(devCount, 0, sizeof(int));
    incrementArrayOnDevice<<<nBlocks, blockSize>>>(a_d, N, devCount);
    // retrieve result from device
    cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d\n", count);
    free(a_h);
    cudaFree(a_d);
    cudaFree(devCount);
}
The result I get is:
real 0m3.025s
user 0m2.989s
sys 0m0.029s
When I run it on the CPU with 4 threads I get:
real 0m0.101s
user 0m0.100s
sys 0m0.024s
Note that the GPU is an old one - I don't know the exact model because I do not have root access to it, but the OpenGL version it runs is 1.2 using the MESA driver.
Am I doing something wrong? What can I do to make it run faster?
Note: I have tried using buckets for each block (so the atomicAdd()s would be reduced for each one), but I get exactly the same performance.
I have also tried copying the 512 integers assigned to each block into shared memory (you can see it in the commented-out code), and the time is the same again.
This is in response to your question "What can I do to make it run faster?" As I mentioned in the comments, there are (probably) issues with the timing methodology, and the main suggestion I have for speed improvement is to use a "classical parallel reduction" algorithm. The following code implements a better (in my opinion) timing measurement, and also converts your kernel to a reduction-style kernel:
#include <stdio.h>
#include <assert.h>
#include <cstdlib>

#define N (1<<24)
#define nTPB 512
#define NBLOCKS 32

__global__ void incrementArrayOnDevice(int *a, int n, int *count)
{
    __shared__ int lcnt[nTPB];
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    int lcount = 0;
    while (id < n) {
        if (a[id] == 3) lcount++;
        id += gridDim.x * blockDim.x;
    }
    lcnt[threadIdx.x] = lcount;
    __syncthreads();
    int stride = blockDim.x;
    while (stride > 1) {
        // assume blockDim.x is a power of 2
        stride >>= 1;
        if (threadIdx.x < stride) lcnt[threadIdx.x] += lcnt[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0) atomicAdd(count, lcnt[0]);
}

int main(void)
{
    int *a_h; // host memory
    int *a_d; // device memory
    cudaEvent_t gstart1, gstart2, gstop1, gstop2, cstart, cstop;
    float etg1, etg2, etc;
    cudaEventCreate(&gstart1);
    cudaEventCreate(&gstart2);
    cudaEventCreate(&gstop1);
    cudaEventCreate(&gstop2);
    cudaEventCreate(&cstart);
    cudaEventCreate(&cstop);
    // allocate array on host
    a_h = (int*)malloc(sizeof(int) * N);
    for (int i = 0; i < N; ++i)
        a_h[i] = (i % 3 == 0 ? 3 : 1);
    // allocate arrays on device
    cudaMalloc(&a_d, sizeof(int) * N);
    int blockSize = nTPB;
    int nBlocks = NBLOCKS;
    printf("number of blocks: %d\n", nBlocks);
    int count;
    int *devCount;
    cudaMalloc(&devCount, sizeof(int));
    cudaMemset(devCount, 0, sizeof(int));
    // copy data from host to device
    cudaEventRecord(gstart1);
    cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
    cudaMemset(devCount, 0, sizeof(int));
    cudaEventRecord(gstart2);
    // do calculation on device
    incrementArrayOnDevice<<<nBlocks, blockSize>>>(a_d, N, devCount);
    cudaEventRecord(gstop2);
    // retrieve result from device
    cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventRecord(gstop1);
    printf("GPU count = %d\n", count);
    int hostCount = 0;
    cudaEventRecord(cstart);
    for (int i = 0; i < N; i++)
        if (a_h[i] == 3) hostCount++;
    cudaEventRecord(cstop);
    printf("CPU count = %d\n", hostCount);
    cudaEventSynchronize(cstop);
    cudaEventElapsedTime(&etg1, gstart1, gstop1);
    cudaEventElapsedTime(&etg2, gstart2, gstop2);
    cudaEventElapsedTime(&etc, cstart, cstop);
    printf("GPU total time = %fs\n", (etg1/(float)1000));
    printf("GPU compute time = %fs\n", (etg2/(float)1000));
    printf("CPU time = %fs\n", (etc/(float)1000));
    free(a_h);
    cudaFree(a_d);
    cudaFree(devCount);
}
When I run this on a reasonably fast GPU (a Quadro 5000, a little slower than a Tesla M2050) I get the following:
number of blocks: 32
GPU count = 5592406
CPU count = 5592406
GPU total time = 0.025714s
GPU compute time = 0.000793s
CPU time = 0.017332s
We see that the GPU is substantially faster than this (naive, single-threaded) CPU implementation for the compute portion. When we add in the cost to transfer the data, the GPU version is slower but is not 30x slower.
By way of comparison, when I timed your original algorithm, I got numbers like this:
GPU total time = 0.118131s
GPU compute time = 0.093213s
My system config for this was a Xeon X5560 CPU, RHEL 5.5, CUDA 5.0, and a Quadro 5000 GPU.