CUDA: cascaded summation of all vector elements - c++

I have implemented a cascaded addition function for a large vector of float values on my GPU and my CPU. That simply means that all elements of this vector shell be summed up into one result. The CPU algorithm is quite trivial and works fine, but the GPU algorithm is always 35200 off the desired result.
The minimal working code for the algorithm and comparison to the CPU is below.
The output is always this:
CPU Time: 22.760059 ms, bandwidth: 3.514929 GB/s
GPU Time (improved): 12.077088 ms, bandwidth: 6.624114 GB/s
- CPU result does not match GPU result in improved atomic add.
CPU: 10000000.000000, GPU: 10035200.000000, diff:-35200.000000
I checked it with cuda-memcheck but no errors occured in that run. I have tried many many different things but none of themworked. It if not due to the inaccuracy of the float datatype because I changed all floats to ints and still got the exact same result.
This is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
void reductionWithCudaImproved(float *result, const float *input);
__global__ void reductionKernelImproved(float *result, const float *input);
void reductionCPU(float *result, const float *input);
#define SIZE 10000000
#define TILE 32
#define ILP 8
#define BLOCK_Y_IMPR 32
#define BLOCK_COUNT_X_IMPR 100
int main()
int i;
float *input;
float resultCPU, resultGPU;
double cpuTime, cpuBandwidth;
input = (float*)malloc(SIZE * sizeof(float));
resultCPU = 0.0;
resultGPU = 0.0;
auto start = std::chrono::high_resolution_clock::now();
auto end = std::chrono::high_resolution_clock::now();
for (i = 0; i < SIZE; i++)
input[i] = 1.0;
start = std::chrono::high_resolution_clock::now();
reductionCPU(&resultCPU, input);
end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
cpuTime = (diff.count() * 1000);
cpuBandwidth = (sizeof(float) * SIZE * 2) / (cpuTime * 1000000);
printf("CPU Time: %f ms, bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);
reductionWithCudaImproved(&resultGPU, input);
if (resultCPU != resultGPU)
printf("- CPU result does not match GPU result in improved atomic add. CPU: %f, GPU: %f, diff:%f\n\n", resultCPU, resultGPU, (resultCPU - resultGPU));
printf("+ CPU result matches GPU result in improved atomic add. CPU: %f, GPU: %f\n\n", resultCPU, resultGPU);
return 0;
void reductionCPU(float *result, const float *input)
for (int i = 0; i < SIZE; i++)
*result += input[i];
__global__ void reductionKernelImproved(float *result, const float *input)
int i;
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
__shared__ float interResult;
if (threadIdx.x == 0 && threadIdx.y == 0)
interResult = 0.0;
#pragma unroll ILP
for (i = 0; i < ILP; i++)
if (index < SIZE)
atomicAdd(&interResult, input[index]);
if (threadIdx.x == 0 && threadIdx.y == 0)
atomicAdd(result, interResult);
void reductionWithCudaImproved(float *result, const float *input)
dim3 dim_grid, dim_block;
float *dev_input = 0;
float *dev_result = 0;
cudaEvent_t start, stop;
float elapsed = 0;
double gpuBandwidth;
dim_block.x = BLOCK_X_IMPR;
dim_block.y = BLOCK_Y_IMPR;
dim_block.z = 1;
dim_grid.x = BLOCK_COUNT_X_IMPR;
dim_grid.y = (int)ceil((float)SIZE / (float)(TILE * dim_block.y* BLOCK_COUNT_X_IMPR));
dim_grid.z = 1;
cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
cudaMalloc((void**)&dev_result, sizeof(float));
cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);
reductionKernelImproved << <dim_grid, dim_block >> >(dev_result, dev_input);
cudaEventElapsedTime(&elapsed, start, stop);
gpuBandwidth = (sizeof(float) * SIZE * 2) / (elapsed * 1000000);
printf("GPU Time (improved): %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);

I think you have overlapping indices in your kernel call:
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
If I am not mistaken, your blockDim.x = 4 and BLOCK_COUNT_X_IMPR = 100, so each row will jump 400 indices.
However, your col can go as high as 400 * 8.
blockIdx = (12, 0)
threadIdx = (3, 0)
=> col = (12*4 + 3) * 8 = 408
row = 0
index = 408
blockIdx = (0, 0)
threadIdx = (1, 1)
=> col = (0*4 + 1) * 8 = 8
row = 1
index = 1 * 400 + 8 = 408
So I guess you should rewrite your index
// gridDim.x = BLOCK_COUNT_X_IMPR
int index = row * blockDim.x * gridDim.x * ILP + col;


CUDA array filtering kernel without a for loop

I have a large array A with size_A rows and 6 columns. I am going to check the 3rd element of each row, and if that is not zero, copy the row into another array B. Can I have the index to the entries of B without using a for loop, please see the below code?
I probably would need to define b_ptr somehow to make it static (similar to the what we have in C), but I think that is not allowed.
__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
/*B and size_B are the outputs*/
int b_ptr = 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > size_A) return;
for (int i = 0; i < size_A; i++)
if (A[x + 3] != 0)
B[b_ptr] = A[x + 0];
B[b_ptr + 1] = A[x + 1];
B[b_ptr + 2] = A[x + 2];
B[b_ptr + 3] = A[x + 3];
B[b_ptr + 4] = A[x + 4];
B[b_ptr + 5] = A[x + 5];
b_ptr += 6;
*size_B = *size_B + 1;
The trick is to launch as many threads as there are elements in your array. If we assume tid (renamed from your x) ranges from 0 to size_A * 6, then we can remove the loop entirely. We do need to first determine what rows must be copied, so a shared array filter is introduced. Assuming you can fit int[size_A] into memory for a single block and have as many threads as entries, you can use the following code, with hints for how you might do this if size_A is big enough to need multiple blocks.
__global__ void filtering_kernel(float *A, const int size_A, const int W,
float *B, int *size_B) {
// We use this to store whether a given row is filtered,
// and then scan this array to tell us how densely packed B is.
extern __shared__ int filter[];
// Assuming 1 block
const int tid = threadIdx.x;
const int offset = 0;
// Multiblock difference
// tid = threadIdx.x
// offset = blockIdx.x * blockDim.x;
// Guard to ensure we are not out of range
if (offset + tid >= size_A * W)
const int row = tid / W;
const int col = tid % W;
// NOTE: You have 3 in your sample code, but the third column is 2
const int mid = (W - 1)/2;
// Dedicate one thread per row to check
// whether we should filter
if (tid < size_A) {
// A boolean will be either 1 or 0
// Whatever filter criterion you want.
filter[tid] = A[offset + tid * W + mid] == 0;
// We then need to run a scan to get the cumulative sum
// of the filtered with a dedicated thread. If we consider
// good rows (g) and bad rows (b), for gggbbggbbggg we expect
// 1,2,3,3,3,4,5,5,5,6,7,8
for (int i = 1; i < size_A; i <<= 1) {
if (tid < size_A && tid >= i) {
filter[tid] += filter[tid - i];
// We should then only copy if the cumulative sum increases
// And handle for the case of the first row
// Note: If you are thread limited, you can do multiple copies here.
if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
B[offset + W * (filter[row] - 1) + col] = A[tid];
// Also set the expected size for B
if (tid == 0) {
*size_B = filter[size_A - 1];
printf("size_B %d\n", *size_B);
// Multiple blocks: size_B[blockIdx.x] = filtered[size_A - 1];
// TODO: For multiple blocks, we still need to densely pack B. (see below)
Continuing: as is, filtered needs to be shared across the kernel, so this only works within a single block. With multiple blocks, I would filter a portion of B per block (that is, keep the code above, changing where I note), record how much was filtered with size_B now being an array, cumulatively sum size_B, and then in-place copy B to be more dense (or download from device the dense parts from each portion using size_B).
From the comments, the invoking code:
int example(const float *arr, const size_t size_A, const size_t W ) {
float *d_A;
float *d_B;
cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
cudaMemset(d_B, 0, size_A * W * sizeof(float));
int *size_B;
cudaMalloc((void **)&size_B, sizeof(int));
cudaMemset(size_B, 0, sizeof(int));
cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
printf("Error %s \n", cudaGetLastError());
int result;
cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
printf("Error %s \n", cudaGetLastError());
return result;
Which we can then test using GTEST:
size_t size_A = 100;
size_t W = 6;
float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
int expected = 0;
for (int i = 0; i < size_A * W; i++) {
arr[i] = i % 4;
if (i % W == 2 && arr[i] == 0)
printf("Expected: %d\n", expected);
const int result = drt::example(arr, size_A, W);
ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
This problem is complicated and can't be done with CUDA in one step, you can't search for the desired rows and put them in array B hoping that they will be in the correct order, as CUDA kernels don't necessarily check the rows in order. However, there is a multi-step solution that can do the trick. First, you will run a kernel that will locate the zeros within the third column, whose index is 2 not 3 by the way, then mark these rows with value of 1 in an array P. After that, a simple for loop will count these locations and store them in another array Ind. Finally, a second kernel will copy the required rows from A to B.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);
int main()
int i, size_A, size_B;
size_t size;
int* P, * d_P, * Ind, * d_I;
float* A, * d_A, * B, * d_B;
size_A = ..; // specify number of rows of A
A = new float[size_A * 6];
// input values of array A
P = new int[size_A];
for (i = 0; i < size_A; i++)
P[i] = 0;
size = (uint64_t)size_A * 6 * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (uint64_t)size_A * sizeof(int);
cudaMalloc(&d_P, size);
cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
size_B = 0;
for (i = 0; i < size_A; i++)
if (P[i] == 1)
Ind[size_B++] = i;
Ind = new int[size_A];
size = (uint64_t)size_B * sizeof(int);
cudaMalloc(&d_I, size);
cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
B = new float[size_B * 6];
size = (uint64_t)size_B * 6 * sizeof(float);
cudaMalloc(&d_B, size);
dim3 dimBlock(170, 6); // to copy the full row at the same time, 6 * 170 < 1024
dim3 dimGrid((int)ceil(size_B / 170.0), 1);
filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
__global__ void get_indeces(float* A, int* P, int size_A)
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < size_A && A[x * 6 + 2] == 0) // if you want to use return, it should be "if (x >= size_A) return;"
P[x] = 1;
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
int i;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = threadIdx.y;
if (x < size_B)
B[x * 6 + y] = A[Ind[x] * 6 + y];

Why CUDA shared memory is slower than global memory in tiled matrix multiplication?

I have tiled matrix multiplication code with and without shared memory. Below is matrix multiplication using global memory:
void MatrixMulKernel(float* M, float* N, float* P, int Width)
int Row = blockIdx.y*blockDim.y + threadIdx.y;
int Col = blockIdx.x*blockDim.x + threadIdx.x;
if ((Row < Width) && (Col < Width)) {
float Pvalue = 0;
for (int k = 0; k < Width; ++k)
Pvalue += M[Row*Width + k] * N[k*Width + Col];
P[Row*Width + Col] = Pvalue;
Below is matrix multiplication using shared memory:
void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width)
__shared__ float Mds[blockWidth][blockWidth];
__shared__ float Nds[blockWidth][blockWidth];
int tx = threadIdx.x; int ty = threadIdx.y;
int bx = blockIdx.x; int by = blockIdx.y;
int row = by * blockWidth + ty;
int col = bx * blockWidth + tx;
float pvalue = 0;
for (int m = 0; m < Width / blockWidth; ++m)
Mds[ty][tx] = d_M[row * Width + m*blockWidth + tx];
Nds[ty][tx] = d_N[(m*blockWidth + ty)*Width + col];
for (int k = 0; k < blockWidth; ++k)
pvalue += Mds[ty][k]*Nds[k][tx];
d_P[row*Width + col] = pvalue;
As much as I know using shared memory should be faster but in comparing this two codes I found code without shared memory runs about 2 seconds faster for 1600*1600 matrixes. Is there any explanation for this speed difference or something goes wrong with my code?
My teacher uses "Programming Massively Parallel Processors" Book as main text resource these two codes comes from that.
Configuration for Kernel:
int NumBlocks =ceil( Width / blockWidth); // blockWidth = 16
dim3 dimGrid(NumBlocks, NumBlocks,1); // Width = 1600
dim3 dimBlock(blockWidth, blockWidth,1);
clock_t startGpuCalculation = clock();
MatrixMulKernel <<<dimGrid, dimBlock >>>(d_M, d_N, d_P, Width);
clock_t endGpuCalculation = clock();
I was Running Project In Debug Mode (VS 2017 & CUDA 9). I Run Code in Release Mode and Shared Memory Is Much Faster Than Global Memory. My Bad.

CUDA shuffle instruction reduction slower than shared memory reduction?

Shuffle instruction based warp reduction is expected to perform faster reduction than reduction using shared memory or global memory, as mentioned in -
In the following code, I tried to validate this:-
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
__inline__ __device__
float warpReduceSum(float val) {
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
__inline__ __device__
float blockReduceSum(float val) {
static __shared__ int shared[32];
int lane = threadIdx.x%32;
int wid = threadIdx.x / 32;
val = warpReduceSum(val);
//write reduced value to shared memory
if (lane == 0) shared[wid] = val;
//ensure we only grab a value from shared memory if that warp existed
val = (threadIdx.x<blockDim.x / 32) ? shared[lane] : int(0);
if (wid == 0) val = warpReduceSum(val);
return val;
__global__ void device_reduce_stable_kernel(float *in, float* out, int N) {
float sum = int(0);
//printf("value = %d ", blockDim.x*gridDim.x);
for (int i = blockIdx.x*blockDim.x + threadIdx.x; i<N; i += blockDim.x*gridDim.x) {
sum += in[i];
sum = blockReduceSum(sum);
if (threadIdx.x == 0)
out[blockIdx.x] = sum;
void device_reduce_stable(float *in, float* out, int N) {
//int threads = 512;
//int blocks = min((N + threads - 1) / threads, 1024);
const int maxThreadsPerBlock = 1024;
int threads = maxThreadsPerBlock;
int blocks = N / maxThreadsPerBlock;
device_reduce_stable_kernel << <blocks, threads >> >(in, out, N);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
device_reduce_stable_kernel << <1, 1024 >> >(out, out, blocks);
//cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
__global__ void global_reduce_kernel(float * d_out, float * d_in)
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
// do reduction in global mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
if (tid < s)
d_in[myId] += d_in[myId + s];
__syncthreads(); // make sure all adds at one stage are done!
// only thread 0 writes result for this block back to global mem
if (tid == 0)
d_out[blockIdx.x] = d_in[myId];
__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
// sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
extern __shared__ float sdata[];
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
// load shared mem from global mem
sdata[tid] = d_in[myId];
__syncthreads(); // make sure entire block is loaded!
// do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
if (tid < s)
sdata[tid] += sdata[tid + s];
__syncthreads(); // make sure all adds at one stage are done!
// only thread 0 writes result for this block back to global mem
if (tid == 0)
d_out[blockIdx.x] = sdata[0];
void reduce(float * d_out, float * d_intermediate, float * d_in,
int size, bool usesSharedMemory)
// assumes that size is not greater than maxThreadsPerBlock^2
// and that size is a multiple of maxThreadsPerBlock
const int maxThreadsPerBlock = 1024;
int threads = maxThreadsPerBlock;
int blocks = size / maxThreadsPerBlock;
if (usesSharedMemory)
shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >
(d_intermediate, d_in);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
global_reduce_kernel << <blocks, threads >> >
(d_intermediate, d_in);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
// now we're down to one block left, so reduce it
threads = blocks; // launch one thread for each block in prev step
blocks = 1;
if (usesSharedMemory)
shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >
(d_out, d_intermediate);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
global_reduce_kernel << <blocks, threads >> >
(d_out, d_intermediate);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
int main()
/*int deviceCount;
if (deviceCount == 0) {
fprintf(stderr, "error: no devices supporting CUDA.\n");
int dev = 0;
cudaDeviceProp devProps;
if (cudaGetDeviceProperties(&devProps, dev) == 0)
printf("Using device %d:\n", dev);
printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",, (int)devProps.totalGlobalMem,
(int)devProps.major, (int)devProps.minor,
const int ARRAY_SIZE = 2048;
const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
// generate the input array on the host
float h_in[ARRAY_SIZE];
float sum = 0.0f;
for (int i = 0; i < ARRAY_SIZE; i++) {
// generate random float in [-1.0f, 1.0f]
h_in[i] = i;
sum += h_in[i];
// declare GPU memory pointers
float * d_in, *d_intermediate, *d_out;
// allocate GPU memory
cudaMalloc((void **)&d_in, ARRAY_BYTES);
cudaMalloc((void **)&d_intermediate, ARRAY_BYTES); // overallocated
cudaMalloc((void **)&d_out, sizeof(float));
// transfer the input array to the GPU
cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
int whichKernel = 2;
cudaEvent_t start, stop;
// launch the kernel
switch (whichKernel) {
case 0:
printf("Running global reduce\n");
cudaEventRecord(start, 0);
//for (int i = 0; i < 100; i++)
reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
cudaEventRecord(stop, 0);
case 1:
printf("Running reduce with shared mem\n");
cudaEventRecord(start, 0);
//for (int i = 0; i < 100; i++)
reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
cudaEventRecord(stop, 0);
case 2:
printf("Running reduce with shuffle instruction\n");
cudaEventRecord(start, 0);
/*for (int i = 0; i < 100; i++)
device_reduce_stable(d_in, d_out, ARRAY_SIZE);
cudaEventRecord(stop, 0);
fprintf(stderr, "error: ran no kernel\n");
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
elapsedTime /= 100.0f; // 100 trials
// copy back the sum from GPU
float h_out;
cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
printf("average time elapsed: %f\n", elapsedTime);
// free GPU memory allocation
return 0;
The results showed that warp based reduction took nearly twice the time of shared memory based reduction. These results contradict the behavior expected.
The experiment was performed on Tesla K40c with Compute capability higher than 3.0.
I'm comparing the following two reduction kernels, one using only shared memory WITHOUT using warp shuffling for the last warp reduction stage (version4) and one using shared memory AND warp shuffling for the last warp reduction stage (version5).
template <class T>
__global__ void version4(T *g_idata, T *g_odata, unsigned int N)
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x; // --- Local thread index
unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; // --- Global thread index - Fictitiously double the block dimension
// --- Performs the first level of reduction in registers when reading from global memory.
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
sdata[tid] = mySum;
// --- Before going further, we have to make sure that all the shared memory loads have been completed
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
// --- Single warp reduction by loop unrolling. Assuming blockDim.x >64
if (tid < 32) {
sdata[tid] = mySum = mySum + sdata[tid + 32]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 16]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 8]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 4]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 2]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 1]; __syncthreads();
// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = mySum;
template <class T>
__global__ void version5(T *g_idata, T *g_odata, unsigned int N)
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x; // --- Local thread index
unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; // --- Global thread index - Fictitiously double the block dimension
// --- Performs the first level of reduction in registers when reading from global memory.
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
sdata[tid] = mySum;
// --- Before going further, we have to make sure that all the shared memory loads have been completed
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
// --- Single warp reduction by shuffle operations
if (tid < 32)
// --- Last iteration removed from the for loop, but needed for shuffle reduction
mySum += sdata[tid + 32];
// --- Reduce final warp using shuffle
//for (int offset = warpSize / 2; offset > 0; offset /= 2) mySum += __shfl_down_sync(0xffffffff, mySum, offset);
for (int offset=1; offset < warpSize; offset *= 2) mySum += __shfl_xor_sync(0xffffffff, mySum, i);
// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = mySum;
I confirm that there is no sensitive difference between the two. On my GTX920M card, the timing have been the following:
N = 33554432
version4 = 27.5ms
version5 = 27.095ms
So, I'm confirming Robert's comment above.

CUDA fails to run / copy memory

I am writing a program to do several copies of an integration in parallel to speed up its evaluation for say, 50000 different inputs.The code generally takes about 0.0005 sec to run on C++ on a single CPU core and its internal loops in my tests run for around 3000 cycles, so I think it may not be too complicated to run on a gpu thread. Also, I use Visual Studio 2013 and GTX 860m for writing my programs and do not have compiling other programs written in CUDA C.
here is my code: (you can get the data file from here ElCentro-import2.txt)
#include <stdio.h>
#include <math.h>
#include <fstream>
#include <cmath>
#include <cuda_runtime.h>
using namespace std;
__global__ void
vectorAdd(const float *A, const float *p, float *u, float *v, float *a, float *fs, float *rhat, float *phat, int j, int numElements)
int ni = blockDim.x * blockIdx.x + threadIdx.x;
int ti = 10000 * ni;
float t = A[ni];
float m = 1.0f;
float epsilon = 10.0f;
float gamma = 0.5f;
float beta = 1.0f / 4.0f;
float pi = 3.14159f;
float ksy = 0.04f;
float dt = 0.02f;
float fy = 1000.0f;
float c = 4.0f * ksy * pi / t;
float a1 = m / (beta * dt * dt) + gamma * c / (beta * dt);
float a2 = m / (beta * dt) + c*(gamma / beta - 1.0f);
float a3 = (1.0f / (2.0f *beta) - 1.0f)* m + dt*(gamma / (2.0f *beta) - 1.0f) *c;
float k = m * (2.0f*pi / t)*(2.0f*pi / t);
float fz;
float ab;
if (ni < numElements)
v[10000 * ni] = 0.0f;
u[10000 * ni] = 0.0f;
fs[10000 * ni] = 0.0f;
a[10000 * ni] = 0.0f;
for (size_t i = 0 + 10000 * ni; i < j-1 + 10000 * ni; i++)
u[i + 1] = u[i];
fs[i + 1] = fs[i];
phat[i + 1] = p[i + 1] + a1 * u[i] + a2*v[i] + a3*a[i];
rhat[i + 1] = phat[i + 1] - fs[i + 1] - a1 * u[i + 1];
ab = std::fabsf(rhat[i + 1]);
while (ab >= epsilon)
u[i + 1] = u[i + 1] + rhat[i + 1] / (k + a1);
fz = fs[i] + k*(u[i + 1] - u[i]);
if (fz > 0.0)
fs[i + 1] = fminf(fz, fy);
fs[i + 1] = fmaxf(fz, -fy);
rhat[i + 1] = phat[i + 1] - fs[i + 1] - a1 * u[i + 1];
ab = std::fabsf(rhat[i + 1]);
v[i + 1] = gamma*(u[i + 1] - u[i]) / beta / dt + (1.0f - gamma / beta)*v[i] + dt*(1.0f - gamma / 2.0f / beta)*a[i];
a[i + 1] = (u[i + 1] - u[i]) / beta / dt / dt - (1.0f / beta / dt)*v[i] + (1.0f - 1.0f / 2.0f / beta)*a[i];
int numElements = 16;
int kore;
FILE* myfile;
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
float *h_data = (float *)malloc(10000);
float *h_datat = (float *)malloc(10000);
myfile = fopen("ElCentro-import2.txt", "r");
std::cout << "file is opened\n";
kore = 0;
while (EOF != fscanf(myfile, "%f %f \n", &h_datat[kore], &h_data[kore]))
std::cout << kore << "file is read\n";
size_t nsize = 10000 * numElements * sizeof(float);
float *h_A = (float *)malloc(size);
float *h_u = (float *)malloc(nsize);
float *h_v = (float *)malloc(nsize);
float *h_a = (float *)malloc(nsize);
float *h_p = (float *)malloc(nsize);
for (int i = 0; i < kore; ++i)
h_p[i] = -10000 * h_data[i];
for (int i = 0; i < numElements; ++i)
h_A[i] = 1.0f;
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);
float *d_p = NULL;
err = cudaMalloc((void **)&d_p, nsize);
float *d_u = NULL;
err = cudaMalloc((void **)&d_u, nsize);
float *d_v = NULL;
err = cudaMalloc((void **)&d_v, nsize);
float *d_a = NULL;
err = cudaMalloc((void **)&d_a, nsize);
float *d_fs = NULL;
err = cudaMalloc((void **)&d_fs, nsize);
float *d_rhat = NULL;
err = cudaMalloc((void **)&d_rhat, nsize);
float *d_phat = NULL;
err = cudaMalloc((void **)&d_phat, nsize);
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_p, h_p, nsize, cudaMemcpyHostToDevice);
int threadsPerBlock = 1;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
vectorAdd <<<blocksPerGrid, threadsPerBlock >>>(d_A, d_p, d_u, d_v, d_a, d_fs, d_rhat, d_phat, kore-1, numElements);
err = cudaGetLastError();
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_u, d_u, nsize, cudaMemcpyDeviceToHost);
for (int i = 0; i < numElements; ++i)
std::cout << h_u[1000+kore * i] << "\n";;
printf("Test PASSED\n");
err = cudaFree(d_A);
err = cudaFree(d_p);
err = cudaFree(d_u);
err = cudaFree(d_v);
err = cudaFree(d_a);
err = cudaFree(d_fs);
err = cudaFree(d_rhat);
err = cudaFree(d_phat);
err = cudaDeviceReset();
return 0;
My problem is that when I try to copy the results from the device to host (say copy d_u to h_u) it genrates my error checking phrase:
Failed to copy vector u from device to host (error code uspecified launch failure)
Also, if i move the kernel sync code (cudaDeviceSynchronize()) to after the kernel call, it also generates an error report about kernel launch. I am quite new to C++ and CUDA programming and this problem has got me confused for several days.
This part should print the same value (~21.8) for several times (for numElements>1)
for (int i = 0; i < numElements; ++i)
std::cout << h_u[1000+kore * i] << "\n";;
if I set the value of numElements equal to 1, the code runs well, but this is not the point in parallel computing. I also cheked the GPU stats in GPU-Z and memory utilization is less than 1 MB and maximum GPU load is less than 1%. (Release Build)
I have changed the build from Debug to Release and the program runs well! Also by increasing the GPU time out setting in window registry (WDDM) this problem has got nearly eliminated.

CUDA: please help me to find error in my code

There's code, that uses GPU:
__global__ void gpu_process(float* input, float* weights, float* output, int psize, int size)
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < psize && j < size)
output[j] += input[i] * weights[i * size + j];
void process(float* input, float* weights, float* output, size_t psize, size_t size)
float* in_d, *w_d, *out_d;
cudaMalloc((void**)&in_d, psize * sizeof(float));
cudaMalloc((void**)&w_d, psize * size * sizeof(float));
cudaMalloc((void**)&out_d, size * sizeof(float));
for(size_t i = 0; i < size; i++)
output[i] = 0;
cudaMemcpy(in_d, input, psize * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(w_d, weights, psize * size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(out_d, output, size * sizeof(float), cudaMemcpyHostToDevice);
int rx = psize, ry = size, block_x = min((int)psize, 32), block_y = min((int)size, 32);
dim3 dimBlock(block_x, block_y);
dim3 dimGrid(ceil(float(rx) / block_x), ceil(float(ry) / block_y));
gpu_process<<<dimGrid, dimBlock>>>(in_d, w_d, out_d, psize, size);
cudaMemcpy(output, out_d, size * sizeof(float), cudaMemcpyDeviceToHost);
There's code, that do the same thing, but uses only CPU:
int blockIdxx, blockIdxy, blockDimx, blockDimy, threadIdxx, threadIdxy;
void cpu_process(float* input, float* weights, float* output, int psize, int size)
int i = blockIdxx*blockDimx + threadIdxx;
int j = blockIdxy*blockDimy + threadIdxy;
if(i < psize && j < size)
output[j] += input[i] * weights[i * size + j];
void process(float* input, float* weights, float* output, size_t psize, size_t size)
for(size_t i = 0; i < size; i++)
output[i] = 0;
int rx = psize, ry = size, block_x = min((int)psize, 32), block_y = min((int)size, 32);
blockDimx = block_x;
blockDimy = block_y;
int gridDimx = ceil(float(rx) / block_x), gridDimy = ceil(float(ry) / block_y);
for(blockIdxx = 0; blockIdxx < gridDimx; blockIdxx++)
for(blockIdxy = 0; blockIdxy < gridDimy; blockIdxy++)
for(threadIdxx = 0; threadIdxx < blockDimx; threadIdxx++)
for(threadIdxy = 0; threadIdxy < blockDimy; threadIdxy++)
cpu_process(input, weights, output, psize, size);
Why CPU variant works correctly but GPU variant returns garbage in output? What differs in
Version of cuda-toolkit: 4.0
OS: Debian GNU/Linux, cuda installed from it's repositories.
GPU: NVIDIA GeForce GT 525M.
cudaThreadSyncronize is deprecated and should not be used, instead use cudaDeviceSyncronize, check the error codes of these, since they will return an error if a thread has failed. These also block all code thereafter until the task is completed, so you could also add some timing code inbetween to find bottlenecks.