A kernel with less thread divergence

A kernel with less thread divergence - c++

Expected value of result = 8. Received value of result= 1; Can pin point what is wrong on this? Result should have the value of 8 but it is printing out the value of 1. Can anyone help?
#include <stdio.h>`
#include <assert.h>
//define array size 8
#define ARRAY_SIZE 8
__global__ void vecAddKernel(int * A_d) {
//thread Index
unsigned int t = threadIdx.x;
for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2) {
__syncthreads();
if (t < stride)
A_d[t] += A_d[t + stride];
}
}
int main(int argc, char * * argv) {
int A_h[ARRAY_SIZE];
// initializing all values in A_h array to 1
for (int i = 0; i < ARRAY_SIZE; i++) {
A_h[i] = 1;
}
int * A_d, result;
// reserving size array A_d of 8 in cuda
cudaMalloc((void * * ) & A_d, ARRAY_SIZE * sizeof(int));
cudaMemcpy(A_d, A_h, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
vecAddKernel << < 1, ARRAY_SIZE / 2 >>> (A_d);
Copy the first index of A_d to the result.
cudaMemcpy( &result, &A_d[0], sizeof(int), cudaMemcpyDeviceToHost);
// outputting the value of result
printf("Result = %d\n", result);
//freeing the memory
cudaFree(A_d);
}

I'm not sure how you're getting Result = 1.
When I compile and run your code, I see Result = 4. That's because the initial value of stride in the loop inside the kernel should be blockDim.x rather than blockDim.x / 2 (the first iteration of the loop should add pairs of values separated by ARRAY_SIZE / 2, and blockDim.x is already ARRAY_SIZE / 2).
Replacing blockDim.x / 2 with blockDim.x in the initializer of unsigned int stride renders the program correct.
If you're interested in performing array reductions like this, you might want to look at __shfl_down and the other shuffle functions introduced with Kepler: https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/

Related

CUDA array filtering kernel without a for loop

I have a large array A with size_A rows and 6 columns. I am going to check the 3rd element of each row, and if that is not zero, copy the row into another array B. Can I have the index to the entries of B without using a for loop, please see the below code?
I probably would need to define b_ptr somehow to make it static (similar to the what we have in C), but I think that is not allowed.
__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
{
/*B and size_B are the outputs*/
int b_ptr = 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > size_A) return;
for (int i = 0; i < size_A; i++)
{
if (A[x + 3] != 0)
{
B[b_ptr] = A[x + 0];
B[b_ptr + 1] = A[x + 1];
B[b_ptr + 2] = A[x + 2];
B[b_ptr + 3] = A[x + 3];
B[b_ptr + 4] = A[x + 4];
B[b_ptr + 5] = A[x + 5];
b_ptr += 6;
*size_B = *size_B + 1;
}
}
}

The trick is to launch as many threads as there are elements in your array. If we assume tid (renamed from your x) ranges from 0 to size_A * 6, then we can remove the loop entirely. We do need to first determine what rows must be copied, so a shared array filter is introduced. Assuming you can fit int[size_A] into memory for a single block and have as many threads as entries, you can use the following code, with hints for how you might do this if size_A is big enough to need multiple blocks.
__global__ void filtering_kernel(float *A, const int size_A, const int W,
float *B, int *size_B) {
// We use this to store whether a given row is filtered,
// and then scan this array to tell us how densely packed B is.
extern __shared__ int filter[];
// Assuming 1 block
const int tid = threadIdx.x;
const int offset = 0;
// Multiblock difference
// tid = threadIdx.x
// offset = blockIdx.x * blockDim.x;
// Guard to ensure we are not out of range
if (offset + tid >= size_A * W)
return;
const int row = tid / W;
const int col = tid % W;
// NOTE: You have 3 in your sample code, but the third column is 2
const int mid = (W - 1)/2;
// Dedicate one thread per row to check
// whether we should filter
if (tid < size_A) {
// A boolean will be either 1 or 0
// Whatever filter criterion you want.
filter[tid] = A[offset + tid * W + mid] == 0;
}
// We then need to run a scan to get the cumulative sum
// of the filtered with a dedicated thread. If we consider
// good rows (g) and bad rows (b), for gggbbggbbggg we expect
// 1,2,3,3,3,4,5,5,5,6,7,8
for (int i = 1; i < size_A; i <<= 1) {
if (tid < size_A && tid >= i) {
filter[tid] += filter[tid - i];
}
__syncthreads();
}
__syncthreads();
// We should then only copy if the cumulative sum increases
// And handle for the case of the first row
// Note: If you are thread limited, you can do multiple copies here.
if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
B[offset + W * (filter[row] - 1) + col] = A[tid];
}
// Also set the expected size for B
if (tid == 0) {
*size_B = filter[size_A - 1];
printf("size_B %d\n", *size_B);
// Multiple blocks: size_B[blockIdx.x] = filtered[size_A - 1];
}
// TODO: For multiple blocks, we still need to densely pack B. (see below)
}
Continuing: as is, filtered needs to be shared across the kernel, so this only works within a single block. With multiple blocks, I would filter a portion of B per block (that is, keep the code above, changing where I note), record how much was filtered with size_B now being an array, cumulatively sum size_B, and then in-place copy B to be more dense (or download from device the dense parts from each portion using size_B).
From the comments, the invoking code:
int example(const float *arr, const size_t size_A, const size_t W ) {
float *d_A;
float *d_B;
cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
cudaMemset(d_B, 0, size_A * W * sizeof(float));
int *size_B;
cudaMalloc((void **)&size_B, sizeof(int));
cudaMemset(size_B, 0, sizeof(int));
cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
size_B);
cudaDeviceSynchronize();
printf("Error %s \n", cudaGetLastError());
int result;
cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
printf("Error %s \n", cudaGetLastError());
return result;
}
Which we can then test using GTEST:
TEST(FILTER, ROW6) {
size_t size_A = 100;
size_t W = 6;
float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
int expected = 0;
for (int i = 0; i < size_A * W; i++) {
arr[i] = i % 4;
if (i % W == 2 && arr[i] == 0)
expected++;
}
printf("Expected: %d\n", expected);
const int result = drt::example(arr, size_A, W);
ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
}

This problem is complicated and can't be done with CUDA in one step, you can't search for the desired rows and put them in array B hoping that they will be in the correct order, as CUDA kernels don't necessarily check the rows in order. However, there is a multi-step solution that can do the trick. First, you will run a kernel that will locate the zeros within the third column, whose index is 2 not 3 by the way, then mark these rows with value of 1 in an array P. After that, a simple for loop will count these locations and store them in another array Ind. Finally, a second kernel will copy the required rows from A to B.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);
int main()
{
int i, size_A, size_B;
size_t size;
int* P, * d_P, * Ind, * d_I;
float* A, * d_A, * B, * d_B;
size_A = ..; // specify number of rows of A
A = new float[size_A * 6];
// input values of array A
...
P = new int[size_A];
for (i = 0; i < size_A; i++)
P[i] = 0;
size = (uint64_t)size_A * 6 * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (uint64_t)size_A * sizeof(int);
cudaMalloc(&d_P, size);
cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
size_B = 0;
for (i = 0; i < size_A; i++)
if (P[i] == 1)
Ind[size_B++] = i;
Ind = new int[size_A];
size = (uint64_t)size_B * sizeof(int);
cudaMalloc(&d_I, size);
cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
B = new float[size_B * 6];
size = (uint64_t)size_B * 6 * sizeof(float);
cudaMalloc(&d_B, size);
dim3 dimBlock(170, 6); // to copy the full row at the same time, 6 * 170 < 1024
dim3 dimGrid((int)ceil(size_B / 170.0), 1);
filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
}
__global__ void get_indeces(float* A, int* P, int size_A)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < size_A && A[x * 6 + 2] == 0) // if you want to use return, it should be "if (x >= size_A) return;"
P[x] = 1;
}
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
{
int i;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = threadIdx.y;
if (x < size_B)
B[x * 6 + y] = A[Ind[x] * 6 + y];
}

CUDA: cascaded summation of all vector elements

I have implemented a cascaded addition function for a large vector of float values on my GPU and my CPU. That simply means that all elements of this vector shell be summed up into one result. The CPU algorithm is quite trivial and works fine, but the GPU algorithm is always 35200 off the desired result.
The minimal working code for the algorithm and comparison to the CPU is below.
The output is always this:
CPU Time: 22.760059 ms, bandwidth: 3.514929 GB/s
GPU Time (improved): 12.077088 ms, bandwidth: 6.624114 GB/s
- CPU result does not match GPU result in improved atomic add.
CPU: 10000000.000000, GPU: 10035200.000000, diff:-35200.000000
I checked it with cuda-memcheck but no errors occured in that run. I have tried many many different things but none of themworked. It if not due to the inaccuracy of the float datatype because I changed all floats to ints and still got the exact same result.
This is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
void reductionWithCudaImproved(float *result, const float *input);
__global__ void reductionKernelImproved(float *result, const float *input);
void reductionCPU(float *result, const float *input);
#define SIZE 10000000
#define TILE 32
#define ILP 8
#define BLOCK_X_IMPR (TILE / ILP)
#define BLOCK_Y_IMPR 32
#define BLOCK_COUNT_X_IMPR 100
int main()
{
int i;
float *input;
float resultCPU, resultGPU;
double cpuTime, cpuBandwidth;
input = (float*)malloc(SIZE * sizeof(float));
resultCPU = 0.0;
resultGPU = 0.0;
srand((int)time(NULL));
auto start = std::chrono::high_resolution_clock::now();
auto end = std::chrono::high_resolution_clock::now();
for (i = 0; i < SIZE; i++)
input[i] = 1.0;
start = std::chrono::high_resolution_clock::now();
reductionCPU(&resultCPU, input);
end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
cpuTime = (diff.count() * 1000);
cpuBandwidth = (sizeof(float) * SIZE * 2) / (cpuTime * 1000000);
printf("CPU Time: %f ms, bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);
reductionWithCudaImproved(&resultGPU, input);
if (resultCPU != resultGPU)
printf("- CPU result does not match GPU result in improved atomic add. CPU: %f, GPU: %f, diff:%f\n\n", resultCPU, resultGPU, (resultCPU - resultGPU));
else
printf("+ CPU result matches GPU result in improved atomic add. CPU: %f, GPU: %f\n\n", resultCPU, resultGPU);
return 0;
}
void reductionCPU(float *result, const float *input)
{
for (int i = 0; i < SIZE; i++)
*result += input[i];
}
__global__ void reductionKernelImproved(float *result, const float *input)
{
int i;
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
__shared__ float interResult;
if (threadIdx.x == 0 && threadIdx.y == 0)
interResult = 0.0;
__syncthreads();
#pragma unroll ILP
for (i = 0; i < ILP; i++)
{
if (index < SIZE)
{
atomicAdd(&interResult, input[index]);
index++;
}
}
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
atomicAdd(result, interResult);
}
void reductionWithCudaImproved(float *result, const float *input)
{
dim3 dim_grid, dim_block;
float *dev_input = 0;
float *dev_result = 0;
cudaEvent_t start, stop;
float elapsed = 0;
double gpuBandwidth;
dim_block.x = BLOCK_X_IMPR;
dim_block.y = BLOCK_Y_IMPR;
dim_block.z = 1;
dim_grid.x = BLOCK_COUNT_X_IMPR;
dim_grid.y = (int)ceil((float)SIZE / (float)(TILE * dim_block.y* BLOCK_COUNT_X_IMPR));
dim_grid.z = 1;
cudaSetDevice(0);
cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
cudaMalloc((void**)&dev_result, sizeof(float));
cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
reductionKernelImproved << <dim_grid, dim_block >> >(dev_result, dev_input);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
gpuBandwidth = (sizeof(float) * SIZE * 2) / (elapsed * 1000000);
printf("GPU Time (improved): %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
cudaDeviceSynchronize();
cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dev_input);
cudaFree(dev_result);
return;
}

I think you have overlapping indices in your kernel call:
int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * blockDim.x * BLOCK_COUNT_X_IMPR + col;
If I am not mistaken, your blockDim.x = 4 and BLOCK_COUNT_X_IMPR = 100, so each row will jump 400 indices.
However, your col can go as high as 400 * 8.
Consider:
blockIdx = (12, 0)
threadIdx = (3, 0)
=> col = (12*4 + 3) * 8 = 408
row = 0
index = 408
blockIdx = (0, 0)
threadIdx = (1, 1)
=> col = (0*4 + 1) * 8 = 8
row = 1
index = 1 * 400 + 8 = 408
So I guess you should rewrite your index
// gridDim.x = BLOCK_COUNT_X_IMPR
int index = row * blockDim.x * gridDim.x * ILP + col;

Bitonic sorting in cuda misorders some values

i'm making a sorting algorithm on CUDA for a bigger project and i decided implementing a Bitonic sorting. The number of elements i'll be sorting will be allways a power of two, in fact will be 512. I need an array which will have the final positions because this method will be used for ordering an array that represents the quality matrix of another solution.
fitness is the array i'll sort, numElements is the number of elements, and orden is initially an empty array with numElements positions which will be filled at the very beginning in this way: orden[i]=i. Actually orden is not relevant for this issue but I kept it.
My problem is that some values aren't sorted properly and until now i've been unable to figure out what problem do I have.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <ctime>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include <device_functions.h>
#include "float.h"
__global__ void sorting(int * orden, float * fitness, int numElements);
// Populating array with random values for testing purposes
__global__ void populate( curandState * state, float * fitness{
curandState localState = state[threadIdx.x];
int a = curand(&localState) % 500;
fitness[threadIdx.x] = a;
}
//Curand setup for the populate method
__global__ void setup_cuRand(curandState * state, unsigned long seed)
{
int id = threadIdx.x;
curand_init(seed, id, 0, &state[id]);
}
int main()
{
float * arrayx;
int numelements = 512;
int * orden;
float arrayCPU[512] = { 0 };
curandState * state;
cudaDeviceReset();
cudaSetDevice(0);
cudaMalloc(&state, numelements * sizeof(curandState));
cudaMalloc((void **)&arrayx, numelements*sizeof(float));
cudaMalloc((void **)&orden, numelements*sizeof(int));
setup_cuRand << <1, numelements >> >(state, unsigned(time(NULL)));
populate << <1, numelements >> > (state, arrayx);
cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < numelements; i++)
printf("fitness[%i] = %f\n", i, arrayCPU[i]);
sorting << <1, numelements >> >(orden, arrayx, numelements);
printf("\n\n");
cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < numelements; i++)
printf("fitness[%i] = %f\n", i, arrayCPU[i]);
cudaDeviceReset();
return 0;
}
__device__ bool isValid(float n){
return !(isnan(n) || isinf(n) || n != n || n <= FLT_MIN || n >= FLT_MAX);
}
__global__ void sorting(int * orden, float * fitness, int numElements){
int i = 0;
int j = 0;
float f = 0.0;
int aux = 0;
//initial orden registered (1, 2, 3...)
orden[threadIdx.x] = threadIdx.x;
//Logarithm on base 2 of numElements
for (i = 2; i <= numElements; i = i * 2){
// descending from i reducing to half each iteration
for (j = i; j >= 2; j = j / 2){
if (threadIdx.x % j < j / 2){
__syncthreads();
// ascending or descending consideration using (threadIdx.x % (i*2) < i)
if ((threadIdx.x % (i * 2) < i) && (fitness[threadIdx.x] > fitness[threadIdx.x + j / 2] || !isValid(fitness[threadIdx.x])) ||
((threadIdx.x % (i * 2) >= i) && (fitness[threadIdx.x] <= fitness[threadIdx.x + j / 2] || !isValid(fitness[threadIdx.x + j / 2])))){
aux = orden[threadIdx.x];
orden[threadIdx.x] = orden[threadIdx.x + j / 2];
orden[threadIdx.x + j / 2] = aux;
//Se reubican los fitness
f = fitness[threadIdx.x];
fitness[threadIdx.x] = fitness[threadIdx.x + j / 2];
fitness[threadIdx.x + j / 2] = f;
}
}
}
}
}
For example, an output i got on a random execution:
A random execution
This is a representation of my bitonic sorting:
Bitonic sorting Schema, the arrows point where the worst of the values compared goes to

Here are the issues I found:
In your posted code, this does not compile:
__global__ void populate( curandState * state, float * fitness{
^
missing close parenthesis
I added a close parenthesis there.
It's not necessary to take the address of the array in these cudaMemcpy statements:
cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
....
cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
the array name is already the address of the array, so I removed the ampersands. If you use a dynamically allocated array, such usage would be broken.
Your usage of __syncthreads() here is broken:
for (j = i; j >= 2; j = j / 2){
if (threadIdx.x % j < j / 2){
__syncthreads();
usage of __syncthreads() inside a conditional statement is generally incorrect unless the conditional statement evaluates uniformly across the threadblock. This is covered in the documentation. We can achieve the desired effect with a slight change:
for (j = i; j >= 2; j = j / 2){
__syncthreads();
if (threadIdx.x % j < j / 2){
With the above changes, your code appears to run correctly for me, for most cases. Your usage of FLT_MIN in your validity check is also questionable, if you intend 0 (or any negative values) to be sorted correctly. Speaking generally, FLT_MIN is a number that is very small, close to zero. If you were thinking that this is a large negative number, it is not. As a result, zero is a possible output of your random number generator, and it will not be sorted correctly. I'll leave this one to you to fix, it should be straightforward, but it will depend on what you ultimately want to achieve. (If you only want to sort positive non-zero floating point values, the test may be OK, but in this case your random number generator can return 0.)

Allocate 1 Dimension array with cudaMallocPitch and then copy to device with cudaMemcpy2D 3

I have read this post Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D among many others including NVIDIA docs and I can't get cudaMallocPitch to work together with cudaMemcpy2D.
I need to copy a very big matrix in an array format (Matrix[width*height]) along with a simple array to perform Matrix * vector operations. It is not optional for me to use cudaMallocPitch in order to avoid conflicts and have a better performance.
So, I started by just trying to copy the matrix (vector in my case) to the device and check if it was correctly copied but my code does not print anything. If I use cudaMalloc and cudaMemcpy everything works fine. But I do not know what to do with cudaMallocPitch and cudaMemcpy2D.
What can I do to fix this?
#include <stdio.h>
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while (idx < N)
{
double e = *(double *)(((char *) A + idx * mpitch) + N);
printf("(%f)", e);
}
}
int main()
{
int N = 1500;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaFree(d_A);
delete [] A;
return 0;
}

Error checking can make a big difference in debugging. You should always use it before coming here.
It wasn't clear if you wanted a row or column vector i.e. a matrix of [1xN] or [Nx1]
I've added an explanation on Talomnies suggestion, but first the 'working slabs of code'
Here's [Nx1]
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx>=N) return;
double e = *(double *)(((char *) A + idx * mpitch));
printf("(%f)", e);
}
int main()
{
int N = 15;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy2D(d_A, pitch, A, sizeof(double), sizeof(double), N, cudaMemcpyHostToDevice);
if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;
cudaFree(d_A);
delete [] A;
return 0;
}
[1xN]:
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx>=N) return;
int row=0;//only one row
double *row_ptr = (double *)( (char *) (A + mpitch * row) );
double e = row_ptr[idx];
printf("(%f)", e);
}
int main()
{
int N = 15;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy2D(d_A, pitch, A, sizeof(double)*N, sizeof(double)*N, 1, cudaMemcpyHostToDevice);
if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;
cudaFree(d_A);
delete [] A;
return 0;
}
Explanation
Firslty, Error Handling:
Considering how easy error handling is in CUDA there isn't a good excuse not to put it in.
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
Second, you didn't specify if you wanted a column vector or a row vector. Since a row vector is simply a 1-D array in linear memory and you don't need pitched memory to do that, I will assume for this explanation that you meant a column vector.
The reoccurring problem you were having was "misaligned address" in the kernel. This indicates that the problem is book-keeping, so lets walk through the three major steps of handling an aligned 2D array (even though our arrays will be either a column or row vector).
Allocating:
Your allocation was written out as
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
This is correct for the row vector as the API is cudaMallocPitch(void*** pointer, size_t* pitch_return, size_t row_width_in_bytes, size_t count_of_rows) However if we would like to do a column vector correct call is
cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
Accessing:
For accessing you were mixing up accessing a row, and accessing an element in the row.
double e = *(double *)(((char *) A + idx * mpitch) + N);
Once again stick to the documentation. The API documentation for cudaMallocPitch includes
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
for us this translates into
int column=0;
double element=(double*) ((char*)A + idx * mpitch) + column;
I've used column = 0 for completeness since we do not have more than one column.
Copying:
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
For this case this is correct. API for cudaMemcpy2D is
cudaMemcpy2D(void* destination, size_t pitch_from_mallocPitch, const void* source, size_t source_pitch_bytes, size_t src_width_in_bytes, size_t src_rows_count, enum type_of_xfer);

C/CUDA Program Output

The following is a CUDA programming example which is basically C but with NVidia CUDA functions within. I've been trying to interpret this code example and figure out what it is trying to do. My question is this the program compiles just fine, but what arguments does it take? For example this CUDA program is being run in a linux emulator however upon running ./program it returns:
Usage: ./program number
Segmentation fault
What are the programs input arguments. Thank you.
#include <assert.h>
#include <stdio.h>
//#define N 100000
__host__ void saxpy_host(int length, float alpha, float * x, float * y)
{
for (int i = 0; i < length; ++i)
y[i] = alpha*x[i] + y[i];
}
__global__ void saxpy (int length, float alpha, float * x, float * y)
{
int i;
i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < length) y[i] = alpha*x[i]+y[i];
__syncthreads();
}
int main(int argc, char* argv[]) {
if (argc != 2) {
printf("Usage: %s number\n", argv[0]);
return -1;
}
int N = atoi(argv[1]);
// host data
float alpha = 0.5;
float x[N], xback[N];
float y[N], yback[N];
int size;
int i;
int blocks;
// determining size
size = sizeof(float)*N;
// device data
float * dxp, * dyp;
// fill host data
for (i = 0; i < N; i++) {
x[i] = (float) (rand () % 128);
y[i] = (float) (rand () % 256);
}
// Allocating and Moving data to device
cudaMalloc((void**) &dxp, size);
cudaMalloc((void**) &dyp, size);
cudaMemcpy (dxp, x, size, cudaMemcpyHostToDevice);
cudaMemcpy (dyp, y, size, cudaMemcpyHostToDevice);
// size of thread blocks
blocks = (N + 31)/32;
saxpy <<< blocks, 32 >>> (N, alpha, dxp, dyp);
// bring back data
cudaMemcpy (xback, dxp, size, cudaMemcpyDeviceToHost);
cudaMemcpy (yback, dyp, size, cudaMemcpyDeviceToHost);
// Calculating host SAXPY
saxpy_host (N, alpha, (float *) &x, (float *) &y);
// checking computation on host matches computation on GPU
for (i = 0; i < N; i++) {
assert (yback[i] == y[i]) ;
//printf ("%i %f %f \n", i, yback[i], y[i]);
}
// free device data
cudaFree(dxp); cudaFree(dyp);
return 0;
}

int N = atoi(argv[1]);
The program takes a single integer as a command line argument. (Try calling it as ./program 5, for example.)
It then calculates a SAXPY (An old term originating from early BLAS implementations, but it stuck. It means "single (precision, aka float) real alpha x plus y".) with vectors of dimension N.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

A kernel with less thread divergence - c++

Related

CUDA array filtering kernel without a for loop

CUDA: cascaded summation of all vector elements

Bitonic sorting in cuda misorders some values

Allocate 1 Dimension array with cudaMallocPitch and then copy to device with cudaMemcpy2D 3

C/CUDA Program Output

Categories

Resources