CUDA Zero Copy vs. CudaMemcpy on Jetson TK1 - c++

My Question:
I am looking for someone to either point out a mistake in the way I am attempting to implement zero-copy in CUDA, or to reveal a more 'behind the scenes' perspective on why the zero-copy method would not be faster than the memcpy method. By the way, I am performing my tests on NVIDIA's TK1 processor, using Ubuntu.
My problem has to do with efficiently using the NVIDIA TK1's (physically) unified memory architecture with CUDA. NVIDIA provides two methods for abstracting GPU/CPU memory transfers:
Unified Memory abstraction (using cudaHostAlloc & cudaHostGetDevicePointer)
Explicit copy to the device and back from the device (using cudaMalloc() & cudaMemcpy)
Short description of my test code: I test the same CUDA kernel using both methods 1 and 2. I expected method 1 to be faster given that there is no copy of the source data to the device, nor a copy of the result data from the device. However, the results are the opposite of my assumption (method 1 is 50% slower). Below is my code for this test:
#include <libfreenect/libfreenect.hpp>
#include <iostream>
#include <vector>
#include <cmath>
#include <pthread.h>
#include <cxcore.h>
#include <time.h>
#include <sys/time.h>
#include <memory.h>
///CUDA///
#include <cuda.h>
#include <cuda_runtime.h>
///OpenCV 2.4
#include <highgui.h>
#include <cv.h>
#include <opencv2/gpu/gpu.hpp>
using namespace cv;
using namespace std;
///The Test Kernel///
__global__ void cudaCalcXYZ( float *dst, float *src, float *M, int height, int width, float scaleFactor, int minDistance)
{
float nx,ny,nz, nzpminD, jFactor;
int heightCenter = height / 2;
int widthCenter = width / 2;
//int j = blockIdx.x; //Represents which row we are in
int index = blockIdx.x*width;
jFactor = (blockIdx.x - heightCenter)*scaleFactor;
for(int i= 0; i < width; i++)
{
nz = src[index];
nzpminD = nz + minDistance;
nx = (i - widthCenter )*(nzpminD)*scaleFactor;
ny = (jFactor)*(nzpminD);
//Solve for only the Y matrix (height values)
dst[index++] = nx*M[4] + ny*M[5] + nz*M[6];
//dst[index++] = 1 + 2 + 3;
}
}
//Function fwd declarations
double getMillis();
double getMicros();
void runCudaTestZeroCopy(int iter, int cols, int rows);
void runCudaTestDeviceCopy(int iter, int cols, int rows);
int main(int argc, char **argv) {
//ZERO COPY FLAG (allows runCudaTestZeroCopy to run without fail)
cudaSetDeviceFlags(cudaDeviceMapHost);
//Runs kernel using explicit data copy to 'device' and back from 'device'
runCudaTestDeviceCopy(20, 640,480);
//Uses 'unified memory' cuda abstraction so device can directly work from host data
runCudaTestZeroCopy(20,640, 480);
std::cout << "Stopping test" << std::endl;
return 0;
}
void runCudaTestZeroCopy(int iter, int cols, int rows)
{
cout << "CUDA Test::ZEROCOPY" << endl;
int src_rows = rows;
int src_cols = cols;
int m_rows = 4;
int m_cols = 4;
int dst_rows = src_rows;
int dst_cols = src_cols;
//Create and allocate memory for host mats pointers
float *psrcMat;
float *pmMat;
float *pdstMat;
cudaHostAlloc((void **)&psrcMat, src_rows*src_cols*sizeof(float), cudaHostAllocMapped);
cudaHostAlloc((void **)&pmMat, m_rows*m_cols*sizeof(float), cudaHostAllocMapped);
cudaHostAlloc((void **)&pdstMat, dst_rows*dst_cols*sizeof(float), cudaHostAllocMapped);
//Create mats using host pointers
Mat src_mat = Mat(cvSize(src_cols, src_rows), CV_32FC1, psrcMat);
Mat m_mat = Mat(cvSize(m_cols, m_rows), CV_32FC1, pmMat);
Mat dst_mat = Mat(cvSize(dst_cols, dst_rows), CV_32FC1, pdstMat);
//configure src and m mats
for(int i = 0; i < src_rows*src_cols; i++)
{
psrcMat[i] = (float)i;
}
for(int i = 0; i < m_rows*m_cols; i++)
{
pmMat[i] = 0.1234;
}
//Create pointers to dev mats
float *d_psrcMat;
float *d_pmMat;
float *d_pdstMat;
//Map device to host pointers
cudaHostGetDevicePointer((void **)&d_psrcMat, (void *)psrcMat, 0);
//cudaHostGetDevicePointer((void **)&d_pmMat, (void *)pmMat, 0);
cudaHostGetDevicePointer((void **)&d_pdstMat, (void *)pdstMat, 0);
//Copy matrix M to device
cudaMalloc( (void **)&d_pmMat, sizeof(float)*4*4 ); //4x4 matrix
cudaMemcpy( d_pmMat, pmMat, sizeof(float)*m_rows*m_cols, cudaMemcpyHostToDevice);
//Additional Variables for kernels
float scaleFactor = 0.0021;
int minDistance = -10;
//Run kernel! //cudaSimpleMult( float *dst, float *src, float *M, int width, int height)
int blocks = src_rows;
const int numTests = iter;
double perfStart = getMillis();
for(int i = 0; i < numTests; i++)
{
//cudaSimpleMult<<<blocks,1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_cols, src_rows);
cudaCalcXYZ<<<blocks,1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_rows, src_cols, scaleFactor, minDistance);
cudaDeviceSynchronize();
}
double perfStop = getMillis();
double perfDelta = perfStop - perfStart;
cout << "Ran " << numTests << " iterations totaling " << perfDelta << "ms" << endl;
cout << " Average time per iteration: " << (perfDelta/(float)numTests) << "ms" << endl;
//Copy result back to host
//cudaMemcpy(pdstMat, d_pdstMat, sizeof(float)*src_rows*src_cols, cudaMemcpyDeviceToHost);
//cout << "Printing results" << endl;
//for(int i = 0; i < 16*16; i++)
//{
// cout << "src[" << i << "]= " << psrcMat[i] << " dst[" << i << "]= " << pdstMat[i] << endl;
//}
cudaFree(d_psrcMat);
cudaFree(d_pmMat);
cudaFree(d_pdstMat);
cudaFreeHost(psrcMat);
cudaFreeHost(pmMat);
cudaFreeHost(pdstMat);
}
void runCudaTestDeviceCopy(int iter, int cols, int rows)
{
cout << "CUDA Test::DEVICE COPY" << endl;
int src_rows = rows;
int src_cols = cols;
int m_rows = 4;
int m_cols = 4;
int dst_rows = src_rows;
int dst_cols = src_cols;
//Create and allocate memory for host mats pointers
float *psrcMat;
float *pmMat;
float *pdstMat;
cudaHostAlloc((void **)&psrcMat, src_rows*src_cols*sizeof(float), cudaHostAllocMapped);
cudaHostAlloc((void **)&pmMat, m_rows*m_cols*sizeof(float), cudaHostAllocMapped);
cudaHostAlloc((void **)&pdstMat, dst_rows*dst_cols*sizeof(float), cudaHostAllocMapped);
//Create pointers to dev mats
float *d_psrcMat;
float *d_pmMat;
float *d_pdstMat;
cudaMalloc( (void **)&d_psrcMat, sizeof(float)*src_rows*src_cols );
cudaMalloc( (void **)&d_pdstMat, sizeof(float)*src_rows*src_cols );
cudaMalloc( (void **)&d_pmMat, sizeof(float)*4*4 ); //4x4 matrix
//Create mats using host pointers
Mat src_mat = Mat(cvSize(src_cols, src_rows), CV_32FC1, psrcMat);
Mat m_mat = Mat(cvSize(m_cols, m_rows), CV_32FC1, pmMat);
Mat dst_mat = Mat(cvSize(dst_cols, dst_rows), CV_32FC1, pdstMat);
//configure src and m mats
for(int i = 0; i < src_rows*src_cols; i++)
{
psrcMat[i] = (float)i;
}
for(int i = 0; i < m_rows*m_cols; i++)
{
pmMat[i] = 0.1234;
}
//Additional Variables for kernels
float scaleFactor = 0.0021;
int minDistance = -10;
//Run kernel! //cudaSimpleMult( float *dst, float *src, float *M, int width, int height)
int blocks = src_rows;
double perfStart = getMillis();
for(int i = 0; i < iter; i++)
{
//Copy from host to device
cudaMemcpy( d_psrcMat, psrcMat, sizeof(float)*src_rows*src_cols, cudaMemcpyHostToDevice);
cudaMemcpy( d_pmMat, pmMat, sizeof(float)*m_rows*m_cols, cudaMemcpyHostToDevice);
//Run Kernel
//cudaSimpleMult<<<blocks,1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_cols, src_rows);
cudaCalcXYZ<<<blocks,1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_rows, src_cols, scaleFactor, minDistance);
//Copy from device to host
cudaMemcpy( pdstMat, d_pdstMat, sizeof(float)*src_rows*src_cols, cudaMemcpyDeviceToHost);
}
double perfStop = getMillis();
double perfDelta = perfStop - perfStart;
cout << "Ran " << iter << " iterations totaling " << perfDelta << "ms" << endl;
cout << " Average time per iteration: " << (perfDelta/(float)iter) << "ms" << endl;
cudaFree(d_psrcMat);
cudaFree(d_pmMat);
cudaFree(d_pdstMat);
cudaFreeHost(psrcMat);
cudaFreeHost(pmMat);
cudaFreeHost(pdstMat);
}
//Timing functions for performance measurements
double getMicros()
{
timespec ts;
//double t_ns, t_s;
long t_ns;
double t_s;
clock_gettime(CLOCK_MONOTONIC, &ts);
t_s = (double)ts.tv_sec;
t_ns = ts.tv_nsec;
//return( (t_s *1000.0 * 1000.0) + (double)(t_ns / 1000.0) );
return ((double)t_ns / 1000.0);
}
double getMillis()
{
timespec ts;
double t_ns, t_s;
clock_gettime(CLOCK_MONOTONIC, &ts);
t_s = (double)ts.tv_sec;
t_ns = (double)ts.tv_nsec;
return( (t_s * 1000.0) + (t_ns / 1000000.0) );
}
I have already seen the post Cuda zero-copy performance, but I feel it is not relevant here for the following reason: the GPU and CPU have a physically unified memory architecture.
Thanks

When you are using zero-copy, each read goes through a path where the memory unit fetches the data from system memory, and this operation has some latency.
When using device memory directly, the memory unit gathers data from global memory, which has a different access pattern and latency.
Actually seeing this difference would require some form of profiling.
Nonetheless, your kernel launch uses only a single thread per block:
cudaCalcXYZ<<< blocks,1 >>> (...
In this case, the GPU has little ability to hide latency when memory is gathered from system memory (or global memory). I would recommend you use more threads per block (some multiple of 64, at least 128 total), and run the profiler on it to get the cost of memory access. Your algorithm seems separable, so modifying the loop from
for(int i= 0; i < width; i++)
to
for (int i = threadIdx.x ; i < width ; i += blockDim.x)
will probably increase performance overall.
The image is 640 pixels wide, which turns into 5 loop iterations per thread with 128 threads.
cudaCalcXYZ<<< blocks,128 >>> (...
I believe it would result in some performance increase.
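As a rough sketch (not tested on the TK1, and only changing what was suggested above), the kernel with the strided loop might look like this:

__global__ void cudaCalcXYZ( float *dst, float *src, float *M, int height, int width, float scaleFactor, int minDistance)
{
    int heightCenter = height / 2;
    int widthCenter  = width / 2;
    int rowStart = blockIdx.x * width;                       // one block still handles one row
    float jFactor = (blockIdx.x - heightCenter) * scaleFactor;
    // The threads of the block stride across the row instead of one thread doing all of it
    for (int i = threadIdx.x; i < width; i += blockDim.x)
    {
        float nz = src[rowStart + i];
        float nzpminD = nz + minDistance;
        float nx = (i - widthCenter) * nzpminD * scaleFactor;
        float ny = jFactor * nzpminD;
        dst[rowStart + i] = nx*M[4] + ny*M[5] + nz*M[6];
    }
}

Launched as cudaCalcXYZ<<<blocks,128>>>(...), each of the 128 threads handles every 128th column, which keeps the accesses coalesced and gives the scheduler enough in-flight work to overlap with the memory latency.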

The zero-copy feature allows the device to operate on data without manually copying it to device memory with cudaMemcpy. Zero-copy memory only passes a host address to the device, which the kernel then reads and writes directly. So the more thread blocks you launch, the more of those reads and writes are in flight at once, and the better the latency of going through the host address is hidden. As a result, you get a better performance gain than if you only launch a few thread blocks.
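For illustration only (a minimal sketch, not tested on the TK1, assuming cudaSetDeviceFlags(cudaDeviceMapHost) has already been called as in the question's main(), and assuming the strided kernel from the answer above), the zero-copy path with a wide launch would look roughly like this:

float *pSrc, *pDst, *d_pSrc, *d_pDst;
cudaHostAlloc((void **)&pSrc, 640*480*sizeof(float), cudaHostAllocMapped);
cudaHostAlloc((void **)&pDst, 640*480*sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer((void **)&d_pSrc, (void *)pSrc, 0);
cudaHostGetDevicePointer((void **)&d_pDst, (void *)pDst, 0);
// d_pmMat is assumed to hold the 4x4 matrix on the device, as in the question's code.
// 480 blocks of 128 threads keep many mapped-memory accesses in flight at once.
cudaCalcXYZ<<<480,128>>>(d_pDst, d_pSrc, d_pmMat, 480, 640, 0.0021f, -10);
cudaDeviceSynchronize();  // afterwards the result can be read directly from pDst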

Related

Segmentation fault when using cudaMemcpy

I'm trying to use cudaMemcpy to copy a std::vector's data to a device array for a kernel, and it gives a segmentation fault. The way I do it is:
cudaMemcpy(d_x, vx.data(), N*sizeof(float), cudaMemcpyHostToDevice);
where vx is a vector. The following is the complete example. Any hints on where the problem is would be appreciated.
#include <iostream>
#include <math.h>
#include <vector>
using namespace std;
// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i < n) {
y[i] = x[i] + y[i];
}
}
int main(void)
{
int N = 1<<10;
float *d_x = NULL, *d_y = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*N);
cudaMalloc((void **)&d_y, sizeof(float)*N);
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
vx.push_back(1.0f);
vy.push_back(2.0f);
}
cudaMemcpy(d_x, vx.data(), N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), N*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the
// maximum occupancy for a full device launch
int gridSize; // The actual grid size needed, based on input size
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, N);
// Round up according to array size
gridSize = (N + blockSize - 1) / blockSize;
cout<<"blockSize: "<<blockSize<<" minGridSize: "<<minGridSize<<" gridSize: "<<gridSize<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run kernel on 1M elements on the GPU
add<<<gridSize, blockSize>>>(N, d_x, d_y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++) {
maxError = fmax(maxError, fabs(d_y[i]-3.0f));
}
std::cout << "Max error: " << maxError << std::endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
return 0;
}
blockSize: 1024 minGridSize: 16 gridSize: 1
Launched blocks of size 1024. Theoretical occupancy: 1.000000
Segmentation fault (core dumped)
The problem is here:
for (int i = 0; i < N; i++) {
maxError = fmax(maxError, fabs(d_y[i]-3.0f));
^^^^^^
}
The reason is that you cannot dereference a device pointer on the host.
The solution is to copy the device memory back to the host, similar to what you did for host to device.
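A minimal sketch of that fix, reusing the names from your code:

// copy the device result back into host memory before checking it
vector<float> vy_out(N);
cudaMemcpy(vy_out.data(), d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
float maxError = 0.0f;
for (int i = 0; i < N; i++) {
    maxError = fmax(maxError, fabs(vy_out[i]-3.0f));
}
std::cout << "Max error: " << maxError << std::endl;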

Why am I getting wrong results with this implemention of a sum reduction in CUDA?

I'm working through a tutorial on the vector_reduction algorithm implemented with the CUDA C++ API, and I'm struggling because I really don't understand what I'm doing wrong: the result is (device: 4386.000000 host: 260795.000000).
The code that I'm using is the following (the problem size is fixed at 512).
EDIT: Unfortunately the problem has not been solved and I still get the same result. I have updated the question with the complete code. The goal is the same: to sum all the elements of a float array of 512 elements.
#define NUM_ELEMENTS 512
__global__ void reduction(float *g_data, int n)
{
__shared__ float s_data[NUM_ELEMENTS];
int tid = threadIdx.x;
int index = tid + blockIdx.x*blockDim.x;
s_data[tid] = 0.0;
if (index < n){
s_data[tid] = g_data[index];
}
__syncthreads();
for (int s = 2; s <= blockDim.x; s = s * 2){
if ((tid%s) == 0){
s_data[tid] += s_data[tid + s / 2];
}
__syncthreads();
}
if (tid == 0){
g_data[blockIdx.x] = s_data[tid];
}
}
// includes, system
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
// includes, kernels
#include "vector_reduction_kernel.cu"
// For simplicity, just to get the idea in this MP, we're fixing the problem size to 512 elements.
#define NUM_ELEMENTS 512
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
float computeOnDevice(float* h_data, int array_mem_size);
extern "C"
void computeGold( float* reference, float* idata, const unsigned int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
cudaSetDevice(0);
runTest( argc, argv);
return EXIT_SUCCESS;
}
////////////////////////////////////////////////////////////////////////////////
//! Run naive scan test
////////////////////////////////////////////////////////////////////////////////
void runTest( int argc, char** argv)
{
int num_elements = NUM_ELEMENTS;
const unsigned int array_mem_size = sizeof( float) * num_elements;
// Allocate host memory to store the input data
float* h_data = (float*) malloc( array_mem_size);
// initialize the input data on the host to be integer values
// between 0 and 1000
for( unsigned int i = 0; i < num_elements; ++i)
h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));
// Function to compute the reference solution on CPU using a C sequential version of the algorithm
// It is written in the file "vector_reduction_gold.cpp". The Makefile compiles this file too.
float reference = 0.0f;
computeGold(&reference , h_data, num_elements);
// Function to compute the solution on GPU using a call to a CUDA kernel (see body below)
// The kernel is written in the file "vector_reduction_kernel.cu". The Makefile also compiles this file.
float result = computeOnDevice(h_data, num_elements);
// We can use an epsilon of 0 since values are integral and in a range that can be exactly represented
float epsilon = 0.0f;
unsigned int result_regtest = (abs(result - reference) <= epsilon);
printf( "Test %s\n", (1 == result_regtest) ? "Ok." : "No.");
printf( "device: %f host: %f\n", result, reference);
// cleanup memory
free( h_data);
}
// Function to call the CUDA kernel on the GPU.
// Take h_data from host, copies it to device, setup grid and thread
// dimensions, executes kernel function, and copies result of scan back
// to h_data.
// Note: float* h_data is both the input and the output of this function.
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
// Copy from host memory to device memory
cudaMemcpy((void**)&d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
//int threads = (num_elements/2) + num_elements%2;
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaDeviceReset();
return result;
}
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
// Copy from host memory to device memory
cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaDeviceReset();
return result;
}
You really should provide complete code for questions like this. You should also use proper CUDA error checking and run your code with cuda-memcheck. You have at least 2 errors in your code:
we don't do a cudaMemcpy like this:
cudaMemcpy((void**)&d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
it should be:
cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
the first parameter is just a pointer, not a pointer-to-pointer. cuda-memcheck or proper CUDA error checking would have focused your attention on this line.
You aren't launching enough threads. Your kernel loads one element per thread. If you have a problem size of 512, you're going to need 512 threads, and this:
int threads = (num_elements/2) + num_elements%2;
isn't getting you that. Not sure what you have in mind there. But this could fix it for the 512 case:
int threads = (num_elements);
Your reduction methodology requires a power-of-two threadblock size.
Here's a fully worked test case, note use of cuda-memcheck:
$ cat t27.cu
#include <stdio.h>
#define NUM_ELEMENTS 512
__global__ void reduction(float *g_data, int n)
{
__shared__ float s_data[NUM_ELEMENTS];
int tid = threadIdx.x;
int index = tid + blockIdx.x*blockDim.x;
s_data[tid] = 0.0;
if (index < n){
s_data[tid] = g_data[index];
}
__syncthreads();
for (int s = 2; s <= blockDim.x; s = s * 2){
if ((tid%s) == 0){
s_data[tid] += s_data[tid + s / 2];
}
__syncthreads();
}
if (tid == 0){
g_data[blockIdx.x] = s_data[tid];
}
}
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
// Copy from host memory to device memory
cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaDeviceReset();
return result;
}
int main(){
float *data = new float[NUM_ELEMENTS];
for (int i = 0; i < NUM_ELEMENTS; i++) data[i] = 1;
float r = computeOnDevice(data, NUM_ELEMENTS);
printf(" result = %f\n" , r);
}
$ nvcc -arch=sm_35 -o t27 t27.cu
$ cuda-memcheck ./t27
========= CUDA-MEMCHECK
result = 512.000000
========= ERROR SUMMARY: 0 errors
Here is a modified version of the code you have now posted (which is broken in several new/different ways), which seems to run correctly for me:
$ cat t30.cu
#define NUM_ELEMENTS 512
__global__ void reduction(float *g_data, int n)
{
__shared__ float s_data[NUM_ELEMENTS];
int tid = threadIdx.x;
int index = tid + blockIdx.x*blockDim.x;
s_data[tid] = 0.0;
if (index < n){
s_data[tid] = g_data[index];
}
__syncthreads();
for (int s = 2; s <= blockDim.x; s = s * 2){
if ((tid%s) == 0){
s_data[tid] += s_data[tid + s / 2];
}
__syncthreads();
}
if (tid == 0){
g_data[blockIdx.x] = s_data[tid];
}
}
// includes, system
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
// includes, kernels
// For simplicity, just to get the idea in this MP, we're fixing the problem size to 512 elements.
#define NUM_ELEMENTS 512
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
float computeOnDevice(float* h_data, int array_mem_size);
extern "C"
void computeGold( float* reference, float* idata, const unsigned int len)
{
for (int i = 0; i<len; i++) *reference += idata[i];
};
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
cudaSetDevice(0);
runTest( argc, argv);
return EXIT_SUCCESS;
}
////////////////////////////////////////////////////////////////////////////////
//! Run naive scan test
////////////////////////////////////////////////////////////////////////////////
void runTest( int argc, char** argv)
{
int num_elements = NUM_ELEMENTS;
const unsigned int array_mem_size = sizeof( float) * num_elements;
// Allocate host memory to store the input data
float* h_data = (float*) malloc( array_mem_size);
// initialize the input data on the host to be integer values
// between 0 and 1000
for( unsigned int i = 0; i < num_elements; ++i)
h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));
// Function to compute the reference solution on CPU using a C sequential version of the algorithm
// It is written in the file "vector_reduction_gold.cpp". The Makefile compiles this file too.
float reference = 0.0f;
computeGold(&reference , h_data, num_elements);
// Function to compute the solution on GPU using a call to a CUDA kernel (see body below)
// The kernel is written in the file "vector_reduction_kernel.cu". The Makefile also compiles this file.
float result = computeOnDevice(h_data, num_elements);
// We can use an epsilon of 0 since values are integral and in a range that can be exactly represented
float epsilon = 0.0f;
unsigned int result_regtest = (abs(result - reference) <= epsilon);
printf( "Test %s\n", (1 == result_regtest) ? "CORRECTO: Coinciden los resultados de la CPU y la GPU" : "INCORRECTO: Los resultados calculados en paralelo en la GPU no coinciden con los obtenidos secuencialmente en la CPU");
printf( "device: %f host: %f\n", result, reference);
// cleanup memory
free( h_data);
}
// Function to call the CUDA kernel on the GPU.
// Take h_data from host, copies it to device, setup grid and thread
// dimensions, executes kernel function, and copies result of scan back
// to h_data.
// Note: float* h_data is both the input and the output of this function.
#if 0
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
// Copy from host memory to device memory
cudaMemcpy((void**)&d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
//int threads = (num_elements/2) + num_elements%2;
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaDeviceReset();
return result;
}
#endif
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaError_t err = cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
if (err != cudaSuccess) {printf("CUDA error: %s\n", cudaGetErrorString(err)); exit(0);}
// Copy from host memory to device memory
cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
err = cudaGetLastError();
if (err != cudaSuccess) {printf("CUDA error: %s\n", cudaGetErrorString(err)); exit(0);}
cudaDeviceReset();
return result;
}
$ nvcc -arch=sm_35 -o t30 t30.cu
$ cuda-memcheck ./t30
========= CUDA-MEMCHECK
Test CORRECTO: Coinciden los resultados de la CPU y la GPU
device: 260795.000000 host: 260795.000000
========= ERROR SUMMARY: 0 errors
$
You still haven't added proper CUDA error checking to your code, so it's entirely possible that you have a machine setup problem. If you're still having trouble, you might want to run the exact code I have posted above, since I've put rudimentary error checking in it.

CUDA c++, simple matrix multiplication error

I am quite new to CUDA programming with C++, so sorry for this simple question. I simply cannot figure out where I am going wrong with this. I am trying to do a matrix multiplication. I have found inspiration from several sources, so it might be that I have mixed up some different methods. I am trying to multiply two matrices, h_a and h_b. I successfully generate the two matrices, but when I allocate the memory for them, I for some reason lose the values in those matrices, and even after the multiplication all values are zero. Below is the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <ctime>
#include <stdio.h>
#include <iostream>
#include <math.h>
using namespace std;
__global__ void MulKernel(int *c, const int *a, const int *b, const int P)
{
float tempsum;
int row = blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
if (row < P && col < P){
for (int i = 0; i < P; i++){
tempsum += a[row*P + i] * b[i*P + col];
}
}
c[row*P + col] = tempsum;
}
int main()
{
srand(time(NULL));
int *pointer;
int N = 16;
int SIZE = N*N;
int *h_a = new int[SIZE];
int *h_b = new int[SIZE];
int *h_c = new int[SIZE];
for (int i = 0; i < SIZE; i++) {
h_a[i] = rand() % 1000;
h_b[i] = rand() % 1000;
}
cout << "First values " << h_a[0] << " " << h_b[0] << endl;
cudaMalloc(&h_a, sizeof(int)*SIZE);
cudaMalloc(&h_b, sizeof(int)*SIZE);
cudaMalloc(&h_c, sizeof(int)*SIZE);
cudaMalloc(&pointer, sizeof(int));
cout << "Second values " << h_a[0] << " " << h_b[0] << endl;
cudaMemcpy(h_a, &h_a, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(h_b, &h_b, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(pointer, &N, sizeof(int), cudaMemcpyHostToDevice);
cout << "Third values " << h_a[0] <<" "<< h_b[0] << endl;
MulKernel <<<1, 256 >>>(h_c, h_a, h_b, N);
cudaMemcpy(h_c, &h_c, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_a, &h_a, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_b, &h_b, sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < 5; i++){
cout << h_c[i] << "=" << h_a[i] << h_b[i] << endl;
}
cout << h_c[1] << endl;
cudaFree(h_a);
cudaFree(h_b);
cudaFree(h_c);
return 0;
}
The output in the terminal reads:
First values 454 964
Second values 0 0
Third values 0 0
0=00
0=00
0=00
0=00
0=00
0
Press any key to continue . . .
I hope someone can spot the error(s)
Best regards
There are quite a few issues with your code.
Any time you're having trouble with a CUDA code, I recommend proper CUDA error checking as well as running your code with cuda-memcheck. In this case, you've made programming errors that actually result in a seg fault, so these methods aren't that useful.
Your kernel is mostly workable. There are 3 issues. First, you are performing int multiplication but have declared your tempsum variable as float. That probably isn't a huge issue but is not consistent with your kernel. Second, you are not initializing tempsum (it should be set to zero). Third, you have your threadcheck (i.e. if-statement) slightly misplaced. You should condition the kernel so as not to write to c if the thread is out-of-bounds.
You're probably confused about host and device variables. We don't allocate a host variable with new and then do a cudaMalloc operation on the same pointer. That's not how things work. We need to create an equivalent set of variables to store data on the device. Let's call those *d_a etc. We'll call cudaMalloc on those to allocate device space, then we'll use those in the cudaMemcpy operations as the device-side variables.
Your kernel is expecting a 2D thread array (so that the .x and .y built-in variables in the kernel have meaning). But you are defining the thread array using 1D variables. That needs to be fixed in your kernel launch (i.e. define a 2D array using dim3 variables). Likewise the kernel launch should specify the d_a and etc. variables that are device pointers.
You may be confused about how to handle a variable like N when passing it to the kernel. We can pass that directly (by value) without any of the pointer gymnastics that you have created.
You have transfer sizes wrong in your cudaMemcpy operations. Like memcpy you need to specify a transfer size in bytes, so we need to multiply most of your transfer sizes by SIZE.
Here's a modified version of your code with the above issues addressed:
$ cat t1073.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <ctime>
#include <stdio.h>
#include <iostream>
#include <math.h>
using namespace std;
__global__ void MulKernel(int *c, const int *a, const int *b, const int P)
{
int tempsum=0;
int row = blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
if (row < P && col < P){
for (int i = 0; i < P; i++){
tempsum += a[row*P + i] * b[i*P + col];
}
c[row*P + col] = tempsum;
}
}
int main()
{
srand(time(NULL));
int N = 16;
int SIZE = N*N;
int *h_a = new int[SIZE];
int *h_b = new int[SIZE];
int *h_c = new int[SIZE];
for (int i = 0; i < SIZE; i++) {
h_a[i] = rand() % 1000;
h_b[i] = rand() % 1000;
}
cout << "First values " << h_a[0] << " " << h_b[0] << endl;
int *d_a, *d_b, *d_c;
cudaMalloc(&d_a, sizeof(int)*SIZE);
cudaMalloc(&d_b, sizeof(int)*SIZE);
cudaMalloc(&d_c, sizeof(int)*SIZE);
cout << "Second values " << h_a[0] << " " << h_b[0] << endl;
cudaMemcpy(d_a, h_a, sizeof(int)*SIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, sizeof(int)*SIZE, cudaMemcpyHostToDevice);
cout << "Third values " << h_a[0] <<" "<< h_b[0] << endl;
MulKernel <<<1, dim3(N,N) >>>(d_c, d_a, d_b, N);
cudaMemcpy(h_c, d_c, sizeof(int)*SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy(h_a, d_a, sizeof(int)*SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy(h_b, d_b, sizeof(int)*SIZE, cudaMemcpyDeviceToHost);
for (int i = 0; i < 5; i++){
cout << h_c[i] << "=" << h_a[i] << h_b[i] << endl;
}
cout << h_c[1] << endl;
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
$ nvcc -o t1073 t1073.cu
$ cuda-memcheck ./t1073
========= CUDA-MEMCHECK
First values 698 173
Second values 698 173
Third values 698 173
5502745=698173
5866060=120710
3945532=646669
4432346=582703
4971909=746272
5866060
========= ERROR SUMMARY: 0 errors
$
Personally, I can't interpret the output easily, and I'm not sure why you've chosen the = sign. For matrix multiplication, c[i] is not equal to a[i]*b[i], if that's what you were thinking. If you want a simple test that is easily understood visually, try setting both a and b matrices to all 1. Then you can easily spot a correct output; it should be all N.
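For example (a sketch of that sanity check, reusing the variables from the modified code above):

// all-ones inputs: every element of the 16x16 product should come out as N (16)
for (int i = 0; i < SIZE; i++) {
    h_a[i] = 1;
    h_b[i] = 1;
}
// ... copy, launch and copy back exactly as above; each printed h_c[i] should then be 16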
Also note that for brevity, I've not tried to teach you every aspect of CUDA programming in this question, just to fix some mistakes. As just one example, this code will break if you set N to a value larger than 32. You may need to learn more about CUDA programming to understand why that is.
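If you do want to go beyond N = 32, the short version is that a single block is limited to 1024 threads (32x32), so the launch has to be split over a grid of blocks. A rough sketch of that change (the block and grid shapes here are just one reasonable choice, not the only one):

dim3 block(16, 16);
dim3 grid((N + block.x - 1)/block.x, (N + block.y - 1)/block.y);
MulKernel<<<grid, block>>>(d_c, d_a, d_b, N);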

CUDA : programming with twice as much blocks (tiling?)

My 3D Laplacian solver works. I obtain 350 Gflop/s, and I'm trying to improve performance by using twice as many blocks.
However, the performance stays at 350 Gflop/s:
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include"res3dcb.cuh"
#include <math.h>
using namespace std;
// Constant statement.
const int blocksize=32;
const int N=128;
const int size=(N+2)*(N+2)*(N+2)*sizeof(float);
// Let's start the main program.
int main(void) {
// Variable statement.
float time1,time2,time3;
float *x_d, *y_d;
float *x,*y;
float gflops;
float NumOps;
int power=4; // You can change power as you prefer (but keep 2^x)
// Init x and y.
x = new float[size];
y = new float[size];
for (int k=1;k<N+1;k++)
for (int i=1;i<N+1;i++)
for (int j=1;j<N+1;j++) {
x[k*(N+2)*(N+2)+i*(N+2)+j]=cos(i+j+k);
}
// Shadow cases.
for (int k=1;k<N+1;k++) {
for (int i=1;i<N+1;i++) {
x[k*(N+2)*(N+2)+i*(N+2)]=x[k*(N+2)*(N+2)+i*(N+2)+1];
x[k*(N+2)*(N+2)+i*(N+2)+N+1]=x[k*(N+2)*(N+2)+i*(N+2)+N];}
for (int j=0;j<N+2;j++) {
x[k*(N+2)*(N+2)+j]=x[k*(N+2)*(N+2)+(N+2)+j];
x[k*(N+2)*(N+2)+(N+1)*(N+2)+j]=x[k*(N+2)*(N+2)+N*(N+2)+j];}
for (int i=0;i<N+2;i++)
for (int j=0;j<N+2;j++) {
x[(N+2)*i+j]=x[(N+2)*(N+2)+(N+2)*i+j];
x[(N+1)*(N+2)*(N+2)+(N+2)*i+j]=x[(N+2)*(N+2)*N+(N+2)*i+j];
}
// Display of initial matrix.
int id_stage=-2;
while (id_stage!=-1) {
cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
cin>>id_stage;
cout<<endl;
if (id_stage != -1) {
cout<<"Etage "<<id_stage<<" du cube :"<<endl;
for (int i=0;i<N+2;i++) {
cout<<"| ";
for (int j=0;j<N+2;j++) {cout<<x[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
cout<<"|"<<endl;
}
cout<<endl;
}
}
// CPU to GPU.
cudaMalloc( (void**) & x_d, size);
cudaMalloc( (void**) & y_d, size);
cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice) ;
cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice) ;
// Solver parameters.
dim3 dimGrid(power*N/blocksize, power*N/blocksize);
dim3 dimBlock(blocksize, blocksize);
// Solver loop.
time1=clock();
res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;
// Power calculation.
NumOps=(1.0e-9)*N*N*N*7;
gflops = ( NumOps / (time3));
// GPU to CPU.
cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
cudaFree(x_d);
cudaFree(y_d);
// Display of final matrix.
id_stage=-2;
while (id_stage!=-1) {
cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
cin>>id_stage;
cout<<endl;
if (id_stage != -1) {
cout<<"Etage "<<id_stage<<" du cube :"<<endl;
for (int i=0;i<N+2;i++) {
cout<<"| ";
for (int j=0;j<N+2;j++) {cout<<y[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
cout<<"|"<<endl;
}
cout<<endl;
}
}
cout<<"Time : "<<time3<<endl;
cout<<"Gflops/s : "<<gflops<<endl;
}
Where:
__global__ void res2d(volatile float* x, float* y, int N, int power)
{
int i = threadIdx.x + blockIdx.x*(blockDim.x);
int j = threadIdx.y + blockIdx.y*(blockDim.y);
int id,jd;
#pragma unroll // Now let's reduce the number of operations per block
for (int incr=1; incr<power+1; incr++) {
if (i>(incr-1)*N && i<incr*N && j>(incr-1)*N && j<incr*N) {
#pragma unroll
for (int k=(incr-1)*(N/power) ; k<incr*N/power ; k++) {
id=i-(incr-1)*N;
jd=j-(incr-1)*N;
y[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1] = x[(N+2)*(N+2)*(k+1)+(N+2)*(id+2)+jd+1]
+ x[(N+2)*(N+2)*(k+1)+(N+2)*id+jd+1]
+ x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+2]
+ x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd]
+ x[(N+2)*(N+2)*(k+2)+(N+2)*(id+1)+jd+1]
+ x[(N+2)*(N+2)*k+(N+2)*(id+1)+jd+1]
- 6*x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1];
}
}
}
}
With parameters:
dimGrid(power * N/blocksize, power * N/blocksize) & dimBlock(blocksize, blocksize)
Questions:
If power = 2, 4 or 8, the number of operations per block is divided by 2, 4 or 8. But it's not faster. Why?
Is it useless to reduce the number of operations per block?
Thanks in advance for your help.
CUDA kernel launches are asynchronous. When you do this:
// Solver loop.
time1=clock();
res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;
the timer is only capturing the API launch latency, not the actual execution time of the code. This is why changing the amount of work done in the kernel is apparently having no effect on performance -- your timing method is incorrect.
Do something like this instead:
// Solver loop.
time1=clock();
res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
cudaDeviceSynchronize();
time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;
This inserts a blocking call which ensures that the kernel has finished executing before the time is measured.
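Alternatively, CUDA events time the kernel on the GPU itself and avoid the host-side clock() altogether; a minimal sketch:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // kernel execution time in milliseconds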
[This answer added as a community wiki entry to get the question off the unanswered queue].

Combining texture memory Unified Memory in CUDA 6

I am writing a CUDA application for Jetson TK1 using CUDA 6. I have got the impression from Mark Harris in his blog post
Jetson TK1: Mobile Embedded Supercomputer Takes CUDA Everywhere
that the memory of the Tegra K1 is physically unified. I have also observed results indicating that cudaMallocManaged is significantly faster for global memory than ordinary cudaMemcpy. This is probably because the Unified Memory doesn't require any copying.
However, what do I do when I want to use texture memory for parts of my application? I have not found any support for textures using cudaMallocManaged, so I assume that I have to use a normal cudaMemcpyToArray and cudaBindTextureToArray?
Using the previously mentioned method often seems to work, but the variables managed by cudaMallocManaged sometimes give me weird segmentation faults. Is this the right way to use texture memory along with Unified Memory? The following code illustrates how I do it. It works fine, but my question is whether this is the right way to go or whether it might create undefined behaviour that could cause e.g. segmentation faults.
#define width 16
#define height 16
texture<float, cudaTextureType2D, cudaReadModeElementType> input_tex;
__global__ void some_tex_kernel(float* output){
int i= threadIdx.x;
float x = i%width+0.5f;
float y = i/width+0.5f;
output[i] = tex2D(input_tex, x, y);
}
int main(){
float* out;
if(cudaMallocManaged(&out, width*height*sizeof(float))!= cudaSuccess)
std::cout << "unified not working\n";
for(int i=0; i< width*height; ++i){
out[i] = float(i);
}
const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
cudaArray* input_t;
cudaMallocArray(&input_t, &desc, width, height);
cudaMemcpyToArrayAsync(input_t, 0, 0, out, width*height*sizeof(float), cudaMemcpyHostToDevice);
input_tex.filterMode = cudaFilterModeLinear;
cudaBindTextureToArray(input_tex, input_t, desc);
some_tex_kernel<<<1, width*height>>>(out);
cudaDeviceSynchronize();
for(int i=0;i<width*height; ++i)
std::cout << out[i] << " ";
cudaFree(out);
cudaFreeArray(input_t);
}
Another thing that I find odd is that if I remove the cudaDeviceSynchronize() from the code I always get segmentation faults. I understand that the result might not be finished if I read it without synchronizing, but shouldn't the variable still be accessible?
Anyone have a clue?
Mattias
The only managed memory possibilities at this time are static allocations using __device__ __managed__ or dynamic allocations using cudaMallocManaged(). There is no direct support for textures, surfaces, constant memory, etc.
Your usage of texturing is fine. The only overlap between texture usage and managed memory is in the following call:
cudaMemcpyToArrayAsync(input_t, 0, 0, out, width*height*sizeof(float), cudaMemcpyHostToDevice);
where managed memory is the source (i.e. host side) of the transfer. This is acceptable as long as the call is issued during a period when no kernels are executing (see below).
"Another thing that I find odd is that if I remove the cudaDeviceSynchronize() in the code I always get segmentation faults."
cudaDeviceSynchronize(); is necessary after a kernel call to make the managed memory visible to the host again. I suggest you read this section of the documentation carefully:
"In general, it is not permitted for the CPU to access any managed allocations or variables while the GPU is active. Concurrent CPU/GPU accesses, ... will cause a segmentation fault..."
As you've indicated, the code you posted works fine. If you have other code that has unpredictable seg faults while using managed memory, I would carefully inspect the code flow (especially if you are using streams i.e. concurrency) to make sure that the host is accessing managed data only after a cudaDeviceSynchronize(); has been issued, and before any subsequent kernel calls.
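To make the required ordering concrete, here is a stripped-down sketch of the pattern using the names from your code (host writes, kernel launch, synchronize, then host reads):

float* out;
cudaMallocManaged(&out, width*height*sizeof(float));
for (int i = 0; i < width*height; ++i) out[i] = float(i);  // host access is fine: no kernel running yet
some_tex_kernel<<<1, width*height>>>(out);                 // GPU is now active
cudaDeviceSynchronize();                                   // required before the host touches 'out' again
std::cout << out[0] << std::endl;                          // safe only after the synchronize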
Robert Crovella has already answered your question. However, in order to show you that cudaMallocManaged can be used in combination with texture memory, I have dusted off my 1D linear interpolation code and converted it to use cudaMallocManaged. You will see that the code performs the 1D linear interpolation in four different ways:
CPU;
GPU;
GPU using tex1Dfetch;
GPU using tex1D filtering.
The code works without problems in all cases, and in particular the latter two, on a Kepler K20c card.
// includes, system
#include <cstdlib>
#include <conio.h>
#include <math.h>
#include <fstream>
#include <iostream>
#include <iomanip>
// includes, cuda
#include <cuda.h>
#include <cuda_runtime.h>
using namespace std;
texture<float, 1, cudaReadModeElementType> data_d_texture_filtering;
texture<float, 1> data_d_texture;
#define BLOCK_SIZE 256
/******************/
/* ERROR CHECKING */
/******************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getch(); exit(code); }
}
}
/************/
/* LINSPACE */
/************/
// --- Generates N equally spaced, increasing points between a and b and stores them in x
void linspace(float* x, float a, float b, int N) {
float delta_x=(b-a)/(float)N;
x[0]=a;
for(int k=1;k<N;k++) x[k]=x[k-1]+delta_x;
}
/*************/
/* RANDSPACE */
/*************/
// --- Generates N randomly spaced, increasing points between a and b and stores them in x
void randspace(float* x, float a, float b, int N) {
float delta_x=(b-a)/(float)N;
x[0]=a;
for(int k=1;k<N;k++) x[k]=x[k-1]+delta_x+(((float)rand()/(float)RAND_MAX-0.5)*(1./(float)N));
}
/******************/
/* DATA GENERATOR */
/******************/
// --- Generates N complex random data points, with real and imaginary parts ranging in (0.f,1.f)
void Data_Generator(float* data, int N) {
for(int k=0;k<N;k++) {
data[k]=(float)rand()/(float)RAND_MAX;
}
}
/*************************************/
/* LINEAR INTERPOLATION KERNEL - CPU */
/*************************************/
float linear_kernel_CPU(float in)
{
float d_y;
return 1.-abs(in);
}
/***************************************/
/* LINEAR INTERPOLATION FUNCTION - CPU */
/***************************************/
void linear_interpolation_function_CPU(float* result_GPU, float* data, float* x_in, float* x_out, int M, int N){
float a;
for(int j=0; j<N; j++){
int k = floor(x_out[j]+M/2);
a = x_out[j]+M/2-floor(x_out[j]+M/2);
result_GPU[j] = a * data[k+1] + (-data[k] * a + data[k]);
}
}
/*************************************/
/* LINEAR INTERPOLATION KERNEL - GPU */
/*************************************/
__device__ float linear_kernel_GPU(float in)
{
float d_y;
return 1.-abs(in);
}
/**************************************************************/
/* LINEAR INTERPOLATION KERNEL FUNCTION - GPU - GLOBAL MEMORY */
/**************************************************************/
__global__ void linear_interpolation_kernel_function_GPU(float* __restrict__ result_d, const float* __restrict__ data_d, const float* __restrict__ x_out_d, const int M, const int N)
{
int j = threadIdx.x + blockDim.x * blockIdx.x;
if(j<N)
{
float reg_x_out = x_out_d[j]+M/2;
int k = __float2int_rz(reg_x_out);
float a = reg_x_out - truncf(reg_x_out);
float dk = data_d[k];
float dkp1 = data_d[k+1];
result_d[j] = a * dkp1 + (-dk * a + dk);
}
}
/***************************************************************/
/* LINEAR INTERPOLATION KERNEL FUNCTION - GPU - TEXTURE MEMORY */
/***************************************************************/
__global__ void linear_interpolation_kernel_function_GPU_texture(float* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
int j = threadIdx.x + blockDim.x * blockIdx.x;
if(j<N)
{
float reg_x_out = x_out_d[j]+M/2;
int k = __float2int_rz(reg_x_out);
float a = reg_x_out - truncf(reg_x_out);
float dk = tex1Dfetch(data_d_texture,k);
float dkp1 = tex1Dfetch(data_d_texture,k+1);
result_d[j] = a * dkp1 + (-dk * a + dk);
}
}
/************************************************************************************/
/* LINEAR INTERPOLATION KERNEL FUNCTION - GPU - TEXTURE MEMORY - FILTERING FEATURES */
/************************************************************************************/
__global__ void linear_interpolation_kernel_function_GPU_texture_filtering(float* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
int j = threadIdx.x + blockDim.x * blockIdx.x;
if(j<N) result_d[j] = tex1D(data_d_texture_filtering,float(x_out_d[j]+M/2+0.5));
}
/***************************************/
/* LINEAR INTERPOLATION FUNCTION - GPU */
/***************************************/
void linear_interpolation_function_GPU(float* result_d, float* data_d, float* x_in_d, float* x_out_d, int M, int N){
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
linear_interpolation_kernel_function_GPU<<<dimGrid,dimBlock>>>(result_d, data_d, x_out_d, M, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
/********************************************************/
/* LINEAR INTERPOLATION FUNCTION - GPU - TEXTURE MEMORY */
/********************************************************/
void linear_interpolation_function_GPU_texture(float* result_d, float* data_d, float* x_in_d, float* x_out_d, int M, int N){
cudaBindTexture(NULL, data_d_texture, data_d, M*sizeof(float));
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
linear_interpolation_kernel_function_GPU_texture<<<dimGrid,dimBlock>>>(result_d, x_out_d, M, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
/*****************************************************************************/
/* LINEAR INTERPOLATION FUNCTION - GPU - TEXTURE MEMORY - FILTERING FEATURES */
/*****************************************************************************/
void linear_interpolation_function_GPU_texture_filtering(float* result_d, float* data, float* x_in_d, float* x_out_d, int M, int N){
cudaArray* data_d = NULL; gpuErrchk(cudaMallocArray(&data_d, &data_d_texture_filtering.channelDesc, M, 1));
gpuErrchk(cudaMemcpyToArray(data_d, 0, 0, data, sizeof(float)*M, cudaMemcpyHostToDevice));
gpuErrchk(cudaBindTextureToArray(data_d_texture_filtering, data_d));
data_d_texture_filtering.normalized = false;
data_d_texture_filtering.filterMode = cudaFilterModeLinear;
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
linear_interpolation_kernel_function_GPU_texture_filtering<<<dimGrid,dimBlock>>>(result_d, x_out_d, M, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
/********/
/* MAIN */
/********/
int main()
{
int M=1024; // --- Number of input points
int N=1024; // --- Number of output points
int Nit = 100; // --- Number of computations for time measurement
// --- Input sampling
float* x_in; gpuErrchk(cudaMallocManaged(&x_in,sizeof(float)*M));
// --- Input data
float *data; gpuErrchk(cudaMallocManaged(&data,(M+1)*sizeof(float))); Data_Generator(data,M); data[M]=0.;
// --- Output sampling
float* x_out; gpuErrchk(cudaMallocManaged((void**)&x_out,sizeof(float)*N)); randspace(x_out,-M/2.,M/2.,N);
// --- Result allocation
float *result_CPU; result_CPU=(float*)malloc(N*sizeof(float));
float *result_d; gpuErrchk(cudaMallocManaged(&result_d,sizeof(float)*N));
float *result_d_texture; gpuErrchk(cudaMallocManaged(&result_d_texture,sizeof(float)*N));
float *result_d_texture_filtering; gpuErrchk(cudaMallocManaged(&result_d_texture_filtering,sizeof(float)*N));
// --- Reference interpolation result as evaluated on the CPU
linear_interpolation_function_CPU(result_CPU, data, x_in, x_out, M, N);
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int k=0; k<Nit; k++) linear_interpolation_function_GPU(result_d, data, x_in, x_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cout << "GPU Global memory [ms]: " << setprecision (10) << time/Nit << endl;
cudaEventRecord(start, 0);
for (int k=0; k<Nit; k++) linear_interpolation_function_GPU_texture_filtering(result_d_texture_filtering, data, x_in, x_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cout << "GPU Texture filtering [ms]: " << setprecision (10) << time/Nit << endl;
cudaEventRecord(start, 0);
for (int k=0; k<Nit; k++) linear_interpolation_function_GPU_texture(result_d_texture, data, x_in, x_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cout << "GPU Texture [ms]: " << setprecision (10) << time/Nit << endl;
float diff_norm=0.f, norm=0.f;
for(int j=0; j<N; j++) {
diff_norm = diff_norm + (result_CPU[j]-result_d[j])*(result_CPU[j]-result_d[j]);
norm = norm + result_CPU[j]*result_CPU[j];
}
printf("Error GPU [percentage] = %f\n",100.*sqrt(diff_norm/norm));
float diff_norm_texture_filtering=0.f;
for(int j=0; j<N; j++) {
diff_norm_texture_filtering = diff_norm_texture_filtering + (result_CPU[j]-result_d_texture_filtering[j])*(result_CPU[j]-result_d_texture_filtering[j]);
}
printf("Error texture filtering [percentage] = %f\n",100.*sqrt(diff_norm_texture_filtering/norm));
float diff_norm_texture=0.f;
for(int j=0; j<N; j++) {
diff_norm_texture = diff_norm_texture + (result_CPU[j]-result_d_texture[j])*(result_CPU[j]-result_d_texture[j]);
}
printf("Error texture [percentage] = %f\n",100.*sqrt(diff_norm_texture/norm));
cudaDeviceReset();
return 0;
}