We have the following serial C code operating on
two vectors a[] and b[]:
double a[20000],b[20000],r=0.9;
for(int i=1;i<=10000;++i)
{
a[i]=r*a[i]+(1-r)*b[i];
errors=max(errors,fabs(a[i]-b[i]));
b[i]=a[i];
}
How can this code be ported to CUDA and cuBLAS?
It's also possible to implement this reduction in Thrust using thrust::transform_reduce. This solution fuses the entire operation, as talonmies suggests:
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
// this functor unpacks a tuple and then computes
// a weighted absolute difference of its members
struct weighted_absolute_difference
{
double r;
weighted_absolute_difference(const double r)
: r(r)
{}
__host__ __device__
double operator()(thrust::tuple<double,double> t)
{
double a = thrust::get<0>(t);
double b = thrust::get<1>(t);
a = r * a + (1.0 - r) * b;
return fabs(a - b);
}
};
int main()
{
using namespace thrust;
const std::size_t n = 20000;
const double r = 0.9;
device_vector<double> a(n), b(n);
// initialize a & b
...
// do the reduction
double result =
transform_reduce(make_zip_iterator(make_tuple(a.begin(), b.begin())),
make_zip_iterator(make_tuple(a.end(), b.end())),
weighted_absolute_difference(r),
0.0,
maximum<double>());
// note that this solution does not set
// a[i] = r * a[i] + (1 - r) * b[i]
return 0;
}
Note that we do not perform the assignment a[i] = r * a[i] + (1 - r) * b[i] in this solution, though it would be simple to do so after the reduction using thrust::transform. It is not safe to modify transform_reduce's arguments in either functor.
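As a rough sketch of that follow-up step (assumptions: the same a, b and r as in main() above; the functor and helper names here are mine, not part of the original answer):
#include <thrust/device_vector.h>
#include <thrust/transform.h>
// applies a[i] = r * a[i] + (1 - r) * b[i] in place, after the reduction
struct weighted_update
{
    double r;
    weighted_update(const double r) : r(r) {}
    __host__ __device__
    double operator()(double a, double b) const
    {
        return r * a + (1.0 - r) * b;
    }
};
void apply_update(thrust::device_vector<double>& a,
                  const thrust::device_vector<double>& b,
                  const double r)
{
    // the fourth argument a.begin() makes the transform write back into a
    thrust::transform(a.begin(), a.end(), b.begin(), a.begin(), weighted_update(r));
}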
The second line in your loop:
errors=max(errors,fabs(a[i]-b[i]));
is known as a reduction. Fortunately there is reduction example code in the CUDA SDK - take a look at this and use it as a template for your algorithm.
You probably want to split this into two separate operations (possibly as two separate kernels) - one for the parallel part (calculating the updated a[] and b[] values) and a second for the reduction (calculating errors).
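As a very rough illustration (a sketch only, not the SDK code; the kernel names and the block_max buffer are mine), the two kernels might look like this: an element-wise update kernel, followed by a shared-memory max reduction that produces one partial maximum per block. The per-block results still need a final max (on the host or in a second kernel pass), and the b[i] = a[i] copy can then be a cudaMemcpy or a third trivial kernel.
// Sketch: element-wise update, one thread per element.
__global__ void update(double* a, const double* b, double r, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        a[i] = r * a[i] + (1.0 - r) * b[i];
}
// Sketch: per-block maximum of fabs(a[i] - b[i]) via a shared-memory tree.
// Assumes blockDim.x is a power of two; launch with
// max_abs_diff<<<blocks, threads, threads * sizeof(double)>>>(...).
__global__ void max_abs_diff(const double* a, const double* b,
                             double* block_max, int n)
{
    extern __shared__ double sdata[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? fabs(a[i] - b[i]) : 0.0;  // 0.0 is safe: fabs(...) >= 0
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            sdata[tid] = fmax(sdata[tid], sdata[tid + s]);
        __syncthreads();
    }
    if (tid == 0)
        block_max[blockIdx.x] = sdata[0];
}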
I've been trying to write a function to approximate the value of an integral using the Composite Simpson's Rule.
template <typename func_type>
double simp_rule(double a, double b, int n, func_type f){
int i = 1; double area = 0;
double n2 = n;
double h = (b-a)/(n2-1), x=a;
while(i <= n){
area = area + f(x)*pow(2,i%2 + 1)*h/3;
x+=h;
i++;
}
area -= (f(a) * h/3);
area -= (f(b) * h/3);
return area;
}
What I do is multiply each value of the function by either 2 or 4 (and h/3) with pow(2,i%2 + 1) and subtract off the edges as these should only have a weight of 1.
At first I thought it worked just fine; however, when I compared it to my trapezoidal-method function, it was far less accurate, which shouldn't be the case.
This is a simpler version of code I previously wrote which had the same problem; I thought that if I cleaned it up a little the problem would go away, but alas. From another post I get the idea that something about the types and the operations I'm doing on them results in a loss of precision, but I just don't see it.
Edit:
For completeness, I was running it for e^x from 0 to 1:
//function to be approximated
double f(double x){ double a = exp(x); return a; }
int main() {
    int n = 11; //this method works best for odd values of n
    double e = exp(1);
    double exact = e-1; //value of integral of e^x from 0 to 1
    cout << simp_rule(0,1,n,f) - exact;
    return 0;
}
Simpson's rule uses this approximation to estimate a definite integral:
\int_a^b f(x)\,dx \approx \frac{h}{3}\left[f(x_0) + 4f(x_1) + 2f(x_2) + 4f(x_3) + \cdots + 2f(x_{n-2}) + 4f(x_{n-1}) + f(x_n)\right]
where
h = \frac{b - a}{n} \quad\text{and}\quad x_i = a + i\,h,
so that there are n + 1 equally spaced sample points x_i.
In the posted code, the parameter n passed to the function appears to be the number of points where the function is sampled (while in the previous formula n is the number of intervals, that's not a problem).
The (constant) distance between the points is calculated correctly
double h = (b - a) / (n - 1);
The while loop used to sum the weighted contributions of all the points iterates from x = a up to a point with an abscissa close to b, but probably not exactly b, due to rounding errors. This implies that the last calculated value of f, f(x_n), may be slightly different from the expected f(b).
This is nothing, though, compared to the error caused by the fact that those end points are summed inside the loop with the starting weight of 4 and then subtracted after the loop with weight 1, while all the inner points have their weights switched. As a matter of fact, this is what the code calculates (with n here being the number of points, as in the code):
\frac{h}{3}\left[3f(x_0) + 2f(x_1) + 4f(x_2) + 2f(x_3) + \cdots + 4f(x_{n-3}) + 2f(x_{n-2}) + 3f(x_{n-1})\right]
Also, using
pow(2, i%2 + 1)
to generate the sequence 4, 2, 4, 2, ..., 4 is wasteful in terms of efficiency and may add (depending on the implementation) other unnecessary rounding errors.
The following algorithm shows how to obtain the same (fixed) result, without a call to that library function.
template <typename func_type>
double simpson_rule(double a, double b,
int n, // Number of intervals
func_type f)
{
double h = (b - a) / n;
// Internal sample points, there should be n - 1 of them
double sum_odds = 0.0;
for (int i = 1; i < n; i += 2)
{
sum_odds += f(a + i * h);
}
double sum_evens = 0.0;
for (int i = 2; i < n; i += 2)
{
sum_evens += f(a + i * h);
}
return (f(a) + f(b) + 2 * sum_evens + 4 * sum_odds) * h / 3;
}
Note that this function requires the number of intervals to be passed, not the number of points (e.g. use 10 instead of 11 to obtain the same result as the OP's function).
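For instance, a quick check against the exact value (a sketch, assuming the simpson_rule template above is in scope):
#include <cmath>
#include <iostream>
int main()
{
    const double exact = std::exp(1.0) - 1.0;      // integral of e^x over [0, 1]
    double approx = simpson_rule(0.0, 1.0, 10,     // 10 intervals, i.e. 11 sample points
                                 [](double x) { return std::exp(x); });
    std::cout << approx - exact << '\n';           // error of the approximation
    return 0;
}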
The above excellent and accepted solution could benefit from liberal use of std::fma() and from being templatized on the floating-point type.
https://en.cppreference.com/w/cpp/numeric/math/fma
#include <cmath>
template <typename fptype, typename func_type>
fptype simpson_rule(fptype a, fptype b,
int n, // Number of intervals
func_type f)
{
fptype h = (b - a) / n;
// Internal sample points, there should be n - 1 of them
fptype sum_odds = 0.0;
for (int i = 1; i < n; i += 2)
{
sum_odds += f(std::fma(i,h,a));
}
fptype sum_evens = 0.0;
for (int i = 2; i < n; i += 2)
{
sum_evens += f(std::fma(i, h, a));
}
return (std::fma(2,sum_evens,f(a)) +
std::fma(4,sum_odds,f(b))) * h / 3;
}
Suppose the following problem in E3 with an arbitrary norm. For example, the L1 norm is used here (Hamming, Karlsruhe, geodesic, ... norms are also applicable).
namespace {
typedef boost::tuple<double, double, double> Point;
double nL1(const Point& p1, const Point& p2) { //L1 norm, example
double dx = p1.get<0>() - p2.get<0>();
double dy = p1.get<1>() - p2.get<1>();
double dz = p1.get<2>() - p2.get<2>();
return std::fabs(dx) + std::fabs(dy) + std::fabs(dz);
}
}
There are two sets A, B. A is initialized by n random points, where n is relatively large (n = 1*10^7 - 1*10^9), B is empty; see the sample code:
int main() {
using namespace std;
using namespace boost::lambda;
vector<Point> A, B;
int n = 10000000;
for (int i = 0; i < n; i++) //Create random points
A.push_back(boost::make_tuple(rand(), rand(), rand()));
Initially, we put a random point from A into B. In the simplified example, the first point A[0] is used:
B.push_back(A[0]);
Subsequently, for i = 1:n we repeat these steps:
Find the nearest point in B to A[i] according to a given norm
B* = argmin_{b ∈ B} |A[i] - b|
If |B* - A[i]| < eps, then B.push_back(A[i]). In other words, add A[i] to B if it is sufficiently close (here, eps = 1.0*10^4 is used).
For the nearest-neighbour search, I am using std::partial_sort:
const int k = 1;
for (int i = 1; i < A.size(); i++) {
partial_sort(B.begin(), B.begin() + k, B.end(), bind(less<double>(), bind(nL1, _1, A[i]), bind(nL1, _2, A[i])));
if (nL1(*B.begin(), A[i]) < 1e4) B.push_back(A[i]); //Some threshold, eps=1.0*10^4
}
}
B grows continuously and the search becomes more and more expensive... Repeated in the loop, it is too slow even for small sets (n = 1*10^6); here, the partial sort is inefficient.
Are there significant improvements in speed to be had? Of course, a naive approach can be used, but it is not faster.
How can the nn-search be sped up?
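For reference, the naive linear-scan variant mentioned above can be written with std::min_element, so that B is never reordered; it is still O(|B|) per query, so it does not change the asymptotics (a sketch; the helper buildB and the eps parameter are my names, Point and nL1 are as defined above):
#include <algorithm>
#include <vector>
// Build B from A with the same eps test as above, using std::min_element
// instead of std::partial_sort, so B keeps its insertion order.
void buildB(const std::vector<Point>& A, std::vector<Point>& B, const double eps)
{
    B.push_back(A[0]);
    for (std::size_t i = 1; i < A.size(); i++) {
        auto nearest = std::min_element(B.begin(), B.end(),
            [&](const Point& p, const Point& q) {
                return nL1(p, A[i]) < nL1(q, A[i]);
            });
        if (nL1(*nearest, A[i]) < eps) B.push_back(A[i]);
    }
}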
Another problem appears when the second-nearest point is also required...
Current k-nn search libraries cannot be used because of the arbitrary norm (the problem may also be posed on the sphere). I tried nanoflann, but it does not support some specific norms...
I want to optimize my application using vectorization. More specifically, I want to vectorize the mathematical operations on the std::complex<double> type. However, this seems to be quite difficult. Consider the following example:
#define TEST_LEN 100
#include <algorithm>
#include <complex>
typedef std::complex<double> cmplx;
using namespace std::complex_literals;
#pragma omp declare simd
cmplx add(cmplx a, cmplx b)
{
return a + b;
}
#pragma omp declare simd
cmplx mult(cmplx a, cmplx b)
{
return a * b;
}
void k(cmplx *x, cmplx *&y, int i0, int N)
{
#pragma omp for simd
for (int i = i0; i < N; i++)
y[i] = add(mult(-(1i + 1.0), x[i]), 1i);
}
int main(int argc, char **argv)
{
cmplx *x = new cmplx[TEST_LEN];
cmplx *y = new cmplx[TEST_LEN];
for (int i = 0; i < TEST_LEN; i++)
x[i] = 0;
for (int i = 0; i < TEST_LEN; i++)
{
int N = std::min(4, TEST_LEN - i);
k(x, y, i, N);
}
delete[] x;
delete[] y;
return 1;
}
I am using the g++ compiler. For this code the compiler gives the following warning:
warning: unsupported return type 'cmplx' {aka 'std::complex<double>'} for simd
for the lines containing the mult and add functions.
It seems like it is not possible to vectorize the std::complex<double> type like this.
Is there a different way this can be achieved?
Not easily. SIMD works quite well when you have values in the next N steps that behave the same way. So consider for example an array of 2D vectors:
X Y X Y X Y X Y
If we were to do a vector addition operation here,
X Y X Y X Y X Y
+ + + + + + + +
X Y X Y X Y X Y
The compiler will nicely vectorise that operation. If however we were to want to do something different for the X and Y values, the memory layout becomes problematic for SIMD:
X Y X Y X Y X Y
+ / + / + / + /
X Y X Y X Y X Y
If you consider for example the multiplication case:
(a + bi)(c + di) = (ac - bd) + (ad + bc)i
Suddenly the operations are jumping between SIMD lanes, which is pretty much going to kill any decent vectorization.
Take a quick look at this godbolt: https://godbolt.org/z/rnVVgl
Addition boils down to some vaddps instructions (working on 8 floats at a time).
Multiply ends up using vfmadd231ss and vmulss (which both work on 1 float at a time).
The only easy way to automatically vectorise your complex code would be to separate out the real and imaginary parts into 2 arrays:
struct ComplexArray {
float* real;
float* imaginary;
};
Within this godbolt you can see that the compiler is now using vfmadd213ps instructions (so again back to working on 8 floats at a time).
https://godbolt.org/z/Ostaax
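For illustration, a minimal sketch (not the exact code behind the godbolt links) of an element-wise complex multiply over such a split layout; each loop iteration is independent, so the compiler can keep the real and imaginary work in separate SIMD lanes:
// Hypothetical helper: (a + bi)(c + di) = (ac - bd) + (ad + bc)i,
// computed element-wise on split real/imaginary arrays.
void multiply(const ComplexArray& x, const ComplexArray& y,
              ComplexArray& out, int n)
{
    for (int i = 0; i < n; ++i) {
        float a = x.real[i], b = x.imaginary[i];
        float c = y.real[i], d = y.imaginary[i];
        out.real[i]      = a * c - b * d;
        out.imaginary[i] = a * d + b * c;
    }
}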
I am developing my first Cuda application, and I have a kernel with "below-expected throughput", which seems to be the biggest bottleneck at the moment.
The task of the kernel is to compute an N by N sized matrix (DD) containing squared distances between all elements on a data matrix. The data matrix (Y) is size N by D (to support multi dimensional data) and stored as row-major.
Source:
__global__ void computeSquaredEuclideanDistance(const float * __restrict__ Y, float * __restrict__ DD, const int N, const int D) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < N * N; i += stride) {
const int m = i / N;
const int n = i % N;
float tmp = 0;
for (int d = 0; d < D; ++d) {
const float Ynd = Y[d + D * n];
const float Ymd = Y[d + D * m];
const float Ydiff = Ynd - Ymd;
tmp += Ydiff * Ydiff;
}
DD[n + N * m] = tmp;
}
}
This is being called with size_t blockSize = 256 and size_t numBlocks = (N*N + blockSize - 1)/blockSize.
How can I optimize this kernel? My initial thought is that the time-consuming part is reading data without exploiting some sort of shared memory, but can anyone give me pointers on how to approach this?
Remarks from the nvvp profiling tool:
Latency analysis:
Compute utilization at around 40%
Memory (L2 cache) utilization at around 35%
Occupancy is not an issue
Active Warps at 57.59 of a theoretical 64
Occupancy at 90% of a theoretical 100
For my application, typical values are:
5k < N < 30k
D is either 2 or 3
I typically disregard these types of optimization questions because they are, in my opinion, on the verge of being off-topic. Worse still, you provide no MCVE, so anyone trying to answer would have to write all their own support code to compile and benchmark your kernel. And this sort of work does require benchmarking and code analysis. But because your problem is basically a linear algebra problem (and I like linear algebra), I answered it rather than close voting it as too broad...
With that off my chest, there are a couple of things which immediately jump out in the code which could be improved and which would probably have a material effect on the run time.
The first is that the trip count of the inner loop is known a priori. Anytime you have a situation like that, let the compiler know. Loop unrolling and code reordering is a very powerful compiler optimization and the NVIDIA compiler is extremely good at it. If you move D into a template parameter, you can do something like this:
template<int D>
__device__ float esum(const float *x, const float *y)
{
float val = 0.f;
#pragma unroll
for(int i=0; i<D; i++) {
float diff = x[i] - y[i];
val += diff * diff;
}
return val;
}
template<int D>
__global__
void vdistance0(const float * __restrict__ Y, float * __restrict__ DD, const int N)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < N * N; i += stride) {
const int m = i / N;
const int n = i % N;
DD[n + N * m] = esum<D>(Y + D * n, Y + D * m);
}
}
template __global__ void vdistance0<2>(const float *, float *, const int);
template __global__ void vdistance0<3>(const float *, float *, const int);
The compiler will inline esum and unroll the inner loop and it can then use its reordering heuristics to better interleave loads and flops to improve throughput. The resulting code has a lower register footprint too. When I run this for N=10000 and D=2, I get about 35% speed up (7.1ms versus 4.5ms on a GTX 970 with CUDA 9.1).
But there is an even more glaringly obvious optimization than this. The calculation you are performing will produce a symmetric output matrix. You only need to do (N*N)/2 operations to compute the full matrix, rather than the N*N you are doing in your code [technically N(N-1)/2, because the diagonal entries are zero, but let's forget the diagonal for the purposes of this discussion].
So taking a different approach and using one block to calculate each row of the upper triangular output matrix, then you can do something like this:
struct udiag
{
float *p;
int m;
__device__ __host__ udiag(float *_p, int _m) : p(_p), m(_m) {};
__device__ __host__ float* get_row(int i) { return p + (i * (i + 1)) / 2; };
};
template<int D>
__global__
void vdistance2(const float * __restrict__ Y, float * __restrict__ DD, const int N)
{
int rowid = blockIdx.x;
int colid = threadIdx.x;
udiag m(DD, N);
for(; rowid < N; rowid += gridDim.x) {
float* p = m.get_row(rowid);
const float* y = Y + D * rowid;
for(int i=colid; i < (N-rowid); i += blockDim.x) {
p[i] = esum<D>(y, y + D * i);
}
}
}
template __global__ void vdistance2<2>(const float *, float *, const int);
template __global__ void vdistance2<3>(const float *, float *, const int);
This uses a little helper class to encapsulate the triangle numbers needed for the addressing scheme for the upper triangular output matrix. Doing this saves an enormous amount of memory and memory bandwidth as well as reducing the total FLOP count for the calculation. If you need to do other things afterwards BLAS (and CUBLAS) supports computations on upper or lower triangular matrices. Use them. When I run this I get about 75% speedup (7.1ms versus 1.6ms on the same GTX 970).
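For completeness, a possible way to launch the triangular kernel (a sketch, not from the original answer; it assumes device pointers d_Y and d_DD, with d_DD sized to hold the packed triangle of N * (N + 1) / 2 floats addressed by udiag):
#include <algorithm>
void launch_vdistance2(const float* d_Y, float* d_DD, const int N)
{
    const int threads = 128;                 // threads per block, covering one row
    const int blocks  = std::min(N, 65535);  // the kernel grid-strides over any remaining rows
    vdistance2<2><<<blocks, threads>>>(d_Y, d_DD, N);
}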
Huge disclaimer: All the code you see here was written during a 45 minute lunch break and has been very lightly tested. I make absolutely no claims that anything in this answer is actually correct. I have confirmed that it compiles and doesn't produce a runtime error when I run it to get profiling data. That is it. Caveat emptor and all that.
I have two vectors (oldvector and newvector). I need to calculate the value of the residual which is defined by the following pseudocode:
residual = 0;
forall i : residual += (oldvector[i] - newvector[i])^2
Currently, I am calculating this with two CUDA Thrust operations which are essentially doing:
forall i : oldvector[i] = oldvector[i] - newvector[i]
followed by a thrust::transform_reduce with a square as unary operator, which is doing:
residual = 0;
forall i : residual += oldvector[i]^2;
The problem with this is obviously the intermediate store to global memory before transform_reduce. Is there a more efficient approach to this problem which would fuse these two steps? Apart from writing my own CUDA kernel, is there any other option?
One approach I thought of was to write a thrust::reduce with zip iterators. The problem with this is that the return type of the operator has to be the same type as its input. As I understand it, this means the reduction operator would have to return a tuple, which would mean an extra addition.
If I do write a reduction CUDA kernel, have there been any improvements over the CUDA 1.1 example reduction kernel?
thrust::inner_product will do it in a single function call. Your original idea can also be made to work (zipping together the two vectors and using thrust::transform_reduce). This code demonstrates both methods:
#include <iostream>
#include <thrust/tuple.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#define N 2
struct zdiffsq{
template <typename Tuple>
__host__ __device__ float operator()(Tuple a)
{
float result = thrust::get<0>(a) - thrust::get<1>(a);
return result*result;
}
};
struct diffsq{
__host__ __device__ float operator()(float a, float b)
{
return (b-a)*(b-a);
}
};
int main(){
thrust::device_vector<float> oldvector(N);
thrust::device_vector<float> newvector(N);
oldvector[0] = 1.0f; oldvector[1] = 2.0f;
newvector[0] = 2.0f; newvector[1] = 5.0f;
float result = thrust::inner_product(oldvector.begin(), oldvector.end(), newvector.begin(), 0.0f, thrust::plus<float>(), diffsq());
std::cout << "Result: " << result << std::endl;
float result2 = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(oldvector.begin(), newvector.begin())), thrust::make_zip_iterator(thrust::make_tuple(oldvector.end(), newvector.end())), zdiffsq(), 0.0f, thrust::plus<float>());
std::cout << "Result2: " << result2 << std::endl;
}
You can also investigate eliminating the functor definition used with the inner product example, by using thrust placeholders.
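For example, a sketch of that placeholder variant (it assumes the same oldvector/newvector as in main() above; the helper name is mine):
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
// Same computation as with diffsq(), but the functor is replaced by a
// thrust placeholder expression built from _1 and _2.
float residual_via_placeholders(const thrust::device_vector<float>& oldv,
                                const thrust::device_vector<float>& newv)
{
    using namespace thrust::placeholders;
    return thrust::inner_product(oldv.begin(), oldv.end(), newv.begin(), 0.0f,
                                 thrust::plus<float>(),
                                 (_1 - _2) * (_1 - _2));
}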
Even if you want to write your own CUDA code, the standard recommendation now for oft-used algorithms like parallel reductions and sorting is to use cub.
And yes, the CUDA parallel reduction sample and accompanying presentation is still a good basic intro to a fast parallel reduction.
Robert Crovella has already answered this question and has also suggested using CUB.
At variance with Thrust, CUB leaves performance-critical parameters, such as the choice of the specific reduction algorithm to be used and the degree of concurrency, unbound and selectable by the user. These parameters can be tuned to maximize performance for a particular architecture and application. They can be specified at compile time, thus avoiding runtime performance penalties.
Below is a fully worked example of how to use CUB for the residual calculation.
#include <cub/cub.cuh>
#include <cuda.h>
#include "Utilities.cuh"
#include <iostream>
#define BLOCKSIZE 256
#define ITEMS_PER_THREAD 8
const int N = 4096;
/******************************/
/* TRANSFORM REDUCTION KERNEL */
/******************************/
__global__ void TransformSumKernel(const float * __restrict__ indata1, const float * __restrict__ indata2, float * __restrict__ outdata) {
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
// --- Specialize BlockReduce for type float.
typedef cub::BlockReduce<float, BLOCKSIZE> BlockReduceT;
__shared__ typename BlockReduceT::TempStorage temp_storage;
float result;
if(tid < N) result = BlockReduceT(temp_storage).Sum((indata1[tid] - indata2[tid]) * (indata1[tid] - indata2[tid]));
if(threadIdx.x == 0) atomicAdd(outdata, result);
return;
}
/********/
/* MAIN */
/********/
int main() {
// --- Allocate host side space for the data and the result
float *h_data1 = (float *)malloc(N * sizeof(float));
float *h_data2 = (float *)malloc(N * sizeof(float));
float *h_result = (float *)malloc(sizeof(float));
float *d_data1; gpuErrchk(cudaMalloc(&d_data1, N * sizeof(float)));
float *d_data2; gpuErrchk(cudaMalloc(&d_data2, N * sizeof(float)));
float *d_result; gpuErrchk(cudaMalloc(&d_result, sizeof(float)));
gpuErrchk(cudaMemset(d_result, 0, sizeof(float)));
for (int i = 0; i < N; i++) {
h_data1[i] = 1.f;
h_data2[i] = 3.f;
}
gpuErrchk(cudaMemcpy(d_data1, h_data1, N * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_data2, h_data2, N * sizeof(float), cudaMemcpyHostToDevice));
TransformSumKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_data1, d_data2, d_result);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost));
std::cout << "output: ";
std::cout << h_result[0];
std::cout << std::endl;
gpuErrchk(cudaFree(d_data1));
gpuErrchk(cudaFree(d_data2));
gpuErrchk(cudaFree(d_result));
return 0;
}