Call functor for all combinations in Cuda/Thrust - c++

I have two index sets, one in the range [0, N], one in the range [0, M], where N != M. The indices are used to refer to values in different thrust::device_vectors.
Essentially, I want to create one GPU thread for every combination of these indices, so N*M threads. Each thread should compute a value based on the index-combination and store the result in another thrust::device_vector, at a unique index also based on the input combination.
This seems to be a fairly standard problem, but I was unable to find a way to do this in thrust. The documentation only ever mentions problems, where element i of a vector needs to compute something with element i of another vector. There is the thrust::permutation_iterator, but as far as I understand it only gives me the option to reorder data, and I have to specify the order as well.
Some code:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
int main()
{
// Initialize some data
const int N = 2;
const int M = 3;
thrust::host_vector<int> vec1_host(N);
thrust::host_vector<int> vec2_host(M);
vec1_host[0] = 1;
vec1_host[1] = 5;
vec2_host[0] = -3;
vec2_host[1] = 42;
vec2_host[2] = 9;
// Copy to device
thrust::device_vector<int> vec1_dev = vec1_host;
thrust::device_vector<int> vec2_dev = vec2_host;
// Allocate device memory to copy results to
thrust::device_vector<int> result_dev(vec1_host.size() * vec2_host.size());
// Create functor I want to call on every combination
struct myFunctor
{
thrust::device_vector<int> const& m_vec1;
thrust::device_vector<int> const& m_vec2;
thrust::device_vector<int>& m_result;
myFunctor(thrust::device_vector<int> const& vec1, thrust::device_vector<int> const& vec2, thrust::device_vector<int>& result)
: m_vec1(vec1), m_vec2(vec2), m_result(result)
{
}
__host__ __device__
void operator()(size_t i, size_t j) const
{
m_result[i + j * m_vec1.size()] = m_vec1[i] + m_vec1[j];
}
} func(vec1_dev, vec2_dev, result_dev);
// How do I create N*M threads, each of which calls func(i, j) ?
// Copy results back
thrust::host_vector<int> result_host = result_dev;
for(int i : result_host)
std::cout << i << ", ";
std::cout << std::endl;
// Expected output:
// -2, 2, 43, 47, 10, 14
return 0;
}
I'm fairly sure this is very easy to achieve, I guess I'm just missing the right search terms. Anyways, all help appreciated :)

Presumably in your functor operator instead of this:
m_result[i + j * m_vec1.size()] = m_vec1[i] + m_vec1[j];
^ ^
you meant this:
m_result[i + j * m_vec1.size()] = m_vec1[i] + m_vec2[j];
^ ^
I think there are probably many ways to tackle this, but so as to not argue about things that are not germane to the question, I'll try and stay as close to your given code as I can.
Operations like [] on a vector are not possible in device code. Therefore we must convert your functor to work on raw data pointers, rather than thrust vector operations directly.
With those caveats, and a slight modification in how we handle your i and j indices, I think what you're asking is not difficult.
The basic strategy is to create a result vector that is of length N*M just as you suggest, then create the indices i and j within the functor operator. In so doing, we need only pass one index to the functor, using e.g. thrust::transform or thrust::for_each to create our output:
$ cat t79.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/execution_policy.h>
#include <iostream>
struct myFunctor
{
const int *m_vec1;
const int *m_vec2;
int *m_result;
size_t v1size;
myFunctor(thrust::device_vector<int> const& vec1, thrust::device_vector<int> const& vec2, thrust::device_vector<int>& result)
{
m_vec1 = thrust::raw_pointer_cast(vec1.data());
m_vec2 = thrust::raw_pointer_cast(vec2.data());
m_result = thrust::raw_pointer_cast(result.data());
v1size = vec1.size();
}
__host__ __device__
void operator()(const size_t x) const
{
size_t i = x%v1size;
size_t j = x/v1size;
m_result[i + j * v1size] = m_vec1[i] + m_vec2[j];
}
};
int main()
{
// Initialize some data
const int N = 2;
const int M = 3;
thrust::host_vector<int> vec1_host(N);
thrust::host_vector<int> vec2_host(M);
vec1_host[0] = 1;
vec1_host[1] = 5;
vec2_host[0] = -3;
vec2_host[1] = 42;
vec2_host[2] = 9;
// Copy to device
thrust::device_vector<int> vec1_dev = vec1_host;
thrust::device_vector<int> vec2_dev = vec2_host;
// Allocate device memory to copy results to
thrust::device_vector<int> result_dev(vec1_host.size() * vec2_host.size());
// How do I create N*M threads, each of which calls func(i, j) ?
thrust::for_each_n(thrust::device, thrust::counting_iterator<size_t>(0), (N*M), myFunctor(vec1_dev, vec2_dev, result_dev));
// Copy results back
thrust::host_vector<int> result_host = result_dev;
for(int i : result_host)
std::cout << i << ", ";
std::cout << std::endl;
// Expected output:
// -2, 2, 43, 47, 10, 14
return 0;
}
$ nvcc -std=c++11 -arch=sm_61 -o t79 t79.cu
$ ./t79
-2, 2, 43, 47, 10, 14,
$
In retrospect, I think this is more or less exactly what #eg0x20 was suggesting.

Related

Sub-vector of 2D vector

I have std::vector<std::vector<double>> A(300, std::vector<double>(500)).
I want to create a new vector with sub-range of A: sub-vector[5:10][25:100].
How can I do this?
You can use iterators. First create the vector:
std::vector<std::vector<double>> sub_vector;
sub_vector.reserve(5);
Then populate it with the range constructor of vector:
for (std::size_t i = 5; i < 10; ++i) {
sub_vector.emplace_back(A[i].begin() + 25, A[i].begin() + 100);
}
Notes:
You are responsible for ensuring that the indexes are in range. Otherwise this results in undefined behavior.
This deals with close-open ranges. If you want close-close ranges, you need to add one to the end indexes.
There are no simple notations like that, you have to roll your own:
std::vector<std::vector<double>> A(300, std::vector<double>(500));
std::vector<std::vector<double>> subranges;
subranges.reserve(11 - 5);
std::transform(A.begin() + 5, A.begin() + 11,
std::back_inserter(subranges),
[](const auto& inner){ // [](const std::vector<double>& inner) {
return std::vector<double>(inner.begin() + 25,
inner.begin() + 101);
});
Yet another solution
Using a function to do the job.
#include <iostream>
#include <vector>
#include <iterator>
#include <algorithm>
constexpr size_t MaxRows = 300;
constexpr size_t MaxColumns = 500;
using MyType = double;
using Columns = std::vector<MyType>;
using Matrix = std::vector<Columns>;
void copySubMatrix( const Matrix& source,Matrix& destination,const size_t& startRow,const size_t& endRow,const size_t& startColumn, const size_t& endColumn)
{
// Clear destination matrix
destination.clear();
// Copy rows end columns
std::for_each(source.begin() + startRow, source.begin() + endRow + 1, [&](const Columns & c) {
Columns row{ c.begin() + startColumn, c.begin() + endColumn + 1};
destination.push_back(row); });
}
int main() {
// Define source matrix with given size and empty destination matrix
Matrix A(MaxRows, Columns(MaxColumns));
Matrix result{};
// Fill source matrix with running values
std::for_each(A.begin(), A.end(), [i = 0](Columns & c) mutable {for (MyType& m : c) m = i++; });
// Copy the given range to the destination matrix
copySubMatrix(A, result, 5, 10, 25, 100);
// Display destination matrix
std::for_each(result.begin(), result.end(), [](const Columns & c) {
std::copy(c.begin(), c.end(), std::ostream_iterator<MyType>(std::cout, " ")); std::cout << "\n"; });
return 0;
}
You can do this by iterating over one dimension:
std::vector<vector<double>> main_vector(300, std::vector<double>(500));
std::vector<vector<double>> sub_vector;
std::vector<double>::const_iterator first, last;
unsigned int x_pos_start=5, x_pos_end=10;
unsigned int y_pos_start=25, y_pos_end=100;
sub_vector.resize(x_pos_end - x_pos_start + 1);
for(size_t i=x_pos_start; i<=x_pos_end; ++i)
{
first = main_vector[i].begin() + y_pos_start;
last = main_vector[i].begin() + y_pos_end;
sub_vector[i].insert(sub_vector[i].begin(), first, last);
}

Parallel order-preserving selection from an array using tbb

I have a range-image and want to convert it into a libpointmatcher point cloud. The cloud is an Eigen::Matrix with 4 rows (x,y,z,1) and several columns for every point.
The range-image is an unsigned short*array including the range values (z) and an unsigned char*array including information about the pixel visibility.
In serial, my code looks like this:
//container to hold the data
std::vector<Eigen::Vector4d> vec;
vec.reserve(this->Height*this->Width);
//contains information about pixel visibility
unsigned char* mask_data = (unsigned char*)range_image.mask.ToPointer();
//contains the actual pixel data
unsigned short* pixel_data = (unsigned short*)range_image.pixel.ToPointer();
for (int y =0;y < range_image.Height; y++)
{
for (int x = 0; x < range_image.Width; x++)
{
int index =x+y*range_image.Width;
if(*(mask_data+index) != 0)
{
vec.push_back(Eigen::Vector4d(x,y,(double)*(data+index),1));
}
}
}
// libpointmatcher point cloud with size of visible pixel
PM::Matrix features(4,vec.size());
PM::DataPoints::Labels featureLabels;
featureLabels.resize(4);
featureLabels[0] = PM::DataPoints::Label::Label("x");
featureLabels[1] = PM::DataPoints::Label::Label("y");
featureLabels[2] = PM::DataPoints::Label::Label("z");
featureLabels[3] = PM::DataPoints::Label::Label("pad");
//fill with data
for(int i = 0; i<vec.size(); i++)
{
features.col(i) = vec[i];
}
Because of the large images this loop takes 500ms for 840000 points and thats too slow. Now my idea was to integrate the code above in one parallized function. The problem is that the Eigen::Matrix does not provide a push_back functionality, i dont know the number of visible points in advance and i need the points in the right order to process the point cloud.
So i need a parallel algorithm to extract visible 3D-Points from my range-image and insert them into the Eigen::Matrix in the right order. I'm working with Microsoft Visual Studio 2012 and i can use either OpenMP 2.0 or TBB. I appreciate any help :)
UPDATE
As Arch D. Robison suggeested i tried the tbb::parallel_scan. I passed the mask array and a double array to hold the 3D-coodinates. The output array has four times the size of the input array to store homogeneous 3D data (x,y,z,1). Then i map the otput array in a Eigen::Matrix.The number of rows is fixed and the cols coming from the result from the parallel_scan.
size_t vec_size = width*height;
double* out = new double[vec_size * 4];
size_t m1 = Compress(mask, pixel, out, height, width,
[](unsigned char x) {return x != 0; });
Map<MatrixXd> features(out, 4, m1);
. Here is the code from the operator():
void operator()(const tbb::blocked_range2d<size_t, size_t>& r, Tag) {
// Use local variables instead of member fields inside the loop,
// to improve odds that values will be kept in registers.
size_t j = sum;
const unsigned char* m = in;
const unsigned short* p = in2;
T* values = out;
size_t yend = r.rows().end();
for (size_t y = r.rows().begin(); y != yend; ++y)
{
size_t xend = r.cols().end();
for (size_t x = r.cols().begin(); x != xend; ++x)
{
size_t index = x + y*width;
if (pred(m[index]))
{
if (Tag::is_final_scan())
{
size_t idx = j*4;
values[idx] = (double)x;
values[idx + 1] = (double)y;
values[idx + 2] = p[index];
values[idx + 3] = 1.0;
}
++j;
}
}
}
sum = j;
}
I'm now 4x faster then the serial version. What do you think about this approach? Did i miss anythink and are there improvements? Thanks
Here is an example of how to do something like std::copy_if using tbb::parallel_scan. The key method is operator(), which is usually called twice per subrange, once for a prescan and once for a final scan. (But be aware that TBB omits the prescan when it's not necessary.) Here the prescan just does tallying and the final scan does the final work (which includes replaying the tallying). See https://software.intel.com/sites/default/files/bc/2b/parallel_scan.pdf for more details on the methods. Another good references is https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf , which shows lots of things you can do with parallel scan (a.k.a. prefix-sum).
```
#include "tbb/parallel_scan.h"
#include "tbb/blocked_range.h"
#include <cstddef>
template<typename T, typename Pred>
class Body {
const T* const in;
T* const out;
Pred pred;
size_t sum;
public:
Body( T* in_, T* out_, Pred pred_) :
in(in_), out(out_), pred(pred_), sum(0)
{}
size_t getSum() const {return sum;}
template<typename Tag>
void operator()( const tbb::blocked_range<size_t>& r, Tag ) {
// Use local variables instead of member fields inside the loop,
// to improve odds that values will be kept in registers.
size_t j = sum;
const T* x = in;
T* y = out;
for( size_t i=r.begin(); i<r.end(); ++i ) {
if( pred(x[i]) ) {
if( Tag::is_final_scan() )
y[j] = x[i];
++j;
}
}
sum = j;
}
// Splitting constructor used for parallel fork.
// Note that it's sum(0), not sum(b.sum), because this
// constructor will be used to compute a partial sum.
// Method reverse_join will put together the two sub-sums.
Body( Body& b, tbb::split ) :
in(b.in), out(b.out), pred(b.pred), sum(0)
{}
// Join partial solutions computed by two Body objects.
// Arguments "this" and "a" correspond to the splitting
// constructor arguments "b" and "this". That's why
// it's called a reverse join.
void reverse_join( Body& a ) {
sum += a.sum;
}
void assign( Body& b ) {sum=b.sum;}
};
// Copy to out each element of in that satisfies pred.
// Return number of elements copied.
template<typename T, typename Pred>
size_t Compress( T* in, T* out, size_t n, Pred pred ) {
Body<T,Pred> b(in,out,pred);
tbb::parallel_scan(tbb::blocked_range<size_t>(0,n), b);
return b.getSum();
}
#include <cmath>
#include <algorithm>
#include <cassert>
int main() {
const size_t n = 10000000;
float* a = new float[n];
float* b = new float[n];
float* c = new float[n];
for( size_t i=0; i<n; ++i )
a[i] = std::cos(float(i));
size_t m1 = Compress(a, b, n, [](float x) {return x<0;});
size_t m2 = std::copy_if(a, a+n, c, [](float x) {return x<0;})-c;
assert(m1==m2);
for( size_t i=0; i<n; ++i )
assert(b[i]==c[i]);
}
```
Why do not you check out the condition *(m_maskData+index)==0 before m_features(0,index) = x;?

Nice way to create a dynamic 2D matrix in C++ 11

I already know how to create a dynamic 2D matrix using new and free it using delete. Since C++ 11 is here with many new memory features such as unique_ptr, array container etc.; what is a nice way to create a 2D matrix so that one needs not to free the matrix explicitly using delete operator?
One of the simplest ways is to use a vector of vectors
const int N = 10;
const int M = 10;
vector<vector<int>> matrix2d(N, vector<int>(M, 0)); // 10x10 zero-initialized matrix
matrix2d[0][0] = 42;
You could of course use a single vector and wrap it into an accessor class
vector<int> matrix(N * M, 0) // Ditto as above, but needs stride-aware accessors
I'll post a small example here for completeness' sake
template <typename T>
class Matrix2D {
std::vector<T> data;
unsigned int sizeX, sizeY;
public:
Matrix2D (unsigned int x, unsigned int y)
: sizeX (x), sizeY (y) {
data.resize (sizeX*sizeY);
}
T& operator()(unsigned int x, unsigned int y) {
if (x >= sizeX || y>= sizeY)
throw std::out_of_range("OOB access"); // Throw something more appropriate
return data[sizeX*y + x]; // Stride-aware access
}
};
Live Example
or perhaps combine your way with a smart pointer. Notice that the vector<vector<int>> approach should be used with caution since the vectors are independent from each other and there's nothing to enforce that they should keep their size fixed.
I strongly suggest using array_view from the GSL, which will eventually be part of the standard.
#include <array>
#include <vector>
#include "array_view.h" // must be manually imported until standardization
int main()
{
std::array<int, 10> arr{}; // 10 ints on the stack
std::vector<int> vec{12}; // 12 ints on the heap
auto a = gsl::array_view<int, 2>{{2, 5}, arr}; // 2D, 2x5 matrix
auto b = gsl::array_view<int, 3>{{3, 2, 2}, vec}; // 3D, 3x2x2 matrix
auto c = gsl::array_view<int>{vec}; // 1D, spans from `begin()` to `end()`
a[{0,3}] += b[{0,1,1}] * -c[2]; // access syntax
}
N.B. array_view holds no control over the lifetime of the range it looks at. See here for full details.
Edit:
array_view is dead as it was becoming too complicated in handling multidimensional arrays with zero cost abstraction. You should instead use span from the GSL.
See this for more information about span.
Based on above answers, I have found a simple way to create matrices although not using C++11 features. Here is an illustration.
#include <iostream>
#include <vector>
using namespace std;
typedef vector<vector<int>> Matrix2D;
typedef vector<Matrix2D> Matrix3D;
Matrix2D my_arr;
int main()
{
const size_t N = 9;
for(unsigned s = 4; s <= N; s++)
{
my_arr.resize(s);
for(unsigned i = 0; i < s; i++)
my_arr[i].resize(s,s);
for(unsigned i = 0; i < s; i++)
{
for(unsigned j = 0; j < s; j++)
cout << my_arr[i][j] << " ";
cout << endl;
}
cout << "\n\n";
}
return 0;
}

Crafting a custom call to inner_product (C++ STL)

I'm having trouble getting my head around how to use inner_product to combine a std::vector<float> and a std::vector<std::vector<float>>. Given, e.g., <2,3> and <<4,5>,<6,7>>, I'd like inner_product to produce
2*<4,5> + 3*<6,7> = <8,10> + <18,21> = <26,31>.
Supposing
vector<float> foo;
and
vector<vector<float>> bar;
are initialized and are of equal size, I don't know what UK1, UK2, and UK3 in
vector<float> ip =
inner_product(foo.begin(), foo.end(), bar.begin(), UK1, UK2, UK3);
should be. I suspect UK1 should be a vector filled with 0.0fs, of the same size as the vectors in bar. UK3 should perhaps be something like
std::transform(UK4.begin(), UK4.end(), UK4.begin(),
std::bind1st(std::multiplies<float>(), UK5));
And I guess UK2 should somehow represent component-wise vector<float> addition!
I don't even want to think about how much more complex this will become when the vectors in bar are replaced by objects of a class with float attributes...
Define the add and multiply functions like this:
static vector<float> add(const vector<float> &a,const vector<float> &b)
{
assert(a.size()==b.size());
size_t n = a.size();
vector<float> result(n);
for (size_t i=0; i!=n; ++i) {
result[i] = a[i]+b[i];
}
return result;
}
static vector<float> mul(const float &a,const vector<float> &b)
{
size_t n = b.size();
vector<float> result = b;
for (size_t i=0; i!=n; ++i) {
result[i] *= a;
}
return result;
}
And use it like this:
vector<float> zero(2,0);
vector<float> result =
inner_product(a.begin(),a.end(),b.begin(),zero,add,mul);
std::inner_product accumulates a sum. UK2 is the sum operation. UK1 is normally the neutral element of UK2. UK3 is the multiplication operation that multiplies corresponding elements of the two sequences and returns a summand for UK2.
I guess std::valarray should provide a more suitable container for this operation.
// this is c++11
#include <vector>
#include <valarray>
#include <algorithm>
#include <iostream>
int main ()
{
std::valarray <int> zero = { 0, 0 };
std::vector <int> foo = { 1, 2, 3 };
std::vector <std::valarray<int>> bar = { { 3, 4 }, { 5, 6 }, { 7, 8 } };
std::valarray<int> result = std::inner_product (foo.begin(), foo.end(),
bar.begin(), zero);
for(auto n : result) {
std::cout << n << ' ';
}
std::cout << std::endl;
}

Virtual concatenation with arrays

Suppose I have 3 double precision arrays a1[], a2[], a3[] each of length L1, L2, L3
Suppose I want to concatenate these arrays "virtually" That is I want to create a virtual
array a_virtual[] such that a_virtual = {a1[L1], a2[L2], a3[L3]} logically, though physically these arrays may not be contiguous to each other.
So if I want to access a_virtual[5] and L1=2, L2=3, L3=1 then a3[0] will be fetched. For accessing a_virtual[0], a1[0] will be fetched
How would I do this
in C
in C++ (how to do this with std::vectors in place of arrays would
also be useful)
in CUDA
I suspect if there is a way to do it, it would be the same for all the three environments, but there might be more efficient ways to do this within each environment depending on the
capabilities provided.
Here's a possible solution in C, using linked-list and (tail) recursion:
#include <stdio.h>
struct dblarr {
double *data;
size_t len;
struct dblarr *next;
};
double *fetch(const struct dblarr *arr, size_t index) {
if (arr == NULL) return NULL;
if (index < arr->len) return arr->data + index;
return fetch(arr->next, index - arr->len);
}
int main(void) {
double a1[2] = {1, 2};
double a2[3] = {1, 2, 3};
double a3[1] = {1};
struct dblarr x1, x2, x3;
x1.data = a1; x1.len = sizeof a1 / sizeof *a1; x1.next = &x2;
x2.data = a2; x2.len = sizeof a2 / sizeof *a2; x2.next = &x3;
x3.data = a3; x3.len = sizeof a3 / sizeof *a3; x3.next = NULL;
printf("before %f\n", *fetch(&x1, 5));
*fetch(&x1, 5) = 0.42;
printf(" after %f\n", *fetch(&x1, 5));
return 0;
}
You can "see the code running" at http://ideone.com/mY0ix.
How about something like the following? It's pretty hard-coded and not the best/ cleanest code, but maybe you can generalize this logic?
#include <iostream>
#include <vector>
using namespace std;
void logicalConcat(vector<int>& a1, vector<int>& a2, vector<int>& a3, int k) {
if(k > a1.size() - 1)
k -= a1.size();
else {
cout << a1[k] << endl;
return;
}
if(k > a2.size() - 1)
k -= a1.size();
else {
cout << a2[k] << endl;
return;
}
cout << a3[k] << endl;
}
Here k would be the index of the the virtual concatenation you want. We're not concatenating anything, just iterating over the vectors.
If the arrays need not be contiguous, then one way to do it is to convert a single index into two indices, one for an array of pointers to the real arrays, and another for the array that has the desired element.
To do this, you would create an array of pointers to those arrays:
double** arrays = {a1, a2, a3};
then an array of their lengths:
int arraysizes = { sizeof(a1) / sizeof(*a1), sizeof(a2) / sizeof(*a2), sizeof(a3) / sizeof(*a3) };
Then, given an index n, you can calculate the two indices for arrays by doing
int i1 = 0, j = 0;
while (n - arraysizes[j] >= 0)
n -= arraysizes[j++], ++i1;
And you can then index the array of pointers like this to get the actual element:
arrays[n][i2]
You could also create a wrapper class to do this arithmetic with a built in operator[].
Some of the others provide an answer for a basic C implementation.
Here is some sample code for a general-purpose c++ implementation of a class which creates virtual concatenated arrays without copying any of the array elements. Once created, the virtual array can be indexed like an ordinary vector (to be read or written to):
#include <vector>
#include <map>
#include <iostream>;
using namespace std;
class VirtualArray {
public:
multimap<int,double*> startIndices; // reverse map of starting index to its sub array
int size;
VirtualArray() : size(0) {}
double & operator[](int i) {
// find proper subarray in log(n) time
multimap<int,double*>::iterator iter = --startIndices.upper_bound(i);
double *subarray = iter->second;
int startIndex = iter->first;
// index into subarray
return subarray[i-startIndex];
}
void addArray(double* array, int length) {
startIndices.insert(make_pair(size, array));
size += length;
}
void addVector(vector<double> & vec) {
startIndices.insert(make_pair(size, vec.data()));
size += vec.size();
}
};
int main() {
double a1[3], a2[4], a3[6] = {1, 2, 3, 4, 5, 6};
int L1 = 3, L2 = 4, L3 = 6;
vector<double> a3vec;
a3vec.assign(a3,a3+6);
VirtualArray vArray;
vArray.addArray(a1,L1);
vArray.addArray(a2,L2);
vArray.addVector(a3vec);
cout << vArray[10];
return 0;
}