Parallelization of a for loop consisting of Thrust Transforms - c++

I've implemented a for loop consisting of several Thrust transformations. My aim is to calculate r[i] for each value of i from 0 to N-1. To put it simply, r is a column vector and each of its elements can be calculated independently.
Therefore, I'm looking for a way to parallelize the for loop given below:
for(int i=0; i < N; i++) {
thrust::device_vector<float> P(N, 0.0);
thrust::device_vector<int> corr_col_indices_d(col_indices.begin() + row_begin[i], col_indices.begin() + row_begin[i+1]); // indices of the columns
thrust::device_vector<float> corr_values_d(values_d.begin() + row_begin[i], values_d.begin() + row_begin[i+1]); // values of the columns
// P[j] = corr_values_d[k] if j is in corr_col_indices_d, else 0 (increment k if j is in corr_col_indices_d)
thrust::scatter(corr_values_d.begin(), corr_values_d.end(), corr_col_indices_d.begin(), P.begin());
r2[i] = thrust::inner_product(P.begin(), P.end(), r1.begin(), 0.0f);
}
1) After lots of googling and roaming around Stack Overflow and NVIDIA, I attempted to put all the successive transformations into one bigger "transform" with a loop variable i.
auto counting_iter = thrust::make_counting_iterator(0);
thrust::transform(counting_iter, counting_iter + N, r2.begin(), [&](int i) {
thrust::device_vector<float> P(N, 0.0);
thrust::device_vector<int> corr_col_indices_d(col_indices.begin() + row_begin[i], col_indices.begin() + row_begin[i+1]); // indices of the columns
thrust::device_vector<float> corr_values_d(values_d.begin() + row_begin[i], values_d.begin() + row_begin[i+1]);
thrust::scatter(corr_values_d.begin(), corr_values_d.end(), corr_col_indices_d.begin(), P.begin());
thrust::transform(P.begin(), P.end(), r1.begin(), P.begin(), thrust::multiplies<float>());
return thrust::reduce(P.begin(), P.end());
});
Unfortunately, it doesn't work. Either there is no such thing as nesting transformations like this, or my syntax is wrong.
2) Then I tried to create a functor that takes all these device_vectors as input and operates on them. As stated here, it's not possible to pass device_vectors to a functor from the outside, so I attempted to pass them as raw pointers.
struct loop {
// constructor that takes a vector as a parameter
__host__ __device__
loop(int *t_row_begin, int *t_col_indices, float*t_values, float *r1):
t_row_begin_(t_row_begin), t_col_indices_(t_col_indices), t_values_(t_values), r1_(r1) {}
// member variable to store the vector
int *t_row_begin_;
int *t_col_indices_;
float *t_values_;
float *r1_;
__host__ __device__
float operator()(int i) const {
thrust::device_vector<float> P(N, 0.0);
thrust::device_vector<int> corr_col_indices_d(t_col_indices_ + t_row_begin_[i], t_col_indices_ + t_row_begin_[i + 1]); // indices of the columns
thrust::device_vector<float> corr_values_d(t_values_ + t_row_begin_[i], t_values_ + t_row_begin_[i+1]); // values of the columns
thrust::scatter(corr_values_d.begin(), corr_values_d.end(), corr_col_indices_d.begin(), P.begin());
return thrust::inner_product(P.begin(), P.end(), r1_, 0.0f);
}
};
and the loop itself:
loop lp(thrust::raw_pointer_cast(row_begin_d.data()),
thrust::raw_pointer_cast(col_indices_d.data()),
thrust::raw_pointer_cast(values_d.data()),
thrust::raw_pointer_cast(r1.data()));
auto iter = thrust::make_counting_iterator(0);
// perform the operations for each iteration of the loop using transform
thrust::transform(iter, iter + N, r2.begin(), lp);
3) I even tried passing the arguments to operator() rather than to the constructor of the functor:
struct loop {
__host__ __device__
float operator()(int i, thrust::device_vector<int>& col_indices, thrust::device_vector<float>& values_d, thrust::device_vector<int>& row_begin, thrust::device_vector<float>& r1) const {
thrust::device_vector<float> P(N, 0.0);
thrust::device_vector<int> corr_col_indices_d(col_indices.begin() + row_begin[i], col_indices.begin() + row_begin[i+1]); // indices of the columns
thrust::device_vector<float> corr_values_d(values_d.begin() + row_begin[i], values_d.begin() + row_begin[i+1]); // values of the columns
thrust::scatter(corr_values_d.begin(), corr_values_d.end(), corr_col_indices_d.begin(), P.begin());
return thrust::inner_product(P.begin(), P.end(), r1.begin(), 0.0f);
}
};
auto iter = thrust::make_counting_iterator(0);
thrust::transform(iter, iter + N, r2.begin(),
thrust::make_transform_iterator(iter, loop()),
thrust::make_zip_iterator(thrust::make_tuple(col_indices, values_d, row_begin, r1)));
None of these compiles, and the complicated error messages don't really help. So I'm looking for some assistance at this point.
CUDA version: 11.2
Thrust version: 1.10.0
Edit: In case you wonder, those vectors correspond to components of CSR matrix representation:
vector<int> row_begin;
vector<float> values;
vector<int> col_indices;
Updates
Fused transform and reduce into inner_product, as suggested by @paleonix.

To show my thought process when refactoring your code, I provide you with multiple steps/intermediate solutions:
Get rid of the allocations inside the loop: they are expensive, you do not need copies of the rows, and P can be reused:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/fill.h>
#include <thrust/scatter.h>
#include <thrust/inner_product.h>
void foo(int N,
thrust::host_vector<int> const &row_begin,
thrust::device_vector<int> const &col_indices,
thrust::device_vector<float> const &values_d,
thrust::device_vector<float> const &r1,
thrust::host_vector<float> &r2) {
thrust::device_vector<float> P(N);
for(int i = 0; i < N; ++i) {
thrust::fill(P.begin(), P.end(), 0.0f);
// P[j] = corr_values_d[k] if j is in corr_col_indices_d, else 0 (increment k if j is in corr_col_indices_d)
thrust::scatter(values_d.cbegin() + row_begin[i],
values_d.cbegin() + row_begin[i+1],
col_indices.cbegin() + row_begin[i],
P.begin());
r2[i] = thrust::inner_product(P.cbegin(), P.cend(),
r1.cbegin(),
0.0f);
}
}
Use a permutation iterator on r1 instead of scattering the values into P. This is much more efficient as it avoids unnecessary memory accesses. The scattered and therefore non-coalesced access is expensive, so if you were to access the data more than once in this fashion, scattering into P with thrust::scatter could become worthwhile again.
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/inner_product.h>
#include <thrust/iterator/permutation_iterator.h>
void foo(int N,
thrust::host_vector<int> const &row_begin,
thrust::device_vector<int> const &col_indices,
thrust::device_vector<float> const &values_d,
thrust::device_vector<float> const &r1,
thrust::host_vector<float> &r2) {
auto const r1_iter =
thrust::make_permutation_iterator(
r1.cbegin(),
col_indices.cbegin());
for(int i = 0; i < N; ++i) {
r2[i] =
thrust::inner_product(
values_d.cbegin() + row_begin[i],
values_d.cbegin() + row_begin[i+1],
r1_iter + row_begin[i],
0.0f);
}
}
There is not much parallelism left in the inner_product. So do it sequentially and parallelize the outer loop:
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/transform.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
void foo(int N,
thrust::device_vector<int> const &row_begin,
thrust::device_vector<int> const &col_indices,
thrust::device_vector<float> const &values_d,
thrust::device_vector<float> const &r1,
thrust::device_vector<float> &r2) {
auto const row_begin_ptr = row_begin.data();
auto const col_indices_ptr = col_indices.data();
auto const values_d_ptr = values_d.data();
auto const r1_iter =
thrust::make_permutation_iterator(
r1.cbegin(),
col_indices.cbegin());
thrust::transform(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(0) + N,
r2.begin(),
[=] __host__ __device__ (int i){
return thrust::inner_product(thrust::seq,
values_d_ptr + row_begin_ptr[i],
values_d_ptr + row_begin_ptr[i+1],
r1_iter + row_begin_ptr[i],
0.0f);
});
}
While the above solution should be sufficient for e.g. banded matrices where the rows are all very small and regular, irregularities like single long rows will make it quite inefficient again due to work imbalance and warp divergence. The alternative is to use a segmented/batched reduction as implemented by thrust::reduce_by_key. To use reduce_by_key here, one would need to "decompress" the CSR matrix (transforming the row offsets into per-element row keys), even though Thrust might go back to row offsets under the hood either way to use CUB in the back end.
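For illustration only (this is not part of the original answer), here is a minimal sketch of that reduce_by_key alternative. It assumes row_begin holds N+1 offsets with row_begin[N] equal to the number of nonzeros, and that no row is empty, since reduce_by_key emits no output element for an empty segment. The row offsets are "decompressed" into one row key per nonzero with a vectorized upper_bound:
```
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
void foo_reduce_by_key(int N,
                       thrust::device_vector<int> const &row_begin,
                       thrust::device_vector<int> const &col_indices,
                       thrust::device_vector<float> const &values_d,
                       thrust::device_vector<float> const &r1,
                       thrust::device_vector<float> &r2) {
    int const nnz = row_begin.back(); // assumes row_begin has N+1 entries
    // "Decompress" the row offsets: keys[j] = i such that row_begin[i] <= j < row_begin[i+1]
    thrust::device_vector<int> keys(nnz);
    thrust::upper_bound(row_begin.cbegin() + 1, row_begin.cend(),
                        thrust::make_counting_iterator(0),
                        thrust::make_counting_iterator(nnz),
                        keys.begin());
    // Fused gather + multiply: values_d[j] * r1[col_indices[j]]
    auto const col_indices_ptr = col_indices.data();
    auto const values_d_ptr = values_d.data();
    auto const r1_ptr = r1.data();
    auto const corr_iter = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [=] __host__ __device__ (int j){
            return values_d_ptr[j] * r1_ptr[col_indices_ptr[j]];
        });
    // Segmented reduction: one sum per consecutive run of equal row keys
    // (an empty row would silently produce no output element here)
    thrust::reduce_by_key(keys.cbegin(), keys.cend(),
                          corr_iter,
                          thrust::make_discard_iterator(),
                          r2.begin());
}
```
The extra keys vector and its construction are exactly the overhead that the CUB-based solution below avoids.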
To avoid this inefficiency, I used CUB directly via cub::DeviceSegmentedReduce::Sum. To still fuse the transform/multiplication into the reduction, one can use a transform iterator. I also ditched the permutation iterator and implemented the gather directly in the transform iterator:
#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
void foo(int N,
thrust::device_vector<int> const &row_begin,
thrust::device_vector<int> const &col_indices,
thrust::device_vector<float> const &values_d,
thrust::device_vector<float> const &r1,
thrust::device_vector<float> &r2) {
auto const col_indices_ptr = col_indices.data();
auto const values_d_ptr = values_d.data();
auto const r1_ptr = r1.data();
auto const corr_iter =
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[=] __host__ __device__ (int j){
return values_d_ptr[j] * r1_ptr[col_indices_ptr[j]];
});
// Determine temporary storage
size_t temp_storage_bytes = 0;
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes,
corr_iter,
r2.begin(),
N,
row_begin.cbegin(), row_begin.cbegin() + 1);
// Allocate temporary storage
thrust::device_vector<char> d_temp_storage(temp_storage_bytes);
// Run sum-reduction
cub::DeviceSegmentedReduce::Sum(thrust::raw_pointer_cast(d_temp_storage.data()),
temp_storage_bytes,
corr_iter,
r2.begin(),
N,
row_begin.cbegin(), row_begin.cbegin() + 1);
}
Avoiding Temporary Buffer Initialization (and Allocation)
The only thing missing from this last solution for "ideal" performance is that the temporary storage is unnecessarily initialized. This can be avoided by using a custom allocator, as shown in the Thrust example uninitialized_vector.cu. I did not include it in the above code to avoid the bloat.
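For reference, a minimal sketch of such an allocator, modeled on that example (the alias name is just illustrative); d_temp_storage above could then be declared as an uninitialized_buffer instead of a plain thrust::device_vector<char>:
```
#include <thrust/device_malloc_allocator.h>
#include <thrust/device_vector.h>
// Allocator whose construct() is a no-op, following Thrust's
// examples/uninitialized_vector.cu: resizing the vector then no longer
// launches a fill kernel over the temporary storage.
template <typename T>
struct uninitialized_allocator : thrust::device_malloc_allocator<T>
{
    __host__ __device__
    void construct(T *p) { /* intentionally empty */ }
};
// Drop-in replacement for the temporary storage buffer used above
using uninitialized_buffer =
    thrust::device_vector<char, uninitialized_allocator<char>>;
```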
An even nicer solution is the rmm::device_buffer from the RAPIDS Memory Manager, but this one is not included in the CUDA Toolkit.
In the future libcudac++ will hopefully give us a similarly nice C++ option, as they are working on memory resources at the moment.
If this operation is done repeatedly, one can also just reuse the temporary memory. Even when not using CUB directly, one can achieve this using a pool memory resource; see the Thrust examples cuda/custom_temporary_allocation.cu and mr_basic.cu.
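When calling CUB directly as above, a simple way to get that reuse, sketched here under the assumption that the operation is invoked many times with similar sizes (the workspace struct and its name are purely illustrative), is to keep the temporary buffer alive between calls and grow it only when needed:
```
#include <cub/cub.cuh>
#include <thrust/device_vector.h>
// Temporary storage kept across calls; an uninitialized allocator (see above)
// could additionally be used to skip the fill on resize.
struct segmented_sum_workspace {
    thrust::device_vector<char> temp;

    template <typename InIt, typename OutIt, typename OffsetIt>
    void sum(InIt in, OutIt out, int num_segments,
             OffsetIt offsets_begin, OffsetIt offsets_end) {
        // Query the required temporary storage size
        size_t temp_storage_bytes = 0;
        cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes,
                                        in, out, num_segments,
                                        offsets_begin, offsets_end);
        // Allocate only when the buffer is too small (rarely after warm-up)
        if (temp_storage_bytes > temp.size())
            temp.resize(temp_storage_bytes);
        // Run the segmented sum-reduction with the reused buffer
        cub::DeviceSegmentedReduce::Sum(thrust::raw_pointer_cast(temp.data()),
                                        temp_storage_bytes,
                                        in, out, num_segments,
                                        offsets_begin, offsets_end);
    }
};
```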

Related

Sub-vector of 2D vector

I have std::vector<std::vector<double>> A(300, std::vector<double>(500)).
I want to create a new vector with a sub-range of A: sub-vector[5:10][25:100].
How can I do this?
You can use iterators. First create the vector:
std::vector<std::vector<double>> sub_vector;
sub_vector.reserve(5);
Then populate it with the range constructor of vector:
for (std::size_t i = 5; i < 10; ++i) {
sub_vector.emplace_back(A[i].begin() + 25, A[i].begin() + 100);
}
Notes:
You are responsible for ensuring that the indexes are in range. Otherwise this results in undefined behavior.
This deals with closed-open (half-open) ranges. If you want closed-closed ranges, you need to add one to the end indexes.
There is no simple notation like that; you have to roll your own:
std::vector<std::vector<double>> A(300, std::vector<double>(500));
std::vector<std::vector<double>> subranges;
subranges.reserve(11 - 5);
std::transform(A.begin() + 5, A.begin() + 11,
std::back_inserter(subranges),
[](const auto& inner){ // [](const std::vector<double>& inner) {
return std::vector<double>(inner.begin() + 25,
inner.begin() + 101);
});
Yet another solution: using a function to do the job.
#include <iostream>
#include <vector>
#include <iterator>
#include <algorithm>
constexpr size_t MaxRows = 300;
constexpr size_t MaxColumns = 500;
using MyType = double;
using Columns = std::vector<MyType>;
using Matrix = std::vector<Columns>;
void copySubMatrix( const Matrix& source,Matrix& destination,const size_t& startRow,const size_t& endRow,const size_t& startColumn, const size_t& endColumn)
{
// Clear destination matrix
destination.clear();
// Copy rows and columns
std::for_each(source.begin() + startRow, source.begin() + endRow + 1, [&](const Columns & c) {
Columns row{ c.begin() + startColumn, c.begin() + endColumn + 1};
destination.push_back(row); });
}
int main() {
// Define source matrix with given size and empty destination matrix
Matrix A(MaxRows, Columns(MaxColumns));
Matrix result{};
// Fill source matrix with running values
std::for_each(A.begin(), A.end(), [i = 0](Columns & c) mutable {for (MyType& m : c) m = i++; });
// Copy the given range to the destination matrix
copySubMatrix(A, result, 5, 10, 25, 100);
// Display destination matrix
std::for_each(result.begin(), result.end(), [](const Columns & c) {
std::copy(c.begin(), c.end(), std::ostream_iterator<MyType>(std::cout, " ")); std::cout << "\n"; });
return 0;
}
You can do this by iterating over one dimension:
std::vector<std::vector<double>> main_vector(300, std::vector<double>(500));
std::vector<std::vector<double>> sub_vector;
std::vector<double>::const_iterator first, last;
unsigned int x_pos_start=5, x_pos_end=10;
unsigned int y_pos_start=25, y_pos_end=100;
sub_vector.resize(x_pos_end - x_pos_start + 1);
for(size_t i=x_pos_start; i<=x_pos_end; ++i)
{
first = main_vector[i].begin() + y_pos_start;
last = main_vector[i].begin() + y_pos_end;
sub_vector[i - x_pos_start].insert(sub_vector[i - x_pos_start].begin(), first, last);
}

Create a contiguous dynamic matrix

Arrays have this nice property of being contiguous blocks of memory. When using new to allocate memory for an array, it returns a pointer to a contiguous block of memory. However, if I allocate a matrix using new, like this:
#include <iostream> //std::cin
int main()
{
int n, m;
std::cin >> n >> m;
int** mat = new int*[n];
for (int i = 0; i < n; i++)
mat[i] = new int[m];
//use the matrix in some way
for (int i = 0; i < n; i++)
delete[] mat[i];
delete[] mat;
return 0;
}
This works, but mat doesn't point to a contiguous block of size n * m * sizeof(int). How can I do this in C++? I am just using the latest standard (that is, C++17) and nothing else. I want an answer that doesn't involve STL containers or external libraries.
Please don't answer about C, as that is pretty easy to do in both C99 and C11 using variable-length arrays:
#include <stdio.h> //scanf
#include <stdlib.h> //malloc, free
int main()
{
int n, m;
scanf("%d %d", &n, &m);
//int mat[n][m]; VLA, but I want dynamic
int (*mat)[m] = malloc(n * sizeof *mat);
//use the matrix in some way;
free(mat);
return 0;
}
Here's what you were doing, almost exactly the same but without the non-contiguous memory:
#include <iostream> //std::cin
#include <memory>
int main()
{
int n, m;
std::cin >> n >> m;
auto matrix_data = std::make_unique<int[]>(n * m);
auto mat = std::make_unique<int*[]>(n); // array of row pointers into matrix_data
for(int i = 0; i < n; i++) { mat[i] = matrix_data.get() + i * m; }
// Use the matrix in some way
// No need to free anything - we're using smart pointers.
// No need to return 0 from main - that's the default
}
Notes:
This is still ugly code... you'd be better off creating a proper matrix class, or better still, using somebody else's implementation.
It is better to follow @Someprogrammerdude's suggestion and use index arithmetic rather than an array of pointers.
I am still not quite sure what in particular you are asking for. To store matrix elements in a contiguous location, simply allocate the memory for them as a one-dimensional dynamic array. The two basic options have already been discussed: either use a vector:
std::vector<int> mat(m * n);
or, if its memory overhead is significant for you, use a unique pointer:
auto mat = std::make_unique<int[]>(m * n);
Then, to access an element with i row index and j column index, simply use:
a_i_j = mat[i * n + j];
assuming m is the number of rows and n is the number of columns. This formula stores elements in so-called row-major order. If you need the column-major order, switch to:
a_i_j = mat[j * m + i];
Of course, the whole approach would be much better encapsulated in a class with a getter like operator()(i, j), used as mat(i, j).
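A minimal sketch of such a class (names are purely illustrative): the elements live in one contiguous allocation and mat(i, j) does the row-major index arithmetic internally:
```
#include <cstddef>
#include <vector>
// Contiguous row-major matrix with mat(i, j) element access
class SimpleMatrix {
    std::size_t rows_, cols_;
    std::vector<int> data_; // single allocation of rows * cols elements
public:
    SimpleMatrix(std::size_t rows, std::size_t cols)
        : rows_(rows), cols_(cols), data_(rows * cols) {}
    int &operator()(std::size_t i, std::size_t j) { return data_[i * cols_ + j]; }
    const int &operator()(std::size_t i, std::size_t j) const { return data_[i * cols_ + j]; }
    std::size_t rows() const { return rows_; }
    std::size_t cols() const { return cols_; }
};
// Usage: SimpleMatrix mat(m, n); mat(i, j) = 42; // m rows, n columns
```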
Here is a 2D matrix class that enables m[i][j] support and has a contiguous storage. It contains a minimalistic set of member functions to create and access elements.
It's a shame that the standard library does not provide such functionality; [i][j] element access is way less error-prone than index arithmetic.
#include <cstddef>
#include <iterator>
#include <utility>
#include <vector>
template <typename T>
class matrix
{
public:
using value_type = T;
private:
class row_view
{
public:
constexpr row_view(size_t length, value_type *begin)
: _length(length),
_begin(begin)
{}
value_type &operator[](size_t i) {
// TODO: check bounds
return *std::next(_begin, i);
}
const value_type &operator[](size_t i) const {
// TODO: check bounds
return *std::next(_begin, i);
}
size_t size() const {
return _length;
}
private:
size_t _length;
value_type *_begin;
};
public:
matrix(size_t rows, size_t cols, value_type &&defaultVal)
: _rows(rows),
_array(rows * cols, std::move(defaultVal))
{}
matrix(size_t rows, size_t cols)
: matrix(rows, cols, value_type{})
{}
size_t rows() const noexcept {
return _rows;
}
size_t cols() const noexcept {
return _array.size() / _rows;
}
auto operator[](size_t rowIndex) -> row_view{
const size_t offset = cols() * rowIndex;
return {cols(), &_array[offset]};
}
auto operator[](size_t rowIndex) const -> const row_view{
const size_t offset = cols() * rowIndex;
return {cols(), &_array[offset]};
}
private:
size_t _rows;
std::vector<value_type> _array;
};
https://godbolt.org/z/qoG6jGbh5

CUDA: how to do a matrix multiplication using thrust?

I'm new to CUDA and Thrust, and I'm trying to implement a matrix multiplication using only Thrust algorithms, because I want to avoid calling a kernel manually.
Is there a way I can achieve this efficiently? (At least without using 2 nested for loops.)
Or do I have to resign myself and call a CUDA kernel?
//My data
thrust::device_vector<float> data(n*m);
thrust::device_vector<float> other(m*r);
thrust::device_vector<float> result(n*r);
// To make indexing faster, not really needed
transpose(other);
// My current approach
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < r;++j)
{
result[i*r + j] = thrust::inner_product(data.begin()+(i*m), data.begin()+((i+1)*m), other.begin()+(j*m), 0.0f);
}
}
If you are interested in performance (usually why people use GPUs for computing tasks), you should not use Thrust and you should not write your own CUDA kernel. You should use the CUBLAS library. For a learning exercise, if you want to study your own CUDA kernel, you can refer to a first-level-optimized CUDA version in the shared memory section of the CUDA programming guide. If you really want to use Thrust with a single Thrust call, it is possible.
The basic idea is to use an element-wise operation like thrust::transform as described here. The per-output-array-element dot-product is computed with a functor consisting of a loop.
Here's a worked example considering 3 methods: your original double-nested-loop method (relatively slow), a single-thrust-call method (faster), and the CUBLAS method (fastest, certainly for larger matrix sizes). The code below only runs method 1 for matrix side dimensions of 200 or less, because it is so slow. Here is an example run on a Tesla P100:
$ cat t463.cu
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/inner_product.h>
#include <thrust/execution_policy.h>
#include <thrust/equal.h>
#include <thrust/iterator/constant_iterator.h>
#include <cublas_v2.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <cstdlib>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct dp
{
float *A, *B;
int m,n,r;
dp(float *_A, float *_B, int _m, int _n, int _r): A(_A), B(_B), m(_m), n(_n), r(_r) {};
__host__ __device__
float operator()(size_t idx){
float sum = 0.0f;
int row = idx/r;
int col = idx - (row*r); // cheaper modulo
for (int i = 0; i < m; i++)
sum += A[row + i*n] * B[col + i*r]; // dot product traversing a "column" of each (pretend-transposed) input
return sum;}
};
const int dsd = 200;
int main(int argc, char *argv[]){
int ds = dsd;
if (argc > 1) ds = atoi(argv[1]);
const int n = ds;
const int m = ds;
const int r = ds;
// data setup
thrust::device_vector<float> data(n*m,1);
thrust::device_vector<float> other(m*r,1);
thrust::device_vector<float> result(n*r,0);
// method 1
//let's pretend that other is (already) transposed for efficient memory access by thrust
// therefore each dot-product is formed using a row of data and a row of other
long long dt = dtime_usec(0);
if (ds < 201){
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < r;++j)
{
result[i*r+ j] = thrust::inner_product(data.begin()+(i*m), data.begin()+((i+1)*m),other.begin()+(j*m), 0.0f);
}
}
cudaDeviceSynchronize();
dt = dtime_usec(dt);
if (thrust::equal(result.begin(), result.end(), thrust::constant_iterator<float>(m)))
std::cout << "method 1 time: " << dt/(float)USECPSEC << "s" << std::endl;
else
std::cout << "method 1 failure" << std::endl;
}
thrust::fill(result.begin(), result.end(), 0);
cudaDeviceSynchronize();
// method 2
//let's pretend that data is (already) transposed for efficient memory access by thrust
// therefore each dot-product is formed using a column of data and a column of other
dt = dtime_usec(0);
thrust::transform(thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(n*r), result.begin(), dp(thrust::raw_pointer_cast(data.data()), thrust::raw_pointer_cast(other.data()), m, n, r));
cudaDeviceSynchronize();
dt = dtime_usec(dt);
if (thrust::equal(result.begin(), result.end(), thrust::constant_iterator<float>(m)))
std::cout << "method 2 time: " << dt/(float)USECPSEC << "s" << std::endl;
else
std::cout << "method 2 failure" << std::endl;
// method 3
// once again, let's pretend the data is ready to go for CUBLAS
cublasHandle_t h;
cublasCreate(&h);
thrust::fill(result.begin(), result.end(), 0);
float alpha = 1.0f;
float beta = 0.0f;
cudaDeviceSynchronize();
dt = dtime_usec(0);
cublasSgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, n, r, m, &alpha, thrust::raw_pointer_cast(data.data()), n, thrust::raw_pointer_cast(other.data()), m, &beta, thrust::raw_pointer_cast(result.data()), n);
cudaDeviceSynchronize();
dt = dtime_usec(dt);
if (thrust::equal(result.begin(), result.end(), thrust::constant_iterator<float>(m)))
std::cout << "method 3 time: " << dt/(float)USECPSEC << "s" << std::endl;
else
std::cout << "method 3 failure" << std::endl;
}
$ nvcc -o t463 t463.cu -lcublas
$ ./t463
method 1 time: 20.1648s
method 2 time: 6.3e-05s
method 3 time: 5.7e-05s
$ ./t463 1024
method 2 time: 0.008063s
method 3 time: 0.000458s
$
For the default dimension 200 case, the single thrust call and cublas method are fairly close, but are much faster than the loop method. For a side dimension of 1024, the cublas method is almost 20x faster than the single thrust call method.
Note that I have chosen "optimal" transpose configurations for all 3 methods. For method 1, the best-case timing is when the inner_product is using a "row" from each input matrix (effectively the transpose of the 2nd input matrix). For method 2, the best-case timing is when the functor is traversing a "column" from each input matrix (effectively the transpose of the first input matrix). For method 3, the choice of CUBLAS_OP_T for both input matrices seems to be fastest. In reality, only the CUBLAS method has the flexibility to be useful for a variety of input cases with good performance.

Call functor for all combinations in Cuda/Thrust

I have two index sets, one in the range [0, N], one in the range [0, M], where N != M. The indices are used to refer to values in different thrust::device_vectors.
Essentially, I want to create one GPU thread for every combination of these indices, so N*M threads. Each thread should compute a value based on the index-combination and store the result in another thrust::device_vector, at a unique index also based on the input combination.
This seems to be a fairly standard problem, but I was unable to find a way to do this in Thrust. The documentation only ever mentions problems where element i of one vector needs to compute something with element i of another vector. There is thrust::permutation_iterator, but as far as I understand it, it only gives me the option to reorder data, and I have to specify the order as well.
Some code:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
int main()
{
// Initialize some data
const int N = 2;
const int M = 3;
thrust::host_vector<int> vec1_host(N);
thrust::host_vector<int> vec2_host(M);
vec1_host[0] = 1;
vec1_host[1] = 5;
vec2_host[0] = -3;
vec2_host[1] = 42;
vec2_host[2] = 9;
// Copy to device
thrust::device_vector<int> vec1_dev = vec1_host;
thrust::device_vector<int> vec2_dev = vec2_host;
// Allocate device memory to copy results to
thrust::device_vector<int> result_dev(vec1_host.size() * vec2_host.size());
// Create functor I want to call on every combination
struct myFunctor
{
thrust::device_vector<int> const& m_vec1;
thrust::device_vector<int> const& m_vec2;
thrust::device_vector<int>& m_result;
myFunctor(thrust::device_vector<int> const& vec1, thrust::device_vector<int> const& vec2, thrust::device_vector<int>& result)
: m_vec1(vec1), m_vec2(vec2), m_result(result)
{
}
__host__ __device__
void operator()(size_t i, size_t j) const
{
m_result[i + j * m_vec1.size()] = m_vec1[i] + m_vec1[j];
}
} func(vec1_dev, vec2_dev, result_dev);
// How do I create N*M threads, each of which calls func(i, j) ?
// Copy results back
thrust::host_vector<int> result_host = result_dev;
for(int i : result_host)
std::cout << i << ", ";
std::cout << std::endl;
// Expected output:
// -2, 2, 43, 47, 10, 14
return 0;
}
I'm fairly sure this is very easy to achieve; I guess I'm just missing the right search terms. Anyway, all help is appreciated :)
Presumably, in your functor's operator(), instead of this:
m_result[i + j * m_vec1.size()] = m_vec1[i] + m_vec1[j];
you meant this (note m_vec2 in the second term):
m_result[i + j * m_vec1.size()] = m_vec1[i] + m_vec2[j];
I think there are probably many ways to tackle this, but so as to not argue about things that are not germane to the question, I'll try and stay as close to your given code as I can.
Operations like [] on a vector are not possible in device code. Therefore we must convert your functor to work on raw data pointers, rather than thrust vector operations directly.
With those caveats, and a slight modification in how we handle your i and j indices, I think what you're asking is not difficult.
The basic strategy is to create a result vector that is of length N*M just as you suggest, then create the indices i and j within the functor operator. In so doing, we need only pass one index to the functor, using e.g. thrust::transform or thrust::for_each to create our output:
$ cat t79.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/execution_policy.h>
#include <iostream>
struct myFunctor
{
const int *m_vec1;
const int *m_vec2;
int *m_result;
size_t v1size;
myFunctor(thrust::device_vector<int> const& vec1, thrust::device_vector<int> const& vec2, thrust::device_vector<int>& result)
{
m_vec1 = thrust::raw_pointer_cast(vec1.data());
m_vec2 = thrust::raw_pointer_cast(vec2.data());
m_result = thrust::raw_pointer_cast(result.data());
v1size = vec1.size();
}
__host__ __device__
void operator()(const size_t x) const
{
size_t i = x%v1size;
size_t j = x/v1size;
m_result[i + j * v1size] = m_vec1[i] + m_vec2[j];
}
};
int main()
{
// Initialize some data
const int N = 2;
const int M = 3;
thrust::host_vector<int> vec1_host(N);
thrust::host_vector<int> vec2_host(M);
vec1_host[0] = 1;
vec1_host[1] = 5;
vec2_host[0] = -3;
vec2_host[1] = 42;
vec2_host[2] = 9;
// Copy to device
thrust::device_vector<int> vec1_dev = vec1_host;
thrust::device_vector<int> vec2_dev = vec2_host;
// Allocate device memory to copy results to
thrust::device_vector<int> result_dev(vec1_host.size() * vec2_host.size());
// How do I create N*M threads, each of which calls func(i, j) ?
thrust::for_each_n(thrust::device, thrust::counting_iterator<size_t>(0), (N*M), myFunctor(vec1_dev, vec2_dev, result_dev));
// Copy results back
thrust::host_vector<int> result_host = result_dev;
for(int i : result_host)
std::cout << i << ", ";
std::cout << std::endl;
// Expected output:
// -2, 2, 43, 47, 10, 14
return 0;
}
$ nvcc -std=c++11 -arch=sm_61 -o t79 t79.cu
$ ./t79
-2, 2, 43, 47, 10, 14,
$
In retrospect, I think this is more or less exactly what @eg0x20 was suggesting.

Parallel order-preserving selection from an array using tbb

I have a range image and want to convert it into a libpointmatcher point cloud. The cloud is an Eigen::Matrix with 4 rows (x,y,z,1) and one column per point.
The range image is an unsigned short* array holding the range values (z) and an unsigned char* array holding information about the pixel visibility.
In serial, my code looks like this:
//container to hold the data
std::vector<Eigen::Vector4d> vec;
vec.reserve(this->Height*this->Width);
//contains information about pixel visibility
unsigned char* mask_data = (unsigned char*)range_image.mask.ToPointer();
//contains the actual pixel data
unsigned short* pixel_data = (unsigned short*)range_image.pixel.ToPointer();
for (int y =0;y < range_image.Height; y++)
{
for (int x = 0; x < range_image.Width; x++)
{
int index =x+y*range_image.Width;
if(*(mask_data+index) != 0)
{
vec.push_back(Eigen::Vector4d(x, y, (double)*(pixel_data+index), 1));
}
}
}
// libpointmatcher point cloud with size of visible pixel
PM::Matrix features(4,vec.size());
PM::DataPoints::Labels featureLabels;
featureLabels.resize(4);
featureLabels[0] = PM::DataPoints::Label::Label("x");
featureLabels[1] = PM::DataPoints::Label::Label("y");
featureLabels[2] = PM::DataPoints::Label::Label("z");
featureLabels[3] = PM::DataPoints::Label::Label("pad");
//fill with data
for(int i = 0; i<vec.size(); i++)
{
features.col(i) = vec[i];
}
Because of the large images, this loop takes 500 ms for 840000 points, and that's too slow. Now my idea was to integrate the code above into one parallelized function. The problem is that Eigen::Matrix does not provide a push_back functionality, I don't know the number of visible points in advance, and I need the points in the right order to process the point cloud.
So I need a parallel algorithm to extract the visible 3D points from my range image and insert them into the Eigen::Matrix in the right order. I'm working with Microsoft Visual Studio 2012 and I can use either OpenMP 2.0 or TBB. I appreciate any help :)
UPDATE
As Arch D. Robison suggested, I tried tbb::parallel_scan. I passed the mask array and a double array to hold the 3D coordinates. The output array has four times the size of the input array to store homogeneous 3D data (x,y,z,1). Then I map the output array into an Eigen::Matrix. The number of rows is fixed and the number of columns comes from the result of the parallel_scan.
size_t vec_size = width*height;
double* out = new double[vec_size * 4];
size_t m1 = Compress(mask, pixel, out, height, width,
[](unsigned char x) {return x != 0; });
Map<MatrixXd> features(out, 4, m1);
Here is the code of the operator():
void operator()(const tbb::blocked_range2d<size_t, size_t>& r, Tag) {
// Use local variables instead of member fields inside the loop,
// to improve odds that values will be kept in registers.
size_t j = sum;
const unsigned char* m = in;
const unsigned short* p = in2;
T* values = out;
size_t yend = r.rows().end();
for (size_t y = r.rows().begin(); y != yend; ++y)
{
size_t xend = r.cols().end();
for (size_t x = r.cols().begin(); x != xend; ++x)
{
size_t index = x + y*width;
if (pred(m[index]))
{
if (Tag::is_final_scan())
{
size_t idx = j*4;
values[idx] = (double)x;
values[idx + 1] = (double)y;
values[idx + 2] = p[index];
values[idx + 3] = 1.0;
}
++j;
}
}
}
sum = j;
}
I'm now 4x faster than the serial version. What do you think about this approach? Did I miss anything, and are there improvements? Thanks.
Here is an example of how to do something like std::copy_if using tbb::parallel_scan. The key method is operator(), which is usually called twice per subrange, once for a prescan and once for a final scan. (But be aware that TBB omits the prescan when it's not necessary.) Here the prescan just does tallying and the final scan does the final work (which includes replaying the tallying). See https://software.intel.com/sites/default/files/bc/2b/parallel_scan.pdf for more details on the methods. Another good reference is https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf , which shows lots of things you can do with a parallel scan (a.k.a. prefix sum).
```
#include "tbb/parallel_scan.h"
#include "tbb/blocked_range.h"
#include <cstddef>
template<typename T, typename Pred>
class Body {
const T* const in;
T* const out;
Pred pred;
size_t sum;
public:
Body( T* in_, T* out_, Pred pred_) :
in(in_), out(out_), pred(pred_), sum(0)
{}
size_t getSum() const {return sum;}
template<typename Tag>
void operator()( const tbb::blocked_range<size_t>& r, Tag ) {
// Use local variables instead of member fields inside the loop,
// to improve odds that values will be kept in registers.
size_t j = sum;
const T* x = in;
T* y = out;
for( size_t i=r.begin(); i<r.end(); ++i ) {
if( pred(x[i]) ) {
if( Tag::is_final_scan() )
y[j] = x[i];
++j;
}
}
sum = j;
}
// Splitting constructor used for parallel fork.
// Note that it's sum(0), not sum(b.sum), because this
// constructor will be used to compute a partial sum.
// Method reverse_join will put together the two sub-sums.
Body( Body& b, tbb::split ) :
in(b.in), out(b.out), pred(b.pred), sum(0)
{}
// Join partial solutions computed by two Body objects.
// Arguments "this" and "a" correspond to the splitting
// constructor arguments "b" and "this". That's why
// it's called a reverse join.
void reverse_join( Body& a ) {
sum += a.sum;
}
void assign( Body& b ) {sum=b.sum;}
};
// Copy to out each element of in that satisfies pred.
// Return number of elements copied.
template<typename T, typename Pred>
size_t Compress( T* in, T* out, size_t n, Pred pred ) {
Body<T,Pred> b(in,out,pred);
tbb::parallel_scan(tbb::blocked_range<size_t>(0,n), b);
return b.getSum();
}
#include <cmath>
#include <algorithm>
#include <cassert>
int main() {
const size_t n = 10000000;
float* a = new float[n];
float* b = new float[n];
float* c = new float[n];
for( size_t i=0; i<n; ++i )
a[i] = std::cos(float(i));
size_t m1 = Compress(a, b, n, [](float x) {return x<0;});
size_t m2 = std::copy_if(a, a+n, c, [](float x) {return x<0;})-c;
assert(m1==m2);
for( size_t i=0; i<n; ++i )
assert(b[i]==c[i]);
}
```
Why don't you check the condition *(m_maskData+index) == 0 before m_features(0,index) = x;?