Small sized binary searches on CUDA GPUs - c++

I have a large device array inputValues of int64_t type. Every 32 elements of this array are sorted in an ascending order. I have an unsorted search array removeValues.
My intention is to look for all the elements in removeValues inside inputValues and mark them as -1. What is the most efficient method to achieve this? I am using a 3.5 cuda device if that helps.
I am not looking for a higher level solution, i.e. I do not want to use thrust or cub, but I want to write this using cuda kernels.
My initial approach was to load every 32 values in shared memory in a thread block. Every thread also loads a single value from removeValues and does an independent binary search on the shared memory array. If found, the value is set according by using an if condition.
Wouldn't this approach involve a lot of bank conflicts and branch divergence? Do you think that branch divergence can be addressed by using ternary operators while implementing the binary search? Even if that is solved, how can bank conflict be eliminated? Since the size of sorted arrays is 32, would it be possible to implement a binary search using shuffle instructions? Would that help?
EDIT : I have added an example to show what I intend to achieve.
Let's say that inputValues is a vector where every 32 elements are sorted:
[2, 4, 6, ... , 64], [95, 97, ... , 157], [1, 3, ... , 63], [...]
The typical size for this array can range between 32*2 to 32*32. The values could range from 0 to INT64_MAX.
An example of removeValues would be:
[7, 75, 95, 106]
The typical size for this array could range from 1 to 1024.
After the operation removeValues would be:
[-1, 75, -1, 106]
The values in inputValues remain unchanged.

I would concur with the answer (now deleted) and comment by #harrism. Since I put some effort into the non-thrust approach, I'll present my findings.
I tried to naively implement a binary search at the warp-level using __shfl(), and then repeat that binary search across the data set, passing the data set through each 32-element group.
It's embarrassing, but my code is around 20x slower than thrust (in fact it may be worse than that if you do careful timing with nvprof).
I made the data sizes a little larger than what was proposed in the question, because the data sizes in the question are so small that the timing is in the dust.
Here's a fully worked example of 2 approaches:
What is approximately outlined in the question, i.e. create a binary search using warp shuffle that can search up to 32 elements against a 32-element ordered array. Repeat this process for as many 32-element ordered arrays as there are, passing the entire data set through each ordered array (hopefully you can start to see some of the inefficiency now.)
Use thrust, essentially the same as what is outlined by #harrism, i.e. sort the grouped data set, and then run a vectorized thrust::binary_search on that.
Here's the example:
$ cat t1030.cu
#include <stdio.h>
#include <assert.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
typedef long mytype;
const int gsize = 32;
const int nGRP = 512;
const int dsize = nGRP*gsize;//gsize*nGRP;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
template <typename T>
__device__ T my_shfl32(T val, unsigned lane){
return __shfl(val, lane);
}
template <typename T>
__device__ T my_shfl64(T val, unsigned lane){
T retval = val;
int2 t1 = *(reinterpret_cast<int2 *>(&retval));
t1.x = __shfl(t1.x, lane);
t1.y = __shfl(t1.y, lane);
retval = *(reinterpret_cast<T *>(&t1));
return retval;
}
template <typename T>
__device__ bool bsearch_shfl(T grp_val, T my_val){
int src_lane = gsize>>1;
bool return_val = false;
T test_val;
int shift = gsize>>2;
for (int i = 0; i <= gsize>>3; i++){
if (sizeof(T)==4){
test_val = my_shfl32(grp_val, src_lane);}
else if (sizeof(T)==8){
test_val = my_shfl64(grp_val, src_lane);}
else assert(0);
if (test_val == my_val) return_val = true;
src_lane += (((test_val<my_val)*2)-1)*shift;
shift>>=1;
assert ((src_lane < gsize)&&(src_lane > 0));}
if (sizeof(T)==4){
test_val = my_shfl32(grp_val, 0);}
else if (sizeof(T)==8){
test_val = my_shfl64(grp_val, 0);}
else assert(0);
if (test_val == my_val) return_val = true;
return return_val;
}
template <typename T>
__global__ void bsearch_grp(const T * __restrict__ search_grps, T *data){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int tid = threadIdx.x;
if (idx < gsize*nGRP){
T grp_val = search_grps[idx];
while (tid < dsize){
T my_val = data[tid];
if (bsearch_shfl(grp_val, my_val)) data[tid] = -1;
tid += blockDim.x;}
}
}
int main(){
// data setup
assert(gsize == 32); //mandatory (warp size)
assert((dsize % 32)==0); //needed to preserve shfl capability
thrust::host_vector<mytype> grps(gsize*nGRP);
thrust::host_vector<mytype> data(dsize);
thrust::host_vector<mytype> result(dsize);
for (int i = 0; i < gsize*nGRP; i++) grps[i] = i;
for (int i = 0; i < dsize; i++) data[i] = i;
// method 1: individual shfl-based binary searches on each group
mytype *d_grps, *d_data;
cudaMalloc(&d_grps, gsize*nGRP*sizeof(mytype));
cudaMalloc(&d_data, dsize*sizeof(mytype));
cudaMemcpy(d_grps, &(grps[0]), gsize*nGRP*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_data, &(data[0]), dsize*sizeof(mytype), cudaMemcpyHostToDevice);
unsigned long long my_time = dtime_usec(0);
bsearch_grp<<<nGRP, gsize>>>(d_grps, d_data);
cudaDeviceSynchronize();
my_time = dtime_usec(my_time);
cudaMemcpy(&(result[0]), d_data, dsize*sizeof(mytype), cudaMemcpyDeviceToHost);
for (int i = 0; i < dsize; i++) if (result[i] != -1) {printf("method 1 mismatch at %d, was %d, should be -1\n", i, (int)(result[i])); return 1;}
printf("method 1 time: %fs\n", my_time/(float)USECPSEC);
// method 2: thrust sort, followed by thrust binary search
thrust::device_vector<mytype> t_grps = grps;
thrust::device_vector<mytype> t_data = data;
thrust::device_vector<bool> t_rslt(t_data.size());
my_time = dtime_usec(0);
thrust::sort(t_grps.begin(), t_grps.end());
thrust::binary_search(t_grps.begin(), t_grps.end(), t_data.begin(), t_data.end(), t_rslt.begin());
cudaDeviceSynchronize();
my_time = dtime_usec(my_time);
thrust::host_vector<bool> rslt = t_rslt;
for (int i = 0; i < dsize; i++) if (rslt[i] != true) {printf("method 2 mismatch at %d, was %d, should be 1\n", i, (int)(rslt[i])); return 1;}
printf("method 2 time: %fs\n", my_time/(float)USECPSEC);
// method 3: multiple thrust merges, followed by thrust binary search
return 0;
}
$ nvcc -O3 -arch=sm_35 t1030.cu -o t1030
$ ./t1030
method 1 time: 0.009075s
method 2 time: 0.000516s
$
I was running this on linux, CUDA 7.5, GT640 GPU. Obviously the performance will be different on different GPUs, but I'd be surprised if any GPU significantly closed the gap.
In short, you'd be well advised to use a well-tuned library like thrust or cub. If you don't like the monolithic nature of thrust, you could try cub. I don't know if cub has a binary search, but a single binary search against the whole sorted data set is not a difficult thing to write, and it's the smaller part of the time involved (for method 2 -- identifiable using nvprof or additional timing code).
Since your 32-element grouped ranges are already sorted, I also pondered the idea of using multiple thrust::merge operations rather than a single sort. I'm not sure which would be faster, but since the thrust method is already so much faster than the 32-element shuffle search method, I think thrust (or cub) is the obvious choice.

Related

Why my inversions of matrices are such slow with LAPACKE in C++ : MAGMA Alternative and set up

I am using LAPACK to inverse a matrix: I did a reference passing, i.e by working on the address. Here below the function with an input matrix and an output matrix referenced by their address.
The issue is that I am obliged to convert the F_matrix into 1D array and I think this is a waste of performances on the runtime level : which way could I find to get rid of this supplementary task which is time consuming I think if I call a lot of times the
function matrix_inverse_lapack.
Below the function concerned :
// Passing Matrixes by Reference
void matrix_inverse_lapack(vector<vector<double>> const &F_matrix, vector<vector<double>> &F_output) {
// Index for loop and arrays
int i, j, ip, idx;
// Size of F_matrix
int N = F_matrix.size();
int *IPIV = new int[N];
// Statement of main array to inverse
double *arr = new double[N*N];
// Output Diagonal block
double *diag = new double[N];
for (i = 0; i<N; i++){
for (j = 0; j<N; j++){
idx = i*N + j;
arr[idx] = F_matrix[i][j];
}
}
// LAPACKE routines
int info1 = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, arr, N, IPIV);
int info2 = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N, arr, N, IPIV);
for (i = 0; i<N; i++){
for (j = 0; j<N; j++){
idx = i*N + j;
F_output[i][j] = arr[idx];
}
}
delete[] IPIV;
delete[] arr;
}
For example, I call it this way :
vector<vector<double>> CO_CL(lsize*(2*Dim_x+Dim_y), vector<double>(lsize*(2*Dim_x+Dim_y), 0));
... some code
matrix_inverse_lapack(CO_CL, CO_CL);
The performances on inversion are not which are expected, I think this is due to this conversion 2D -> 1D that I described in the function matrix_inverse_lapack.
Update
I was advised to install MAGMA on my MacOS Big Sur 11.3 but I have a lot of difficulties to set up it.
I have a AMD Radeon Pro 5600M graphic card. I have already installed by default Big Sur version all the Framework OpenCL (maybe I am wrong by saying that). Anyone could tell the procedure to follow for the installation of MAGMA. I saw that on a MAGMA software exists on http://magma.maths.usyd.edu.au/magma/ but it is really expensive and doesn't correspond to what I want : I just need all the SDK (headers and libraries) , if possible built with my GPU card. I have already installed all the Intel OpenAPI SDK on my MacOS. Maybe, I could link it to a MAGMA installation.
I saw another link https://icl.utk.edu/magma/software/index.html where MAGMA seems to be public : there is none link with the non-free version above, isn't there ?
First of all let me complain that OP did not provide all necessary data. The program is almost complete, but it is not a minimal, reproducible example. This is important because (a) it wastes time and (b) it hides potentially relevant information, eg. about the matrix initialization. Second, OP did not provide any details on the compilation, which, again may be relevant.
Last, but not least, OP didn't check the status code for possible errors from Lapack functions, and this could also be important for correct interpretation of the results.
Let's start from a minimal reproducible example:
#include <lapacke.h>
#include <vector>
#include <chrono>
#include <iostream>
using Matrix = std::vector<std::vector<double>>;
std::ostream &operator<<(std::ostream &out, Matrix const &v)
{
const auto size = std::min<int>(10, v.size());
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
{
out << v[i][j] << "\t";
}
if (size < std::ssize(v)) out << "...";
out << "\n";
}
return out;
}
void matrix_inverse_lapack(Matrix const &F_matrix, Matrix &F_output, std::vector<int> &IPIV_buffer,
std::vector<double> &matrix_buffer)
{
// std::cout << F_matrix << "\n";
auto t0 = std::chrono::steady_clock::now();
const int N = F_matrix.size();
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
auto idx = i * N + j;
matrix_buffer[idx] = F_matrix[i][j];
}
}
auto t1 = std::chrono::steady_clock::now();
// LAPACKE routines
int info1 = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, matrix_buffer.data(), N, IPIV_buffer.data());
int info2 = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N, matrix_buffer.data(), N, IPIV_buffer.data());
auto t2 = std::chrono::steady_clock::now();
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
auto idx = i * N + j;
F_output[i][j] = matrix_buffer[idx];
}
}
auto t3 = std::chrono::steady_clock::now();
auto whole_fun_time = std::chrono::duration<double>(t3 - t0).count();
auto lapack_time = std::chrono::duration<double>(t2 - t1).count();
// std::cout << F_output << "\n";
std::cout << "status: " << info1 << "\t" << info2 << "\t" << (info1 == 0 && info2 == 0 ? "Success" : "Failure")
<< "\n";
std::cout << "whole function: " << whole_fun_time << "\n";
std::cout << "LAPACKE matrix operations: " << lapack_time << "\n";
std::cout << "conversion: " << (whole_fun_time - lapack_time) / whole_fun_time * 100.0 << "%\n";
}
int main(int argc, const char *argv[])
{
const int M = 5; // numer of test repetitions
const int N = (argc > 1) ? std::stoi(argv[1]) : 10;
std::cout << "Matrix size = " << N << "\n";
std::vector<int> IPIV_buffer(N);
std::vector<double> matrix_buffer(N * N);
// Test matrix_inverse_lapack M times
for (int i = 0; i < M; i++)
{
Matrix CO_CL(N);
for (auto &v : CO_CL) v.resize(N);
int idx = 1;
for (auto &v : CO_CL)
{
for (auto &x : v)
{
x = idx + 1.0 / idx;
idx++;
}
}
matrix_inverse_lapack(CO_CL, CO_CL, IPIV_buffer, matrix_buffer);
}
}
Here, operator<< is an overkill, but may be useful for anyone wanting to verify half-manually that the code works (by uncommenting lines 26 and 58), and ensuring that the code is correct is more important that measuring its performance.
The code can be compiled with
g++ -std=c++20 -O3 main.cpp -llapacke
The program relies on an external library, lapacke, which needs to be installed, headers + binaries, for the code to compile and run.
My code differs a bit from OP's: it is closer to "modern C++" in that it refrains from using naked pointers; I also added external buffers to matrix_inverse_lapack to suppress continual launching of memory allocator and deallocator, a small improvement that reduces the 2D-1D-2D conversion overhead in a measurable way. I also had to initialize the matrix and find a way to read in OP's mind what the value of N could be. I also added some timer readings for benchmarking. Apart from this, the logic of the code is unchanged.
Now a benchmark carried out on a decent workstation. It lists the percentage of time the conversion takes relative to the total time taken by matrix_inverse_lapack. In other words, I measure the conversion overhead:
N = 10, 3.5%
N = 30, 1.5%
N = 100, 1%
N = 300, 0.5%
N = 1000, 0.35%
N = 3000, 0.1%
The time taken by Lapack nicely scales as N3, as expected (data not shown). The time to invert a matrix is about 16 seconds for N = 3000, and about 5-6 s (5 microseconds) for N = 10.
I assume the overhead of even 3% is completely acceptable. I believe OP uses matrices of size larger then 100, in which case the overhead at or below 1% is certainly acceptable.
So what OP (or anyone having a similar problem) could have done wrong to obtain "unacceptable overhead conversion values"? Here's my short list
Improper compilation
Improper matrix initialization (for tests)
Improper benchmarking
1. Improper compilation
If one forgets to compile in Release mode, one ends up with optimized Lapacke competing with unoptimized conversion. On my machine this peaks at an 33% overhead for N = 20.
2. Improper matrix initialization (for tests)
If one initializes the matrix like this:
for (auto &v : CO_CL)
{
for (auto &x : v)
{
x = idx; // rather than, eg., idx + 1.0/idx
idx++;
}
}
then the matrix is singular, lapack returns quite quickly with the status different from 0. This increases the relative importance of the conversion part. But singular matrices are not what one wants to invert (it's impossible to do).
3. Improper benchmarking
Here's an example of the program output for N = 10:
./a.out 10
Matrix size = 10
status: 0 0 Success
whole function: 0.000127658
LAPACKE matrix operations: 0.000126783
conversion: 0.685425%
status: 0 0 Success
whole function: 1.2497e-05
LAPACKE matrix operations: 1.2095e-05
conversion: 3.21677%
status: 0 0 Success
whole function: 1.0535e-05
LAPACKE matrix operations: 1.0197e-05
conversion: 3.20835%
status: 0 0 Success
whole function: 9.741e-06
LAPACKE matrix operations: 9.422e-06
conversion: 3.27482%
status: 0 0 Success
whole function: 9.939e-06
LAPACKE matrix operations: 9.618e-06
conversion: 3.2297%
One can see that the first call to lapack functions can take 10 times more time than the subsequent calls. This is quite a stable pattern, as if Lapack needed some time for self-initialization. It can affect the measurements for small N badly.
4. What else can be done?
OP apperas to believe that his approach to 2D arrays is good and Lapack is strange and old-fashionable in its packing a 2D array into a 1D array. No. It is Lapack who is right.
If one defines a 2D array as vector<vector<double>>, one obtains one advantage: code simplicity. This comes at a price. Each row of such a matrix is allocated separateley from the others. Thus, a matrix 100 by 100 may be stored in 100 completely different memory blocks. This has a bad impact on the cache (and prefetcher) utilization. Lapck (and other linear algebra packages) enforces compactification of the data in a single, continuous array. This is so to minimize cache and prefetcher misses. If OP had used such an approach from the very beginning, he would probably have gained more than 1-3% that they pay now for the conversion.
This compactification can be achieved in at least three ways.
Write a custom class for a 2D matrix, with the internal data stored in a 1D array and convenient access member funnctions (e.g.: operator ()), or find a library that does just that
Write a custom allocator for std::vector (or find a library). This allocator should allocate the memory from a preallocated 1D vector exactly matching the data storage pattern used by Lapack
Use std::vector<double*> and initailze the pointers with the addresses pointing at the appropriate elements of a preallocated 1D array.
Each of the above solutions forces some changes to the surrounding code, which OP might not want to do. All depends on the code complexity and expected performance gains.
EDIT: Alternative libraries
An alternative approach is to use a library that is known for being a highly optimzed one. Lapack by itself can be regardered as a standard interface with many implementations and it may happen that OP uses an unoptimized one. Which library to choose may depend on the hardware/software platform OP is interested in and may vary in time.
As for now (mid-2021) a decent suggestions are:
Lapack https://www.netlib.org/lapack/
Atlas https://en.wikipedia.org/wiki/Automatically_Tuned_Linear_Algebra_Software http://math-atlas.sourceforge.net/
OpenBlas https://www.openblas.net/
Magma https://developer.nvidia.com/magma
Plasma https://bitbucket.org/icl/plasma/src/main/
If OP uses martices of sizes at least 100, then GPU-oriented MAGMA might be worth trying.
An easier (installation, running) way might with a parallel CPU library, e.g. Plasma. Plsama is Lapack-compliant, it has been being developed by a large team of people, including Jack Dongarra, it also should be rather easy to compile it locally as it is provided with a CMake script.
An example how much a parallel CPU-based, multicore implementation can outperform a single-threaded implementation of the LU-decomposition can be found for example here: https://cse.buffalo.edu/faculty/miller/Courses/CSE633/Tummala-Spring-2014-CSE633.pdf (short answer: 5 to 15 times for matrices of size 1000).

Performance gap between vector<bool> and array

I was trying to solve a coding problem in C++ which counts the number of prime numbers less than a non-negative number n.
So I first came up with some code:
int countPrimes(int n) {
vector<bool> flag(n+1,1);
for(int i =2;i<n;i++)
{
if(flag[i]==1)
for(long j=i;i*j<n;j++)
flag[i*j]=0;
}
int result=0;
for(int i =2;i<n;i++)
result+=flag[i];
return result;
}
which takes 88 ms and uses 8.6 MB of memory. Then I changed my code into:
int countPrimes(int n) {
// vector<bool> flag(n+1,1);
bool flag[n+1] ;
fill(flag,flag+n+1,true);
for(int i =2;i<n;i++)
{
if(flag[i]==1)
for(long j=i;i*j<n;j++)
flag[i*j]=0;
}
int result=0;
for(int i =2;i<n;i++)
result+=flag[i];
return result;
}
which takes 28 ms and 9.9 MB. I don't really understand why there is such a performance gap in both the running time and memory consumption. I have read relative questions like this one and that one but I am still confused.
EDIT: I reduced the running time to 40 ms with 11.5 MB of memory after replacing vector<bool> with vector<char>.
std::vector<bool> isn't like any other vector. The documentation says:
std::vector<bool> is a possibly space-efficient specialization of
std::vector for the type bool.
That's why it may use up less memory than an array, because it might represent multiple boolean values with one byte, like a bitset. It also explains the performance difference, since accessing it isn't as simple anymore. According to the documentation, it doesn't even have to store it as a contiguous array.
std::vector<bool> is special case. It is specialized template. Each value is stored in single bit, so bit operations are needed. This memory compact but has couple drawbacks (like no way to have a pointer to bool inside this container).
Now bool flag[n+1]; compiler will usually allocate same memory in same manner as for char flag[n+1]; and it will do that on stack, not on heap.
Now depending on page sizes, cache misses and i values one can be faster then other. It is hard to predict (for small n array will be faster, but for larger n result may change).
As an interesting experiment you can change std::vector<bool> to std::vector<char>. In this case you will have similar memory mapping as in case of array, but it will be located at heap not a stack.
I'd like to add some remarks to the good answers already posted.
The performance differences between std::vector<bool> and std::vector<char> may vary (a lot) between different library implementations and different sizes of the vectors.
See e.g. those quick benches: clang++ / libc++(LLVM) vs. g++ / libstdc++(GNU).
This: bool flag[n+1]; declares a Variable Length Array, which (despites some performance advantages due to it beeing allocated in the stack) has never been part of the C++ standard, even if provided as an extension by some (C99 compliant) compilers.
Another way to increase the performances could be to reduce the amount of calculations (and memory occupation) by considering only the odd numbers, given that all the primes except for 2 are odd.
If you can bare the less readable code, you could try to profile the following snippet.
int countPrimes(int n)
{
if ( n < 2 )
return 0;
// Sieve starting from 3 up to n, the number of odd number between 3 and n are
int sieve_size = n / 2 - 1;
std::vector<char> sieve(sieve_size);
int result = 1; // 2 is a prime.
for (int i = 0; i < sieve_size; ++i)
{
if ( sieve[i] == 0 )
{
// It's a prime, no need to scan the vector again
++result;
// Some ugly transformations are needed, here
int prime = i * 2 + 3;
for ( int j = prime * 3, k = prime * 2; j <= n; j += k)
sieve[j / 2 - 1] = 1;
}
}
return result;
}
Edit
As Peter Cordes noted in the comments, using an unsigned type for the variable j
the compiler can implement j/2 as cheaply as possible. C signed division by a power of 2 has different rounding semantics (for negative dividends) than a right shift, and compilers don't always propagate value-range proofs sufficiently to prove that j will always be non-negative.
It's also possible to reduce the number of candidates exploiting the fact that all primes (past 2 and 3) are one below or above a multiple of 6.
I am getting different timings and memory usage than the ones mentioned in the question when compiling with g++-7.4.0 -g -march=native -O2 -Wall and running on a Ryzen 5 1600 CPU:
vector<bool>: 0.038 seconds, 3344 KiB memory, IPC 3.16
vector<char>: 0.048 seconds, 12004 KiB memory, IPC 1.52
bool[N]: 0.050 seconds, 12644 KiB memory, IPC 1.69
Conclusion: vector<bool> is the fastest option because of its higher IPC (instructions per clock).
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>
#include <vector>
size_t countPrimes(size_t n) {
std::vector<bool> flag(n+1,1);
//std::vector<char> flag(n+1,1);
//bool flag[n+1]; std::fill(flag,flag+n+1,true);
for(size_t i=2;i<n;i++) {
if(flag[i]==1) {
for(size_t j=i;i*j<n;j++) {
flag[i*j]=0;
}
}
}
size_t result=0;
for(size_t i=2;i<n;i++) {
result+=flag[i];
}
return result;
}
int main() {
{
const rlim_t kStackSize = 16*1024*1024;
struct rlimit rl;
int result = getrlimit(RLIMIT_STACK, &rl);
if(result != 0) abort();
if(rl.rlim_cur < kStackSize) {
rl.rlim_cur = kStackSize;
result = setrlimit(RLIMIT_STACK, &rl);
if(result != 0) abort();
}
}
printf("%zu\n", countPrimes(10e6));
return 0;
}

How could I parallelize the following loop USING C++ AMP?

I have the following loop in c++
dword result = 0;
for ( int i = 0; i < 16; i++ ) {
result |= ( value[i] << (unsigned int)( i << 1 ) );
}
And I would like to parallelize it in amp. I know it might go slower then the actual non-parallelized version above, but I want to do it to learn something more about AMP.
My idea was to loop trough the value array in parallel:
And fill a new array with newarray[0] = value[0] << (unsigned int)(0 << 1 ), newarray[1] = value[1] << (unsigned int)(1 << 1 ), etc. Then I would OR the values in the array in parallel in a tree structure (see image).
I have tried to put this idea in some simple c++ amp code, but I don't succeed in it, so any help would be appreciated.
Thank you for your consideration of this matter, I look forward to a response.
The following code is part of what I think you need. This code will take a number of elements as input and preps the vector on the CPU, then it does the bit shift operations in parallel on the GPU. Then I set av[elements] back to 0 because I am using that element to store your final result. It's rough, but AMP is pretty restrictive about what data types can be processed on the GPU, so I just use an extra element of the existing array for it. After the bit shifting is done, I do another parallel for each for the bitwise OR function. This one also happens on the GPU, but it is less satisfactory because every operation is ORing any given element of the array with exactly the av[elements] element, so that will create a bottleneck. Your tree structure will make this part run much more quickly, but I was unable to figure out how to do that part easily. As it is, this program can process 100 million elements in a couple seconds on a fairly old computer. Apologies in advance for any best-practice violations in the code; I am a novice as well. The code follows:
#include <conio.h>
#include <amp.h>
#include <iostream>
using namespace concurrency;
using namespace std;
unsigned int doParallel(unsigned int);
unsigned int elements;
void main()
{
int ch=NULL;
cout<<"\nHow many elements to populate: ";
cin>>elements;
cout<<"The result is: "<<doParallel(elements);
cout<<"\nPress 'X' to exit.";
do
{
ch=_getch();
} while (ch!='X' && ch!='x');
exit(0);
}
unsigned int doParallel(unsigned int elements)
{
vector<unsigned int> v(elements+1);
for (unsigned int i = 0; i<elements+1;i++)
{
v[i]=i;
}
array_view<unsigned int,1> av(elements+1,v);
parallel_for_each(av.extent,[=](index<1> idx)
restrict(amp)
{
av[idx] = static_cast<unsigned int>(av[idx])<<1;
});
av[elements]=0;
parallel_for_each(av.extent,[=](index<1> idx)
restrict(amp)
{
av[elements] |= static_cast<unsigned int>(av[idx]);
});
return av[elements];
}

Efficient index bound check and double to int cast

Consider the following code snippet
double *x, *id;
int i, n; // = vector size
// allocate and zero x
// set id to 0:n-1
for(i=0; i<n; i++) {
long iid = (long)id[i];
if(iid>=0 && iid<n && (double)iid==id[i]){
x[iid] = 1;
} else break;
}
The code uses values in vector id of type double as indices into vector x. In order for the indices to be valid I verify that they are greater than or equal to 0, less than vector size n, and that doubles stored in id are in fact integers. In this example id stores integers from 1 to n, so all vectors are accessed linearly and branch prediction of the if statement should always work.
For n=1e8 the code takes 0.21s on my computer. Since it seems to me it is a computationally light-weight loop, I expect it to be memory bandwidth bounded. Based on the benchmarked memory bandwidth I expect it to run in 0.15s. I calculate the memory footprint as 8 bytes per id value, and 16 bytes per x value (it needs to be both written, and read from memory since I assume SSE streaming is not used). So a total of 24 bytes per vector entry.
The questions:
Am I wrong saying that this code should be memory bandwidth bounded, and that it can be improved?
If not, do you know a way in which I could improve the performance so that it works with the speed of the memory?
Or maybe everything is fine and I can not easily improve it otherwise than running it in parallel?
Changing the type of id is not an option - it must be double. Also, in the general case id and x have different sizes and must be kept as separate arrays - they come from different parts of the program. In short, I wonder if it is possible to write the bound checks and the type cast/integer validation in a more efficient manner.
For convenience, the entire code:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
static struct timeval tb, te;
void tic()
{
gettimeofday(&tb, NULL);
}
void toc(const char *idtxt)
{
long s,u;
gettimeofday(&te, NULL);
s=te.tv_sec-tb.tv_sec;
u=te.tv_usec-tb.tv_usec;
printf("%-30s%10li.%.6li\n", idtxt,
(s*1000000+u)/1000000, (s*1000000+u)%1000000);
}
int main(int argc, char *argv[])
{
double *x = NULL;
double *id = NULL;
int i, n;
// vector size is a command line parameter
n = atoi(argv[1]);
printf("x size %i\n", n);
// not included in timing in MATLAB
x = calloc(sizeof(double),n);
memset(x, 0, sizeof(double)*n);
// create index vector
tic();
id = malloc(sizeof(double)*n);
for(i=0; i<n; i++) id[i] = i;
toc("id = 1:n");
// use id to index x and set all entries to 4
tic();
for(i=0; i<n; i++) {
long iid = (long)id[i];
if(iid>=0 && iid<n && (double)iid==id[i]){
x[iid] = 1;
} else break;
}
toc("x(id) = 1");
}
EDIT: Disregard if you can't split the arrays!
I think it can be improved by taking advantage of a common cache concept. You can either make data accesses close in time or location. With tight for-loops, you can achieve a better data hit-rate by shaping your data structures like your for-loop. In this case, you access two different arrays, usually the same indices in each array. Your machine is loading chunks of both arrays each iteration through that loop. To increase the use of each load, create a structure to hold an element of each array, and create a single array with that struct:
struct my_arrays
{
double x;
int id;
};
struct my_arrays* arr = malloc(sizeof(my_arrays)*n);
Now, each time you load data into cache, you'll hit everything you load because the arrays are close together.
EDIT: Since your intent is to check for an integer value, and you make the explicit assumption that the values are small enough to be represented precisely in a double with no loss of precision, then I think your comparison is fine.
My previous answer had a reference to beware comparing large doubles after implicit casting, and I referenced this:
What is the most effective way for float and double comparison?
It might be worth considering examination of double type representation.
For example, the following code shows how to compare a double number greater than 1 to 999:
bool check(double x)
{
union
{
double d;
uint32_t y[2];
};
d = x;
bool answer;
uint32_t exp = (y[1] >> 20) & 0x3ff;
uint32_t fraction1 = y[1] << (13 + exp); // upper bits of fractiona part
uint32_t fraction2 = y[0]; // lower 32 bits of fractional part
if (fraction2 != 0 || fraction1 != 0)
answer = false;
else if (exp > 8)
answer = false;
else if (exp == 8)
answer = (y[1] < 0x408f3800); // this is the representation of 999
else
answer = true;
return answer;
}
This looks like much code, but it might be vectorized easily (using e.g. SSE), and if your bound is a power of 2, it might simplify the code further.

Thrust: summing the elements of an array indexed by another array [Matlab's syntax sum(x(indices))]

I'm trying to sum the elements of an array indexed by another array using the Thrust library, but I couldn't find an example. In other words, I want to implement Matlab's syntax
sum(x(indices))
Here is a guideline code trying to point out what do I like to achieve:
#define N 65536
// device array copied using cudaMemcpyToSymbol
__device__ int global_array[N];
// function to implement with thrust
__device__ int support(unsigned short* _memory, unsigned short* _memShort)
{
int support = 0;
for(int i=0; i < _memSizeShort; i++)
support += global_array[_memory[i]];
return support;
}
Also, from the host code, can I use the global_array[N] without copying it back with cudaMemcpyFromSymbol ?
Every comment/answer is appreciated :)
Thanks
This is a very late answer provided here to remove this question from the unanswered list. I'm sure that the OP has already found a solution (since May 2012 :-)), but I believe that the following could be useful to other users.
As pointed out by #talonmies, the problem can be solved by a fused gather-reduction. The solution is indeed an application of Thurst's permutation_iterator and reduce. The permutation_iterator allows to (implicitly) reorder the target array x according to the indices in the indices array. reduce performs the sum of the (implicitly) reordered array.
This application is part of Thrust's documentation, below reported for convenience
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
// this example fuses a gather operation with a reduction for
// greater efficiency than separate gather() and reduce() calls
int main(void)
{
// gather locations
thrust::device_vector<int> map(4);
map[0] = 3;
map[1] = 1;
map[2] = 0;
map[3] = 5;
// array to gather from
thrust::device_vector<int> source(6);
source[0] = 10;
source[1] = 20;
source[2] = 30;
source[3] = 40;
source[4] = 50;
source[5] = 60;
// fuse gather with reduction:
// sum = source[map[0]] + source[map[1]] + ...
int sum = thrust::reduce(thrust::make_permutation_iterator(source.begin(), map.begin()),
thrust::make_permutation_iterator(source.begin(), map.end()));
// print sum
std::cout << "sum is " << sum << std::endl;
return 0;
}
In the above example, map plays the role of indices, while source plays the role of x.
Concerning the additional question in your comment (iterating over a reduced number of terms), it will be sufficient to change the following line
int sum = thrust::reduce(thrust::make_permutation_iterator(source.begin(), map.begin()),
thrust::make_permutation_iterator(source.begin(), map.end()));
to
int sum = thrust::reduce(thrust::make_permutation_iterator(source.begin(), map.begin()),
thrust::make_permutation_iterator(source.begin(), map.begin()+N));
if you want to iterate only over the first N terms of the indexing array map.
Finally, concerning the possibility of using global_array from the host, you should notice that this is a vector residing on the device, so you do need a cudaMemcpyFromSymbol to move it to the host first.