CUDA histogram reduce_by_key failing - c++

I have the following CUDA Thrust code that uses reduce_by_key to histogram the values [0, 1024) into 256 buckets. I expect each bucket to have a count = 4, yet I see bucket 0 has 256, bucket 255 has 3, and the remainder have 4.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/pair.h>
#define SIZE 1024
// Maps a value from [minVal, minVal + valRange] to a histogram bin index that
// is always inside [0, numBins - 1].
//   minVal   - smallest value expected in the input
//   valRange - width of the input interval (max - min)
//   numBins  - number of histogram bins
struct binFunc {
    const float minVal;
    const float valRange;
    const int numBins;
    binFunc(float _minVal, float _valRange, int _numBins) :
        minVal(_minVal), valRange(_valRange), numBins(_numBins) {}
    __host__ __device__
    int operator()(float v) const {
        // Degenerate interval (all inputs equal): avoid dividing by zero and
        // put everything into the first bin.
        if (!(valRange > 0.0f)) return 0;
        int b = int((v - minVal) / valRange * float(numBins));
        // v == minVal + valRange scales to exactly numBins, which is one past
        // the last valid bin; fold it (and anything above) into the last bin.
        if (b >= numBins) b = numBins - 1;
        // Values below minVal would otherwise produce a negative index.
        if (b < 0) b = 0;
        return b;
    }
};
int main() {
thrust::device_vector<float> d_vec(SIZE);
for (int i = 0; i < SIZE; ++i)
d_vec[i] = float(i);
thrust::device_vector<float>::iterator min;
thrust::device_vector<float>::iterator max;
thrust::pair<thrust::device_vector<float>::iterator,
thrust::device_vector<float>::iterator> minmax =
thrust::minmax_element(d_vec.begin(), d_vec.end());
min = minmax.first;
max = minmax.second;
float minVal = *min;
float maxVal = *max;
std::cout << "The minimum value is " << minVal
<< " and the maximum value is " << maxVal << "." << std::endl;
float valRange = maxVal - minVal;
std::cout << "The range is " << valRange << "." << std::endl;
int numBins = 256;
thrust::device_vector<int> d_binResults(SIZE);
thrust::transform(d_vec.begin(), d_vec.end(), d_binResults.begin(),
binFunc(minVal, valRange, numBins));
thrust::device_vector<int>::iterator d_binResults_iter =
d_binResults.begin();
for (int i = 0; i < 10; ++i) {
int b = *d_binResults_iter;
printf("d_binResults[%d]=%d\n", i, b);
d_binResults_iter++;
}
std::cout << "The numBins is " << numBins << "." << std::endl;
thrust::device_vector<int> d_binsKeys(numBins);
thrust::device_vector<int> d_binsValues(numBins);
thrust::pair<thrust::device_vector<int>::iterator,
thrust::device_vector<int>::iterator> keys_and_values =
thrust::reduce_by_key(d_binResults.begin(), d_binResults.end(),
thrust::constant_iterator<int>(1), d_binsKeys.begin(),
d_binsValues.begin());
thrust::device_vector<int>::iterator d_binsKeys_begin_iter =
d_binsKeys.begin();
thrust::device_vector<int>::iterator d_binsValues_begin_iter =
d_binsValues.begin();
for (int i = 0; i < numBins; ++i) {
int key = *d_binsKeys_begin_iter;
int val = *d_binsValues_begin_iter;
printf("d_binsValues[%d]=(%d,%d)\n", i, key, val);
d_binsKeys_begin_iter++;
d_binsValues_begin_iter++;
}
return 0;
}
The salient part of the output is:
d_binsValues[0]=(0,256)
d_binsValues[1]=(1,4)
d_binsValues[2]=(2,4)
...
d_binsValues[254]=(254,4)
d_binsValues[255]=(255,3)
So, bucket 0 has 256 elements, and bucket 255 has 3 elements? What's going on here?

If you print out all the d_binResults[] values instead of the first 10, you will discover that the last element (d_binResults[1023]) has a value of 256! But that is an invalid bin index. For numBins = 256, the valid indices are 0..255.
It is occurring due to the calculation arithmetic in your functor:
int b = int((v - minVal) / valRange * float(numBins));
Plugging in the relevant values for the last element, we have:
(1023 - 0)/1023*256 = 256
But 256 is an invalid bin index. It turns out that this breaks the reduce_by_key operation, causing both the last bin to have 3 elements and the first bin to be "corrupted".
If you fix this you will fix both issues you describe (first bin has 256 elements, last bin has 3.)
As a simple proof, add this line of code:
d_binResults[1023] = 255;
immediately after your thrust::transform operation. The results are then correct. How you choose to correct your bin calculation arithmetic is up to you. (possibly "fixable" by adding 1 to valRange but that may imply something about your expected histogram values).

Related

sum of Maclaurin series c++

I am struggling to make the two sides of this equation come out equal, probably because of my weak understanding of the mathematics.
The problem is that the two results do not match.
Here is my code, to make the problem clearer:
#include <iostream>
#include <ccomplex>
using std::cout;
// Compares the truncated series sum_{i=0..n} 1/(2i)! with its closed form
// (e + 1/e) / 2 and prints both values.
int main() {
    int n = 8;
    // The i = 0 term is 1/0! = 1. The loop below starts at i = 1, so that
    // term has to be part of the initial value.
    double sum = 1.0;
    // Running value of (2i)!; with n = 8 the largest value is 16!, which
    // still fits in 64 bits.
    unsigned long long fact = 1;
    for (int i = 1; i <= n; i++)
    {
        // (2i)! = (2i-2)! * (2i-1) * (2i)
        fact *= 2*i*(2*i-1);
        sum += 1.0 / fact;
    }
    std::cout << "first equation " << sum << std::endl;
    double e = M_E;
    double st = 1.0/2.0*(e + (1.0/e));
    std::cout <<"second equation " << st << std::endl;
    return 0;
}
the output
first equation 0.543081
second equation 1.54308
The results are close, but they should at least agree in the digit before the decimal point.
You don't account for n = 0, which yields 0! and thus 1. Therefore, you need to add 1 to sum.

Generate integers with even hamming weight (popcount) c++

I want to efficiently (by using bit hacks) generate all integers up to a given number, k, such that they have an even Hamming weight, without explicitly calculating their Hamming weights. It is not important to me whether that is done in ascending or descending order.
A bonus (related task) would be if I could generate all integers with even hamming weight which are subsets (in the Gray code sense) of k.
Example:
input-> k=14 (binary 1110)
output all-> 3 (0011), 5(0101), 6 (0110), 9 (1001), 10 (1010), 12 (1100)
output subsets-> 6 (0110), 10 (1010), 12 (1100)
Example code using popcount:
for (unsigned int sub=1; sub<k; sub++){
if (__builtin_popcount(sub) % 2 == 0){
cout << sub << endl;
}
}
Example code using popcount for subsets:
// Visit the non-zero proper sub-masks of k from largest to smallest and print
// those whose population count is even.
for (unsigned int sub = (k - 1) & k; sub != 0; sub = (sub - 1) & k) {
    const bool even_weight = (__builtin_popcount(sub) & 1) == 0;
    if (even_weight) {
        cout << sub << endl;
    }
}
We can build a tree with numbers in its nodes, where each node has two children: one with bit number x flipped and the other with bit number x left unchanged. We need to exclude all children whose value is greater than the initial value. We can store the popcount in a variable and increment or decrement it each time we flip a bit, depending on the flipped bit's value, thus avoiding recalculating the popcount each time the number changes.
I don't know if this method is faster or not. I guess it may be faster, but the overhead for recursive function may be too big.
That was fun:
#include <cstdio>
#include <iostream>
#include <vector>
#include <algorithm>
#include <climits>
#include <cinttypes>
#include <cassert>
#include <bitset>
#include <cstring>
namespace gen {

/// Returns true when x is an even number.
bool isEven(unsigned int x) {
    return x % 2 == 0;
}

/**
 * Find last set: zero-based index of the most significant set bit of x
 * (like ffs, but counted from the top).
 * @param x - value to inspect; must be non-zero (asserted). When assertions
 *            are disabled, 0 is returned for x == 0.
 * @return index of the highest set bit, e.g. 3 for 0b1110
 */
unsigned int fls(unsigned int x)
{
    assert(x >= 1);
    if (x == 0) {
        return 0;
    }
    const unsigned int width = sizeof(x) * CHAR_BIT;
#ifdef __GNUC__
    const unsigned int clz = __builtin_clz(x);
#else
    // Portable fallback: every significant bit removes one leading zero.
    unsigned int clz = width;
    for (unsigned int rest = x; rest != 0; rest >>= 1) {
        --clz;
    }
#endif
    // clz is 0 when the top bit is set, so only the upper bound can be checked;
    // clz == width would mean x == 0, which was excluded above.
    assert(clz < width);
    return width - clz - 1;
}

/// Number of set bits in x.
unsigned int popcount(unsigned int x) {
#ifdef __GNUC__
    return __builtin_popcount(x);
#else
    return std::bitset<sizeof(x)*CHAR_BIT>(x).count();
#endif
}

/**
 * Recursively walks the bits of k from position pos - 1 down to 0, at each
 * position trying both "keep the bit" and "toggle the bit", and collects every
 * non-zero result that is not greater than startk and has even Hamming weight.
 * @param out - output vector; results are appended with push_back
 * @param greycodesubset - when true, at most one bit may be toggled relative
 *                         to startk on any path
 * @param startk - starting k value; candidates above it are discarded
 * @param k - the current number value
 * @param pos - one plus the position of the bit in k that is handled in this call
 * @param popcount - Hamming weight of the bits of k above position pos - 1
 * @param changes - number of bits toggled since startk; used only when
 *                  greycodesubset is true
 */
void loop(std::vector<unsigned int>& out, const bool& greycodesubset,
          const unsigned int& startk,
          unsigned int k, unsigned int pos, unsigned int popcount,
          unsigned int changes)
{
    // k > startk may happen for example for 0b10: toggling the last bit
    // gives 0b11. Nothing below this node can be valid, so prune.
    if (startk < k) {
        return;
    }
    // end of the recursion: all bits have been decided
    if (pos == 0) {
        if (isEven(popcount) && k != 0) {
            out.push_back(k);
        }
        return;
    }
    // move to the bit handled by this call
    --pos;
    // unsigned literal: a signed 1 << 31 would be undefined behaviour
    const unsigned int mask = 1u << pos;
    const bool is_pos_bit_set = (k & mask) != 0;
    // branch 1: leave the bit as it is
    loop(out, greycodesubset, startk,
         k, pos, popcount + (is_pos_bit_set ? 1u : 0u), changes);
    // in subset mode at most one bit may be toggled on a path
    if (greycodesubset) {
        if (changes >= 1) {
            return;
        }
        ++changes;
    }
    // branch 2: toggle the bit at position pos
    loop(out, greycodesubset, startk,
         k ^ mask, pos, popcount + (!is_pos_bit_set ? 1u : 0u), changes);
}

/**
 * Entry point: collects the numbers produced by loop() for the given k.
 * @param k - upper bound and starting value; must be > 0 (asserted)
 * @param greycodesubsetonly - forwarded to loop() as greycodesubset
 * @return the collected values; empty when k < 2
 */
std::vector<unsigned int> run(const unsigned int& k, const bool& greycodesubsetonly)
{
    assert(k > 0);
    std::vector<unsigned int> out;
    if (k < 2) return out;
    loop(out, greycodesubsetonly, k, k, fls(k) + 1, 0, 0);
    return out;
}

} // namespace gen
// Demo: runs the generator for k = 14 in both modes and prints every result
// as "value(4-bit binary)".
int main()
{
    const unsigned int k = 14;
    const int bits_in_k = 4;

    // Shared formatter for one result entry.
    const auto print_entry = [](int v) {
        std::cout << v << "(" << std::bitset<bits_in_k>(v).to_string() << ") ";
    };

    std::vector<unsigned int> out = gen::run(k, false);
    std::vector<unsigned int> out_subset = gen::run(k, true);

    std::cout << "input-> k=" << k << "(" << std::bitset<bits_in_k>(k).to_string() << ") " << std::endl;

    std::cout << "output all-> ";
    std::for_each(out.begin(), out.end(), print_entry);
    std::cout << std::endl;

    std::cout << "output subsets-> ";
    std::for_each(out_subset.begin(), out_subset.end(), print_entry);
    std::cout << std::endl;

    return 0;
}
input-> k=14(1110)
output all-> 12(1100) 10(1010) 9(1001) 6(0110) 5(0101) 3(0011)
output subsets-> 12(1100) 10(1010) 6(0110)

Thrust CUDA find maximum per each group(segment)

My data looks like this:
value = [1, 2, 3, 4, 5, 6]
key = [0, 1, 0, 2, 1, 2]
I need to know the maximum (value and index) for each group (key).
So the result should be
max = [3, 5, 6]
index = [2, 4, 5]
key = [0, 1, 2]
How can I get it with cuda thrust?
I can do sort -> reduce_by_key but it's not really efficient. In my case vector size > 10M and key space ~ 1K(starts from 0 without gaps).
Since the original question focused on thrust, I didn't have any suggestions other than what I mentioned in the comments,
However, based on further dialog in the comments, I thought I would post an answer that covers both CUDA and thrust.
The thrust method uses a sort_by_key operation to group like keys together, followed by a reduce_by_key operation to find the max + index for each key-group.
The CUDA method uses a custom atomic approach I describe here to find a 32-bit max plus 32-bit index (for each key-group).
The CUDA method is substantially (~10x) faster, for this specific test case. I used a vector size of 10M and a key size of 10K for this test.
My test platform was CUDA 8RC, RHEL 7, and Tesla K20X GPU. K20X is a member of the Kepler generation which has much faster global atomics than previous GPU generations.
Here's a fully worked example, covering both cases, and providing a timing comparison:
$ cat t1234.cu
#include <iostream>
#include <thrust/copy.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/functional.h>
#include <cstdlib>
#include <time.h>
#include <sys/time.h>
// Microseconds per second.
#define USECPSEC 1000000ULL
// Returns the current gettimeofday() time in microseconds minus `start`.
// Call with 0 to take a timestamp, then pass that timestamp back in to get
// the elapsed microseconds.
unsigned long long dtime_usec(unsigned long long start){
    timeval now;
    gettimeofday(&now, 0);
    const unsigned long long now_usec = (now.tv_sec*USECPSEC)+now.tv_usec;
    return now_usec-start;
}
const size_t ksize = 10000;
const size_t vsize = 10000000;
const int nTPB = 256;
// Binary reduction operator for (value, index) tuples: the result carries the
// elements of whichever operand has the strictly larger first element; when
// the first operand is not larger, the second operand's elements are used.
struct my_max_func
{
    template <typename T1, typename T2>
    __host__ __device__
    T1 operator()(const T1 t1, const T2 t2){
        T1 winner;
        const bool first_is_larger = thrust::get<0>(t1) > thrust::get<0>(t2);
        if (first_is_larger) {
            thrust::get<0>(winner) = thrust::get<0>(t1);
            thrust::get<1>(winner) = thrust::get<1>(t1);
        } else {
            thrust::get<0>(winner) = thrust::get<0>(t2);
            thrust::get<1>(winner) = thrust::get<1>(t2);
        }
        return winner;
    }
};
// Overlays a float/int pair with a single unsigned long long so that a
// maximum value and the index it came from can be read and replaced together
// through the `ulong` member.
typedef union {
float floats[2]; // floats[0] = maxvalue
int ints[2]; // ints[1] = maxindex
unsigned long long int ulong; // for atomic update
} my_atomics;
// Replaces the packed (value, index) word at `address` with (val1, val2) as
// long as the value currently stored there is smaller than val1.
//   address - packed word to update (the `ulong` view of a my_atomics)
//   val1    - candidate maximum value
//   val2    - index belonging to val1
// Returns the last packed word observed at `address`.
__device__ unsigned long long int my_atomicMax(unsigned long long int* address, float val1, int val2)
{
my_atomics loc, loctest;
// Build the candidate word: value in floats[0], index in ints[1].
loc.floats[0] = val1;
loc.ints[1] = val2;
// Initial snapshot of the stored word (plain read).
loctest.ulong = *address;
// Keep trying while the last observed stored value is below val1. atomicCAS
// only writes when `address` still holds loctest.ulong and returns the word
// it found, which becomes the next snapshot.
// NOTE(review): after a successful swap the returned (old) word still compares
// below val1, so one extra pass appears to run before the loop exits - confirm.
while (loctest.floats[0] < val1)
loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong);
return loctest.ulong;
}
// Kernel: one thread per input element. Each in-range thread offers its
// (value, index) pair to the result slot selected by its key.
//   data - input values
//   keys - key of every input value, used as index into res
//   ds   - number of input elements
//   res  - one packed result slot per key
__global__ void my_max_idx(const float *data, const int *keys,const int ds, my_atomics *res)
{
    const int gid = (blockDim.x * blockIdx.x) + threadIdx.x;
    // Threads past the end of the data have nothing to do.
    if (gid >= ds) {
        return;
    }
    my_atomics *slot = &res[keys[gid]];
    my_atomicMax(&(slot->ulong), data[gid], gid);
}
// Benchmark driver: computes the per-key maximum value and its index twice -
// once with thrust (sort_by_key + reduce_by_key) and once with the my_max_idx
// kernel - prints the time of each method and compares the two results.
// Returns 0 on success and -1 on the first mismatch.
int main(){
float *h_vals = new float[vsize];
int *h_keys = new int[vsize];
// Random input: values come straight from rand(), keys lie in [0, ksize).
for (int i = 0; i < vsize; i++) {h_vals[i] = rand(); h_keys[i] = rand()%ksize;}
// thrust method
thrust::device_vector<float> d_vals(h_vals, h_vals+vsize);
thrust::device_vector<int> d_keys(h_keys, h_keys+vsize);
thrust::device_vector<int> d_keys_out(ksize);
thrust::device_vector<float> d_vals_out(ksize);
thrust::device_vector<int> d_idxs(vsize);
thrust::device_vector<int> d_idxs_out(ksize);
// d_idxs = 0, 1, 2, ... so every value carries its original position.
thrust::sequence(d_idxs.begin(), d_idxs.end());
// Finish all setup work before the timer starts.
cudaDeviceSynchronize();
unsigned long long et = dtime_usec(0);
// Group equal keys together, moving (value, index) pairs along with them.
thrust::sort_by_key(d_keys.begin(), d_keys.end(), thrust::make_zip_iterator(thrust::make_tuple(d_vals.begin(), d_idxs.begin())));
// Reduce every run of equal keys to one (max value, index) pair.
thrust::reduce_by_key(d_keys.begin(), d_keys.end(), thrust::make_zip_iterator(thrust::make_tuple(d_vals.begin(),d_idxs.begin())), d_keys_out.begin(), thrust::make_zip_iterator(thrust::make_tuple(d_vals_out.begin(), d_idxs_out.begin())), thrust::equal_to<int>(), my_max_func());
cudaDeviceSynchronize();
et = dtime_usec(et);
std::cout << "Thrust time: " << et/(float)USECPSEC << "s" << std::endl;
// cuda method
float *vals;
int *keys;
my_atomics *results;
cudaMalloc(&keys, vsize*sizeof(int));
cudaMalloc(&vals, vsize*sizeof(float));
cudaMalloc(&results, ksize*sizeof(my_atomics));
cudaMemset(results, 0, ksize*sizeof(my_atomics)); // works because vals are all positive
// The kernel gets the original, unsorted host data.
cudaMemcpy(keys, h_keys, vsize*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(vals, h_vals, vsize*sizeof(float), cudaMemcpyHostToDevice);
// Only the kernel launch and its completion are timed.
et = dtime_usec(0);
my_max_idx<<<(vsize+nTPB-1)/nTPB, nTPB>>>(vals, keys, vsize, results);
cudaDeviceSynchronize();
et = dtime_usec(et);
std::cout << "CUDA time: " << et/(float)USECPSEC << "s" << std::endl;
// verification
my_atomics *h_results = new my_atomics[ksize];
cudaMemcpy(h_results, results, ksize*sizeof(my_atomics), cudaMemcpyDeviceToHost);
// Slot i of both result sets is compared directly, which presumes every key
// in [0, ksize) occurs in the input so that thrust output i belongs to key i.
// NOTE(review): if a key's maximum value occurs at more than one index, the
// two methods may report different indices - confirm tie handling.
for (int i = 0; i < ksize; i++){
if (h_results[i].floats[0] != d_vals_out[i]) {std::cout << "value mismatch at index: " << i << " thrust: " << d_vals_out[i] << " CUDA: " << h_results[i].floats[0] << std::endl; return -1;}
if (h_results[i].ints[1] != d_idxs_out[i]) {std::cout << "index mismatch at index: " << i << " thrust: " << d_idxs_out[i] << " CUDA: " << h_results[i].ints[1] << std::endl; return -1;}
}
std::cout << "Success!" << std::endl;
return 0;
}
$ nvcc -arch=sm_35 -o t1234 t1234.cu
$ ./t1234
Thrust time: 0.026593s
CUDA time: 0.002451s
Success!
$

thrust vector distance calculation

Consider the following dataset and centroids. There are 7 individuals and two means each with 8 dimensions. They are stored row major order.
// Number of components in every vector below.
short dim = 8;
// Two centroids stored row-major: one row of dim values per centroid.
float centroids[] = {
0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612,
0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441
};
// Seven individuals stored row-major: one row of dim values per individual.
float data[] = {
0.314, 0.504, 0.030, 0.215, 0.647, 0.045, 0.443, 0.325,
0.731, 0.354, 0.696, 0.604, 0.954, 0.673, 0.625, 0.744,
0.615, 0.936, 0.045, 0.779, 0.169, 0.589, 0.303, 0.869,
0.275, 0.406, 0.003, 0.763, 0.471, 0.748, 0.230, 0.769,
0.903, 0.489, 0.135, 0.599, 0.094, 0.088, 0.272, 0.719,
0.112, 0.448, 0.809, 0.157, 0.227, 0.978, 0.747, 0.530,
0.908, 0.121, 0.321, 0.911, 0.884, 0.792, 0.658, 0.114
};
I want to calculate each euclidean distances. c1 - d1, c1 - d2 ....
On CPU I would do:
float dist = 0.0, dist_sqrt;
for(int i = 0; i < 2; i++)
for(int j = 0; j < 7; j++)
{
float dist_sum = 0.0;
for(int k = 0; k < dim; k++)
{
dist = centroids[i * dim + k] - data[j * dim + k];
dist_sum += dist * dist;
}
dist_sqrt = sqrt(dist_sum);
// do something with the distance
std::cout << dist_sqrt << std::endl;
}
Is there any built in solution of vector distance calculation in THRUST?
It can be done in thrust. Explaining how will be rather involved, and the code is rather dense.
The key observation to start with is that the core operation can be done via a transformed reduction. The thrust transform operation is used to perform the elementwise subtraction of the vectors (individual-centroid) and squaring of each result, and the reduction sums the results together to produce the square of the euclidean distance. The starting point for this operation is thrust::reduce_by_key, but it gets rather involved to present the data correctly to reduce_by_key.
The final results are produced by taking the square root of each result from above, and we can use an ordinary thrust::transform for this.
The above is a summary description of the only 2 lines of thrust code that do all the work. However, the first line has considerable complexity to it. In order to exploit parallelism, the approach I took was to virtually "lay out" the necessary vectors in sequence, to be presented to reduce_by_key. To take a simple example, suppose we have 2 centroids and 4 individuals, and suppose our dimension is 2.
centroid 0: C00 C01
centroid 1: C10 C11
individ 0: I00 I01
individ 1: I10 I11
individ 2: I20 I21
individ 3: I30 I31
We can "lay out" the vectors like this:
C00 C01 C00 C01 C00 C01 C00 C01 C10 C11 C10 C11 C10 C11 C10 C11
I00 I01 I10 I11 I20 I21 I30 I31 I00 I01 I10 I11 I20 I21 I30 I31
To facilitate the reduce_by_key, we will also need to generate key values to delineate the vectors:
0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
The above data "laid-out" data sets can be quite large, and we don't want to incur storage and retrieval cost, so we will generate these "on-the-fly" using thrust's collection of fancy iterators. This is where things get quite dense. With the above strategy in mind, we will use thrust::reduce_by_key to do the work. We'll create a custom functor provided to a transform_iterator to do the subtraction (and squaring) of the I and C vectors, which will be zipped together for this purpose. The "lay out" of the vectors will be created on the fly using permutation iterators with additional custom index-creation functors, to help with the replicated patterns in each of I and C.
Therefore, working from the "inside out", the sequence of steps is as follows:
1. for both I (data) and C (centr) use a counting_iterator combined with a custom indexing functor inside of a transform_iterator to produce the indexing sequences we will need.
2. using the indexing sequences created in step 1 and the base I and C vectors, virtually "lay out" the vectors via a permutation_iterator (one for each laid-out vector).
3. zip the 2 "laid out" virtual I and C vectors together, to create a <float, float> tuple vector (virtual).
4. take the zip_iterator from step 3, and combine with a custom distance-calculation functor ((I-C)^2) in a transform_iterator
5. use another transform_iterator, combining a counting_iterator with a custom key-generating functor, to produce the key sequence (virtual)
6. pass the iterators in steps 4 and 5 to reduce_by_key as the inputs (keys, values) to be reduced. The output vectors for reduce_by_key are also keys and values. We don't need the keys, so we'll use a discard_iterator to dump those. The values we will save.
The above steps are all accomplished in a single line of thrust code.
Here's a code illustrating the above:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/copy.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <stdlib.h>
#define MAX_DATA 100000000
#define MAX_CENT 5000
#define TOL 0.001
// Returns the current gettimeofday() time in microseconds minus `prev`.
// Pass 0 to obtain a timestamp; pass an earlier timestamp to obtain the
// elapsed microseconds since then.
unsigned long long dtime_usec(unsigned long long prev){
// Microseconds per second; also used by callers to convert results to seconds.
#define USECPSEC 1000000ULL
    timeval now;
    gettimeofday(&now, 0);
    const unsigned long long now_usec = (now.tv_sec * USECPSEC) + now.tv_usec;
    return now_usec - prev;
}
// Compares two float arrays element by element.
//   d1, d2 - arrays to compare
//   len    - number of elements to check
// Returns 1 when every pair differs by at most TOL; otherwise reports the
// first offending position on stdout and returns 0.
unsigned verify(float *d1, float *d2, int len){
    for (int pos = 0; pos < len; ++pos) {
        const bool differs = fabsf(d1[pos] - d2[pos]) > TOL;
        if (differs) {
            std::cout << "mismatch at: " << pos << " val1: " << d1[pos] << " val2: " << d2[pos] << std::endl;
            return 0;
        }
    }
    return 1;
}
// CPU reference: Euclidean distance between every centroid and every data row.
//   centroids     - num_centroids rows of dim floats, row-major
//   data          - num_data rows of dim floats, row-major
//   rdist         - output, num_centroids * num_data distances, ordered
//                   centroid-major (all data rows for centroid 0 first)
//   print         - non-zero: also write each distance to stdout
void eucl_dist_cpu(const float *centroids, const float *data, float *rdist, int num_centroids, int dim, int num_data, int print){
    float *out = rdist;
    for (int c = 0; c < num_centroids; ++c) {
        const float *centroid_row = centroids + c * dim;
        for (int d = 0; d < num_data; ++d) {
            const float *data_row = data + d * dim;
            // Sum of squared component differences.
            float squared = 0.0;
            for (int k = 0; k < dim; ++k) {
                const float diff = centroid_row[k] - data_row[k];
                squared += diff * diff;
            }
            const float distance = sqrt(squared);
            *out++ = distance;
            if (print) std::cout << distance << ", ";
        }
    }
    if (print) std::cout << std::endl;
}
// Key generator: maps a flat element position to the index of the dim-sized
// segment it falls in, so that every run of dim positions shares one key.
struct dkeygen : public thrust::unary_function<int, int>
{
    int dim;
    int numd;
    dkeygen(const int dim_in, const int numd_in) : dim(dim_in), numd(numd_in) {}
    __host__ __device__ int operator()(const int val) const {
        const int segment = val / dim;
        return segment;
    }
};
// Pair of floats whose difference is squared by my_dist.
typedef thrust::tuple<float, float> mytuple;
// Returns the squared difference of the two tuple elements.
struct my_dist : public thrust::unary_function<mytuple, float>
{
    __host__ __device__ float operator()(const mytuple &my_tuple) const {
        const float diff = thrust::get<0>(my_tuple) - thrust::get<1>(my_tuple);
        const float squared = diff*diff;
        return squared;
    }
};
// Index generator for the data array: wraps a flat position around after
// dim * numd elements, so the whole data set repeats.
struct d_idx : public thrust::unary_function<int, int>
{
    int dim;
    int numd;
    d_idx(int dim_in, int numd_in) : dim(dim_in), numd(numd_in) {}
    __host__ __device__ int operator()(const int val) const {
        const int period = dim*numd;
        return val % period;
    }
};
// Index generator for the centroid array: positions cycle through the dim
// components of one centroid, and the centroid advances by one after every
// dim * numd positions.
struct c_idx : public thrust::unary_function<int, int>
{
    int dim;
    int numd;
    c_idx(int dim_in, int numd_in) : dim(dim_in), numd(numd_in) {}
    __host__ __device__ int operator()(const int val) const {
        const int component = val % dim;
        const int centroid = val/(dim*numd);
        return component + (dim * centroid);
    }
};
// Element-wise single-precision square root.
struct my_sqrt : public thrust::unary_function<float, float>
{
    __host__ __device__ float operator()(const float val) const {
        const float root = sqrtf(val);
        return root;
    }
};
// Thrust version of eucl_dist_cpu: computes the distance between every
// centroid and every data row on the device.
//   centroids - host vector, num_centroids rows of dim floats
//   data      - host vector, num_data rows of dim floats
//   dist      - host vector receiving num_centroids*num_data distances
//   print     - non-zero: also write the distances to stdout
// Returns the time in microseconds spent in reduce_by_key + transform (the
// host/device copies are outside the timed region).
unsigned long long eucl_dist_thrust(thrust::host_vector<float> &centroids, thrust::host_vector<float> &data, thrust::host_vector<float> &dist, int num_centroids, int dim, int num_data, int print){
// Copy the inputs to the device and allocate one output slot per distance.
thrust::device_vector<float> d_data = data;
thrust::device_vector<float> d_centr = centroids;
thrust::device_vector<float> values_out(num_centroids*num_data);
unsigned long long compute_time = dtime_usec(0);
// Single segmented reduction over dim*num_data*num_centroids virtual elements:
//  - keys:   counting iterator through dkeygen (position / dim), one key per
//            (centroid, data row) pair
//  - values: centroid element (picked via c_idx) zipped with data element
//            (picked via d_idx), turned into a squared difference by my_dist
//  - output: keys are discarded, summed squares land in values_out
thrust::reduce_by_key(thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), dkeygen(dim, num_data)), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(dim*num_data*num_centroids), dkeygen(dim, num_data)),thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_centr.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), c_idx(dim, num_data))), thrust::make_permutation_iterator(d_data.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), d_idx(dim, num_data))))), my_dist()), thrust::make_discard_iterator(), values_out.begin());
// Squared distance -> distance, in place.
thrust::transform(values_out.begin(), values_out.end(), values_out.begin(), my_sqrt());
// Wait for the device before stopping the timer.
cudaDeviceSynchronize();
compute_time = dtime_usec(compute_time);
if (print){
thrust::copy(values_out.begin(), values_out.end(), std::ostream_iterator<float>(std::cout, ", "));
std::cout << std::endl;
}
// Hand the results back to the caller's host vector.
thrust::copy(values_out.begin(), values_out.end(), dist.begin());
return compute_time;
}
// Driver with two passes:
//  1. a small fixed data set, computed on CPU and GPU with the results printed
//     for a visual check;
//  2. a larger random data set (default 10 centroids x 1000000 rows, or the
//     sizes given as argv[1] = num_centroids, argv[2] = num_data), timed on
//     CPU and GPU and verified against each other.
// Returns 0 on success, 1 on invalid sizes or a verification failure.
int main(int argc, char *argv[]){
    int dim = 8;
    int num_centroids = 2;
    float centroids[] = {
        0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612,
        0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441
    };
    int num_data = 8;
    float data[] = {
        0.314, 0.504, 0.030, 0.215, 0.647, 0.045, 0.443, 0.325,
        0.731, 0.354, 0.696, 0.604, 0.954, 0.673, 0.625, 0.744,
        0.615, 0.936, 0.045, 0.779, 0.169, 0.589, 0.303, 0.869,
        0.275, 0.406, 0.003, 0.763, 0.471, 0.748, 0.230, 0.769,
        0.903, 0.489, 0.135, 0.599, 0.094, 0.088, 0.272, 0.719,
        0.112, 0.448, 0.809, 0.157, 0.227, 0.978, 0.747, 0.530,
        0.908, 0.121, 0.321, 0.911, 0.884, 0.792, 0.658, 0.114,
        0.721, 0.555, 0.979, 0.412, 0.007, 0.501, 0.844, 0.234
    };

    // Pass 1: small data set, printed.
    std::cout << "cpu results: " << std::endl;
    // Heap buffer instead of a variable-length array, which is not standard C++.
    float *dist = new float[num_data*num_centroids];
    eucl_dist_cpu(centroids, data, dist, num_centroids, dim, num_data, 1);
    delete[] dist;
    thrust::host_vector<float> h_data(data, data + (sizeof(data)/sizeof(float)));
    thrust::host_vector<float> h_centr(centroids, centroids + (sizeof(centroids)/sizeof(float)));
    thrust::host_vector<float> h_dist(num_centroids*num_data);
    std::cout << "gpu results: " << std::endl;
    eucl_dist_thrust(h_centr, h_data, h_dist, num_centroids, dim, num_data, 1);

    // Pass 2: larger data set, sizes optionally taken from the command line.
    float *data2, *centroids2, *dist2;
    num_centroids = 10;
    num_data = 1000000;
    if (argc > 2) {
        num_centroids = atoi(argv[1]);
        num_data = atoi(argv[2]);
        if ((num_centroids < 1) || (num_centroids > MAX_CENT)) {std::cout << "Num centroids out of range" << std::endl; return 1;}
        if ((num_data < 1) || (num_data > MAX_DATA)) {std::cout << "Num data out of range" << std::endl; return 1;}
        // Evaluate the product in 64 bits: in plain int the multiplication
        // itself could overflow before the comparison is made.
        if (static_cast<long long>(num_data) * dim * num_centroids > 2000000000LL) {std::cout << "data set out of range" << std::endl; return 1;}
    }
    std::cout << "Num Data: " << num_data << std::endl;
    std::cout << "Num Cent: " << num_centroids << std::endl;
    std::cout << "result size: " << ((num_data*num_centroids*4)/1048576) << " Mbytes" << std::endl;
    data2 = new float[dim*num_data];
    centroids2 = new float[dim*num_centroids];
    dist2 = new float[num_data*num_centroids];
    // Random inputs in [0, 1].
    for (int i = 0; i < dim*num_data; i++) data2[i] = rand()/(float)RAND_MAX;
    for (int i = 0; i < dim*num_centroids; i++) centroids2[i] = rand()/(float)RAND_MAX;

    // CPU timing.
    unsigned long long dtime = dtime_usec(0);
    eucl_dist_cpu(centroids2, data2, dist2, num_centroids, dim, num_data, 0);
    dtime = dtime_usec(dtime);
    std::cout << "cpu time: " << dtime/(float)USECPSEC << "s" << std::endl;

    // GPU timing: dtime covers the whole call including copies, ctime only
    // the compute section measured inside eucl_dist_thrust.
    thrust::host_vector<float> h_data2(data2, data2 + (dim*num_data));
    thrust::host_vector<float> h_centr2(centroids2, centroids2 + (dim*num_centroids));
    thrust::host_vector<float> h_dist2(num_data*num_centroids);
    dtime = dtime_usec(0);
    unsigned long long ctime = eucl_dist_thrust(h_centr2, h_data2, h_dist2, num_centroids, dim, num_data, 0);
    dtime = dtime_usec(dtime);
    std::cout << "gpu total time: " << dtime/(float)USECPSEC << "s, gpu compute time: " << ctime/(float)USECPSEC << "s" << std::endl;

    // Compare CPU and GPU results, then release the host buffers on every
    // exit path.
    const bool results_match = verify(dist2, &(h_dist2[0]), num_data*num_centroids);
    delete[] data2;
    delete[] centroids2;
    delete[] dist2;
    if (!results_match) {std::cout << "Verification failure." << std::endl; return 1;}
    std::cout << "Success!" << std::endl;
    return 0;
}
Notes:
The code is set up to do 2 passes, a short one using a data set similar to yours, with printout for visual check. Then a larger data set can be entered, via command-line sizing parameters (number of centroids, then number of individuals), for benchmark comparison and validation of results.
Contrary to what I stated in the comments, the thrust code is only running about 25% faster than the naive single-threaded CPU code. Your mileage may vary.
This is just one way to think about handling it. I have had other ideas, but not enough time to flesh them out.
The data sets can become rather large. The code right now is intended to be limited to data sets where the product of dimension*number_of_centroids*number_of_individuals is less than 2 billion. However, as you approach even this number, you will need a GPU and CPU that both have a few GB of memory. I briefly explored larger data set sizes. A few code changes would be needed in various places to extend from e.g. int to unsigned long long, etc. However I haven't provided that as I am still investigating an issue with that code.
For another, non-thrust-related look at computing euclidean distances on the GPU, you may be interested in this question. If you follow the sequence of optimizations that were made there, it may shed some light on either how this thrust code might be improved, or else how another non-thrust realization could be used.
Sorry I wasn't able to squeeze more performance out.

C++ - Comparing multiple int arrays and return array with smallest span

I am working on a problem where i have multiple arrays that are to be compared against a single array. The array with the shortest span between indexes will be returned.
Here is an example of a set of arrays i would be working with:
(if it is of any importance these values represent RGB values)
int[] workingset = {55, 34,102};
int[] test1 = {54,36,99};
int[] test2 = {21,65,231};
int[] test3 = {76,35,1};
int[] test4 = {133,22,3};
Because test1[] values are closest to workingset[], test1[] would be the array that would be returned.
*I apologize for not putting up sample code, but i simply could not think of a way to piece this puzzle together.
you could easily sum up all components (r,g,b) and check which has the smallest difference.
#include <iostream>
// Finds which of the four test colours is closest to workingset, measured as
// the sum of absolute per-component differences, and prints every sum plus
// the zero-based index of the smallest one (the first one wins on a tie).
int main(int argc, char* args[])
{
    int workingset [] = {55, 34,102};
    int test1 [] = {54,36,99};
    int test2 []= {21,65,231};
    int test3 [] = {76,35,1};
    int test4 [] = {133,22,3};

    // Table of candidates so that one loop handles all of them.
    const int* candidates [4] = {test1, test2, test3, test4};
    int sums [4] = {};
    for(int c=0; c<4; ++c){
        for(int i=0; i<3; ++i){
            // Absolute difference computed by hand; the file only includes
            // <iostream>, which does not guarantee a declaration of abs.
            const int diff = candidates[c][i]-workingset[i];
            sums[c] += (diff < 0) ? -diff : diff;
        }
        std::cout << "diff test" << (c+1) << " " << sums[c] << std::endl;
    }

    // Start from the first candidate instead of INT_MAX, which would need
    // <climits>; the strict '<' keeps the earliest minimum.
    int smallestIndex = 0;
    for(int c=1; c<4; ++c){
        if(sums[c] < sums[smallestIndex]){
            smallestIndex = c;
        }
    }
    std::cout << "array with smallest index: " << smallestIndex << std::endl;
    return 0;
}
I edited the microsoft specific includes and datatypes.
What's your metric for "shortest span between indexes"? I'm guessing that you're trying to minimize the sum of the absolute values of the differences between the two arrays?
Once you've defined your metric, try something like this pseudocode:
min_metric = MAX_INT
min_array = null
for array in arrays:
if array.length() == workingset.length():
metric = calculateMetric(workingset, array)
if metric < min_metric:
min_metric = metric
min_array = array
Let me guess too. Assuming you are trying to write a color matcher. Consider these arrays vectors. Then the absolute length of the vector difference between workingset and testX will be the metric to use.
Or in the code:
// Per-component difference between the working colour and one candidate (testX).
int [] delta = { 0, 0, 0 };
for (int i = 0; i < delta.length; ++i) delta[i] = workingset[i] - testX[i];
// Sum of the squared component differences.
double metric = 0;
for (int x: delta) metric += x * x;
metric = sqrt(metric); // and use this to assess how far away the testX color is from the workingset color (sqrt operation is optional)