I'm trying to figure out why the performance of sum_gpu_FAST and sum_gpu_SLOW differs so much in the example below.
Are there any tips for improving CUDA performance in a case like this, for example around declaring global/local variables, parameter passing, or memory copies?
Here you can see the whole example code.
#include <iostream>
#include <chrono>
#define N 10000000
__global__
void sum_gpu_FAST(int (&data)[N][2], int& sum, int n) { // runtime : 436.64ms
int s = 0;
for (int i = 0; i < n; i++)
s += data[i][0] * 10 + data[i][1];
sum = s;
}
__global__
void sum_gpu_SLOW(int (&data)[N][2], int& sum, int n) { // runtime : 2.42342s
sum = 0;
for (int i = 0; i < n; i++) {
sum += data[i][0] * 10 + data[i][1];
}
}
void sum_cpu(int (*data)[2], int& sum, int n) {
for (int i = 0; i < n; i++) {
sum += data[i][0] * 10 + data[i][1];
}
}
int main()
{
int (*v)[2] = new int[N][2];
for (int i = 0; i < N; i++)
v[i][0] = 1, v[i][1] = 3;
printf ("-CPU------------------------------------------------\n");
{
int sum = 0;
auto start = std::chrono::system_clock::now();
sum_cpu(v, sum, N);
auto end = std::chrono::system_clock::now();
// print output
std::cout << sum << " / " << (end-start).count() / 1000000 << "ms" << std::endl;
}
printf ("-GPU-Ready------------------------------------------\n");
int *dev_sum = nullptr;
int (*dev_v)[N][2] = nullptr;
cudaMalloc((void **)&dev_v, sizeof(int[N][2]));
cudaMalloc((void **)&dev_sum, sizeof(int));
cudaMemcpy(dev_v, v, sizeof(int[N][2]), cudaMemcpyHostToDevice);
printf("-GPU-FAST-------------------------------------------\n");
{
int sum = 0;
auto start = std::chrono::system_clock::now();
sum_gpu_FAST<<<1, 1>>> (*dev_v, *dev_sum, N);
cudaDeviceSynchronize(); // wait until end of kernel
auto end = std::chrono::system_clock::now();
// print output
cudaMemcpy( &sum, dev_sum, sizeof(int), cudaMemcpyDeviceToHost );
std::cout << sum << " / " << (end-start).count() / 1000000 << "ms" << std::endl;
}
printf("-GPU-SLOW-------------------------------------------\n");
{
int sum = 0;
auto start = std::chrono::system_clock::now();
sum_gpu_SLOW<<<1, 1>>> (*dev_v, *dev_sum, N);
cudaDeviceSynchronize(); // wait until end of kernel
auto end = std::chrono::system_clock::now();
// print output
cudaMemcpy( &sum, dev_sum, sizeof(int), cudaMemcpyDeviceToHost );
std::cout << sum << " / " << (end-start).count() / 1000000 << "ms" << std::endl;
}
printf("----------------------------------------------------\n");
return 0;
}
In the fast case, you are creating a local variable which is contained (presumably) in a register:
int s = 0;
During the loop iterations, reads are occurring from global memory, but the only write operation is to a register:
for (int i = 0; i < n; i++)
s += data[i][0] * 10 + data[i][1];
In the slow case, the running sum is contained in a variable resident in global memory:
sum = 0;
therefore, at each loop iteration, the updated value is written to global memory:
for (int i = 0; i < n; i++) {
sum += data[i][0] * 10 + data[i][1];
Therefore the loop has additional overhead to write to global memory at each iteration, which is slower than maintaining the sum in a register.
I'm not going to completely dissect the SASS code to compare these two cases, because the compiler is making other decisions in the fast case around loop unrolling and possibly other factors, but my guess is that the lack of a need to store results to global memory during the loop iterations considerably assists with loop unrolling as well. However, we can make a simple deduction based on the tail end of the SASS code for each case:
Function : _Z12sum_gpu_FASTRA10000000_A2_iRii
.headerflags #"EF_CUDA_SM70 EF_CUDA_PTX_SM(EF_CUDA_SM70)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fd00000000f00 */
...
/*0b00*/ STG.E.SYS [R2], R20 ; /* 0x0000001402007386 */
/* 0x000fe2000010e900 */
/*0b10*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
In the fast case above, we see that there is a single global store (STG) instruction at the end of the kernel, right before the return statement (EXIT), and outside of any loops in the kernel. Although I haven't shown it all, indeed there are no other STG instructions in the fast kernel, except the one at the end. We see a different story looking at the tail end of the slow kernel:
code for sm_70
Function : _Z12sum_gpu_SLOWRA10000000_A2_iRii
.headerflags #"EF_CUDA_SM70 EF_CUDA_PTX_SM(EF_CUDA_SM70)"
/*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ; /* 0x00000a00ff017624 */
/* 0x000fd000078e00ff */
...
/*0460*/ STG.E.SYS [R2], R7 ; /* 0x0000000702007386 */
/* 0x0005e2000010e900 */
/*0470*/ #!P0 BRA 0x2f0 ; /* 0xfffffe7000008947 */
/* 0x000fea000383ffff */
/*0480*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
The slow kernel ends a loop with the STG instruction inside the loop. The slow kernel also has many instances of the STG instruction throughout the kernel, presumably because of compiler unrolling.
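For reference, SASS listings like the ones above can be dumped from a compiled binary with the cuobjdump tool (the binary name t123 here is just a placeholder):
$ nvcc -arch=sm_70 -o t123 t123.cu
$ cuobjdump -sass t123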
My data looks like this:
value = [1, 2, 3, 4, 5, 6]
key = [0, 1, 0, 2, 1, 2]
I need to find the maximum (value and index) for each group (key).
So the result should be
max = [3, 5, 6]
index = [2, 4, 5]
key = [0, 1, 2]
How can I get it with cuda thrust?
I can do sort -> reduce_by_key, but it's not really efficient. In my case the vector size is > 10M and the key space is ~1K (keys start from 0, without gaps).
Since the original question focused on thrust, I didn't have any suggestions other than what I mentioned in the comments.
However, based on further dialogue in the comments, I thought I would post an answer that covers both CUDA and thrust.
The thrust method uses a sort_by_key operation to group like keys together, followed by a reduce_by_key operation to find the max + index for each key-group.
The CUDA method uses a custom atomic approach I describe here to find a 32-bit max plus 32-bit index (for each key-group).
The CUDA method is substantially (~10x) faster, for this specific test case. I used a vector size of 10M and a key size of 10K for this test.
My test platform was CUDA 8RC, RHEL 7, and Tesla K20X GPU. K20X is a member of the Kepler generation which has much faster global atomics than previous GPU generations.
Here's a fully worked example, covering both cases, and providing a timing comparison:
$ cat t1234.cu
#include <iostream>
#include <thrust/copy.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/functional.h>
#include <cstdlib>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
const size_t ksize = 10000;
const size_t vsize = 10000000;
const int nTPB = 256;
struct my_max_func
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 t1, const T2 t2){
T1 res;
if (thrust::get<0>(t1) > thrust::get<0>(t2)){
thrust::get<0>(res) = thrust::get<0>(t1);
thrust::get<1>(res) = thrust::get<1>(t1);}
else {
thrust::get<0>(res) = thrust::get<0>(t2);
thrust::get<1>(res) = thrust::get<1>(t2);}
return res;
}
};
typedef union {
float floats[2]; // floats[0] = maxvalue
int ints[2]; // ints[1] = maxindex
unsigned long long int ulong; // for atomic update
} my_atomics;
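// The union overlays a float max-value (floats[0]) and an int max-index (ints[1])
// on a single 64-bit word (ulong), so both can be updated by one atomicCAS.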
__device__ unsigned long long int my_atomicMax(unsigned long long int* address, float val1, int val2)
{
my_atomics loc, loctest;
loc.floats[0] = val1;
loc.ints[1] = val2;
loctest.ulong = *address;
while (loctest.floats[0] < val1)
loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong);
return loctest.ulong;
}
__global__ void my_max_idx(const float *data, const int *keys,const int ds, my_atomics *res)
{
int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
if (idx < ds)
my_atomicMax(&(res[keys[idx]].ulong), data[idx],idx);
}
int main(){
float *h_vals = new float[vsize];
int *h_keys = new int[vsize];
for (int i = 0; i < vsize; i++) {h_vals[i] = rand(); h_keys[i] = rand()%ksize;}
// thrust method
thrust::device_vector<float> d_vals(h_vals, h_vals+vsize);
thrust::device_vector<int> d_keys(h_keys, h_keys+vsize);
thrust::device_vector<int> d_keys_out(ksize);
thrust::device_vector<float> d_vals_out(ksize);
thrust::device_vector<int> d_idxs(vsize);
thrust::device_vector<int> d_idxs_out(ksize);
thrust::sequence(d_idxs.begin(), d_idxs.end());
cudaDeviceSynchronize();
unsigned long long et = dtime_usec(0);
thrust::sort_by_key(d_keys.begin(), d_keys.end(), thrust::make_zip_iterator(thrust::make_tuple(d_vals.begin(), d_idxs.begin())));
thrust::reduce_by_key(d_keys.begin(), d_keys.end(), thrust::make_zip_iterator(thrust::make_tuple(d_vals.begin(),d_idxs.begin())), d_keys_out.begin(), thrust::make_zip_iterator(thrust::make_tuple(d_vals_out.begin(), d_idxs_out.begin())), thrust::equal_to<int>(), my_max_func());
cudaDeviceSynchronize();
et = dtime_usec(et);
std::cout << "Thrust time: " << et/(float)USECPSEC << "s" << std::endl;
// cuda method
float *vals;
int *keys;
my_atomics *results;
cudaMalloc(&keys, vsize*sizeof(int));
cudaMalloc(&vals, vsize*sizeof(float));
cudaMalloc(&results, ksize*sizeof(my_atomics));
cudaMemset(results, 0, ksize*sizeof(my_atomics)); // works because vals are all positive
cudaMemcpy(keys, h_keys, vsize*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(vals, h_vals, vsize*sizeof(float), cudaMemcpyHostToDevice);
et = dtime_usec(0);
my_max_idx<<<(vsize+nTPB-1)/nTPB, nTPB>>>(vals, keys, vsize, results);
cudaDeviceSynchronize();
et = dtime_usec(et);
std::cout << "CUDA time: " << et/(float)USECPSEC << "s" << std::endl;
// verification
my_atomics *h_results = new my_atomics[ksize];
cudaMemcpy(h_results, results, ksize*sizeof(my_atomics), cudaMemcpyDeviceToHost);
for (int i = 0; i < ksize; i++){
if (h_results[i].floats[0] != d_vals_out[i]) {std::cout << "value mismatch at index: " << i << " thrust: " << d_vals_out[i] << " CUDA: " << h_results[i].floats[0] << std::endl; return -1;}
if (h_results[i].ints[1] != d_idxs_out[i]) {std::cout << "index mismatch at index: " << i << " thrust: " << d_idxs_out[i] << " CUDA: " << h_results[i].ints[1] << std::endl; return -1;}
}
std::cout << "Success!" << std::endl;
return 0;
}
$ nvcc -arch=sm_35 -o t1234 t1234.cu
$ ./t1234
Thrust time: 0.026593s
CUDA time: 0.002451s
Success!
$
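A caveat on the initialization, as the comment in the code notes: cudaMemset(0) on results only works because all the values here are non-negative. If values could be negative, each slot would need seeding with a packed smallest-possible entry instead; a sketch (my addition, requires <cfloat> for FLT_MAX):
my_atomics *h_init = new my_atomics[ksize];
for (int i = 0; i < ksize; i++){ h_init[i].floats[0] = -FLT_MAX; h_init[i].ints[1] = -1; }
cudaMemcpy(results, h_init, ksize*sizeof(my_atomics), cudaMemcpyHostToDevice);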
Consider the following dataset and centroids. There are 7 individuals and two means, each with 8 dimensions, stored in row-major order.
short dim = 8;
float centroids[] = {
0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612,
0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441
};
float data[] = {
0.314, 0.504, 0.030, 0.215, 0.647, 0.045, 0.443, 0.325,
0.731, 0.354, 0.696, 0.604, 0.954, 0.673, 0.625, 0.744,
0.615, 0.936, 0.045, 0.779, 0.169, 0.589, 0.303, 0.869,
0.275, 0.406, 0.003, 0.763, 0.471, 0.748, 0.230, 0.769,
0.903, 0.489, 0.135, 0.599, 0.094, 0.088, 0.272, 0.719,
0.112, 0.448, 0.809, 0.157, 0.227, 0.978, 0.747, 0.530,
0.908, 0.121, 0.321, 0.911, 0.884, 0.792, 0.658, 0.114
};
I want to calculate all the Euclidean distances: c1 - d1, c1 - d2, and so on.
On CPU I would do:
float dist = 0.0, dist_sqrt;
for(int i = 0; i < 2; i++)
for(int j = 0; j < 7; j++)
{
float dist_sum = 0.0;
for(int k = 0; k < dim; k++)
{
dist = centroids[i * dim + k] - data[j * dim + k];
dist_sum += dist * dist;
}
dist_sqrt = sqrt(dist_sum);
// do something with the distance
std::cout << dist_sqrt << std::endl;
}
Is there any built-in solution for vector distance calculation in Thrust?
It can be done in thrust. Explaining how will be rather involved, and the code is rather dense.
The key observation to start with is that the core operation can be done via a transformed reduction. The thrust transform operation is used to perform the elementwise subtraction of the vectors (individual-centroid) and squaring of each result, and the reduction sums the results together to produce the square of the euclidean distance. The starting point for this operation is thrust::reduce_by_key, but it gets rather involved to present the data correctly to reduce_by_key.
The final results are produced by taking the square root of each result from above, and we can use an ordinary thrust::transform for this.
The above is a summary description of the only 2 lines of thrust code that do all the work. However, the first line has considerable complexity to it. In order to exploit parallelism, the approach I took was to virtually "lay out" the necessary vectors in sequence, to be presented to reduce_by_key. To take a simple example, suppose we have 2 centroids and 4 individuals, and suppose our dimension is 2.
centroid 0: C00 C01
centroid 1: C10 C11
individ 0: I00 I01
individ 1: I10 I11
individ 2: I20 I21
individ 3: I30 I31
We can "lay out" the vectors like this:
C00 C01 C00 C01 C00 C01 C00 C01 C10 C11 C10 C11 C10 C11 C10 C11
I00 I01 I10 I11 I20 I21 I30 I31 I00 I01 I10 I11 I20 I21 I30 I31
To facilitate the reduce_by_key, we will also need to generate key values to delineate the vectors:
0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7
The above "laid-out" data sets can be quite large, and we don't want to incur the storage and retrieval cost, so we will generate them "on-the-fly" using thrust's collection of fancy iterators. This is where things get quite dense. With the above strategy in mind, we will use thrust::reduce_by_key to do the work. We'll create a custom functor provided to a transform_iterator to do the subtraction (and squaring) of the I and C vectors, which will be zipped together for this purpose. The "lay out" of the vectors will be created on the fly using permutation iterators with additional custom index-creation functors, to help with the replicated patterns in each of I and C.
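Concretely, for the toy example above (dim = 2, num_data = 4), the two index-creation functors in the code below (c_idx and d_idx) map the counting sequence 0..15 to
c_idx: 0 1 0 1 0 1 0 1 2 3 2 3 2 3 2 3
d_idx: 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
which, used as permutation indices into C and I, reproduce the two "laid out" rows shown earlier.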
Therefore, working from the "inside out", the sequence of steps is as follows:
for both I (data) and C (centr) use a counting_iterator combined with a custom indexing functor inside of a transform_iterator to produce the indexing sequences we will need.
using the indexing sequences created in step 1 and the base I and C vectors, virtually "lay out" the vectors via a permutation_iterator (one for each laid-out vector).
zip the 2 "laid out" virtual I and C vectors together, to create a <float, float> tuple vector (virtual).
take the zip_iterator from step 3, and combine with a custom distance-calculation functor ((I-C)^2) in a transform_iterator
use another transform_iterator, combining a counting_iterator with a custom key-generating functor, to produce the key sequence (virtual)
pass the iterators in steps 4 and 5 to reduce_by_key as the inputs (keys, values) to be reduced. The output vectors for reduce_by_key are also keys and values. We don't need the keys, so we'll use a discard_iterator to dump those. The values we will save.
The above steps are all accomplished in a single line of thrust code.
Here's a code illustrating the above:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/copy.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <stdlib.h>
#define MAX_DATA 100000000
#define MAX_CENT 5000
#define TOL 0.001
unsigned long long dtime_usec(unsigned long long prev){
#define USECPSEC 1000000ULL
timeval tv1;
gettimeofday(&tv1,0);
return ((tv1.tv_sec * USECPSEC)+tv1.tv_usec) - prev;
}
unsigned verify(float *d1, float *d2, int len){
unsigned pass = 1;
for (int i = 0; i < len; i++)
if (fabsf(d1[i] - d2[i]) > TOL){
std::cout << "mismatch at: " << i << " val1: " << d1[i] << " val2: " << d2[i] << std::endl;
pass = 0;
break;}
return pass;
}
void eucl_dist_cpu(const float *centroids, const float *data, float *rdist, int num_centroids, int dim, int num_data, int print){
int out_idx = 0;
float dist, dist_sqrt;
for(int i = 0; i < num_centroids; i++)
for(int j = 0; j < num_data; j++)
{
float dist_sum = 0.0;
for(int k = 0; k < dim; k++)
{
dist = centroids[i * dim + k] - data[j * dim + k];
dist_sum += dist * dist;
}
dist_sqrt = sqrt(dist_sum);
// do something with the distance
rdist[out_idx++] = dist_sqrt;
if (print) std::cout << dist_sqrt << ", ";
}
if (print) std::cout << std::endl;
}
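// dkeygen: emits one key per group of dim consecutive elements, so
// reduce_by_key produces one (squared) distance per centroid/individual pair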
struct dkeygen : public thrust::unary_function<int, int>
{
int dim;
int numd;
dkeygen(const int _dim, const int _numd) : dim(_dim), numd(_numd) {};
__host__ __device__ int operator()(const int val) const {
return (val/dim);
}
};
typedef thrust::tuple<float, float> mytuple;
struct my_dist : public thrust::unary_function<mytuple, float>
{
__host__ __device__ float operator()(const mytuple &my_tuple) const {
float temp = thrust::get<0>(my_tuple) - thrust::get<1>(my_tuple);
return temp*temp;
}
};
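// d_idx: permutation index into the data (I) vector; the whole data set
// (dim*numd elements) repeats once per centroid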
struct d_idx : public thrust::unary_function<int, int>
{
int dim;
int numd;
d_idx(int _dim, int _numd) : dim(_dim), numd(_numd) {};
__host__ __device__ int operator()(const int val) const {
return (val % (dim*numd));
}
};
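// c_idx: permutation index into the centroid (C) vector; each centroid's
// dim values repeat numd times before moving on to the next centroid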
struct c_idx : public thrust::unary_function<int, int>
{
int dim;
int numd;
c_idx(int _dim, int _numd) : dim(_dim), numd(_numd) {};
__host__ __device__ int operator()(const int val) const {
return (val % dim) + (dim * (val/(dim*numd)));
}
};
struct my_sqrt : public thrust::unary_function<float, float>
{
__host__ __device__ float operator()(const float val) const {
return sqrtf(val);
}
};
unsigned long long eucl_dist_thrust(thrust::host_vector<float> &centroids, thrust::host_vector<float> &data, thrust::host_vector<float> &dist, int num_centroids, int dim, int num_data, int print){
thrust::device_vector<float> d_data = data;
thrust::device_vector<float> d_centr = centroids;
thrust::device_vector<float> values_out(num_centroids*num_data);
unsigned long long compute_time = dtime_usec(0);
thrust::reduce_by_key(thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), dkeygen(dim, num_data)), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(dim*num_data*num_centroids), dkeygen(dim, num_data)),thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_centr.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), c_idx(dim, num_data))), thrust::make_permutation_iterator(d_data.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), d_idx(dim, num_data))))), my_dist()), thrust::make_discard_iterator(), values_out.begin());
thrust::transform(values_out.begin(), values_out.end(), values_out.begin(), my_sqrt());
cudaDeviceSynchronize();
compute_time = dtime_usec(compute_time);
if (print){
thrust::copy(values_out.begin(), values_out.end(), std::ostream_iterator<float>(std::cout, ", "));
std::cout << std::endl;
}
thrust::copy(values_out.begin(), values_out.end(), dist.begin());
return compute_time;
}
int main(int argc, char *argv[]){
int dim = 8;
int num_centroids = 2;
float centroids[] = {
0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612,
0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441
};
int num_data = 8;
float data[] = {
0.314, 0.504, 0.030, 0.215, 0.647, 0.045, 0.443, 0.325,
0.731, 0.354, 0.696, 0.604, 0.954, 0.673, 0.625, 0.744,
0.615, 0.936, 0.045, 0.779, 0.169, 0.589, 0.303, 0.869,
0.275, 0.406, 0.003, 0.763, 0.471, 0.748, 0.230, 0.769,
0.903, 0.489, 0.135, 0.599, 0.094, 0.088, 0.272, 0.719,
0.112, 0.448, 0.809, 0.157, 0.227, 0.978, 0.747, 0.530,
0.908, 0.121, 0.321, 0.911, 0.884, 0.792, 0.658, 0.114,
0.721, 0.555, 0.979, 0.412, 0.007, 0.501, 0.844, 0.234
};
std::cout << "cpu results: " << std::endl;
float dist[num_data*num_centroids];
eucl_dist_cpu(centroids, data, dist, num_centroids, dim, num_data, 1);
thrust::host_vector<float> h_data(data, data + (sizeof(data)/sizeof(float)));
thrust::host_vector<float> h_centr(centroids, centroids + (sizeof(centroids)/sizeof(float)));
thrust::host_vector<float> h_dist(num_centroids*num_data);
std::cout << "gpu results: " << std::endl;
eucl_dist_thrust(h_centr, h_data, h_dist, num_centroids, dim, num_data, 1);
float *data2, *centroids2, *dist2;
num_centroids = 10;
num_data = 1000000;
if (argc > 2) {
num_centroids = atoi(argv[1]);
num_data = atoi(argv[2]);
if ((num_centroids < 1) || (num_centroids > MAX_CENT)) {std::cout << "Num centroids out of range" << std::endl; return 1;}
if ((num_data < 1) || (num_data > MAX_DATA)) {std::cout << "Num data out of range" << std::endl; return 1;}
if (num_data * dim * num_centroids > 2000000000) {std::cout << "data set out of range" << std::endl; return 1;}}
std::cout << "Num Data: " << num_data << std::endl;
std::cout << "Num Cent: " << num_centroids << std::endl;
std::cout << "result size: " << ((num_data*num_centroids*4)/1048576) << " Mbytes" << std::endl;
data2 = new float[dim*num_data];
centroids2 = new float[dim*num_centroids];
dist2 = new float[num_data*num_centroids];
for (int i = 0; i < dim*num_data; i++) data2[i] = rand()/(float)RAND_MAX;
for (int i = 0; i < dim*num_centroids; i++) centroids2[i] = rand()/(float)RAND_MAX;
unsigned long long dtime = dtime_usec(0);
eucl_dist_cpu(centroids2, data2, dist2, num_centroids, dim, num_data, 0);
dtime = dtime_usec(dtime);
std::cout << "cpu time: " << dtime/(float)USECPSEC << "s" << std::endl;
thrust::host_vector<float> h_data2(data2, data2 + (dim*num_data));
thrust::host_vector<float> h_centr2(centroids2, centroids2 + (dim*num_centroids));
thrust::host_vector<float> h_dist2(num_data*num_centroids);
dtime = dtime_usec(0);
unsigned long long ctime = eucl_dist_thrust(h_centr2, h_data2, h_dist2, num_centroids, dim, num_data, 0);
dtime = dtime_usec(dtime);
std::cout << "gpu total time: " << dtime/(float)USECPSEC << "s, gpu compute time: " << ctime/(float)USECPSEC << "s" << std::endl;
if (!verify(dist2, &(h_dist2[0]), num_data*num_centroids)) {std::cout << "Verification failure." << std::endl; return 1;}
std::cout << "Success!" << std::endl;
return 0;
}
Notes:
The code is set up to do 2 passes, a short one using a data set similar to yours, with printout for visual check. Then a larger data set can be entered, via command-line sizing parameters (number of centroids, then number of individuals), for benchmark comparison and validation of results.
Contrary to what I stated in the comments, the thrust code is only running about 25% faster than the naive single-threaded CPU code. Your mileage may vary.
This is just one way to think about handling it. I have had other ideas, but not enough time to flesh them out.
The data sets can become rather large. The code right now is intended to be limited to data sets where the product of dimension*number_of_centroids*number_of_individuals is less than 2 billion. However, as you approach even this number, you will need a GPU and CPU that both have a few GB of memory. I briefly explored larger data set sizes. A few code changes would be needed in various places to extend from e.g. int to unsigned long long, etc. However I haven't provided that as I am still investigating an issue with that code.
For another, non-thrust-related look at computing euclidean distances on the GPU, you may be interested in this question. If you follow the sequence of optimizations that were made there, it may shed some light on either how this thrust code might be improved, or else how another non-thrust realization could be used.
Sorry I wasn't able to squeeze more performance out.
I was interested how std::inner_product() performs compared with a manual dot-product calculation, so I did a test.
std::inner_product() was 4x faster than the manual implementation. I find this odd because there aren't really that many ways to calculate it, surely?! I also cannot see any SSE/AVX registers being used at the point of calculation.
Setup: VS2013/MSVC(12?), Haswell i7 4770 CPU, 64-bit compilation, release mode.
Here is the C++ test code:
#include <iostream>
#include <functional>
#include <numeric>
#include <cstdint>
#include <intrin.h> // __rdtsc / __rdtscp (MSVC intrinsics)
int main() {
const int arraySize = 1000;
const int numTests = 500;
unsigned int aux = 0; // auxiliary output required by __rdtscp
unsigned long long* array1 = new unsigned long long[arraySize];
unsigned long long* array2 = new unsigned long long[arraySize];
//Initialise arrays
for (int i = 0; i < arraySize; i++){
unsigned long long val = __rdtsc();
array1[i] = val;
array2[i] = val;
}
//std::inner_product test
unsigned long long timingBegin1 = __rdtscp(&aux);
for (int i = 0; i < numTests; i++){
volatile unsigned long long result = std::inner_product(array1, array1 + arraySize, array2, static_cast<uint64_t>(0));
}
unsigned long long timingEnd1 = __rdtscp(&aux);
//Manual Dot Product test
unsigned long long timingBegin2 = __rdtscp(&aux);
for (int i = 0; i < numTests; i++){
volatile unsigned long long result = 0;
for (int i = 0; i < arraySize; i++){
result += (array1[i] * array2[i]);
}
}
unsigned long long timingEnd2 = __rdtscp(&aux);
std::cout << "STL: : " << static_cast<double>(finish1 - start1) / numTests << " CPU cycles per dot product" << std::endl;
std::cout << "Manually : " << static_cast<double>(finish2 - start2) / numTests << " CPU cycles per dot product" << std::endl;
Your test is bad, and this is likely to make a big difference.
volatile uint64_t result = 0;
for (int i = 0; i < arraySize; i++){
result += (array1[i] * array2[i]);
Note how you're continually using the volatile-qualified variable here. This forces the compiler to write the running sum out to memory (and read it back) on every iteration.
In contrast, your inner_product version:
volatile uint64_t result = std::inner_product(array1, array1 + arraySize, array2, static_cast<uint64_t>(0));
first calculates the inner product with full optimisation allowed, and only then assigns the result to a volatile-qualified variable.
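A fairer manual-loop timing (just a sketch, reusing the names from your test) accumulates into an ordinary local and performs a single volatile store at the end, mirroring what the inner_product version does:
unsigned long long sum = 0;
for (int i = 0; i < arraySize; i++){
sum += array1[i] * array2[i];
}
volatile unsigned long long result = sum; // one volatile store per test, as in the inner_product case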
Hey, my friends and I are trying to beat each other's runtimes for generating "Self Numbers" between 1 and a million. I've written mine in c++ and I'm still trying to shave off precious time.
Here's what I have so far:
#include <iostream>
using namespace std;
bool v[1000054]; // digit sums below 1000000 reach at most 54, so leave headroom for v[i + digitsum(i)]
int main(void) {
long non_self = 0;
for(long i = 1; i < 1000000; ++i) {
if(!(v[i])) std::cout << i << '\n';
non_self = i + (i%10) + (i/10)%10 + (i/100)%10 + (i/1000)%10 + (i/10000)%10 +(i/100000)%10;
v[non_self] = 1;
}
std::cout << "1000000" << '\n';
return 0;
}
The code works fine now, I just want to optimize it.
Any tips? Thanks.
I built an alternate C solution that doesn't require any modulo or division operations:
#include <stdio.h>
#include <string.h>
int main(int argc, char *argv[]) {
int v[1100000];
int j1, j2, j3, j4, j5, j6, s, n=0;
memset(v, 0, sizeof(v));
for (j6=0; j6<10; j6++) {
for (j5=0; j5<10; j5++) {
for (j4=0; j4<10; j4++) {
for (j3=0; j3<10; j3++) {
for (j2=0; j2<10; j2++) {
for (j1=0; j1<10; j1++) {
s = j6 + j5 + j4 + j3 + j2 + j1;
v[n + s] = 1;
n++;
}
}
}
}
}
}
for (n=1; n<=1000000; n++) {
if (!v[n]) printf("%6d\n", n);
}
}
It generates 97786 self numbers including 1 and 1000000.
With output, it takes
real 0m1.419s
user 0m0.060s
sys 0m0.152s
When I redirect output to /dev/null, it takes
real 0m0.030s
user 0m0.024s
sys 0m0.004s
on my 3 GHz quad-core rig.
For comparison, your version produces the same number of numbers, so I assume we're either both correct or equally wrong; but your version chews up
real 0m0.064s
user 0m0.060s
sys 0m0.000s
under the same conditions, or about 2x as much.
Part of the difference may also be that you're using longs, which is unnecessary on my machine: here, int goes up to 2 billion. Maybe you should check INT_MAX on yours?
Update
I had a hunch that it may be better to calculate the sum piecewise. Here's my new code:
#include <stdio.h>
#include <string.h>
int main(int argc, char *argv[]) {
char v[1100000];
int j1, j2, j3, j4, j5, j6, s, n=0;
int s1, s2, s3, s4, s5;
memset(v, 0, sizeof(v));
for (j6=0; j6<10; j6++) {
for (j5=0; j5<10; j5++) {
s5 = j6 + j5;
for (j4=0; j4<10; j4++) {
s4 = s5 + j4;
for (j3=0; j3<10; j3++) {
s3 = s4 + j3;
for (j2=0; j2<10; j2++) {
s2 = s3 + j2;
for (j1=0; j1<10; j1++) {
v[s2 + j1 + n++] = 1;
}
}
}
}
}
}
for (n=1; n<=1000000; n++) {
if (!v[n]) printf("%d\n", n);
}
}
...and what do you know, that brought down the time for the top loop from 12 ms to 4 ms. Or maybe 8, my clock seems to be getting a bit jittery way down there.
State of affairs, Summary
The actual finding of self numbers up to 1M is now taking roughly 4 ms, and I'm having trouble measuring any further improvements. On the other hand, as long as output is to the console, it will continue to take about 1.4 seconds, my best efforts to leverage buffering notwithstanding. The I/O time so drastically dwarfs computation time that any further optimization would be essentially futile. Thus, although inspired by further comments, I've decided to leave well enough alone.
All times cited are on my (pretty fast) machine and are for comparison purposes with each other only. Your mileage may vary.
Generate the numbers once, copy the output into your code as a gigantic string. Print the string.
Those mods (%) look expensive. If you are allowed to move to base 16 (or even base 2), then you can probably code this a lot faster. If you have to stay in decimal, try creating an array of digits for each place (units, tens, hundreds) and build some rollover code. That will make summing the digits far easier.
Alternatively, you could recognise the behaviour of the core self function (let's call it s):
s = n + f(b,n)
where f(b,n) is the sum of the digits of the number n in base b.
For base 10, it's clear that as the ones (also known as least significant) digit moves through 0, 1, 2, ..., 9, n and f(b,n) proceed in lockstep as you move from n to n+1; it's only in the 10% of cases where a 9 rolls over to 0 that they don't, so:
f(b,n+1) = f(b,n) + 1 // 90% of the time
thus the core self function s advances as
n+1 + f(b,n+1) = n + 1 + f(b,n) + 1 = n + f(b,n) + 2
s(n+1) = s(n) + 2 // again, 90% of the time
In the remaining (and easily identifiable) 10% of the time, the 9 rolls back to zero and adds one to the next digit, which in the simplest case subtracts 9-1 = 8 from the running total, but might cascade up through a series of 9s, subtracting 18-1, 27-1, and so on.
So the first optimisation can remove most of the work from 90% of your cycles!
if ((n % 10) != 0)
{
n + f(b,n) = n-1 + f(b,n-1) + 2;
}
or
if ((n % 10) != 0)
{
s = old_s + 2;
}
That should be enough to substantially increase your performance without really changing your algorithm.
If you want more, then work out a simple algorithm for the change between iterations for the remaining 10%.
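For illustration, here is a minimal sketch of that idea (hypothetical code, not benchmarked against the other solutions in this thread): take the +2 shortcut 90% of the time, and recompute the digit sum from scratch only when the ones digit rolls over:
#include <stdio.h>
static char v[1000000 + 64]; /* headroom: v[n + digitsum(n)] can reach 1000053 */
int main(void)
{
int s = 2; /* s(1) = 1 + digitsum(1) */
v[s] = 1;
for (int n = 2; n <= 1000000; ++n) {
if (n % 10 != 0) {
s += 2; /* 90% case: n and digitsum(n) both advance by 1 */
} else {
s = n; /* 10% case: recompute from scratch */
for (int t = n; t > 0; t /= 10)
s += t % 10;
}
v[s] = 1; /* mark n + digitsum(n) as non-self */
}
for (int n = 1; n <= 1000000; ++n)
if (!v[n])
printf("%d\n", n);
return 0;
}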
If you want your output to be fast, it may be worth investigating replacing iostream output with plain old printf() - depends on the rules for winning the competition whether this is important.
Multithread (use different arrays/ranges for every thread). Also, don't use more threads than your number of CPU cores =)
cout or printf within a loop will be slow. If you can remove any prints from a loop you will see significant performance increase.
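One way to get the printing out of the hot loop entirely is to format everything into one big buffer and write it once. A sketch (assuming the sieve array v[] from the question has already been filled; needs <cstdio>):
static char out[8 * 1000000]; /* generous buffer for all output lines */
char *p = out;
for (int n = 1; n < 1000000; ++n)
if (!v[n])
p += sprintf(p, "%d\n", n); /* sprintf returns the number of characters written */
fwrite(out, 1, p - out, stdout); /* one large write instead of ~100k small ones */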
Since the range is limited (1 to 1000000) the maximum sum of the digits does not exceed 9*6 = 54. This means that to implement the sieve a circular buffer of 54 elements should be perfectly sufficient (and the size of the sieve grows very slowly as the range increases).
You already have a sieve-based solution, but it is based on pre-building the full-length buffer (sieve of 1000000 elements), which is rather inelegant (if not completely unacceptable). The performance of your solution also suffers from non-locality of memory access.
For example, this is a possible very simple implementation
#include <stdio.h>
#define N 1000000U
void print_self_numbers(void)
{
#define NMARKS 64U /* make it 64 just in case (and to make division work faster :) */
unsigned char marks[NMARKS] = { 0 };
unsigned i, imark;
for (i = 1, imark = i; i <= N; ++i, imark = (imark + 1) % NMARKS)
{
unsigned digits, sum;
if (!marks[imark])
printf("%u ", i);
else
marks[imark] = 0;
sum = i;
for (digits = i; digits > 0; digits /= 10)
sum += digits % 10;
marks[sum % NMARKS] = 1;
}
}
(I'm not going for the best possible performance in terms of CPU clocks here, just illustrating the key idea with the circular buffer.)
Of course, the range can be easily turned into a parameter of the function, while the size of the circular buffer can be easily calculated at run-time from the range.
As for "optimizations"... There's no point in trying to optimize the code that contains I/O operations. You won't achieve anything by such optimizations. If you want to analyze the performance of the algorithm itself, you'll have to put the generated numbers into an output array and print them later.
For such a simple task, the best option would be to think of alternative algorithms that produce the same result. %10 is not usually considered a fast operation.
Why not use the recurrence relation given on the wikipedia page instead?
That should be blazingly fast.
EDIT: Ignore this .. the recurrence relation generates some but not all of the self numbers.
In fact it generates only very few of them. That's not particularly clear from the wikipedia page though :(
This may help speed up C++ iostreams output:
cin.tie(0);
ios::sync_with_stdio(false);
Put them in main before you start writing to cout. The second line lets iostreams buffer output on its own instead of synchronising with C stdio on every operation; the first stops cout from being flushed before each read of cin.
I created a CUDA-based solution based on Carl Smotricz's second algorithm. The code to identify Self Numbers itself is extremely fast -- on my machine it executes in ~45 microseconds; this is about 150x faster than Carl Smotricz's algorithm, which ran in 7 milliseconds on my machine.
There is a bottleneck, however, and that seems to be the PCIe interface. It took my code a whopping 43 milliseconds to move the computed data from the graphics card back to RAM. This might be optimizable, and I will look in to this.
Still, 45 microseconds is pretty darn fast. Scary fast, actually, and I added code to my program which runs Carl Smotricz's algorithm and compares the results for accuracy. The results are accurate. Here is the program output (compiled in VS2008, 64-bit, Windows 7):
UPDATE
I recompiled this code in release mode with full optimization and using static runtime libraries, with significant results. The optimizer seems to have done very well with Carl's algorithm, reducing the runtime from 7 ms to 1 ms. The CUDA implementation sped up as well, from 35 us to 20 us. The memory copy from video card to RAM was unaffected.
Program Output:
Running on device: 'Quadro NVS 295'
Reference Implementation Ran In 15603 ticks (7 ms)
Kernel Executed in 40 ms -- Breakdown:
[kernel] : 35 us (0.09%)
[memcpy] : 40 ms (99.91%)
CUDA Implementation Ran In 111889 ticks (51 ms)
Compute Slots: 1000448 (1954 blocks X 512 threads)
Number of Errors: 0
The code is as follows:
file : main.h
#pragma once
#include <cstdlib>
#include <functional>
#include <utility> // std::pair / std::make_pair
#include <string>  // std::string
#include <cstdio>  // sprintf
#include <cmath>   // floor, fmod
typedef std::pair<int*, size_t> sized_ptr;
static sized_ptr make_sized_ptr(int* ptr, size_t size)
{
return std::make_pair(ptr, size);
}
__host__ void ComputeSelfNumbers(sized_ptr hostMem, sized_ptr deviceMemory, unsigned const blocks, unsigned const threads);
inline std::string format_elapsed(double d)
{
char buf[256] = {0};
if( d < 0.00000001 )
{
// show in ps with 4 digits
sprintf(buf, "%0.4f ps", d * 1000000000000.0);
}
else if( d < 0.00001 )
{
// show in ns
sprintf(buf, "%0.0f ns", d * 1000000000.0);
}
else if( d < 0.001 )
{
// show in us
sprintf(buf, "%0.0f us", d * 1000000.0);
}
else if( d < 0.1 )
{
// show in ms
sprintf(buf, "%0.0f ms", d * 1000.0);
}
else if( d <= 60.0 )
{
// show in seconds
sprintf(buf, "%0.2f s", d);
}
else if( d < 3600.0 )
{
// show in min:sec
sprintf(buf, "%01.0f:%02.2f", floor(d/60.0), fmod(d,60.0));
}
// show in h:min:sec
else
sprintf(buf, "%01.0f:%02.0f:%02.2f", floor(d/3600.0), floor(fmod(d,3600.0)/60.0), fmod(d,60.0));
return buf;
}
inline std::string format_pct(double d)
{
char buf[256] = {0};
sprintf(buf, "%.2f", 100.0 * d);
return buf;
}
file: main.cpp
#define _CRT_SECURE_NO_WARNINGS
#include <windows.h>
#include "C:\CUDA\include\cuda_runtime.h"
#include <cstdlib>
#include <iostream>
#include <string>
using namespace std;
#include <cmath>
#include <map>
#include <algorithm>
#include <list>
#include "main.h"
int main()
{
unsigned numVals = 1000000;
int* gold = new int[numVals];
memset(gold, 0, sizeof(int)*numVals);
LARGE_INTEGER li = {0}, li2 = {0};
QueryPerformanceFrequency(&li);
__int64 freq = li.QuadPart;
// get cuda properties...
cudaDeviceProp cdp = {0};
cudaError_t err = cudaGetDeviceProperties(&cdp, 0);
cout << "Running on device: '" << cdp.name << "'" << endl;
// first run the reference implementation
QueryPerformanceCounter(&li);
for( int j6=0, n = 0; j6<10; j6++ )
{
for( int j5=0; j5<10; j5++ )
{
for( int j4=0; j4<10; j4++ )
{
for( int j3=0; j3<10; j3++ )
{
for( int j2=0; j2<10; j2++ )
{
for( int j1=0; j1<10; j1++ )
{
int s = j6 + j5 + j4 + j3 + j2 + j1;
gold[n + s] = 1;
n++;
}
}
}
}
}
}
QueryPerformanceCounter(&li2);
__int64 ticks = li2.QuadPart-li.QuadPart;
cout << "Reference Implementation Ran In " << ticks << " ticks" << " (" << format_elapsed((double)ticks/(double)freq) << ")" << endl;
// now run the cuda version...
unsigned threads = cdp.maxThreadsPerBlock;
unsigned blocks = numVals/threads;
if( numVals%threads ) ++blocks;
unsigned computeSlots = blocks * threads; // this may be != the number of vals since we want 32-thread warps
// allocate device memory for test
int* deviceTest = 0;
err = cudaMalloc(&deviceTest, sizeof(int)*computeSlots);
err = cudaMemset(deviceTest, 0, sizeof(int)*computeSlots);
int* hostTest = new int[numVals]; // the repository for the resulting data on the host
memset(hostTest, 0, sizeof(int)*numVals);
// run the CUDA code...
LARGE_INTEGER li3 = {0}, li4={0};
QueryPerformanceCounter(&li3);
ComputeSelfNumbers(make_sized_ptr(hostTest, numVals), make_sized_ptr(deviceTest, computeSlots), blocks, threads);
QueryPerformanceCounter(&li4);
__int64 ticksCuda = li4.QuadPart-li3.QuadPart;
cout << "CUDA Implementation Ran In " << ticksCuda << " ticks" << " (" << format_elapsed((double)ticksCuda/(double)freq) << ")" << endl;
cout << "Compute Slots: " << computeSlots << " (" << blocks << " blocks X " << threads << " threads)" << endl;
unsigned errorCount = 0;
for( size_t i = 0; i < numVals; ++i )
{
if( gold[i] != hostTest[i] )
{
++errorCount;
}
}
cout << "Number of Errors: " << errorCount << endl;
return 0;
}
file: self.cu
#pragma warning( disable : 4231)
#include <windows.h>
#include <cstdlib>
#include <vector>
#include <iostream>
#include <string>
#include <iomanip>
using namespace std;
#include "main.h"
__global__ void SelfNum(int * slots)
{
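// NOTE: the __shared__ variables below are written concurrently by every
// thread in the block, so the threads race with one another; this is the
// "questionable use of __shared__ memory" fixed in UPDATE2 below.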
__shared__ int N;
N = (blockIdx.x * blockDim.x) + threadIdx.x;
const int numDigits = 10;
__shared__ int digits[numDigits];
for( int i = 0, temp = N; i < numDigits; ++i, temp /= 10 )
{
digits[numDigits-i-1] = temp - 10 * (temp/10) /*temp % 10*/;
}
__shared__ int s;
s = 0;
for( int i = 0; i < numDigits; ++i )
s += digits[i];
slots[N+s] = 1;
}
__host__ void ComputeSelfNumbers(sized_ptr hostMem, sized_ptr deviceMem, const unsigned blocks, const unsigned threads)
{
LARGE_INTEGER li = {0};
QueryPerformanceFrequency(&li);
double freq = (double)li.QuadPart;
LARGE_INTEGER liStart = {0};
QueryPerformanceCounter(&liStart);
// run the kernel
SelfNum<<<blocks, threads>>>(deviceMem.first);
LARGE_INTEGER liKernel = {0};
QueryPerformanceCounter(&liKernel);
cudaMemcpy(hostMem.first, deviceMem.first, hostMem.second*sizeof(int), cudaMemcpyDeviceToHost); // dont copy the overflow - just throw it away
LARGE_INTEGER liMemcpy = {0};
QueryPerformanceCounter(&liMemcpy);
// display performance stats
double e = double(liMemcpy.QuadPart - liStart.QuadPart)/freq,
eKernel = double(liKernel.QuadPart - liStart.QuadPart)/freq,
eMemcpy = double(liMemcpy.QuadPart - liKernel.QuadPart)/freq;
double pKernel = eKernel/e,
pMemcpy = eMemcpy/e;
cout << "Kernel Executed in " << format_elapsed(e) << " -- Breakdown: " << endl
<< " [kernel] : " << format_elapsed(eKernel) << " (" << format_pct(pKernel) << "%)" << endl
<< " [memcpy] : " << format_elapsed(eMemcpy) << " (" << format_pct(pMemcpy) << "%)" << endl;
}
UPDATE2:
I refactored my CUDA implementation to try to speed it up a bit. I did this by unrolling loops manually, fixing some questionable use of __shared__ memory which might have been an error, and getting rid of some redundancy.
The output of my new kernel is:
Reference Implementation Ran In 69610 ticks (5 ms)
Kernel Executed in 2 ms -- Breakdown:
[kernel] : 39 us (1.57%)
[memcpy] : 2 ms (98.43%)
CUDA Implementation Ran In 62970 ticks (4 ms)
Compute Slots: 1000448 (1954 blocks X 512 threads)
Number of Errors: 0
The only code I changed is the kernel itself, so that's all I will post here:
__global__ void SelfNum(int * slots)
{
int N = (blockIdx.x * blockDim.x) + threadIdx.x;
int s = 0;
int temp = N;
// unrolled digit sum: each divide/subtract pair computes temp % 10,
// then temp /= 10 shifts down to the next digit
s += temp - 10 * (temp/10);
temp /= 10; s += temp - 10 * (temp/10);
temp /= 10; s += temp - 10 * (temp/10);
temp /= 10; s += temp - 10 * (temp/10);
temp /= 10; s += temp - 10 * (temp/10);
temp /= 10; s += temp - 10 * (temp/10);
temp /= 10; s += temp - 10 * (temp/10);
temp /= 10; s += temp - 10 * (temp/10);
temp /= 10; s += temp - 10 * (temp/10);
temp /= 10; s += temp - 10 * (temp/10);
slots[N+s] = 1;
}
I wonder if multi-threading would help. This algorithm looks like it would lend itself well to multi-threading. (Poor-man's test of this: Create two copies of the program and run them at the same time. If it runs in less than 200% of the time, multi-threading may help).
I was actually surprised that the code below was faster than any other posted here. I probably measured it wrong, but maybe it helps; or at least it's interesting.
#include <iostream>
#include <boost/progress.hpp>
#include <algorithm> // std::for_each
#include <cstring>   // memset
class SelfCalc
{
private:
bool array[1000000];
int non_self;
public:
SelfCalc()
{
memset(&array, 0, sizeof(array));
}
void operator()(const int i)
{
if (!(array[i]))
std::cout << i << '\n';
non_self = i + (i%10) + (i/10)%10 + (i/100)%10 + (i/1000)%10 + (i/10000)%10 +(i/100000)%10;
array[non_self] = true;
}
};
class IntIterator
{
private:
int value;
public:
IntIterator(const int _value):value(_value){}
int operator*(){ return value; }
bool operator!=(const IntIterator &v){ return value != v.value; }
int operator++(){ return ++value; }
};
int main()
{
boost::progress_timer t;
SelfCalc selfCalc;
IntIterator i(1), end(100000); // note: this covers only one tenth of the question's 1..1000000 range
std::for_each(i, end, selfCalc);
std::cout << 100000 << std::endl;
return 0;
}
Fun problem. The problem as stated does not specify what base it must be in. I fiddled around with it some and wrote a base-2 version. It generates an extra few thousand entries because the termination point of 1,000,000 is not as natural with base-2. This pre-counts the number of bits in a byte for a table lookup. The generation of the result set (without the I/O) took 2.4 ms.
One interesting thing (assuming I wrote it correctly) is that the base-2 version has about 250,000 "self numbers" up to 1,000,000 while there are just under 100,000 base-10 self numbers in that range.
#include <windows.h>
#include <stdio.h>
#include <string.h>
void StartTimer( __int64 *pt1 )
{
QueryPerformanceCounter( (LARGE_INTEGER*)pt1 );
}
double StopTimer( __int64 t1 )
{
__int64 t2, ldFreq;
QueryPerformanceCounter( (LARGE_INTEGER*)&t2 );
QueryPerformanceFrequency( (LARGE_INTEGER*)&ldFreq );
return ((double)( t2 - t1 ) / (double)ldFreq) * 1000.0;
}
#define RANGE 1000000
char sn[0x100000 + 32];
int bitCount[256];
// precompute bitcounts for each byte
void PreCountBits()
{
int i;
// generate count of bits in each byte
memset( bitCount, 0, sizeof( bitCount ));
for ( i = 0; i < 256; i++ )
{
int tmp = i;
while ( tmp )
{
if ( tmp & 0x01 )
bitCount[i]++;
tmp >>= 1;
}
}
}
void GenBase2( )
{
int i;
int *b1, *b2, *b3;
int b1sum, b2sum, b3sum;
i = 0;
for ( b1 = bitCount; b1 < bitCount + 256; b1++ )
{
b1sum = *b1;
for ( b2 = bitCount; b2 < bitCount + 256; b2++ )
{
b2sum = b1sum + *b2;
for ( b3 = bitCount; b3 < bitCount + 256; b3++ )
{
sn[i++ + *b3 + b2sum] = 1;
}
}
// 1000000 does not provide a great termination number for base 2. So check
// here. Overshoots the target some but avoids repeated checks
if ( i > RANGE )
return;
}
}
int main( int argc, char* argv[] )
{
int i = 0;
__int64 t1;
memset( sn, 0, sizeof( sn ));
StartTimer( &t1 );
PreCountBits();
GenBase2();
printf( "Generation time = %.3f\n", StopTimer( t1 ));
#if 1
for ( i = 1; i <= RANGE; i++ )
if ( !sn[i] ) printf( "%d\n", i );
#endif
return 0;
}
Maybe try just computing the recurrence relation defined below?
http://en.wikipedia.org/wiki/Self_number