I need a fast and efficient implementation for finding the index of the maximum value in an array in CUDA. This operation needs to be performed several times. I originally used cublasIsamax for this, however, it sadly returns the index of the maximum absolute value, which is not what I want. Instead, I'm using thrust::max_element, however the speed is rather slow in comparison to cublasIsamax. I use it in the following manner:
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
The number of elements in the vector range between 10'000 and 20'000. The difference in speed between thrust::max_element and cublasIsamax is rather big. Perhaps I'm performing several memory transactions without knowing?
A more efficient implementation would be to write your own max-index reduction code in CUDA. It's likely that cublasIsamax is using something like this under the hood.
We can compare 3 approaches:
thrust::max_element
cublasIsamax
custom CUDA kernel
Here's a fully worked example:
$ cat t665.cu
#include <cublas_v2.h>
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <stdlib.h>
#define DSIZE 10000
// nTPB should be a power-of-2
#define nTPB 256
#define MAX_KERNEL_BLOCKS 30
#define MAX_BLOCKS ((DSIZE/nTPB)+1)
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f
#include <time.h>
#include <sys/time.h>
unsigned long long dtime_usec(unsigned long long prev){
#define USECPSEC 1000000ULL
timeval tv1;
gettimeofday(&tv1,0);
return ((tv1.tv_sec * USECPSEC)+tv1.tv_usec) - prev;
}
__device__ volatile float blk_vals[MAX_BLOCKS];
__device__ volatile int blk_idxs[MAX_BLOCKS];
__device__ int blk_num = 0;
template <typename T>
__global__ void max_idx_kernel(const T *data, const int dsize, int *result){
__shared__ volatile T vals[nTPB];
__shared__ volatile int idxs[nTPB];
__shared__ volatile int last_block;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
last_block = 0;
T my_val = FLOAT_MIN;
int my_idx = -1;
// sweep from global memory
while (idx < dsize){
if (data[idx] > my_val) {my_val = data[idx]; my_idx = idx;}
idx += blockDim.x*gridDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
// perform block-level reduction
if (!threadIdx.x){
blk_vals[blockIdx.x] = vals[0];
blk_idxs[blockIdx.x] = idxs[0];
if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
last_block = 1;}
__syncthreads();
if (last_block){
idx = threadIdx.x;
my_val = FLOAT_MIN;
my_idx = -1;
while (idx < gridDim.x){
if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
idx += blockDim.x;}
// populate shared memory
vals[threadIdx.x] = my_val;
idxs[threadIdx.x] = my_idx;
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i)
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();}
if (!threadIdx.x)
*result = idxs[0];
}
}
int main(){
int nrElements = DSIZE;
float *d_vector, *h_vector;
h_vector = new float[DSIZE];
for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
h_vector[10] = 10; // create definite max element
cublasHandle_t my_handle;
cublasStatus_t my_status = cublasCreate(&my_handle);
cudaMalloc(&d_vector, DSIZE*sizeof(float));
cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
int max_index = 0;
unsigned long long dtime = dtime_usec(0);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;
cudaDeviceSynchronize();
dtime = dtime_usec(dtime);
std::cout << "thrust time: " << dtime/(float)USECPSEC << " max index: " << max_index << std::endl;
max_index = 0;
dtime = dtime_usec(0);
my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
cudaDeviceSynchronize();
dtime = dtime_usec(dtime);
std::cout << "cublas time: " << dtime/(float)USECPSEC << " max index: " << max_index << std::endl;
max_index = 0;
int *d_max_index;
cudaMalloc(&d_max_index, sizeof(int));
dtime = dtime_usec(0);
max_idx_kernel<<<MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB)), nTPB>>>(d_vector, DSIZE, d_max_index);
cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
dtime = dtime_usec(dtime);
std::cout << "kernel time: " << dtime/(float)USECPSEC << " max index: " << max_index << std::endl;
return 0;
}
$ nvcc -O3 -arch=sm_20 -o t665 t665.cu -lcublas
$ ./t665
thrust time: 0.00075 max index: 10
cublas time: 6.3e-05 max index: 11
kernel time: 2.5e-05 max index: 10
$
Notes:
CUBLAS returns an index 1 higher than the others because CUBLAS uses 1-based indexing.
CUBLAS might be quicker if you used CUBLAS_POINTER_MODE_DEVICE, however for validation you would still have to copy the result back to the host.
CUBLAS with CUBLAS_POINTER_MODE_DEVICE should be asynchronous, so the cudaDeviceSynchronize() will be desirable for the host based timing I've shown here. In some cases, thrust can be asynchronous as well.
For convenience and results comparison between CUBLAS and the other methods, I am using all nonnegative values for my data. You may want to adjust the FLOAT_MIN value if you are using negative values as well.
If you're freaky about performance, you can try tuning the nTPB and MAX_KERNEL_BLOCKS parameters to see if you can max out performance on your specific GPU. The kernel code also arguably leaves some performance on the table by not switching carefully into a warp-synchronous mode for the final stages of the (two) threadblock reduction(s).
The threadblock reduction kernel uses a block-draining/last-block strategy to avoid the overhead of an additional kernel launch to perform the final reduction.
I wrote a (probably-inefficient, but anyway..) Rcpp code using inline to simulate a stochastic SEIR model.
The serial version compiles and works perfectly, but since I need to simulate from it a large number of times and since it seems to me like an embarrassingly parallel problem (just need to simulate again for other parameter values and return a matrix with the results) I tried to add #pragma omp parallel for and to compile with -fopenmp -lgomp but ... boom!
I get a segfault even for very small examples!
I tried to add setenv("OMP_STACKSIZE","24M",1); and values well over 24M but still the segfault happens.
I'll explain briefly the code since it's a bit long (I tried to shorten it but the result change and I can't reproduce it..):
I have two nested loops, the inner one execute the model for a given parameter set and the outer one changes the parameters.
The only reason a race condition might happen is if the code were trying to execute set of instructions inside inner the loop in parallel (which cannot be done because of the model structure, on iteration t it depends on iteration t-1) and not to parallelize the outer, but if I'm not mistaken that is what the parallel for constructor does for default if put just outside the outer...
This is basically the form of the code I'm trying to run:
mat result(n_param,T_MAX);
#pragma omp parallel for
for(int i=0,i<n_param_set;i++){
t=0;
rowvec jnk(T_MAX);
while(t < T_MAX){
...
jnk(t) = something(jnk(t-1));
...
t++;
}
result.row(i)=jnk;
}
return wrap(result);
And my question is: How I tell the compiler that I just want to compute in parallel the outer loop (even distributing them statically like n_loops/n_threads for each thread) and not the inner one (which is actually non-parallelizable)?
The real code is a bit more involved and I'll present it here for the sake of reproducibility if you're really willing, but I'm only asking about the behavior of OpenMP. Please notice that the only OpenMP instruction appears at line 122.
library(Rcpp);library(RcppArmadillo);library(inline)
misc='
#include <math.h>
#define _USE_MATH_DEFINES
#include <omp.h>
using namespace arma;
template <typename T> int sgn(T val) {
return (T(0) < val) - (val < T(0));
}
uvec rmultinomial(int n,vec prob)
{
int K = prob.n_elem;
uvec rN = zeros<uvec>(K);
double p_tot = sum(prob);
double pp;
for(int k = 0; k < K-1; k++) {
if(prob(k)>0) {
pp = prob[k] / p_tot;
rN(k) = ((pp < 1.) ? (rbinom(1,(double) n, pp))(0) : n);
n -= rN[k];
} else
rN[k] = 0;
if(n <= 0) /* we have all*/
return rN;
p_tot -= prob[k]; /* i.e. = sum(prob[(k+1):K]) */
}
rN[K-1] = n;
return rN;
}
'
model_and_summary='
mat SEIR_sim_plus_summaries()
{
vec alpha;
alpha << 0.002 << 0.0045;
vec beta;
beta << 0.01 << 0.01;
vec gamma;
gamma << 1.0/14.0 << 1.0/14.0;
vec sigma;
sigma << 1.0/(3.5) << 1.0/(3.5);
vec phi;
phi << 0.8 << 0.8;
int S_0 = 800;
int E_0 = 100;
int I_0 = 100;
int R_0 = 0;
int pop = 1000;
double tau = 0.01;
double t_0 = 0;
vec obs_time;
obs_time << 1 << 2 << 3 << 4 << 5 << 6 << 7 << 8 << 9 << 10 << 11 << 12 << 13 << 14 << 15 << 16 << 17 << 18 << 19 << 20 << 21 << 22 << 23 << 24;
const int n_obs = obs_time.n_elem;
const int n_part = alpha.n_elem;
mat stat(n_part,6);
//#pragma omp parallel for
for(int k=0;k<n_part;k++) {
ivec INC_i(n_obs);
ivec INC_o(n_obs);
// Event variables
double alpha_t;
int nX; //current number of people moving
vec rates(8);
uvec trans(4); // current transitions, e.g. from S to E,I,R,Universe
vec r(4); // rates e.g. from S to E, I, R, Univ.
/*********************** Initialize **********************/
int S_curr = S_0;
int S_prev = S_0;
int E_curr = E_0;
int E_prev = E_0;
int I_curr = I_0;
int I_prev = I_0;
int R_curr = R_0;
int R_prev = R_0;
int IncI_curr = 0;
int IncI_prev = 0;
int IncO_curr = 0;
int IncO_prev = 0;
double t_curr = t_0;
int t_idx =0;
while( t_idx < n_obs ) {
// next time preparation
t_curr += tau;
S_prev = S_curr;
E_prev = E_curr;
I_prev = I_curr;
R_prev = R_curr;
IncI_prev = IncI_curr;
IncO_prev = IncO_curr;
/*********************** description (rates) of the events **********************/
alpha_t = alpha(k)*(1+phi(k)*sin(2*M_PI*(t_curr+0)/52)); //real contact rate, time expressed in weeks
rates(0) = (alpha_t * ((double)I_curr / (double)pop ) * ((double)S_curr)); //e+1, s-1, r,i one s get infected (goes in E, not yey infectous)
rates(1) = (sigma(k) * E_curr); //e-1, i+1, r,s one exposed become infectous (goes in I) INCIDENCE!!
rates(2) = (gamma(k) * I_curr); //i-1, s,e, r+1 one i recover
rates(3) = (beta(k) * I_curr); //i-1, s, r,e one i dies
rates(4) = (beta(k) * R_curr); //i,e, s, r-1 one r dies
rates(5) = (beta(k) * E_curr); //e-1, s, r,i one e dies
rates(6) = (beta(k) * S_curr); //s-1 e, i ,r one s dies
rates(7) = (beta(k) * pop); //s+1 one susc is born
// Let the events occour
/*********************** S compartement **********************/
if((rates(0)+rates(6))>0){
nX = rbinom(1,S_prev,1-exp(-(rates(0)+rates(6))*tau))(0);
r(0) = rates(0)/(rates(0)+rates(6)); r(1) = 0.0; r(2) = 0; r(3) = rates(6)/(rates(0)+rates(6));
trans = rmultinomial(nX, r);
S_curr -= nX;
E_curr += trans(0);
I_curr += trans(1);
R_curr += trans(2);
//trans(3) contains dead individual, who disappear...we could avoid this using sequential conditional binomial
}
/*********************** E compartement **********************/
if((rates(1)+rates(5))>0){
nX = rbinom(1,E_prev,1-exp(-(rates(1)+rates(5))*tau))(0);
r(0) = 0.0; r(1) = rates(1)/(rates(1)+rates(5)); r(2) = 0.0; r(3) = rates(5)/(rates(1)+rates(5));
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr -= nX;
I_curr += trans(1);
R_curr += trans(2);
IncI_curr += trans(1);
}
/*********************** I compartement **********************/
if((rates(2)+rates(3))>0){
nX = rbinom(1,I_prev,1-exp(-(rates(2)+rates(3))*tau))(0);
r(0) = 0.0; r(1) = 0.0; r(2) = rates(2)/(rates(2)+rates(3)); r(3) = rates(3)/(rates(2)+rates(3));
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr += trans(1);
I_curr -= nX;
R_curr += trans(2);
IncO_curr += trans(2);
}
/*********************** R compartement **********************/
if(rates(4)>0){
nX = rbinom(1,R_prev,1-exp(-rates(4)*tau))(0);
r(0) = 0.0; r(1) = 0.0; r(2) = 0.0; r(3) = rates(4)/rates(4);
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr += trans(1);
I_curr += trans(2);
R_curr -= nX;
}
/*********************** Universe **********************/
S_curr += pop - (S_curr+E_curr+I_curr+R_curr); //it should be poisson, but since the pop is fixed...
/*********************** Save & Continue **********************/
// Check if the time is interesting for us
if(t_curr > obs_time[t_idx]){
INC_i(t_idx) = IncI_curr;
INC_o(t_idx) = IncO_curr;
IncI_curr = IncI_prev = 0;
IncO_curr = IncO_prev = 0;
t_idx++;
}
//else just go on...
}
/*********************** Finished - Starting w/ stats **********************/
// INC_i is the useful variable, how can I change its reference withour copying it?
ivec incidence = INC_i; //just so if I want to use INC_o i have to change just this...
//Scan the epidemics to recover the summary stats (naively divide the data each 52 weeks)
double n_years = ceil((double)obs_time(n_obs-1)/52.0);
vec mu_attack(n_years);
vec ratio_attack(n_years-1);
vec peak(n_years);
vec atk(52);
peak(0)=0.0;
vec tmpExplo(52); //explosiveness
vec explo(n_years);
int year=0;
int week;
for(week=0 ; week<n_obs ; week++){
if(week - 52*year > 51){
mu_attack(year) = sum( atk )/(double)pop;
if(year>0)
ratio_attack(year-1) = mu_attack(year)/mu_attack(year-1);
for(int i=0;i<52;i++){
if(atk(i)>(peak(year)/2.0)){
tmpExplo(i) = 1.0;
} else {
tmpExplo(i) = 0.0;
}
}
explo(year) = sum(tmpExplo);
year++;
peak(year)=0.0;
}
atk(week-52*year) = incidence(week);
if( peak(year) < incidence(week) )
peak(year)=incidence(week);
}
if(week - 52*year > 51){
mu_attack(year) = sum( atk )/(double)pop;
} else {
ivec idx(52);
for(int i=0;i<52;i++)
{ idx(i) = i; } //take just the updated ones...
vec tmp = atk.elem(find(idx<(week - 52*year)));
mu_attack(year) = sum( tmp )/((double)pop * (tmp.n_elem/52.0));
ratio_attack(year-1) = mu_attack(year)/mu_attack(year-1);
for(int i=0;i<tmp.n_elem;i++){
if(tmp(i)>(peak(year)/2.0)){
tmpExplo(i) = 1.0;
} else {
tmpExplo(i) = 0.0;
}
}
for(int i=tmp.n_elem;i<52;i++)
tmpExplo(i) = 0.0; //to reset the others
explo(year) = sum(tmpExplo);
}
double correlation2;
double correlation4;
vec autocorr = acf(peak);
/***** ACF *****/
if(n_years<3){
correlation2=0.0;
correlation4=0.0;
} else {
if(n_years<5){
correlation2 = autocorr(1);
correlation4 = 0.0;
} else {
correlation2 = autocorr(1);
correlation4 = autocorr(3);
}
}
rowvec jnk(6);
jnk << sum(mu_attack)/(year+1.0)
<< (sum( log(ratio_attack)%log(ratio_attack) )/(n_years-1)) - (pow(sum( log(ratio_attack) )/(n_years-1),2))
<< correlation2 << correlation4 << max(peak) << sum(explo)/n_years;
stat.row(k) = jnk;
}
return stat;
}
'
main='
std::cout << "max_num_threads " << omp_get_max_threads() << std::endl;
RNGScope scope;
mat summaries = SEIR_sim_plus_summaries();
return wrap(summaries);
'
plug = getPlugin("RcppArmadillo")
## modify the plugin for Rcpp to support OpenMP
plug$env$PKG_CXXFLAGS <- paste('-fopenmp', plug$env$PKG_CXXFLAGS)
plug$env$PKG_LIBS <- paste('-fopenmp -lgomp', plug$env$PKG_LIBS)
SEIR_sim_summary = cxxfunction(sig=signature(),main,settings=plug,inc = paste(misc,model_and_summary),verbose=TRUE)
SEIR_sim_summary()
Thanks for the help!
NB: before you ask, I slightly modified the Rcpp multinomial sampling function just because I liked that way more than the one using pointer...not any other particular reason! :)
The core pseudo-random number generators (PRNGs) in R are not designed to be used in multithreaded environments. That is, their state is stored in a static array (dummy from src/main/PRNG.c) and therefore is shared among all threads. Moreover several other static structures are used to store states for the higher-level interfaces to the core PRNGs.
A possible solution could be that you put each call to rnorm() or other sampling functions inside named critical sections with all having the same name, e.g.:
...
#pragma omp critical(random)
rN(k) = ((pp < 1.) ? (rbinom(1,(double) n, pp))(0) : n);
...
if((rates(0)+rates(6))>0){
#pragma omp critical(random)
nX = rbinom(1,S_prev,1-exp(-(rates(0)+rates(6))*tau))(0);
...
Note that the critical construct operates on the structured block following it and therefore locks the entire statement. If a random number is being drawn inline inside a call to a time-consuming function, e.g.
#pragma omp critical(random)
x = slow_computation(rbinom(...));
this is better transformed to:
#pragma omp critical(random)
rb = rbinom(...);
x = slow_computation(rb);
That way only the rb = rbinom(...); statement will be protected.
I was trying to prove a point with OpenMP compared to MPICH, and I cooked up the following example to demonstrate how easy it was to do some high performance in OpenMP.
The Gauss-Seidel iteration is split into two separate runs, such that in each sweep every operation can be performed in any order, and there should be no dependency between each task. So in theory each processor should never have to wait for another process to perform any kind of synchronization.
The problem I am encountering, is that I, independent of problem size, find there is only a weak speed-up of 2 processors and with more than 2 processors it might even be slower.
Many other linear paralleled routine I can obtain very good scaling, but this one is tricky.
My fear is that I am unable to "explain" to the compiler that operation that I perform on the array, is thread-safe, such that it is unable to be really effective.
See the example below.
Anyone has any clue on how to make this more effective with OpenMP?
void redBlackSmooth(std::vector<double> const & b,
std::vector<double> & x,
double h)
{
// Setup relevant constants.
double const invh2 = 1.0/(h*h);
double const h2 = (h*h);
int const N = static_cast<int>(x.size());
double sigma = 0;
// Setup some boundary conditions.
x[0] = 0.0;
x[N-1] = 0.0;
// Red sweep.
#pragma omp parallel for shared(b, x) private(sigma)
for (int i = 1; i < N-1; i+=2)
{
sigma = -invh2*(x[i-1] + x[i+1]);
x[i] = (h2/2.0)*(b[i] - sigma);
}
// Black sweep.
#pragma omp parallel for shared(b, x) private(sigma)
for (int i = 2; i < N-1; i+=2)
{
sigma = -invh2*(x[i-1] + x[i+1]);
x[i] = (h2/2.0)*(b[i] - sigma);
}
}
Addition:
I have now also tried with a raw pointer implementation and it has the same behavior as using STL container, so it can be ruled out that it is some pseudo-critical behavior comming from STL.
First of all, make sure that the x vector is aligned to cache boundaries. I did some test, and I get something like a 100% improvement with your code on my machine (core duo) if I force the alignment of memory:
double * x;
const size_t CACHE_LINE_SIZE = 256;
posix_memalign( reinterpret_cast<void**>(&x), CACHE_LINE_SIZE, sizeof(double) * N);
Second, you can try to assign more computation to each thread (in this way you can keep cache-lines separated), but I suspect that openmp already does something like this under the hood, so it may be worthless with large N.
In my case this implementation is much faster when x is not cache-aligned.
const int workGroupSize = CACHE_LINE_SIZE / sizeof(double);
assert(N % workGroupSize == 0); //Need to tweak the code a bit to let it work with any N
const int workgroups = N / workGroupSize;
int j, base , k, i;
#pragma omp parallel for shared(b, x) private(sigma, j, base, k, i)
for ( j = 0; j < workgroups; j++ ) {
base = j * workGroupSize;
for (int k = 0; k < workGroupSize; k+=2)
{
i = base + k + (redSweep ? 1 : 0);
if ( i == 0 || i+1 == N) continue;
sigma = -invh2* ( x[i-1] + x[i+1] );
x[i] = ( h2/2.0 ) * ( b[i] - sigma );
}
}
In conclusion, you definitely have a problem of cache-fighting, but given the way openmp works (sadly I am not familiar with it) it should be enough to work with properly allocated buffers.
I think the main problem is about type of array structure you are using. Lets try comparing results with vectors and arrays. (Arrays = c-arrays using new operator).
Vector and array sizes are N = 10000000. I force the smoothing function to repeat in order to maintain runtime > 0.1secs.
Vector Time: 0.121007 Repeat: 1 MLUPS: 82.6399
Array Time: 0.164009 Repeat: 2 MLUPS: 121.945
MLUPS = ((N-2)*repeat/runtime)/1000000 (Million Lattice Points Update per second)
MFLOPS are misleading when it comes to grid calculation. A few changes in the basic equation can lead to consider high performance for the same runtime.
The modified code:
double my_redBlackSmooth(double *b, double* x, double h, int N)
{
// Setup relevant constants.
double const invh2 = 1.0/(h*h);
double const h2 = (h*h);
double sigma = 0;
// Setup some boundary conditions.
x[0] = 0.0;
x[N-1] = 0.0;
double runtime(0.0), wcs, wce;
int repeat = 1;
timing(&wcs);
for(; runtime < 0.1; repeat*=2)
{
for(int r = 0; r < repeat; ++r)
{
// Red sweep.
#pragma omp parallel for shared(b, x) private(sigma)
for (int i = 1; i < N-1; i+=2)
{
sigma = -invh2*(x[i-1] + x[i+1]);
x[i] = (h2*0.5)*(b[i] - sigma);
}
// Black sweep.
#pragma omp parallel for shared(b, x) private(sigma)
for (int i = 2; i < N-1; i+=2)
{
sigma = -invh2*(x[i-1] + x[i+1]);
x[i] = (h2*0.5)*(b[i] - sigma);
}
// cout << "In Array: " << r << endl;
}
if(x[0] != 0) dummy(x[0]);
timing(&wce);
runtime = (wce-wcs);
}
// cout << "Before division: " << repeat << endl;
repeat /= 2;
cout << "Array Time:\t" << runtime << "\t" << "Repeat:\t" << repeat
<< "\tMLUPS:\t" << ((N-2)*repeat/runtime)/1000000.0 << endl;
return runtime;
}
I didn't change anything in the code except than array type. For better cache access and blocking you should look into data alignment (_mm_malloc).
Hey, my friends and I are trying to beat each other's runtimes for generating "Self Numbers" between 1 and a million. I've written mine in c++ and I'm still trying to shave off precious time.
Here's what I have so far,
#include <iostream>
using namespace std;
bool v[1000000];
int main(void) {
long non_self = 0;
for(long i = 1; i < 1000000; ++i) {
if(!(v[i])) std::cout << i << '\n';
non_self = i + (i%10) + (i/10)%10 + (i/100)%10 + (i/1000)%10 + (i/10000)%10 +(i/100000)%10;
v[non_self] = 1;
}
std::cout << "1000000" << '\n';
return 0;
}
The code works fine now, I just want to optimize it.
Any tips? Thanks.
I built an alternate C solution that doesn't require any modulo or division operations:
#include <stdio.h>
#include <string.h>
int main(int argc, char *argv[]) {
int v[1100000];
int j1, j2, j3, j4, j5, j6, s, n=0;
memset(v, 0, sizeof(v));
for (j6=0; j6<10; j6++) {
for (j5=0; j5<10; j5++) {
for (j4=0; j4<10; j4++) {
for (j3=0; j3<10; j3++) {
for (j2=0; j2<10; j2++) {
for (j1=0; j1<10; j1++) {
s = j6 + j5 + j4 + j3 + j2 + j1;
v[n + s] = 1;
n++;
}
}
}
}
}
}
for (n=1; n<=1000000; n++) {
if (!v[n]) printf("%6d\n", n);
}
}
It generates 97786 self numbers including 1 and 1000000.
With output, it takes
real 0m1.419s
user 0m0.060s
sys 0m0.152s
When I redirect output to /dev/null, it takes
real 0m0.030s
user 0m0.024s
sys 0m0.004s
on my 3 Ghz quad core rig.
For comparison, your version produces the same number of numbers, so I assume we're either both correct or equally wrong; but your version chews up
real 0m0.064s
user 0m0.060s
sys 0m0.000s
under the same conditions, or about 2x as much.
That, or the fact that you're using longs, which is unnecessary on my machine. Here, int goes up to 2 billion. Maybe you should check INT_MAX on yours?
Update
I had a hunch that it may be better to calculate the sum piecewise. Here's my new code:
#include <stdio.h>
#include <string.h>
int main(int argc, char *argv[]) {
char v[1100000];
int j1, j2, j3, j4, j5, j6, s, n=0;
int s1, s2, s3, s4, s5;
memset(v, 0, sizeof(v));
for (j6=0; j6<10; j6++) {
for (j5=0; j5<10; j5++) {
s5 = j6 + j5;
for (j4=0; j4<10; j4++) {
s4 = s5 + j4;
for (j3=0; j3<10; j3++) {
s3 = s4 + j3;
for (j2=0; j2<10; j2++) {
s2 = s3 + j2;
for (j1=0; j1<10; j1++) {
v[s2 + j1 + n++] = 1;
}
}
}
}
}
}
for (n=1; n<=1000000; n++) {
if (!v[n]) printf("%d\n", n);
}
}
...and what do you know, that brought down the time for the top loop from 12 ms to 4 ms. Or maybe 8, my clock seems to be getting a bit jittery way down there.
State of affairs, Summary
The actual finding of self numbers up to 1M is now taking roughly 4 ms, and I'm having trouble measuring any further improvements. On the other hand, as long as output is to the console, it will continue to take about 1.4 seconds, my best efforts to leverage buffering notwithstanding. The I/O time so drastically dwarfs computation time that any further optimization would be essentially futile. Thus, although inspired by further comments, I've decided to leave well enough alone.
All times cited are on my (pretty fast) machine and are for comparison purposes with each other only. Your mileage may vary.
Generate the numbers once, copy the output into your code as a gigantic string. Print the string.
Those mods (%) look expensive. If you are allowed to move to base 16 (or even base 2), then you can probably code this a lot faster. If you have to stay in decimal, try creating an array of digits for each place (units, tens, hundreds) and build some rollover code. That will make summating the numbers far easier.
Alternatively, you could recognise the behaviour of the core self function (let's call it s):
s = n + f(b,n)
where f(b,n) is the sum of the digits of the number n in base b.
For base 10, it's clear that as the ones (also known as least significant) digit moves from 0,1,2,...,9, that n and f(b,n) proceed in lockstep as you move from n to n+1, it's only that 10% of the time that 9 rolls to 0 that it doesnt, so:
f(b,n+1) = f(b,n) + 1 // 90% of the time
thus the core self function s advances as
n+1 + f(b,n+1) = n + 1 + f(b,n) + 1 = n + f(b,n) + 2
s(n+1) = s(n) + 2 // again, 90% of the time
In the remaining (and easily identifiable) 10% of the time, the 9 rolls back to zero and adds one to the next digit, which in the simplest case subtracts (9-1) from the running total, but might cascade up through a series of 9s, to subtract 99-1, 999-1 etc.
So the first optimisation can remove most of the work from 90% of your cycles!
if ((n % 10) != 0)
{
n + f(b,n) = n-1 + f(b,n-1) + 2;
}
or
if ((n % 10) != 0)
{
s = old_s + 2;
}
That should be enough to substantially increase your performance without really changing your algorithm.
If you want more, then work out a simple algorithm for the change between iterations for the remaining 10%.
If you want your output to be fast, it may be worth investigating replacing iostream output with plain old printf() - depends on the rules for winning the competition whether this is important.
Multithread (use different arrays/ranges for every thread). Also, dont use more threads than your number of cpu cores =)
cout or printf within a loop will be slow. If you can remove any prints from a loop you will see significant performance increase.
Since the range is limited (1 to 1000000) the maximum sum of the digits does not exceed 9*6 = 54. This means that to implement the sieve a circular buffer of 54 elements should be perfectly sufficient (and the size of the sieve grows very slowly as the range increases).
You already have a sieve-based solution, but it is based on pre-building the full-length buffer (sieve of 1000000 elements), which is rather inelegant (if not completely unacceptable). The performance of your solution also suffers from non-locality of memory access.
For example, this is a possible very simple implementation
#define N 1000000U
void print_self_numbers(void)
{
#define NMARKS 64U /* make it 64 just in case (and to make division work faster :) */
unsigned char marks[NMARKS] = { 0 };
unsigned i, imark;
for (i = 1, imark = i; i <= N; ++i, imark = (imark + 1) % NMARKS)
{
unsigned digits, sum;
if (!marks[imark])
printf("%u ", i);
else
marks[imark] = 0;
sum = i;
for (digits = i; digits > 0; digits /= 10)
sum += digits % 10;
marks[sum % NMARKS] = 1;
}
}
(I'm not going for the best possible performance in terms of CPU clocks here, just illustrating the key idea with the circular buffer.)
Of course, the range can be easily turned into a parameter of the function, while the size of the curcular buffer can be easily calculated at run-time from the range.
As for "optimizations"... There's no point in trying to optimize the code that contains I/O operations. You won't achieve anything by such optimizations. If you want to analyze the performance of the algorithm itself, you'll have to put the generated numbers into an output array and print them later.
For such simple task, the best option would be to think of alternative algorithms to produce the same result. %10 is not usually considered a fast operation.
Why not use the recurrence relation given on the wikipedia page instead?
That should be blazingly fast.
EDIT: Ignore this .. the recurrence relation generates some but not all of the self numbers.
In fact only very few of them. Thats not particularly clear from thewikipedia page though :(
This may help speed up C++ iostreams output:
cin.tie(0);
ios::sync_with_stdio(false);
Put them in main before you start writing to cout.
I created a CUDA-based solution based on Carl Smotricz's second algorithm. The code to identify Self Numbers itself is extremely fast -- on my machine it executes in ~45 nanoseconds; this is about 150 x faster than Carl Smotricz's algorithm, which ran in 7 milliseconds on my machine.
There is a bottleneck, however, and that seems to be the PCIe interface. It took my code a whopping 43 milliseconds to move the computed data from the graphics card back to RAM. This might be optimizable, and I will look in to this.
Still, 45 nanosedons is pretty darn fast. Scary fast, actually, and I added code to my program which runs Carl Smotricz's algorithm and compares the results for accuracy. The results are accurate. Here is the program output (compiled in VS2008 64-bit, Windows7):
UPDATE
I recompiled this code in release mode with full optimization and using static runtime libraries, with signifigant results. The optimizer seems to have done very well with Carl's algorithm, reducing the runtime from 7 ms to 1 ms. The CUDA implementation sped up as well, from 35 us to 20 us. The memory copy from video card to RAM was unaffected.
Program Output:
Running on device: 'Quadro NVS 295'
Reference Implementation Ran In 15603 ticks (7 ms)
Kernel Executed in 40 ms -- Breakdown:
[kernel] : 35 us (0.09%)
[memcpy] : 40 ms (99.91%)
CUDA Implementation Ran In 111889 ticks (51 ms)
Compute Slots: 1000448 (1954 blocks X 512 threads)
Number of Errors: 0
The code is as follows:
file : main.h
#pragma once
#include <cstdlib>
#include <functional>
typedef std::pair<int*, size_t> sized_ptr;
static sized_ptr make_sized_ptr(int* ptr, size_t size)
{
return make_pair<int*, size_t>(ptr, size);
}
__host__ void ComputeSelfNumbers(sized_ptr hostMem, sized_ptr deviceMemory, unsigned const blocks, unsigned const threads);
inline std::string format_elapsed(double d)
{
char buf[256] = {0};
if( d < 0.00000001 )
{
// show in ps with 4 digits
sprintf(buf, "%0.4f ps", d * 1000000000000.0);
}
else if( d < 0.00001 )
{
// show in ns
sprintf(buf, "%0.0f ns", d * 1000000000.0);
}
else if( d < 0.001 )
{
// show in us
sprintf(buf, "%0.0f us", d * 1000000.0);
}
else if( d < 0.1 )
{
// show in ms
sprintf(buf, "%0.0f ms", d * 1000.0);
}
else if( d <= 60.0 )
{
// show in seconds
sprintf(buf, "%0.2f s", d);
}
else if( d < 3600.0 )
{
// show in min:sec
sprintf(buf, "%01.0f:%02.2f", floor(d/60.0), fmod(d,60.0));
}
// show in h:min:sec
else
sprintf(buf, "%01.0f:%02.0f:%02.2f", floor(d/3600.0), floor(fmod(d,3600.0)/60.0), fmod(d,60.0));
return buf;
}
inline std::string format_pct(double d)
{
char buf[256] = {0};
sprintf(buf, "%.2f", 100.0 * d);
return buf;
}
file: main.cpp
#define _CRT_SECURE_NO_WARNINGS
#include <windows.h>
#include "C:\CUDA\include\cuda_runtime.h"
#include <cstdlib>
#include <iostream>
#include <string>
using namespace std;
#include <cmath>
#include <map>
#include <algorithm>
#include <list>
#include "main.h"
int main()
{
unsigned numVals = 1000000;
int* gold = new int[numVals];
memset(gold, 0, sizeof(int)*numVals);
LARGE_INTEGER li = {0}, li2 = {0};
QueryPerformanceFrequency(&li);
__int64 freq = li.QuadPart;
// get cuda properties...
cudaDeviceProp cdp = {0};
cudaError_t err = cudaGetDeviceProperties(&cdp, 0);
cout << "Running on device: '" << cdp.name << "'" << endl;
// first run the reference implementation
QueryPerformanceCounter(&li);
for( int j6=0, n = 0; j6<10; j6++ )
{
for( int j5=0; j5<10; j5++ )
{
for( int j4=0; j4<10; j4++ )
{
for( int j3=0; j3<10; j3++ )
{
for( int j2=0; j2<10; j2++ )
{
for( int j1=0; j1<10; j1++ )
{
int s = j6 + j5 + j4 + j3 + j2 + j1;
gold[n + s] = 1;
n++;
}
}
}
}
}
}
QueryPerformanceCounter(&li2);
__int64 ticks = li2.QuadPart-li.QuadPart;
cout << "Reference Implementation Ran In " << ticks << " ticks" << " (" << format_elapsed((double)ticks/(double)freq) << ")" << endl;
// now run the cuda version...
unsigned threads = cdp.maxThreadsPerBlock;
unsigned blocks = numVals/threads;
if( numVals%threads ) ++blocks;
unsigned computeSlots = blocks * threads; // this may be != the number of vals since we want 32-thread warps
// allocate device memory for test
int* deviceTest = 0;
err = cudaMalloc(&deviceTest, sizeof(int)*computeSlots);
err = cudaMemset(deviceTest, 0, sizeof(int)*computeSlots);
int* hostTest = new int[numVals]; // the repository for the resulting data on the host
memset(hostTest, 0, sizeof(int)*numVals);
// run the CUDA code...
LARGE_INTEGER li3 = {0}, li4={0};
QueryPerformanceCounter(&li3);
ComputeSelfNumbers(make_sized_ptr(hostTest, numVals), make_sized_ptr(deviceTest, computeSlots), blocks, threads);
QueryPerformanceCounter(&li4);
__int64 ticksCuda = li4.QuadPart-li3.QuadPart;
cout << "CUDA Implementation Ran In " << ticksCuda << " ticks" << " (" << format_elapsed((double)ticksCuda/(double)freq) << ")" << endl;
cout << "Compute Slots: " << computeSlots << " (" << blocks << " blocks X " << threads << " threads)" << endl;
unsigned errorCount = 0;
for( size_t i = 0; i < numVals; ++i )
{
if( gold[i] != hostTest[i] )
{
++errorCount;
}
}
cout << "Number of Errors: " << errorCount << endl;
return 0;
}
file: self.cu
#pragma warning( disable : 4231)
#include <windows.h>
#include <cstdlib>
#include <vector>
#include <iostream>
#include <string>
#include <iomanip>
using namespace std;
#include "main.h"
__global__ void SelfNum(int * slots)
{
__shared__ int N;
N = (blockIdx.x * blockDim.x) + threadIdx.x;
const int numDigits = 10;
__shared__ int digits[numDigits];
for( int i = 0, temp = N; i < numDigits; ++i, temp /= 10 )
{
digits[numDigits-i-1] = temp - 10 * (temp/10) /*temp % 10*/;
}
__shared__ int s;
s = 0;
for( int i = 0; i < numDigits; ++i )
s += digits[i];
slots[N+s] = 1;
}
__host__ void ComputeSelfNumbers(sized_ptr hostMem, sized_ptr deviceMem, const unsigned blocks, const unsigned threads)
{
LARGE_INTEGER li = {0};
QueryPerformanceFrequency(&li);
double freq = (double)li.QuadPart;
LARGE_INTEGER liStart = {0};
QueryPerformanceCounter(&liStart);
// run the kernel
SelfNum<<<blocks, threads>>>(deviceMem.first);
LARGE_INTEGER liKernel = {0};
QueryPerformanceCounter(&liKernel);
cudaMemcpy(hostMem.first, deviceMem.first, hostMem.second*sizeof(int), cudaMemcpyDeviceToHost); // dont copy the overflow - just throw it away
LARGE_INTEGER liMemcpy = {0};
QueryPerformanceCounter(&liMemcpy);
// display performance stats
double e = double(liMemcpy.QuadPart - liStart.QuadPart)/freq,
eKernel = double(liKernel.QuadPart - liStart.QuadPart)/freq,
eMemcpy = double(liMemcpy.QuadPart - liKernel.QuadPart)/freq;
double pKernel = eKernel/e,
pMemcpy = eMemcpy/e;
cout << "Kernel Executed in " << format_elapsed(e) << " -- Breakdown: " << endl
<< " [kernel] : " << format_elapsed(eKernel) << " (" << format_pct(pKernel) << "%)" << endl
<< " [memcpy] : " << format_elapsed(eMemcpy) << " (" << format_pct(pMemcpy) << "%)" << endl;
}
UPDATE2:
I refactored my CUDA implementation to try to speed it up a bit. I did this by unrolling loops manually, fixing some questionable use of __shared__ memory which might have been an error, and getting rid of some redundancy.
The output of my new kernel is:
Reference Implementation Ran In 69610 ticks (5 ms)
Kernel Executed in 2 ms -- Breakdown:
[kernel] : 39 us (1.57%)
[memcpy] : 2 ms (98.43%)
CUDA Implementation Ran In 62970 ticks (4 ms)
Compute Slots: 1000448 (1954 blocks X 512 threads)
Number of Errors: 0
The only code I changed is the kernel itself, so that's all I will post here:
__global__ void SelfNum(int * slots)
{
int N = (blockIdx.x * blockDim.x) + threadIdx.x;
int s = 0;
int temp = N;
s += temp - 10 * (temp/10) /*temp % 10*/;
s += temp - 10 * ((temp/=10)/10) /*temp % 10*/;
s += temp - 10 * ((temp/=10)/10) /*temp % 10*/;
s += temp - 10 * ((temp/=10)/10) /*temp % 10*/;
s += temp - 10 * ((temp/=10)/10) /*temp % 10*/;
s += temp - 10 * ((temp/=10)/10) /*temp % 10*/;
s += temp - 10 * ((temp/=10)/10) /*temp % 10*/;
s += temp - 10 * ((temp/=10)/10) /*temp % 10*/;
s += temp - 10 * ((temp/=10)/10) /*temp % 10*/;
s += temp - 10 * ((temp/=10)/10) /*temp % 10*/;
slots[N+s] = 1;
}
I wonder if multi-threading would help. This algorithm looks like it would lend itself well to multi-threading. (Poor-man's test of this: Create two copies of the program and run them at the same time. If it runs in less than 200% of the time, multi-threading may help).
I was actually surprised that the code below was faster then any other posted here. I probably measured it wrong, but maybe it helps; or at least is interesting.
#include <iostream>
#include <boost/progress.hpp>
class SelfCalc
{
private:
bool array[1000000];
int non_self;
public:
SelfCalc()
{
memset(&array, 0, sizeof(array));
}
void operator()(const int i)
{
if (!(array[i]))
std::cout << i << '\n';
non_self = i + (i%10) + (i/10)%10 + (i/100)%10 + (i/1000)%10 + (i/10000)%10 +(i/100000)%10;
array[non_self] = true;
}
};
class IntIterator
{
private:
int value;
public:
IntIterator(const int _value):value(_value){}
int operator*(){ return value; }
bool operator!=(const IntIterator &v){ return value != v.value; }
int operator++(){ return ++value; }
};
int main()
{
boost::progress_timer t;
SelfCalc selfCalc;
IntIterator i(1), end(100000);
std::for_each(i, end, selfCalc);
std::cout << 100000 << std::endl;
return 0;
}
Fun problem. The problem as stated does not specify what base it must be in. I fiddled around with it some and wrote a base-2 version. It generates an extra few thousand entries because the termination point of 1,000,000 is not as natural with base-2. This pre-counts the number of bits in a byte for a table lookup. The generation of the result set (without the I/O) took 2.4 ms.
One interesting thing (assuming I wrote it correctly) is that the base-2 version has about 250,000 "self numbers" up to 1,000,000 while there are just under 100,000 base-10 self numbers in that range.
#include <windows.h>
#include <stdio.h>
#include <string.h>
void StartTimer( _int64 *pt1 )
{
QueryPerformanceCounter( (LARGE_INTEGER*)pt1 );
}
double StopTimer( _int64 t1 )
{
_int64 t2, ldFreq;
QueryPerformanceCounter( (LARGE_INTEGER*)&t2 );
QueryPerformanceFrequency( (LARGE_INTEGER*)&ldFreq );
return ((double)( t2 - t1 ) / (double)ldFreq) * 1000.0;
}
#define RANGE 1000000
char sn[0x100000 + 32];
int bitCount[256];
// precompute bitcounts for each byte
void PreCountBits()
{
int i;
// generate count of bits in each byte
memset( bitCount, 0, sizeof( bitCount ));
for ( i = 0; i < 256; i++ )
{
int tmp = i;
while ( tmp )
{
if ( tmp & 0x01 )
bitCount[i]++;
tmp >>= 1;
}
}
}
void GenBase2( )
{
int i;
int *b1, *b2, *b3;
int b1sum, b2sum, b3sum;
i = 0;
for ( b1 = bitCount; b1 < bitCount + 256; b1++ )
{
b1sum = *b1;
for ( b2 = bitCount; b2 < bitCount + 256; b2++ )
{
b2sum = b1sum + *b2;
for ( b3 = bitCount; b3 < bitCount + 256; b3++ )
{
sn[i++ + *b3 + b2sum] = 1;
}
}
// 1000000 does not provide a great termination number for base 2. So check
// here. Overshoots the target some but avoids repeated checks
if ( i > RANGE )
return;
}
}
int main( int argc, char* argv[] )
{
int i = 0;
__int64 t1;
memset( sn, 0, sizeof( sn ));
StartTimer( &t1 );
PreCountBits();
GenBase2();
printf( "Generation time = %.3f\n", StopTimer( t1 ));
#if 1
for ( i = 1; i <= RANGE; i++ )
if ( !sn[i] ) printf( "%d\n", i );
#endif
return 0;
}
Maybe try just computing the recurrence relation defined below?
http://en.wikipedia.org/wiki/Self_number