How to launch 2 CUDA kernels concurrently? - concurrency

I tried to create 4 streams to launch 4 kernels concurrently, but according to Nsight they seem to run serially.
My Hardware: RTX2060
My test code is as follows:
#include "cuda_runtime.h"
#include <stdio.h>
#define N 1000000
// Busy-work kernel: adds tan(0.1) * tan(0.1) to a thread-local double N times.
// The accumulated value is never stored anywhere, so the kernel has no
// observable output.
__global__ void kernel_1()
{
    double acc = 0.0;
    int iter = 0;
    while (iter < N) {
        acc = acc + tan(0.1) * tan(0.1);
        ++iter;
    }
}
// Queues kernel_1 four times on each of four non-blocking streams, waits for
// all of the work to finish, then prints "done".
//
// Kernels queued on the SAME stream always run one after another; only work
// placed on DIFFERENT streams is allowed to overlap.
// Returns 0 on success, 1 if any CUDA call fails.
int main()
{
    const int n_streams = 4;
    const int launches_per_stream = 4;
    cudaStream_t streams[n_streams];
    cudaError_t err;

    for (int i = 0; i < n_streams; i++) {
        err = cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaStreamCreateWithFlags failed: %s\n", cudaGetErrorString(err));
            return 1;
        }
    }

    dim3 block(1);
    dim3 grid(1);
    for (int i = 0; i < n_streams; i++) {
        for (int k = 0; k < launches_per_stream; k++) {
            kernel_1<<<grid, block, 0, streams[i]>>>();
        }
    }

    // Launch-configuration errors are only visible through the last-error state.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Launches are asynchronous: without this wait the process could exit
    // before the queued kernels have run.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    for (int i = 0; i < n_streams; i++) {
        cudaStreamDestroy(streams[i]);
    }

    printf("done\n");
    return 0;
}
timeline shows as this:
kernel running timeline screenshot

Related

how do i execute both cufftXt and CUDA kernels on multiple GPUs?

I would like to use two GPUs to execute a kernel then execute a single FFT using cufftXt. The data could be several GBs in size.
My understanding of allocating memory for kernels on 2 GPUs is that you should split the host array in half and send the first half to GPU0 and the other half to GPU1. The following example shows how this could be done.
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
// Wraps a CUDA runtime call: forwards its status together with the call
// site's file name and line number to gpuAssert.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Does nothing when `code` is cudaSuccess. Otherwise prints the error string
// with `file` and `line` to stderr and, unless `abort` is false, terminates
// the process using `code` as the exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (!abort)
        return;
    exit(code);
}
// Element-wise cube of a padded real array.
//
// One thread per element (1D grid, 1D blocks). Index i is treated as lying in
// a row of N + 2 entries: the first N entries of each row are cubed, the
// trailing 2 entries are written as 0.
//   data      - input, at least real_size elements
//   data3     - output, at least real_size elements
//   N         - number of payload entries per row
//   real_size - number of elements this launch is responsible for
// No shared memory is used, so no block barrier is needed.
__global__ void Cube (cufftReal *data, cufftReal *data3, int N, int real_size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= real_size)
        return;

    // Integer column index inside the padded row (no float round trip).
    int col = i % (N + 2);
    data3[i] = (col < N) ? powf(data[i], 3.0f) : 0.0f;
}
// Splits a padded N x (N+2) real array in half, cubes one half on GPU 0 and
// the other on GPU 1 with the Cube kernel, and prints both result halves.
// Both kernels are queued before either device is waited on, so the two GPUs
// can work at the same time.
int main (int argc, char **argv) {
    const int N = 8;
    const int cplx_size = N * (N/2 + 1);
    const int real_size = 2 * cplx_size;
    const int mem_size = sizeof(cufftReal)*real_size;
    const int half_real_size = real_size/2;
    const int half_mem_size = mem_size/2;

    cufftReal *h_data = (cufftReal*)malloc(mem_size);
    cufftReal *h0_data = (cufftReal*)malloc(half_mem_size);
    cufftReal *h0_data3 = (cufftReal*)malloc(half_mem_size);
    cufftReal *h1_data = (cufftReal*)malloc(half_mem_size);
    cufftReal *h1_data3 = (cufftReal*)malloc(half_mem_size);
    if (!h_data || !h0_data || !h0_data3 || !h1_data || !h1_data3) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Payload entries are 2, the two padding entries of every row are 0.
    for(int i=0; i<real_size; i++){
        int x = (i % (N+2));
        h_data[i] = (x < N) ? 2 : 0;
    }
    for(int i=0; i<half_real_size; i++){
        h0_data[i] = h_data[i];
        h1_data[i] = h_data[i+half_real_size];
    }

    cufftReal *d0_data;
    cufftReal *d0_data3;
    cufftReal *d1_data;
    cufftReal *d1_data3;
    gpuErrchk(cudaSetDevice(0));
    gpuErrchk(cudaMalloc((void**)&d0_data, half_mem_size));
    gpuErrchk(cudaMalloc((void**)&d0_data3, half_mem_size));
    gpuErrchk(cudaSetDevice(1));
    gpuErrchk(cudaMalloc((void**)&d1_data, half_mem_size));
    gpuErrchk(cudaMalloc((void**)&d1_data3, half_mem_size));
    cout <<"device memory allocated" <<endl;

    int threadsPerBlock = (N>1024)?1024:N;
    // Round up so a size that is not a multiple of the block size is still
    // fully covered; the kernel bounds-checks the tail.
    int numBlocks = (half_real_size + threadsPerBlock - 1)/threadsPerBlock;

    gpuErrchk(cudaSetDevice(0));
    gpuErrchk(cudaMemcpy(d0_data, h0_data, half_mem_size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaSetDevice(1));
    gpuErrchk(cudaMemcpy(d1_data, h1_data, half_mem_size, cudaMemcpyHostToDevice));
    cout <<"mem copied to devices" <<endl;

    // Queue work on both devices first ...
    gpuErrchk(cudaSetDevice(0));
    Cube <<<numBlocks, threadsPerBlock>>> (d0_data, d0_data3, N, half_real_size);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk(cudaSetDevice(1));
    Cube <<<numBlocks, threadsPerBlock>>> (d1_data, d1_data3, N, half_real_size);
    gpuErrchk( cudaPeekAtLastError() );

    // ... then wait for each of them.
    gpuErrchk(cudaSetDevice(0));
    gpuErrchk( cudaDeviceSynchronize() );
    gpuErrchk(cudaSetDevice(1));
    gpuErrchk( cudaDeviceSynchronize() );

    gpuErrchk(cudaSetDevice(0));
    gpuErrchk(cudaMemcpy(h0_data3, d0_data3, half_mem_size, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaSetDevice(1));
    gpuErrchk(cudaMemcpy(h1_data3, d1_data3, half_mem_size, cudaMemcpyDeviceToHost));

    cout <<endl;
    for(int i = 0; i<half_real_size; i++){
        cout <<h0_data3[i] <<" ";
    }
    cout <<endl;
    for(int i = 0; i<half_real_size; i++){
        cout <<h1_data3[i] <<" ";
    }

    //clean up
    cudaSetDevice(0);
    cudaFree(d0_data);
    cudaFree(d0_data3);
    cudaSetDevice(1);
    cudaFree(d1_data);
    cudaFree(d1_data3);
    free(h_data);
    free(h0_data);
    free(h0_data3);
    free(h1_data);
    free(h1_data3);
    return 0;
}
However, I do not see how this approach is compatible with cufftXt. It appears that I should use the helper function cufftXtMemcpy to automatically split up the data onto the devices. But if I do that, then the multi-gpu kernel method shown above is not useable unless I allocate separate device memory for cufftXt and kernels. Is there any way to run both cufftXt and kernels without doubly allocating device memory?
Here is how I did it, following the simpleCUFFT_2d_MGPU code sample from the toolkit. I am not sure if it is completely correct. It is 50% slower on 2 GPUs than it was using only 1. I tested this code (versus another code using R2C and C2R FFTs) on Tesla K40 GPUs.
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
// Element-wise cube of the real part of a complex array; the imaginary part of
// the output is set to 0.
//
// One thread per element (1D grid, 1D blocks); threads with index >= n do
// nothing. N and nGPUs are accepted for signature compatibility but are not
// used. No shared memory is used, so no block barrier is needed.
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int N, int n, int nGPUs) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n)
        return;

    data3[i].x = powf(data[i].x, 3.0f);
    data3[i].y = 0;
}
// Rescales one GPU's share of a complex array by 1/n, where n is the TOTAL
// number of elements of the transform.
//
// One thread per element (1D grid, 1D blocks).
//   data  - this device's portion of the array
//   N     - unused, kept for signature compatibility
//   n     - total element count across all devices (the divisor)
//   nGPUs - number of devices the array is split over
// NOTE(review): assumes the array is split evenly, n / nGPUs elements per
// device — confirm against the library's actual per-device sizes.
__global__ void Normalize (cufftComplex *data, int N, int n, int nGPUs){
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Only this device's share may be touched; bounding by the global n would
    // run past the local buffer when more than one GPU is used.
    int devices = (nGPUs > 0) ? nGPUs : 1;
    int local_n = n / devices;
    if (i < local_n){
        // Both components are scaled by the unnormalised transform pair.
        data[i].x /= n;
        data[i].y /= n;
    }
}
// Benchmarks repeated forward + inverse 2D complex FFTs distributed over all
// visible GPUs with cufftXt, running the Normalize kernel on every device's
// share after each round trip. Prints the elapsed whole seconds.
// Returns 0 on success (or on an FFT execution failure, as before) and 1 on
// setup failures.
int main (int argc, char **argv) {
    int x, y;
    int N = 8192;
    int n = N*N;
    //int cplx_size = N * (N/2 + 1);
    //int real_size = 2 * cplx_size;
    int mem_size = sizeof(cufftComplex)*n;
    int maxThreads=(N>1024)?1024:N;
    int threadsPerBlock = maxThreads;

    int nGPUs = 0;
    if (cudaGetDeviceCount(&nGPUs) != cudaSuccess || nGPUs < 1){
        cout <<"no CUDA devices found" <<endl;
        return 1;
    }
    cout <<nGPUs <<" CUDA devices" <<endl;

    // Each device only holds its share of the array, so size the launch for
    // that share (rounded up), not for the whole array.
    // NOTE(review): assumes an even split of n over the devices — confirm.
    int perGPU = n / nGPUs;
    int numBlocks = (perGPU + threadsPerBlock - 1)/threadsPerBlock;
    cout <<"numBlocks " <<numBlocks <<endl;

    cufftComplex *h_data = (cufftComplex*)malloc(mem_size);
    if (h_data == NULL){cout <<"host malloc failed" <<endl; return 1;}
    cout <<"host data allocated" <<endl;

    float lambda = N*.1;
    for(y=0; y<N; y++){
        for(x=0; x<N; x++){
            int index = x + y*N;
            h_data[index].x = cos(2*M_PI*(x+y)/lambda);
            h_data[index].y = 0;
        }
    }
    cout <<"host data values set" <<endl;

    cufftResult res;
    int device;
    size_t total_mem, free_mem;
    for(int i=0; i<nGPUs; i++){
        // cudaMemGetInfo reports on the CURRENT device, so select it first.
        cudaSetDevice(i);
        cudaMemGetInfo(&free_mem, &total_mem);
        cout <<"GPU" <<i <<" used memory " <<(total_mem-free_mem)/pow(10,9) <<endl;
    }

    // Heap arrays instead of a variable-length array (not standard C++).
    int *whichGPUs = (int*)malloc(sizeof(int) * nGPUs);
    size_t *worksize = (size_t*)malloc(sizeof(size_t) * nGPUs);
    if (whichGPUs == NULL || worksize == NULL){cout <<"host malloc failed" <<endl; return 1;}
    for(int i=0; i<nGPUs; i++){
        whichGPUs[i]=i;
    }
    cout <<"whichgpus set" <<endl;
    cout <<"worksize set" <<endl;

    cufftHandle plan_complex;
    res = cufftCreate(&plan_complex);
    if (res != CUFFT_SUCCESS){cout <<"create plan failed" <<endl; return 1;}
    res = cufftXtSetGPUs(plan_complex, nGPUs, whichGPUs);
    if (res != CUFFT_SUCCESS){cout <<"setgpus forward failed" <<endl; return 1;}
    cout <<"set gpus" <<endl;
    res = cufftMakePlan2d(plan_complex, N, N, CUFFT_C2C, worksize);
    if (res != CUFFT_SUCCESS){cout <<"make plan forward failed" <<endl; return 1;}
    cout <<"plan created" <<endl;

    cudaLibXtDesc *d_data;
    cudaLibXtDesc *d_data3;
    res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data, CUFFT_XT_FORMAT_INPLACE);
    if (res != CUFFT_SUCCESS){cout <<"data malloc failed" <<endl; return 1;}
    res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data3, CUFFT_XT_FORMAT_INPLACE);
    if (res != CUFFT_SUCCESS){cout <<"data3 malloc failed" <<endl; return 1;}
    cout <<"xtmalloc done" <<endl;

    res = cufftXtMemcpy (plan_complex, d_data, h_data, CUFFT_COPY_HOST_TO_DEVICE);
    if (res != CUFFT_SUCCESS){cout <<"memcpy to device failed" <<endl; return 1;}
    cout <<"memcpy h to d" <<endl;

    int tmax = 10000;
    int start = time(0);
    for(int tau=0; tau<tmax; tau++){
        res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_FORWARD);
        if (res != CUFFT_SUCCESS){cout <<"cufftXtExec failed" <<endl; return 0;}
        res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_INVERSE);
        if (res != CUFFT_SUCCESS){cout <<"cufftXtExec failed" <<endl; return 0;}
        for(int i=0; i<nGPUs; i++){
            device = d_data->descriptor->GPUs[i];
            cudaSetDevice(device);
            Normalize <<<numBlocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], N, n, nGPUs);
        }
        // cudaDeviceSynchronize only waits for the current device, so every
        // participating device has to be visited.
        for(int i=0; i<nGPUs; i++){
            cudaSetDevice(d_data->descriptor->GPUs[i]);
            cudaDeviceSynchronize();
        }
    }
    int stop = time(0);
    cout <<tmax <<" timesteps" <<endl <<(stop-start) <<" seconds"<<endl;
    /*
    for(int i=0; i<nGPUs; i++){
    device = d_data->descriptor->GPUs[i];
    cudaSetDevice(device);
    Cube <<<numBlocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], (cufftComplex*) d_data3->descriptor->data[i], N, real_size);
    }
    */
    /*
    cudaDeviceSynchronize();
    res = cufftXtMemcpy (plan_complex, h_data, d_data, CUFFT_COPY_DEVICE_TO_HOST);
    if (res != CUFFT_SUCCESS){cout <<"memcpy to host failed" <<endl;}
    cout <<"memcpy d to h" <<endl;
    ofstream fout;
    ostringstream outstr;
    outstr.precision(4);
    outstr <<time(0) <<".dat";
    string filename=outstr.str();
    fout.open(filename.c_str());
    fout.precision(4);
    for (int i = 0; i < n; i++) {
    x = (i % (N));
    y = (i /(N))%N;
    fout <<x <<" " <<y <<" " <<h_data[i].x <<endl;
    }
    fout.close();
    */
    //clean up
    res = cufftXtFree(d_data);
    if (res != CUFFT_SUCCESS){cout <<"free data failed" <<endl;}
    res = cufftXtFree(d_data3);
    if (res != CUFFT_SUCCESS){cout <<"free data3 failed" <<endl;}
    cufftDestroy(plan_complex);
    free(worksize);
    free(whichGPUs);
    free(h_data);
    return 0;
}

Difference in speed between GSL and MKL

I have two codes that are both working, yet I cannot figure out why one is so much faster than the other. To my knowledge, BLAS with MKL (Intel) should be much faster than GSL (GNU), although my code is showing quite the opposite. Here are the codes themselves where I am simply creating 2 matrices at the master node and then sending different rows to different "slave" processors (with OpenMPI) which compute the final matrices elements and then return them back to the master node.
GSL example (the fast code):
#include <iostream>
#include <stdio.h>
#include <iostream>
#include <cmath>
#include <mpi.h>
#include <gsl/gsl_blas.h>
using namespace std;
int main(int argc, char** argv){
int noprocs, nid;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &nid);
MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
int master = 0;
const int nsame = 1000; //must be same if matrices multiplied together = acols = brows
const int arows = 1000;
const int bcols = 1000;
int rowsent;
double * buff;
buff = new double [nsame];
double * b;
b = new double [nsame*bcols];
double** c = new double*[arows];
for(int i = 0; i < arows; ++i)
c[i] = new double[bcols];
double * CC;
CC = new double [1*bcols]; //here ncols corresponds to numbers of rows for matrix b
for (int i = 0; i < bcols; i++){
CC[i] = 0.;
}; //this is imply a 1-d array of zeros which will be updated and passed by processors
// Master part
if (nid == master ) {
double** a = new double*[arows];
for(int i = 0; i < arows; ++i){
a[i] = new double[nsame];}
for (int i = 0; i < arows; i++){
for (int j = 0; j < nsame; j++){
if (i == j)
a[i][j] = 1.;
else
a[i][j] = 0.;
}
}
for (int i = 0; i < (nsame*bcols); i++){
b[i] = (10.*i + 3.)/(3.*i - 2.) ;
}
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); //assumes stored as contguous block of code
// send one row to each slave tagged with row number, assume nprocs<nrows
rowsent=0;
for (int i=1; i < (noprocs); i++) { //must be equal to noprocs otherwise it will not send to 3
MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
rowsent++;
}
for (int i=0; i<arows; i++) {
MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
MPI_COMM_WORLD, &status);
int sender = status.MPI_SOURCE;
int anstype = status.MPI_TAG; //row number+1
int IND_I = 0;
while (IND_I < bcols){
c[anstype - 1][IND_I] = CC[IND_I];
IND_I++;
}
if (rowsent < arows) {
MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
rowsent++;
}
else { // tell sender no more work to do via a 0 TAG
MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
}
}
}
// Slave part
else {
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
while(status.MPI_TAG != 0) {
int crow = status.MPI_TAG;
gsl_matrix_view AAAA = gsl_matrix_view_array(buff, 1, nsame);
gsl_matrix_view BBBB = gsl_matrix_view_array(b, nsame, bcols);
gsl_matrix_view CCCC = gsl_matrix_view_array(CC, 1, bcols);
/* Compute C = A B */
gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, &AAAA.matrix, &BBBB.matrix,
0.0, &CCCC.matrix);
MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
// cout << ans << " OUTPUT \n";
}
}
MPI_Finalize();
return 0;
};
MKL example (the slow code):
#include <iostream>
#include <stdio.h>
#include <iostream>
#include <cmath>
#include <mpi.h>
#include </opt/intel/compilers_and_libraries_2017.1.126/mac/mkl/include/mkl.h>
using namespace std;
int main(int argc, char** argv){ //THE IDENTITY MATRIX ONLY WORKS IF arows = nsame!
int noprocs, nid;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &nid);
MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
int master = 0;
const int nsame = 1000;
const int arows = 1000;
const int bcols = 1000;
int rowsent;
double * buff;
buff = new double [nsame];
double * b;
b = new double [nsame*bcols];
double** c = new double*[arows];
for(int i = 0; i < arows; ++i)
c[i] = new double[bcols];
double * CC;
CC = new double [1*bcols];
for (int i = 0; i < bcols; i++){
CC[i] = 0.;
};
// Master part
if (nid == master ) {
double** a = new double*[arows];
for(int i = 0; i < arows; ++i){
a[i] = new double[nsame];}
for (int i = 0; i < arows; i++){
for (int j = 0; j < nsame; j++){
if (i == j)
a[i][j] = 1.;
else
a[i][j] = 0.;
}
}
for (int i = 0; i < (nsame*bcols); i++){
b[i] = (10.*i + 3.)/(3.*i - 2.) ; // = 1.*i as test value
}
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); //assumes stored as contguous block of code nprocs<nrows
delete[] b;
rowsent=0;
for (int i=1; i < (noprocs); i++) { //must be equal to noprocs otherwise it will not send to 3
MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
delete[] a[rowsent];
rowsent++;
}
for (int i=0; i<arows; i++) {
MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
MPI_COMM_WORLD, &status);
int sender = status.MPI_SOURCE;
int anstype = status.MPI_TAG; //row number+1
int IND_I = 0;
while (IND_I < bcols){
c[anstype - 1][IND_I] = CC[IND_I];
IND_I++;
}
if (rowsent < arows) {
MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
delete[] a[rowsent];
rowsent++;
}
else { // tell sender no more work to do via a 0 TAG
MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
}
}
}
// Slave part
else {
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
while(status.MPI_TAG != 0) {
int crow = status.MPI_TAG;
/* Compute C = A B */
cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, bcols, nsame, 1.0, buff, nsame, b, bcols,
0.0, CC, bcols);
MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
}
}
MPI_Finalize();
return 0;
};
I was thinking it might be due to me not deleting any of the new elements created, although I use essentially the same approach to initialize the arrays in both codes. I even tried deleting values in the MKL code (as shown) yet this appears to not have much of an effect. When I increase the size of the arrays from nsame = arows = bcols = 1000 to nsame = arows = bcols = 10000, the time differences in the two codes can readily be observed (the GSL code takes approximately 45 seconds while the MKL code takes quite a few minutes). Thus I am wondering if this is simply inherent to the way GSL and MKL are designed and incorporated in my code or if there is perhaps something else more subtle going on.

C++ trying to improve performance of pthread program

i need help with improving speed of my multithread program in c++ using pthreads.
// Placeholder: the body is elided in this excerpt, so nothing is returned
// here. Takes a matrix A (vector of rows) and a vector B, both by value.
// NOTE(review): presumably solves the linear system A x = B — confirm against
// the full source.
std::vector<double> solve_progon(std::vector<std::vector<double> > A, std::vector <double> B) {
// solving
}
// Placeholder: the body is elided in this excerpt, so nothing is returned
// here. calc() passes three neighbouring entries of phi (previous, current,
// next) plus a thread-local index j. All vectors are taken by value, i.e.
// copied on every call.
std::vector<double> solve(std::vector<double> left, std::vector<double> mid, std::vector<double> right, int j) {
//solving
}
// Thread entry point: advances this thread's slice of phi in time until
// cur_time reaches T.
//
// `thread` carries the thread number (cast from long). The slice is
// [start_index, end_index]; the last thread also takes the remainder of
// X_SIZE / THREADS. Each step has two phases separated by barriers:
//   1. compute the new values for the slice from the current phi (read-only),
//   2. after every thread has finished reading, write the slice back.
// NOTE(review): phi[i - 1] / phi[i + 1] are read for the first and last index
// of the whole range as well — presumably phi has boundary entries; confirm.
void * calc(void *thread) {
    long t = (long) thread;
    int start_index = t * (X_SIZE / THREADS);
    int end_index = (t != THREADS - 1)?(t + 1) * (X_SIZE / THREADS) - 1: X_SIZE - 1;

    // Results of the current step; sized once so push_back never reallocates.
    std::vector<std::vector<double> > next;
    const int slice = end_index - start_index + 1;
    if (slice > 0) {
        next.reserve(slice);
    }

    double cur_time = 0;
    while (cur_time < T) {
        for (int i = start_index; i <= end_index; i ++) {
            next.push_back(solve(phi[i - 1], phi[i], phi[i + 1], i - start_index));
        }
        cur_time += dt;
        // Nobody may overwrite phi until every thread has finished reading it.
        pthread_barrier_wait(&bar);
        for (int i = start_index; i <=end_index; i++) {
            phi[i] = next[i - start_index];
        }
        next.clear();
        // Nobody may start the next step until every slice has been written.
        pthread_barrier_wait(&syn);
    }
    pthread_exit(NULL);
}
// Starts THREADS worker threads running calc(), waits for all of them and
// prints the elapsed time. Returns 1 if a worker cannot be created.
int main(int argc, char **argv) {
    //Some init
    pthread_barrier_init(&bar, NULL, THREADS);
    pthread_barrier_init(&syn, NULL, THREADS);
    pthread_t *threads = new pthread_t[THREADS];
    unsigned long long start = clock_time();
    for (long i = 0; i < THREADS; i++) {
        if (pthread_create(&threads[i], NULL, calc, (void *)i) != 0) {
            std::cout << "Can't create thread " << i << std::endl;
            // The barriers expect exactly THREADS participants, so the
            // workers already running could never get past them; joining an
            // uncreated thread would also be invalid. Give up instead.
            return 1;
        }
    }
    for (int i = 0; i < THREADS; i++) {
        pthread_join(threads[i], NULL);
    }
    std::cout << "It takes " << (double)(clock_time() - start) / 1e9 << std::endl;
    pthread_barrier_destroy(&bar);
    pthread_barrier_destroy(&syn);
    delete[] threads;
    return 0;
}
Full version at https://github.com/minaevmike/fedlab_pthread/blob/master/main.cpp
So, for example, with 4 threads the calculation takes 118.288 sec, while with 1 thread it takes 101.993 sec. How can I improve the speed? Thank you.

Using popcnt on the GPU

I need to compute
(a & b).count()
over a large set (> 10000) bit vectors (std::bitset<N>) where N is anywhere from 2 ^ 10 to 2 ^16.
// Sketch: pairwise popcount of (bits[i] & bits[j]) over all loaded bit vectors.
const size_t N = 2048;
std::vector<std::vector<char>> distances;
std::vector<std::bitset<N>> bits(100000);
load_from_file(bits);
// NOTE(review): `distances` is never sized in this fragment, and a count can
// be as large as N, which does not fit in a char — both need attention before
// this is used for real.
for(size_t i = 0; i < bits.size(); i++){
    for(size_t j = 0; j < bits.size(); j++){
        distances[i][j] = (bits[i] & bits[j]).count();
    }
}
Currently I'm relying on chunked multithreading and SSE/AVX to compute distances. Luckily I can use vpand from AVX to compute the & but my code is still using popcnt (%rax) and a loop to compute the bit counts.
Is there a way I can compute the (a & b).count() function on my GPU (nVidia 760m)? Ideally I would just pass 2 chunks of memory of N bits. I was looking at using thrust but I couldn't find a popcnt function.
EDIT:
Current CPU implementation.
// Runs validate_N(i, K) for every example index on a thread pool and returns
// the fraction of calls that returned true.
double validate_pooled(const size_t K) const{
    const size_t num_examples = labels.size();
    threadpool tp;

    // Queue one task per example.
    std::vector<std::future<bool>> pending;
    size_t idx = 0;
    while (idx < num_examples) {
        pending.push_back(tp.enqueue(&kNN<N>::validate_N, this, idx, K));
        ++idx;
    }

    // Collect the outcomes in submission order.
    int right = 0;
    for (auto it = pending.begin(); it != pending.end(); ++it) {
        if (it->get()) {
            ++right;
        }
    }
    return right / (double) num_examples;
}
// Classifies example `cmp` by majority vote among its `n` best matches and
// reports whether the vote equals the example's own label.
//
// The score of example i is the number of bits set in both bits[cmp] and
// bits[i]; `cmp` itself is excluded. Returns false when no neighbour could be
// selected (n == 0 or no other examples).
bool validate_N(const size_t cmp, const size_t n) const{
    const size_t num_examples = labels.size();

    // int rather than char: a count can be as large as the bitset width,
    // which overflows a char. -1 marks "excluded / already used".
    std::vector<int> dists(num_examples, -1);
    for(size_t i = 0; i < num_examples; i++){
        if(i == cmp) continue;
        dists[i] = static_cast<int>((bits[cmp] & bits[i]).count());
    }

    typedef std::unordered_map<std::string,size_t> counter;
    counter counts;
    for(size_t i = 0; i < n; i++){
        auto iter = std::max_element(dists.cbegin(), dists.cend());
        // Stop once only excluded entries remain, instead of voting with
        // `cmp` itself or with an entry that was already counted.
        if(iter == dists.cend() || *iter < 0) break;
        size_t idx = std::distance(dists.cbegin(), iter);
        dists[idx] = -1; // Remove the top result.
        counts[labels[idx]] += 1;
    }

    // max_element on an empty map returns end(), which must not be dereferenced.
    if(counts.empty()) return false;

    auto iter = std::max_element(counts.cbegin(), counts.cend(),
        [](const counter::value_type& a, const counter::value_type& b){ return a.second < b.second; });
    return labels[cmp] == iter->first;
}
EDIT:
This is what I've come up with. However, it's brutally slow. I'm not sure if I'm doing something wrong.
// Functor for thrust::transform_reduce: for a bitset x, returns the number of
// bits that are set in both x and the bitset captured at construction.
template<size_t N>
struct popl
{
typedef unsigned long word_type;
// Bitset every argument is AND-ed with; copied in by the constructor.
std::bitset<N> _cmp;
popl(const std::bitset<N>& cmp) : _cmp(cmp) {}
// Sums __popcll over N/64 words of (x & _cmp).
// NOTE(review): _M_w looks like an implementation-internal member of
// std::bitset — confirm it is accessible and laid out as 64-bit words with
// the toolchain in use.
__device__
int operator()(const std::bitset<N>& x) const
{
int pop_total = 0;
#pragma unroll
for(size_t i = 0; i < N/64; i++)
pop_total += __popcll(x._M_w[i] & _cmp._M_w[i]);
return pop_total;
}
};
// For every loaded bitset, computes the maximum popcount of (bitset & other)
// over all bitsets on the device and stores the per-bitset maxima in r_vec.
int main(void) {
    const size_t N = 2048;
    thrust::host_vector<std::bitset<N> > h_vec;
    load_bits(h_vec);
    thrust::device_vector<std::bitset<N> > d_vec = h_vec;

    // Collect results on the host and upload them once: assigning to a single
    // device_vector element, or reading d_vec[i], is a separate host<->device
    // transfer on every iteration.
    thrust::host_vector<int> h_res(h_vec.size(), 0);
    for(size_t i = 0; i < h_vec.size(); i++){
        h_res[i] = thrust::transform_reduce(d_vec.cbegin(), d_vec.cend(), popl<N>(h_vec[i]), 0, thrust::maximum<int>());
    }
    thrust::device_vector<int> r_vec = h_res;
    return 0;
}
CUDA has population count intrinsics for both 32-bit and 64-bit types. (__popc() and __popcll())
These could be used directly in a CUDA kernel or via thrust (in a functor) perhaps passed to thrust::transform_reduce.
If that is the only function you want to do on the GPU, it may be difficult to get a net "win" because of the "cost" of transferring data to/from the GPU. Your overall input data set appears to be about 1GB in size (100000 vectors of bit length 65536), but the output data set appears to be 10-40GB in size based on my calculations (100000 * 100000 * 1-4 bytes per result).
Either the CUDA kernel or the thrust function and data layout should be crafted carefully with the objective of having the code run limited only by memory bandwidth. The cost of data transfer could also be mitigated, perhaps to a large extent, by overlap of copy and compute operations, mainly on the output data set.
At first glance, this problem appears to be somewhat similar to the problem of computing euclidean distances among sets of vectors, so this question/answer may be of interest, from a CUDA perspective.
EDIT: adding some code that I used to investigate this. I am able to get a significant speedup (~25x including data copy time) over a naive single-threaded CPU implementation, but I don't know how fast the CPU version would be using "chunked multithreading and SSE/AVX ", so it would be interesting to see more of your implementation or get some performance numbers. I also don't think the CUDA code I have here is highly optimized, it's just a "first cut".
In this case, for proof-of-concept, I focused on a small problem size, N=2048, 10000 bitsets. For this small problem size, I can fit enough of the vector of bitsets in shared memory, for a "small" threadblock size, to take advantage of shared memory. So this particular approach would have to be modified for larger N.
$ cat t581.cu
#include <iostream>
#include <vector>
#include <bitset>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define nTPB 128
#define OUT_CHUNK 250
#define N_bits 2048
#define N_vecs 10000
const size_t N = N_bits;
// Computes popcount(v[idx] & v[vidx]) for one vector idx per thread against
// every vector vidx in [start_idx, end_idx).
//
// Layout: 1D grid of 1D blocks; sdata is indexed with a stride of nTPB.
// `in` stores word w of vector k at in[w*numvecs + k]; `out` receives the
// result for (vidx, idx) at out[(vidx-start_idx)*numvecs + idx].
// Static shared memory: (N/32)*nTPB unsigned words holding each thread's own
// vector.
__global__ void comp_dist(unsigned *in, unsigned *out, unsigned numvecs, unsigned start_idx, unsigned end_idx){
    __shared__ unsigned sdata[(N/32)*nTPB];
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    const bool in_range = idx < numvecs;

    // Stage this thread's vector in shared memory.
    if (in_range) {
        for (int w = 0; w < (N/32); ++w) {
            sdata[(w*nTPB)+threadIdx.x] = in[(w*numvecs)+idx];
        }
    }
    // Reached by every thread of the block, in range or not.
    __syncthreads();

    if (!in_range) {
        return;
    }
    for (int vidx = start_idx; vidx < end_idx; ++vidx) {
        unsigned sum = 0;
        for (int w = 0; w < N/32; ++w) {
            sum += __popc(sdata[(w*nTPB)+ threadIdx.x] & in[(w*numvecs)+vidx]);
        }
        out[((vidx-start_idx)*numvecs)+idx] = sum;
    }
}
void cpu_test(std::vector<std::bitset<N> > &in, std::vector<std::vector<unsigned> > &out){
for (int i=0; i < in.size(); i++)
for (int j=0; j< in.size(); j++)
out[i][j] = (in[i] & in[j]).count();
}
// Compares one chunk of GPU results against the CPU reference.
// d1 holds OUT_CHUNK rows of N_vecs values for reference rows
// start_idx .. start_idx+OUT_CHUNK-1 of d2. Prints the first mismatch and
// returns 1; returns 0 when the whole chunk matches.
int check_data(unsigned *d1, unsigned start_idx, std::vector<std::vector<unsigned> > &d2){
    for (unsigned r = 0; r < OUT_CHUNK; ++r) {
        const unsigned i = start_idx + r;
        for (unsigned j = 0; j < N_vecs; ++j) {
            const unsigned got = d1[(r*N_vecs)+j];
            const unsigned want = d2[i][j];
            if (got == want) {
                continue;
            }
            std::cout << "mismatch at " << i << "," << j << " was: " << got << " should be: " << want << std::endl;
            return 1;
        }
    }
    return 0;
}
// Returns the current gettimeofday() time as a single microsecond count.
unsigned long long get_time_usec(){
    timeval now;
    gettimeofday(&now, 0);
    const unsigned long long secs = (unsigned long long)now.tv_sec;
    const unsigned long long usecs = (unsigned long long)now.tv_usec;
    return (unsigned long long)(secs*1000000ULL + usecs);
}
// Generates N_vecs random N-bit vectors, computes all pairwise AND-popcounts
// on the CPU (reference) and on the GPU in chunks of OUT_CHUNK rows
// alternating between two streams, verifies every GPU chunk and prints both
// timings. Returns 1 on a mismatch or on a CUDA setup failure.
int main(){
    unsigned long long t1, t2;
    std::vector<std::vector<unsigned> > distances;
    std::vector<std::bitset<N> > bits;
    for (int i = 0; i < N_vecs; i++){
        std::vector<unsigned> dist_row(N_vecs, 0);
        distances.push_back(dist_row);
        std::bitset<N> data;
        for (size_t j = 0; j < N; j++) data[j] = rand() & 1;
        bits.push_back(data);}
    t1 = get_time_usec();
    cpu_test(bits, distances);
    t1 = get_time_usec() - t1;

    const size_t in_bytes = (N/32)*N_vecs*sizeof(unsigned);
    const size_t out_bytes = (size_t)N_vecs*OUT_CHUNK*sizeof(unsigned);

    // Word-major layout: word w of vector i lives at h_data[i + w*N_vecs].
    // The trailing () zero-initialises the array.
    unsigned *h_data = new unsigned[(N/32)*N_vecs]();
    for (int i = 0; i < N_vecs; i++)
        for (size_t j = 0; j < N; j++)
            if (bits[i][j]) h_data[(i)+((j/32)*N_vecs)] |= 1U<<(31-(j&31));

    unsigned *d_in, *d_out1, *d_out2, *h_out1, *h_out2;
    if (cudaMalloc(&d_in, in_bytes) != cudaSuccess ||
        cudaMalloc(&d_out1, out_bytes) != cudaSuccess ||
        cudaMalloc(&d_out2, out_bytes) != cudaSuccess) {
        std::cout << "cudaMalloc failed" << std::endl;
        return 1;
    }
    cudaStream_t stream1, stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    // Page-locked host buffers: cudaMemcpyAsync can only overlap with kernel
    // execution when the host side is pinned.
    if (cudaMallocHost((void**)&h_out1, out_bytes) != cudaSuccess ||
        cudaMallocHost((void**)&h_out2, out_bytes) != cudaSuccess) {
        std::cout << "cudaMallocHost failed" << std::endl;
        return 1;
    }

    int last_chunk2 = -1;   // first row of the chunk currently in h_out2
    t2 = get_time_usec();
    cudaMemcpy(d_in, h_data, in_bytes, cudaMemcpyHostToDevice);
    for (int i = 0; i < N_vecs; i += 2*OUT_CHUNK){
        comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream1>>>(d_in, d_out1, N_vecs, i, i+OUT_CHUNK);
        // The previous iteration's stream2 copy must be complete before
        // h_out2 is checked and d_out2 is overwritten.
        cudaStreamSynchronize(stream2);
        if (i > 0) if (check_data(h_out2, i-OUT_CHUNK, distances)) return 1;
        comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream2>>>(d_in, d_out2, N_vecs, i+OUT_CHUNK, i+2*OUT_CHUNK);
        cudaMemcpyAsync(h_out1, d_out1, out_bytes, cudaMemcpyDeviceToHost, stream1);
        cudaMemcpyAsync(h_out2, d_out2, out_bytes, cudaMemcpyDeviceToHost, stream2);
        last_chunk2 = i+OUT_CHUNK;
        cudaStreamSynchronize(stream1);
        if (check_data(h_out1, i, distances)) return 1;
    }
    cudaError_t err = cudaDeviceSynchronize();
    t2 = get_time_usec() - t2;
    if (err != cudaSuccess) {
        std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    // The loop only verifies a stream2 chunk at the start of the NEXT
    // iteration, so the final one still has to be checked here.
    if (last_chunk2 >= 0) if (check_data(h_out2, last_chunk2, distances)) return 1;

    std::cout << "cpu time: " << ((float)t1)/(float)1000 << "ms gpu time: " << ((float) t2)/(float)1000 << "ms" << std::endl;

    cudaFreeHost(h_out1);
    cudaFreeHost(h_out2);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaFree(d_in);
    cudaFree(d_out1);
    cudaFree(d_out2);
    delete[] h_data;
    return 0;
}
$ nvcc -O3 -arch=sm_20 -o t581 t581.cu
$ ./t581
cpu time: 20324.1ms gpu time: 753.76ms
$
CUDA 6.5, Fedora20, Xeon X5560, Quadro5000 (cc2.0) GPU. The above test case includes results verification between the distances data produced on the CPU vs. the GPU. I've also broken this into a chunked algorithm with results data transfer (and verification) overlapped with compute operations, to make it more easily extendable to the case where there is a very large amount of output data (e.g. 100000 bitsets). I haven't actually run this through the profiler yet, however.
EDIT 2: Here's a "windows version" of the code:
#include <iostream>
#include <vector>
#include <bitset>
#include <stdlib.h>
#include <time.h>
#define nTPB 128
#define OUT_CHUNK 250
#define N_bits 2048
#define N_vecs 10000
const size_t N = N_bits;
// Reads (and clears) the last CUDA runtime error. If one is pending, prints
// `msg`, the error string and the call site to stderr and exits with status 1.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t status_ = cudaGetLastError(); \
        if (status_ == cudaSuccess) break; \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(status_), \
                __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)
// Computes popcount(v[idx] & v[vidx]) for one vector idx per thread against
// every vector vidx in [start_idx, end_idx).
//
// Layout: 1D grid of 1D blocks; sdata is indexed with a stride of nTPB.
// `in` stores word w of vector k at in[w*numvecs + k]; `out` receives the
// result for (vidx, idx) at out[(vidx-start_idx)*numvecs + idx].
// Static shared memory: (N/32)*nTPB unsigned words holding each thread's own
// vector.
__global__ void comp_dist(unsigned *in, unsigned *out, unsigned numvecs, unsigned start_idx, unsigned end_idx){
    __shared__ unsigned sdata[(N/32)*nTPB];
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    const bool in_range = idx < numvecs;

    // Stage this thread's vector in shared memory.
    if (in_range) {
        for (int w = 0; w < (N/32); ++w) {
            sdata[(w*nTPB)+threadIdx.x] = in[(w*numvecs)+idx];
        }
    }
    // Reached by every thread of the block, in range or not.
    __syncthreads();

    if (!in_range) {
        return;
    }
    for (int vidx = start_idx; vidx < end_idx; ++vidx) {
        unsigned sum = 0;
        for (int w = 0; w < N/32; ++w) {
            sum += __popc(sdata[(w*nTPB)+ threadIdx.x] & in[(w*numvecs)+vidx]);
        }
        out[((vidx-start_idx)*numvecs)+idx] = sum;
    }
}
// CPU reference: out[i][j] = number of bits set in both in[i] and in[j], for
// every pair of vectors. `out` must already be in.size() x in.size().
void cpu_test(std::vector<std::bitset<N> > &in, std::vector<std::vector<unsigned> > &out){
    const unsigned count = in.size();
    unsigned row = 0;
    while (row < count) {
        const std::bitset<N> &lhs = in[row];
        std::vector<unsigned> &dst = out[row];
        for (unsigned col = 0; col < count; ++col) {
            dst[col] = (lhs & in[col]).count();
        }
        ++row;
    }
}
// Compares one chunk of GPU results against the CPU reference.
// d1 holds OUT_CHUNK rows of N_vecs values for reference rows
// start_idx .. start_idx+OUT_CHUNK-1 of d2. Prints the first mismatch and
// returns 1; returns 0 when the whole chunk matches.
int check_data(unsigned *d1, unsigned start_idx, std::vector<std::vector<unsigned> > &d2){
    const unsigned stop = start_idx + OUT_CHUNK;
    for (unsigned i = start_idx; i < stop; ++i) {
        const unsigned *gpu_row = d1 + ((i-start_idx)*N_vecs);
        const std::vector<unsigned> &ref_row = d2[i];
        for (unsigned j = 0; j < N_vecs; ++j) {
            if (gpu_row[j] == ref_row[j]) {
                continue;
            }
            std::cout << "mismatch at " << i << "," << j << " was: " << gpu_row[j] << " should be: " << ref_row[j] << std::endl;
            return 1;
        }
    }
    return 0;
}
// Returns the value of clock() converted to microseconds.
// The conversion is done in double: a float has only 24 bits of mantissa and
// cannot represent microsecond counts beyond a few seconds exactly.
unsigned long long get_time_usec(){
    return (unsigned long long)(((double)clock()/(double)CLOCKS_PER_SEC)*1000000.0);
}
// Generates N_vecs random N-bit vectors, computes all pairwise AND-popcounts
// on the CPU (reference) and on the GPU in chunks of OUT_CHUNK rows
// alternating between two streams, verifies every GPU chunk and prints both
// timings. Returns 1 on a mismatch; CUDA errors abort via cudaCheckErrors.
int main(){
    unsigned long long t1, t2;
    std::vector<std::vector<unsigned> > distances;
    std::vector<std::bitset<N> > bits;
    for (int i = 0; i < N_vecs; i++){
        std::vector<unsigned> dist_row(N_vecs, 0);
        distances.push_back(dist_row);
        std::bitset<N> data;
        for (size_t j = 0; j < N; j++) data[j] = rand() & 1;
        bits.push_back(data);}
    t1 = get_time_usec();
    cpu_test(bits, distances);
    t1 = get_time_usec() - t1;

    const size_t in_bytes = (N/32)*N_vecs*sizeof(unsigned);
    const size_t out_bytes = (size_t)N_vecs*OUT_CHUNK*sizeof(unsigned);

    // Word-major layout: word w of vector i lives at h_data[i + w*N_vecs].
    // The trailing () zero-initialises the array.
    unsigned *h_data = new unsigned[(N/32)*N_vecs]();
    for (int i = 0; i < N_vecs; i++)
        for (size_t j = 0; j < N; j++)
            if (bits[i][j]) h_data[(i)+((j/32)*N_vecs)] |= 1U<<(31-(j&31));

    unsigned *d_in, *d_out1, *d_out2, *h_out1, *h_out2;
    cudaMalloc(&d_in, in_bytes);
    cudaMalloc(&d_out1, out_bytes);
    cudaMalloc(&d_out2, out_bytes);
    cudaCheckErrors("cudaMalloc fail");
    cudaStream_t stream1, stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaCheckErrors("cudaStrem fail");
    // Page-locked host buffers: cudaMemcpyAsync can only overlap with kernel
    // execution when the host side is pinned.
    cudaMallocHost((void**)&h_out1, out_bytes);
    cudaMallocHost((void**)&h_out2, out_bytes);
    cudaCheckErrors("cudaMallocHost fail");

    int last_chunk2 = -1;   // first row of the chunk currently in h_out2
    t2 = get_time_usec();
    cudaMemcpy(d_in, h_data, in_bytes, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail");
    for (int i = 0; i < N_vecs; i += 2*OUT_CHUNK){
        comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream1>>>(d_in, d_out1, N_vecs, i, i+OUT_CHUNK);
        cudaCheckErrors("cuda kernel loop 1 fail");
        // The previous iteration's stream2 copy must be complete before
        // h_out2 is checked and d_out2 is overwritten.
        cudaStreamSynchronize(stream2);
        if (i > 0) if (check_data(h_out2, i-OUT_CHUNK, distances)) return 1;
        comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream2>>>(d_in, d_out2, N_vecs, i+OUT_CHUNK, i+2*OUT_CHUNK);
        cudaCheckErrors("cuda kernel loop 2 fail");
        cudaMemcpyAsync(h_out1, d_out1, out_bytes, cudaMemcpyDeviceToHost, stream1);
        cudaMemcpyAsync(h_out2, d_out2, out_bytes, cudaMemcpyDeviceToHost, stream2);
        cudaCheckErrors("cuda kernel loop 3 fail");
        last_chunk2 = i+OUT_CHUNK;
        cudaStreamSynchronize(stream1);
        if (check_data(h_out1, i, distances)) return 1;
    }
    cudaDeviceSynchronize();
    cudaCheckErrors("cuda kernel loop 4 fail");
    t2 = get_time_usec() - t2;
    // The loop only verifies a stream2 chunk at the start of the NEXT
    // iteration, so the final one still has to be checked here.
    if (last_chunk2 >= 0) if (check_data(h_out2, last_chunk2, distances)) return 1;

    std::cout << "cpu time: " << ((float)t1)/(float)1000 << "ms gpu time: " << ((float) t2)/(float)1000 << "ms" << std::endl;

    cudaFreeHost(h_out1);
    cudaFreeHost(h_out2);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaFree(d_in);
    cudaFree(d_out1);
    cudaFree(d_out2);
    delete[] h_data;
    return 0;
}
I've added CUDA error checking to this code. Be sure to build a release project in Visual Studio, not debug. When I run this on a windows 7 laptop with a Quadro1000M GPU I get about 35 seconds for the CPU execution and about 1.5 seconds for the GPU.
OpenCL 1.2 has popcount which would seem to do what you want. It can work on a vector, so up to ulong16 which is 1024 bits at a time. Note that NVIDIA drivers only support OpenCL 1.1 which does not include this function.
Of course you could just use a function or table to compute it pretty quickly, so an OpenCL 1.1 implementation is possible as well, and would likely run at the memory bandwidth of the device.

measured runtime from c++ "time.h" is double the real time

I am running this pthread-c++ program (gauss elimination) on my laptop to measure its runtime.
The program actually runs for about 10 seconds, but my output shows about 20 seconds. What is wrong with this program?
I used
g++ -pthread main.c
./a.out 32 2048
to run
#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <cstdlib>
#include <pthread.h>
#include <iostream>
typedef float Type;
// Matrix helpers (defined below): fill with random values, allocate, release
// and print a row-pointer matrix.
void mat_rand (Type**, int, int);
Type** mat_aloc (int, int);
void mat_free (Type**);
void mat_print (Type**, int, int);
// Thread entry point: eliminates column current_row from a range of rows.
void* eliminate(void*);
// n: matrix dimension (argv[2]); max_threads: thread limit (argv[1]);
// active_threads / thread_length: threads used and rows per thread for the
// current elimination step, recomputed by main on every iteration.
unsigned int n, max_threads, active_threads, thread_length;
// The n x (n+1) matrix being reduced; shared by all threads.
Type** A;
// Pivot row of the current elimination step; written by main, read by eliminate.
int current_row;
// Work description handed to one eliminate() thread: the first and last row
// index (both inclusive) that the thread processes.
typedef struct args
{
    int start;  // first row handled by the thread
    int end;    // last row handled by the thread (inclusive)
} argument;
void *print_message_function( void *ptr );
int main(int argc, char *argv[])
{
if (argc < 3)
{
printf ("Error!. Please Enter The Matrix Dimension and No. of Threads!\n");
return 0;
} else
{
n = atoi(argv[2]);
max_threads = atoi(argv[1]);
if (n > 4096)
{
printf ("The maximum allowed size is 4096!\n");
return 0;
}
if (max_threads > 32)
{
printf ("The maximum allowed Threads Count is 32!\n");
return 0;
}
}
A = mat_aloc(n , n+1);
mat_rand (A, n, n+1);
//mat_print (A, n, n+1);
std::clock_t start;
double exe_time;
start = std::clock();
pthread_attr_t attr;
pthread_attr_init(&attr);
argument* thread_args = new argument[max_threads];
pthread_t* thread = new pthread_t[max_threads];
for (int i=0; i<n-1; i++)
{
current_row = i;
if (max_threads >= n-i)
active_threads = n-i-1;
else
active_threads = max_threads;
thread_length = (n-i-1)/active_threads;
for (int j=0; j<active_threads-1; j++)
{
thread_args[j].start = i+1+j*thread_length;
thread_args[j].end = i+1+(j+1)*thread_length;
pthread_create( &thread[j], &attr, eliminate, (void*) &thread_args[j]);
}
thread_args[active_threads-1].start = i+1+(active_threads-1)*thread_length;
thread_args[active_threads-1].end = n-1;
pthread_create(&thread[active_threads-1], &attr, eliminate, (void*) &thread_args[active_threads-1]);
for (int j=0; j<active_threads; j++)
{
pthread_join(thread[j], NULL);
}
}
exe_time = (clock() - start) / (double) CLOCKS_PER_SEC;
printf("Execution time for Matrix of size %i: %f\n", n, exe_time);
//mat_print (A, n, n+1);
return 0;
}
// Thread entry point for one elimination step.
// `arg` points to an `argument` giving the inclusive row range
// [start, end]. For each of those rows, subtracts the matching multiple of
// the pivot row (current_row) so the entry in the pivot column becomes 0.
// Reads the globals A, n and current_row. Always returns NULL.
void* eliminate(void* arg)
{
    Type k, row_constant;
    argument* info = (argument*) arg;
    row_constant = A[current_row][current_row];
    for (int i=info->start; i<=info->end; i++)
    {
        k = A[i][current_row] / row_constant;
        A[i][current_row] = 0;
        for (int j=current_row+1; j<n+1; j++)
        {
            A[i][j] -= k*A[current_row][j];
        }
    }
    // A function declared to return void* must return a value; falling off
    // the end is undefined behaviour.
    return NULL;
}
// Fills a row x column matrix with pseudo-random values
// 1 + (rand() / RAND_MAX) * 256.
void mat_rand (Type** matrix, int row, int column)
{
    for (int r = 0; r < row; ++r)
    {
        Type* line = matrix[r];
        for (int c = 0; c < column; ++c)
        {
            line[c] = (float)(1) + ((float)rand()/(float)RAND_MAX)*256;
        }
    }
}
// Allocates a row x column matrix as one contiguous block of elements plus an
// array of row pointers into that block (mat[0] is the start of the block).
// Returns 0 if a dimension is not positive or an allocation fails; release
// the result with mat_free.
Type** mat_aloc (int row, int column)
{
    if (row <= 0 || column <= 0)
    {
        return 0;
    }

    Type* temp = 0;
    Type** mat = 0;
    // Plain new[] reports failure by throwing, never by returning NULL, so
    // failures are caught here and mapped to the documented 0 result.
    try
    {
        temp = new Type [row*column];
        mat = new Type* [row];
    }
    catch (...)
    {
        delete [] temp;     // no leak if only the row-pointer array failed
        return 0;
    }

    for (int i=0; i<row; i++)
    {
        mat[i] = temp + i*column;
    }
    return mat;
}
// Releases a matrix created by mat_aloc: first the element block (pointed to
// by the first row pointer), then the row-pointer array. A null matrix, such
// as the one mat_aloc returns on failure, is ignored.
void mat_free (Type** matrix)
{
    if (matrix == 0)
    {
        return;
    }
    delete[] (*matrix);
    delete[] matrix;
}
// Prints a row x column matrix, one row per line with entries separated by
// two tabs, followed by a dotted separator line.
void mat_print (Type** matrix, int row, int column)
{
    int r = 0;
    while (r < row)
    {
        const Type* line = matrix[r];
        for (int c = 0; c < column; ++c)
        {
            std::cout<< line[c] << "\t\t";
        }
        printf("\n");
        ++r;
    }
    printf(".................\n");
}
clock reports CPU time used. If you have 2 CPUs and run a thread on each one for 10 seconds, clock will report 20 seconds.