Using popcnt on the GPU - c++

I need to compute
(a & b).count()
over a large set (> 10000) of bit vectors (std::bitset<N>), where N is anywhere from 2^10 to 2^16.
const size_t N = 2048;
std::vector<std::vector<char>> distances(100000, std::vector<char>(100000));
std::vector<std::bitset<N>> bits(100000);
load_from_file(bits);
for(size_t i = 0; i < bits.size(); i++){
    for(size_t j = 0; j < bits.size(); j++){
        distances[i][j] = (bits[i] & bits[j]).count();
    }
}
Currently I'm relying on chunked multithreading and SSE/AVX to compute the distances. Luckily I can use vpand from AVX to compute the &, but my code is still using popcnt (%rax) and a loop to compute the bit counts.
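Roughly, the inner loop I'm describing has this shape (a simplified sketch, not my exact code; chunking and threading are omitted, and the bitsets are viewed as arrays of 64-bit words):
#include <immintrin.h>
#include <cstddef>
#include <cstdint>
// Popcount of (a & b) over n_words 64-bit words: AND 256 bits at a time with
// AVX2 (vpand), then count the bits of each 64-bit lane with the popcnt instruction.
// Build with e.g. -mavx2 -mpopcnt (or -march=native); n_words is assumed to be a multiple of 4.
uint64_t and_popcount(const uint64_t* a, const uint64_t* b, size_t n_words)
{
    uint64_t total = 0;
    alignas(32) uint64_t tmp[4];
    for (size_t i = 0; i < n_words; i += 4) {
        __m256i va = _mm256_loadu_si256((const __m256i*)(a + i));
        __m256i vb = _mm256_loadu_si256((const __m256i*)(b + i));
        _mm256_store_si256((__m256i*)tmp, _mm256_and_si256(va, vb)); // vpand
        total += _mm_popcnt_u64(tmp[0]) + _mm_popcnt_u64(tmp[1])
               + _mm_popcnt_u64(tmp[2]) + _mm_popcnt_u64(tmp[3]);    // popcnt
    }
    return total;
}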
Is there a way I can compute the (a & b).count() function on my GPU (nVidia 760m)? Ideally I would just pass 2 chunks of memory of N bits. I was looking at using thrust but I couldn't find a popcnt function.
EDIT:
Current CPU implementation.
double validate_pooled(const size_t K) const{
    int right = 0;
    const size_t num_examples = labels.size();
    threadpool tp;
    std::vector<std::future<bool>> futs;
    for(size_t i = 0; i < num_examples; i++){
        futs.push_back(tp.enqueue(&kNN<N>::validate_N, this, i, K));
    }
    for(auto& fut : futs)
        if(fut.get()) right++;
    return right / (double) num_examples;
}
bool validate_N(const size_t cmp, const size_t n) const{
    const size_t num_examples = labels.size();
    std::vector<char> dists(num_examples, -1);
    for(size_t i = 0; i < num_examples; i++){
        if(i == cmp) continue;
        dists[i] = (bits[cmp] & bits[i]).count();
    }
    // Tally the labels of the n nearest (largest-overlap) examples.
    typedef std::unordered_map<std::string,size_t> counter;
    counter counts;
    for(size_t i = 0; i < n; i++){
        auto iter = std::max_element(dists.cbegin(), dists.cend());
        size_t idx = std::distance(dists.cbegin(), iter);
        dists[idx] = -1; // Remove the top result.
        counts[labels[idx]] += 1;
    }
    // Majority vote among the n nearest neighbours.
    auto iter = std::max_element(counts.cbegin(), counts.cend(),
        [](const counter::value_type& a, const counter::value_type& b){ return a.second < b.second; });
    return labels[cmp] == iter->first;
}
EDIT:
This is what I've come up with. However, it's brutally slow. I'm not sure if I'm doing something wrong.
template<size_t N>
struct popl
{
    typedef unsigned long word_type;
    std::bitset<N> _cmp;
    popl(const std::bitset<N>& cmp) : _cmp(cmp) {}
    __device__
    int operator()(const std::bitset<N>& x) const
    {
        int pop_total = 0;
        #pragma unroll
        for(size_t i = 0; i < N/64; i++)
            pop_total += __popcll(x._M_w[i] & _cmp._M_w[i]);
        return pop_total;
    }
};
int main(void) {
    const size_t N = 2048;
    thrust::host_vector<std::bitset<N> > h_vec;
    load_bits(h_vec);
    thrust::device_vector<std::bitset<N> > d_vec = h_vec;
    thrust::device_vector<int> r_vec(h_vec.size(), 0);
    for(int i = 0; i < h_vec.size(); i++){
        r_vec[i] = thrust::transform_reduce(d_vec.cbegin(), d_vec.cend(), popl<N>(d_vec[i]), 0, thrust::maximum<int>());
    }
    return 0;
}

CUDA has population count intrinsics for both 32-bit and 64-bit types (__popc() and __popcll(), respectively).
These could be used directly in a CUDA kernel, or via thrust (in a functor), perhaps passed to thrust::transform_reduce.
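For example, a functor along the following lines could do the per-element work (a sketch only: it assumes the bitsets are stored as raw 64-bit words in a device_vector<uint64_t>, it uses thrust::inner_product rather than transform_reduce to combine the elementwise AND-popcount with the sum, and the names and_popc/and_count are just illustrative):
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <cstdint>
// Elementwise op: popcount of the AND of two 64-bit words.
struct and_popc
{
    __host__ __device__
    int operator()(uint64_t a, uint64_t b) const
    {
#ifdef __CUDA_ARCH__
        return __popcll(a & b);             // device path: CUDA intrinsic
#else
        return __builtin_popcountll(a & b); // host fallback (gcc/clang builtin)
#endif
    }
};
// distance(a, b) = sum_i popcount(a[i] & b[i]), with a and b each holding one bitset as words
int and_count(const thrust::device_vector<uint64_t>& a,
              const thrust::device_vector<uint64_t>& b)
{
    return thrust::inner_product(a.begin(), a.end(), b.begin(), 0,
                                 thrust::plus<int>(), and_popc());
}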
If that is the only function you want to run on the GPU, it may be difficult to get a net "win" because of the "cost" of transferring data to/from the GPU. Your overall input data set appears to be about 1GB in size (100000 vectors of bit length 65536, i.e. 100000 * 8192 bytes ≈ 0.8GB), but the output data set appears to be 10-40GB in size based on my calculations (100000 * 100000 results * 1-4 bytes per result).
Either the CUDA kernel or the thrust function and data layout should be crafted carefully with the objective of having the code run limited only by memory bandwidth. The cost of data transfer could also be mitigated, perhaps to a large extent, by overlap of copy and compute operations, mainly on the output data set.
At first glance, this problem appears to be somewhat similar to the problem of computing euclidean distances among sets of vectors, so this question/answer may be of interest, from a CUDA perspective.
EDIT: adding some code that I used to investigate this. I am able to get a significant speedup (~25x including data copy time) over a naive single-threaded CPU implementation, but I don't know how fast the CPU version would be using "chunked multithreading and SSE/AVX", so it would be interesting to see more of your implementation or get some performance numbers. I also don't think the CUDA code I have here is highly optimized; it's just a "first cut".
In this case, for proof-of-concept, I focused on a small problem size, N=2048, 10000 bitsets. For this small problem size, I can fit enough of the vector of bitsets in shared memory, for a "small" threadblock size, to take advantage of shared memory. So this particular approach would have to be modified for larger N.
$ cat t581.cu
#include <iostream>
#include <vector>
#include <bitset>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define nTPB 128
#define OUT_CHUNK 250
#define N_bits 2048
#define N_vecs 10000
const size_t N = N_bits;
__global__ void comp_dist(unsigned *in, unsigned *out, unsigned numvecs, unsigned start_idx, unsigned end_idx){
  // each thread caches the N/32 words of its own bit vector in shared memory
  __shared__ unsigned sdata[(N/32)*nTPB];
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < numvecs)
    for (int i = 0; i < (N/32); i++)
      sdata[(i*nTPB)+threadIdx.x] = in[(i*numvecs)+idx];
  __syncthreads();
  int vidx = start_idx;
  if (idx < numvecs)
    while (vidx < end_idx) {
      // popcount of the AND against vector vidx, accumulated word by word
      unsigned sum = 0;
      for (int i = 0; i < N/32; i++)
        sum += __popc(sdata[(i*nTPB)+threadIdx.x] & in[(i*numvecs)+vidx]);
      out[((vidx-start_idx)*numvecs)+idx] = sum;
      vidx++;}
}
void cpu_test(std::vector<std::bitset<N> > &in, std::vector<std::vector<unsigned> > &out){
for (int i=0; i < in.size(); i++)
for (int j=0; j< in.size(); j++)
out[i][j] = (in[i] & in[j]).count();
}
int check_data(unsigned *d1, unsigned start_idx, std::vector<std::vector<unsigned> > &d2){
for (int i = start_idx; i < start_idx+OUT_CHUNK; i++)
for (int j = 0; j<N_vecs; j++)
if (d1[((i-start_idx)*N_vecs)+j] != d2[i][j]) {std::cout << "mismatch at " << i << "," << j << " was: " << d1[((i-start_idx)*N_vecs)+j] << " should be: " << d2[i][j] << std::endl; return 1;}
return 0;
}
unsigned long long get_time_usec(){
timeval tv;
gettimeofday(&tv, 0);
return (unsigned long long)(((unsigned long long)tv.tv_sec*1000000ULL)+(unsigned long long)tv.tv_usec);
}
int main(){
unsigned long long t1, t2;
std::vector<std::vector<unsigned> > distances;
std::vector<std::bitset<N> > bits;
for (int i = 0; i < N_vecs; i++){
std::vector<unsigned> dist_row(N_vecs, 0);
distances.push_back(dist_row);
std::bitset<N> data;
for (int j =0; j < N; j++) data[j] = rand() & 1;
bits.push_back(data);}
t1 = get_time_usec();
cpu_test(bits, distances);
t1 = get_time_usec() - t1;
unsigned *h_data = new unsigned[(N/32)*N_vecs];
memset(h_data, 0, (N/32)*N_vecs*sizeof(unsigned));
for (int i = 0; i < N_vecs; i++)
for (int j = 0; j < N; j++)
if (bits[i][j]) h_data[(i)+((j/32)*N_vecs)] |= 1U<<(31-(j&31));
unsigned *d_in, *d_out1, *d_out2, *h_out1, *h_out2;
cudaMalloc(&d_in, (N/32)*N_vecs*sizeof(unsigned));
cudaMalloc(&d_out1, N_vecs*OUT_CHUNK*sizeof(unsigned));
cudaMalloc(&d_out2, N_vecs*OUT_CHUNK*sizeof(unsigned));
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
h_out1 = new unsigned[N_vecs*OUT_CHUNK];
h_out2 = new unsigned[N_vecs*OUT_CHUNK];
t2 = get_time_usec();
cudaMemcpy(d_in, h_data, (N/32)*N_vecs*sizeof(unsigned), cudaMemcpyHostToDevice);
// two output chunks per iteration, one per stream; host-side checking of one
// chunk overlaps the other stream's kernel and device-to-host copy
for (int i = 0; i < N_vecs; i += 2*OUT_CHUNK){
  comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream1>>>(d_in, d_out1, N_vecs, i, i+OUT_CHUNK);
  cudaStreamSynchronize(stream2);
  if (i > 0) if (check_data(h_out2, i-OUT_CHUNK, distances)) return 1;
  comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream2>>>(d_in, d_out2, N_vecs, i+OUT_CHUNK, i+2*OUT_CHUNK);
  cudaMemcpyAsync(h_out1, d_out1, N_vecs*OUT_CHUNK*sizeof(unsigned), cudaMemcpyDeviceToHost, stream1);
  cudaMemcpyAsync(h_out2, d_out2, N_vecs*OUT_CHUNK*sizeof(unsigned), cudaMemcpyDeviceToHost, stream2);
  cudaStreamSynchronize(stream1);
  if (check_data(h_out1, i, distances)) return 1;
}
cudaDeviceSynchronize();
t2 = get_time_usec() - t2;
std::cout << "cpu time: " << ((float)t1)/(float)1000 << "ms gpu time: " << ((float) t2)/(float)1000 << "ms" << std::endl;
return 0;
}
$ nvcc -O3 -arch=sm_20 -o t581 t581.cu
$ ./t581
cpu time: 20324.1ms gpu time: 753.76ms
$
CUDA 6.5, Fedora20, Xeon X5560, Quadro5000 (cc2.0) GPU. The above test case includes results verification between the distances data produced on the CPU vs. the GPU. I've also broken this into a chunked algorithm with results data transfer (and verification) overlapped with compute operations, to make it more easily extendable to the case where there is a very large amount of output data (e.g. 100000 bitsets). I haven't actually run this through the profiler yet, however.
EDIT 2: Here's a "windows version" of the code:
#include <iostream>
#include <vector>
#include <bitset>
#include <stdlib.h>
#include <time.h>
#define nTPB 128
#define OUT_CHUNK 250
#define N_bits 2048
#define N_vecs 10000
const size_t N = N_bits;
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void comp_dist(unsigned *in, unsigned *out, unsigned numvecs, unsigned start_idx, unsigned end_idx){
  __shared__ unsigned sdata[(N/32)*nTPB];
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < numvecs)
    for (int i = 0; i < (N/32); i++)
      sdata[(i*nTPB)+threadIdx.x] = in[(i*numvecs)+idx];
  __syncthreads();
  int vidx = start_idx;
  if (idx < numvecs)
    while (vidx < end_idx) {
      unsigned sum = 0;
      for (int i = 0; i < N/32; i++)
        sum += __popc(sdata[(i*nTPB)+threadIdx.x] & in[(i*numvecs)+vidx]);
      out[((vidx-start_idx)*numvecs)+idx] = sum;
      vidx++;}
}
void cpu_test(std::vector<std::bitset<N> > &in, std::vector<std::vector<unsigned> > &out){
for (unsigned i=0; i < in.size(); i++)
for (unsigned j=0; j< in.size(); j++)
out[i][j] = (in[i] & in[j]).count();
}
int check_data(unsigned *d1, unsigned start_idx, std::vector<std::vector<unsigned> > &d2){
for (unsigned i = start_idx; i < start_idx+OUT_CHUNK; i++)
for (unsigned j = 0; j<N_vecs; j++)
if (d1[((i-start_idx)*N_vecs)+j] != d2[i][j]) {std::cout << "mismatch at " << i << "," << j << " was: " << d1[((i-start_idx)*N_vecs)+j] << " should be: " << d2[i][j] << std::endl; return 1;}
return 0;
}
unsigned long long get_time_usec(){
return (unsigned long long)((clock()/(float)CLOCKS_PER_SEC)*(1000000ULL));
}
int main(){
unsigned long long t1, t2;
std::vector<std::vector<unsigned> > distances;
std::vector<std::bitset<N> > bits;
for (int i = 0; i < N_vecs; i++){
std::vector<unsigned> dist_row(N_vecs, 0);
distances.push_back(dist_row);
std::bitset<N> data;
for (int j =0; j < N; j++) data[j] = rand() & 1;
bits.push_back(data);}
t1 = get_time_usec();
cpu_test(bits, distances);
t1 = get_time_usec() - t1;
unsigned *h_data = new unsigned[(N/32)*N_vecs];
memset(h_data, 0, (N/32)*N_vecs*sizeof(unsigned));
for (int i = 0; i < N_vecs; i++)
for (int j = 0; j < N; j++)
if (bits[i][j]) h_data[(i)+((j/32)*N_vecs)] |= 1U<<(31-(j&31));
unsigned *d_in, *d_out1, *d_out2, *h_out1, *h_out2;
cudaMalloc(&d_in, (N/32)*N_vecs*sizeof(unsigned));
cudaMalloc(&d_out1, N_vecs*OUT_CHUNK*sizeof(unsigned));
cudaMalloc(&d_out2, N_vecs*OUT_CHUNK*sizeof(unsigned));
cudaCheckErrors("cudaMalloc fail");
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaCheckErrors("cudaStrem fail");
h_out1 = new unsigned[N_vecs*OUT_CHUNK];
h_out2 = new unsigned[N_vecs*OUT_CHUNK];
t2 = get_time_usec();
cudaMemcpy(d_in, h_data, (N/32)*N_vecs*sizeof(unsigned), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy fail");
for (int i = 0; i < N_vecs; i += 2*OUT_CHUNK){
  comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream1>>>(d_in, d_out1, N_vecs, i, i+OUT_CHUNK);
  cudaCheckErrors("cuda kernel loop 1 fail");
  cudaStreamSynchronize(stream2);
  if (i > 0) if (check_data(h_out2, i-OUT_CHUNK, distances)) return 1;
  comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream2>>>(d_in, d_out2, N_vecs, i+OUT_CHUNK, i+2*OUT_CHUNK);
  cudaCheckErrors("cuda kernel loop 2 fail");
  cudaMemcpyAsync(h_out1, d_out1, N_vecs*OUT_CHUNK*sizeof(unsigned), cudaMemcpyDeviceToHost, stream1);
  cudaMemcpyAsync(h_out2, d_out2, N_vecs*OUT_CHUNK*sizeof(unsigned), cudaMemcpyDeviceToHost, stream2);
  cudaCheckErrors("cuda kernel loop 3 fail");
  cudaStreamSynchronize(stream1);
  if (check_data(h_out1, i, distances)) return 1;
}
cudaDeviceSynchronize();
cudaCheckErrors("cuda kernel loop 4 fail");
t2 = get_time_usec() - t2;
std::cout << "cpu time: " << ((float)t1)/(float)1000 << "ms gpu time: " << ((float) t2)/(float)1000 << "ms" << std::endl;
return 0;
}
I've added CUDA error checking to this code. Be sure to build a release project in Visual Studio, not debug. When I run this on a Windows 7 laptop with a Quadro1000M GPU, I get about 35 seconds for the CPU execution and about 1.5 seconds for the GPU.

OpenCL 1.2 has popcount, which would seem to do what you want. It can work on a vector type, up to ulong16, which is 1024 bits at a time. Note that NVIDIA drivers only support OpenCL 1.1, which does not include this function.
Of course you could just use a function or table to compute it pretty quickly, so an OpenCL 1.1 implementation is possible as well, and would likely run at the memory bandwidth of the device.
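For illustration, a table-based popcount of the kind I mean might look like this (a plain C++ sketch; an OpenCL 1.1 kernel version would be essentially the same code, with the table placed in constant memory):
#include <cstdint>
// 8-bit lookup table: the popcount of a 32-bit word is the sum of the popcounts of its bytes.
static unsigned char pop_table[256];
void init_pop_table() {
    for (int i = 0; i < 256; i++) {
        unsigned char c = 0;
        for (int b = 0; b < 8; b++) c += (i >> b) & 1;
        pop_table[i] = c;
    }
}
unsigned popcount32(uint32_t x) {
    return pop_table[x & 0xff] + pop_table[(x >> 8) & 0xff]
         + pop_table[(x >> 16) & 0xff] + pop_table[x >> 24];
}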

Related

read/write to large array using large loop - execution time concerns

So recently I ran into a problem that I thought was interesting and I couldn't fully explain. I've highlighted the nature of the problem in the following code:
#include <cstring>
#include <chrono>
#include <iostream>
#define NLOOPS 10
void doWorkFast(int total, int *write, int *read)
{
    for (int j = 0; j < NLOOPS; j++) {
        for (int i = 0; i < total; i++) {
            write[i] = read[i] + i;
        }
    }
}
void doWorkSlow(int total, int *write, int *read, int innerLoopSize)
{
    for (int i = 0; i < NLOOPS; i++) {
        for (int j = 0; j < total/innerLoopSize; j++) {
            for (int k = 0; k < innerLoopSize; k++) {
                write[j*k + k] = read[j*k + k] + j*k + k;
            }
        }
    }
}
int main(int argc, char *argv[])
{
int n = 1000000000;
int *heapMemoryWrite = new int[n];
int *heapMemoryRead = new int[n];
for (int i = 0; i < n; i++)
{
heapMemoryRead[i] = 1;
}
std::memset(heapMemoryWrite, 0, n * sizeof(int));
auto start1 = std::chrono::high_resolution_clock::now();
doWorkFast(n,heapMemoryWrite, heapMemoryRead);
auto finish1 = std::chrono::high_resolution_clock::now();
auto duration1 = std::chrono::duration_cast<std::chrono::microseconds>(finish1 - start1);
for (int i = 0; i < n; i++)
{
heapMemoryRead[i] = 1;
}
std::memset(heapMemoryWrite, 0, n * sizeof(int));
auto start2 = std::chrono::high_resolution_clock::now();
doWorkSlow(n,heapMemoryWrite, heapMemoryRead, 10);
auto finish2 = std::chrono::high_resolution_clock::now();
auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>(finish2 - start2);
std::cout << "Small inner loop:" << duration1.count() << " microseconds.\n" <<
"Large inner loop:" << duration2.count() << " microseconds." << std::endl;
delete[] heapMemoryWrite;
delete[] heapMemoryRead;
}
Looking at the two doWork* functions, for every iteration, we are reading the same addresses, adding the same value, and writing to the same addresses. I understand that in the doWorkSlow implementation we are doing one or two more operations to resolve j*k + k; however, I think it's reasonably safe to assume that, relative to the time it takes to do the loads/stores for the memory reads and writes, the time contribution of these operations is negligible.
Nevertheless, doWorkSlow takes about twice as long (46.8s) as doWorkFast (25.5s) on my i7-3700 using g++ 7.5.0. While things like cache prefetching and branch prediction come to mind, I don't have a great explanation as to why doWorkFast is so much faster than doWorkSlow. Does anyone have insight?
Thanks
Looking at the two doWork* functions, for every iteration, we are reading the same addresses, adding the same value, and writing to the same addresses.
This is not true!
In doWorkFast, you index each integer incrementally, as array[i].
array[0]
array[1]
array[2]
array[3]
In doWorkSlow, you index each integer as array[j*k + k], which jumps around and repeats.
When j is 10, for example, and you iterate k from 0 onwards, you are accessing
array[0] // 10*0+0
array[11] // 10*1+1
array[22] // 10*2+2
array[33] // 10*3+3
This will prevent your optimizer from using instructions that can operate on many adjacent integers at once.
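Presumably the indexing was meant to be contiguous; for comparison, that variant would look something like this (my assumption about the intent, not code from the question):
void doWorkSlowFixed(int total, int *write, int *read, int innerLoopSize)
{
    // NLOOPS as defined in the question (#define NLOOPS 10)
    for (int i = 0; i < NLOOPS; i++) {
        for (int j = 0; j < total / innerLoopSize; j++) {
            for (int k = 0; k < innerLoopSize; k++) {
                int idx = j * innerLoopSize + k; // contiguous, vectorization-friendly index
                write[idx] = read[idx] + idx;
            }
        }
    }
}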

Most insanely efficient way to find index of the minimum of four numbers

#include <iostream>
#include <chrono>
#include <random>
using namespace std;
class MyTimer
{
private:
std::chrono::time_point<std::chrono::steady_clock> starter;
std::chrono::time_point<std::chrono::steady_clock> ender;
public:
void startCounter() {
starter = std::chrono::steady_clock::now();
}
long long getCounter() {
ender = std::chrono::steady_clock::now();
return std::chrono::duration_cast<std::chrono::microseconds>(ender - starter).count();
}
};
int findBestKey(int keys[4], int values[4])
{
    int index = 0;
    for (int i = 1; i <= 3; i++)
        if (keys[index] > keys[i])
            index = i;
    return values[index];
}
int findBestKeyPro(int keys[4], int values[4])
{
    int index = keys[0] > keys[1];
    if (keys[index] > keys[2]) index = 2;
    if (keys[index] > keys[3]) return values[3];
    else return values[index];
}
int findBestKeyProMax(int keys[4], int values[4])
{
    // fill your implementation here. Not necessary to read the parts below
    return 0;
}
void benchMethod(int (*findBestKeyFunc)(int keys[4], int values[4]), int n, int* keys, int* values, int& res, double& totalTime)
{
MyTimer timer;
timer.startCounter();
// In my actual problems, values of arrays "keys" are completely unrelated. They are not the same continuous values in memory. The line below is just an example for benchmark purposes
for (int i = 0; i < n - 4; i+=4)
res += findBestKeyFunc(&keys[i], &values[i]);
totalTime += timer.getCounter();
/*
it is possible to calculate 4 arrays "keys","values", then process them all at once.
for (int i=0; i<n-4; i+=16)
{
keys[4][4] = ...; values[4][4] = ...;
res += find4BestKeyAtOnce(&keys, &values);
}
*/
}
double totalTimeNormal = 0, totalTimePro = 0, totalTimeProMax = 0;
void benching(int& res1, int& res2, int& res3)
{
const int n = 10000000;
int* keys1 = new int[n], * values1 = new int[n];
int* keys2 = new int[n], * values2 = new int[n];
MyTimer timer;
double tmp;
for (int i = 0; i < n; i++) {
keys1[i] = rand() % 100; // need 2 arrays to prevent caching
keys2[i] = rand() % 100; // this should be % (256*256)
values1[i] = rand() % 100; // and % 256
values2[i] = rand() % 100; // but I use % 100 so that in this example it doesn't overflow int32
}
// the size of keys2/values2 is big enough to flush out keys1/values1 from cache completely.
// so order of execution doesn't affect performance here
benchMethod(&findBestKey, n, keys1, values1, res1, totalTimeNormal);
benchMethod(&findBestKey, n, keys2, values2, res1, totalTimeNormal);
benchMethod(&findBestKeyPro, n, keys1, values1, res2, totalTimePro);
benchMethod(&findBestKeyPro, n, keys2, values2, res2, totalTimePro);
benchMethod(&findBestKeyProMax, n, keys1, values1, res2, totalTimeProMax);
benchMethod(&findBestKeyProMax, n, keys2, values2, res2, totalTimeProMax);
delete[] keys1;
delete[] keys2;
delete[] values1;
delete[] values2;
}
void testIf()
{
int res1 = 0, res2 = 0, res3 = 0;
for (int t = 1; t <= 100; t++) {
benching(res1, res2, res3);
res1 %= 100;
res2 %= 100;
res3 %= 100;
cout << "Lap " << t << "\n";
cout << "time normal = " << totalTimeNormal/1000 << " ms\n";
cout << "time pro = " << totalTimePro/1000 << " ms\n";
cout << "time pro max = " << totalTimeProMax/1000 << " ms\n";
cout << "\n";
}
cout << "**********************\n" << res1 << " " << res2 << "\n";
}
int main()
{
testIf();
return 0;
}
There are two arrays, keys and values, both completely random. This function returns the value that has the minimum key. So: index = indexOfMin(keys); return values[index]; See function findBestKey. I need to fill in findBestKeyProMax.
findBestKeyPro is around 30-35% faster than findBestKey, on my computer and on https://www.onlinegdb.com/online_c++_compiler . The compiler options are -std=c++14 -O2. Update: I get ~5-10% more performance just by changing to -O3.
Is there any way I can make this faster? Every nanosecond matters, since this function is called ~10^6-10^7 times (once for each pixel); saving 1 ns per call would translate to 1ms less, which is the difference between 200fps and 250fps.
Edit: no multi-threading or GPU. It's already done (each thread performs findBestKey on distinct keys/values arrays), so I want to improve this function directly. Maybe something like SIMD for CPU? Or branchless function.
Also the functions findBest... are what matters, function benchMethod() is just for benchmarking.
Edit 2: target architecture is CPUs with AVX256 capability, mainly Intel Skylake or AMD Zen 2.
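For concreteness, by a "branchless function" I mean something along these lines (a sketch only, not benchmarked; findBestKeyBranchless is just an illustrative name, and tie-breaking may differ slightly from the loop version):
int findBestKeyBranchless(int keys[4], int values[4])
{
    // Tournament of pairwise comparisons; compilers typically turn the
    // ternaries into conditional moves instead of branches.
    int idx01 = keys[1] < keys[0] ? 1 : 0;
    int idx23 = keys[3] < keys[2] ? 3 : 2;
    int idx   = keys[idx23] < keys[idx01] ? idx23 : idx01;
    return values[idx];
}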

How to launch 2 CUDA kernels concurrently?

I tried to create 4 streams to launch 4 kernels concurrently, but it seems they run serially when viewed in Nsight.
My Hardware: RTX2060
My test code is as follows:
#include "cuda_runtime.h"
#include <stdio.h>
#define N 1000000
__global__ void kernel_1()
{
    double sum = 0.0;
    for (int i = 0; i < N; i++) {
        sum = sum + tan(0.1) * tan(0.1);
    }
}
int main()
{
    const int n_streams = 4;
    cudaStream_t *streams = (cudaStream_t *)malloc(n_streams * sizeof(cudaStream_t));
    for (int i = 0; i < n_streams; i++) {
        cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
    }
    dim3 block(1);
    dim3 grid(1);
    for (int i = 0; i < n_streams; i++) {
        kernel_1<<<grid, block, 0, streams[i]>>>();
        kernel_1<<<grid, block, 0, streams[i]>>>();
        kernel_1<<<grid, block, 0, streams[i]>>>();
        kernel_1<<<grid, block, 0, streams[i]>>>();
    }
    printf("done\n");
    return 0;
}
The timeline shows this:
[kernel running timeline screenshot]

Why is this vectorized code subject to vector size?

I compile the following code without vectorization (-O2) and compare the time with vectorization (-O3 -march=native) for three different vector lengths (determined by uncommenting the respective #define SIZE), obtaining 29::9, 247::145 and 4866::4884, for vector sizes 10000, 100000 and 1000000, respectively.
#include <iostream>
#include <random>
#include <chrono>
#include <cmath>
using namespace std;
using namespace std::chrono;
//#define SIZE (10000) // 29::9
//#define SIZE (100000) // 247::145
#define SIZE (1000000) // 4866::4884
void vector_op_2(int * __restrict__ v1, int * __restrict__ v2) {
    for (unsigned i = 0; i < SIZE; i++)
        v1[i] = 2 * v2[i];
}
int main() {
    using namespace std;
    int* v = new int[SIZE];
    int* w = new int[SIZE];
    for (int i = 0; i < SIZE; i++) {
        v[i] = i;
    }
    auto start = duration_cast<milliseconds>(system_clock::now().time_since_epoch());
    for (int k = 0; k < 5000; k++) {
        vector_op_2(w, v);
    }
    auto end = duration_cast<milliseconds>(system_clock::now().time_since_epoch());
    std::cout << "Time " << end.count() - start.count() << std::endl;
    for (int i = 0; i < SIZE; i++) {
        if (abs(w[i] - 2 * v[i]) > 0.01) {
            throw 1;
        }
    }
    delete[] v;
    delete[] w;
    return 0;
}
Why does no speedup occur in the case of vector size 1000000?
What is the optimal length?
Why does this vector length issue not occur with the following example?
[shortened]
long vector_op_1(int v[SIZE]) throw()
{
long s = 0;
for (unsigned i=0; i<SIZE; i++) s += v[i];
return s;
}
[... I am using g++ 7 on Ubuntu 16.04 ...]
[... For short vector size 1000 I am achieving a 6:1 ratio! ...]

Strided vs shuffling reduction

I recently watched the CppCon talk about using Clang to compile CUDA code, where the speaker, after talking a bit about the architecture, implements a sum reduction. I was interested in the approach he took, which was doing a reduction via a shfl of the elements in the block, so, with no working example available, I took his code, modified it a little bit, and got a max reduction.
The thing is that this max reduction is very slow: compared to a CPU implementation finding the max of 2^22 elements, I get times of about ~90ms against ~20ms. Here is the code for the shfl reduction:
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>
using namespace std;
// Global reduce test
__global__ void d_max_reduce(const int *in, int *out, size_t N) {
    int sum = 0;
    size_t start = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
    for (size_t i = start; i < start + 4 && i < N; i++) {
        sum = max(__ldg(in + i), sum);
    }
    for (int i = 16; i; i >>= 1) {
        sum = max(__shfl_down(sum, i), sum);
    }
    __shared__ int shared_max;
    shared_max = 0;
    __syncthreads();
    if (!(threadIdx.x % 32)) {
        atomicMax(&shared_max, sum);
    }
    __syncthreads();
    if (!threadIdx.x) {
        atomicMax(out, shared_max);
    }
}
int test_max_reduce(std::vector<int> &v) {
int *in, *out;
cudaMalloc(&in, v.size() * sizeof(int));
cudaMalloc(&out, sizeof(int));
cudaMemcpy(in, v.data(), v.size() * sizeof(int), cudaMemcpyHostToDevice);
cudaMemset(out, 0, sizeof(int));
int threads = 256;
d_max_reduce<<<ceil((float)v.size() / (threads * 4)), threads>>>(in, out, v.size());
int res;
cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(in);
cudaFree(out);
return res;
}
So I used one of Nvidia's examples of a strided reduction (which is also a sum), changed it to a max, and got times of about 7ms. Here is the code for the strided reduction:
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>
__global__ void d_max_reduction(const int *in, int *out, size_t N) {
    extern __shared__ int s_data[];
    size_t tid = threadIdx.x;
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        s_data[tid] = in[i];
    else
        s_data[tid] = 0;
    __syncthreads();
    for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            s_data[tid] = max(s_data[tid], s_data[tid + s]);
        __syncthreads();
    }
    if (!tid)
        atomicMax(out, s_data[0]);
}
int test_max_reduction(std::vector<int> &v) {
int *in;
int *out;
cudaMalloc(&in, v.size() * sizeof(int));
cudaMalloc(&out, sizeof(int));
cudaMemcpy(in, v.data(), v.size() * sizeof(int), cudaMemcpyHostToDevice);
cudaMemset(out, 0, sizeof(int));
int threads = 128;
d_max_reduction<<<ceil((float)v.size() / threads),
threads,
threads * sizeof(int)>>>(in, out, v.size());
int res;
cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(in);
cudaFree(out);
return res;
}
And just in case, here is the rest, so that there is an MWE.
#include <random>
#include <timer.hpp>
int test_max_reduce(std::vector<int> &v);
int test_max_reduction(std::vector<int> &v);
int main() {
    int N = 2000 * 2000; // * 2000;
    std::vector<int> vec(N);
    std::random_device dev;
    std::mt19937 mt(dev());
    std::uniform_int_distribution<int> dist(0, N << 2);
    for (size_t i = 0; i < vec.size(); i++) {
        vec[i] = dist(mt);
    }
    measure("GPU (shfl)", test_max_reduce, vec);
    measure("GPU strided", test_max_reduction, vec);
    measure("CPU",
            [](std::vector<int> &vec) -> int {
                int maximum = 0;
                for (size_t i = 0; i < vec.size(); i++) {
                    maximum = std::max(maximum, vec[i]);
                }
                return maximum;
            },
            vec);
    return 0;
}
And timer.hpp is
#ifndef TIMER_HPP
#define TIMER_HPP
#include <chrono>
#include <string>
#include <iostream>
template <typename F, typename ...Args>
void measure(std::string msg, F func, Args&&... args) {
auto start = std::chrono::steady_clock::now();
int val = func(std::forward<Args>(args)...);
auto end = std::chrono::steady_clock::now();
std::cout << msg << " Test " << std::endl;
std::cout << " Max Value : " << val << std::endl;
std::cout << " Time : ";
std::cout << std::chrono::duration_cast<std::chrono::milliseconds>
(end - start).count() << std::endl;
}
#endif // TIMER_HPP
I generally get the following times:
GPU (shfl) Test
Max Value : 15999999
Time : 86
GPU strided Test
Max Value : 15999999
Time : 7
CPU Test
Max Value : 15999999
Time : 23
EDIT: new timings after warmup
GPU (shfl) Test
Max Value : 16000000
Time : 4
GPU strided Test
Max Value : 16000000
Time : 6
CPU Test
Max Value : 16000000
Time : 23
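(For reference, the warmup above just means untimed calls before the measured runs; the exact form is an assumption, roughly along these lines inside main:)
// Sketch of the warmup (my assumption): one untimed call to each GPU function,
// so CUDA context creation and first-launch overhead are not charged to the
// first measurement.
test_max_reduce(vec);    // untimed
test_max_reduction(vec); // untimed
measure("GPU (shfl)", test_max_reduce, vec);
measure("GPU strided", test_max_reduction, vec);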
So my more general question is: why is the shfl version slower than the strided one? This can be divided into:
Am I missing something in the launch parameters, or doing/assuming something wrong?
And when is it recommended to use the shfl intrinsic over a strided loop, and vice versa?