Avoid BLAS when temporary memory allocation is involved? - c++

I have a program that computes the matrix product x'Ay repeatedly. Is it better practice to compute this by making calls to MKL's BLAS, i.e. cblas_dgemv and cblas_ddot, which requires allocating memory for a temporary vector, or is it better to simply take the sum of x_i * a_ij * y_j? In other words, does MKL's BLAS theoretically add any value?
I benchmarked this on my laptop. There was virtually no difference between the tests, except that g++_no_blas performed twice as poorly as the others (why?). There was also no difference between -O2, -O3 and -Ofast.
g++_blas_static 57ms
g++_blas_dynamic 58ms
g++_no_blas 100ms
icpc_blas_static 57ms
icpc_blas_dynamic 58ms
icpc_no_blas 58ms
util.h
#ifndef UTIL_H
#define UTIL_H
#include <random>
#include <memory>
#include <iostream>

struct rng
{
    rng() : unif(0.0, 1.0)
    {
    }

    std::default_random_engine re;
    std::uniform_real_distribution<double> unif;

    double rand_double()
    {
        return unif(re);
    }

    std::unique_ptr<double[]> generate_square_matrix(const unsigned N)
    {
        std::unique_ptr<double[]> p(new double[N * N]);
        for (unsigned i = 0; i < N; ++i)
        {
            for (unsigned j = 0; j < N; ++j)
            {
                p[i*N + j] = rand_double();
            }
        }
        return p;
    }

    std::unique_ptr<double[]> generate_vector(const unsigned N)
    {
        std::unique_ptr<double[]> p(new double[N]);
        for (unsigned i = 0; i < N; ++i)
        {
            p[i] = rand_double();
        }
        return p;
    }
};
#endif // UTIL_H
main.cpp
#include <iostream>
#include <iomanip>
#include <memory>
#include <chrono>
#include "util.h"
#include "mkl.h"

double vtmv_blas(double* x, double* A, double* y, const unsigned n)
{
    double temp[n]; // variable-length array: accepted by g++/icpc as an extension, not standard C++
    cblas_dgemv(CblasRowMajor, CblasNoTrans, n, n, 1.0, A, n, y, 1, 0.0, temp, 1);
    return cblas_ddot(n, temp, 1, x, 1);
}

double vtmv_non_blas(double* x, double* A, double* y, const unsigned n)
{
    double r = 0;
    for (unsigned i = 0; i < n; ++i)
    {
        for (unsigned j = 0; j < n; ++j)
        {
            r += x[i] * A[i*n + j] * y[j];
        }
    }
    return r;
}

int main()
{
    std::cout << std::fixed;
    std::cout << std::setprecision(2);

    constexpr unsigned N = 10000;
    rng r;
    std::unique_ptr<double[]> A = r.generate_square_matrix(N);
    std::unique_ptr<double[]> x = r.generate_vector(N);
    std::unique_ptr<double[]> y = r.generate_vector(N);

    auto start = std::chrono::system_clock::now();
    const double prod = vtmv_blas(x.get(), A.get(), y.get(), N);
    auto end = std::chrono::system_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
        end - start);

    std::cout << "Result: " << prod << std::endl;
    std::cout << "Time (ms): " << duration.count() << std::endl;
    return 0;
}

g++_no_blas is poor because g++ does not emit vectorized SIMD instructions for this loop, while the others all do; icpc will auto-vectorize your loop.
At N = 10000, gemv is memory bound. Since the matrix is much larger than the temporary vector, eliminating the temporary should not be expected to improve performance much.
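If you do want to avoid the temporary entirely, the two BLAS calls can be fused into a single pass over A that accumulates a dot product per row. A minimal sketch (vtmv_fused is an illustrative name; it assumes the same row-major layout as the code above):

double vtmv_fused(const double* x, const double* A, const double* y, const unsigned n)
{
    double r = 0.0;
    for (unsigned i = 0; i < n; ++i)
    {
        double row = 0.0;
        for (unsigned j = 0; j < n; ++j)
            row += A[i*n + j] * y[j]; // plain dot product: easy for the compiler to vectorize
        r += x[i] * row;              // x[i] hoisted out of the inner loop
    }
    return r;
}

Hoisting x[i] out of the inner loop may also explain part of the g++_no_blas gap: the three-factor product in vtmv_non_blas is harder for the compiler to turn into a clean dot-product loop.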

Related

CUDA thrust sort much slower when called from inside kernel [duplicate]

This question already has an answer here:
Accelerating __device__ function in Thrust comparison operator
#include <iostream>
#include <math.h>
#include <vector>
#include <assert.h>
#include <fstream>
#include <map>
#include <algorithm>
#include <sstream>
#include <chrono>   // std::chrono (used by MyTimer)
#include <cstring>  // memcpy
#include <cuda_runtime_api.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>
#include <cub/cub.cuh>

using namespace std;
typedef float real;

int MAX_N = 10000000;
int N;
real* a, *b;
real* d_a;
real* h_res1, *h_res2;
volatile real v_res = 0;

class MyTimer {
    std::chrono::time_point<std::chrono::system_clock> start;
public:
    void startCounter() {
        start = std::chrono::system_clock::now();
    }
    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
    }
    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
    }
    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
               / 1000000.0;
    }
};

void genData()
{
    N = 100000;
    for (int i = 0; i < N; i++) a[i] = float(rand() % 1000) / (rand() % 1000 + 1);
}

void __attribute__((noinline)) testCpu(real* arr, real* res, int N)
{
    std::sort(arr, arr + N);
    v_res = arr[rand() % N];
    memcpy(res, arr, N * sizeof(real));
}

__global__
void sort_kernel(float* a, int N)
{
    if (blockIdx.x == 0 && threadIdx.x == 0)
        thrust::sort(thrust::device, a, a + N);
    __syncthreads();
}

void __attribute__((noinline)) testGpu(real* arr, real* res, int N)
{
    MyTimer timer;

    timer.startCounter();
    cudaMemcpy(d_a, arr, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    cout << "Copy H2D cost = " << timer.getCounterMsPrecise() << "\n";

    timer.startCounter();
    //thrust::sort(thrust::device, d_a, d_a + N);
    sort_kernel<<<1,1>>>(d_a, N);
    cudaDeviceSynchronize();
    cout << "Thrust sort cost = " << timer.getCounterMsPrecise() << "\n";

    timer.startCounter();
    cudaMemcpy(res, d_a, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cout << "Copy D2H cost = " << timer.getCounterMsPrecise() << "\n";

    v_res = res[rand() % N];
}

void __attribute__((noinline)) deepCopy(real* a, real* b, int N)
{
    for (int i = 0; i < N; i++) b[i] = a[i];
}

void testOne(int t, bool record = true)
{
    MyTimer timer;
    genData();
    deepCopy(a, b, N);

    timer.startCounter();
    testCpu(a, h_res1, N);
    cout << "CPU cost = " << timer.getCounterMsPrecise() << "\n";

    timer.startCounter();
    testGpu(b, h_res2, N);
    cout << "GPU cost = " << timer.getCounterMsPrecise() << "\n";

    for (int i = 0; i < N; i++) {
        if (h_res1[i] != h_res2[i]) {
            cout << "ERROR " << i << " " << h_res1[i] << " " << h_res2[i] << "\n";
            exit(1);
        }
    }
    cout << "-----------------\n";
}

int main()
{
    a = new real[MAX_N];
    b = new real[MAX_N];
    cudaMalloc(&d_a, MAX_N * sizeof(float));
    cudaMallocHost(&h_res1, MAX_N * sizeof(float));
    cudaMallocHost(&h_res2, MAX_N * sizeof(float));

    testOne(0, 0);
    for (int i = 1; i <= 50; i++) testOne(i);
}
For legacy-code reasons, I have to perform the sort entirely inside a kernel. Basically, I need:
__global__ void mainKernel(float** a, int N, float* global_pad)
{
    int x;
    ...
    cooperative_groups::grid_group g = cooperative_groups::this_grid();
    sortFunc(a[x], N); // this can be a kernel; then only 1 thread in the grid will call it
    g.sync();
    ...
}
I tried to use thrust::sort but it's extremely slow. For example, with N = 100000, the benchmark result is:
CPU cost = 5.82228
Copy H2D cost = 0.088908
Thrust sort from CPU cost = 0.391211 (running line thrust::sort(thrust::device, d_a, d_a + N);)
Thrust sort inside kernel cost = 116 (running line sort_kernel<<<1,1>>>(d_a, N);)
Copy D2H cost = 0.067639
Why is thrust::sort so slow in this case? I want to find an implementation of sortFunc that is as fast as possible (global_pad can be used as temporary memory).
Edit: I'm using 2080ti and CUDA 11.4. The compile command I use is
nvcc -o main main.cu -O3 -std=c++17
You need to turn on relocatable device code, which enables dynamic parallelism, in the compile command.
Use -rdc=true: nvcc -o main main.cu -O3 -std=c++17 -rdc=true.
Without it, a thrust::sort called from device code falls back to a sequential sort executed by the single calling thread, which is why the in-kernel version is orders of magnitude slower. With -rdc=true, the two code blocks below are equivalent:
__global__
void sort_kernel(float* a, int N)
{
    if (blockIdx.x == 0 && threadIdx.x == 0)
        thrust::sort(thrust::device, a, a + N);
    __syncthreads();
}
...
sort_kernel<<<1,1>>>(d_a, N);

and

thrust::sort(thrust::device, d_a, d_a + N);
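As for making sortFunc as fast as possible: for data that fits in a single block's tile, CUB (already included above via cub/cub.cuh) provides a block-wide radix sort that runs entirely inside a kernel without dynamic parallelism. A minimal sketch, not the poster's code: each block sorts an independent tile of BLOCK_THREADS * ITEMS_PER_THREAD keys, and merging the sorted tiles into one sequence (for which global_pad could serve as scratch space) is left out.

#include <cub/cub.cuh>

// Each block loads, sorts, and stores one tile of
// BLOCK_THREADS * ITEMS_PER_THREAD floats.
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void block_sort_kernel(const float* d_in, float* d_out)
{
    using BlockLoadT      = cub::BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_TRANSPOSE>;
    using BlockRadixSortT = cub::BlockRadixSort<float, BLOCK_THREADS, ITEMS_PER_THREAD>;
    using BlockStoreT     = cub::BlockStore<float, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_STORE_TRANSPOSE>;

    // The collectives run one after another, so they can share shared memory.
    __shared__ union {
        typename BlockLoadT::TempStorage load;
        typename BlockRadixSortT::TempStorage sort;
        typename BlockStoreT::TempStorage store;
    } temp_storage;

    const int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
    float items[ITEMS_PER_THREAD];

    BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
    __syncthreads(); // storage reused below
    BlockRadixSortT(temp_storage.sort).Sort(items);
    __syncthreads();
    BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
}

// Example launch: 2048 keys per block.
// block_sort_kernel<128, 16><<<N / 2048, 128>>>(d_in, d_out);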

Why does LAPACKE_dsygvd return an error after changing the size of the matrix?

I am trying to solve the generalized eigenvalue problem for the hydrogen atom using LAPACKE_dsygvd. For the parameters of the generator functions, I use an interval that starts at 0.01 and takes N steps of 0.01. What I change is the value of N. Everything is fine for N = 14 and below, where I get the answers from the analytical solution. However, when I choose N = 15 and above, I get an error and info is returned with a value > N. The LAPACK documentation says the following:
N: if INFO = N + i, for 1 <= i <= N, then the leading
minor of order i of B is not positive definite.
The factorization of B could not be completed and
no eigenvalues or eigenvectors were computed.
But I have checked my matrix B and it is positive definite. I don't know what is wrong.
My code is below.
#include <cmath>
#include <cstdio>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include "library.h"   // assumed to define N, m, Ry, Superposition, Hamiltonian
#include "mkl.h"

using namespace std;

double Superposition(const double ai, const double aj, const int m);
double Hamiltonian(const double ai, const double aj, const int m);

void print_matrix(double *A, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            printf("%.7f ", A[i*n + j]);
        }
        cout << "\n";
    }
}

void print_vector(double *vec, int n) {
    for (int i = 0; i < n; i++) {
        cout << vec[i] << " ";
    }
    cout << "\n";
}

double* interval(double min, double step) {
    double *result;
    result = (double *)mkl_malloc( N*sizeof( double ), 64 );
    for (int i = 0; i < N; i++) {
        result[i] = min + i*step;
    }
    return result;
}

int main() {
    cout << Ry << "\n";

    double *S, *H, *I, *eigenvalues;
    double alpha, beta;
    int i, j, info;
    const char* uplo = "U"; const char* jobz = "V";

    I = interval(0.01, 0.01);
    alpha = 1.0; beta = 0.0;

    S = (double *)mkl_malloc( N*N*sizeof( double ), 64 );
    H = (double *)mkl_malloc( N*N*sizeof( double ), 64 );
    eigenvalues = (double *)mkl_malloc( N*sizeof( double ), 64 );

    // Only the upper triangle is filled; dsygv with uplo = 'U' ignores the rest.
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            int index = i*N + j;
            if (j < i) {
                S[index] = 0.0;
                H[index] = 0.0;
            }
            else {
                S[index] = Superposition(I[i], I[j], m);
                H[index] = Hamiltonian(I[i], I[j], m);
            }
        }
    }

    print_matrix(S, N); cout << "\n";
    print_matrix(H, N); cout << "\n" << "\n";

    info = LAPACKE_dsygv(LAPACK_ROW_MAJOR, 1, *jobz, *uplo, N,
                         H, N, S, N, eigenvalues);

    //print_matrix(H, N); cout << "\n";
    //for (i = 0; i < N; i++) {
    //    eigenvalues[i] /= Ry;
    //}

    cout << info << "\n" << "\n";
    print_matrix(H, N); cout << "\n";
    print_vector(eigenvalues, N);

    mkl_free(S);
    mkl_free(H);
    mkl_free(I);
    mkl_free(eigenvalues);
}
Edit: I used dsygvd as included in MKL, and the same error does not occur. However, I get very different results from the two functions for the same inputs.
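No answer is recorded here, but the info > N diagnosis ("the leading minor of order i of B is not positive definite") can be tested directly by attempting a Cholesky factorization of S before the eigensolve, since that factorization is exactly what dsygv performs on B first. A minimal sketch, assuming row-major storage as in the question (is_positive_definite is an illustrative helper, not an MKL function):

#include <cstdio>
#include <cstring>
#include "mkl.h"

// Attempt a Cholesky factorization of the symmetric matrix S (upper
// triangle stored, row-major). dpotrf overwrites its input, so work on a copy.
bool is_positive_definite(const double* S, int n)
{
    double* copy = (double*)mkl_malloc(n * n * sizeof(double), 64);
    memcpy(copy, S, n * n * sizeof(double));
    int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'U', n, copy, n);
    if (info > 0)
        printf("leading minor of order %d is not numerically positive definite\n", info);
    mkl_free(copy);
    return info == 0;   // info = 0: factorization succeeded
}

If this fails at N = 15, a likely explanation is that the overlap matrix S is numerically indefinite even though it is positive definite in exact arithmetic: generator parameters spaced only 0.01 apart make its columns nearly linearly dependent, and rounding error can then push its smallest eigenvalues below zero.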

Neural network with static std::array is slower than neural network using dynamic C-array

There is a minimalistic (around 200 lines) neural network C library on GitHub called Tinn.
Tinn uses dynamic C arrays to represent weights, biases, and neurons. I reimplemented part of it in C++ using static std::array, expecting the static std::array version to be much faster. However, my measurements show exactly the opposite. Am I doing something wrong, or is there a reason why the static array is beaten by the dynamic one even with -O3 optimizations?
Neural network with static arrays MLP_1.h
#pragma once
#include <cmath>
#include <array>
#include <iostream>
#include <fstream>
#include <string>   // std::string

template<class Type, size_t nIn, size_t nHid, size_t nOut>
class MLP_1
{
public:
    static constexpr size_t nInputs = nIn;
    static constexpr size_t nHiddens = nHid;
    static constexpr size_t nOutputs = nOut;
    static constexpr size_t nWeights = nHiddens * (nInputs + nOutputs);
    static constexpr size_t nBiases = 2;
    static constexpr size_t weightIndexOffset = nHiddens * nInputs;

    std::array<Type, nWeights> weights;
    std::array<Type, nBiases> biases;
    std::array<Type, nHiddens> hiddenNeurons;
    std::array<Type, nOut> outputNeurons;

    static Type activationFunction(const Type x) noexcept
    {
        //return x / (1 + std::abs(x)); // faster
        return 1.0 / (1.0 + std::exp(-x));
    }

    void forwardPropagation(const Type* const input) noexcept
    {
        // Calculate hidden layer neuron values.
        for(size_t i = 0; i < nHiddens; ++i)
        {
            Type sum = 0.0;
            for(size_t j = 0; j < nInputs; ++j)
            {
                const size_t weightIndex = (i * nInputs) + j;
                sum += input[j] * weights[weightIndex];
            }
            hiddenNeurons[i] = activationFunction(sum + biases[0]);
        }
        // Calculate output layer neuron values.
        for(size_t i = 0; i < nOutputs; ++i)
        {
            Type sum = 0.0;
            for(size_t j = 0; j < nHiddens; ++j)
            {
                const size_t weightIndex = weightIndexOffset + (i * nHiddens) + j;
                sum += hiddenNeurons[j] * weights[weightIndex];
            }
            outputNeurons[i] = activationFunction(sum + biases[1]);
        }
    }

    const Type* predict(const Type* const input) noexcept
    {
        forwardPropagation(input);
        return outputNeurons.data();
    }

    const std::array<Type, nOutputs>& predict(const std::array<Type, nInputs>& inputArray)
    {
        forwardPropagation(inputArray.data());
        return outputNeurons;
    }

    void load(const char* const path) noexcept
    {
        std::ifstream inputFile(path);
        size_t nInputsFile, nHiddensFile, nOutputsFile;
        std::string ignoreString;
        inputFile >> nInputsFile >> nHiddensFile >> nOutputsFile;
        if ((nInputs != nInputsFile) || (nHiddens != nHiddensFile) || (nOutputs != nOutputsFile))
        {
            std::cout << "Size mismatch.\n";
            std::cout << nInputs << ", " << nHiddens << ", " << nOutputs << std::endl;
            std::cout << nInputsFile << ", " << nHiddensFile << ", " << nOutputsFile << std::endl;
        }
        for (auto& bias : biases)
        {
            Type biasFile;
            inputFile >> biasFile;
            bias = biasFile;
        }
        for (auto& weight : weights)
        {
            Type weightFile;
            inputFile >> weightFile;
            weight = weightFile;
        }
    }

    void printWeights() const
    {
        std::cout << "weights: ";
        for (const auto& w : weights) { std::cout << w << " "; }
        std::cout << "\n";
    }

    void printBiases() const
    {
        std::cout << "biases: ";
        for (const auto& b : biases) { std::cout << b << " "; }
        std::cout << "\n";
    }

    void print() const
    {
        printWeights();
        printBiases();
    }
};
Neural network with dynamic arrays - Tinn.h
#pragma once
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef struct
{
    // All the weights.
    float* w;
    // Hidden to output layer weights.
    float* x;
    // Biases.
    float* b;
    // Hidden layer.
    float* h;
    // Output layer.
    float* o;
    // Number of biases - always two - Tinn only supports a single hidden layer.
    int nb;
    // Number of weights.
    int nw;
    // Number of inputs.
    int nips;
    // Number of hidden neurons.
    int nhid;
    // Number of outputs.
    int nops;
}
Tinn;

// Returns floating point random from 0.0 - 1.0.
static float frand()
{
    return rand() / (float) RAND_MAX;
}

// Activation function.
static float act(const float a)
{
    return 1.0f / (1.0f + expf(-a));
}

// Performs forward propagation.
static void fprop(const Tinn t, const float* const in)
{
    // Calculate hidden layer neuron values.
    for(int i = 0; i < t.nhid; i++)
    {
        float sum = 0.0f;
        for(int j = 0; j < t.nips; j++)
            sum += in[j] * t.w[i * t.nips + j];
        t.h[i] = act(sum + t.b[0]);
    }
    // Calculate output layer neuron values.
    for(int i = 0; i < t.nops; i++)
    {
        float sum = 0.0f;
        for(int j = 0; j < t.nhid; j++)
            sum += t.h[j] * t.x[i * t.nhid + j];
        t.o[i] = act(sum + t.b[1]);
    }
}

// Randomizes tinn weights and biases.
static void wbrand(const Tinn t)
{
    for(int i = 0; i < t.nw; i++) t.w[i] = frand() - 0.5f;
    for(int i = 0; i < t.nb; i++) t.b[i] = frand() - 0.5f;
}

// Returns an output prediction given an input.
float* xtpredict(const Tinn t, const float* const in)
{
    fprop(t, in);
    return t.o;
}

// Constructs a tinn with number of inputs, number of hidden neurons, and number of outputs.
Tinn xtbuild(const int nips, const int nhid, const int nops)
{
    Tinn t;
    // Tinn only supports one hidden layer so there are two biases.
    t.nb = 2;
    t.nw = nhid * (nips + nops);
    t.w = (float*) calloc(t.nw, sizeof(*t.w));
    t.x = t.w + nhid * nips;
    t.b = (float*) calloc(t.nb, sizeof(*t.b));
    t.h = (float*) calloc(nhid, sizeof(*t.h));
    t.o = (float*) calloc(nops, sizeof(*t.o));
    t.nips = nips;
    t.nhid = nhid;
    t.nops = nops;
    wbrand(t);
    return t;
}

// Saves a tinn to disk.
void xtsave(const Tinn t, const char* const path)
{
    FILE* const file = fopen(path, "w");
    // Save header.
    fprintf(file, "%d %d %d\n", t.nips, t.nhid, t.nops);
    // Save biases and weights.
    for(int i = 0; i < t.nb; i++) fprintf(file, "%f\n", (double) t.b[i]);
    for(int i = 0; i < t.nw; i++) fprintf(file, "%f\n", (double) t.w[i]);
    fclose(file);
}

// Loads a tinn from disk.
Tinn xtload(const char* const path)
{
    FILE* const file = fopen(path, "r");
    int nips = 0;
    int nhid = 0;
    int nops = 0;
    // Load header.
    fscanf(file, "%d %d %d\n", &nips, &nhid, &nops);
    // Build a new tinn.
    const Tinn t = xtbuild(nips, nhid, nops);
    // Load biases and weights.
    for(int i = 0; i < t.nb; i++) fscanf(file, "%f\n", &t.b[i]);
    for(int i = 0; i < t.nw; i++) fscanf(file, "%f\n", &t.w[i]);
    fclose(file);
    return t;
}

// Frees object from heap.
void xtfree(const Tinn t)
{
    free(t.w);
    free(t.b);
    free(t.h);
    free(t.o);
}

// Prints an array of floats. Useful for printing predictions.
void xtprint(const float* arr, const int size)
{
    for(int i = 0; i < size; i++)
        printf("%f ", (double) arr[i]);
    printf("\n");
}

void xtprint(const Tinn& tinn)
{
    printf("weights: ");
    xtprint(tinn.w, tinn.nw);
    printf("biases: ");
    xtprint(tinn.b, tinn.nb);
}
Main with tests main.cpp
#include <iostream>
#include "MLP_1.h"
#include "Tinn.h"
#include <array>
#include <iterator>
#include <random>
#include <algorithm>
#include <chrono>

constexpr size_t in = 748;
constexpr size_t hid = 20;
constexpr size_t out = 5;
const char* const path = "tinn01.txt";

template< class Iter >
void fill_with_random_values( Iter start, Iter end, int min, int max)
{
    static std::random_device rd;  // you only need to initialize it once
    static std::mt19937 mte(rd()); // this is a relatively big object to create
    std::uniform_real_distribution<float> dist(min, max);
    std::generate(start, end, [&] () { return dist(mte); });
}

void testMLP(MLP_1<float, in, hid, out>& mlp, const std::array<float, in>& array)
{
    std::cout << "------MLP------\n";
    float sum = 0;
    const float* data = array.data();
    auto start = std::chrono::system_clock::now();
    for (size_t i = 0; i < 60000; ++i)
    {
        const float* inputRes1 = mlp.predict(data);
        sum += inputRes1[0];
    }
    auto end = std::chrono::system_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << "sum:" << sum << "\n";
    std::cout << "elapsed time: " << elapsed.count() << "ms" << "\n";
    std::cout << "------MLP------\n";
}

void testTinn(Tinn& tinn, const std::array<float, in>& array)
{
    std::cout << "------TINN------\n";
    float sum = 0;
    const float* data = array.data();
    auto start = std::chrono::system_clock::now();
    for (size_t i = 0; i < 60000; ++i)
    {
        const float* inputRes1 = xtpredict(tinn, data);
        sum += inputRes1[0];
    }
    auto end = std::chrono::system_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << "sum:" << sum << "\n";
    std::cout << "elapsed time: " << elapsed.count() << "ms" << "\n";
    std::cout << "------TINN------\n";
}

int main()
{
    Tinn sTinn = xtbuild(in, hid, out);
    xtsave(sTinn, path);

    Tinn tinn1 = xtload(path);

    MLP_1<float, in, hid, out> mlp;
    mlp.load(path);

    std::array<float, in> inputTest;
    fill_with_random_values(inputTest.begin(), inputTest.end(), -10.0, 10.0);

    testMLP(mlp, inputTest);
    std::cout << "\n";
    testTinn(tinn1, inputTest);
    return 0;
}
With g++ -std=c++14 -O0 main.cpp I get:
------MLP------
sum:33171.4
elapsed time: 6524ms
------MLP------
------TINN------
sum:33171.4
elapsed time: 2256ms
------TINN------
With g++ -std=c++14 -O3 main.cpp I get:
------MLP------
sum:19567.4
elapsed time: 758ms
------MLP------
------TINN------
sum:19567.4
elapsed time: 739ms
------TINN------
With dynamic memory allocation, the slow part is allocating and freeing memory. There is no memory allocation in the loop you measure, so there is no reason to expect the dynamically allocated version to be slower. And indeed, with -O3 optimization, the runtimes are almost identical.
One difference between the programs that could affect runtime is the use of different random number generators. std::mt19937 is vastly better than rand(), but might be slower.
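To make that point concrete, here is a small hypothetical micro-benchmark (illustrative only, unrelated to Tinn's API) showing that heap allocation costs time only when it happens inside the timed loop:

#include <chrono>
#include <iostream>
#include <vector>

int main()
{
    constexpr size_t n = 20;        // hidden-layer-sized buffer
    constexpr size_t iters = 60000; // same iteration count as the question
    volatile float sink = 0;        // volatile: keep the work observable

    // Case 1: buffer allocated once, reused every iteration.
    auto t0 = std::chrono::steady_clock::now();
    std::vector<float> reused(n);
    for (size_t i = 0; i < iters; ++i) {
        reused[i % n] = float(i);
        sink = sink + reused[0];
    }
    auto t1 = std::chrono::steady_clock::now();

    // Case 2: buffer allocated and freed inside the timed loop.
    for (size_t i = 0; i < iters; ++i) {
        std::vector<float> fresh(n);
        fresh[i % n] = float(i);
        sink = sink + fresh[0];
    }
    auto t2 = std::chrono::steady_clock::now();

    using ms = std::chrono::duration<double, std::milli>;
    std::cout << "reused buffer:      " << ms(t1 - t0).count() << " ms\n";
    std::cout << "allocate each time: " << ms(t2 - t1).count() << " ms\n";
}

Neither case matches the networks above exactly; the point is only that the measured loops in testMLP and testTinn both belong to case 1, so allocation strategy cannot separate them.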

Strided vs shuffling reduction

I recently watched the CppCon talk about using Clang to compile CUDA code, where the speaker, after talking a bit about the architecture, implements a sum reduction. I was interested in his approach, which does the reduction by a shfl of the elements in the block, so with no working example available I took his code, modified it a little, and got a max-reduction.
The thing is that this max reduction is very slow: compared to a CPU implementation finding the max of 2^22 elements, I get about ~90 ms against ~20 ms. Here is the code for the shfl reduction:
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

using namespace std;

// Global reduce test
__global__ void d_max_reduce(const int *in, int *out, size_t N) {
    int sum = 0;  // running maximum (name kept from the sum-reduction original)
    size_t start = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
    for (size_t i = start; i < start + 4 && i < N; i++) {
        sum = max(__ldg(in + i), sum);
    }
    // Warp-level reduction via shuffles.
    for (int i = 16; i; i >>= 1) {
        sum = max(__shfl_down(sum, i), sum);
    }
    __shared__ int shared_max;
    shared_max = 0;
    __syncthreads();
    if (!(threadIdx.x % 32)) {
        atomicMax(&shared_max, sum);
    }
    __syncthreads();
    if (!threadIdx.x) {
        atomicMax(out, shared_max);
    }
}

int test_max_reduce(std::vector<int> &v) {
    int *in, *out;
    cudaMalloc(&in, v.size() * sizeof(int));
    cudaMalloc(&out, sizeof(int));
    cudaMemcpy(in, v.data(), v.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(out, 0, sizeof(int));
    int threads = 256;
    d_max_reduce<<<ceil((float)v.size() / (threads * 4)), threads>>>(in, out, v.size());
    int res;
    cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(in);
    cudaFree(out);
    return res;
}
So I took one of Nvidia's examples of a strided reduction (which is also a sum reduction), changed it to a max, and got times of about 7 ms. Here is the code for the strided reduction:
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

__global__ void d_max_reduction(const int *in, int *out, size_t N) {
    extern __shared__ int s_data[];
    size_t tid = threadIdx.x;
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        s_data[tid] = in[i];
    else
        s_data[tid] = 0;
    __syncthreads();
    for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            s_data[tid] = max(s_data[tid], s_data[tid + s]);
        __syncthreads();
    }
    if (!tid)
        atomicMax(out, s_data[0]);
}

int test_max_reduction(std::vector<int> &v) {
    int *in;
    int *out;
    cudaMalloc(&in, v.size() * sizeof(int));
    cudaMalloc(&out, sizeof(int));
    cudaMemcpy(in, v.data(), v.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(out, 0, sizeof(int));
    int threads = 128;
    d_max_reduction<<<ceil((float)v.size() / threads),
                      threads,
                      threads * sizeof(int)>>>(in, out, v.size());
    int res;
    cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(in);
    cudaFree(out);
    return res;
}
And, just in case, here is the rest, so there is an MWE:
#include <random>
#include <vector>    // std::vector (needed by the declarations below)
#include <algorithm> // std::max
#include <timer.hpp>

int test_max_reduce(std::vector<int> &v);
int test_max_reduction(std::vector<int> &v);

int main() {
    int N = 2000 * 2000; // * 2000;
    std::vector<int> vec(N);
    std::random_device dev;
    std::mt19937 mt(dev());
    std::uniform_int_distribution<int> dist(0, N << 2);
    for (size_t i = 0; i < vec.size(); i++) {
        vec[i] = dist(mt);
    }

    measure("GPU (shfl)", test_max_reduce, vec);
    measure("GPU strided", test_max_reduction, vec);
    measure("CPU",
            [](std::vector<int> &vec) -> int {
                int maximum = 0;
                for (size_t i = 0; i < vec.size(); i++) {
                    maximum = std::max(maximum, vec[i]);
                }
                return maximum;
            },
            vec);
    return 0;
}
And timer.hpp is
#ifndef TIMER_HPP
#define TIMER_HPP
#include <chrono>
#include <string>
#include <iostream>

template <typename F, typename ...Args>
void measure(std::string msg, F func, Args&&... args) {
    auto start = std::chrono::steady_clock::now();
    int val = func(std::forward<Args>(args)...);
    auto end = std::chrono::steady_clock::now();
    std::cout << msg << " Test " << std::endl;
    std::cout << "  Max Value : " << val << std::endl;
    std::cout << "  Time : ";
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>
                     (end - start).count() << std::endl;
}
#endif // TIMER_HPP
I generally get the following times
GPU (shfl) Test
Max Value : 15999999
Time : 86
GPU strided Test
Max Value : 15999999
Time : 7
CPU Test
Max Value : 15999999
Time : 23
EDIT: new timings after warmup
GPU (shfl) Test
Max Value : 16000000
Time : 4
GPU strided Test
Max Value : 16000000
Time : 6
CPU Test
Max Value : 16000000
Time : 23
So my more general question is: why is the shfl version slower than the strided one? It breaks down into:
Am I missing something in the launch parameters, or doing/assuming something wrong?
When is the shfl intrinsic recommended over a strided loop, and vice versa?
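For reference, the EDIT timings were taken after a warmup. A sketch of what that means (the exact warmup code is not in the original post): the first CUDA API call pays for context creation, and each kernel's first launch pays for one-time setup, so run everything once before timing.

#include <cuda_runtime.h>
#include <vector>
#include <timer.hpp>

int test_max_reduce(std::vector<int> &v);
int test_max_reduction(std::vector<int> &v);

int main() {
    std::vector<int> vec(2000 * 2000, 1);

    cudaFree(0);              // force CUDA context creation
    test_max_reduce(vec);     // first launches: include one-time setup cost
    test_max_reduction(vec);

    // Timed runs now reflect (mostly) kernel and transfer cost.
    measure("GPU (shfl)", test_max_reduce, vec);
    measure("GPU strided", test_max_reduction, vec);
    return 0;
}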

quicksort, helper class problems, timesort, vector, c++

My biggest problem is getting the quickSortHelper function to work. I know what I want its parameters to be; what I can't figure out is what to call inside it. I've tried a mixture of partition and quickSort, but I can't get it right. The code is written this way because I will later use a timeSort class to run and time 6+ sorting algorithms. I got it to work by just throwing the code inside main, but all I want inside main is what I have here.
#include <iostream>
#include <algorithm>
#include <vector>
#include <chrono>
#include <functional>
#include <random>
// Not all of the above libraries are used yet; once quickSort is working
// I plan on adding 5 other sorting algorithms, where these are necessary.

using namespace std;

void quickSort(vector<int>&, int, int);
int partition(vector<int>&, int, int);
double timeSort(vector<int> &v, function<void(vector<int>&)> f);

int main()
{
    vector<int> intVec(1000);
    generate(intVec.begin(), intVec.end(), rand);
    int p = 0;
    int q = 1000;
    quickSort(intVec, p, q);
    auto time = timeSort(intVec, quickSort); // does not compile: quickSort takes 3 arguments
    for (auto i = 0u; i != intVec.size(); ++i)
        cout << intVec[i] << " ";
    cout << "\nQuick sort took " << time << " nanoseconds\n";
    char chubby;
    cin >> chubby;
    return 0;
}

double timeSort(vector<int> &v, function<void(vector<int>&)> f)
{
    auto start = chrono::high_resolution_clock::now();
    f(v);
    auto end = chrono::high_resolution_clock::now();
    return static_cast<double>(((end - start).count()));
}

int partition(vector<int>& intVec, int p, int q)
{
    int x = intVec[p];
    int i = p;
    int j;
    for (j = p + 1; j < q; j++)
    {
        if (intVec[j] <= x)
        {
            i = i + 1;
            swap(intVec[i], intVec[j]);
        }
    }
    swap(intVec[i], intVec[p]);
    return i;
}

void quickSort(vector<int>& intVec, int p, int q)
{
    int r;
    if (p < q)
    {
        r = partition(intVec, p, q);
        quickSort(intVec, p, r);
        quickSort(intVec, r + 1, q);
    }
}

void quickSortHelper(vector<int>& intVec)
{
    // I want to call the timeSort function with quickSortHelper. I can't use
    // quickSort directly because timeSort only takes 2 parameters: the vector
    // to be sorted, and the method of sorting it.
}
I suggest simplifying your program:
int main(void)
{
    vector<int> intVec(1000);
    long long duration = 0; // accumulate nanoseconds; a plain unsigned int would overflow
    for (unsigned int iteration = 0;
         iteration < 1000000;
         ++iteration)
    {
        generate(intVec.begin(), intVec.end(), rand);
        int p = 0;
        int q = 1000;
        auto start = chrono::high_resolution_clock::now();
        quickSort(intVec, p, q);
        auto end = chrono::high_resolution_clock::now();
        // (end - start) is a duration; convert it before accumulating
        duration += chrono::duration_cast<chrono::nanoseconds>(end - start).count();
    }
    cout << "Average time for quicksort: " << (duration / 1000000) << " ns\n";
    cout.flush();
    return 0;
}
I made the following changes:
1) Run the sort for many iterations to get an average duration.
2) Remove the timing function; it only complicates things.
Umm... If I understand correctly, this should do it:
void quickSortHelper(vector<int>& intVec)
{
    quickSort(intVec, 0, intVec.size());
}
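With that helper in place, main can pass it to timeSort, whose signature expects a callable taking only the vector; a sketch of the intended usage:

vector<int> intVec(1000);
generate(intVec.begin(), intVec.end(), rand);
auto time = timeSort(intVec, quickSortHelper);
cout << "Quick sort took " << time << " nanoseconds\n";

An equivalent one-off, without a named helper, is a lambda: timeSort(intVec, [](vector<int>& v) { quickSort(v, 0, v.size()); });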