How do I execute both cufftXt and CUDA kernels on multiple GPUs?

I would like to use two GPUs to execute a kernel then execute a single FFT using cufftXt. The data could be several GBs in size.
My understanding of allocating memory for kernels on 2 GPUs is that you should split the host array in half and send the first half to GPU0 and the other half to GPU1. The following example shows how this could be done.
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
// cube each real sample; zero the padding columns (x >= N) of the in-place R2C layout
__global__ void Cube (cufftReal *data, cufftReal *data3, int N, int real_size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<real_size){
float x = (i % (N+2));
if(x < N){
data3[i] = pow(data[i], 3.0f);
}
else{
data3[i] = 0.0f;
}
}
// note: a trailing __syncthreads() is unnecessary here, since each thread touches only its own element
}
int main (int argc, char **argv) {
int x;
int N = 8;
int cplx_size = N * (N/2 + 1);
int real_size = 2 * cplx_size;
int mem_size = sizeof(cufftReal)*real_size;
int half_real_size = real_size/2;
int half_mem_size = mem_size/2;
cufftReal *h_data = (cufftReal*)malloc(mem_size);
cufftReal *h_data3 = (cufftReal*)malloc(mem_size);
cufftReal *h0_data = (cufftReal*)malloc(half_mem_size);
cufftReal *h0_data3 = (cufftReal*)malloc(half_mem_size);
cufftReal *h1_data = (cufftReal*)malloc(half_mem_size);
cufftReal *h1_data3 = (cufftReal*)malloc(half_mem_size);
for(int i=0; i<real_size; i++){
x = (i % (N+2));
if(x < N){h_data[i] = 2;}
else{h_data[i] = 0;}
}
for(int i=0; i<half_real_size; i++){
h0_data[i] = h_data[i];
h1_data[i] = h_data[i+half_real_size];
}
cufftReal *d0_data;
cufftReal *d0_data3;
cufftReal *d1_data;
cufftReal *d1_data3;
cudaSetDevice(0);
gpuErrchk(cudaMalloc((void**)&d0_data, half_mem_size));
gpuErrchk(cudaMalloc((void**)&d0_data3, half_mem_size));
cudaSetDevice(1);
gpuErrchk(cudaMalloc((void**)&d1_data, half_mem_size));
gpuErrchk(cudaMalloc((void**)&d1_data3, half_mem_size));
cout <<"device memory allocated" <<endl;
int maxThreads=(N>1024)?1024:N;
int threadsPerBlock = maxThreads;
// round up so every element is covered when half_real_size is not a multiple of the block size
int numBlocks = (half_real_size + threadsPerBlock - 1)/threadsPerBlock;
cudaSetDevice(0);
gpuErrchk(cudaMemcpy(d0_data, h0_data, half_mem_size, cudaMemcpyHostToDevice));
cudaSetDevice(1);
gpuErrchk(cudaMemcpy(d1_data, h1_data, half_mem_size, cudaMemcpyHostToDevice));
cout <<"mem copied to devices" <<endl;
cudaSetDevice(0);
Cube <<<numBlocks, threadsPerBlock>>> (d0_data, d0_data3, N, half_real_size);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
cudaSetDevice(1);
Cube <<<numBlocks, threadsPerBlock>>> (d1_data, d1_data3, N, half_real_size);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
cudaSetDevice(0);
gpuErrchk(cudaMemcpy(h0_data3, d0_data3, half_mem_size, cudaMemcpyDeviceToHost));
cudaSetDevice(1);
gpuErrchk(cudaMemcpy(h1_data3, d1_data3, half_mem_size, cudaMemcpyDeviceToHost));
cout <<endl;
for(int i = 0; i<half_real_size; i++){
cout <<h0_data3[i] <<" ";
}
cout <<endl;
for(int i = 0; i<half_real_size; i++){
cout <<h1_data3[i] <<" ";
}
//clean up
cudaFree(d0_data);
cudaFree(d0_data3);
cudaFree(d1_data);
cudaFree(d1_data3);
free(h_data); free(h_data3);
free(h0_data); free(h0_data3);
free(h1_data); free(h1_data3);
return 0;
}
However, I do not see how this approach is compatible with cufftXt. It appears that I should use the helper function cufftXtMemcpy to automatically split the data across the devices. But if I do that, then the multi-GPU kernel method shown above is not usable unless I allocate separate device memory for cufftXt and for the kernels. Is there any way to run both cufftXt and kernels without allocating device memory twice?

Here is how I did it, following the simpleCUFFT_2d_MGPU code sample from the toolkit. I am not sure it is completely correct; it is 50% slower on 2 GPUs than it was on 1. I tested this code (against another version using R2C and C2R FFTs) on Tesla K40 GPUs.
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
// each GPU holds n/nGPUs elements of the descriptor data, so bound the
// index by the per-GPU count rather than by the full problem size n
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int N, int n, int nGPUs) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n/nGPUs){
data3[i].x = pow(data[i].x, 3.0f);
data3[i].y = 0;
}
}
__global__ void Normalize (cufftComplex *data, int N, int n, int nGPUs){
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n/nGPUs){
// the inverse C2C transform is unnormalized: divide both components by the total size n
data[i].x /= n;
data[i].y /= n;
}
}
int main (int argc, char **argv) {
int x, y;
int N = 8192;
int n = N*N;
//int cplx_size = N * (N/2 + 1);
//int real_size = 2 * cplx_size;
int mem_size = sizeof(cufftComplex)*n;
int maxThreads=(N>1024)?1024:N;
int threadsPerBlock = maxThreads;
// enough blocks to cover a full-size array; with the per-GPU bound in the
// kernels, the surplus threads on each GPU simply exit
int numBlocks = (n)/threadsPerBlock;
cout <<"numBlocks " <<numBlocks <<endl;
cufftComplex *h_data;
h_data = (cufftComplex*)malloc(mem_size);
cufftComplex *h_data3 = (cufftComplex*)malloc(mem_size);
cout <<"host data allocated" <<endl;
int index;
float lambda = N*.1;
for(y=0; y<N; y++){
for(x=0; x<N; x++){
//cout <<x <<" " <<y <<endl;
index = x + y*N;
h_data[index].x = cos(2*M_PI*(x+y)/lambda);
h_data[index].y = 0;
}
}
cout <<"host data values set" <<endl;
cufftResult res;
int device;
int nGPUs;
cudaGetDeviceCount(&nGPUs);
cout <<nGPUs <<" CUDA devices" <<endl;
size_t total_mem, free_mem;
for(int i=0; i<nGPUs; i++){
cudaSetDevice(i); // cudaMemGetInfo reports on the current device
cudaMemGetInfo(&free_mem, &total_mem);
cout <<"GPU" <<i <<" used memory " <<(total_mem-free_mem)/pow(10,9) <<" GB" <<endl;
}
int whichGPUs[nGPUs];
for(int i=0; i<nGPUs; i++){
whichGPUs[i]=i;
}
cout <<"whichgpus set" <<endl;
size_t* worksize;
worksize =(size_t*)malloc(sizeof(size_t) * nGPUs);
cout <<"worksize set" <<endl;
cufftHandle plan_complex;
res = cufftCreate(&plan_complex);
if (res != CUFFT_SUCCESS){cout <<"create plan failed" <<endl;}
res = cufftXtSetGPUs(plan_complex, nGPUs, whichGPUs);
if (res != CUFFT_SUCCESS){cout <<"setgpus forward failed" <<endl;}
cout <<"set gpus" <<endl;
res = cufftMakePlan2d(plan_complex, N, N, CUFFT_C2C, worksize);
if (res != CUFFT_SUCCESS){cout <<"make plan forward failed" <<endl;}
cout <<"plan created" <<endl;
cudaLibXtDesc *d_data;
cudaLibXtDesc *d_data3;
res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data, CUFFT_XT_FORMAT_INPLACE);
if (res != CUFFT_SUCCESS){cout <<"data malloc failed" <<endl;}
res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data3, CUFFT_XT_FORMAT_INPLACE);
if (res != CUFFT_SUCCESS){cout <<"data3 malloc failed" <<endl;}
cout <<"xtmalloc done" <<endl;
res = cufftXtMemcpy (plan_complex, d_data, h_data, CUFFT_COPY_HOST_TO_DEVICE);
if (res != CUFFT_SUCCESS){cout <<"memcpy to device failed" <<endl;}
cout <<"memcpy h to d" <<endl;
int tmax = 10000;
int start = time(0);
for(int tau=0; tau<tmax; tau++){
res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_FORWARD);
if (res != CUFFT_SUCCESS){cout <<"cufftXtExec failed" <<endl; return 0;}
res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_INVERSE);
if (res != CUFFT_SUCCESS){cout <<"cufftXtExec failed" <<endl; return 0;}
for(int i=0; i<nGPUs; i++){
device = d_data->descriptor->GPUs[i];
cudaSetDevice(device);
Normalize <<<numBlocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], N, n, nGPUs);
}
// synchronize every device that ran a kernel, not only the one set last
for(int i=0; i<nGPUs; i++){
cudaSetDevice(d_data->descriptor->GPUs[i]);
cudaDeviceSynchronize();
}
}
int stop = time(0);
cout <<tmax <<" timesteps" <<endl <<(stop-start) <<" seconds"<<endl;
/*
for(int i=0; i<nGPUs; i++){
device = d_data->descriptor->GPUs[i];
cudaSetDevice(device);
Cube <<<numBlocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], (cufftComplex*) d_data3->descriptor->data[i], N, n, nGPUs);
}
*/
/*
cudaDeviceSynchronize();
res = cufftXtMemcpy (plan_complex, h_data, d_data, CUFFT_COPY_DEVICE_TO_HOST);
if (res != CUFFT_SUCCESS){cout <<"memcpy to host failed" <<endl;}
cout <<"memcpy d to h" <<endl;
ofstream fout;
ostringstream outstr;
outstr.precision(4);
outstr <<time(0) <<".dat";
string filename=outstr.str();
fout.open(filename.c_str());
fout.precision(4);
for (int i = 0; i < n; i++) {
x = (i % (N));
y = (i /(N))%N;
fout <<x <<" " <<y <<" " <<h_data[i].x <<endl;
}
fout.close();
*/
//clean up
res = cufftXtFree(d_data);
if (res != CUFFT_SUCCESS){cout <<"free data failed" <<endl;}
res = cufftXtFree(d_data3);
if (res != CUFFT_SUCCESS){cout <<"free data3 failed" <<endl;}
cufftDestroy(plan_complex);
free(worksize);
free(h_data); free(h_data3);
return 0;
}
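For reference, here is the pattern from the code above distilled (my reading of it, not an official recipe): after cufftXtMalloc, each GPU's slice of the data is reachable through the descriptor, so an elementwise kernel can run on every slice without any second allocation. Per-GPU byte counts come from descriptor->size[i]:
// Sketch: run an elementwise kernel on each GPU's slice of a cudaLibXtDesc.
// Element counts are taken from the descriptor, so no separate device
// allocation is needed for the kernel.
for(int i=0; i<nGPUs; i++){
cudaSetDevice(d_data->descriptor->GPUs[i]);
size_t elems = d_data->descriptor->size[i] / sizeof(cufftComplex);
int blocks = (int)((elems + threadsPerBlock - 1) / threadsPerBlock);
Normalize <<<blocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], N, n, nGPUs);
}
for(int i=0; i<nGPUs; i++){ // synchronize every device that ran work
cudaSetDevice(d_data->descriptor->GPUs[i]);
cudaDeviceSynchronize();
}
Because multi-GPU cuFFT can hold data in a permuted order between the forward and inverse transforms, this pattern is only straightforward for purely elementwise kernels like Normalize, where element order does not matter.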

Related

CUDA thrust sort much slower when called from inside kernel [duplicate]

This question already has an answer here: Accelerating __device__ function in Thrust comparison operator (1 answer)
#include <iostream>
#include <math.h>
#include <vector>
#include <assert.h>
#include <fstream>
#include <map>
#include <algorithm>
#include <sstream>
#include <chrono>   // std::chrono, used by MyTimer below
#include <cstring>  // memcpy
#include <cuda_runtime_api.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>
#include <cub/cub.cuh>
using namespace std;
typedef float real;
int MAX_N = 10000000;
int N;
real* a, *b;
real* d_a;
real* h_res1, *h_res2;
volatile real v_res = 0;
class MyTimer {
std::chrono::time_point<std::chrono::system_clock> start;
public:
void startCounter() {
start = std::chrono::system_clock::now();
}
int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
}
int64_t getCounterMs() {
return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
}
double getCounterMsPrecise() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
/ 1000000.0;
}
};
void genData()
{
N = 100000;
for (int i = 0; i < N; i++) a[i] = float(rand() % 1000) / (rand() % 1000 + 1);
}
void __attribute__((noinline)) testCpu(real* arr, real* res, int N)
{
std::sort(arr, arr + N);
v_res = arr[rand() % N];
memcpy(res, arr, N * sizeof(real));
}
__global__
void sort_kernel(float* a, int N)
{
if (blockIdx.x==0 && threadIdx.x==0)
thrust::sort(thrust::device, a, a + N);
__syncthreads();
}
void __attribute__((noinline)) testGpu(real* arr, real* res, int N)
{
MyTimer timer;
timer.startCounter();
cudaMemcpy(d_a, arr, N * sizeof(float), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
cout << "Copy H2D cost = " << timer.getCounterMsPrecise() << "\n";
timer.startCounter();
//thrust::sort(thrust::device, d_a, d_a + N);
sort_kernel<<<1,1>>>(d_a, N);
cudaDeviceSynchronize();
cout << "Thrust sort cost = " << timer.getCounterMsPrecise() << "\n";
timer.startCounter();
cudaMemcpy(res, d_a, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cout << "Copy D2H cost = " << timer.getCounterMsPrecise() << "\n";
v_res = res[rand() % N];
}
void __attribute__((noinline)) deepCopy(real* a, real* b, int N)
{
for (int i = 0; i < N; i++) b[i] = a[i];
}
void testOne(int t, bool record = true)
{
MyTimer timer;
genData();
deepCopy(a, b, N);
timer.startCounter();
testCpu(a, h_res1, N);
cout << "CPU cost = " << timer.getCounterMsPrecise() << "\n";
timer.startCounter();
testGpu(b, h_res2, N);
cout << "GPU cost = " << timer.getCounterMsPrecise() << "\n";
for (int i = 0; i < N; i++) {
if (h_res1[i] != h_res2[i]) {
cout << "ERROR " << i << " " << h_res1[i] << " " << h_res2[i] << "\n";
exit(1);
}
}
cout << "-----------------\n";
}
int main()
{
a = new real[MAX_N];
b = new real[MAX_N];
cudaMalloc(&d_a, MAX_N * sizeof(float));
cudaMallocHost(&h_res1, MAX_N * sizeof(float));
cudaMallocHost(&h_res2, MAX_N * sizeof(float));
testOne(0, 0);
for (int i = 1; i <= 50; i++) testOne(i);
}
For legacy code reasons, I have to perform the sort entirely inside a kernel. Basically, I need:
__global__ void mainKernel(float** a, int N, float* global_pad)
{
int x;
...
cooperative_groups::grid_group g = cooperative_groups::this_grid();
sortFunc(a[x], N); // this can be a kernel. Then only 1 thread in the grid will call it
g.sync();
...
}
I tried to use thrust::sort but it's extremely slow. For example, with N = 100000, the benchmark result is:
CPU cost = 5.82228
Copy H2D cost = 0.088908
Thrust sort from CPU cost = 0.391211 (running line thrust::sort(thrust::device, d_a, d_a + N);)
Thrust sort inside kernel cost = 116 (running line sort_kernel<<<1,1>>>(d_a, N);)
Copy D2H cost = 0.067639
Why is thrust::sort so slow in this case? I want to find an implementation of sortFunc that is as fast as possible (global_pad can be used as temporary memory).
Edit: I'm using 2080ti and CUDA 11.4. The compile command I use is
nvcc -o main main.cu -O3 -std=c++17
You need to turn on relocatable device code, which enables dynamic parallelism, in the compile command.
Use -rdc=true: nvcc -o main main.cu -O3 -std=c++17 -rdc=true.
Then the two code blocks below are equivalent:
__global__
void sort_kernel(float* a, int N)
{
if (blockIdx.x==0 && threadIdx.x==0)
thrust::sort(thrust::device, a, a + N);
__syncthreads();
}
...
sort_kernel<<<1,1>>>(d_a, N);
and
thrust::sort(thrust::device, d_a, d_a + N);
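A different route (not what the answer above proposes): if every thread in one block can participate in the sort, and the data fits in one block's registers and shared memory, CUB's block-wide radix sort avoids dynamic parallelism entirely. A minimal sketch; BLOCK_THREADS and ITEMS_PER_THREAD are illustrative, and this covers far fewer elements than the asker's N = 100000:
#include <cub/cub.cuh>
// Sketch: cooperative in-kernel sort with cub::BlockRadixSort.
// Sorts BLOCK_THREADS * ITEMS_PER_THREAD floats with a single block.
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void block_sort(const float *d_in, float *d_out)
{
using BlockRadixSort = cub::BlockRadixSort<float, BLOCK_THREADS, ITEMS_PER_THREAD>;
__shared__ typename BlockRadixSort::TempStorage temp_storage;
float items[ITEMS_PER_THREAD];
cub::LoadDirectBlocked(threadIdx.x, d_in, items); // each thread loads its consecutive items
BlockRadixSort(temp_storage).Sort(items);         // collective: all threads take part
cub::StoreDirectBlocked(threadIdx.x, d_out, items);
}
// usage: block_sort<256, 4><<<1, 256>>>(d_in, d_out); // sorts 1024 floats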

How do I reverse a floating-point array, with its size input by the user, in a separate function using a "for" loop?

I know there are many libraries included in this code; that's because I was trying everything and also testing some other things. The problem is that the function called "wspak" is not output correctly in my main function: for example, it outputs the first or last element 10 times instead of outputting all 10 elements once. I hope you will understand this code even though I use Polish names for my variables. This is my code so far:
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <windows.h>
#include <time.h>
#include <stdio.h>
#include <conio.h>
#include <random>
#include <stdlib.h>
#include <algorithm>
using namespace std;
float maxitab(int n, float tab[]); // function declaration
float minitab(int n, float tab[]); // function declaration
float wspak(int n, float tab[]); // function declaration
double rand_double()
{
return ((double)rand()) / ((double)RAND_MAX);
}
double rand_double_interval(double a, double b)
{
return rand_double() * (b - a) + a;
}
int main(int argc, char* argv[])
{
srand(time(NULL));
int n;
float* tab;
cout << "Program wyszuka najwiekszy element tablicy.\n";
cout << "Podaj ilosc elementow tablicy: ";
cin >> n;
tab = new float[n];
for (int i = 0; i < n; i++)
{
float wylosowane_liczby = rand_double_interval(-1, 1);
tab[i] = wylosowane_liczby;
}
cout << "WYLOSOWANE z przedzialu <-1,1> elementy tablicy to: " << endl;
for (int i = 0; i < n; i++)
{
cout << tab[i] << endl;
}
cout << "\nNajwiekszy WYLOSOWANY z przedzialu <-1,1> element tablicy to: " << maxitab(n, tab);
cout << "\nNajmniejszy WYLOSOWANY z przedzialu <-1,1> element tablicy to: " << minitab(n, tab);
cout << "\nTablica od konca do poczatku: " << endl;
for (int i = 0; i < n; i++)
{
cout << wspak(n, tab);
}
delete[] tab;
cout << endl;
cout << endl;
cout << endl;
return 0;
}
float maxitab(int n, float tab[]) // function definition
{
float maximum = tab[0];
for (int i = 1; i < n; i++)
{
if (tab[i] > maximum)
{
maximum = tab[i];
}
}
return maximum;
}
float minitab(int n, float tab[]) // function definition
{
float minimum = tab[0];
for (int i = 1; i < n; i++)
{
if (tab[i] <= minimum)
{
minimum = tab[i];
}
}
return minimum;
}
float wspak(int n, float tab[]) // function definition
{
float wspak = 0;
for (int i = n - 1; i >= 0; i--)
{
if (tab[i] < wspak)
{
wspak = tab[i];
}
}
return wspak;
}
wspak should print the array itself instead of returning a single value. Declare it as void, call it once rather than inside a loop, and let it do the printing:
void wspak(int n, float tab[]); // declaration
wspak(n, tab); // one call prints the whole array
void wspak(int n, float tab[]) // definition
{
for (int i = n - 1; i >= 0; i--)
{
cout << tab[i] << ' ' << endl;
}
cout << '\n';
}
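As a side note (not part of the answer above): since <algorithm> is already included, the standard library can do the reversal in place if an explicit loop is not required by the assignment:
std::reverse(tab, tab + n); // reverse the array in place
for (int i = 0; i < n; i++) cout << tab[i] << ' ';
cout << '\n';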

Why LAPACKE_dsygvd returns error after changing size of matrix?

I am trying to solve the generalized eigenvalue problem for the hydrogen atom by using LAPACKE_dsygvd. For the parameters of the generator functions, I use an interval that starts at 0.01 and takes N steps of 0.01. What I change is the value of N. Everything is fine for N = 14 and below, where I recover the analytical solution. However, when I choose N = 15 and above, I get an error: info is returned with a value > N. The LAPACK documentation says the following:
N: if INFO = N + i, for 1 <= i <= N, then the leading
minor of order i of B is not positive definite.
The factorization of B could not be completed and
no eigenvalues or eigenvectors were computed.
But I have checked my matrix B and it is positive definite. I don't know what is wrong.
Below is my code:
#include <cmath>
#include <cstdio>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include "library.h"
#include "mkl.h"
using namespace std;
double Superposition(const double ai, const double aj, const int m);
double Hamiltonian(const double ai, const double aj, const int m);
void print_matrix(double *A, int n) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
printf("%.7f ", A[i*n + j]);
}
cout << "\n";
}
}
void print_vector(double *vec, int n) {
for (int i = 0; i < n; i++) {
cout << vec[i] << " ";
}
cout << "\n";
}
double* interval(double min, double step) {
double *result;
result = (double *)mkl_malloc( N*sizeof( double ), 64 );
for (int i = 0; i < N; i++) {
result[i] = min + i*step;
}
return result;
}
int main() {
cout << Ry << "\n";
double *S, *H, *I, *eigenvalues;
double alpha, beta;
int i, j, info;
const char* uplo = "U"; const char* jobz = "V";
I = interval(0.01, 0.01);
alpha = 1.0; beta = 0.0;
S = (double *)mkl_malloc( N*N*sizeof( double ), 64 );
H = (double *)mkl_malloc( N*N*sizeof( double ), 64 );
eigenvalues = (double *)mkl_malloc( N*sizeof( double ), 64 );
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
int index = i*N + j;
if (j < i) {
S[index] = 0.0;
H[index] = 0.0;
}
else {
S[index] = Superposition(I[i], I[j], m);
H[index] = Hamiltonian(I[i], I[j], m);
}
}
}
print_matrix(S, N); cout << "\n";
print_matrix(H, N); cout << "\n" << "\n";
info = LAPACKE_dsygv(LAPACK_ROW_MAJOR, 1, *jobz, *uplo, N,
H, N, S, N, eigenvalues);
//print_matrix(H, N); cout << "\n";
//for (i = 0; i < N; i++) {
// eigenvalues[i] /= Ry;
//}
cout << info << "\n" << "\n";
print_matrix(H, N); cout << "\n";
print_vector(eigenvalues, N);
mkl_free(S);
mkl_free(H);
mkl_free(I);
mkl_free(eigenvalues);
}
Edit: I used dsygvd as included in MKL, and the same error does not occur. However, the two functions give very different results for the same inputs.
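One way to cross-check LAPACK's complaint numerically (my suggestion, not part of the original post) is to run the same Cholesky factorization dsygv performs internally on a copy of S, using LAPACKE_dpotrf; an info value i > 0 there corresponds to the info = N + i returned by dsygv:
// Sketch: test positive definiteness of S the way dsygv does, via Cholesky.
// dpotrf with 'U' reads only the upper triangle, which is the part filled above.
double *B = (double *)mkl_malloc( N*N*sizeof( double ), 64 );
for (int k = 0; k < N*N; k++) B[k] = S[k]; // dsygv overwrites its inputs, so test a copy
int chol = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'U', N, B, N);
if (chol > 0) printf("leading minor of order %d is not positive definite (numerically)\n", chol);
mkl_free(B);
A matrix can be positive definite analytically yet fail this test in double precision; overlap matrices built from nearly linearly dependent basis functions become ill-conditioned quickly as N grows, which would match the failure appearing only from N = 15 on.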

Weird crashing program while debug runs smoothly (Eclipse C++)

I'm writing a program for my algorithmic math class at university and I'm using Win 7 (x64), Eclipse Oxygen.1a Release (4.7.1a) with MinGW 6.3.0.
Whenever I build and run the program, it crashes with Windows claiming 'Abgabe3.exe stopped working', but when I try to find the problem using the debugger and breakpoints, I step through the whole program and it finishes without errors...
I stripped out everything not used by the problematic function, copied it into a separate file, and the exact same problem occurs.
Maybe somebody has a clue what is happening on my side. ^^
#include <math.h> /* pow, sqrt */
#include <iostream> /* cin, cout */
#include <new> /* new */
#include <string> /* string */
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
using namespace std;
void NORM(double* res, double* x, int n){
res[0] = 0.0;
for(int i = 0; i < n; i++){
res[0] += pow(x[i], 2);
}
res[0] = sqrt(res[0]);
}
void initRand(double* x, int n){
srand (time(NULL) * rand());
for(int i = 0; i < n; i++){
x[i] = (((double) rand()) / ((double) RAND_MAX));
}
}
void createArray(double* &x, int n){
if (n > 0){
x = new double[n];
initRand(x, n);
}
}
void printArray(double* x, int n){
if (x != NULL){
cout<<"(\n";
for(int i = 0; i < n; i++){
if(i+1 == n) cout<<x[i];
else if ((i % 5) == 0) cout<<x[i];
else if ( ((i+1) % 5) == 0 ){
cout<<", "<<x[i]<<"\n";
}
else {
cout<<", "<<x[i];
}
}
cout<<"\n)\n";
}
else cout<<"\nError: pointer = NULL\n";
}
unsigned long long int bin(unsigned int n, unsigned int k){
unsigned long long res = 1;
if(k == 0) return 1;
else if( n >= k){
for(unsigned long long int i = 1; i <= k; i++){
res = res * (n + 1 - i) / i; // multiply first so the integer division is exact
}
}
else return 0;
return res;
}
void newArray(double** x, unsigned int v, unsigned int n){
for(unsigned int i = 0; i < v; i++){
double* ptr = x[i];
createArray(ptr,n);
x[i] = ptr;
}
}
void experiment(double** vektorArray){
unsigned int n = 10, v = 20;
cout<<"Dimension n = "<<n<<"\nNumber of trials v = "<<v<<endl;
// create the vectors
cout<<"Creation - start\n";
vektorArray = new double*[n];
newArray(vektorArray, v, n);
cout<<"Creation - done\n";
for(unsigned int i = 0; i < v; i++){
if(i%10 == 0) printArray(vektorArray[i], n);
}
}
int main(int argc, char** argv){
double** vektorArray = NULL;
experiment(vektorArray);
return 0;
}
vektorArray = new double*[n];
created an array of size n, but
void newArray(double** x, unsigned int v, unsigned int n)
{
for (unsigned int i = 0; i < v; i++)
{
double* ptr = x[i];
createArray(ptr, n);
x[i] = ptr;
}
}
and
for (unsigned int i = 0; i < v; i++)
{
if (i % 10 == 0)
printArray(vektorArray[i], n);
}
index that array with v. Looks like you got your variables crossed. Strongly recommend giving variables better, more descriptive names to help make this more obvious.
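In other words, the fix is a one-word change in experiment():
// allocate one pointer per trial (v), not per dimension (n)
vektorArray = new double*[v];
newArray(vektorArray, v, n);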

Strided vs shuffling reduction

I recently watched the CppCon talk about using Clang to compile CUDA code, where the speaker, after discussing the architecture a bit, implements a sum reduction. I was interested in his approach, which does the reduction with a shfl of the elements in the block, so with no working example available I took his code, modified it a little, and got a max reduction.
The thing is that this max reduction is very slow: compared to a CPU implementation finding the max of 2^22 elements, I get times of about ~90 ms versus ~20 ms. Here is the code for the shfl reduction
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>
using namespace std;
// Global reduce test
// Global max-reduce: warp shuffle within each warp, then a shared-memory
// atomic per warp within the block, then one global atomic per block.
__global__ void d_max_reduce(const int *in, int *out, size_t N) {
int sum = 0;
size_t start = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
for (size_t i = start; i < start + 4 && i < N; i++) {
sum = max(__ldg(in + i), sum);
}
for (int i = 16; i; i >>= 1) {
sum = max(__shfl_down(sum, i), sum);
}
__shared__ int shared_max;
if (!threadIdx.x) shared_max = 0; // initialize once, not from every thread
__syncthreads();
if (!(threadIdx.x % 32)) {
atomicMax(&shared_max, sum);
}
__syncthreads();
if (!threadIdx.x) {
atomicMax(out, shared_max);
}
}
int test_max_reduce(std::vector<int> &v) {
int *in, *out;
cudaMalloc(&in, v.size() * sizeof(int));
cudaMalloc(&out, sizeof(int));
cudaMemcpy(in, v.data(), v.size() * sizeof(int), cudaMemcpyHostToDevice);
cudaMemset(out, 0, sizeof(int));
int threads = 256;
d_max_reduce<<<ceil((float)v.size() / (threads * 4)), threads>>>(in, out, v.size());
int res;
cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(in);
cudaFree(out);
return res;
}
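(A side note, not in the original post: since CUDA 9 the unsynchronized warp shuffles are deprecated; on current toolkits the warp-level loop above would use the _sync variant with a full-warp mask:)
for (int i = 16; i; i >>= 1) {
sum = max(__shfl_down_sync(0xffffffff, sum, i), sum);
}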
So I used one of Nvidia's examples of a strided reduction (which is also a sum reduction), changed it to a max, and got times of about 7 ms. Here is the code for the strided reduction
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>
__global__ void d_max_reduction(const int *in, int *out, size_t N) {
extern __shared__ int s_data[];
size_t tid = threadIdx.x;
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
s_data[tid] = in[i];
else
s_data[tid] = 0;
__syncthreads();
for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s)
s_data[tid] = max(s_data[tid], s_data[tid + s]);
__syncthreads();
}
if (!tid)
atomicMax(out, s_data[0]);
}
int test_max_reduction(std::vector<int> &v) {
int *in;
int *out;
cudaMalloc(&in, v.size() * sizeof(int));
cudaMalloc(&out, sizeof(int));
cudaMemcpy(in, v.data(), v.size() * sizeof(int), cudaMemcpyHostToDevice);
cudaMemset(out, 0, sizeof(int));
int threads = 128;
d_max_reduction<<<ceil((float)v.size() / threads),
threads,
threads * sizeof(int)>>>(in, out, v.size());
int res;
cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(in);
cudaFree(out);
return res;
}
And, just in case, here is the rest of the code so there is an MWE.
#include <random>
#include <timer.hpp>
int test_max_reduce(std::vector<int> &v);
int test_max_reduction(std::vector<int> &v);
int main() {
int N = 2000 * 2000; // * 2000;
std::vector<int> vec(N);
std::random_device dev;
std::mt19937 mt(dev());
std::uniform_int_distribution<int> dist(0, N << 2);
for (size_t i = 0; i < vec.size(); i++) {
vec[i] = dist(mt);
}
measure("GPU (shfl)", test_max_reduce, vec);
measure("GPU strided", test_max_reduction, vec);
measure("CPU",
[](std::vector<int> &vec) -> int {
int maximum = 0;
for (size_t i = 0; i < vec.size(); i++) {
maximum = std::max(maximum, vec[i]);
}
return maximum;
},
vec);
return 0;
}
And timer.hpp is
#ifndef TIMER_HPP
#define TIMER_HPP
#include <chrono>
#include <string>
#include <iostream>
template <typename F, typename ...Args>
void measure(std::string msg, F func, Args&&... args) {
auto start = std::chrono::steady_clock::now();
int val = func(std::forward<Args>(args)...);
auto end = std::chrono::steady_clock::now();
std::cout << msg << " Test " << std::endl;
std::cout << " Max Value : " << val << std::endl;
std::cout << " Time : ";
std::cout << std::chrono::duration_cast<std::chrono::milliseconds>
(end - start).count() << std::endl;
}
#endif // TIMER_HPP
I generally get the following times
GPU (shfl) Test
Max Value : 15999999
Time : 86
GPU strided Test
Max Value : 15999999
Time : 7
CPU Test
Max Value : 15999999
Time : 23
EDIT: new timings after warmup
GPU (shfl) Test
Max Value : 16000000
Time : 4
GPU strided Test
Max Value : 16000000
Time : 6
CPU Test
Max Value : 16000000
Time : 23
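(For context: the exact warmup is not shown in the post; a hypothetical version is simply running each measured path once, untimed, before the benchmark, so CUDA context creation and first-touch costs fall outside the timings:)
// hypothetical warmup, not the asker's exact code: one untimed run of each path
test_max_reduce(vec);
test_max_reduction(vec);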
So my more general question is: why is the shfl version slower than the strided one? It can be divided into:
Am I missing something in the launch parameters, or doing/assuming something wrong?
And when is it recommended to use the shfl intrinsic over a strided loop, and vice versa?