I'm working on a least-squares (LS) method. I had manually implemented a conjugate gradient solver, but after updating my CUDA version I saw that there is a new function (cusolverDnSSgels), which I assume is faster than my manual implementation. My first task was to run it on a test case (see below); according to MATLAB, I'd expect the result to be -6.5, 9.7. Unfortunately, I cannot find what I did wrong, and I cannot find an example either because it is a relatively new function.
The output says that niter = -3, which according to the documentation would suggest too many iterations. However, this would not make sense, as it is a very small matrix which should be easily solvable.
#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "device_launch_parameters.h"
// Test driver from the question: builds a small system with C = 3 rows and
// M = 2 unknowns, calls cusolverDnSSgels on it and prints the returned status
// codes, niter, dinfo and the two solution values.
int main()
//init id, handle and stat
// NOTE(review): id is initialised from the return value of cudaGetDevice while
// also being passed as its output argument; it is not used afterwards.
int id = cudaGetDevice(&id);
cusolverDnHandle_t cusolverH;
cusolverStatus_t stat;
// create handle
stat = cusolverDnCreate(&cusolverH);
const int C = 3;
const int M = 2;
long lda = C;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat, *gXmat;
//allocate mem
Amat = (float*)malloc(M * C * sizeof(float));
Ymat = (float*)malloc(C * sizeof(float));
Xmat = (float*)malloc(M * sizeof(float));
// NOTE(review): no matching #endif is visible in this excerpt; the random
// initialisation below appears to be the disabled alternative to the fixed
// test values that follow it.
#if 0
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
// fixed test data: six entries of A (leading dimension lda = C) and three of y
Amat[0] = 6;
Amat[1] = 7;
Amat[2] = 6;
Amat[3] = 5;
Amat[4] = 5;
Amat[5] = 5;
Ymat[0] = 9;
Ymat[1] = 3;
Ymat[2] = 10;
//allocate mem
cudaMalloc(&gAmat, M * C * sizeof(float));
cudaMalloc(&gYmat, C * sizeof(float));
cudaMalloc(&gXmat, M * 1 * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float *gdwork;
size_t work_bytes;
// query the workspace size the solver wants for this problem
stat = cusolverDnSSgels_bufferSize(cusolverH,C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
std::cout << "Status = " << stat << std::endl;
int niter = 0;
// NOTE(review): dinfo is a host variable here and its address is handed to
// the solver; the answer that follows identifies this as the defect, because
// the documentation lists dinfo as a device-memory output.
int dinfo = 0;
// NOTE(review): work_bytes is scaled by sizeof(float) again although its name
// suggests it is already a byte count -- presumably an over-allocation; confirm.
cudaMalloc(&gdwork, work_bytes * sizeof(float));
stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, &dinfo);
std::cout << "Status = " << stat << std::endl;
std::cout << "niter = " << niter << std::endl;
std::cout << "dinfo = " << dinfo << std::endl;
cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
//free memory
//destroy handle
// NOTE(review): the two comments above are placeholders; no free/destroy
// calls are present in this excerpt.
return 0;
The results I get are:
Status = 0
Status = 0
niter = -3
dinfo = 0
-4.31602e+08, -4.31602e+08
Could someone point out what I am doing wrong?
You have a problem with your dinfo parameter usage. Referring to the documentation, we see that:
Parameters of cusolverDn<T1><T2>gels() functions:
| parameter | Memory | In/out | Meaning |
| --- | --- | --- | --- |
| dinfo | device | output | Status of the IRS solver on the return. If 0 - solve was successful. If dinfo = -i then i-th argument is not valid. |
the dinfo parameter is expected to live in device memory. But you have it in host memory:
int dinfo = 0;
If I move the storage to the proper location, your code outputs the values you indicate as expected:
$ cat t143.cu
#include <iostream>
#include <cublas_v2.h>
#include <cusolverDn.h>
// Solves the 3x2 least-squares test problem  min ||A*x - y||  with
// cusolverDnSSgels and prints the solver status codes, niter, dinfo and x.
//
// A is stored column-major with leading dimension C, i.e. the columns are
// {6, 7, 6} and {5, 5, 5}; the expected solution quoted in the question is
// x = (-6.5, 9.7).
//
// Returns 0 when every CUDA / cuSOLVER call succeeded, 1 otherwise. All
// device buffers and the cuSOLVER handle are released on every path.
int main()
{
    const int C = 3;    // rows of A, length of y
    const int M = 2;    // columns of A, length of x
    const int lda = C;  // leading dimension of the column-major A

    // Fixed test data (host side).
    float Amat[C * M] = { 6, 7, 6, 5, 5, 5 };
    float Ymat[C]     = { 9, 3, 10 };
    float Xmat[M]     = { 0, 0 };

    cusolverDnHandle_t cusolverH = NULL;
    cusolverStatus_t stat = CUSOLVER_STATUS_SUCCESS;
    float *gAmat = NULL, *gYmat = NULL, *gXmat = NULL;
    void *gdwork = NULL;      // solver workspace, sized in bytes
    int *dinfo = NULL;        // solver status word; must live in device memory
    size_t work_bytes = 0;
    int niter = 0;            // host-side output of the solver
    int hinfo = 0;            // host copy of *dinfo

    // Reports a failed runtime call and turns it into a boolean.
    auto cudaOk = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess)
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        return err == cudaSuccess;
    };

    int id = 0;
    bool ok = cudaOk(cudaGetDevice(&id), "cudaGetDevice");

    // create handle
    if (ok) {
        stat = cusolverDnCreate(&cusolverH);
        ok = (stat == CUSOLVER_STATUS_SUCCESS);
        if (!ok)
            std::cerr << "cusolverDnCreate failed: " << stat << std::endl;
    }

    // allocate device memory and upload A and y
    ok = ok && cudaOk(cudaMalloc(&gAmat, sizeof(Amat)), "cudaMalloc(A)")
            && cudaOk(cudaMalloc(&gYmat, sizeof(Ymat)), "cudaMalloc(y)")
            && cudaOk(cudaMalloc(&gXmat, sizeof(Xmat)), "cudaMalloc(x)")
            && cudaOk(cudaMalloc(&dinfo, sizeof(int)), "cudaMalloc(dinfo)")
            && cudaOk(cudaMemcpy(gAmat, Amat, sizeof(Amat), cudaMemcpyHostToDevice), "cudaMemcpy(A)")
            && cudaOk(cudaMemcpy(gYmat, Ymat, sizeof(Ymat), cudaMemcpyHostToDevice), "cudaMemcpy(y)");

    // ask the solver how much workspace it needs
    if (ok) {
        stat = cusolverDnSSgels_bufferSize(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
        std::cout << "Status = " << stat << std::endl;
        ok = (stat == CUSOLVER_STATUS_SUCCESS);
    }

    // work_bytes is already a byte count, so it is not scaled by sizeof(float)
    ok = ok && cudaOk(cudaMalloc(&gdwork, work_bytes), "cudaMalloc(workspace)");

    if (ok) {
        stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, dinfo);
        std::cout << "Status = " << stat << std::endl;
        ok = (stat == CUSOLVER_STATUS_SUCCESS);
    }

    // fetch the device-side status word and the solution
    ok = ok && cudaOk(cudaMemcpy(&hinfo, dinfo, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(dinfo)")
            && cudaOk(cudaMemcpy(Xmat, gXmat, sizeof(Xmat), cudaMemcpyDeviceToHost), "cudaMemcpy(x)");

    if (ok) {
        // NOTE(review): a negative niter was observed together with a correct
        // solution in the sample run; see the cuSOLVER docs for its encoding.
        std::cout << "niter = " << niter << std::endl;
        std::cout << "dinfo = " << hinfo << std::endl;
        //Output printed
        std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
    }

    // free memory (cudaFree accepts null pointers)
    cudaFree(gdwork);
    cudaFree(dinfo);
    cudaFree(gXmat);
    cudaFree(gYmat);
    cudaFree(gAmat);
    // destroy handle
    if (cusolverH != NULL)
        cusolverDnDestroy(cusolverH);
    return ok ? 0 : 1;
}
$ nvcc -o t143 t143.cu -lcublas -lcusolver
$ cuda-memcheck ./t143
Status = 0
Status = 0
niter = -51
dinfo = 0
-6.5, 9.7
========= ERROR SUMMARY: 0 errors
I am using CUDA 11.3 for the above. If you are using an earlier version, I strongly recommend you move forward to CUDA 11.3 or newer for usage of this function.
You can get a hint as to the problem by running your code with cuda-memcheck
It was fairly quick to spot the problem by reviewing your parameter usage with the table of parameter locations (host/device) given in the documentation. You had a problem here which was similar in that you could focus in on the problem by reviewing your parameter locations (host/device) against the table given in the documentation. This may be a good thing to check to save yourself time in the future.
I'm trying to understand how numpy can be so fast, based on my shocking comparison with optimized C/C++ code which is still far from reproducing numpy's speed.
Consider the following example:
Given a 2D array with shape=(N, N) and dtype=float32, which represents a list of N vectors of N dimensions, I am computing the pairwise differences between every pair of vectors. Using numpy broadcasting, this simply writes as:
# Pairwise row differences via broadcasting. For X of shape (N, N),
# X[:, None, :] has shape (N, 1, N), so the result has shape (N, N, N) with
# result[i, j, k] = X[j, k] - X[i, k].
def pairwise_sub_numpy( X ):
return X - X[:, None, :]
Using timeit I can measure the performance for N=512: it takes 88 ms per call on my laptop.
Now, in C/C++ a naive implementation writes as:
// Row-major accessors: X(i, j) reads element (i, j) of the N x N input and
// res(i, j, k) addresses element (i, j, k) of the N x N x N result. Both
// expand in terms of the local names _X / _res and N.
#define X(i, j) _X[(i)*N + (j)]
#define res(i, j, k) _res[((i)*N + (j))*N + (k)]
// Baseline version: three nested loops storing res(i,j,k) = X(i,k) - X(j,k).
// Allocates the 32-byte-aligned result inside the function and returns it;
// releasing it is presumably the caller's job.
float* pairwise_sub_naive( const float* _X, int N )
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++)
res(i,j,k) = X(i,k) - X(j,k);
return _res;
Compiling using gcc 7.3.0 with -O3 flag, I get 195 ms per call for pairwise_sub_naive(X), which is not too bad given the simplicity of the code, but about 2 times slower than numpy.
Now I start getting serious and add some small optimizations, by indexing the row vectors directly:
// Same computation as the naive version, but the row pointers xi, xj and the
// output row pointer r are taken once per (i, j) pair so the inner loop only
// indexes by k.
float* pairwise_sub_better( const float* _X, int N )
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
const float* xi = & X(i,0);
for (int j = 0; j < N; j++) {
const float* xj = & X(j,0);
float* r = &res(i,j,0);
for (int k = 0; k < N; k++)
r[k] = xi[k] - xj[k];
return _res;
The speed stays the same at 195 ms, which means that the compiler was able to figure that much. Let's now use SIMD vector instructions:
// AVX attempt: rows i and j are first copied into the aligned scratch buffers
// xi / xj, then subtracted with 256-bit loads and stores.
// NOTE(review): k advances by 256/sizeof(float) elements per iteration while
// each _mm256 load/store covers one 256-bit register; the answer further down
// discusses this stride.
// NOTE(review): xi and xj are not released anywhere in this excerpt.
float* pairwise_sub_simd( const float* _X, int N )
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
return _res;
This only yields a small boost (178 ms instead of 194 ms per function call).
Then I was wondering if a "block-wise" approach, like what is used to optimize dot-products, could be beneficial:
// Blocked version: the (i, j, k) index space is walked in B x B x B tiles.
// For each tile, a B x B patch of rows bi.. and a B x B patch of rows bj..
// (columns bk..) are copied into two local arrays, and the differences are
// computed from those copies.
// NOTE(review): the loops step by B without a remainder pass, so N is
// presumably expected to be a multiple of B -- confirm.
float* pairwise_sub_blocks( const float* _X, int N )
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
#define B 8
float cache1[B*B], cache2[B*B];
for (int bi = 0; bi < N; bi+=B)
for (int bj = 0; bj < N; bj+=B)
for (int bk = 0; bk < N; bk+=B) {
// load first 8x8 block in the cache
for (int i = 0; i < B; i++)
for (int k = 0; k < B; k++)
cache1[B*i + k] = X(bi+i, bk+k);
// load second 8x8 block in the cache
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
cache2[B*j + k] = X(bj+j, bk+k);
// compute local operations on the caches
for (int i = 0; i < B; i++)
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
return _res;
And surprisingly, this is the slowest method so far (258 ms per function call).
To summarize, despite some effort with optimized C++ code, I can't come anywhere close to the 88 ms / call that numpy achieves effortlessly. Any idea why?
Note: By the way, I am disabling numpy multi-threading and anyway, this kind of operation is not multi-threaded.
Edit: Exact code to benchmark the numpy code:
import numpy as np
# Broadcasting-based pairwise difference of the rows of X.
def pairwise_sub_numpy( X ):
return X - X[:, None, :]
# Benchmark input: an N x N float32 matrix of uniform random values.
N = 512
X = np.random.rand(N,N).astype(np.float32)
import timeit
# Five independent single-call measurements; the fastest one is reported in ms.
times = timeit.repeat('pairwise_sub_numpy( X )', globals=globals(), number=1, repeat=5)
print(f">> best of 5 = {1000*min(times):.3f} ms")
Full benchmark for C code:
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <time.h>
#define X(i, j) _x[(i)*N + (j)]
#define res(i, j, k) _res[((i)*N + (j))*N + (k)]
/* Reference implementation: for every ordered pair of rows (a, b) of the
 * row-major N x N input, stores "row a minus row b". The N*N*N differences
 * are written back to back, pair (a, b) occupying slot a*N + b, into a newly
 * allocated 32-byte-aligned buffer that is returned to the caller. */
float* pairwise_sub_naive( const float* _x, int N )
{
    float* diffs = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
    float* dst = diffs;
    for (int a = 0; a < N; a++) {
        const float* row_a = _x + a*N;
        for (int b = 0; b < N; b++) {
            const float* row_b = _x + b*N;
            for (int c = 0; c < N; c++)
                *dst++ = row_a[c] - row_b[c];
        }
    }
    return diffs;
}
/* Row-pointer variant of the pairwise difference: every (left, right) row
 * pair is visited once, in the order left-major / right-minor, and the N
 * element differences left[c] - right[c] are written to that pair's output
 * row. Returns a newly allocated 32-byte-aligned N*N*N buffer. */
float* pairwise_sub_better( const float* _x, int N )
{
    float* out = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
    for (int pair = 0; pair < N*N; pair++) {
        const float* left  = _x + (pair / N) * N;
        const float* right = _x + (pair % N) * N;
        float* dst = out + pair * N;
        for (int c = 0; c < N; c++)
            dst[c] = left[c] - right[c];
    }
    return out;
}
// AVX variant used by the benchmark: rows i and j are copied into the aligned
// scratch buffers xi / xj and subtracted with 256-bit loads and stores.
// NOTE(review): k advances by 256/sizeof(float) elements per iteration while
// each _mm256 load/store covers one 256-bit register; the answer that follows
// the benchmark discusses this stride.
// NOTE(review): xi and xj are not released before returning.
float* pairwise_sub_simd( const float* _x, int N )
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
return _res;
/* Tiled variant of the pairwise difference. The (i, j, k) index space is
 * walked in B x B x B tiles; for each tile the two B x B input patches
 * (rows bi.. and rows bj.., columns bk..) are staged in local arrays and the
 * differences are produced from those copies. The loops step by B with no
 * remainder pass. Returns a newly allocated 32-byte-aligned N*N*N buffer. */
float* pairwise_sub_blocks( const float* _x, int N )
{
    float* out = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
    #define B 8
    float tile_i[B*B], tile_j[B*B];
    for (int bi = 0; bi < N; bi+=B)
    for (int bj = 0; bj < N; bj+=B)
    for (int bk = 0; bk < N; bk+=B) {
        /* stage one row segment of each patch per iteration */
        for (int r = 0; r < B; r++) {
            memcpy(tile_i + B*r, _x + (bi+r)*N + bk, B*sizeof(float));
            memcpy(tile_j + B*r, _x + (bj+r)*N + bk, B*sizeof(float));
        }
        /* subtract every staged row of patch j from every staged row of patch i */
        for (int i = 0; i < B; i++) {
            const float* src_i = tile_i + B*i;
            for (int j = 0; j < B; j++) {
                const float* src_j = tile_j + B*j;
                float* dst = out + ((bi+i)*N + (bj+j))*N + bk;
                for (int k = 0; k < B; k++)
                    dst[k] = src_i[k] - src_j[k];
            }
        }
    }
    return out;
}
int main()
const int N = 512;
float* _x = (float*) malloc( N * N * sizeof(float) );
for( int i = 0; i < N; i++)
for( int j = 0; j < N; j++)
X(i,j) = ((i+j*j+17*i+101) % N) / float(N);
double best = 9e9;
for( int i = 0; i < 5; i++)
struct timespec start, stop;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
//float* res = pairwise_sub_naive( _x, N );
//float* res = pairwise_sub_better( _x, N );
//float* res = pairwise_sub_simd( _x, N );
float* res = pairwise_sub_blocks( _x, N );
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop);
double t = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3; // in microseconds
if (t < best) best = t;
free( res );
printf("Best of 5 = %f ms\n", best / 1000);
free( _x );
return 0;
Compiled using gcc 7.3.0 gcc -Wall -O3 -mavx -msse4.1 -o test_simd test_simd.c
Summary of timings on my machine:
| Implementation | Timing |
| --- | --- |
| numpy | 88 ms |
| C++ naive | 194 ms |
| C++ better | 195 ms |
| C++ SIMD | 178 ms |
| C++ blocked | 258 ms |
| C++ blocked (gcc 8.3.1) | 217 ms |
As pointed out in some of the comments, numpy uses SIMD in its implementation and does not allocate memory at the point of computation. If I eliminate the memory allocation from your implementation, pre-allocating all the buffers ahead of the computation, then I get a better time than numpy even with the scalar version (that is, the one without any optimizations).
Also in terms of SIMD and why your implementation does not perform much better than the scaler is because your memory access patterns are not ideal for SIMD usage - you do memcopy and you load into SIMD registers from locations that are far apart from each other - e.g. you fill vectors from line 0 and line 511, which might not play well with the cache or with the SIMD prefetcher.
There is also a mistake in how you load the SIMD registers(if I understood correctly what you're trying to compute): a 256 bit SIMD register can load 8 single-precision floating-point numbers 8 * 32 = 256, but in your loop you jump k by "256/sizeof(float)" which is 256/4 = 64; _x and _res are float pointers and the SIMD intrinsics expect also float pointers as arguments so instead of reading all elements from those lines every 8 floats you read them every 64 floats.
The computation can be optimized further by changing the access patterns but also by observing that you repeat some computations: e.g. when iterating with line0 as a base you compute line0 - line1 but at some future time, when iterating with line1 as a base, you need to compute line1 - line0 which is basically -(line0 - line1), that is for each line after line0 a lot of results could be reused from previous computations.
A lot of times SIMD usage or parallelization requires one to change how data is accessed or reasoned about in order to provide meaningful improvements.
Here is what I have done as a first step based on your initial implementation and it is faster than the numpy(don't mind the OpenMP stuff as it's not how its supposed to be done, I just wanted to see how it behaves trying the naive way).
Time scaler version: 55 ms
Time SIMD version: 53 ms
**Time SIMD 2 version: 33 ms**
Time SIMD 3 version: 168 ms
Time OpenMP version: 59 ms
Python numpy
>> best of 5 = 88.794 ms
#include <cstdlib>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <numeric>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <cstring>
using namespace std;
// Scalar reference: for every ordered pair of rows (a, b) of the row-major
// n x n `input`, writes the n element-wise differences row_a - row_b into
// `output`, pair (a, b) occupying the n floats starting at (a*n + b)*n.
// `output` must hold n*n*n floats; it is returned unchanged.
float* pairwise_sub_naive (const float* input, float* output, int n)
{
    float* dst = output;
    for (int a = 0; a < n; ++a) {
        const float* lhs = input + a * n;
        for (int b = 0; b < n; ++b) {
            const float* rhs = input + b * n;
            for (int c = 0; c < n; ++c)
                *dst++ = lhs[c] - rhs[c];
        }
    }
    return output;
}
float* pairwise_sub_simd (const float* input, float* output, int n)
for (int i = 0; i < n; i++)
const int idxi = i * n;
for (int j = 0; j < n; j++)
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx * n + k, _mm256_sub_ps( A, B ));
return output;
float* pairwise_sub_simd_2 (const float* input, float* output, int n)
float* line_buffer = (float*) aligned_alloc(32, n * sizeof(float));
for (int i = 0; i < n; i++)
const int idxi = i * n;
for (int j = 0; j < n; j++)
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(line_buffer + k, _mm256_sub_ps( A, B ));
memcpy(output + outidx * n, line_buffer, n);
return output;
float* pairwise_sub_simd_3 (const float* input, float* output, int n)
for (int i = 0; i < n; i++)
const int idxi = i * n;
for (int k = 0; k < n; k += 8)
__m256 A = _mm256_load_ps(input + idxi + k);
for (int j = 0; j < n; j++)
const int idxj = j * n;
const int outidx = (idxi + j) * n;
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx + k, _mm256_sub_ps( A, B ));
return output;
float* pairwise_sub_openmp (const float* input, float* output, int n)
int i, j;
#pragma omp parallel for private(j)
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
const int idxi = i * n;
const int idxj = j * n;
const int outidx = idxi + j;
for (int k = 0; k < n; k += 8)
__m256 A = _mm256_load_ps(input + idxi + k);
__m256 B = _mm256_load_ps(input + idxj + k);
_mm256_store_ps(output + outidx * n + k, _mm256_sub_ps( A, B ));
/*for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
for (int k = 0; k < n; k++)
output[(i * n + j) * n + k] = input[i * n + k] - input[j * n + k];
return output;
int main ()
constexpr size_t n = 512;
constexpr size_t input_size = n * n;
constexpr size_t output_size = n * n * n;
float* input = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output = (float*) aligned_alloc(32, output_size * sizeof(float));
float* input_simd = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output_simd = (float*) aligned_alloc(32, output_size * sizeof(float));
float* input_par = (float*) aligned_alloc(32, input_size * sizeof(float));
float* output_par = (float*) aligned_alloc(32, output_size * sizeof(float));
iota(input, input + input_size, float(0.0));
fill(output, output + output_size, float(0.0));
iota(input_simd, input_simd + input_size, float(0.0));
fill(output_simd, output_simd + output_size, float(0.0));
iota(input_par, input_par + input_size, float(0.0));
fill(output_par, output_par + output_size, float(0.0));
std::chrono::milliseconds best_scaler{100000};
for (int i = 0; i < 5; ++i)
auto start = chrono::high_resolution_clock::now();
pairwise_sub_naive(input, output, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_scaler)
best_scaler = duration;
cout << "Time scaler version: " << best_scaler.count() << " ms\n";
std::chrono::milliseconds best_simd{100000};
for (int i = 0; i < 5; ++i)
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd)
best_simd = duration;
cout << "Time SIMD version: " << best_simd.count() << " ms\n";
std::chrono::milliseconds best_simd_2{100000};
for (int i = 0; i < 5; ++i)
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd_2(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd_2)
best_simd_2 = duration;
cout << "Time SIMD 2 version: " << best_simd_2.count() << " ms\n";
std::chrono::milliseconds best_simd_3{100000};
for (int i = 0; i < 5; ++i)
auto start = chrono::high_resolution_clock::now();
pairwise_sub_simd_3(input_simd, output_simd, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_simd_3)
best_simd_3 = duration;
cout << "Time SIMD 3 version: " << best_simd_3.count() << " ms\n";
std::chrono::milliseconds best_par{100000};
for (int i = 0; i < 5; ++i)
auto start = chrono::high_resolution_clock::now();
pairwise_sub_openmp(input_par, output_par, n);
auto stop = chrono::high_resolution_clock::now();
auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
if (duration < best_par)
best_par = duration;
cout << "Time OpenMP version: " << best_par.count() << " ms\n";
cout << "Verification\n";
if (equal(output, output + output_size, output_simd))
cout << "PASSED\n";
cout << "FAILED\n";
return 0;
Edit: Small correction as there was a wrong call related to the second version of SIMD implementation.
As you can see now, the second implementation is the fastest as it behaves the best from the point of view of the locality of reference of the cache. Examples 2 and 3 of SIMD implementations are there to illustrate for you how changing memory access patterns to influence the performance of your SIMD optimizations.
To summarize(knowing that I'm far from being complete in my advice) be mindful of your memory access patterns and of the loads and stores to\from the SIMD unit; the SIMD is a different hardware unit inside the processor's core so there is a penalty in shuffling data back and forth, hence when you load a register from memory try to do as many operations as possible with that data and do not be too eager to store it back(of course, in your example that might be all you need to do with the data). Be mindful also that there is a limited number of SIMD registers available and if you load too many then they will "spill", that is they will be stored back to temporary locations in main memory behind the scenes killing all your gains. SIMD optimization, it's a true balance act!
There is some effort to put a cross-platform intrinsics wrapper into the standard(I developed myself a closed source one in my glorious past) and even it's far from being complete, it's worth taking a look at(read the accompanying papers if you're truly interested to learn how SIMD works).
This is a complement to the answer posted by @celavek.
I think I finally got to understand what exactly was the issue. The issue was not about allocating the memory in the main function that does the computation.
What was actually taking time is to access new (fresh) memory. I believe that the malloc call returns pages of memory which are virtual, i.e. that does not corresponds to actual physical memory -- until it is explicitly accessed. What actually takes time is the process of allocating physical memory on the fly (which I think is OS-level) when it is accessed in the function code.
Here is a proof. Consider the two following trivial functions:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Requests a 32-byte-aligned block large enough for N floats and returns it
 * without touching its contents. */
float* just_alloc( size_t N )
{
    const size_t bytes = sizeof(float)*N;
    void* raw = aligned_alloc( 32, bytes );
    return (float*) raw;
}
/* Writes the value 1 into each of the first N floats of _arr. */
void just_fill( float* _arr, size_t N )
{
    float* const end = _arr + N;
    for (float* cursor = _arr; cursor != end; ++cursor)
        *cursor = 1;
}
/* Time(code_to_benchmark, cleanup_code)
 * Runs code_to_benchmark five times. Each run is bracketed by
 * clock_gettime(CLOCK_THREAD_CPUTIME_ID) calls and its duration is converted
 * to milliseconds and printed; cleanup_code executes after every run, in the
 * same scope as code_to_benchmark, so it can refer to variables declared
 * there. Finally the fastest run is printed together with the stringified
 * text of code_to_benchmark. Wrapped in do { } while(0) so the macro behaves
 * as a single statement. */
#define Time( code_to_benchmark, cleanup_code ) \
do { \
double best = 9e9; \
for( int i = 0; i < 5; i++) { \
struct timespec start, stop; \
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); \
code_to_benchmark; \
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop); \
double t = (stop.tv_sec - start.tv_sec) * 1e3 + (stop.tv_nsec - start.tv_nsec) / 1e6; \
printf("Time[%d] = %f ms\n", i, t); \
if (t < best) best = t; \
cleanup_code; \
} \
printf("Best of 5 for '" #code_to_benchmark "' = %f ms\n\n", best); \
} while(0)
int main()
const size_t N = 512;
Time( float* arr = just_alloc(N*N*N), free(arr) );
float* arr = just_alloc(N*N*N);
Time( just_fill(arr, N*N*N), ; );
return 0;
I get the following timings, which I now detail for each of the calls:
Time[0] = 0.000931 ms
Time[1] = 0.000540 ms
Time[2] = 0.000523 ms
Time[3] = 0.000524 ms
Time[4] = 0.000521 ms
Best of 5 for 'float* arr = just_alloc(N*N*N)' = 0.000521 ms
Time[0] = 189.822237 ms
Time[1] = 45.041083 ms
Time[2] = 46.331428 ms
Time[3] = 44.729433 ms
Time[4] = 42.241279 ms
Best of 5 for 'just_fill(arr, N*N*N)' = 42.241279 ms
As you can see, allocating memory is blazingly fast, but the first time that the memory is accessed, it is 5 times slower than the other times. So, basically, the reason that my code was slow was that I was reallocating fresh memory each time, which had no physical address yet. (Correct me if I'm wrong, but I think that's the gist of it!)
A bit late to the party, but I wanted to add a pairwise method with Eigen, which is supposed to give C++ a high-level algebra manipulation capability and use SIMD under the hood. Just like numpy.
Here is the implementation
#include <iostream>
#include <vector>
#include <chrono>
#include <algorithm>
#include <Eigen/Dense>
// For every column index k of `input`, stores in output[k] the matrix obtained
// by subtracting column k from each column of `input` (column k is replicated
// across all columns via a product with a row of ones).
// `output` must already contain at least input.cols() entries.
auto pairwise_eigen(const Eigen::MatrixXf &input, std::vector<Eigen::MatrixXf> &output) {
    // the replication row is the same for every k, so build it once
    const Eigen::RowVectorXf ones = Eigen::RowVectorXf::Ones(input.cols());
    for (int k = 0; k < input.cols(); ++k) {
        output[k] = input - input.col(k) * ones;
    }
}
// Benchmark driver for the Eigen variant: builds a random 512 x 512 input,
// runs pairwise_eigen five times into a pre-sized vector of matrices and
// prints the fastest run in milliseconds.
int main() {
    constexpr size_t n = 512;
    using timer = std::chrono::high_resolution_clock;
    using millis = std::chrono::milliseconds;

    // allocate input and output
    Eigen::MatrixXf input = Eigen::MatrixXf::Random(n, n);
    std::vector<Eigen::MatrixXf> output(n);

    millis best_eigen{100000};
    int remaining = 5;
    while (remaining-- > 0) {
        const auto begin = timer::now();
        pairwise_eigen(input, output);
        const auto finish = timer::now();
        const auto elapsed = std::chrono::duration_cast<millis>(finish - begin);
        if (elapsed < best_eigen) {
            best_eigen = elapsed;
        }
    }
    std::cout << "Time Eigen version: " << best_eigen.count() << " ms\n";
    return 0;
}
The full benchmark tests suggested by #celavek on my system are
Time scaler version: 57 ms
Time SIMD version: 58 ms
Time SIMD 2 version: 40 ms
Time SIMD 3 version: 58 ms
Time OpenMP version: 58 ms
Time Eigen version: 76 ms
Numpy >> best of 5 = 118.489 ms
With Eigen there is still a noticeable improvement with respect to Numpy, but it is not as impressive as the "raw" implementations (there is certainly some overhead).
An extra optimization is to allocate the output vector with copies of the input and then subtract directly from each vector entry, simply replacing the following lines
// inside the pairwise method
for (int k = 0; k < input.cols(); ++k)
output[k] -= input.col(k) * Eigen::RowVectorXf::Ones(input.cols());
// at allocation time
std::vector<Eigen::MatrixXf> output(n, input);
This pushes the best of 5 down to 60 ms.
I'm using VS2019 and have an NVIDIA GeForce GPU. I tried the code from this link:
However, I want to try using cudaMalloc instead of using managed memory with cudaMallocManaged
I tried the code below:
// Element-wise y[i] = x[i] + y[i] for i in [0, n).
// Launch with a 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total number of threads in the grid (a
// grid-stride loop), so any launch configuration covers all n elements.
// x and y must be device-accessible pointers to at least n floats.
// The __global__ qualifier is required because the function is launched with
// the <<<...>>> syntax.
__global__ void add(int n, float* x, float* y)
{
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}
// Question's driver: allocates x and y with cudaMalloc, tries to initialise
// them with cudaMemset, launches add once while timing the launch, and then
// computes the maximum deviation from 3.0 on the host.
int main()
int N = 1 << 20;
float* x, * y;
cudaMalloc(&x, N * sizeof(float));
cudaMalloc(&y, N * sizeof(float));
// NOTE(review): per the answer below, cudaMemset sets bytes rather than
// float values, so these two calls do not produce arrays of 1.0 / 2.0.
cudaMemset(x,1.0, N * sizeof(float)); //want to set x as an array of 1.0s
cudaMemset(y,2.0, N * sizeof(float)); //want to set y as an array of 2.0s
int device = -1;
int blockSize = 1024;
int numBlocks = (N + blockSize - 1) / blockSize;
auto t1 = std::chrono::high_resolution_clock::now();
add << <numBlocks, blockSize >> > (N, x, y);
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
float maxError = 0.0f;
// NOTE(review): y was returned by cudaMalloc and is dereferenced on the host
// here; the answer below identifies this as the cause of the exception.
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i] - 3.0f));
std::cout << "Max error: " << maxError << std::endl;
std::cout << "duration CUDA: "<<duration;
return 0;
But I'm getting an unhandled exception error at maxError = fmax(maxError, fabs(y[i] - 3.0f));, I'm guessing because I didn't use cudaMemset correctly? How should I modify it?
In no particular order:
Device memory (i.e. memory allocated with cudaMalloc) can't be accessed directly on the host, so your maxError calculations are illegal because y is a pointer in device memory. To perform the error check, you require a copy of y to a local host copy of the memory before running the loop
cudaMemset sets bytes, not words (just like regular memset). You either need to set values on the host and copy them to the device, or in another kernel, or use something like thrust::fill_n.
In the spirit of your previous question, there is typically setup latency in the first call of a kernel, so perform a warm-up before timing
Doing these three things gets me this:
// Device-memory version of the add benchmark.
// Initialises x = 1 and y = 2 on the host, uploads both, launches add ten
// times (the first launch doubles as warm-up) and prints the duration of each
// launch in microseconds, then copies y back and reports the maximum error.
// After `reps` launches every element of y should equal 2 + reps * 1.
//
// Changes over the previous revision:
//  * the stop timestamp is taken after cudaDeviceSynchronize(), because a
//    kernel launch returns to the host before the kernel has finished;
//  * every runtime call and the launch itself are checked;
//  * both device buffers are freed.
// Returns 0 on success, 1 if any CUDA call failed.
int main()
{
    const int N = 1 << 20;
    const int reps = 10;
    const size_t bytes = N * sizeof(float);

    std::vector<float> xh(N, 1.0f);
    std::vector<float> yh(N, 2.0f);

    // Reports a failed runtime call and turns it into a boolean.
    auto cudaOk = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess)
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        return err == cudaSuccess;
    };

    float *x = nullptr, *y = nullptr;
    bool ok = cudaOk(cudaMalloc(&x, bytes), "cudaMalloc(x)")
           && cudaOk(cudaMemcpy(x, &xh[0], bytes, cudaMemcpyHostToDevice), "cudaMemcpy(x)")
           && cudaOk(cudaMalloc(&y, bytes), "cudaMalloc(y)")
           && cudaOk(cudaMemcpy(y, &yh[0], bytes, cudaMemcpyHostToDevice), "cudaMemcpy(y)");

    // let the runtime suggest a launch configuration; add's stride loop makes
    // any grid size valid
    int blockSize = 0, numBlocks = 0;
    ok = ok && cudaOk(cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, add),
                      "cudaOccupancyMaxPotentialBlockSize");

    for (int rep = 0; ok && rep < reps; rep++) {
        auto t1 = std::chrono::high_resolution_clock::now();
        add << <numBlocks, blockSize >> > (N, x, y);
        // wait for the kernel so the interval covers its execution, not just
        // the launch call
        ok = cudaOk(cudaGetLastError(), "add launch")
          && cudaOk(cudaDeviceSynchronize(), "add execution");
        auto t2 = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
        std::cout << rep << " duration CUDA: " << duration << std::endl;
    }

    ok = ok && cudaOk(cudaMemcpy(&yh[0], y, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");

    if (ok) {
        const float expected = 2.0f + reps * 1.0f;  // y accumulates x once per launch
        float maxError = 0.0f;
        for (int i = 0; i < N; i++)
            maxError = fmax(maxError, fabs(yh[i] - expected));
        std::cout << "Max error: " << maxError << std::endl;
    }

    cudaFree(y);
    cudaFree(x);
    return ok ? 0 : 1;
}
And compiling it and running it gets me this:
$ nvcc -arch=sm_52 -std=c++11 -o devmem
$ ./devmem
0 duration CUDA: 155
1 duration CUDA: 94
2 duration CUDA: 95
3 duration CUDA: 94
4 duration CUDA: 94
5 duration CUDA: 93
6 duration CUDA: 93
7 duration CUDA: 99
8 duration CUDA: 92
9 duration CUDA: 93
Max error: 0
Compared to the timings in my last answer to you, you can see that using device memory provides speedup over managed memory on my system. As always, your results might vary.
I am fairly new to CUDA, and I am trying to offload to the GPU some cumbersome computations I am doing for a performance-critical project. On my computer I have two NVS 510 Graphic cards, but I am currently experimenting with one only.
I have some big column-major matrix (1000-5000 rows x 1-5 M columns) to be filled. I was so far able to write the code to fill the matrix like it were an array, and it works well for matrices of relatively small size.
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, [other params],
int n_rows, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_rows;
int row = index % n_rows;
if (row > n_sim || column > num_cols) return;
d_matrix[index] = …something(row, column,[other params]);
The kernel is called:
fl_type *res;
cudaMalloc((void**)&res, n_columns*n_rows*fl_size);
int block_size = 1024;
int num_blocks = (n_rows* n_columns + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (res,[other params], n_rows,n_columns);
and everything works just fine.
If I change the kernel to work with 2D threads:
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, [other params],
int n_rows, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column* n_rows + row;
if (row > n_rows || column > num_cols) return;
d_matrix[index] = …something(row, column,[other params]);
and I invoke it
int block_size2 = 32; //each block will have block_size2*block_size2 threads
dim3 num_blocks2(block_size2, block_size2);
int x_grid = (n_columns + block_size2 - 1) / block_size2;
int y_grid = (n_rows + block_size2 - 1) / block_size2;
dim3 grid_size2(x_grid, y_grid);
interp_kernel2D <<< grid_size2, num_blocks2 >>> (res,[other params], n_rows,n_columns);
the results are all zero and CUDA returns unknown error. What am I missing? the actual code, which compiles without error with VS2015 and CUDA 8.0, can be found here:
Here is the code from the pastebin link:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
typedef float fl_type;
typedef int pos_type;
typedef std::chrono::milliseconds ms;
//declaration of the cuda function
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim);
// CPU reference for one cell of the result: accumulates, over inter_point in [0, interp_dim),
// weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim].
// NOTE(review): the closing braces of the loop and of the function are absent from this listing.
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, int& row, int& column, int& interp_dim, int& n_sim) {
// offset of this column's first entry in weights / node_map
int w_p = column*interp_dim;
// first term initialises the accumulator; the loop adds terms 1..interp_dim-1
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_point] * n_sim + row];
return res;
// 1D kernel: each thread handles one element of d_matrix; the flat thread index is
// split into (row, column) with column = index / n_sim and row = index % n_sim.
// NOTE(review): the closing braces of the loop and of the kernel are absent from this listing.
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int column = index / n_sim;
int row = index % n_sim;
// offset of this column's first entry in weights / node_map
int w_p = column*interp_dim;
// NOTE(review): off-by-one. Valid indices are 0..n_sim-1 and 0..num_cols-1, so the test
// must be `>=`; with `>` a thread whose column == num_cols is let through.
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
d_matrix[index] = res;
// 2D kernel: the x dimension of the launch selects the column, the y dimension the row;
// the element written is d_matrix[column*n_sim + row].
// NOTE(review): the closing braces of the loop and of the kernel are absent from this listing.
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
int column = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = column*n_sim + row;
// offset of this column's first entry in weights / node_map
int w_p = column*interp_dim;
// NOTE(review): off-by-one. Valid indices are 0..n_sim-1 and 0..num_cols-1, so the test
// must be `>=`; with `>` threads with row == n_sim or column == num_cols are let through.
if (row > n_sim || column > num_cols) return;
fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
d_matrix[index] = res;
// Compares two arrays of `size` elements with exact equality and prints a summary.
// NOTE(review): no statement in this listing increments count or count_zero, and the
// branch separating the two summaries is not visible — presumably lost lines; confirm
// against the original program.
void verify(fl_type *host, fl_type *device, int size) {
int count = 0;
int count_zero = 0;
for (int i = 0; i < size; i++) {
if (host[i] != device[i]) {
//std::cout <<"pos: " <<i<< " CPU:" <<h[i] << ", GPU: " << d[i] <<std::endl;
// this assert sits inside the mismatch branch, so when asserts are enabled it fires on the first mismatch
assert(host[i] == device[i]);
if (device[i] == 0.0)
if (count) {
std::cout << "Non matching: " << count << "out of " << size << "(" << (float(count) / size * 100) << "%)" << std::endl;
std::cout << "Zeros returned from the device: " << count_zero <<"(" << (float(count_zero) / size * 100) << "%)" << std::endl;
std::cout << "Perfect match!" << std::endl;
// Benchmark driver: builds random inputs, computes the result matrix on the CPU with
// iterp_cpu, computes it again through cuda_interpolation_function, prints both timings
// and compares the two results with verify.
// NOTE(review): closing braces are absent from this listing.
int main() {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
int dim = 5; // range: 2-5
int number_nodes = 5500; // range: 10.000-500.000
int max_actions = 12; // range: 6-200
int n_sim = 1000; // range: 1.000-10.000
// number of interpolation points per column (2^dim)
int interp_dim = std::pow(2, dim);
int grid_values_size = n_sim*number_nodes;
std::default_random_engine generator;
std::normal_distribution<fl_type> normal_dist(0.0, 1);
std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);
// NOTE(review): despite the name, the values added below are byte counts (element count * sizeof)
double bit_allocated = 0;
fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
grid_values = (fl_type *)malloc(grid_values_size * fl_size);
bit_allocated += grid_values_size * fl_size;
for (int i = 0; i < grid_values_size; i++)
grid_values[i] = normal_dist(generator);
pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix regarding that done
pos_type * map_node2values_how_many; //vector that stores how many action we have per node
map_node2values_start = (pos_type *)malloc(number_nodes * pos_size);
map_node2values_how_many = (pos_type *)malloc(number_nodes * pos_size);
bit_allocated += 2 * (number_nodes * pos_size);
for (int i = 0; i < number_nodes; i++) {
//each node as simply max_actions
map_node2values_start[i] = max_actions*i;
map_node2values_how_many[i] = max_actions;
//total number of actions, which is amount of column of the results
int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];
//vector that keep tracks of the columnt to grab, and their weight in the interpolation
fl_type* weights;
pos_type * node_map;
// NOTE(review): weights holds fl_type elements but is sized with pos_size; this is only
// correct while sizeof(fl_type) == sizeof(pos_type)
weights = (fl_type *)malloc(total_action_number*interp_dim * pos_size);
// NOTE(review): the two accounting lines below omit the interp_dim factor used in the mallocs
bit_allocated += total_action_number * fl_size;
node_map = (pos_type *)malloc(total_action_number*interp_dim * pos_size);
bit_allocated += total_action_number * pos_size;
//filling with random numbers
for (int i = 0; i < total_action_number*interp_dim; i++) {
node_map[i] = uniform_dist(generator); // picking random column
weights[i] = 1.0 / interp_dim; // uniform weights
std::cout << "done filling!" << std::endl;
// NOTE(review): divides a byte count by 8 before converting to MB
std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;
int result_size = n_sim*total_action_number;
fl_type *interp_value_cpu;
bit_allocated += result_size* fl_size;
interp_value_cpu = (fl_type *)malloc(result_size* fl_size);
// CPU reference run; the result is stored column-major (column*n_sim + row)
auto start = std::chrono::steady_clock::now();
for (int row = 0; row < n_sim; row++) {
for (int column = 0; column < total_action_number; column++) {
auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
interp_value_cpu[column*n_sim + row] = zz;
auto elapsed_cpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;
int * pp;
cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark
fl_type *interp_value_gpu;
interp_value_gpu = (fl_type *)malloc(result_size* fl_size);
// GPU run; the timed interval covers everything done inside cuda_interpolation_function
start = std::chrono::steady_clock::now();
cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
auto elapsed_gpu = std::chrono::steady_clock::now() - start;
std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;
float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
// the serial CPU time is divided by n_proc to estimate a parallel CPU run
int n_proc = 4;
std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;
verify(interp_value_cpu, interp_value_gpu, result_size);
// Host wrapper: allocates device buffers, uploads grid_values / weights / node_map,
// launches interp_kernel, copies the result into interp_value_gpu and prints the
// time spent on the transfers.
// NOTE(review): closing braces of the if-blocks and of the function are absent from this
// listing. As written, a failed call only prints a message and execution continues, and
// no cudaFree call appears for the four device buffers.
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
int fl_size = sizeof(fl_type);
int pos_size = sizeof(pos_type);
auto start = std::chrono::steady_clock::now();
//device versions of the inputs
fl_type * grid_values_device;
fl_type* weights_device;
pos_type * node_map_device;
fl_type *interp_value_device;
// element count shared by weights and node_map
int lenght_node_map = interp_dim*total_action_number;
std::cout << "size grid_values: " << grid_values_size <<std::endl;
std::cout << "size weights: " << lenght_node_map << std::endl;
std::cout << "size interp_value: " << result_size << std::endl;
//allocating and moving to the GPU the inputs
auto error_code=cudaMalloc((void**)&grid_values_device, grid_values_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the grid_values" << std::endl;
error_code=cudaMemcpy(grid_values_device, grid_values, grid_values_size*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the grid_values" << std::endl;
error_code=cudaMalloc((void**)&weights_device, lenght_node_map*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of the weights" << std::endl;
error_code=cudaMemcpy(weights_device, weights, lenght_node_map*fl_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of the weights" << std::endl;
error_code=cudaMalloc((void**)&node_map_device, lenght_node_map*pos_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of node_map" << std::endl;
error_code=cudaMemcpy(node_map_device, node_map, lenght_node_map*pos_size, cudaMemcpyHostToDevice);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMemcpy of node_map" << std::endl;
error_code=cudaMalloc((void**)&interp_value_device, result_size*fl_size);
if (error_code != cudaSuccess) {
std::cout << "Error during cudaMalloc of interp_value_device " << std::endl;
auto elapsed_moving = std::chrono::steady_clock::now() - start;
float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();
// 1D launch: one thread per result element, rounded up to whole blocks of block_size threads
int block_size = 1024;
int num_blocks = (result_size + block_size - 1) / block_size;
std::cout << "num_blocks:" << num_blocks << std::endl;
interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
// alternative 2D launch, disabled: grid x covers the columns, grid y covers the rows
//int block_size2 = 32; //each block will have block_size2*block_size2 threads
//dim3 num_blocks2(block_size2, block_size2);
//int x_grid = (total_action_number + block_size2 - 1) / block_size2;
//int y_grid = (n_sim + block_size2 - 1) / block_size2;
//dim3 grid_size2(x_grid, y_grid);
//std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
//interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
std::cout << "Cuda kernel failed! " << cudaGetErrorString(err) <<std::endl;
start = std::chrono::steady_clock::now();
// NOTE(review): the return value of this copy is not checked
cudaMemcpy(interp_value_gpu, interp_value_device, result_size*fl_size, cudaMemcpyDeviceToHost);
auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
float ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();
std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;
Moreover, I would be extremely grateful for any direction on how to improve the performance of the code.
Any time you are having trouble with a CUDA code, I recommend doing proper CUDA error checking (which you mostly seem to be doing), and also run your code with cuda-memcheck. This last utility is similar to "enabling the memory checker" in Nsight VSE, but not quite the same. However the Nsight VSE memory checker may have given you the same indication.
In C (or C++) indexing of arrays generally starts at 0. Therefore, to test for an out-of-bounds index, I must check to see if the generated index is equal to or greater than the size of the array. But in your case you are only testing for greater than:
if (row > n_sim || column > num_cols) return;
You make a similar error in both your 1D kernel and in your 2D kernel, and although you believe your 1D kernel is working correctly, it is actually making out-of-bounds accesses. You can verify this if you run with the aforementioned cuda-memcheck utility (or probably also with the memory checker that can be enabled in Nsight VSE).
When I modify your code in the pastebin link to use proper range/bounds checking, cuda-memcheck reports no errors, and your program reports the correct results. I've tested both cases, but the code below is modified from your pastebin link to uncomment the 2D case, and use that instead of the 1D case:
$ cat | more
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <random>
#include <chrono>
typedef float fl_type;
typedef int pos_type;
typedef std::chrono::milliseconds ms;
// Declaration of the host-side wrapper that runs the interpolation on the GPU.
// (The declaration is kept on one line: a line break inside `fl_type` or `int` is not valid C++.)
void cuda_interpolation_function(fl_type* interp_value_back, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim);
// CPU reference for one cell (row, column) of the result matrix.
//
// Returns the sum over inter_point in [0, interp_dim) of
//   weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim]
// where w_p = column * interp_dim. The accumulation order matches the GPU kernels.
fl_type iterp_cpu(fl_type* weights, pos_type* node_map, fl_type* grid_values, int& row, int& column, int& interp_dim, int& n_sim) {
    // Offset of this column's first entry in weights / node_map.
    int w_p = column*interp_dim;
    fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
    for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
        res += weights[w_p + inter_point] * grid_values[node_map[w_p + inter_point] * n_sim + row];
    }
    return res;
}
// 1D kernel: one thread per element of the column-major result d_matrix.
//
// Launch layout: 1D grid of 1D blocks providing at least n_sim * num_cols threads.
// The flat thread index is split into column = index / n_sim and row = index % n_sim;
// surplus threads in the last block are rejected by the bounds test.
__global__ void interp_kernel(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int column = index / n_sim;
    int row = index % n_sim;
    // Indices are zero-based, so the first invalid value equals the extent: test with >=.
    if (row >= n_sim || column >= num_cols) return;
    // Offset of this column's first entry in weights / node_map.
    int w_p = column*interp_dim;
    fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
    for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
        res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
    }
    d_matrix[index] = res;
}
// 2D kernel: one thread per element of the column-major result d_matrix.
//
// Launch layout: 2D grid of 2D blocks; the x dimension must cover num_cols columns and
// the y dimension must cover n_sim rows. Threads outside the matrix are rejected by the
// bounds test before any memory is touched.
__global__ void interp_kernel2D(fl_type * d_matrix, fl_type* weights, pos_type* node_map, fl_type* grid_values, int interp_dim, int n_sim, int num_cols) {
    int column = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    // Indices are zero-based, so the first invalid value equals the extent: test with >=.
    if (row >= n_sim || column >= num_cols) return;
    int index = column*n_sim + row;
    // Offset of this column's first entry in weights / node_map.
    int w_p = column*interp_dim;
    fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
    for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
        res += weights[w_p + inter_point] * grid_values[row + node_map[w_p + inter_point] * n_sim];
    }
    d_matrix[index] = res;
}
// Compares the CPU result (host) with the result copied back from the GPU (device),
// element by element with exact equality, and prints a summary.
//
// On mismatch it reports how many elements differ and how many of the differing device
// values are zero, then trips an assert. The assert is placed after the summary: inside
// the loop it would abort on the first mismatch and the statistics would never be printed.
void verify(fl_type *host, fl_type *device, int size) {
    int count = 0;       // elements where host and device differ
    int count_zero = 0;  // differing elements whose device value is exactly zero
    for (int i = 0; i < size; i++) {
        if (host[i] != device[i]) {
            count++;
            //std::cout <<"pos: " <<i<< " CPU:" <<host[i] << ", GPU: " << device[i] <<std::endl;
            if (device[i] == 0.0)
                count_zero++;
        }
    }
    if (count) {
        std::cout << "Non matching: " << count << "out of " << size << "(" << (float(count) / size * 100) << "%)" << std::endl;
        std::cout << "Zeros returned from the device: " << count_zero <<"(" << (float(count_zero) / size * 100) << "%)" << std::endl;
    }
    else {
        std::cout << "Perfect match!" << std::endl;
    }
    assert(count == 0);
}
// Benchmark driver.
//
// Builds random inputs, computes the interpolated result matrix on the CPU (iterp_cpu),
// computes it again on the GPU (cuda_interpolation_function), prints both timings and
// checks that the two results match (verify). Returns 0 on success, 1 if a host
// allocation fails.
int main() {
    int fl_size = sizeof(fl_type);
    int pos_size = sizeof(pos_type);
    int dim = 5; // range: 2-5
    int number_nodes = 5500; // range: 10.000-500.000
    int max_actions = 12; // range: 6-200
    int n_sim = 1000; // range: 1.000-10.000
    // 2^dim interpolation points per column; an integer shift is exact and does not need <cmath>.
    int interp_dim = 1 << dim;
    int grid_values_size = n_sim*number_nodes;

    std::default_random_engine generator;
    std::normal_distribution<fl_type> normal_dist(0.0, 1);
    std::uniform_int_distribution<> uniform_dist(0, number_nodes - 1);

    // NOTE(review): despite its name this accumulates byte counts, the weights/node_map terms
    // omit the interp_dim factor, and the total is divided by 8 when printed, so the reported
    // figure under-states the real allocation. Left as is to keep the program output unchanged.
    double bit_allocated = 0;

    fl_type * grid_values; //flattened 2d array, containing the value of the grid (n_sims x number_nodes)
    grid_values = (fl_type *)malloc(grid_values_size * fl_size);
    if (grid_values == NULL) {
        std::cerr << "Host allocation of grid_values failed" << std::endl;
        return 1;
    }
    bit_allocated += grid_values_size * fl_size;
    for (int i = 0; i < grid_values_size; i++)
        grid_values[i] = normal_dist(generator);

    pos_type * map_node2values_start; //vector that maps each node to the first column of the result matrix regarding that node
    pos_type * map_node2values_how_many; //vector that stores how many actions we have per node
    map_node2values_start = (pos_type *)malloc(number_nodes * pos_size);
    map_node2values_how_many = (pos_type *)malloc(number_nodes * pos_size);
    if (map_node2values_start == NULL || map_node2values_how_many == NULL) {
        std::cerr << "Host allocation of the node maps failed" << std::endl;
        return 1;
    }
    bit_allocated += 2 * (number_nodes * pos_size);
    for (int i = 0; i < number_nodes; i++) {
        //each node has simply max_actions
        map_node2values_start[i] = max_actions*i;
        map_node2values_how_many[i] = max_actions;
    }

    //total number of actions, which is the number of columns of the result
    int total_action_number = map_node2values_start[number_nodes - 1] + map_node2values_how_many[number_nodes - 1];

    //vectors that keep track of the columns to grab, and their weight in the interpolation
    fl_type* weights;
    pos_type * node_map;
    // weights holds fl_type elements, so it is sized with fl_size (it was sized with pos_size,
    // which only worked because both types happen to be 4 bytes wide).
    weights = (fl_type *)malloc(total_action_number*interp_dim * fl_size);
    bit_allocated += total_action_number * fl_size;
    node_map = (pos_type *)malloc(total_action_number*interp_dim * pos_size);
    bit_allocated += total_action_number * pos_size;
    if (weights == NULL || node_map == NULL) {
        std::cerr << "Host allocation of weights/node_map failed" << std::endl;
        return 1;
    }

    //filling with random numbers
    for (int i = 0; i < total_action_number*interp_dim; i++) {
        node_map[i] = uniform_dist(generator); // picking random column
        weights[i] = 1.0 / interp_dim; // uniform weights
    }
    std::cout << "done filling!" << std::endl;
    std::cout << bit_allocated / 8 / 1024 / 1024 << "MB allocated" << std::endl;

    int result_size = n_sim*total_action_number;
    fl_type *interp_value_cpu;
    bit_allocated += result_size* fl_size;
    interp_value_cpu = (fl_type *)malloc(result_size* fl_size);
    if (interp_value_cpu == NULL) {
        std::cerr << "Host allocation of interp_value_cpu failed" << std::endl;
        return 1;
    }

    // CPU reference run; the result is stored column-major (column*n_sim + row).
    auto start = std::chrono::steady_clock::now();
    for (int row = 0; row < n_sim; row++) {
        for (int column = 0; column < total_action_number; column++) {
            auto zz = iterp_cpu(weights, node_map, grid_values, row, column, interp_dim, n_sim);
            interp_value_cpu[column*n_sim + row] = zz;
        }
    }
    auto elapsed_cpu = std::chrono::steady_clock::now() - start;
    std::cout << "Crunching values on the CPU (serial): " << std::chrono::duration_cast<ms>(elapsed_cpu).count() / 1000.0 << "s" << std::endl;

    int * pp = NULL;
    cudaMalloc((void**)&pp, sizeof(int)); //initializing the device, to not affect the benchmark

    fl_type *interp_value_gpu;
    interp_value_gpu = (fl_type *)malloc(result_size* fl_size);
    if (interp_value_gpu == NULL) {
        std::cerr << "Host allocation of interp_value_gpu failed" << std::endl;
        return 1;
    }

    // GPU run; the timed interval covers allocation, transfers and the kernel.
    start = std::chrono::steady_clock::now();
    cuda_interpolation_function(interp_value_gpu, result_size, grid_values, grid_values_size, weights, node_map, total_action_number, interp_dim, n_sim);
    auto elapsed_gpu = std::chrono::steady_clock::now() - start;
    std::cout << "Crunching values on the GPU: " << std::chrono::duration_cast<ms>(elapsed_gpu).count() / 1000.0 << "s" << std::endl;

    float ms_cpu = std::chrono::duration_cast<ms>(elapsed_cpu).count();
    float ms_gpu = std::chrono::duration_cast<ms>(elapsed_gpu).count();
    // The serial CPU time is divided by n_proc to estimate a parallel CPU run; a negative
    // percentage therefore means the GPU was faster than that estimate.
    int n_proc = 4;
    std::cout << "Performance: " << (ms_gpu- ms_cpu / n_proc) / (ms_cpu / n_proc) * 100 << " % less time than parallel CPU!" << std::endl;

    verify(interp_value_cpu, interp_value_gpu, result_size);

    // Release everything that was allocated above.
    cudaFree(pp);
    free(interp_value_gpu);
    free(interp_value_cpu);
    free(node_map);
    free(weights);
    free(map_node2values_how_many);
    free(map_node2values_start);
    free(grid_values);
    return 0;
}
// Host wrapper that runs the interpolation on the GPU.
//
// Allocates device buffers, uploads grid_values / weights / node_map, launches the
// interpolation kernel (2D variant; the 1D variant is kept under `#if 0`), copies the
// n_sim x total_action_number column-major result into interp_value_gpu and prints the
// time spent on the transfers.
//
// Parameters:
//   interp_value_gpu    host buffer receiving result_size values
//   grid_values         host input, grid_values_size values
//   weights, node_map   host inputs, interp_dim * total_action_number values each
//
// On the first failing CUDA call a message is printed, the device buffers are released
// and the function returns early; interp_value_gpu is then left unmodified.
void cuda_interpolation_function(fl_type* interp_value_gpu, int result_size, fl_type * grid_values, int grid_values_size, fl_type* weights, pos_type* node_map, int total_action_number, int interp_dim, int n_sim) {
    auto start = std::chrono::steady_clock::now();

    //device versions of the inputs
    fl_type * grid_values_device = nullptr;
    fl_type * weights_device = nullptr;
    pos_type * node_map_device = nullptr;
    fl_type * interp_value_device = nullptr;

    int lenght_node_map = interp_dim*total_action_number;
    // Byte counts are computed in size_t so that large problem sizes cannot overflow int.
    const size_t grid_values_bytes = static_cast<size_t>(grid_values_size) * sizeof(fl_type);
    const size_t weights_bytes = static_cast<size_t>(lenght_node_map) * sizeof(fl_type);
    const size_t node_map_bytes = static_cast<size_t>(lenght_node_map) * sizeof(pos_type);
    const size_t result_bytes = static_cast<size_t>(result_size) * sizeof(fl_type);

    std::cout << "size grid_values: " << grid_values_size <<std::endl;
    std::cout << "size weights: " << lenght_node_map << std::endl;
    std::cout << "size interp_value: " << result_size << std::endl;

    // Frees every device buffer; cudaFree on a null pointer is a no-op.
    auto release_device = [&]() {
        cudaFree(grid_values_device);
        cudaFree(weights_device);
        cudaFree(node_map_device);
        cudaFree(interp_value_device);
    };
    // Returns true (after reporting and cleaning up) when a CUDA call did not succeed.
    auto failed = [&](cudaError_t code, const char* message) -> bool {
        if (code == cudaSuccess) return false;
        std::cout << message << std::endl;
        release_device();
        return true;
    };

    //allocating and moving to the GPU the inputs
    if (failed(cudaMalloc((void**)&grid_values_device, grid_values_bytes), "Error during cudaMalloc of the grid_values")) return;
    if (failed(cudaMemcpy(grid_values_device, grid_values, grid_values_bytes, cudaMemcpyHostToDevice), "Error during cudaMemcpy of the grid_values")) return;
    if (failed(cudaMalloc((void**)&weights_device, weights_bytes), "Error during cudaMalloc of the weights")) return;
    if (failed(cudaMemcpy(weights_device, weights, weights_bytes, cudaMemcpyHostToDevice), "Error during cudaMemcpy of the weights")) return;
    if (failed(cudaMalloc((void**)&node_map_device, node_map_bytes), "Error during cudaMalloc of node_map")) return;
    if (failed(cudaMemcpy(node_map_device, node_map, node_map_bytes, cudaMemcpyHostToDevice), "Error during cudaMemcpy of node_map")) return;
    if (failed(cudaMalloc((void**)&interp_value_device, result_bytes), "Error during cudaMalloc of interp_value_device ")) return;

    auto elapsed_moving = std::chrono::steady_clock::now() - start;
    float ms_moving = std::chrono::duration_cast<ms>(elapsed_moving).count();

#if 0
    // 1D variant: one thread per result element.
    int block_size = 1024;
    int num_blocks = (result_size + block_size - 1) / block_size;
    std::cout << "num_blocks:" << num_blocks << std::endl;
    interp_kernel << < num_blocks, block_size >> > (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
#else
    // 2D variant: grid x covers the columns, grid y covers the rows.
    int block_size2 = 32; //each block will have block_size2*block_size2 threads
    dim3 num_blocks2(block_size2, block_size2);
    int x_grid = (total_action_number + block_size2 - 1) / block_size2;
    int y_grid = (n_sim + block_size2 - 1) / block_size2;
    dim3 grid_size2(x_grid, y_grid);
    std::cout <<"grid:"<< x_grid<<" x "<< y_grid<<std::endl;
    interp_kernel2D <<< grid_size2, num_blocks2 >>> (interp_value_device, weights_device, node_map_device, grid_values_device, interp_dim, n_sim, total_action_number);
#endif

    // cudaGetLastError reports launch-configuration errors; faults raised while the kernel
    // runs only surface at a synchronizing call, so wait for the kernel here. This also
    // keeps kernel time out of the copy-back measurement below.
    cudaError err = cudaGetLastError();
    if (cudaSuccess == err)
        err = cudaDeviceSynchronize();
    if (cudaSuccess != err) {
        std::cout << "Cuda kernel failed! " << cudaGetErrorString(err) <<std::endl;
        release_device();
        return;
    }

    start = std::chrono::steady_clock::now();
    err = cudaMemcpy(interp_value_gpu, interp_value_device, result_bytes, cudaMemcpyDeviceToHost);
    if (cudaSuccess != err) {
        std::cout << "Error during cudaMemcpy of the results: " << cudaGetErrorString(err) << std::endl;
        release_device();
        return;
    }
    auto elapsed_moving_back = std::chrono::steady_clock::now() - start;
    float ms_moving_back = std::chrono::duration_cast<ms>(elapsed_moving_back).count();

    std::cout << "Time spent moving the data to the GPU:" << ms_moving << " ms"<<std::endl;
    std::cout << "Time spent moving the results back to the host: " << ms_moving_back << " ms" << std::endl;

    release_device();
}
$ nvcc -arch=sm_52 -o t375 -std=c++11
$ cuda-memcheck ./t375
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.081s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:31 ms
Time spent moving the results back to the host: 335 ms
Crunching values on the GPU: 7.089s
Performance: -5.73452 % less time than parallel CPU!
Perfect match!
========= ERROR SUMMARY: 0 errors
Note that cuda-memcheck slows down the execution of your program on the GPU to do rigorous memory bounds checking. Therefore the performance may not match the ordinary case. This is what an "ordinary" run looks like:
$ ./t375
done filling!
2.69079MB allocated
Crunching values on the CPU (serial): 30.273s
size grid_values: 5500000
size weights: 2112000
size interp_value: 66000000
grid:2063 x 32
Time spent moving the data to the GPU:32 ms
Time spent moving the results back to the host: 332 ms
Crunching values on the GPU: 1.161s
Performance: -84.6596 % less time than parallel CPU!
Perfect match!
You are accessing memory beyond the allocated chunk. To check if row and column indices are within the range:
if (row >= n_rows || column >= num_cols) return; // Do this
if (row > n_rows || column > num_cols) return; // Instead of this
In the flat version, int row = index % n_rows; keeps row below n_rows. You only access one column beyond the allocated memory, which for a small matrix could still be within the memory alignment. Python demo.
The second version accesses an extra column plus an extra element, as well as one extra element for each row (the first element of the following row), because this:
int row = blockIdx.y * blockDim.y + threadIdx.y;
no longer keeps row index within the valid range. Python demo.
Looking at your pastebin, this is probably the place where it breaks:
44. fl_type res = weights[w_p] * grid_values[row + node_map[w_p] * n_sim];
45. for (int inter_point = 1; inter_point < interp_dim; inter_point++) {
46. res += weights[w_p + inter_point] * \
grid_values[row + node_map[w_p + inter_point] * n_sim];
47. }
Suppose we have four float arrays to be used on the host side, as well as its four counterparts to be used on the device side:
// Host-side arrays.
float *x, *x2, *y, *y2;
// Device-side counterparts; this snippet only declares them.
float *d_x, *d_x2, *d_y, *d_y2;
// Each host array holds ARRAYS_SIZE floats.
x = new float[ARRAYS_SIZE];
x2 = new float[ARRAYS_SIZE];
y = new float[ARRAYS_SIZE];
y2 = new float[ARRAYS_SIZE];
Now assume that we have a very simple kernel, taken from one of the examples at NVIDIA's blog:
// SAXPY kernel: y[i] = a*x[i] + y[i] for i in [0, n).
// Launch layout: 1D grid of 1D blocks providing at least n threads; surplus threads
// are rejected by the bounds test. Must be declared __global__ to be launchable with <<<...>>>.
__global__
void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
        y[i] = a*x[i] + y[i];
}
Such kernel is to be called by the host side inside a for-loop, like the following:
// Launch both SAXPY kernels LOOP_N times. The braces are required: without them only
// the first launch would belong to the loop body.
for (int r = 0; r < LOOP_N; r++) {
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
And then I compare the execution time of such loop against its pure-CPU version:
// CPU equivalent: LOOP_N passes, each updating every element of y and y2.
for (int r = 0; r < LOOP_N; r++) {
    for (int i = 0; i < ARRAYS_SIZE; i++) {
        y[i] = 2.0f*x[i] + y[i];
        y2[i] = 2.0f*x2[i] + y2[i];
    }
}
Now, what I don't understand is the following. For instance, with ARRAYS_SIZE = 1000000 and LOOP_N = 1000, when I run both loops in the versions shown above, the ratio between the execution time of the CPU version and that of the CUDA version is around 6. That is, the CUDA version is approximately 6 times faster.
However, if I comment out one of the calls to saxpy inside the CUDA version of the loop and one of the calculations inside the CPU version of the loop, the ratio between CPU and CUDA becomes around 210. That is, the CUDA version is approximately 210 times faster.
What is the technical reason for such performance loss when merely repeating the call to a kernel, if no memory is being transferred to / from the device? Are there any workarounds to this?
A (hopefully) fully reproducible code example goes below:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Typedef and constant variables
typedef std::chrono::high_resolution_clock::time_point timers;
const int LOOP_N = 1000;
const int ARRAYS_SIZE = 1000000;
//Pretty simple kernel, from the example in Nvidia's blog
// Computes y[i] = a*x[i] + y[i] for i in [0, n).
// Launch layout: 1D grid of 1D blocks providing at least n threads; surplus threads
// are rejected by the bounds test. Must be declared __global__ to be launchable with <<<...>>>.
__global__
void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
        y[i] = a*x[i] + y[i];
}
// Main loop
// Times LOOP_N SAXPY passes over two pairs of arrays on the CPU and on the GPU and
// prints both durations and their ratio.
int main(void)
{
    timers t0, t1;
    timers tfinal0, tfinal1;

    float *x, *x2, *y, *y2;
    float *d_x, *d_x2, *d_y, *d_y2;
    x = new float[ARRAYS_SIZE];
    x2 = new float[ARRAYS_SIZE];
    y = new float[ARRAYS_SIZE];
    y2 = new float[ARRAYS_SIZE];

    //Initializing arrays at the host side:
    for (int i = 0; i < ARRAYS_SIZE; i++) {
        x[i] = 1.0f;
        x2[i] = 1.0f;
        y[i] = 2.0f;
        y2[i] = 2.0f;
    }

    // GPU memory allocation:
    cudaMalloc(&d_x, ARRAYS_SIZE * sizeof(float));
    cudaMalloc(&d_x2, ARRAYS_SIZE * sizeof(float));
    cudaMalloc(&d_y, ARRAYS_SIZE * sizeof(float));
    cudaMalloc(&d_y2, ARRAYS_SIZE * sizeof(float));

    // Transfering arrays from host to device:
    // All four arrays are uploaded; d_x2 and d_y2 are read by the second kernel and
    // would otherwise contain uninitialized device memory.
    cudaMemcpy(d_x, x, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x2, x2, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y2, y2, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);

    // CPU run //
    t0 = std::chrono::high_resolution_clock::now();
    for (int r = 0; r < LOOP_N; r++) {
        for (int i = 0; i < ARRAYS_SIZE; i++) {
            //comment one of the following out to see the point of my question:
            y[i] = 2.0f*x[i] + y[i];
            y2[i] = 2.0f*x2[i] + y2[i];
        }
    }
    tfinal0 = std::chrono::high_resolution_clock::now();
    auto time0 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal0 - t0).count();
    std::cout << "CPU: " << (float)time0 << " microseconds" << std::endl;

    // GPU-CUDA run //
    // Perform SAXPY kernel on ARRAYS_SIZE elements, for LOOP_N times
    t1 = std::chrono::high_resolution_clock::now();
    for (int r = 0; r < LOOP_N; r++) {
        //comment one of the following out to see the point of my question:
        saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
        saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
    }
    // NOTE(review): kernel launches are asynchronous and nothing waits for them before the
    // timestamp below, so this interval measures launch/queueing cost rather than kernel
    // execution time. Call cudaDeviceSynchronize() before stopping the timer to measure the
    // actual work.
    tfinal1 = std::chrono::high_resolution_clock::now();
    auto time1 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal1 - t1).count();
    std::cout << "CUDA: " << (float)time1 << " microseconds" << std::endl;

    //Display performance ratio CPU / GPU-CUDA
    std::cout << "Ratio CPU/CUDA: " << (float)time0 / (float)time1 << std::endl;

    //Freeing memory used by arrays:
    cudaFree(d_x);
    cudaFree(d_x2);
    cudaFree(d_y);
    cudaFree(d_y2);
    delete[] x;
    delete[] x2;
    delete[] y;
    delete[] y2;

    return 0;
}
You are not waiting for the kernels to be finished. As all kernel launches are asynchronous, you need to explicitly call cudaDeviceSynchronize() before stopping your timer.
The differences you are observing with variants of your current code likely stem from the fact that the queue for kernels to launch is finite, so at some point your code will start waiting for part of your kernels anyways.
On Windows, kernel batching also plays into this: until some number of launches has been queued (or a timeout expires), the driver will not even start to launch kernels.
A simple change solves the problem, but I would still very much appreciate learning the technical reasons for all this.
The solution is to merely change, in my toy example above, the kernel to:
// Fused SAXPY kernel: updates both array pairs in one launch,
//   y[i] = a*x[i] + y[i]  and  y2[i] = a*x2[i] + y2[i]  for i in [0, n).
// Launch layout: 1D grid of 1D blocks providing at least n threads. Both updates sit
// inside the bounds test; without the braces the y2 update would run for surplus
// threads too and write past the end of the arrays.
__global__
void saxpy(int n, float a, float *x, float *y, float *x2, float *y2)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] = a*x[i] + y[i];
        y2[i] = a*x2[i] + y2[i];
    }
}
And then call it only once, like the following:
// One fused launch per pass, repeated LOOP_N times.
for (int r = 0; r < LOOP_N; r++) {
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y, d_x2, d_y2);
}
Now the performance difference against the CPU implementation is just the same - which should be expected.
If someone can explain why this makes a difference, please post an answer and I will accept it over mine.