Convert Eigen::SparseMatrix to cuSparse and vice versa - c++

I am having trouble figuring out how to convert an Eigen::SparseMatrix to cuSparse, because there is very little documentation and few examples online. For dense matrices, converting from Eigen to CUDA for cuBLAS is fairly straightforward:
// Dense case: copy the matrix's contiguous host storage to the device unchanged.
Eigen::MatrixXd A = Eigen::MatrixXd::Identity(3,3);
double *d_A;
cudaMalloc(reinterpret_cast<void **>(&d_A), 3 * 3 * sizeof(double));
// A.data() is the host pointer to the 3x3 coefficient buffer (the copy source).
cudaMemcpy(d_A, A.data(), sizeof(double) * 3 * 3, cudaMemcpyHostToDevice);
// do cublas operations on d_A
How to do the equivalent for the sparse matrices?
// Build a 3x3 sparse matrix with ones on the diagonal from three triplets.
std::vector<Eigen::Triplet<double>> trip;
trip.emplace_back(0, 0, 1);
trip.emplace_back(1, 1, 1);
trip.emplace_back(2, 2, 1);
Eigen::SparseMatrix<double> A(3, 3);
A.setFromTriplets(trip.begin(), trip.end());
// Intended device-side counterpart; the steps below are the open question.
double *d_A;
// cudaMalloc?
// cudaMemcpy? some conversion?
// do cusparse operations

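Just in case people are interested, I figured it out. The tricky part is that Eigen's sparse matrix is stored in CSC format, whereas cuSparse uses CSR format. Fortunately, the conversion is simple: the CSC arrays of a matrix are exactly the CSR arrays of its transpose, so the buffers can be copied as-is and interpreted as the transposed matrix.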
// Uploads the three storage arrays of an Eigen sparse matrix to device buffers.
//
// The arrays are copied verbatim, so on the device they describe the TRANSPOSE
// of `mat` when read as CSR (Eigen's column-compressed arrays of A are the
// row-compressed arrays of A^T).
//
// mat : host matrix; assumed to be in compressed storage so that the three
//       raw arrays are contiguous -- TODO confirm callers call makeCompressed().
// row : device buffer of at least mat.cols() + 1 ints; receives the outer
//       (start-offset) array.
// col : device buffer of at least mat.nonZeros() ints; receives the inner indices.
// val : device buffer of at least mat.nonZeros() doubles; receives the values.
//
// The buffers must already be allocated with cudaMalloc by the caller.
void EigenSparseToCuSparseTranspose(
    const Eigen::SparseMatrix<double> &mat, int *row, int *col, double *val)
{
  const int num_non0 = mat.nonZeros();
  const int num_outer = mat.cols() + 1;
  cudaMemcpy(row, mat.outerIndexPtr(),
             sizeof(int) * num_outer, cudaMemcpyHostToDevice);
  cudaMemcpy(col, mat.innerIndexPtr(),
             sizeof(int) * num_non0, cudaMemcpyHostToDevice);
  cudaMemcpy(val, mat.valuePtr(),
             sizeof(double) * num_non0, cudaMemcpyHostToDevice);
}
void CuSparseTransposeToEigenSparse(
const int *row,
const int *col,
const double *val,
const int num_non0,
const int mat_row,
const int mat_col,
Eigen::SparseMatrix<double> &mat)
std::vector<int> outer(mat_col + 1);
std::vector<int> inner(num_non0);
std::vector<double> value(num_non0);
cudaMemcpy(, row, sizeof(int) * (mat_col + 1), cudaMemcpyDeviceToHost);
cudaMemcpy(, col, sizeof(int) * num_non0, cudaMemcpyDeviceToHost);
cudaMemcpy(, val, sizeof(double) * num_non0, cudaMemcpyDeviceToHost);
Eigen::Map<Eigen::SparseMatrix<double>> mat_map(
mat_row, mat_col, num_non0,,,;
mat = mat_map.eval();


Did CUDA implement std::complex operator =?

I modified the int version of the vector-add example so that it operates on two complex vectors. The code below works, but I am confused about why:
#include <stdio.h>
#include <complex>
#define N (2048*2048)
// Test kernel for std::complex in device code: copies a[index] into c[index].
// Expects a 1-D launch; each thread handles the element at its global index.
// b is only used by the commented-out variants.
__global__ void add(std::complex<double> *a, std::complex<double> *b, std::complex<double> *c)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    // Guard the grid tail: the launch rounds the block count up, so threads
    // past the N elements must not touch memory.
    if (index < N) {
        // c[index] = a[index] + b[index];
        // c[index] = a[index].real();
        c[index] = a[index];
    }
}
int main()
// host side
std::complex<double> *a;
std::complex<double> *b;
std::complex<double> *c;
// device side
std::complex<double> *d_a;
std::complex<double> *d_b;
std::complex<double> *d_c;
int size = N * sizeof(std::complex<double>);
/* allocate space for device copies of a, b, c */
cudaMalloc( (void **) &d_a, size );
cudaMalloc( (void **) &d_b, size );
cudaMalloc( (void **) &d_c, size );
/* allocate space for host copies of a, b, c and setup input values */
a = (std::complex<double>*)malloc( size );
b = (std::complex<double>*)malloc( size );
c = (std::complex<double>*)malloc( size );
for( int i = 0; i < N; i++ )
a[i] = b[i] = i;
c[i] = 0;
cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice );
cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice );
add<<< std::ceil(N / (double)THREADS_PER_BLOCK), THREADS_PER_BLOCK>>>( d_a, d_b, d_c );
cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost);
bool success = true;
for( int i = 0; i < N; i++ )
// if( c[i] != a[i] + b[i])
if( c[i] != a[i] )
printf("c[%d] = %d\n",i,c[i] );
success = false;
printf("%s\n", success ? "success" : "fail");
cudaFree( d_a );
cudaFree( d_b );
cudaFree( d_c );
return 0;
for the kernel function:
// Same test kernel as in the full listing: copies a[index] into c[index].
// Expects a 1-D launch; b is only used by the commented-out variants.
__global__ void add(std::complex<double> *a, std::complex<double> *b, std::complex<double> *c)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    // Skip threads past the N elements when the grid is rounded up.
    if (index < N) {
        // c[index] = a[index] + b[index];
        // c[index] = a[index].real();
        c[index] = a[index];
    }
}
c[index] = a[index];
will call std::complex operator =, this can pass compile,
but when change to compile with line:
c[index] = a[index] + b[index]; // first one
c[index] = a[index].real(); // second one
It will just cannot compile, the error message for the first one is: error: calling a host function("std::operator
+ ") from a global function("add") is not allowed error: identifier "std::operator + " is
undefined in device code
the error message when change to use the second one is like: error: calling a constexpr host function("real")
from a global function("add") is not allowed. The experimental
flag '--expt-relaxed-constexpr' can be used to allow this.
1 error detected in the compilation of
The compile command I used:
/usr/local/cuda-10.2/bin/nvcc -o complex
I know that device code cannot call host code, and that real() and operator+ for std::complex are both host functions, so they cannot be called in a kernel. What I do not understand is why std::complex's operator= compiles in my kernel function.
After overloading operator+ for std::complex, the code above produces the desired result:
// Host/device addition for std::complex<double> that avoids every
// std::complex member function except the implicitly-defined copy.
//
// The result starts as a copy of `a`, then `b` is added component-wise through
// a double* view of the object. Viewing a std::complex<T> as T[2] (real part at
// index 0, imaginary part at index 1) is the access the C++ standard guarantees;
// the opposite cast, from a raw double[2] to std::complex, is not.
__host__ __device__ std::complex<double> operator+(const std::complex<double>& a, const std::complex<double>& b)
{
    std::complex<double> sum = a;
    double* out = reinterpret_cast<double*>(&sum);
    const double* rhs = reinterpret_cast<const double*>(&b);
    out[0] += rhs[0];  // real parts
    out[1] += rhs[1];  // imaginary parts
    return sum;        // plain return; no std::move needed
}
This works because the underlying layout of std::complex<T> is in fact an array of two values of the element type, e.g. double[2]; the benefit is that the same function parameters can be used on both the host and the device side. However, I still recommend using thrust::complex or another complex-number library in CUDA.
No, CUDA C++ does not implement std::complex<T>::operator+() as a built-in.
The std::complex<T> type is not implemented for the GPU; all of its methods are written host-only. The exceptions are constexpr methods and, as @RobertCrovella noted, the compiler's willingness to treat some or all implicitly-declared methods as __host__ __device__, e.g. copy constructors or assignment operators. This is why c[index] = a[index] works: it uses an implicitly-defined assignment operator.
For using complex numbers on the device side, consider this question:
CUDA - How to work with complex numbers?

transpose different matrices in parallel

I have three matrices of different sizes and want to transpose them in parallel.
First I put them in a 2D array using malloc, and then use cudaMalloc and cudaMemcpy to transfer the array from the host (h_B) to the device (d_B).
Inside the kernel, threadIdx is used to find the address of each matrix in the array, and the cuBLAS function cublasSgeam performs the transpose.
Here is my code.
The code compiles, but I cannot get a result. It seems that float *A = new float[m*n] inside a __global__ function is not a good approach.
Does anyone have any ideas about this?
Thanks so much!
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
/* Includes, cuda helper functions */
#include <helper_cuda.h>
// Question's original kernel: each thread is meant to transpose one matrix by
// calling cublasSgeam from device code.
// Thread idx reads its dimensions from M_A[idx] / N_A[idx] and the element
// offset of its matrix inside ptrA / ptrC from address[idx]. N is not used.
__global__ void transposeCublasSgeam(int *M_A, int *N_A, float *ptrA, float *ptrC, const int N, int *address)
cublasHandle_t cnpHandle;
cublasStatus_t status = cublasCreate(&cnpHandle);
// NOTE(review): these scalars are thread-local; their addresses are handed to
// cublasSgeam below.
const float d_alpha = 1.0f;
const float d_beta = 0.0f;
int idx = threadIdx.x;
int m = M_A[idx]; //A_row
int n = N_A[idx]; //A_col
// NOTE(review): both allocations are overwritten by the two assignments that
// follow, so the new[] memory is never used or released, and the delete[]
// calls at the end receive pointers into ptrA / ptrC instead.
float *A = new float[m*n];
float *C = new float[m*n];
A = ptrA+address[idx];
C = ptrC+address[idx];
// Both inputs are A, both flagged as transposed, with beta = 0; the result
// goes to C with leading dimension m.
cublasSgeam(cnpHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &d_alpha, (const float*)A, n, &d_beta, (const float *)A, n, C, m);
delete[] A;
delete[] C;
// Question's original host driver: packs three small matrices and launches one
// thread per matrix.
int main()
const int N = 3;
int M_B[N] = { 2,3,2 }; //row number of matrices
int N_B[N] = { 3,2,4 }; //col number of matrices
float a[6] = { 1,2,3,
4,5,6 };
// NOTE(review): this initializer is cut off in the listing (closing values and brace missing).
float b[6] = { 1,2,
float c[8] = { 1,2,3,1,
2,3,4,5 };
// h_B / h_BT are arrays of N host pointers, one per matrix.
float **h_B = (float**)malloc(N * sizeof(float*));
float **h_BT = (float**)malloc(N * sizeof(float*));
h_B[0] = a, h_BT[0] = a;
h_B[1] = b, h_BT[1] = b;
h_B[2] = c, h_BT[2] = c;
int NUM_B = 20; // total number of elements
// Element offset of each matrix within the packed buffer.
int address[] = {0,6,12};
float *d_B, *d_BT;
checkCudaErrors(cudaMalloc((void **)&d_B, NUM_B * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_BT, NUM_B * sizeof(float)));
// NOTE(review): the copy source is the pointer array h_B (N pointers), yet
// NUM_B floats are copied; the matrix values themselves are never transferred.
checkCudaErrors(cudaMemcpy(d_B, h_B, NUM_B * sizeof(float), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_BT, h_BT, NUM_B * sizeof(float), cudaMemcpyHostToDevice));
// NOTE(review): M_B, N_B and address are host stack arrays passed straight to the kernel.
transposeCublasSgeam<<<1,N>>>(M_B, N_B, d_B,d_BT, N,address);
checkCudaErrors(cudaMemcpy(h_BT, d_BT, NUM_B * sizeof(float), cudaMemcpyDeviceToHost));
// NOTE(review): these buffers came from malloc but are released with delete[].
delete[] h_B;
delete[] h_BT;
return 0;
There were a number of errors in your code. I will probably miss some in my description.
Note that this cublas-in-device-code functionality is no longer available in newer CUDA versions.
Every pointer that is passed to device code needs an allocation with cudaMalloc. You had done cudaMalloc for a few pointers, but not all of them.
You're confused about pointers and arrays of pointers. I won't be able to sort all of that out for you. Your kernel design really doesn't need the complexity of using arrays of pointers. So I've removed all that.
In CUDA dynamic parallelism (CDP), pointers to the local address space cannot be passed to child kernels. You can't use alpha and beta in the local address space, and pass pointers to those to CUBLAS in CDP.
To do a pure transpose, study the CUBLAS Sgeam documentation for the recommended parameters to use.
I believe there were other things I fixed. Please study this example:
$ cat
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
/* Includes, cuda helper functions */
#include <helper_cuda.h>
// Transposes one matrix per thread using device-side cublasSgeam.
//
// Launch layout: a single block with at least N threads; thread idx handles
// matrix idx. Requires the device cuBLAS library (-lcublas_device, -rdc=true).
//
// M_A, N_A : device arrays with the row / column count of each matrix.
// ptrA     : device buffer holding all input matrices back to back.
// ptrC     : device buffer receiving the transposed matrices, same packing.
// N        : number of matrices.
// address  : device array with the element offset of each matrix in ptrA/ptrC.
__global__ void transposeCublasSgeam(int *M_A, int *N_A, float *ptrA, float *ptrC, const int N, int *address)
{
    int idx = threadIdx.x;
    // Threads beyond the matrix count have no entry in M_A / N_A / address.
    if (idx >= N) return;

    cublasHandle_t cnpHandle;
    cublasStatus_t status = cublasCreate(&cnpHandle);
    printf("thread: %d, error1: %d\n", threadIdx.x, (int)status);

    float *d_alpha = new float; // a pointer to device-heap, not local memory
    *d_alpha = 1.0f;
    float *d_beta = new float;
    *d_beta = 0.0f;

    int m = M_A[idx]; //A_row
    int n = N_A[idx]; //A_col
    // C = 1 * A^T + 0 * C: with beta == 0 the second operand only supplies a
    // valid pointer, so the output buffer itself is passed for it.
    status = cublasSgeam(cnpHandle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, d_alpha, ptrA+address[idx], n, d_beta, ptrC+address[idx], m, ptrC+address[idx], m);
    printf("thread: %d, error2: %d\n", threadIdx.x, (int)status);

    // Wait for the work launched by cuBLAS before releasing the scalars it reads,
    // then release the per-thread resources that were previously leaked.
    cudaDeviceSynchronize();
    delete d_alpha;
    delete d_beta;
    cublasDestroy(cnpHandle);
}
int main()
const int N = 3;
int M_B[N] = { 2,3,2 }; //row number of matrices
int N_B[N] = { 3,2,4 }; //col number of matrices
float a[6] = { 1,2,3,
4,5,6 };
float b[6] = { 1,2,
float c[8] = { 1,2,3,1,
2,3,4,5 };
float *h_Bdata = (float *)malloc(sizeof(a)+sizeof(b)+sizeof(c));
float *h_BTdata = (float *)malloc(sizeof(a)+sizeof(b)+sizeof(c));
memcpy(h_Bdata, a, sizeof(a));
memcpy(h_Bdata+(sizeof(a)/sizeof(a[0])), b, sizeof(b));
memcpy(h_Bdata+(sizeof(a)/sizeof(a[0]))+(sizeof(b)/sizeof(b[0])), c, sizeof(c));
int NUM_B = 20; // total number of elements
int address[] = {0,6,12};
int *d_address;
cudaMalloc(&d_address, sizeof(address));
cudaMemcpy(d_address, address, sizeof(address), cudaMemcpyHostToDevice);
int *d_M_B, *d_N_B;
cudaMalloc(&d_M_B, sizeof(M_B));
cudaMalloc(&d_N_B, sizeof(N_B));
cudaMemcpy(d_M_B, M_B, sizeof(M_B), cudaMemcpyHostToDevice);
cudaMemcpy(d_N_B, N_B, sizeof(N_B), cudaMemcpyHostToDevice);
float *d_B, *d_BT;
checkCudaErrors(cudaMalloc((void **)&d_B, NUM_B * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_BT, NUM_B * sizeof(float)));
checkCudaErrors(cudaMemcpy(d_B, h_Bdata, NUM_B * sizeof(float), cudaMemcpyHostToDevice));
transposeCublasSgeam<<<1,N>>>(d_M_B, d_N_B, d_B,d_BT, N,d_address);
checkCudaErrors(cudaMemcpy(h_BTdata, d_BT, NUM_B * sizeof(float), cudaMemcpyDeviceToHost));
std::cout << "B , BT" << std::endl;
for (int i = 0; i < NUM_B; i++){
std::cout << h_Bdata[i] << " , " << h_BTdata[i] << std::endl;}
return 0;
$ /usr/local/cuda-8.0/bin/nvcc -I/usr/local/cuda-8.0/samples/common/inc -rdc=true -lcublas_device -lcudadevrt -arch=sm_35 -o t1433
$ LD_LIBRARY_PATH=/usr/local/cuda-8.0/lib64 CUDA_VISIBLE_DEVICES="3" cuda-memcheck ./t1433
B , BT
1 , 1
2 , 4
3 , 2
4 , 5
5 , 3
6 , 6
1 , 1
2 , 3
3 , 5
4 , 2
5 , 4
6 , 6
1 , 1
2 , 2
3 , 2
1 , 3
2 , 3
3 , 4
4 , 1
5 , 5
========= ERROR SUMMARY: 0 errors

CUDA Simple Array Search - Shared Memory

I am writing a function to search for the first occurrence that matches a specific criterion. In my particular problem, I am interested in finding the array index corresponding to a circle that contains a given point. I have three arrays that collectively describe the circles: x coordinates, y coordinates, and radii (cx, cy, cr). Given an input point, I calculate whether or not it lies inside each of the circles defined by the three arrays. I chose to declare the x, y, and radii arrays as device-side globals because my search function will be called often and these arrays will not change.
This function seems pretty straight forward, but I'm getting the following error:
cudaMemcpy(&res, dev_idx, sizeof(int), cudaMemcpyDeviceToHost) returned the launch timed out and was terminated(6)
static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define HANDLE_ERROR(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
// global memory arrays on the device
__device__ __constant__ double* cx;
__device__ __constant__ double* cy;
__device__ __constant__ double* cr;
// Finds the lowest index of a circle that contains the point (px, py).
//
// px, py: x and y coordinates of the search point
// fidx:   device variable receiving the index of the matching circle; it must
//         be pre-set by the caller to a value larger than any valid index,
//         because matches are merged with atomicMin
// count:  total number of circle elements in the device arrays cx, cy, cr
//
// Works with any 1-D launch size: every thread starts at its global index and
// advances by the total thread count until it runs past `count`.
__global__ void _cuda_find_containing_circle(double px, double py, int* fidx, int count){
    const int stride = blockDim.x * gridDim.x;
    for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < count; tid += stride) {
        // distance from the circle centre to the point; the local must not be
        // called `hypot`, or it hides the function inside its own initializer
        float dist = (float)hypot(cx[tid] - px, cy[tid] - py);
        float radius = (float)cr[tid];
        // if the distance is within the radius, record the current index
        if (dist <= radius){
            atomicMin(fidx, tid);
        }
    }
}
// Question's original host driver: builds 100 test circles, uploads them and
// searches for the one that contains the test point.
void main(){
// define a search point for testing
int px = 100;
int py = 150;
// initialize cx, cy, cr device values using the following arrays
double *circlex;
double *circley;
double *circler;
int count = 100;
circlex = (double *) malloc(sizeof(double) * count);
circley = (double *) malloc(sizeof(double) * count);
circler = (double *) malloc(sizeof(double) * count);
// populate arrays with values that will not pass the search criteria
for (int i = 0; i < count; i++) {
circlex[i] = 2.1;
circley[i] = 3.2;
circler[i] = 0.0;
// add a single value that will pass the search criteria (for testing)
circlex[count - 5] = 101.0;
circley[count - 5] = 160.0;
circler[count - 5] = 11.0; //hypot should result in 10.0498 < 11
// copy these data onto the device
// NOTE(review): each call copies sizeof(double*) bytes taken from the address
// of a host pointer variable, i.e. the symbol receives a host address and no
// circle data reaches the device.
HANDLE_ERROR(cudaMemcpyToSymbol(cx, &circlex, sizeof(circlex), 0, cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpyToSymbol(cy, &circley, sizeof(circley), 0, cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpyToSymbol(cr, &circler, sizeof(circler), 0, cudaMemcpyHostToDevice));
// create an object on the device to store the search index result
int* dev_idx;
int idx = 999; // initial condition. If 999 is returned then I know that a match was not found
HANDLE_ERROR(cudaMalloc((void **) &dev_idx, sizeof(int)));
HANDLE_ERROR(cudaMemcpy(dev_idx, &idx, sizeof(int), cudaMemcpyHostToDevice));
// call the search function
_cuda_find_containing_circle <<<128, 128>>> (px, py, dev_idx, count);
// get the search result
// this line throws the following error: cudaMemcpy(&res, dev_idx, sizeof(int), cudaMemcpyDeviceToHost) returned the launch timed out and was terminated(6)
// (this is the first checked call after the kernel launch)
int res;
HANDLE_ERROR(cudaMemcpy(&res, dev_idx, sizeof(int), cudaMemcpyDeviceToHost));
cout << "IDX = " << res << endl;
static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
if (err == cudaSuccess)
std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
exit (1);
Is there something fundamentally incorrect with my approach?
The core problem is this:
cudaMemcpyToSymbol(cx, &circlex, sizeof(circlex), 0, cudaMemcpyHostToDevice);
cx is an uninitialised device pointer, and this call copies sizeof(double *) bytes into it from the address of a host pointer variable. That leaves cx holding a host address that is meaningless on the device, which results in an illegal memory operation in your kernel.
Do something like this instead:
double * _cx; cudaMalloc((void **)&_cx, sizeof(double) * count);
cudaMemcpy(_cx, circlex, sizeof(double) * count, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(cx, &_cx, sizeof(_cx));
i.e. allocate a device memory buffer, copy the host source data to that buffer, then copy the address of that buffer to the constant memory pointer.

Numerical error in cuda/cublas simple kernel using particular input

I am working with cuda and cublas and I was trying to implement simple operations like matrix element-wise multiplication/division. I am using only float for my experiments. I know the most obvious way to do it is to write a kernel like this one:
// Scales the first n elements of source by value and stores them in dest.
// Each thread starts at its global index and advances by the total number of
// launched threads until it passes n.
__global__ void mul_elementwise(const unsigned int n, float* source, float* dest, const float value)
{
    const unsigned int step = blockDim.x * gridDim.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < n) {
        dest[idx] = source[idx] * value;
        idx += step;
    }
}
This kernel can work both for multiplication and division (just using 1/x as value). But this can be achieved using cublas library too: suppose we have a matrix A m x n stored in column-major style and a scalar x, then setting alpha = x or alpha = 1/x and d_ones as a vector of m*n 1s, we can invoke and obtain the same result
cublasSaxpy(cublas_handle, m * n, &alpha, d_ones, 1, A_dev, 1);
Both methods work just fine, but I am facing a few problems with one particular matrix, for which neither method works. I isolated this big matrix and built an MCVE, available here (you can compile it with nvcc -lcublas). As you can see, the results in both cases are totally wrong: the host result is completely different. I am trying to figure out what is going on. I do not see any error in the code, but maybe I should try using double instead of float and see what happens.
Any opinions about this situation? Thanks in advance!
EDIT #1 I tried using doubles but nothing changes if I use cublasDaxpy meanwhile it works perfectly with the custom kernel. I think the values are too small so single floating point precision is not enough.
Interesting MCVE. Wouldn't it have been possible to shrink your vector down to just a few elements? Isn't it possible to show the calculation discrepancy based on just 1 vector element?
Anyway I see several problems.
Your kernel implements the following function: y=alpha*x. But SAXPY implements y=alpha*x+y. Now, if y started out as (all) zero, then these two would be the same. But that's not what you have:
CUBLAS Your Kernel
alpha: alpha alpha
x: 1 ahost (ahost is your huge data array)
y: ahost -
So your kernel is computing y=alpha * ahost, but your CUBLAS call is computing y = alpha*1 + ahost. I wouldn't expect the same result from these, in general.
Your analysis of error seems flawed in a few ways. First, you are computing the absolute error in a float variable (a number which will always be positive, since it's the absolute value), but then you're comparing it against a negative number:
float diff = abs(host[i]-dev[i]);
if (diff > (-1e12))
won't that if test always be true? Perhaps you meant 1e-12 although that would still be flawed. Looking for a fixed error threshold on a floating point comparison should be scaled to the size of the numbers being compared. float quantities only contain about 6-7 accurate decimal digits. (And summing these errors is also troublesome.)
Here is a complete code that has the above issues fixed, and produces zero sum error for all the comparisons (host<->kernel and host<->cublas):
static float array[] = {0x00000000,
#include <iostream>
#include <stdio.h>
#include <cublas_v2.h>
#include <assert.h>
#define TOL 0.0001
typedef unsigned int u32;
#define GET_STRIDE() u32(blockDim.x * gridDim.x)
#define GET_OFFSET() u32(blockIdx.x * blockDim.x + threadIdx.x)
// Pass-through checker for CUDA runtime calls.
// In builds that define DEBUG or _DEBUG a failure is printed and then trips an
// assert; in all other builds the status is returned untouched.
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
#endif
  return result;
}
// Writes source[i] * value to dest[i] for every i < n. Division is done by
// passing the reciprocal as value. Threads start at GET_OFFSET() and advance
// by GET_STRIDE().
__global__ void div_elementwise(const u32 n, float* source, float* dest, const float value)
{
  const u32 step = GET_STRIDE();
  u32 idx = GET_OFFSET();
  while (idx < n) {
    dest[idx] = source[idx] * value;
    idx += step;
  }
}
float check_eq(float* dev, float* host, u32 len)
float sum = 0.0f;
for (u32 i = 0; i < len; ++i)
if (dev[i]!=host[i])
//printf("diff %d %f %f\n", i, dev[i], host[i]);
float diff = abs((host[i]-dev[i])/host[i]);
sum += diff;
if (diff > (TOL))
printf("diff %d %f\n", i, diff);
printf("%f\n", sum);
return sum;
void div_host(float* a, float v, u32 len)
for (u32 i = 0; i < len; ++i)
int main()
u32 len = sizeof(array)/sizeof(float);
printf("array len = %d\n", len);
for (int i =0; i < len; i++) if (isnan(array[i])) {printf("nan value at %d\n",i); return -1;}
float* adev, *adevcublas, *d_zero;
float* ahost = (float*) malloc(len * sizeof(float));
checkCuda(cudaMalloc(&adev, len * sizeof(float)));
checkCuda(cudaMalloc(&adevcublas, len * sizeof(float)));
checkCuda(cudaMalloc(&d_zero, len * sizeof(float)));
memcpy(ahost, &array[0], len * sizeof(float));
checkCuda(cudaMemcpy(adev, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(adevcublas, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemset(d_zero, 0, len*sizeof(float)));
float alpha = 1/2494.f;
printf("%f\n", alpha);
div_host(ahost, alpha, len);
u32 tb = 256;
div_elementwise<<<((len + tb - 1) / tb),tb>>>(len, adev, adev, alpha);
float* r = (float*) malloc(len * sizeof(float));
checkCuda(cudaMemcpy(r, adev, len * sizeof(float), cudaMemcpyDeviceToHost));
cublasHandle_t ch;
float* r0 = (float*) malloc(len * sizeof(float));
cublasStatus_t stat = cublasSaxpy(ch, len, &alpha, adevcublas, 1, d_zero, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
checkCuda(cudaMemcpy(r0, d_zero, len * sizeof(float), cudaMemcpyDeviceToHost));
return 0;

cublasSdot is working slower than cublasSgemm

In my toy example I first multiply matrices of size 32x32, 100 000 times, and after that I calculate scalar products of two vectors of size 1024, 100 000 times again. For the first I used cublasSgemm, for the second - cublasSdot.
As a result, time for first calculation is 530 msec, for the second - 10 000 msec. However, in order to multiply matrices we need to perform 32^3 operations (multiply-add), and for scalar product just 1024=32^2 operations.
So why am I getting such result? Here is the code:
__device__ float res;
// Fills data[0..size) with pseudo-random floats in [0, 1] drawn from rand().
void randomInit(float *data, int size)
{
    float *cursor = data;
    for (int remaining = size; remaining > 0; --remaining) {
        *cursor = rand() / (float)RAND_MAX;
        ++cursor;
    }
}
// Question's original benchmark: times `num` 32x32 cublasSgemm calls against
// `num` length-1024 cublasSdot calls using CUDA events.
int main(){
cublasHandle_t handle;
float out;
cudaError_t cudaerr;
// NOTE(review): no cudaEventCreate call for these events appears in this listing.
cudaEvent_t start1, stop1,start2,stop2;
cublasStatus_t stat;
int size = 32;
int num = 100000;
float *h_A = new float[size*size];
float *h_B = new float[size*size];
float *h_C = new float[size*size];
float *d_A, *d_B, *d_C;
const float alpha = 1.0f;
const float beta = 0.0f;
randomInit(h_A, size*size);
randomInit(h_B, size*size);
cudaMalloc((void **)&d_A, size *size *sizeof(float));
cudaMalloc((void **)&d_B, size *size * sizeof(float));
cudaMalloc((void **)&d_C, size *size * sizeof(float));
stat = cublasCreate(&handle);
// Untimed call issued before the first timing window starts (d_A / d_B have
// not been filled yet at this point).
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &alpha, d_A, size,
d_B, size, &beta, d_C, size);
// ---- timed section 1: upload + num x Sgemm + download ----
cudaEventRecord(start1, NULL);
cudaMemcpy(d_A, h_A, size *size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size *size * sizeof(float), cudaMemcpyHostToDevice);
for (int i = 0; i < num; i++){
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &alpha, d_A,
size, d_B, size, &beta, d_C, size);
cudaMemcpy(h_C, d_C, size*size*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop1, NULL);
// NOTE(review): no cudaEventSynchronize(stop1) appears before the elapsed time is read.
float msecTotal1 = 0.0f;
cudaEventElapsedTime(&msecTotal1, start1, stop1);
std::cout <<"total time for MAtMul:" << msecTotal1 << "\n";
// ---- timed section 2: upload + num x Sdot ----
cudaEventRecord(start2, NULL);
cudaMemcpy(d_A, h_A, size *size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size *size * sizeof(float), cudaMemcpyHostToDevice);
for (int i = 0; i < num; i++){
// NOTE(review): &res takes the address of the __device__ variable `res` in host code.
cublasSdot(handle, 1024, d_A , 1, d_B , 1, &res);
cudaEventRecord(stop2, NULL);
float msecTotal2 = 0.0f;
cudaEventElapsedTime(&msecTotal2, start2, stop2);
std::cout << "total time for dotVec:" << msecTotal2 << "\n";
// Only the host buffers are released; d_A, d_B, d_C and the handle are not.
delete[] h_A;
delete[] h_B;
delete[] h_C;
return 1;
Update: I also tried to perform the dot product with cublasSgemm by treating each vector as a 1-by-1024 matrix. The result is 3550 msec, which is better, but still about 7 times longer than the first calculation.
One problem is that you're not handling the pointer mode correctly for the call to cublasSdot.
You'll want to read this section of the manual.
Furthermore this:
cublasSdot(handle, 1024, d_A , 1, d_B , 1, &res);
is illegal under any circumstances. It is not legal in CUDA to take the address of a device variable in host code. You can certainly do it, but the results are garbage.
When I modify your code as follows:
// Switch cuBLAS to device pointer mode so that the dot-product result is
// written through a device pointer.
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
// Result location allocated with cudaMalloc, replacing &res.
float *dres;
cudaMalloc(&dres, sizeof(float));
cudaEventRecord(start2, NULL);
cudaMemcpy(d_A, h_A, size *size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size *size * sizeof(float), cudaMemcpyHostToDevice);
for (int i = 0; i < num; i++){
// Print a dot for every call that does not return success.
if(cublasSdot(handle, 1024, d_A , 1, d_B , 1, dres) != CUBLAS_STATUS_SUCCESS) {std::cout << ".";}
I get about a 2:1 ratio of execution time for cublasSdot to cublasSgemm which may be plausible, particularly for these sizes. Under the hood, the dot operation implies a parallel reduction. 1024 threads can compute the partial results, but then a 1024-thread-wide parallel reduction is required. The gemm does not need a parallel reduction, and so may be quicker. 1024 threads can be assigned to produce the 1024 results each in a single thread. For a memory-bound algorithm, the difference between 32^2 and 32^3 operations may not be that significant, but the parallel reduction implies significant additional operations. When I then change size in your program from 32 to 128, I see the ratio reverse, and the matrix multiply does indeed become 3x longer than the dot product.