I tried to make a device functor that essentially performs (unoptimized) matrix-vector multiplication like so
namespace cusolve
template <class value_type,
class matrix_type = value_type*,
class vector_type = value_type*>
struct linear_operator
const matrix_type matrix;
const size_t width;
linear_operator(const matrix_type matrix, size_t width)
: matrix(matrix), width(width) { }
void operator()(const vector_type x, vector_type x_out)
auto col = blockIdx.x * blockDim.x + threadIdx.x;
auto row = blockIdx.y * blockDim.y + threadIdx.y;
x_out[row] = 0;
if (row < width)
for (size_t i = 0; i < width; i++)
x_out[row] += matrix[row*width + i] * x[i];
So, this assumes that matrix, x, and x_out are device pointers. So, to test it I tried to call it from a simple kernel
operateKernel(double *d_matrix,
double *d_vector, double *d_vector_out,
size_t width)
cusolve::linear_operator<double> matmul(d_matrix, width);
matmul(d_vector, d_vector_out);
operate(double *matrix, double *vector, double *vector_out, size_t width)
const dim3 blockConfig(16, 16);
const size_t gridWidth = (size_t) ((double) width) / 16.0l;
const dim3 gridConfig(gridWidth, gridWidth);
double *d_matrix, *d_vector, *d_vector_out;
auto mem_vector = width * sizeof(double);
auto mem_matrix = mem_vector * width;
cudaMalloc((void **) &d_matrix, mem_matrix);
cudaMalloc((void **) &d_vector, mem_vector);
cudaMalloc((void **) &d_vector_out, mem_vector);
cudaMemcpy(d_matrix, matrix, mem_matrix, cudaMemcpyHostToDevice);
cudaMemcpy(d_vector, vector, mem_vector, cudaMemcpyHostToDevice);
operateKernel<<<gridConfig, blockConfig>>>(d_matrix, d_vector, d_vector_out, width);
cudaMemcpy(vector_out, d_vector_out, mem_vector, cudaMemcpyDeviceToHost);
But, when I try to call operate() from main() using allocated and initialized to non-null vectors and a matrix, the output is all zeros. I have been whacking my head over this for quite a while now and have not been able to figure out what it is that I am doing wrong.
P.S: I am deliberately trying to do this without thrust as a learning exercise.
Forgot to use ceil when calculating grid dimensions.
const size_t gridWidth = ceil( ((double) width) / 16.0l );
I have two vectors a and b. Each vector contains the coordinates of a 3d points x, y, z vector3f.
struct Vector3f
float x;
float y;
float z;
vector a has a size of n = 5000 points and vector b has a size of m = 4000. I need to do a tensor vector product between them like on the right side of the picture. the resulted vector should have a length size of 5000 * 4000 and contain float point where results are stored at c.
__global__ void tensor3dProdcutClassic(const int n, const int m, const Vector3f *a, const Vector3f *b, float *c) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
// int j = blockIdy.y * blockDim.y + threadIdx.y;
//check if the idx is out of range
if (i < n) {
for (int j = 0; j < m; j++) {
int idx = j + m * i;
c[idx] = a[i].x * b[j].x + a[i].y * b[j].y + a[i].z * b[j].z;
dim3 blockSize(32, 1, 1);
dim3 gridSize((n + blockSize.x - 1) / blockSize.x, 1, 1);
tensor3dProdcutClassic<<<gridSize, blockSize>>>(n, m, x, y, out);
I get high execution time on Volta arch which is a lot.
My question is how can I optimize the kernel to reduce time which is mainly because of the for loop inside the kernel. I know here that all global reads and writes are not coalesced.
You can make the kernel move through both a and b simultaneously, like this:
__global__ void tensor3dProdcutClassic(const int n, const int m, const Vector3f *a, const Vector3f *b, float *c)
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdy.y * blockDim.y + threadIdx.y;
if (i < n && j < m)
int idx = j + m * i;
c[idx] = a[i].x * b[j].x + a[i].y * b[j].y + a[i].z * b[j].z;
dim3 blockSize(32, 32);
dim3 gridSize((int)ceil(n / 32.0), (int)ceil(m / 32.0));
tensor3dProdcutClassic<<<gridSize, blockSize>>>(n, m, x, y, out);
I tried to modify the code to use a single array with and without shared memory, the code without shared memory with always faster 3 or 4 times.
With shared memory:
#define BLOCK_SIZE 32
void tensor3dProdcut(const int n, const int m, const float* a, const float* b, float* c)
float* d_a;
size_t size = (uint64_t)n * 3 * sizeof(float);
cudaMalloc(&d_a, size);
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
float* d_b;
size = (uint64_t)m * 3 * sizeof(float);
cudaMalloc(&d_b, size);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
float* d_c;
size = (uint64_t)n * m * sizeof(float);
cudaMalloc(&d_c, size);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((int)ceil((double)n / BLOCK_SIZE), (int)ceil((double)m / BLOCK_SIZE));
tensor3dProdcutKernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n, m);
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
__global__ void tensor3dProdcutKernel(float* a, float* b, float* c, int n, int m)
int i, blockRow, blockCol, row, col;
float Cvalue;
blockRow = blockIdx.x;
blockCol = blockIdx.y;
row = threadIdx.x;
col = threadIdx.y;
if (blockRow * BLOCK_SIZE + row >= n || blockCol * BLOCK_SIZE + col >= m)
__shared__ double as[BLOCK_SIZE][3];
__shared__ double bs[BLOCK_SIZE][3];
for (i = 0; i < 3; i++)
as[row][i] = a[(BLOCK_SIZE * blockRow + row) * 3 + i];
bs[col][i] = b[(BLOCK_SIZE * blockCol + col) * 3 + i];
Cvalue = 0;
for (i = 0; i < 3; i++)
Cvalue += as[row][i] * bs[col][i];
c[(BLOCK_SIZE * blockRow + row) * m + BLOCK_SIZE * blockCol + col] = Cvalue;
Without shared memory:
__global__ void tensor3dProdcutKernel(float* a, float* b, float* c, int n, int m)
int i, blockRow, blockCol, row, col;
float Cvalue;
blockRow = blockIdx.x;
blockCol = blockIdx.y;
row = threadIdx.x;
col = threadIdx.y;
if (blockRow * BLOCK_SIZE + row >= n || blockCol * BLOCK_SIZE + col >= m)
Cvalue = 0;
for (i = 0; i < 3; i++)
Cvalue += a[(BLOCK_SIZE * blockRow + row) * 3 + i] * b[(BLOCK_SIZE * blockCol + col) * 3 + i];
c[(BLOCK_SIZE * blockRow + row) * m + BLOCK_SIZE * blockCol + col] = Cvalue;
I have a large array A with size_A rows and 6 columns. I am going to check the 3rd element of each row, and if that is not zero, copy the row into another array B. Can I have the index to the entries of B without using a for loop, please see the below code?
I probably would need to define b_ptr somehow to make it static (similar to the what we have in C), but I think that is not allowed.
__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
/*B and size_B are the outputs*/
int b_ptr = 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > size_A) return;
for (int i = 0; i < size_A; i++)
if (A[x + 3] != 0)
B[b_ptr] = A[x + 0];
B[b_ptr + 1] = A[x + 1];
B[b_ptr + 2] = A[x + 2];
B[b_ptr + 3] = A[x + 3];
B[b_ptr + 4] = A[x + 4];
B[b_ptr + 5] = A[x + 5];
b_ptr += 6;
*size_B = *size_B + 1;
The trick is to launch as many threads as there are elements in your array. If we assume tid (renamed from your x) ranges from 0 to size_A * 6, then we can remove the loop entirely. We do need to first determine what rows must be copied, so a shared array filter is introduced. Assuming you can fit int[size_A] into memory for a single block and have as many threads as entries, you can use the following code, with hints for how you might do this if size_A is big enough to need multiple blocks.
__global__ void filtering_kernel(float *A, const int size_A, const int W,
float *B, int *size_B) {
// We use this to store whether a given row is filtered,
// and then scan this array to tell us how densely packed B is.
extern __shared__ int filter[];
// Assuming 1 block
const int tid = threadIdx.x;
const int offset = 0;
// Multiblock difference
// tid = threadIdx.x
// offset = blockIdx.x * blockDim.x;
// Guard to ensure we are not out of range
if (offset + tid >= size_A * W)
const int row = tid / W;
const int col = tid % W;
// NOTE: You have 3 in your sample code, but the third column is 2
const int mid = (W - 1)/2;
// Dedicate one thread per row to check
// whether we should filter
if (tid < size_A) {
// A boolean will be either 1 or 0
// Whatever filter criterion you want.
filter[tid] = A[offset + tid * W + mid] == 0;
// We then need to run a scan to get the cumulative sum
// of the filtered with a dedicated thread. If we consider
// good rows (g) and bad rows (b), for gggbbggbbggg we expect
// 1,2,3,3,3,4,5,5,5,6,7,8
for (int i = 1; i < size_A; i <<= 1) {
if (tid < size_A && tid >= i) {
filter[tid] += filter[tid - i];
// We should then only copy if the cumulative sum increases
// And handle for the case of the first row
// Note: If you are thread limited, you can do multiple copies here.
if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
B[offset + W * (filter[row] - 1) + col] = A[tid];
// Also set the expected size for B
if (tid == 0) {
*size_B = filter[size_A - 1];
printf("size_B %d\n", *size_B);
// Multiple blocks: size_B[blockIdx.x] = filtered[size_A - 1];
// TODO: For multiple blocks, we still need to densely pack B. (see below)
Continuing: as is, filtered needs to be shared across the kernel, so this only works within a single block. With multiple blocks, I would filter a portion of B per block (that is, keep the code above, changing where I note), record how much was filtered with size_B now being an array, cumulatively sum size_B, and then in-place copy B to be more dense (or download from device the dense parts from each portion using size_B).
From the comments, the invoking code:
int example(const float *arr, const size_t size_A, const size_t W ) {
float *d_A;
float *d_B;
cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
cudaMemset(d_B, 0, size_A * W * sizeof(float));
int *size_B;
cudaMalloc((void **)&size_B, sizeof(int));
cudaMemset(size_B, 0, sizeof(int));
cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
printf("Error %s \n", cudaGetLastError());
int result;
cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
printf("Error %s \n", cudaGetLastError());
return result;
Which we can then test using GTEST:
size_t size_A = 100;
size_t W = 6;
float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
int expected = 0;
for (int i = 0; i < size_A * W; i++) {
arr[i] = i % 4;
if (i % W == 2 && arr[i] == 0)
printf("Expected: %d\n", expected);
const int result = drt::example(arr, size_A, W);
ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
This problem is complicated and can't be done with CUDA in one step, you can't search for the desired rows and put them in array B hoping that they will be in the correct order, as CUDA kernels don't necessarily check the rows in order. However, there is a multi-step solution that can do the trick. First, you will run a kernel that will locate the zeros within the third column, whose index is 2 not 3 by the way, then mark these rows with value of 1 in an array P. After that, a simple for loop will count these locations and store them in another array Ind. Finally, a second kernel will copy the required rows from A to B.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);
int main()
int i, size_A, size_B;
size_t size;
int* P, * d_P, * Ind, * d_I;
float* A, * d_A, * B, * d_B;
size_A = ..; // specify number of rows of A
A = new float[size_A * 6];
// input values of array A
P = new int[size_A];
for (i = 0; i < size_A; i++)
P[i] = 0;
size = (uint64_t)size_A * 6 * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (uint64_t)size_A * sizeof(int);
cudaMalloc(&d_P, size);
cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
size_B = 0;
for (i = 0; i < size_A; i++)
if (P[i] == 1)
Ind[size_B++] = i;
Ind = new int[size_A];
size = (uint64_t)size_B * sizeof(int);
cudaMalloc(&d_I, size);
cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
B = new float[size_B * 6];
size = (uint64_t)size_B * 6 * sizeof(float);
cudaMalloc(&d_B, size);
dim3 dimBlock(170, 6); // to copy the full row at the same time, 6 * 170 < 1024
dim3 dimGrid((int)ceil(size_B / 170.0), 1);
filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
__global__ void get_indeces(float* A, int* P, int size_A)
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < size_A && A[x * 6 + 2] == 0) // if you want to use return, it should be "if (x >= size_A) return;"
P[x] = 1;
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
int i;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = threadIdx.y;
if (x < size_B)
B[x * 6 + y] = A[Ind[x] * 6 + y];
Disclaimer: Im a cuda beginner.
typedef struct
int row_;
int col_;
float* element_;
int step;
#define BLOCK_SIZE 64
__device__ float getElement(const Matrix_t A, int row, int col);
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col);
__device__ void setElement(Matrix_t A, int row, int col, float value);
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_);
float Matrix_dot_(float* M_dev_1, float* M_dev_2, int Number_of_cols, int Number_of_rows, int step);
the Matrix_t are used to link a cv::cuda::GpuMat to the C interface via the ptr() operator to get the GPU pointer to element.
__device__ float getElement(const Matrix_t A, int row, int col)
return A.element_[row* A.step + col];
__device__ void setElement(Matrix_t A, int row, int col, float value)
A.element_[row*A.step + col] = value;
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col)
Matrix_t A_sub;
A_sub.row_ = BLOCK_SIZE;
A_sub.col_ = BLOCK_SIZE;
A_sub.step = A.step;
A_sub.element_ = &A.element_[A.step * BLOCK_SIZE * row + BLOCK_SIZE * col];
return A_sub;
Here is the kernel:
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
float SubDotValue = 0.0f;
int row = threadIdx.y;
int col = threadIdx.x;
for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
//get subA & subB
Matrix_t A_sub = getSubMat(A, blockRow, m);
Matrix_t B_sub = getSubMat(B, blockRow, m);
//set Asub & Bsub to the __shared__ memory
__shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];
ASub[row][col] = getElement(A_sub, row, col);
BSub[row][col] = getElement(B_sub, row, col);
//Synchronize before calculations:
//Get the dot product of the vector Asub[] Bsub[]
for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
SubDotValue += ASub[row][el_] * BSub[row][el_];
dot_[row] = SubDotValue;
and the wrapper:
float Matrix_dot_(float* M_dev_1,float* M_dev_2, int Number_of_cols, int Number_of_rows, int step)
float retval = 0;
float* retval_partial;
float* retval_device;
Matrix_t A;
A.col_ = Number_of_cols;
A.row_ = Number_of_rows;
A.element_ = M_dev_1;
A.step = step;
Matrix_t B;
B.col_ = Number_of_cols;
B.row_ = Number_of_rows;
B.element_ = M_dev_2;
B.step = step;
retval_partial = (float*)malloc( B.row_*sizeof(float) );
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );
printf("\n Cuda malloc: %s", cudaGetErrorString(err));
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.row_ / BLOCK_SIZE, B.col_ / BLOCK_SIZE);
MatrixDot<<<dimGrid, dimBlock>>>(A, B, retval_device);
err = cudaThreadSynchronize();
printf("\n Cuda kernel run: %s", cudaGetErrorString(err));
err = cudaMemcpy(retval_partial, retval_device, B.row_ / BLOCK_SIZE* sizeof(float), cudaMemcpyDeviceToHost);
printf("\n Cuda cudaMemcpy: %s", cudaGetErrorString(err));
err = cudaFree(retval_device);
printf("\n Cuda cudaFree: %s", cudaGetErrorString(err));
for(int i = 0; i<B.row_/ BLOCK_SIZE ; ++i)
return retval;
and the main:
int main(int argc, const char * argv[])
cv::cuda::DeviceInfo devInfo;
cv::Mat cudatestA = cv::Mat(64*3, 64*3, CV_32FC1, 2);
cv::Mat cudatestB = cv::Mat(64*3, 64*3, CV_32FC1, 2);
double tr = (double) cv::getTickCount();
double res = cudatestA.dot(cudatestB);
tr = ((double)cv::getTickCount()-tr)/(double)cv::getTickFrequency();
cv::cuda::GpuMat ctA(cudatestA);
cv::cuda::GpuMat ctB(cudatestB);
double tm_ = (double) cv::getTickCount();
float res_m = 0;
res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step);
tm_ = ((double)cv::getTickCount()-tm_)/(double)cv::getTickFrequency();
printf("\nCPU: %0.4fms, res: %0.4f\nGPU_M: %0.4fms, res: %0.4f\n", tr*1000.0f, res, tm_*1000.0f,res_m);
return 0;
I'm currently stuck on various points:
1) it always output 0.
2) it can only work for matrix M*N Multiple of the defined BLOCK_SIZE (64).
for 1) I can't figure where my logic break, I could get the dot product to work on vector without any troubles but the matrix problem induced by the stride between each row prevent me to use the code (code deleted as the site tell me that there is too much code).
Partial answer:
In your kernel you aren't doing the good sum, nor taking the good elements, and your dim seems inverted
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
//int blockRow = blockIdx.y;
//int blockCol = blockIdx.x;
int blockRow = blockIdx.x;
int blockCol = blockIdx.y;
float SubDotValue = 0.0f;
//int row = threadIdx.y;
//int col = threadIdx.x;
int row = threadIdx.x;
int col = threadIdx.y;
for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
//get subA & subB
Matrix_t A_sub = getSubMat(A, m, blockCol);//getSubMat(A, blockRow, m)
Matrix_t B_sub = getSubMat(B, m, blockCol);//getSubMat(B, blockRow, m)
//set Asub & Bsub to the __shared__ memory
__shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];
ASub[row][col] = getElement(A_sub, row, col);
BSub[row][col] = getElement(B_sub, row, col);
//Synchronize before calculations:
//Get the dot product of the vector Asub[] Bsub[]
for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
SubDotValue += ASub[row][el_] * BSub[row][el_];
dot_[blockRow*BLOCK_SIZE + row] = SubDotValue; //dot_[row] = SubDotValue;
And your wrapper isn't also allocating the size you need:
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );
should be:
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_*sizeof(float) );
Note that other allocation related have to change too (Lazy me).
And your call in main need to divide the GpuMat step by the size of one element of the GpuMat
res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step/ctA.elemsize1());
You might also want to change your Matrix_t structure to use const float* instead of float to be able to use:
instead of:
Note that for a matrix of N rows you are starting N^2 threads doing the same thing. I don't have enough knowledge on Cuda to fix that.
How can I allocate 3D, 4D, 5D arrays with one malloc in a contigious way and access the individual items?
Something like this:
int* array = malloc(sizeof(int) * width * height);
int item = array[x + y * width];
A 3D array is an array of 2D arrays. A 4D array is an array of 3D arrays. You just multiply by your other dimensions. For example, a 3D array can be allocated in this way:
int *array = malloc(sizeof(int) * width * height * depth);
A 4D array can be made by multiplying by your other dimension:
int *array = malloc(sizeof(int) * width * height * depth * other_dimension);
and so on for 5D, 6D, etc. arrays.
You can access elements by using something like this (for 3D arrays, easily extended), assuming you have access to the width and height of the array:
int get_element(int x, int y, int z)
return array[(z * width * height) + (y * width) + x];
For 4D arrays:
int get_element(int x, int y, int z, int dimension_4)
return array[(dimension_4 * width * height * depth) + (z * width * height) + (y * width) + x];
As answered here (Setting pointer to arbitrary dimension array?
Look specially computeIndex/computeIndexes.
#include <cstddef>
#include <vector>
template <typename T>
class MultiArray
explicit MultiArray(const std::vector<size_t>& dimensions) :
const T& get(const std::vector<size_t>& indexes) const
return values[computeIndex(indexes)];
T& get(const std::vector<size_t>& indexes)
return values[computeIndex(indexes)];
size_t computeIndex(const std::vector<size_t>& indexes) const
assert(indexes.size() == dimensions.size());
size_t index = 0;
size_t mul = 1;
for (size_t i = 0; i != dimensions.size(); ++i) {
assert(indexes[i] < dimensions[i]);
index += indexes[i] * mul;
mul *= dimensions[i];
assert(index < values.size());
return index;
std::vector<size_t> computeIndexes(size_t index) const
assert(index < values.size());
std::vector<size_t> res(dimensions.size());
size_t mul = values.size();
for (size_t i = dimensions.size(); i != 0; --i) {
mul /= dimensions[i - 1];
res[i - 1] = index / mul;
assert(res[i - 1] < dimensions[i - 1]);
index -= res[i - 1] * mul;
return res;
size_t computeTotalSize(const std::vector<size_t>& dimensions) const
size_t totalSize = 1;
for (auto i : dimensions) {
totalSize *= i;
return totalSize;
std::vector<size_t> dimensions;
std::vector<T> values;
int main()
MultiArray<int> m({3, 2, 4});
m.get({0, 0, 3}) = 42;
m.get({2, 1, 3}) = 42;
for (size_t i = 0; i != 24; ++i) {
assert(m.computeIndex(m.computeIndexes(i)) == i);
Arrays are by nature allocated as a single dimension. You bestow dimensionality on them via the way you compute indexes to them. The size you need to allocate is the size of a scalar element multiplied by the number of elements in each of however many dimensions you intend to use, e.g., if you want a 10 x 20 x 30 array of 4-byte elements, multiply 4 x 10 x 20 x 30 to get the size of the malloc you need. Then, I'd probably write a function such as my_index(int i, int j, int k) that would compute the one-dimensional index for any valid (i,j,k) combination. This idea can be extended into as many dimensions as you wish.
I have read this post Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D among many others including NVIDIA docs and I can't get cudaMallocPitch to work together with cudaMemcpy2D.
I need to copy a very big matrix in an array format (Matrix[width*height]) along with a simple array to perform Matrix * vector operations. It is not optional for me to use cudaMallocPitch in order to avoid conflicts and have a better performance.
So, I started by just trying to copy the matrix (vector in my case) to the device and check if it was correctly copied but my code does not print anything. If I use cudaMalloc and cudaMemcpy everything works fine. But I do not know what to do with cudaMallocPitch and cudaMemcpy2D.
What can I do to fix this?
#include <stdio.h>
__global__ void kernel(size_t mpitch, double * A, int N)
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while (idx < N)
double e = *(double *)(((char *) A + idx * mpitch) + N);
printf("(%f)", e);
int main()
int N = 1500;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
A[i] = i;
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
delete [] A;
return 0;
Error checking can make a big difference in debugging. You should always use it before coming here.
It wasn't clear if you wanted a row or column vector i.e. a matrix of [1xN] or [Nx1]
I've added an explanation on Talomnies suggestion, but first the 'working slabs of code'
Here's [Nx1]
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void kernel(size_t mpitch, double * A, int N)
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx>=N) return;
double e = *(double *)(((char *) A + idx * mpitch));
printf("(%f)", e);
int main()
int N = 15;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
A[i] = i;
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy2D(d_A, pitch, A, sizeof(double), sizeof(double), N, cudaMemcpyHostToDevice);
if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
err = cudaGetLastError();
if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;
delete [] A;
return 0;
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void kernel(size_t mpitch, double * A, int N)
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx>=N) return;
int row=0;//only one row
double *row_ptr = (double *)( (char *) (A + mpitch * row) );
double e = row_ptr[idx];
printf("(%f)", e);
int main()
int N = 15;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
A[i] = i;
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy2D(d_A, pitch, A, sizeof(double)*N, sizeof(double)*N, 1, cudaMemcpyHostToDevice);
if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
err = cudaGetLastError();
if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;
delete [] A;
return 0;
Firslty, Error Handling:
Considering how easy error handling is in CUDA there isn't a good excuse not to put it in.
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
Second, you didn't specify if you wanted a column vector or a row vector. Since a row vector is simply a 1-D array in linear memory and you don't need pitched memory to do that, I will assume for this explanation that you meant a column vector.
The reoccurring problem you were having was "misaligned address" in the kernel. This indicates that the problem is book-keeping, so lets walk through the three major steps of handling an aligned 2D array (even though our arrays will be either a column or row vector).
Your allocation was written out as
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
This is correct for the row vector as the API is cudaMallocPitch(void*** pointer, size_t* pitch_return, size_t row_width_in_bytes, size_t count_of_rows) However if we would like to do a column vector correct call is
cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
For accessing you were mixing up accessing a row, and accessing an element in the row.
double e = *(double *)(((char *) A + idx * mpitch) + N);
Once again stick to the documentation. The API documentation for cudaMallocPitch includes
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
for us this translates into
int column=0;
double element=(double*) ((char*)A + idx * mpitch) + column;
I've used column = 0 for completeness since we do not have more than one column.
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
For this case this is correct. API for cudaMemcpy2D is
cudaMemcpy2D(void* destination, size_t pitch_from_mallocPitch, const void* source, size_t source_pitch_bytes, size_t src_width_in_bytes, size_t src_rows_count, enum type_of_xfer);