How to copy dynamic matrix to device memory in CUDA? - c++

In my code I have dynamic matrix.
int ** file_data = (int **)malloc(TRANSACTIONS * sizeof(int *));
file_data[0] = (int *)malloc((a_size+1) * sizeof(int));
file_data[1] = (int *)malloc((a_size+1) * sizeof(int));
file_data[2] = (int *)malloc((a_size+1) * sizeof(int));
................................................................
I want to copy it to device global memory only once.
I have used:
__device__ int raw_data[][];
...................................
...................................
...................................
cudaMemcpyToSymbol(raw_data[i], file_data[i], (a_size+1)*sizeof(int));
But this do not works.
How can I do it?

You need to Flatten the data
If you're only working with rectangular matrices in the first place, I'd recommend always storing your Matrixes like this anyways, but either way, you'll need to get it into this form before trying to push this data to your device memory.
template<typename T>
class Matrix {
std::vector<T> _data;
size_t rows, columns;
public:
Matrix(size_t rows, size_t columns) :rows(rows), columns(columns) {
_data.resize(rows * columns);
}
T & operator()(size_t row, size_t column) & {
return _data.at(row * columns + column); //Row-Major Ordering
}
T const& operator()(size_t row, size_t column) const& {
return _data.at(row * columns + column);
}
T operator() size_t row, size_t column) const {
return _data.at(row * columns + column);
}
T * data() & {
return _data.data();
}
T const* data() const& {
return _data.data();
}
std::pair<size_t, size_t> size() const {
return {rows, columns};
}
size_t flat_size() const {
return rows * columns;
}
size_t byte_size() const {
return flat_size() * sizeof(T);
}
};
int ** file_data = (int **)malloc(TRANSACTIONS * sizeof(int *));
file_data[0] = (int *)malloc((a_size+1) * sizeof(int));
file_data[1] = (int *)malloc((a_size+1) * sizeof(int));
file_data[2] = (int *)malloc((a_size+1) * sizeof(int));
//................................................................
Matrix<int> flat_data(TRANSACTIONS, a_size + 1);
for(size_t row = 0; row < TRANSACTIONS; row++) {
for(size_t column = 0; column < a_size + 1; column++) {
flat_data(row, column) = file_data[row][column];
}
}
//ALTERNATIVE: use this instead of your manual mallocs in the first place!
cudaMemcpyToSymbol(flat_data.data(), /*buffer name*/, flat_data.byte_size());
This has the major advantage that you're not having to copy each row individually into their own buffers, you can put all of them together in memory, saving memory and reducing the number of API calls you need to make. And a class designed specifically to handle your functionality won't break when you inevitably make a mistake trying to manually handle all the pointer management in your original code.

Related

Argument of type "float" is incompatible with parameter of type "const void *"

I am trying to design a basic matrix class having data allocated in device. I am having some problems when inserting an element into the matrix given its row and col. This is my actual code:
template <typename CellType_>
class CUDAMatrix {
public:
using CellType = CellType_;
CUDAMatrix(size_t rows_, size_t cols_) {
_rows = 0;
_cols = 0;
resize(rows_, cols_);
}
~CUDAMatrix() {
cudaFree(_data);
}
inline size_t rows() const {
return _rows;
}
inline size_t cols() const {
return _cols;
}
inline void _init() {
_capacity = _rows * _cols;
CUDA_CHECK(cudaMalloc((void**) &_data, sizeof(CellType) * _capacity));
}
inline void resize(size_t rows_, size_t cols_) {
// if size is ok, do nothing
if (_rows == rows_ && _cols == cols_)
return;
_rows = rows_;
_cols = cols_;
if (rows_ * cols_ <= _capacity)
return;
cudaFree(_data);
_init();
}
inline void set(const size_t row, const size_t col, const CellType& val) {
if (row > _rows || col > _cols) {
throw std::out_of_range("[Matrix::at] index out of range");
}
CUDA_CHECK(cudaMemcpy(_data[row * _cols + col], &val, sizeof(CellType), cudaMemcpyHostToDevice));
}
inline void clear() {
cudaFree(_data);
_capacity = 0;
_cols = 0;
_rows = 0;
}
protected:
size_t _cols;
size_t _rows;
size_t _capacity;
CellType* _data;
};
int main() {
const size_t rows = 620;
const size_t cols = 480;
using MyMat = CUDAMatrix<float>;
MyMat mymat(rows, cols);
for (size_t r = 0; r < mymat.rows(); ++r) {
for (size_t c = 0; r < mymat.cols(); ++c) {
mymat.set(r, c, 1.f);
}
}
}
How I can take an element from host and copy it to device? I am getting the error argument of type "float" is incompatible with parameter of type "const void *". The error arise here when I call cudaMalloc.
cudaMemcpy(_data[row * _cols + col], val, sizeof(CellType), cudaMemcpyHostToDevice)
While almost all possible memcpy require pointers for sources and targets, you are attempting to pass cell and val values. You should pass the pointer to cell and &val.
cudaMemcpy(&_data[row * _cols + col], &val, sizeof(CellType), cudaMemcpyHostToDevice)
cudaMemcpy requires pointer as src and dst, _data[index] does not return a pointer. The right way should be:
cudaMemcpy(&_data[row * _cols + col], &val, sizeof(CellType), cudaMemcpyHostToDevice)

CUDA: Using device functors in kernels

I tried to make a device functor that essentially performs (unoptimized) matrix-vector multiplication like so
namespace cusolve
{
template <class value_type,
class matrix_type = value_type*,
class vector_type = value_type*>
struct linear_operator
{
const matrix_type matrix;
const size_t width;
__device__
linear_operator(const matrix_type matrix, size_t width)
: matrix(matrix), width(width) { }
__device__
void operator()(const vector_type x, vector_type x_out)
{
auto col = blockIdx.x * blockDim.x + threadIdx.x;
auto row = blockIdx.y * blockDim.y + threadIdx.y;
x_out[row] = 0;
if (row < width)
{
for (size_t i = 0; i < width; i++)
{
x_out[row] += matrix[row*width + i] * x[i];
}
}
return;
}
};
So, this assumes that matrix, x, and x_out are device pointers. So, to test it I tried to call it from a simple kernel
__global__
void
operateKernel(double *d_matrix,
double *d_vector, double *d_vector_out,
size_t width)
{
cusolve::linear_operator<double> matmul(d_matrix, width);
matmul(d_vector, d_vector_out);
}
void
operate(double *matrix, double *vector, double *vector_out, size_t width)
{
const dim3 blockConfig(16, 16);
const size_t gridWidth = (size_t) ((double) width) / 16.0l;
const dim3 gridConfig(gridWidth, gridWidth);
double *d_matrix, *d_vector, *d_vector_out;
auto mem_vector = width * sizeof(double);
auto mem_matrix = mem_vector * width;
cudaMalloc((void **) &d_matrix, mem_matrix);
cudaMalloc((void **) &d_vector, mem_vector);
cudaMalloc((void **) &d_vector_out, mem_vector);
cudaMemcpy(d_matrix, matrix, mem_matrix, cudaMemcpyHostToDevice);
cudaMemcpy(d_vector, vector, mem_vector, cudaMemcpyHostToDevice);
operateKernel<<<gridConfig, blockConfig>>>(d_matrix, d_vector, d_vector_out, width);
cudaMemcpy(vector_out, d_vector_out, mem_vector, cudaMemcpyDeviceToHost);
cudaFree(d_vector);
cudaFree(d_matrix);
cudaFree(d_vector_out);
}
But, when I try to call operate() from main() using allocated and initialized to non-null vectors and a matrix, the output is all zeros. I have been whacking my head over this for quite a while now and have not been able to figure out what it is that I am doing wrong.
P.S: I am deliberately trying to do this without thrust as a learning exercise.
Forgot to use ceil when calculating grid dimensions.
const size_t gridWidth = ceil( ((double) width) / 16.0l );

Can I convert my 1D vector to a 2D vector faster than this?

The question is quite straightforward. After some trials, here is the most efficient code I found:
//For the sake of the example, I initialize every entry as zero.
vector<float> vector1D(1024 * 768, 0);
vector<vector<float>> vector2D(768, vector<float>(1024,0));
int counter = 0;
for (int i = 0; i < 768; i++) {
for (int j = 0; j < 1024; j++) {
vector2D[i][j] = vector1D[counter++];
}
}
Is there a faster way?
Yes.
You can remap the way you access the elements without needing to copy them. You can create a "view" class to achieve that:
template<typename T>
class two_dee_view
{
public:
two_dee_view(std::vector<T>& v, std::size_t row, std::size_t col)
: v(v), stride(col) { if(v.size() < row * col) v.resize(row * col); }
T& operator()(std::size_t row, std::size_t col)
{ return v[(row * stride) + col]; }
T const& operator()(std::size_t row, std::size_t col) const
{ return v[(row * stride) + col]; }
std::size_t col_size() const { return stride; }
std::size_t row_size() const { return v.size() / stride; }
private:
std::vector<T>& v;
std::size_t stride;
};
int main()
{
std::vector<double> v {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
two_dee_view<double> v2d(v, 2, 3);
for(auto row = 0U; row < v2d.row_size(); ++row)
for(auto col = 0U; col < v2d.col_size(); ++col)
std::cout << row << ", " << col << ": " << v2d(row, col) << '\n';
}
Output:
0, 0: 1
0, 1: 2
0, 2: 3
1, 0: 4
1, 1: 5
1, 2: 6
The class simply maintains a reference to the std::vector you pass in to the constructor. You should only use the two_dee_view as long as the original std::vector lives but no longer.
It might be faster by using memcpy, as that is the lowest possible level of an API for copying memory and is likely that there are compiler optimizations which may use specific instructions, etc. and make if faster:
for (int i = 0; i < 768; i++) {
memcpy(vector2D[i].data(), &vector1D[i * 1024], sizeof(float) * 1024);
}
Keep in mind that you shouldn't be using memcpy for anything but trivially-copiable data. That is, it will work fine for float and int but not for classes as the copy constructor will not be called.
If you have to use a vector of vectors for some reason, using memcpy or memmove is faster (because it's a single step, as described in another reply). But you should use the STL instead of doing it by yourself.
vector<float> vector1D(1024 * 768, 0);
vector<vector<float>> vector2D(768, vector<float>(1024, 0));
for (int i = 0; i < 768; i++) {
vector2D[i].assign(next(vector1D.cbegin(), 1024 * i),
next(vector1D.cbegin(), 1024 * (i + 1)));
}
This results in a straight memmove (depending on the STL implementation) but is much more safe, optimized and (possibly) readable.

Return double pointer 2D matrix in mex function

I am working with 2D arrays defined with double pointers, e.g,
double** array;
array = (double**) calloc(numRows, sizeof(double*));
for (int i = 0; i < numRows; i++)
{
array[i] = (double*) calloc(numCols, sizeof(double));
/* then array[i][j] = values */
}
// code to return matlab array
plhs[0] = mxCreateDoubleMatrix(numRows, numCols, mxREAL);
// memory copy
// ?????????
for (i = 0; i < numRows; i++){
free(array[i]);
}
free(array);
I want to return array in matlab. An execution I have until now for the // memory copy part and I think it is fine, please correct me is:
stacked1D = mxGetPr(plhs[0]);
int n = 0;
for ( int r = 0; r < max_degree; r++ )
for ( int c = 0; c < n_vars; c++ )
stacked1D[n++] = stacked2D[r][c];
I am wondered if we can do it with a mem-copy function like this mxSetPr(OUT, *stacked2D); which is not working in this syntax.
Could you please give a hint-explanation or possible answer?
Row and column iterations should be reversed in your code, and what PaulMcKenzie suggested, although it's a good idea in principle, will not work with Mex matrices (they are laid out column-by-column, so with that method you would have to access your matrix with M[column][row] which is un-natural and confusing).
Alternatively, you could use a simple wrapper like the following:
template <class T, class I = unsigned>
struct MatrixWrapper
{
T *data;
I nrows, ncols;
MatrixWrapper()
{ clear(); }
MatrixWrapper( T *data_, I nrows_, I ncols_ )
{ set(data_,nrows_,ncols_); }
inline void clear()
{ data = NULL; nrows = ncols = 0; }
inline void set( T *data_, I nrows_, I ncols_ )
{ data = data_; nrows = nrows_; ncols = ncols_; }
inline T& operator() ( I r, I c ) const
{ return data[ r + nrows*c ]; }
};
and your Mex function would look like this:
// allocate a temporary matrix
double *M_data = new double[ nrows*ncols ];
MatrixWrapper<double> M( M_data, nrows, ncols );
// access values with M(i,j) as a "normal" matrix
// allocate a Mex output
plhs[0] = mxCreateDoubleMatrix( nrows, ncols, mxREAL );
MatrixWrapper<double> out0( mxGetPr(plhs[0]), nrows, ncols );
// copy to the Mex output
for ( unsigned c = 0; c < ncols; c++ )
for ( unsigned r = 0; r < nrows; r++ )
out0(r,c) = M(r,c);
// free temporary allocation
M.clear();
delete[] M_data;

How to allocate & access 3D, 4D, 5D arrays?

How can I allocate 3D, 4D, 5D arrays with one malloc in a contigious way and access the individual items?
Something like this:
int* array = malloc(sizeof(int) * width * height);
int item = array[x + y * width];
A 3D array is an array of 2D arrays. A 4D array is an array of 3D arrays. You just multiply by your other dimensions. For example, a 3D array can be allocated in this way:
int *array = malloc(sizeof(int) * width * height * depth);
A 4D array can be made by multiplying by your other dimension:
int *array = malloc(sizeof(int) * width * height * depth * other_dimension);
and so on for 5D, 6D, etc. arrays.
You can access elements by using something like this (for 3D arrays, easily extended), assuming you have access to the width and height of the array:
int get_element(int x, int y, int z)
{
return array[(z * width * height) + (y * width) + x];
}
For 4D arrays:
int get_element(int x, int y, int z, int dimension_4)
{
return array[(dimension_4 * width * height * depth) + (z * width * height) + (y * width) + x];
}
As answered here (Setting pointer to arbitrary dimension array?
):
Look specially computeIndex/computeIndexes.
#include <cstddef>
#include <vector>
template <typename T>
class MultiArray
{
public:
explicit MultiArray(const std::vector<size_t>& dimensions) :
dimensions(dimensions),
values(computeTotalSize(dimensions))
{
assert(!dimensions.empty());
assert(!values.empty());
}
const T& get(const std::vector<size_t>& indexes) const
{
return values[computeIndex(indexes)];
}
T& get(const std::vector<size_t>& indexes)
{
return values[computeIndex(indexes)];
}
size_t computeIndex(const std::vector<size_t>& indexes) const
{
assert(indexes.size() == dimensions.size());
size_t index = 0;
size_t mul = 1;
for (size_t i = 0; i != dimensions.size(); ++i) {
assert(indexes[i] < dimensions[i]);
index += indexes[i] * mul;
mul *= dimensions[i];
}
assert(index < values.size());
return index;
}
std::vector<size_t> computeIndexes(size_t index) const
{
assert(index < values.size());
std::vector<size_t> res(dimensions.size());
size_t mul = values.size();
for (size_t i = dimensions.size(); i != 0; --i) {
mul /= dimensions[i - 1];
res[i - 1] = index / mul;
assert(res[i - 1] < dimensions[i - 1]);
index -= res[i - 1] * mul;
}
return res;
}
private:
size_t computeTotalSize(const std::vector<size_t>& dimensions) const
{
size_t totalSize = 1;
for (auto i : dimensions) {
totalSize *= i;
}
return totalSize;
}
private:
std::vector<size_t> dimensions;
std::vector<T> values;
};
int main()
{
MultiArray<int> m({3, 2, 4});
m.get({0, 0, 3}) = 42;
m.get({2, 1, 3}) = 42;
for (size_t i = 0; i != 24; ++i) {
assert(m.computeIndex(m.computeIndexes(i)) == i);
}
}
Demo
Arrays are by nature allocated as a single dimension. You bestow dimensionality on them via the way you compute indexes to them. The size you need to allocate is the size of a scalar element multiplied by the number of elements in each of however many dimensions you intend to use, e.g., if you want a 10 x 20 x 30 array of 4-byte elements, multiply 4 x 10 x 20 x 30 to get the size of the malloc you need. Then, I'd probably write a function such as my_index(int i, int j, int k) that would compute the one-dimensional index for any valid (i,j,k) combination. This idea can be extended into as many dimensions as you wish.