How to allocate & access 3D, 4D, 5D arrays? - c++

How can I allocate 3D, 4D, 5D arrays with one malloc in a contiguous way and access the individual items?
Something like this:
int* array = malloc(sizeof(int) * width * height);
int item = array[x + y * width];

A 3D array is an array of 2D arrays. A 4D array is an array of 3D arrays. You just multiply by your other dimensions. For example, a 3D array can be allocated in this way:
int *array = malloc(sizeof(int) * width * height * depth);
A 4D array can be made by multiplying by your other dimension:
int *array = malloc(sizeof(int) * width * height * depth * other_dimension);
and so on for 5D, 6D, etc. arrays.
You can access elements by using something like this (for 3D arrays, easily extended), assuming you have access to the width and height of the array:
// Maps a 3-D coordinate (x, y, z) onto the flat backing array.
// x varies fastest, z slowest (row-major layout of width x height planes).
int get_element(int x, int y, int z)
{
    // Factored (Horner) form of: x + y*width + z*width*height.
    return array[x + width * (y + height * z)];
}
For 4D arrays:
// Maps a 4-D coordinate onto the flat backing array; the 4th dimension
// is the slowest-varying axis (strides by width*height*depth).
int get_element(int x, int y, int z, int dimension_4)
{
    // Factored (Horner) form of:
    //   x + y*width + z*width*height + dimension_4*width*height*depth.
    return array[x + width * (y + height * (z + depth * dimension_4))];
}

As answered here (Setting pointer to arbitrary dimension array?
):
Look specially computeIndex/computeIndexes.
#include <cassert>
#include <cstddef>
#include <vector>
// N-dimensional array stored in one contiguous buffer.
// dims[0] is the fastest-varying dimension; the flat offset of an element
// is sum(indexes[d] * stride[d]) where stride[d] = product of dims[0..d-1].
template <typename T>
class MultiArray
{
public:
    explicit MultiArray(const std::vector<size_t>& dims) :
        dimensions(dims),
        values(computeTotalSize(dims))
    {
        assert(!dimensions.empty());
        assert(!values.empty());
    }

    // Read-only element access via one index per dimension.
    const T& get(const std::vector<size_t>& indexes) const
    {
        return values[computeIndex(indexes)];
    }

    // Mutable element access via one index per dimension.
    T& get(const std::vector<size_t>& indexes)
    {
        return values[computeIndex(indexes)];
    }

    // Flattens per-dimension indexes into a single linear offset.
    size_t computeIndex(const std::vector<size_t>& indexes) const
    {
        assert(indexes.size() == dimensions.size());
        size_t offset = 0;
        size_t stride = 1;
        for (size_t d = 0; d != dimensions.size(); ++d) {
            assert(indexes[d] < dimensions[d]);
            offset += stride * indexes[d];
            stride *= dimensions[d];
        }
        assert(offset < values.size());
        return offset;
    }

    // Inverse of computeIndex: splits a linear offset back into indexes.
    std::vector<size_t> computeIndexes(size_t index) const
    {
        assert(index < values.size());
        std::vector<size_t> result(dimensions.size());
        size_t stride = values.size();
        // Walk dimensions from slowest-varying to fastest-varying.
        for (size_t d = dimensions.size(); d-- != 0; ) {
            stride /= dimensions[d];
            result[d] = index / stride;
            assert(result[d] < dimensions[d]);
            index -= result[d] * stride;
        }
        return result;
    }

private:
    // Product of all extents == total number of stored elements.
    size_t computeTotalSize(const std::vector<size_t>& dims) const
    {
        size_t total = 1;
        for (size_t extent : dims) {
            total *= extent;
        }
        return total;
    }

private:
    std::vector<size_t> dimensions;
    std::vector<T> values;
};
int main()
{
MultiArray<int> m({3, 2, 4});
m.get({0, 0, 3}) = 42;
m.get({2, 1, 3}) = 42;
for (size_t i = 0; i != 24; ++i) {
assert(m.computeIndex(m.computeIndexes(i)) == i);
}
}
Demo

Arrays are by nature allocated as a single dimension. You bestow dimensionality on them via the way you compute indexes to them. The size you need to allocate is the size of a scalar element multiplied by the number of elements in each of however many dimensions you intend to use, e.g., if you want a 10 x 20 x 30 array of 4-byte elements, multiply 4 x 10 x 20 x 30 to get the size of the malloc you need. Then, I'd probably write a function such as my_index(int i, int j, int k) that would compute the one-dimensional index for any valid (i,j,k) combination. This idea can be extended into as many dimensions as you wish.

Related

How to access 1D data (or reshape it) from pointer into something like multidimensional array in C++?

I have a pointer that points to the beginning of a 1000+ elements array that is initialized as below:
int numElements = 1200;
auto data = std::unique_ptr<float>{new float[numElements]};
Now I want to 'reshape' it into something like a (20,30,20) tensor, so I can access it the way I want (I can still read while it's 1-D as well but it feels weird). I want to access like this:
data[1][10][12] = 1337.0f;
Is there an efficient way of doing this (fast and short code)?
In the meantime, this is how I do it...
#include <iostream>
using std::cout;
using std::endl;
#include <vector>
using std::vector;
// Flattens a 3-D coordinate into a 1-D offset, row-major style:
// z is the slowest axis, x the fastest. z_res is accepted for call-site
// symmetry but does not participate in the offset computation.
size_t get_index(const size_t x, const size_t y, const size_t z, const size_t x_res, const size_t y_res, const size_t z_res)
{
    // Factored (Horner) form of: z*y_res*x_res + y*x_res + x.
    return x + x_res * (y + y_res * z);
}
int main(void)
{
    const size_t x_res = 10;
    const size_t y_res = 10;
    const size_t z_res = 10;
    // One contiguous std::vector plays the role of the 3-D array.
    // It is value-initialized to 0.0f, so no new[]/memset pair is needed,
    // and it deallocates itself — no delete[] to forget.
    vector<float> vf(x_res * y_res * z_res, 0.0f);
    for (size_t x = 0; x < x_res; x++)
    {
        for (size_t y = 0; y < y_res; y++)
        {
            for (size_t z = 0; z < z_res; z++)
            {
                const size_t flat = get_index(x, y, z, x_res, y_res, z_res);
                // Do stuff with vf[flat] here...
            }
        }
    }
    return 0;
}

CUDA: Using device functors in kernels

I tried to make a device functor that essentially performs (unoptimized) matrix-vector multiplication like so
namespace cusolve
{
// Device functor computing one row of x_out = matrix * x per thread.
// `matrix` is a device pointer to a width x width row-major matrix;
// x and x_out are device pointers of length >= width.
template <class value_type,
          class matrix_type = value_type*,
          class vector_type = value_type*>
struct linear_operator
{
    const matrix_type matrix;
    const size_t width;

    __device__
    linear_operator(const matrix_type matrix, size_t width)
        : matrix(matrix), width(width) { }

    __device__
    void operator()(const vector_type x, vector_type x_out)
    {
        auto row = blockIdx.y * blockDim.y + threadIdx.y;
        // BUG FIX: the original executed `x_out[row] = 0;` BEFORE the
        // bounds check, so every thread with row >= width performed an
        // out-of-bounds global write. The store now happens only inside
        // the guard, and the dot product accumulates in a register
        // instead of re-reading/re-writing global memory each iteration.
        if (row < width)
        {
            value_type acc = 0;
            for (size_t i = 0; i < width; i++)
            {
                acc += matrix[row * width + i] * x[i];
            }
            x_out[row] = acc;
        }
    }
};
So, this assumes that matrix, x, and x_out are device pointers. So, to test it I tried to call it from a simple kernel
// Thin kernel wrapper: builds the device functor and applies it once.
// Each thread ends up computing one row of d_vector_out = d_matrix * d_vector
// (see cusolve::linear_operator for the indexing).
__global__
void
operateKernel(double *d_matrix,
double *d_vector, double *d_vector_out,
size_t width)
{
cusolve::linear_operator<double> matmul(d_matrix, width);
matmul(d_vector, d_vector_out);
}
// Host-side driver: copies `matrix` (width x width) and `vector` (width)
// to the device, runs operateKernel, and copies the result into vector_out.
void
operate(double *matrix, double *vector, double *vector_out, size_t width)
{
    const dim3 blockConfig(16, 16);
    // BUG FIX: the original truncated width/16 toward zero, so the last
    // partial block was dropped whenever width was not a multiple of 16
    // (and the grid was empty for width < 16) — the kernel then left
    // d_vector_out untouched and the output read back as all zeros.
    // Integer ceiling division rounds the grid up instead.
    const size_t gridWidth = (width + 15) / 16;
    const dim3 gridConfig(gridWidth, gridWidth);

    double *d_matrix, *d_vector, *d_vector_out;
    auto mem_vector = width * sizeof(double);
    auto mem_matrix = mem_vector * width;

    // NOTE(review): the CUDA API return codes below are still ignored;
    // production code should check them (or wrap them in an error macro).
    cudaMalloc((void **) &d_matrix, mem_matrix);
    cudaMalloc((void **) &d_vector, mem_vector);
    cudaMalloc((void **) &d_vector_out, mem_vector);
    cudaMemcpy(d_matrix, matrix, mem_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vector, vector, mem_vector, cudaMemcpyHostToDevice);

    operateKernel<<<gridConfig, blockConfig>>>(d_matrix, d_vector, d_vector_out, width);

    cudaMemcpy(vector_out, d_vector_out, mem_vector, cudaMemcpyDeviceToHost);
    cudaFree(d_vector);
    cudaFree(d_matrix);
    cudaFree(d_vector_out);
}
But, when I try to call operate() from main() using allocated and initialized to non-null vectors and a matrix, the output is all zeros. I have been whacking my head over this for quite a while now and have not been able to figure out what it is that I am doing wrong.
P.S: I am deliberately trying to do this without thrust as a learning exercise.
Forgot to use ceil when calculating grid dimensions.
const size_t gridWidth = ceil( ((double) width) / 16.0l );

Passing 2D array into function and filling it with numbers

I'm trying to do task in C++. I need create this function:
// NOTE(review): the array parameter is taken BY VALUE, so the caller's
// array is copied and anything filled in here is lost when the function
// returns — it should be taken by reference (see the answer below).
void fillArray(std::array<std::array<int, maxColumns>, maxRows> array, size_t rows, size_t columns) {
}
Right now my example code looks like this:
#include <iostream>
#include <array>
constexpr int maxColumns = 42;
constexpr int maxRows = 334;
// NOTE(review): pass-by-value means the caller never sees the fill;
// take the array by reference so the mutation is visible.
void fillArray(std::array<std::array<int, maxColumns>, maxRows> array, size_t rows, size_t columns) {
}
int main()
{
// TODO: declare a std::array<std::array<int, maxColumns>, maxRows> here
// and pass it to fillArray, then verify the diagonal numbering.
}
I need to fill the array with numbers from 1 to rows*columns starting from [0][0] and diagonally. How to declare and initialize the function with array in this example and then fill it diagonally? Any help would be greatly appreciated!
It should be
// Deducing the extents as template parameters lets one function accept
// any array-of-array size, and taking the array by reference avoids the
// copy and makes in-place filling visible to the caller.
template <std::size_t maxColumns, std::size_t maxRows>
void fillArray(std::array<std::array<int, maxColumns>, maxRows>& array) {
// ...
}
Demo
Let's suppose you use a simple one-dimensional valarray (or array if you insist) of the size width * height wrapped in a class:
// width x height matrix backed by a single contiguous std::valarray<int>;
// element (x, y) lives at flat offset x + width * y (see item() below).
class Matrix
{
private:
    std::valarray<int> _data; // flat storage, width * height elements, zero-initialized
    int _width, _height;

public:
    // BUG FIX: the original class definition was missing its closing ';',
    // which is a compile error. The member-init list is also reordered to
    // match declaration order (_data is declared first, so it is
    // initialized first regardless of the order written here).
    Matrix(int width, int height) : _data(width * height), _width(width), _height(height)
    {
    }
};
Then you can add a member function that maps x, y coordinates to an item reference:
int& item(int x, int y) { return _data[_width * y + x]; } // element at column x, row y (row-major)
... and another one for filling it diagonally like this:
// Fills the matrix one anti-diagonal at a time, starting from (0, 0).
// Each diagonal begins on the left edge (rows 0.._height-1) and then on
// the bottom edge, and is traversed up-and-right; consecutive cells get
// value, value + step, value + 2*step, ... There are _height + _width - 1
// diagonals in total (the loop bound over-counts by one; the extra
// iteration's inner loop runs zero times).
void fillDiagonally(int value = 0, int step = 1)
{
for (int i = 0; i < _height + _width; ++i) {
// calculate starting coordinates (along left edge and then bottom edge)
int row = i < _height ? i : _height - 1;
int col = i < _height ? 0 : i - _height + 1;
// move diagonally up and right until you reach margins, while filling-in values
for (int j = 0; j < _width - col && j <= row; ++j) {
item(col + j, row - j) = value;
value += step;
}
}
}
and use it like this:
int main()
{
// 8 columns x 5 rows, numbered 1, 2, 3, ... along the anti-diagonals.
Matrix m(8, 5);
m.fillDiagonally(1);
}
This way, you don't need to pass the array as an argument, because it's a part of the class. Otherwise you would have to pass it by reference, like you were suggested above.

Can I convert my 1D vector to a 2D vector faster than this?

The question is quite straightforward. After some trials, here is the most efficient code I found:
//For the sake of the example, I initialize every entry as zero.
vector<float> vector1D(1024 * 768, 0);
vector<vector<float>> vector2D(768, vector<float>(1024,0));
int counter = 0;
for (int i = 0; i < 768; i++) {
for (int j = 0; j < 1024; j++) {
vector2D[i][j] = vector1D[counter++];
}
}
Is there a faster way?
Yes.
You can remap the way you access the elements without needing to copy them. You can create a "view" class to achieve that:
// Non-owning 2-D "view" over a flat std::vector: element (r, c) maps to
// flat index r * stride + c. The referenced vector must outlive the view.
template<typename T>
class two_dee_view
{
public:
    // Wraps (does not own) `vec`; grows it if it is too small to hold
    // row * col elements.
    two_dee_view(std::vector<T>& vec, std::size_t row, std::size_t col)
        : v(vec), stride(col)
    {
        const std::size_t needed = row * col;
        if (v.size() < needed)
            v.resize(needed);
    }

    // Mutable element access at (row, col).
    T& operator()(std::size_t row, std::size_t col)
    {
        return v[row * stride + col];
    }

    // Read-only element access at (row, col).
    T const& operator()(std::size_t row, std::size_t col) const
    {
        return v[row * stride + col];
    }

    // Number of columns in each row.
    std::size_t col_size() const { return stride; }

    // Number of complete rows currently addressable.
    std::size_t row_size() const { return v.size() / stride; }

private:
    std::vector<T>& v;  // borrowed storage — not owned
    std::size_t stride; // row length in elements
};
int main()
{
std::vector<double> v {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
// View the 6 flat elements as a 2 x 3 matrix — no copy is made.
two_dee_view<double> v2d(v, 2, 3);
// Print every element with its (row, col) coordinates, row by row.
for(auto row = 0U; row < v2d.row_size(); ++row)
for(auto col = 0U; col < v2d.col_size(); ++col)
std::cout << row << ", " << col << ": " << v2d(row, col) << '\n';
}
Output:
0, 0: 1
0, 1: 2
0, 2: 3
1, 0: 4
1, 1: 5
1, 2: 6
The class simply maintains a reference to the std::vector you pass in to the constructor. You should only use the two_dee_view as long as the original std::vector lives but no longer.
It might be faster to use memcpy, as that is the lowest possible level of an API for copying memory, and it is likely that there are compiler optimizations which may use specific instructions, etc. and make it faster:
for (int i = 0; i < 768; i++) {
memcpy(vector2D[i].data(), &vector1D[i * 1024], sizeof(float) * 1024);
}
Keep in mind that you shouldn't be using memcpy for anything but trivially-copiable data. That is, it will work fine for float and int but not for classes as the copy constructor will not be called.
If you have to use a vector of vectors for some reason, using memcpy or memmove is faster (because it's a single step, as described in another reply). But you should use the STL instead of doing it by yourself.
vector<float> vector1D(1024 * 768, 0);
vector<vector<float>> vector2D(768, vector<float>(1024, 0));
for (int i = 0; i < 768; i++) {
vector2D[i].assign(next(vector1D.cbegin(), 1024 * i),
next(vector1D.cbegin(), 1024 * (i + 1)));
}
This results in a straight memmove (depending on the STL implementation) but is much more safe, optimized and (possibly) readable.

How to copy dynamic matrix to device memory in CUDA?

In my code I have dynamic matrix.
int ** file_data = (int **)malloc(TRANSACTIONS * sizeof(int *));
file_data[0] = (int *)malloc((a_size+1) * sizeof(int));
file_data[1] = (int *)malloc((a_size+1) * sizeof(int));
file_data[2] = (int *)malloc((a_size+1) * sizeof(int));
................................................................
I want to copy it to device global memory only once.
I have used:
__device__ int raw_data[][];
...................................
...................................
...................................
cudaMemcpyToSymbol(raw_data[i], file_data[i], (a_size+1)*sizeof(int));
But this do not works.
How can I do it?
You need to Flatten the data
If you're only working with rectangular matrices in the first place, I'd recommend always storing your matrices like this anyway, but either way, you'll need to get it into this form before trying to push this data to your device memory.
// Flat, row-major matrix: element (r, c) is stored at r * columns + c.
// The single contiguous buffer makes a whole-matrix cudaMemcpy possible.
template<typename T>
class Matrix {
    std::vector<T> _data;
    size_t rows, columns;
public:
    Matrix(size_t rows, size_t columns) :rows(rows), columns(columns) {
        _data.resize(rows * columns);
    }
    // Mutable access on lvalue matrices; .at() bounds-checks the flat index.
    T & operator()(size_t row, size_t column) & {
        return _data.at(row * columns + column); //Row-Major Ordering
    }
    // Read-only access on const lvalue matrices.
    T const& operator()(size_t row, size_t column) const& {
        return _data.at(row * columns + column);
    }
    // BUG FIX: the original read `T operator() size_t row, ...` — the
    // parameter list was missing its opening parenthesis. It was also
    // unqualified, and a non-ref-qualified overload cannot coexist with
    // the ref-qualified ones above, so this overload is const&&-qualified:
    // rvalue matrices hand the element back by value.
    T operator()(size_t row, size_t column) const&& {
        return _data.at(row * columns + column);
    }
    // Raw pointer to the contiguous buffer — what cudaMemcpy needs.
    T * data() & {
        return _data.data();
    }
    T const* data() const& {
        return _data.data();
    }
    // (rows, columns) pair.
    std::pair<size_t, size_t> size() const {
        return {rows, columns};
    }
    // Total element count.
    size_t flat_size() const {
        return rows * columns;
    }
    // Total storage in bytes — the count argument for a whole-matrix copy.
    size_t byte_size() const {
        return flat_size() * sizeof(T);
    }
};
int ** file_data = (int **)malloc(TRANSACTIONS * sizeof(int *));
file_data[0] = (int *)malloc((a_size+1) * sizeof(int));
file_data[1] = (int *)malloc((a_size+1) * sizeof(int));
file_data[2] = (int *)malloc((a_size+1) * sizeof(int));
//................................................................
Matrix<int> flat_data(TRANSACTIONS, a_size + 1);
for(size_t row = 0; row < TRANSACTIONS; row++) {
for(size_t column = 0; column < a_size + 1; column++) {
flat_data(row, column) = file_data[row][column];
}
}
//ALTERNATIVE: use this instead of your manual mallocs in the first place!
cudaMemcpyToSymbol(flat_data.data(), /*buffer name*/, flat_data.byte_size());
This has the major advantage that you're not having to copy each row individually into their own buffers, you can put all of them together in memory, saving memory and reducing the number of API calls you need to make. And a class designed specifically to handle your functionality won't break when you inevitably make a mistake trying to manually handle all the pointer management in your original code.