I am looking for a convenient design in order to be able to use a class on the device which has unknown compile-time size.
Only one instance of this class needs to be sent to the device, for which there should be a single call to cudaMalloc and cudaMemcpy (ideally).
The host version of the class would look like this:
Class A {
public:
A(int size) : table(size) {
// some useful initialization of table
}
double get(int i) const {
// return some processed element from table
}
private:
std::vector<int> table;
};
The kernel:
__global__ void kernel(const A *a){
int idx = threadIdx.x + blockDim.x * blockIdx.x;
a->get(idx); // do something useful with it
}
So far, the way I would design the device version of the class is like that:
const int sizeMax = 1000;
Class A {
public:
A(int size) {
// size checking + some useful initialization of table
}
__host__ __device__
double get(int i) const {
//
}
private:
int table[sizeMax];
};
And the client code:
A a(128);
A* da;
cudaMalloc((void**)&da, sizeof(A));
cudaMemcpy(da, &a, sizeof(A), cudaMemcpyHostToDevice);
kernel<<<1, 32>>>(da);
cudaDeviceSynchronize();
cudaFree(da);
This is rather ugly because:
it wastes bandwith by having to use too large a sizeMax in order to
be on the safe side
the class is not closed for modification, the value of sizeMax will
inevitably need to be raised at some point
Is there any other way to achieve the same thing in a cleaner way without negative performance impact? To be clear, I only need the device version of the class, the first version is just the equivalent non-CUDA code to illustrate the fact that the table size should be dynamic.
In my comment, I said:
separate host and device storage for table, contained in the class, both of which are allocated dynamically. 2. dynamic allocation of table storage size in the constructor, rather than in your client code. This could also include resizing if necessary. 3. differentiation in class methods to use either the host copy of the data or the device copy (i.e. pointer) to the data, depending on whether the method is being executed in host or device code 4. A method to copy data from host to device or vice versa, as the class context is moved from host to device or vice versa.
Here's an example of what I had in mind:
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <iostream>
template <typename T>
class gpuvec{
private:
T *h_vec = NULL;
T *d_vec = NULL;
size_t vsize = 0;
bool iscopy;
public:
__host__ __device__
T * data(){
#ifndef __CUDA_ARCH__
return h_vec;
#else
return d_vec;
#endif
}
__host__ __device__
T& operator[](size_t i) {
assert(i < vsize);
return data()[i];}
void to_device(){
assert(cudaMemcpy(d_vec, h_vec, vsize*sizeof(T), cudaMemcpyHostToDevice) == cudaSuccess);}
void to_host(){
assert(cudaMemcpy(h_vec, d_vec, vsize*sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess);}
gpuvec(gpuvec &o){
h_vec = o.h_vec;
d_vec = o.d_vec;
vsize = o.vsize;
iscopy = true;}
void copy(gpuvec &o){
free();
iscopy = false;
vsize = o.vsize;
h_vec = (T *)malloc(vsize*sizeof(T));
assert(h_vec != NULL);
assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);
memcpy(h_vec, o.h_vec, vsize*sizeof(T));
assert(cudaMemcpy(d_vec, o.d_vec, vsize*sizeof(T), cudaMemcpyDeviceToDevice) == cudaSuccess);}
gpuvec(size_t ds) {
assert(ds > 0);
iscopy = false;
vsize = ds;
h_vec = (T *)malloc(vsize*sizeof(T));
assert(h_vec != NULL);
assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);}
gpuvec(){
iscopy = false;
}
~gpuvec(){
if (!iscopy) free();}
void free(){
if (d_vec != NULL) cudaFree(d_vec);
d_vec = NULL;
if (h_vec != NULL) ::free(h_vec);
h_vec = NULL;}
__host__ __device__
size_t size() {
return vsize;}
};
template <typename T>
__global__ void test(gpuvec<T> d){
for (int i = 0; i < d.size(); i++){
d[i] += 1;
}
}
int main(){
size_t ds = 10;
gpuvec<int> A(ds);
A.to_device();
test<<<1,1>>>(A);
A.to_host();
for (size_t i = 0; i < ds; i++)
std::cout << A[i];
std::cout << std::endl;
gpuvec<int> B;
B.copy(A);
A.free();
B.to_device();
test<<<1,1>>>(B);
B.to_host();
for (size_t i = 0; i < ds; i++)
std::cout << B[i];
std::cout << std::endl;
B.free();
}
I'm sure quite a few criticisms could be made. This may not adhere to any particular opinion of what "vector syntax" should be. Furthermore I'm sure there are use cases it does not cover, and it may contain outright defects. To create a robust host/device vector realization may require as much work and complexity as thrust host and device vectors. I'm not suggesting that thrust vectors are a drop-in answer for what the question seems to be asking, however.
Based on Robert Crovella's answer, here is a simplified (device only, so ignoring points 3 & 4) working solution:
Class A {
public:
A(int size) : table(size) {
// some useful initialization of table
cudaMalloc((void**)&dTable, sizeof(int) * size);
cudaMemcpy(dTable, &table[0], sizeof(int) * size, cudaMemcpyHostToDevice);
}
~A() {
cudaFree(dTable);
}
__device__
double get(int i) const {
// return some processed element of dTable
}
private:
std::vector<int> table;
int *dTable;
};
Kernel and client code stay exactly the same.
Related
I need an efficient implementation of a vector with multiple rows, each having the same number of columns, which is not too ugly in C++. Currently I have the following:
class BaseVector {
protected: // variables
int64_t _capacity;
int64_t _nColumns;
protected:
template<typename taItem> void Allocate(taItem * &p, const int64_t nItems) {
p = static_cast<taItem*>(MemPool::Instance().Acquire(sizeof(taItem)*nItems));
if (p == nullptr) {
__debugbreak();
}
}
template<typename taItem> void Reallocate(taItem * &p, const int64_t newCap) {
taItem *np;
Allocate(np, newCap);
Utils::AlignedNocachingCopy(np, p, _nColumns * sizeof(taItem));
MemPool::Instance().Release(p, _capacity * sizeof(taItem));
p = np;
}
// Etc for Release() operation
public:
explicit BaseVector(const int64_t initCap) : _capacity(initCap), _nColumns(0) { }
void Clear() { _nColumns = 0; }
int64_t Size() const { return _nColumns; }
};
class DerivedVector : public BaseVector {
__m256d *_pRowA;
__m256i *_pRowB;
uint64_t *_pRowC;
uint8_t *_pRowD;
// Etc. for other rows
public:
DerivedVector(const int64_t nColumns) : BaseVector(nColumns) {
Allocate(_pRowA, nColumns);
Allocate(_pRowB, nColumns);
Allocate(_pRowC, nColumns);
Allocate(_pRowD, nColumns);
// Etc. for the other rows
}
void IncSize() {
if(_nColumns >= _capacity) {
const int64_t newCap = _capacity + (_capacity >> 1) + 1;
Reallocate(_pRowA, newCap);
Reallocate(_pRowB, newCap);
Reallocate(_pRowC, newCap);
Reallocate(_pRowD, newCap);
// Etc. for other rows
_capacity = newCap;
}
_nColumns++;
}
~DerivedVector() {
// Call here the Release() operation for all rows
}
};
The problem with this approach is that there can be 30 rows, so I have to type manually (and repeat myself) 30 times Allocate, 30 times Reallocate, 30 times Release, etc.
So is there a way in C++ to keep this code DRY and fast? I am ok with macros, but not heavy polymorphism in each access to a cell in the vector because this would kill performance.
I am trying to use the vector::data() pointer when using cudaMalloc, cudaMemcpy, and cublasSgemm but I can't seem to get it to work. If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector so it should be the same as having a T* aArray pointer to an array of type T stored in memory. Using the latter does work, but not the data() pointer.
Here is the code I am working on:
Matrix<T> Matrix<T>::cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
{
C = Matrix<T>(A.height, B.width); //resizing of the vector of elements for Matrix C
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;
T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! CUBLAS initialization error\n";
}
status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! kernel execution error.\n";
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! shutdown error (A)\n";
}
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
The GetPointer() member function returns vector::data() of the vector of elements for that Matrix object. Size is the vector element's size in memory.
The vector of Matrix C returns all zeros when using the data() pointer, and returns the product of Matrix A and B when using T* aArray pointers without vectors.
Is it actually possible to use vectors to store the array of elements and then the data() pointer to initialize the device copy of the array, or am I forced to use the C style array storage on the host? Also, I have tried using thrust::device_vector and that works but I would like to stay away from creating raw_pointer_casts.
Thanks for your help!
Edit:
For those having trouble with copy and pasting, here is the complete example:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include <cublas_v2.h>
#include <vector>
#include <iostream>
using namespace std;
template<typename T> class Matrix
{
public:
~Matrix();
Matrix();
Matrix(int rows, int columns);
int width;
int height;
int stride;
size_t size;
T &GetElement(int row, int column);
void SetElement(int row, int column, T value);
void SetElements(vector<T> value);
vector<T>& GetElements();
T* GetPointer();
Matrix<T> cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C);
private:
vector<T> elements;
T* firstElement;
};
template<typename T>
Matrix<T>::~Matrix()
{
}
template<typename T>
Matrix<T>::Matrix()
{
}
template<typename T>
Matrix<T>::Matrix(int rows, int columns)
{
height = rows;
width = columns;
stride = columns; //in row major order this is equal to the # of columns
elements.resize(rows*columns);
firstElement = elements.data();
size = height*width*sizeof(T);
}
template<typename T>
T &Matrix<T>::GetElement(int row, int column)
{
return elements[row*width + column]; //row major order return
}
template<typename T>
vector<T>& Matrix<T>::GetElements()
{
return elements; //row major order return
}
template<typename T>
void Matrix<T>::SetElement(int row, int column, T value)
{
elements[row*width + column] = value; //row major order return
}
template<typename T>
void Matrix<T>::SetElements(vector<T> value)
{
elements = value;
}
template<typename T>
T* Matrix<T>::GetPointer()
{
return firstElement;
}
template<typename T>
//Matrix Multiplication using CUDA
Matrix<T> Matrix<T>::cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
{
C = Matrix<T>(A.height, B.width);
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;
//Thrust usage
/*thrust::device_vector<T> d_A = A.GetElements();
T* d_a = thrust::raw_pointer_cast(&d_A[0]);
thrust::device_vector<T> d_B = B.GetElements();
T* d_b = thrust::raw_pointer_cast(&d_B[0]);
thrust::device_vector<T> d_C = C.GetElements();
T* d_c = thrust::raw_pointer_cast(&d_C[0]);*/
T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_c,C.GetPointer(),C.size,cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! CUBLAS initialization error\n";
}
status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! kernel execution error.\n";
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! shutdown error (A)\n";
}
//thrust::copy(d_C.begin(), d_C.end(), C.GetElements().begin());
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return C;
}
int main()
{
Matrix<float> A(2,2);
Matrix<float> B(2,2);
Matrix<float> C;
vector<float> aE(4,2);
vector<float> bE(4,4);
A.SetElements(aE);
B.SetElements(bE);
C = C.cudaProd(A, B, C); //function call to cudaProd()
for(int row = 0; row < A.height; ++row)
{
for(int col = 0; col < A.width; ++col)
{
cout<<A.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
for(int row = 0; row < B.height; ++row)
{
for(int col = 0; col < B.width; ++col)
{
cout<<B.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
for(int row = 0; row < C.height; ++row)
{
for(int col = 0; col < C.width; ++col)
{
cout<<C.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
}
If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector so it should be the same as having a T* aArray pointer to an array of type T stored in memory.
The std::vector class is an owning resource class. It means that trying to manage the underlying resource yourself with the data pointer will make you enter a world of pain.
For this very same reason:
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
and:
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
and:
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
cannot possibly work.
From the std::vector::data documentation, data() returns both const and non-const qualified pointers, depending on the fact that the vector is qualified as const or not. Quoting the documentation
If the vector object is const-qualified, the function returns a pointer to const value_type. Otherwise, it returns a pointer to value_type.
Accordingly, using
firstElement = elements.data();
in the Matrix constructor is fine to read/write the data.
The main problem with your code is that you are declaring C in the main, passing a reference to C to the cudaProd method and then internally using
C = Matrix<T>(A.height, B.width);
which will redeclare the Matrix.
If you change the definition of the cudaProd method to
template<typename T>
void cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
remove the
return C;
statement and allocate space for C in the main as
Matrix<float> C(2,2);
vector<float> cE(4,10);
C.SetElements(cE);
your code should work correctly.
I want to create a function that returns a contiguous 2D array in C++.
It is not a problem to create the array using the command:
int (*v)[cols] = new (int[rows][cols]);
However, I am not sure how to return this array as a general type for a function. The function is:
NOT_SURE_WHAT_TYPE create_array(int rows, int cols)
{
int (*v)[cols] = new (int[rows][cols]);
return v;
}
I tried double*[] and double** and both don't work. I wouldn't want to use double*, since I want to access this array from outside as a 2D array.
Related question: How do I declare a 2d array in C++ using new?
If you want to create an array where the data is contiguous and you don't want a 1-dimensional array (i.e. you want to use the [][] syntax), then the following should work. It creates an array of pointers, and each pointer points to a position into a pool of memory.
#include <iostream>
#include <exception>
template <typename T>
T** create2DArray(unsigned nrows, unsigned ncols, const T& val = T())
{
if (nrows == 0)
throw std::invalid_argument("number of rows is 0");
if (ncols == 0)
throw std::invalid_argument("number of columns is 0");
T** ptr = nullptr;
T* pool = nullptr;
try
{
ptr = new T*[nrows]; // allocate pointers (can throw here)
pool = new T[nrows*ncols]{val}; // allocate pool (can throw here)
// now point the row pointers to the appropriate positions in
// the memory pool
for (unsigned i = 0; i < nrows; ++i, pool += ncols )
ptr[i] = pool;
// Done.
return ptr;
}
catch (std::bad_alloc& ex)
{
delete [] ptr; // either this is nullptr or it was allocated
throw ex; // memory allocation error
}
}
template <typename T>
void delete2DArray(T** arr)
{
delete [] arr[0]; // remove the pool
delete [] arr; // remove the pointers
}
int main()
{
try
{
double **dPtr = create2DArray<double>(10,10);
dPtr[0][0] = 10; // for example
delete2DArray(dPtr); // free the memory
}
catch(std::bad_alloc& ex)
{
std::cout << "Could not allocate array";
}
}
Note that only 2 allocations are done. Not only is this more efficient due to the lesser amounts of allocations done, we now have a better chance of doing a rollback of the allocated memory if a memory allocation fails, unlike the "traditional" way of allocating a 2D array in non-contiguous memory:
// The "traditional" non-contiguous allocation of a 2D array (assume N x M)
T** ptr;
ptr = new T*[N];
for (int i = 0; i < N; ++i)
ptr[i] = new T [M]; // <<-- What happens if new[] throws at some iteration?
If new[] throws an exception somewhere during the operation of the for loop, you have to roll back all of the successful calls to new[] that happened previously -- that requires more code and adds complexity.
Note how you deallocate the memory in the contiguous version -- just two calls to delete[] when allocated contiguously instead of a loop calling delete[] for each row.
Also, since the data is in contiguous memory, algorithms, functions, etc. that assume that the data is in contiguous memory, just like a one-dimensional array, can now be used by specifying the start and end range for the M*N matrix:
[&array[0][0], &array[M-1][N])
For example:
std::sort(&myArray[0][0], &myArray[M-1][N]);
will sort the entire matrix in ascending order, starting from index [0][0] up until the last index [M-1][N-1].
You can improve on the design by making this a true class instead of having allocation / deallocation as 2 separate functions.
Edit: The class is not RAII-like, just as the comment says. I leave that as an exercise for the reader. One thing missing from the code above is the check that nRows and nCols are > 0 when creating such an array.
Edit 2: Added a try-catch to ensure a proper roll back of the memory allocation is done if a std::bad_alloc exception is thrown attempting to allocate memory.
Edit: For a 3 dimensional array example of code similar to the above see this answer. Included is code to roll back allocations if the allocation fails.
Edit: Rudimentary RAII class added:
template <typename T>
class Array2D
{
T** data_ptr;
unsigned m_rows;
unsigned m_cols;
T** create2DArray(unsigned nrows, unsigned ncols, const T& val = T())
{
T** ptr = nullptr;
T* pool = nullptr;
try
{
ptr = new T*[nrows]; // allocate pointers (can throw here)
pool = new T[nrows*ncols]{ val }; // allocate pool (can throw here)
// now point the row pointers to the appropriate positions in
// the memory pool
for (unsigned i = 0; i < nrows; ++i, pool += ncols)
ptr[i] = pool;
// Done.
return ptr;
}
catch (std::bad_alloc& ex)
{
delete[] ptr; // either this is nullptr or it was allocated
throw ex; // memory allocation error
}
}
public:
typedef T value_type;
T** data() {
return data_ptr;
}
unsigned get_rows() const {
return m_rows;
}
unsigned get_cols() const {
return m_cols;
}
Array2D() : data_ptr(nullptr), m_rows(0), m_cols(0) {}
Array2D(unsigned rows, unsigned cols, const T& val = T())
{
if (rows == 0)
throw std::invalid_argument("number of rows is 0");
if (cols == 0)
throw std::invalid_argument("number of columns is 0");
data_ptr = create2DArray(rows, cols, val);
m_rows = rows;
m_cols = cols;
}
~Array2D()
{
if (data_ptr)
{
delete[] data_ptr[0]; // remove the pool
delete[] data_ptr; // remove the pointers
}
}
Array2D(const Array2D& rhs) : m_rows(rhs.m_rows), m_cols(rhs.m_cols)
{
data_ptr = create2DArray(m_rows, m_cols);
std::copy(&rhs.data_ptr[0][0], &rhs.data_ptr[m_rows-1][m_cols], &data_ptr[0][0]);
}
Array2D(Array2D&& rhs) noexcept
{
data_ptr = rhs.data_ptr;
m_rows = rhs.m_rows;
m_cols = rhs.m_cols;
rhs.data_ptr = nullptr;
}
Array2D& operator=(Array2D&& rhs) noexcept
{
if (&rhs != this)
{
swap(rhs, *this);
rhs.data_ptr = nullptr;
}
return *this;
}
void swap(Array2D& left, Array2D& right)
{
std::swap(left.data_ptr, right.data_ptr);
std::swap(left.m_cols, right.m_cols);
std::swap(left.m_rows, right.m_rows);
}
Array2D& operator = (const Array2D& rhs)
{
if (&rhs != this)
{
Array2D temp(rhs);
swap(*this, temp);
}
return *this;
}
T* operator[](unsigned row)
{
return data_ptr[row];
}
const T* operator[](unsigned row) const
{
return data_ptr[row];
}
void create(unsigned rows, unsigned cols, const T& val = T())
{
*this = Array2D(rows, cols, val);
}
};
int main()
{
try
{
Array2D<double> dPtr(10, 10);
std::cout << dPtr[0][0] << " " << dPtr[1][1] << "\n";
}
catch (std::exception& ex)
{
std::cout << ex.what();
}
}
Unless the size of the two dimensions is known at compile time, your don't have much choice: allocate a single rows*cols array of ints, and roll your own 2D indexing with integer multiplication and addition. Wrapping this in a class can produce a nice-looking syntax for accessing array elements with square bracket operator. Since your array is 2D, you will need to use proxy (AKA "surrogate") objects for the first level of data access.
Here is a small sample code that uses std::vector<T> for maintaining a contiguous memory region in dynamic memory:
template<class T>
class Array2D {
vector<T> data;
size_t cols;
public:
// This is the surrogate object for the second-level indexing
template <class U>
class Array2DIndexer {
size_t offset;
vector<U> &data;
public:
Array2DIndexer(size_t o, vector<U> &dt) : offset(o), data(dt) {}
// Second-level indexing is done in this function
T& operator[](size_t index) {
return data[offset+index];
}
};
Array2D(size_t r, size_t c) : data (r*c), cols(c) {}
// First-level indexing is done in this function.
Array2DIndexer<T> operator[](size_t index) {
return Array2DIndexer<T>(index*cols, data);
}
};
You can now use Array2D<int> as if it were a built-in C++ array:
Array2D<int> a2d(10, 20);
for (int r = 0 ; r != 10 ; r++) {
for (int c = 0 ; c != 20 ; c++) {
a2d[r][c] = r+2*c+1;
}
}
Running demo on ideone.
Since you're using C++ and not C, I would recommend to use one vector instead of messing around with new/delete.
You can define one contiguous block of memory like this:
std::vector<int> my_matrix(rows*cols);
And now you access this vector in a 2d-array-like way with the formula i*n + j, with i being the row index, j the column index and n the length of a row:
my_matrix[i*n + j];
That's the same as accessing a 2d array with array[i][j]. But now you have the advantage of one contiguous block of memory, you don't need to bother about new/delete and you can easily share and return this vector object with functions.
handling raw memory ressources is often icky. Best shot is a simple wrapper as :
struct array2D : private std::vector<int>
{
typedef std::vector<int> base_type;
array2D() : base_type(), height_(0), width_(0) {}
array2D(std::size_t h, std::size_t w) : base_type(h*w), height_(h), width_(w);
int operator()(std::size_t i, std::size_t j) const
{
return base_type::operator[](i+j*height_);
}
int& operator()(std::size_t i, std::size_t j)
{
return base_type::operator[](i+j*height_);
}
std::size_t rows() const { return height_; }
std::size_t cols() const { return width_; }
private:
std::size_t height_, width_;
}
private inheritance let you grab all the goodies from vector, just add your 2D constructor. Ressources management is free as vector ctor/dtor will do their magic. Obviously, the i+h*j can be changed to whateever storage order you want.
vector< vector< int > > is 2D but won't be contiguous in memory.
Your function then become :
array2D create_array(int rows, int cols)
{
return array2D(cols,rows);
}
EDIT:
You can also retrieve other vector interface parts like begin/end or size with the usign clause to make the private inherited member functions public again.
None of the ways of defining a 2D dynamic array in standard C++ are entirely satisfactory in my opinion.
You end up having to roll your own solutions. Luckily there is already a solution in Boost. boost::multi_array:
#include "boost/multi_array.hpp"
template<typename T>
boost::multi_array<T, 2> create_array(int rows, int cols) {
auto dims = boost::extents[rows][cols];
return boost::multi_array<T, 2>(dims);
}
int main() {
auto array = create_array<int>(4, 3);
array[3][2] = 0;
}
Live demo.
The "Rudimentary RAll" class provided by PaulMcKenzie is an excellent solution. In my use of it I did find a memory leak which is fixed in the version shown below.
The memory leak was due to an issue with
Array2D& operator=(Array2D&& rhs) noexcept.
The statement rhs.m_dataPtr = nullPtr needed to be removed in order to allow the rhs destructor to delete the original data (pool and pointers) swapped from lhs.
Here is the corrected code for the "Rudimentary RAll" class provided by PaulMcKenzie
template <typename T>
class Array2D
{
T** data_ptr;
unsigned m_rows;
unsigned m_cols;
T** create2DArray(unsigned nrows, unsigned ncols, const T& val = T())
{
T** ptr = nullptr;
T* pool = nullptr;
try
{
ptr = new T*[nrows]; // allocate pointers (can throw here)
pool = new T[nrows*ncols]{ val }; // allocate pool (can throw here)
// now point the row pointers to the appropriate positions in
// the memory pool
for (unsigned i = 0; i < nrows; ++i, pool += ncols)
ptr[i] = pool;
// Done.
return ptr;
}
catch (std::bad_alloc& ex)
{
delete[] ptr; // either this is nullptr or it was allocated
throw ex; // memory allocation error
}
}
public:
typedef T value_type;
T** data() {
return data_ptr;
}
unsigned get_rows() const {
return m_rows;
}
unsigned get_cols() const {
return m_cols;
}
Array2D() : data_ptr(nullptr), m_rows(0), m_cols(0) {}
Array2D(unsigned rows, unsigned cols, const T& val = T())
{
if (rows == 0)
throw std::invalid_argument("number of rows is 0");
if (cols == 0)
throw std::invalid_argument("number of columns is 0");
data_ptr = create2DArray(rows, cols, val);
m_rows = rows;
m_cols = cols;
}
~Array2D()
{
if (data_ptr)
{
delete[] data_ptr[0]; // remove the pool
delete[] data_ptr; // remove the pointers
}
}
Array2D(const Array2D& rhs) : m_rows(rhs.m_rows), m_cols(rhs.m_cols)
{
data_ptr = create2DArray(m_rows, m_cols);
std::copy(&rhs.data_ptr[0][0], &rhs.data_ptr[m_rows-1][m_cols], &data_ptr[0][0]);
}
Array2D(Array2D&& rhs) noexcept
{
data_ptr = rhs.data_ptr;
m_rows = rhs.m_rows;
m_cols = rhs.m_cols;
rhs.data_ptr = nullptr;
}
Array2D& operator=(Array2D&& rhs) noexcept
{
if (&rhs != this)
{
swap(rhs, *this);
}
return *this;
}
void swap(Array2D& left, Array2D& right)
{
std::swap(left.data_ptr, right.data_ptr);
std::swap(left.m_cols, right.m_cols);
std::swap(left.m_rows, right.m_rows);
}
Array2D& operator = (const Array2D& rhs)
{
if (&rhs != this)
{
Array2D temp(rhs);
swap(*this, temp);
}
return *this;
}
T* operator[](unsigned row)
{
return data_ptr[row];
}
const T* operator[](unsigned row) const
{
return data_ptr[row];
}
void create(unsigned rows, unsigned cols, const T& val = T())
{
*this = Array2D(rows, cols, val);
}
};
int main()
{
try
{
Array2D<double> dPtr(10, 10);
std::cout << dPtr[0][0] << " " << a2[0][0] << "\n";
}
catch (std::exception& ex)
{
std::cout << ex.what();
}
}
I think you should write a simple class to wrap a 1-dim array. Then you can implement a 2-dim array with operator() overloading for getting values and deconstruct func for release the memory. Code as below:
#include <assert.h>
template <typename T>
class Array_2D
{
private:
T *data_inside;
public:
int size[2];
Array_2D(int row, int column);
~Array_2D();
//
T operator()(int index1, int index2){
return data_inside[get_index(index1, index2)];
}
int get_index(int index1, int index2){
if(index1>=0 and index1<size[0] and index2>=0 and index2<=size[1]){
return index1*size[0] + index2;
}else{
assert("wrong index for array!" == "True");
}
}
};
template <typename T>
Array_2D<T>::Array_2D(int row, int column)
{
size[0] = row;
size[1] = column;
data_inside = new T[row*column];
}
template <typename T>
Array_2D<T>::~Array_2D()
{
// 使用析构函数,自动释放资源
delete[] data_inside;
}
Apologies in advance for what may be a silly first post on well-trodden ground. While there is plenty of material on the subject, very little of it is definitive and/or intelligible to me.
I have an AlignedArray template class to dynamically allocate memory on the heap with arbitrary alignment (I need 32-byte alignment for AVX assembly routines). This requires some ugly pointer manipulation.
Agner Fog provides a sample class in cppexamples.zip that abuses a union to do so (http://www.agner.org/optimize/optimization_manuals.zip). However, I know that writing to one member of a union and then reading from another results in UB.
AFAICT it is safe to alias any pointer type to a char *, but only in one direction. This is where my understanding gets fuzzy. Here's an abridged version of my AlignedArray
class (essentially a rewrite of Agner's, to help my understanding):
template <typename T, size_t alignment = 32>
class AlignedArray
{
size_t m_size;
char * m_unaligned;
T * m_aligned;
public:
AlignedArray (size_t const size)
: m_size(0)
, m_unaligned(0)
, m_aligned(0)
{
this->size(size);
}
~AlignedArray ()
{
this->size(0);
}
T const & operator [] (size_t const i) const { return m_aligned[i]; }
T & operator [] (size_t const i) { return m_aligned[i]; }
size_t const size () { return m_size; }
void size (size_t const size)
{
if (size > 0)
{
if (size != m_size)
{
char * unaligned = 0;
unaligned = new char [size * sizeof(T) + alignment - 1];
if (unaligned)
{
// Agner:
/*
union {
char * c;
T * t;
size_t s;
} aligned;
aligned.c = unaligned + alignment - 1;
aligned.s &= ~(alignment - 1);
*/
// Me:
T * aligned = reinterpret_cast<T *>((reinterpret_cast<size_t>(unaligned) + alignment - 1) & ~(alignment - 1));
if (m_unaligned)
{
// Agner:
//memcpy(aligned.c, m_aligned, std::min(size, m_size));
// Me:
memcpy(aligned, m_aligned, std::min(size, m_size));
delete [] m_unaligned;
}
m_size = size;
m_unaligned = unaligned;
// Agner:
//m_aligned = aligned.t;
// Me:
m_aligned = aligned;
}
return;
}
return;
}
if (m_unaligned)
{
delete [] m_unaligned;
m_size = 0;
m_unaligned = 0;
m_aligned = 0;
}
}
};
So which method is safe(r)?
I have code that implements the (replacement) new and delete operators, suitable for SIMD (i.e., SSE / AVX). It uses the following functions that you might find useful:
static inline void *G0__SIMD_malloc (size_t size)
{
constexpr size_t align = G0_SIMD_ALIGN;
void *ptr, *uptr;
static_assert(G0_SIMD_ALIGN >= sizeof(void *),
"insufficient alignment for pointer storage");
static_assert((G0_SIMD_ALIGN & (G0_SIMD_ALIGN - 1)) == 0,
"G0_SIMD_ALIGN value must be a power of (2)");
size += align; // raw pointer storage with alignment padding.
if ((uptr = malloc(size)) == nullptr)
return nullptr;
// size_t addr = reinterpret_cast<size_t>(uptr);
uintptr_t addr = reinterpret_cast<uintptr_t>(uptr);
ptr = reinterpret_cast<void *>
((addr + align) & ~(align - 1));
*(reinterpret_cast<void **>(ptr) - 1) = uptr; // (raw ptr)
return ptr;
}
static inline void G0__SIMD_free (void *ptr)
{
if (ptr != nullptr)
free(*(reinterpret_cast<void **>(ptr) - 1)); // (raw ptr)
}
This should be easy to adapt. Obviously you would replace malloc and free, since you're using the global new and delete for raw (char) storage. It assumes that size_t is sufficiently wide for address arithmetic - true in practice, but uintptr_t from <cstdint> would be more correct.
To answer your question, both of those methods are just as safe. The only two operations that are really stinky there are the cast to size_t and new char[stuff]. You should at least be using uintptr_t from <cstdint> for the first. The second operation creates your only pointer aliasing issue as technically the char constructor is run on each char element and that constitutes accessing the data through the char pointer. You should use malloc instead.
The other supposed 'pointer aliasing' isn't an issue. And that's because other than the new operation you aren't accessing any data through the aliased pointers. You are only accessing data through the T * you get after alignment.
Of course, you have to remember to construct all of your array elements. This is true even in your version. Who knows what kind of T people will put there. And, of course, if you do that, you'll have to remember to call their destructors, and have to remember to handle exceptions when you copy them (memcpy doesn't cut it).
If you have a particular C++11 feature, you do not need to do this. C++11 has a function specifically for aligning pointers to arbitrary boundaries. The interface is a little funky, but it should do the job. The call is ::std::align defined in <memory>.Thanks to R. Martinho Fernandes for pointing it out.
Here is a version of your function with the suggested fixed:
#include <cstdint> // For uintptr_t
#include <cstdlib> // For malloc
#include <algorithm>
template <typename T, size_t alignment = 32>
class AlignedArray
{
size_t m_size;
void * m_unaligned;
T * m_aligned;
public:
AlignedArray (size_t const size)
: m_size(0)
, m_unaligned(0)
, m_aligned(0)
{
this->size(size);
}
~AlignedArray ()
{
this->size(0);
}
T const & operator [] (size_t const i) const { return m_aligned[i]; }
T & operator [] (size_t const i) { return m_aligned[i]; }
size_t size() const { return m_size; }
void size (size_t const size)
{
using ::std::uintptr_t;
using ::std::malloc;
if (size > 0)
{
if (size != m_size)
{
void * unaligned = 0;
unaligned = malloc(size * sizeof(T) + alignment - 1);
if (unaligned)
{
T * aligned = reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(unaligned) + alignment - 1) & ~(alignment - 1));
if (m_unaligned)
{
::std::size_t constructed = 0;
const ::std::size_t num_to_copy = ::std::min(size, m_size);
try {
for (constructed = 0; constructed < num_to_copy; ++constructed) {
new(aligned + constructed) T(m_aligned[constructed]);
}
for (; constructed < size; ++constructed) {
new(aligned + constructed) T;
}
} catch (...) {
for (::std::size_t i = 0; i < constructed; ++i) {
aligned[i].T::~T();
}
::std::free(unaligned);
throw;
}
for (size_t i = 0; i < m_size; ++i) {
m_aligned[i].T::~T();
}
free(m_unaligned);
}
m_size = size;
m_unaligned = unaligned;
m_aligned = aligned;
}
}
} else if (m_unaligned) { // and size <= 0
for (::std::size_t i = 0; i < m_size; ++i) {
m_aligned[i].T::~T();
}
::std::free(m_unaligned);
m_size = 0;
m_unaligned = 0;
m_aligned = 0;
}
}
};
Hey i'm new to c++ and still working out its perticularities. I'm having the darnedest time trying to figure out whats going wrong with this code. I've stepped through it and everything is calculating correctly. The issue is that value_array in the base class doesn't seem to be retaining the values once the derived class Calculate function ends. I think i've declared and allocated the array properly. I'm stumped...
#include <iostream>
class Indicator
{
protected:
double * value_array;
double * input_array;
int input_size;
public:
Indicator(double input[], int size)
{
input_array = input;
input_size = size;
value_array = new double[size]; // issue with value_array
}
double operator[] (int index) { return value_array[index]; }
void virtual Calculate() {}
~Indicator() { delete[] value_array; }
};
class SMA : public Indicator
{
private:
int nperiod;
double sum;
public:
SMA(double input[], int size, int period) : Indicator(input, size)
{
nperiod = period;
sum = 0;
Calculate();
}
void Calculate();
};
void SMA::Calculate()
{
for (int i=0; i<input_size; i++)
{
if (i > nperiod - 1)
{
sum += input_array[i] - input_array[i-nperiod];
value_array[i] = sum / nperiod;
}
else
{
sum += input_array[i];
value_array[i] = sum / (i+1);
}
}
}
int main(int argc, const char *argv[]) {
double input[] = {1,2,3,4,5,6,7,8,9,10};
Indicator indicator = SMA(input,10,5);
double value = indicator[0];
std::cout << "value: " << value << std::endl;
std::cin.get();
exit(0);
}
Update:
Here is the code implemented with vectors. I wanted to leave the input as double[] to be consistent with other libraries, any other potential issues I should be aware of?
#include <iostream>
#include <vector>
class Indicator
{
protected:
std::vector<double> value_vector;
double * input_array;
int input_size;
public:
Indicator(double input[], int size)
{
input_array = input;
input_size = size;
value_vector.reserve(size);
}
double operator[] (int index) { return value_vector[index]; }
void virtual Calculate() {}
};
class SMA : public Indicator
{
private:
int nperiod;
double sum;
public:
SMA(double input[], int size, int period) : Indicator(input, size)
{
nperiod = period;
sum = 0;
Calculate();
}
void Calculate();
};
void SMA::Calculate()
{
for (int i=0; i<input_size; i++)
{
if (i > nperiod - 1)
{
sum += input_array[i] - input_array[i-nperiod];
value_vector.push_back(sum / nperiod);
}
else
{
sum += input_array[i];
value_vector.push_back(sum / (i+1));
}
std::cout << "sma: " << value_vector[i] << std::endl;
}
}
int main(int argc, const char *argv[]) {
double input[] = {1,2,3,4,5,6,7,8,9,10};
Indicator indicator = SMA(input,10,5);
for (int i=0; i<10; i++)
{
std::cout << "main: " << indicator[i] << std::endl;
}
std::cin.get();
exit(0);
}
That's because you're violating the Rule of Three. Since your class manages a resource, it needs a copy constructor and an assignment operator. I strongly suggest replacing any T* data member with a std::vector<T> data member. Then you don't need to write those special member functions manually.
Hia,
a few things are wrong.
As FredOverflow says you need a copy constructor and assignment, something like:
Indicator::Indicator(const Indicator& other)
{
input_size = other.input_size;
//direct copy of reference as indicator doesn't own this data
//Note a shared pointer (such as boost::shared_ptr) would be better than a naked reference
input_array = other.input_array;
//construct a new set of data
value_array = new double[input_size];
//do you want to copy the data too? maybe a memcpy follows?
memcpy(value_array, other.value_array, input_size*sizeof(double));
}
Then you need an assignment
Indicator&
Indicator::operator=(const Indicator& other)
{
//make sure you are not assigning itself
if(this != &other)
{
input_size = other.input_size;
//direct copy of reference as indicator doesn't own this data
//Note a shared pointer (such as boost::shared_ptr) would be better than a naked reference
input_array = other.input_array;
//destroy old data and construct a new set of data
delete[] value_array;
value_array = new double[input_size];
//do you want to copy the data too? maybe a memcpy follows?
memcpy(value_array, other.value_array, input_size*sizeof(double));
}
return *this;
}
You probably also want to make the destructor virtual - see here for why -
it helps prevent memory leaks in the destructor of SMA
virtual ~Indicator() { delete[] value_array; }
Use std::vector instead of raw arrays.
std::vector handles all the memory management and copying and so forth.
Cheers & hth.,