Efficient multi-row vector - c++

I need an efficient implementation of a vector with multiple rows, each having the same number of columns, which is not too ugly in C++. Currently I have the following:
class BaseVector {
protected: // variables
int64_t _capacity;
int64_t _nColumns;
protected:
template<typename taItem> void Allocate(taItem * &p, const int64_t nItems) {
p = static_cast<taItem*>(MemPool::Instance().Acquire(sizeof(taItem)*nItems));
if (p == nullptr) {
__debugbreak();
}
}
template<typename taItem> void Reallocate(taItem * &p, const int64_t newCap) {
taItem *np;
Allocate(np, newCap);
Utils::AlignedNocachingCopy(np, p, _nColumns * sizeof(taItem));
MemPool::Instance().Release(p, _capacity * sizeof(taItem));
p = np;
}
// Etc for Release() operation
public:
explicit BaseVector(const int64_t initCap) : _capacity(initCap), _nColumns(0) { }
void Clear() { _nColumns = 0; }
int64_t Size() const { return _nColumns; }
};
class DerivedVector : public BaseVector {
__m256d *_pRowA;
__m256i *_pRowB;
uint64_t *_pRowC;
uint8_t *_pRowD;
// Etc. for other rows
public:
DerivedVector(const int64_t nColumns) : BaseVector(nColumns) {
Allocate(_pRowA, nColumns);
Allocate(_pRowB, nColumns);
Allocate(_pRowC, nColumns);
Allocate(_pRowD, nColumns);
// Etc. for the other rows
}
void IncSize() {
if(_nColumns >= _capacity) {
const int64_t newCap = _capacity + (_capacity >> 1) + 1;
Reallocate(_pRowA, newCap);
Reallocate(_pRowB, newCap);
Reallocate(_pRowC, newCap);
Reallocate(_pRowD, newCap);
// Etc. for other rows
_capacity = newCap;
}
_nColumns++;
}
~DerivedVector() {
// Call here the Release() operation for all rows
}
};
The problem with this approach is that there can be 30 rows, so I have to manually type (and repeat myself) 30 times for Allocate, 30 times for Reallocate, 30 times for Release, etc.
So is there a way in C++ to keep this code DRY and fast? I am OK with macros, but not with heavy polymorphism on each access to a cell of the vector, because that would kill performance.
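One direction that keeps the row list in a single place, sketched under the assumption that C++14 generic lambdas are available and that BaseVector gets a Release() member mirroring Allocate() (the question only hints at it): enumerate the rows once in a ForEachRow() helper and route every Allocate/Reallocate/Release loop through it. Per-cell access stays a plain pointer dereference, so no polymorphism is involved, and adding a row means touching only the member declaration and ForEachRow().
#include <cstdint>
#include <immintrin.h>
class DerivedVector : public BaseVector {
    __m256d  *_pRowA;
    __m256i  *_pRowB;
    uint64_t *_pRowC;
    uint8_t  *_pRowD;
    // Etc. for other rows

    // The only place where the rows are enumerated.
    template<typename taOp> void ForEachRow(taOp&& op) {
        op(_pRowA);
        op(_pRowB);
        op(_pRowC);
        op(_pRowD);
        // Etc. for other rows
    }
public:
    explicit DerivedVector(const int64_t nColumns) : BaseVector(nColumns) {
        ForEachRow([&](auto *&p) { Allocate(p, nColumns); });
    }
    void IncSize() {
        if (_nColumns >= _capacity) {
            const int64_t newCap = _capacity + (_capacity >> 1) + 1;
            ForEachRow([&](auto *&p) { Reallocate(p, newCap); });
            _capacity = newCap;
        }
        _nColumns++;
    }
    ~DerivedVector() {
        // Assumes BaseVector::Release(p, nItems), the counterpart of Allocate().
        ForEachRow([&](auto *&p) { Release(p, _capacity); });
    }
};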

Related

C++ Random Vector/Stream Errors

I have created a structure that contains a vector of unsigned integers for bitstream handling. Essentially, values are written to this vector in sets that are no more than 64 bits long.
The strange behavior comes into play with the different images used. Running the entire code on various images yields strange results. Some images (with no correlation to size) will properly encode and write to the ofstream, while others will cause a vector emplace_back error at the line "ints.emplace_back(field())".
As I am sure most reading this will suggest, I have added a .reserve line to increase the capacity. This does fix that problem, but then (for the same images) a new problem occurs when writing the vector to the file. Now an error occurs in fstream at the line
_Count -= _CSTD fwrite(_Ptr, sizeof(_Elem), static_cast<size_t>(_Count), _Myfile);
But again, this error does not happen for most of the other images, whether they are the same dimensions, larger, or smaller. Any reason for this strange behavior? Or perhaps suggestions on a way to fix the fwrite error? I tried shrink_to_fit(), but that causes the same malloc error when shrinking to fit.
Here is the relevant code:
(structures)
struct field {
private:
uint32_t bits;
public:
field() : bits(0) {}
operator uint32_t() { return bits; }
inline uint32_t pack() { return bits; }
field& set_bits(uint32_t val, uint8_t pos) {
bits |= (val << pos);
return *this;
}
uint32_t get_bits(uint8_t a, uint8_t b) {
return (bits >> a) & ~(0xFFFFFFFF << (b - a));
}
};
struct bstream {
private:
std::vector<field> ints;
size_t position;
size_t length;
public:
bstream() : position(0), ints(0), length(0) {}
bstream(const bstream& p1) { position = p1.position; ints = p1.ints; length = p1.length; }
field* data() { return ints.data(); }
int get_size() { return (ceil(position / 32.0)); } // returns how many 32-bit words are in use
int max_size() { return ints.max_size(); }
int size() { return ints.size(); }
int capacity() { return ints.capacity(); }
bstream& resize(int size) { ints.resize(floor(size / 4)); length = floor(size / 4); return *this; }
bstream& reserve(int size) { ints.reserve(size); return *this; }
bstream& shrink_to_fit() { ints.shrink_to_fit(); return *this; }
bstream& push_chunk(uint32_t C, int Q) {
const int a = (position % 32);
const int b = a + Q;
if (a == 0)
ints.emplace_back(field());
if (b > 32) {
ints.back().set_bits(C, a);
position += (32 - a);
this->push_chunk(C >> (32 - a), b - 32);
}
else {
ints.back().set_bits(C, a);
position += Q;
}
return *this;
}
uint32_t pop_chunk(int Q) {}
bstream& write_int(int C, int Q, const int qmin, bool is_signed) {
this->push_chunk(abs(C) >> qmin, Q - qmin);
if (is_signed && (abs(C) >> qmin) != 0)
this->push_chunk(int(0) < C, 1);
return *this;
}
int read_int(int Q, const int qmin, bool is_signed) {}
bstream& write_descent(int A, int B, const int qmin) {}
int read_descent(int A, const int qmin) {}
bstream& encode_group(int* Q, int* C, int index, int qmin, int sizeQ, int sizeC) {}
bstream& encode_coeffs(int Qg, int Q, int* C, int index, int qmin) {}
void decode_group(int* Q, int* C, int index, int qmin, int sizeQ, int sizeC) {}
void decode_coeffs(int Qg, int& Q, int* C, int index, int qmin) {}
void print() {}
};
(Line used to write to ofstream)
outFile->write(reinterpret_cast<char*>(STREAM.data()), 4.0 * STREAM.get_size());

Cuda C++ design: reusable class with unknown compile-time size

I am looking for a convenient design in order to be able to use a class on the device which has unknown compile-time size.
Only one instance of this class needs to be sent to the device, for which there should be a single call to cudaMalloc and cudaMemcpy (ideally).
The host version of the class would look like this:
class A {
public:
A(int size) : table(size) {
// some useful initialization of table
}
double get(int i) const {
// return some processed element from table
}
private:
std::vector<int> table;
};
The kernel:
__global__ void kernel(const A *a){
int idx = threadIdx.x + blockDim.x * blockIdx.x;
a->get(idx); // do something useful with it
}
So far, the way I would design the device version of the class is like that:
const int sizeMax = 1000;
class A {
public:
A(int size) {
// size checking + some useful initialization of table
}
__host__ __device__
double get(int i) const {
//
}
private:
int table[sizeMax];
};
And the client code:
A a(128);
A* da;
cudaMalloc((void**)&da, sizeof(A));
cudaMemcpy(da, &a, sizeof(A), cudaMemcpyHostToDevice);
kernel<<<1, 32>>>(da);
cudaDeviceSynchronize();
cudaFree(da);
This is rather ugly because:
- it wastes bandwidth by having to use too large a sizeMax in order to be on the safe side
- the class is not closed for modification; the value of sizeMax will inevitably need to be raised at some point
Is there any other way to achieve the same thing in a cleaner way without negative performance impact? To be clear, I only need the device version of the class, the first version is just the equivalent non-CUDA code to illustrate the fact that the table size should be dynamic.
In my comment, I said:
1. separate host and device storage for table, contained in the class, both of which are allocated dynamically.
2. dynamic allocation of table storage size in the constructor, rather than in your client code. This could also include resizing if necessary.
3. differentiation in class methods to use either the host copy of the data or the device copy (i.e. pointer) to the data, depending on whether the method is being executed in host or device code.
4. A method to copy data from host to device or vice versa, as the class context is moved from host to device or vice versa.
Here's an example of what I had in mind:
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <iostream>
template <typename T>
class gpuvec{
private:
T *h_vec = NULL;
T *d_vec = NULL;
size_t vsize = 0;
bool iscopy;
public:
__host__ __device__
T * data(){
#ifndef __CUDA_ARCH__
return h_vec;
#else
return d_vec;
#endif
}
__host__ __device__
T& operator[](size_t i) {
assert(i < vsize);
return data()[i];}
void to_device(){
assert(cudaMemcpy(d_vec, h_vec, vsize*sizeof(T), cudaMemcpyHostToDevice) == cudaSuccess);}
void to_host(){
assert(cudaMemcpy(h_vec, d_vec, vsize*sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess);}
gpuvec(gpuvec &o){
h_vec = o.h_vec;
d_vec = o.d_vec;
vsize = o.vsize;
iscopy = true;}
void copy(gpuvec &o){
free();
iscopy = false;
vsize = o.vsize;
h_vec = (T *)malloc(vsize*sizeof(T));
assert(h_vec != NULL);
assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);
memcpy(h_vec, o.h_vec, vsize*sizeof(T));
assert(cudaMemcpy(d_vec, o.d_vec, vsize*sizeof(T), cudaMemcpyDeviceToDevice) == cudaSuccess);}
gpuvec(size_t ds) {
assert(ds > 0);
iscopy = false;
vsize = ds;
h_vec = (T *)malloc(vsize*sizeof(T));
assert(h_vec != NULL);
assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);}
gpuvec(){
iscopy = false;
}
~gpuvec(){
if (!iscopy) free();}
void free(){
if (d_vec != NULL) cudaFree(d_vec);
d_vec = NULL;
if (h_vec != NULL) ::free(h_vec);
h_vec = NULL;}
__host__ __device__
size_t size() {
return vsize;}
};
template <typename T>
__global__ void test(gpuvec<T> d){
for (int i = 0; i < d.size(); i++){
d[i] += 1;
}
}
int main(){
size_t ds = 10;
gpuvec<int> A(ds);
A.to_device();
test<<<1,1>>>(A);
A.to_host();
for (size_t i = 0; i < ds; i++)
std::cout << A[i];
std::cout << std::endl;
gpuvec<int> B;
B.copy(A);
A.free();
B.to_device();
test<<<1,1>>>(B);
B.to_host();
for (size_t i = 0; i < ds; i++)
std::cout << B[i];
std::cout << std::endl;
B.free();
}
I'm sure quite a few criticisms could be made. This may not adhere to any particular opinion of what "vector syntax" should be. Furthermore I'm sure there are use cases it does not cover, and it may contain outright defects. To create a robust host/device vector realization may require as much work and complexity as thrust host and device vectors. I'm not suggesting that thrust vectors are a drop-in answer for what the question seems to be asking, however.
Based on Robert Crovella's answer, here is a simplified (device only, so ignoring points 3 & 4) working solution:
class A {
public:
A(int size) : table(size) {
// some useful initialization of table
cudaMalloc((void**)&dTable, sizeof(int) * size);
cudaMemcpy(dTable, &table[0], sizeof(int) * size, cudaMemcpyHostToDevice);
}
~A() {
cudaFree(dTable);
}
__device__
double get(int i) const {
// return some processed element of dTable
}
private:
std::vector<int> table;
int *dTable;
};
Kernel and client code stay exactly the same.

Efficient generic buffer queue for sequential processing

I have a producer-consumer queue which is being updated by parallel programs. The queue is queried for various statistics like mean, standard deviation, variance, or something else on the current queue contents. For the mean, this is the code I use:
class BufferQueue {
const int nMaxQueueSize_;
int* values;
int head, tail;
double sum;
::utils::FastMutex queue_mutex;
public:
BufferQueue(const int nMaxQueueSize) :
nMaxQueueSize_(nMaxQueueSize) {
head = tail = 0;
sum = 0;
values = new int[nMaxQueueSize_];
}
void enqueue(int val) {
values[head] = val;
if ((head + 1) % nMaxQueueSize_ == tail) {
queue_mutex.lock();
sum += val - values[tail];
utils::memory_barrier();
head = (1 + head) % nMaxQueueSize_;
tail = (1 + tail) % nMaxQueueSize_;
queue_mutex.unlock();
} else {
queue_mutex.lock();
sum += val;
utils::memory_barrier();
head = (1 + head) % nMaxQueueSize_;
queue_mutex.unlock();
}
}
bool dequeue() {
if (head != tail) {
queue_mutex.lock();
sum -= values[tail];
utils::memory_barrier();
tail = (1 + tail) % nMaxQueueSize_;
queue_mutex.unlock();
return true;
} else {
sum = 0;
return false;
}
}
int& operator[](int i) {
return values[ (tail + i) % nMaxQueueSize_ ];
}
inline int getSize() {
return (head - tail + nMaxQueueSize_) % nMaxQueueSize_;
}
inline double average() {
queue_mutex.lock();
double result = sum / getSize();
queue_mutex.unlock();
return result;
}
~BufferQueue() {
delete[] values;
}
};
NOTE: One important thing to remember is that only one of these operations is performed on a given queue. But neither do I want to repeat code by writing separate implementations like BufferQueueAverage, BufferQueueVariance, etc. I want very limited code redundancy (relying on compiler optimizations). Even branching on the type of the queue for every update seems sub-optimal:
inline double average() {
    queue_mutex.lock();
    double result;
    if (type_is_average) {
        result = sum / getSize();
    } else if (type_is_variance) {
        // update accordingly.
    }
    queue_mutex.unlock();
    return result;
}
What would be a good alternative to this idea?
Note: In this implementation, if the queue is full, the head automatically makes the tail move forward. In other words, the oldest element is deleted automatically.
Thanks
So you want to separate the queue from the statistics. I see two possible solutions:
Use a pattern like Template Method or Strategy to factor out the dependency.
Use a template that does this.
Assuming that all statistics you gather can be gathered incrementally, the latter could look similar to the following (just meant as pseudo code):
class StatisticsMean
{
private:
int n = 0;
double mean = 0.0;
public:
void addSample(int s) { ++n; mean += (s - mean) / n; }
void removeSample(int s) { ... }
double getStatistic() const { return mean; }
}
template <typename TStatistics>
class BufferQueue
{
TStatistics statistics;
...
void enqueue(int val)
{
...
statistics.addSample(val);
}
...
double getStatistic() const { return statistics.getStatistic(); }
}
The template approach gives you full compile-time optimization. You can achieve the same with the Template Method pattern. This would also allow you to have distinct names for the getters (getStatistic() in the above example).
This could look similar to this:
class AbstractBufferQueue
{
virtual void addSample(int s) = 0;
virtual void removeSample(int s) = 0;
void enqueue(int val)
{
...
addSample(val);
}
}
class BufferQueueAverage : public AbstractBufferQueue
{
int n;
double mean;
void addSample(int s) { ++n; mean += (s - mean) / n; }
void removeSample(int s) { ... }
double getAverage() const { return mean; }
}
One way to do what you're asking is by using template classes.
First, decide on a common interface that an accumulator will have. It might be something like:
class accumulator
{
public:
typedef double value_type;
public:
void push(int v); // Called when pushing a new value.
void pop(int v); // Called when popping a new value;
value_type result(size_t n) const; // Returns the current accumulation.
};
As a special case, mean_accumulator could be this:
class mean_accumulator
{
public:
typedef double value_type;
public:
mean_accumulator() : m_sum{0} {}
void push(int v) { m_sum += v; }
void pop(int v) { m_sum -= v; }
double result(size_t n) const { return static_cast<double>(m_sum) / n; }
private:
int m_sum;
};
Now, parameterize your queue by Accumulator, and call it when necessary (while you're at it, note that boost::circular_buffer has much of what you need for the implementation):
template<class Accumulator>
class queue
{
private:
boost::circular_buffer<int> m_buf;
std::mutex m_m;
public:
void push(int v)
{
// Lock the mutex, push to the circular buffer, and the accumulator
}
bool pop()
{
// Lock the mutex; if relevant, update the accumulator and pop the circular buffer
}
typename Accumulator::value_type result() const
{
// Lock the mutex and return the accumulator's result.
}
};
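A rough sketch of how those commented bodies might be filled in, reusing the mean_accumulator above and boost::circular_buffer as suggested; the fixed capacity and the coarse per-call locking are assumptions, not part of the original answer:
#include <boost/circular_buffer.hpp>
#include <cstddef>
#include <mutex>
template<class Accumulator>
class queue
{
private:
    boost::circular_buffer<int> m_buf;
    Accumulator m_acc;
    mutable std::mutex m_m;
public:
    explicit queue(std::size_t capacity) : m_buf(capacity) {}
    void push(int v)
    {
        std::lock_guard<std::mutex> lock(m_m);
        if (m_buf.full())              // the oldest element is about to be
            m_acc.pop(m_buf.front());  // overwritten, so un-accumulate it first
        m_buf.push_back(v);
        m_acc.push(v);
    }
    bool pop()
    {
        std::lock_guard<std::mutex> lock(m_m);
        if (m_buf.empty())
            return false;
        m_acc.pop(m_buf.front());
        m_buf.pop_front();
        return true;
    }
    typename Accumulator::value_type result() const
    {
        std::lock_guard<std::mutex> lock(m_m);
        return m_acc.result(m_buf.size());
    }
};
// usage: queue<mean_accumulator> q(1024); q.push(42); double avg = q.result();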

how to create a contiguous 2d array in c++?

I want to create a function that returns a contiguous 2D array in C++.
It is not a problem to create the array using the command:
int (*v)[cols] = new (int[rows][cols]);
However, I am not sure how to return this array as a general type for a function. The function is:
NOT_SURE_WHAT_TYPE create_array(int rows, int cols)
{
int (*v)[cols] = new (int[rows][cols]);
return v;
}
I tried double*[] and double** and both don't work. I wouldn't want to use double*, since I want to access this array from outside as a 2D array.
Related question: How do I declare a 2d array in C++ using new?
If you want to create an array where the data is contiguous and you don't want a 1-dimensional array (i.e. you want to use the [][] syntax), then the following should work. It creates an array of pointers, and each pointer points to a position into a pool of memory.
#include <iostream>
#include <exception>
#include <stdexcept>   // std::invalid_argument
template <typename T>
T** create2DArray(unsigned nrows, unsigned ncols, const T& val = T())
{
if (nrows == 0)
throw std::invalid_argument("number of rows is 0");
if (ncols == 0)
throw std::invalid_argument("number of columns is 0");
T** ptr = nullptr;
T* pool = nullptr;
try
{
ptr = new T*[nrows]; // allocate pointers (can throw here)
pool = new T[nrows*ncols]{val}; // allocate pool (can throw here)
// now point the row pointers to the appropriate positions in
// the memory pool
for (unsigned i = 0; i < nrows; ++i, pool += ncols )
ptr[i] = pool;
// Done.
return ptr;
}
catch (std::bad_alloc& ex)
{
delete [] ptr; // either this is nullptr or it was allocated
throw ex; // memory allocation error
}
}
template <typename T>
void delete2DArray(T** arr)
{
delete [] arr[0]; // remove the pool
delete [] arr; // remove the pointers
}
int main()
{
try
{
double **dPtr = create2DArray<double>(10,10);
dPtr[0][0] = 10; // for example
delete2DArray(dPtr); // free the memory
}
catch(std::bad_alloc& ex)
{
std::cout << "Could not allocate array";
}
}
Note that only 2 allocations are done. Not only is this more efficient due to the lesser amounts of allocations done, we now have a better chance of doing a rollback of the allocated memory if a memory allocation fails, unlike the "traditional" way of allocating a 2D array in non-contiguous memory:
// The "traditional" non-contiguous allocation of a 2D array (assume N x M)
T** ptr;
ptr = new T*[N];
for (int i = 0; i < N; ++i)
ptr[i] = new T [M]; // <<-- What happens if new[] throws at some iteration?
If new[] throws an exception somewhere during the operation of the for loop, you have to roll back all of the successful calls to new[] that happened previously -- that requires more code and adds complexity.
Note how you deallocate the memory in the contiguous version -- just two calls to delete[] when allocated contiguously instead of a loop calling delete[] for each row.
Also, since the data is in contiguous memory, algorithms, functions, etc. that assume that the data is in contiguous memory, just like a one-dimensional array, can now be used by specifying the start and end range for the M*N matrix:
[&array[0][0], &array[M-1][N])
For example:
std::sort(&myArray[0][0], &myArray[M-1][N]);
will sort the entire matrix in ascending order, starting from index [0][0] up until the last index [M-1][N-1].
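For instance, this minimal program (illustration only, using the create2DArray/delete2DArray functions above) fills the pool, scrambles it, and sorts the whole matrix with a single call:
#include <algorithm>
#include <iostream>
#include <numeric>
int main()
{
    const unsigned M = 3, N = 4;
    int** a = create2DArray<int>(M, N);
    std::iota(&a[0][0], &a[M-1][N], 0);   // fill 0..11 across the contiguous pool
    std::reverse(&a[0][0], &a[M-1][N]);   // scramble so the sort is visible
    std::sort(&a[0][0], &a[M-1][N]);      // sorts all M*N elements at once
    for (unsigned i = 0; i < M; ++i)
    {
        for (unsigned j = 0; j < N; ++j)
            std::cout << a[i][j] << ' ';
        std::cout << '\n';
    }
    delete2DArray(a);
}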
You can improve on the design by making this a true class instead of having allocation / deallocation as 2 separate functions.
Edit: The class is not RAII-like, just as the comment says. I leave that as an exercise for the reader. One thing missing from the code above is the check that nRows and nCols are > 0 when creating such an array.
Edit 2: Added a try-catch to ensure a proper roll back of the memory allocation is done if a std::bad_alloc exception is thrown attempting to allocate memory.
Edit: For a 3 dimensional array example of code similar to the above see this answer. Included is code to roll back allocations if the allocation fails.
Edit: Rudimentary RAII class added:
template <typename T>
class Array2D
{
T** data_ptr;
unsigned m_rows;
unsigned m_cols;
T** create2DArray(unsigned nrows, unsigned ncols, const T& val = T())
{
T** ptr = nullptr;
T* pool = nullptr;
try
{
ptr = new T*[nrows]; // allocate pointers (can throw here)
pool = new T[nrows*ncols]{ val }; // allocate pool (can throw here)
// now point the row pointers to the appropriate positions in
// the memory pool
for (unsigned i = 0; i < nrows; ++i, pool += ncols)
ptr[i] = pool;
// Done.
return ptr;
}
catch (std::bad_alloc& ex)
{
delete[] ptr; // either this is nullptr or it was allocated
throw ex; // memory allocation error
}
}
public:
typedef T value_type;
T** data() {
return data_ptr;
}
unsigned get_rows() const {
return m_rows;
}
unsigned get_cols() const {
return m_cols;
}
Array2D() : data_ptr(nullptr), m_rows(0), m_cols(0) {}
Array2D(unsigned rows, unsigned cols, const T& val = T())
{
if (rows == 0)
throw std::invalid_argument("number of rows is 0");
if (cols == 0)
throw std::invalid_argument("number of columns is 0");
data_ptr = create2DArray(rows, cols, val);
m_rows = rows;
m_cols = cols;
}
~Array2D()
{
if (data_ptr)
{
delete[] data_ptr[0]; // remove the pool
delete[] data_ptr; // remove the pointers
}
}
Array2D(const Array2D& rhs) : m_rows(rhs.m_rows), m_cols(rhs.m_cols)
{
data_ptr = create2DArray(m_rows, m_cols);
std::copy(&rhs.data_ptr[0][0], &rhs.data_ptr[m_rows-1][m_cols], &data_ptr[0][0]);
}
Array2D(Array2D&& rhs) noexcept
{
data_ptr = rhs.data_ptr;
m_rows = rhs.m_rows;
m_cols = rhs.m_cols;
rhs.data_ptr = nullptr;
}
Array2D& operator=(Array2D&& rhs) noexcept
{
if (&rhs != this)
{
swap(rhs, *this);
rhs.data_ptr = nullptr;
}
return *this;
}
void swap(Array2D& left, Array2D& right)
{
std::swap(left.data_ptr, right.data_ptr);
std::swap(left.m_cols, right.m_cols);
std::swap(left.m_rows, right.m_rows);
}
Array2D& operator = (const Array2D& rhs)
{
if (&rhs != this)
{
Array2D temp(rhs);
swap(*this, temp);
}
return *this;
}
T* operator[](unsigned row)
{
return data_ptr[row];
}
const T* operator[](unsigned row) const
{
return data_ptr[row];
}
void create(unsigned rows, unsigned cols, const T& val = T())
{
*this = Array2D(rows, cols, val);
}
};
int main()
{
try
{
Array2D<double> dPtr(10, 10);
std::cout << dPtr[0][0] << " " << dPtr[1][1] << "\n";
}
catch (std::exception& ex)
{
std::cout << ex.what();
}
}
Unless the size of the two dimensions is known at compile time, you don't have much choice: allocate a single rows*cols array of ints, and roll your own 2D indexing with integer multiplication and addition. Wrapping this in a class can produce a nice-looking syntax for accessing array elements with the square bracket operator. Since your array is 2D, you will need to use proxy (AKA "surrogate") objects for the first level of data access.
Here is a small sample code that uses std::vector<T> for maintaining a contiguous memory region in dynamic memory:
#include <cstddef>
#include <vector>
using std::vector;

template<class T>
class Array2D {
vector<T> data;
size_t cols;
public:
// This is the surrogate object for the second-level indexing
template <class U>
class Array2DIndexer {
size_t offset;
vector<U> &data;
public:
Array2DIndexer(size_t o, vector<U> &dt) : offset(o), data(dt) {}
// Second-level indexing is done in this function
T& operator[](size_t index) {
return data[offset+index];
}
};
Array2D(size_t r, size_t c) : data (r*c), cols(c) {}
// First-level indexing is done in this function.
Array2DIndexer<T> operator[](size_t index) {
return Array2DIndexer<T>(index*cols, data);
}
};
You can now use Array2D<int> as if it were a built-in C++ array:
Array2D<int> a2d(10, 20);
for (int r = 0 ; r != 10 ; r++) {
for (int c = 0 ; c != 20 ; c++) {
a2d[r][c] = r+2*c+1;
}
}
Running demo on ideone.
Since you're using C++ and not C, I would recommend to use one vector instead of messing around with new/delete.
You can define one contiguous block of memory like this:
std::vector<int> my_matrix(rows*cols);
And now you access this vector in a 2d-array-like way with the formula i*n + j, with i being the row index, j the column index and n the length of a row:
my_matrix[i*n + j];
That's the same as accessing a 2d array with array[i][j]. But now you have the advantage of one contiguous block of memory, you don't need to bother about new/delete and you can easily share and return this vector object with functions.
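If you want to keep that arithmetic in one place, a tiny helper (just a sketch of the same formula) avoids repeating i*n + j everywhere:
#include <cstddef>
#include <vector>
// Row-major mapping of (i, j) onto the flat vector; n is the row length (cols).
inline std::size_t idx(std::size_t i, std::size_t j, std::size_t n)
{
    return i * n + j;
}
// usage:
// std::vector<int> my_matrix(rows * cols);
// my_matrix[idx(i, j, cols)] = 42;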
Handling raw memory resources is often icky. The best shot is a simple wrapper such as:
struct array2D : private std::vector<int>
{
typedef std::vector<int> base_type;
array2D() : base_type(), height_(0), width_(0) {}
array2D(std::size_t h, std::size_t w) : base_type(h*w), height_(h), width_(w) {}
int operator()(std::size_t i, std::size_t j) const
{
return base_type::operator[](i+j*height_);
}
int& operator()(std::size_t i, std::size_t j)
{
return base_type::operator[](i+j*height_);
}
std::size_t rows() const { return height_; }
std::size_t cols() const { return width_; }
private:
std::size_t height_, width_;
};
Private inheritance lets you grab all the goodies from vector; just add your 2D constructor. Resource management is free, as the vector ctor/dtor will do their magic. Obviously, the i + j*height_ indexing can be changed to whatever storage order you want.
vector< vector< int > > is 2D but won't be contiguous in memory.
Your function then becomes:
array2D create_array(int rows, int cols)
{
return array2D(rows, cols);
}
EDIT:
You can also retrieve other vector interface parts like begin/end or size with a using declaration to make the privately inherited member functions public again.
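A sketch of what that looks like inside the struct (illustration only):
struct array2D : private std::vector<int>
{
    typedef std::vector<int> base_type;
    // Re-expose selected parts of the privately inherited vector interface:
    using base_type::begin;
    using base_type::end;
    using base_type::size;
    // ... the constructors and operator() shown above ...
};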
None of the ways of defining a 2D dynamic array in standard C++ are entirely satisfactory in my opinion.
You end up having to roll your own solutions. Luckily there is already a solution in Boost. boost::multi_array:
#include "boost/multi_array.hpp"
template<typename T>
boost::multi_array<T, 2> create_array(int rows, int cols) {
auto dims = boost::extents[rows][cols];
return boost::multi_array<T, 2>(dims);
}
int main() {
auto array = create_array<int>(4, 3);
array[3][2] = 0;
}
Live demo.
The "Rudimentary RAll" class provided by PaulMcKenzie is an excellent solution. In my use of it I did find a memory leak which is fixed in the version shown below.
The memory leak was due to an issue with
Array2D& operator=(Array2D&& rhs) noexcept.
The statement rhs.m_dataPtr = nullPtr needed to be removed in order to allow the rhs destructor to delete the original data (pool and pointers) swapped from lhs.
Here is the corrected code for the "Rudimentary RAll" class provided by PaulMcKenzie
template <typename T>
class Array2D
{
T** data_ptr;
unsigned m_rows;
unsigned m_cols;
T** create2DArray(unsigned nrows, unsigned ncols, const T& val = T())
{
T** ptr = nullptr;
T* pool = nullptr;
try
{
ptr = new T*[nrows]; // allocate pointers (can throw here)
pool = new T[nrows*ncols]{ val }; // allocate pool (can throw here)
// now point the row pointers to the appropriate positions in
// the memory pool
for (unsigned i = 0; i < nrows; ++i, pool += ncols)
ptr[i] = pool;
// Done.
return ptr;
}
catch (std::bad_alloc& ex)
{
delete[] ptr; // either this is nullptr or it was allocated
throw ex; // memory allocation error
}
}
public:
typedef T value_type;
T** data() {
return data_ptr;
}
unsigned get_rows() const {
return m_rows;
}
unsigned get_cols() const {
return m_cols;
}
Array2D() : data_ptr(nullptr), m_rows(0), m_cols(0) {}
Array2D(unsigned rows, unsigned cols, const T& val = T())
{
if (rows == 0)
throw std::invalid_argument("number of rows is 0");
if (cols == 0)
throw std::invalid_argument("number of columns is 0");
data_ptr = create2DArray(rows, cols, val);
m_rows = rows;
m_cols = cols;
}
~Array2D()
{
if (data_ptr)
{
delete[] data_ptr[0]; // remove the pool
delete[] data_ptr; // remove the pointers
}
}
Array2D(const Array2D& rhs) : m_rows(rhs.m_rows), m_cols(rhs.m_cols)
{
data_ptr = create2DArray(m_rows, m_cols);
std::copy(&rhs.data_ptr[0][0], &rhs.data_ptr[m_rows-1][m_cols], &data_ptr[0][0]);
}
Array2D(Array2D&& rhs) noexcept
{
data_ptr = rhs.data_ptr;
m_rows = rhs.m_rows;
m_cols = rhs.m_cols;
rhs.data_ptr = nullptr;
}
Array2D& operator=(Array2D&& rhs) noexcept
{
if (&rhs != this)
{
swap(rhs, *this);
}
return *this;
}
void swap(Array2D& left, Array2D& right)
{
std::swap(left.data_ptr, right.data_ptr);
std::swap(left.m_cols, right.m_cols);
std::swap(left.m_rows, right.m_rows);
}
Array2D& operator = (const Array2D& rhs)
{
if (&rhs != this)
{
Array2D temp(rhs);
swap(*this, temp);
}
return *this;
}
T* operator[](unsigned row)
{
return data_ptr[row];
}
const T* operator[](unsigned row) const
{
return data_ptr[row];
}
void create(unsigned rows, unsigned cols, const T& val = T())
{
*this = Array2D(rows, cols, val);
}
};
int main()
{
try
{
Array2D<double> dPtr(10, 10);
std::cout << dPtr[0][0] << " " << dPtr[1][1] << "\n";
}
catch (std::exception& ex)
{
std::cout << ex.what();
}
}
I think you should write a simple class to wrap a 1-dim array. Then you can implement a 2-dim array with operator() overloading for getting values and a destructor for releasing the memory. Code as below:
#include <assert.h>
template <typename T>
class Array_2D
{
private:
T *data_inside;
public:
int size[2];
Array_2D(int row, int column);
~Array_2D();
//
T operator()(int index1, int index2){
return data_inside[get_index(index1, index2)];
}
int get_index(int index1, int index2){
if(index1>=0 and index1<size[0] and index2>=0 and index2<size[1]){
return index1*size[1] + index2;
}else{
assert("wrong index for array!" == "True");
}
}
};
template <typename T>
Array_2D<T>::Array_2D(int row, int column)
{
size[0] = row;
size[1] = column;
data_inside = new T[row*column];
}
template <typename T>
Array_2D<T>::~Array_2D()
{
// the destructor automatically releases the memory
delete[] data_inside;
}

C++: Strict aliasing vs union abuse

Apologies in advance for what may be a silly first post on well-trodden ground. While there is plenty of material on the subject, very little of it is definitive and/or intelligible to me.
I have an AlignedArray template class to dynamically allocate memory on the heap with arbitrary alignment (I need 32-byte alignment for AVX assembly routines). This requires some ugly pointer manipulation.
Agner Fog provides a sample class in cppexamples.zip that abuses a union to do so (http://www.agner.org/optimize/optimization_manuals.zip). However, I know that writing to one member of a union and then reading from another results in UB.
AFAICT it is safe to alias any pointer type to a char *, but only in one direction. This is where my understanding gets fuzzy. Here's an abridged version of my AlignedArray
class (essentially a rewrite of Agner's, to help my understanding):
template <typename T, size_t alignment = 32>
class AlignedArray
{
size_t m_size;
char * m_unaligned;
T * m_aligned;
public:
AlignedArray (size_t const size)
: m_size(0)
, m_unaligned(0)
, m_aligned(0)
{
this->size(size);
}
~AlignedArray ()
{
this->size(0);
}
T const & operator [] (size_t const i) const { return m_aligned[i]; }
T & operator [] (size_t const i) { return m_aligned[i]; }
size_t const size () { return m_size; }
void size (size_t const size)
{
if (size > 0)
{
if (size != m_size)
{
char * unaligned = 0;
unaligned = new char [size * sizeof(T) + alignment - 1];
if (unaligned)
{
// Agner:
/*
union {
char * c;
T * t;
size_t s;
} aligned;
aligned.c = unaligned + alignment - 1;
aligned.s &= ~(alignment - 1);
*/
// Me:
T * aligned = reinterpret_cast<T *>((reinterpret_cast<size_t>(unaligned) + alignment - 1) & ~(alignment - 1));
if (m_unaligned)
{
// Agner:
//memcpy(aligned.c, m_aligned, std::min(size, m_size));
// Me:
memcpy(aligned, m_aligned, std::min(size, m_size));
delete [] m_unaligned;
}
m_size = size;
m_unaligned = unaligned;
// Agner:
//m_aligned = aligned.t;
// Me:
m_aligned = aligned;
}
return;
}
return;
}
if (m_unaligned)
{
delete [] m_unaligned;
m_size = 0;
m_unaligned = 0;
m_aligned = 0;
}
}
};
So which method is safe(r)?
I have code that implements the (replacement) new and delete operators, suitable for SIMD (i.e., SSE / AVX). It uses the following functions that you might find useful:
static inline void *G0__SIMD_malloc (size_t size)
{
constexpr size_t align = G0_SIMD_ALIGN;
void *ptr, *uptr;
static_assert(G0_SIMD_ALIGN >= sizeof(void *),
"insufficient alignment for pointer storage");
static_assert((G0_SIMD_ALIGN & (G0_SIMD_ALIGN - 1)) == 0,
"G0_SIMD_ALIGN value must be a power of (2)");
size += align; // raw pointer storage with alignment padding.
if ((uptr = malloc(size)) == nullptr)
return nullptr;
// size_t addr = reinterpret_cast<size_t>(uptr);
uintptr_t addr = reinterpret_cast<uintptr_t>(uptr);
ptr = reinterpret_cast<void *>
((addr + align) & ~(align - 1));
*(reinterpret_cast<void **>(ptr) - 1) = uptr; // (raw ptr)
return ptr;
}
static inline void G0__SIMD_free (void *ptr)
{
if (ptr != nullptr)
free(*(reinterpret_cast<void **>(ptr) - 1)); // (raw ptr)
}
This should be easy to adapt. Obviously you would replace malloc and free, since you're using the global new and delete for raw (char) storage. It assumes that size_t is sufficiently wide for address arithmetic - true in practice, but uintptr_t from <cstdint> would be more correct.
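A sketch of that adaptation (my reading of the suggestion, with made-up names, using the global operator new/operator delete for the raw storage; note that operator new throws instead of returning nullptr):
#include <cstddef>   // std::size_t
#include <cstdint>   // std::uintptr_t
#include <new>       // ::operator new / ::operator delete
static inline void *SIMD_alloc(std::size_t size)
{
    constexpr std::size_t align = 32;   // stand-in for G0_SIMD_ALIGN
    static_assert(align >= sizeof(void *), "insufficient alignment for pointer storage");
    static_assert((align & (align - 1)) == 0, "alignment must be a power of 2");
    size += align;                                   // room for the shift + raw pointer
    void *uptr = ::operator new(size);               // throws std::bad_alloc on failure
    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(uptr);
    void *ptr = reinterpret_cast<void *>((addr + align) & ~static_cast<std::uintptr_t>(align - 1));
    *(reinterpret_cast<void **>(ptr) - 1) = uptr;    // stash the raw pointer just below ptr
    return ptr;
}
static inline void SIMD_free(void *ptr)
{
    if (ptr != nullptr)
        ::operator delete(*(reinterpret_cast<void **>(ptr) - 1));
}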
To answer your question, both of those methods are just as safe. The only two operations that are really stinky there are the cast to size_t and new char[stuff]. You should at least be using uintptr_t from <cstdint> for the first. The second operation creates your only pointer aliasing issue as technically the char constructor is run on each char element and that constitutes accessing the data through the char pointer. You should use malloc instead.
The other supposed 'pointer aliasing' isn't an issue. And that's because other than the new operation you aren't accessing any data through the aliased pointers. You are only accessing data through the T * you get after alignment.
Of course, you have to remember to construct all of your array elements. This is true even in your version. Who knows what kind of T people will put there. And, of course, if you do that, you'll have to remember to call their destructors, and have to remember to handle exceptions when you copy them (memcpy doesn't cut it).
If you have a particular C++11 feature, you do not need to do this. C++11 has a function specifically for aligning pointers to arbitrary boundaries. The interface is a little funky, but it should do the job. The call is ::std::align, defined in <memory>. Thanks to R. Martinho Fernandes for pointing it out.
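For reference, a small sketch of what using it could look like (a hypothetical helper, not the class above):
#include <cstddef>
#include <memory>    // std::align
// Given a raw buffer of raw_bytes bytes, return a pointer inside it that is aligned
// to 'alignment' and has room for count objects of T, or nullptr if it does not fit.
template <typename T, std::size_t alignment = 32>
T *align_within(void *raw, std::size_t raw_bytes, std::size_t count)
{
    void *p = raw;
    std::size_t space = raw_bytes;
    if (std::align(alignment, count * sizeof(T), p, space) == nullptr)
        return nullptr;    // buffer too small once aligned
    return static_cast<T *>(p);
}
// usage: allocate count * sizeof(T) + alignment - 1 raw bytes, then call align_within on it.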
Here is a version of your function with the suggested fixes:
#include <cstdint> // For uintptr_t
#include <cstdlib> // For malloc
#include <algorithm>
template <typename T, size_t alignment = 32>
class AlignedArray
{
size_t m_size;
void * m_unaligned;
T * m_aligned;
public:
AlignedArray (size_t const size)
: m_size(0)
, m_unaligned(0)
, m_aligned(0)
{
this->size(size);
}
~AlignedArray ()
{
this->size(0);
}
T const & operator [] (size_t const i) const { return m_aligned[i]; }
T & operator [] (size_t const i) { return m_aligned[i]; }
size_t size() const { return m_size; }
void size (size_t const size)
{
using ::std::uintptr_t;
using ::std::malloc;
if (size > 0)
{
if (size != m_size)
{
void * unaligned = 0;
unaligned = malloc(size * sizeof(T) + alignment - 1);
if (unaligned)
{
T * aligned = reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(unaligned) + alignment - 1) & ~(alignment - 1));
if (m_unaligned)
{
::std::size_t constructed = 0;
const ::std::size_t num_to_copy = ::std::min(size, m_size);
try {
for (constructed = 0; constructed < num_to_copy; ++constructed) {
new(aligned + constructed) T(m_aligned[constructed]);
}
for (; constructed < size; ++constructed) {
new(aligned + constructed) T;
}
} catch (...) {
for (::std::size_t i = 0; i < constructed; ++i) {
aligned[i].T::~T();
}
::std::free(unaligned);
throw;
}
for (size_t i = 0; i < m_size; ++i) {
m_aligned[i].T::~T();
}
free(m_unaligned);
}
m_size = size;
m_unaligned = unaligned;
m_aligned = aligned;
}
}
} else if (m_unaligned) { // size == 0
for (::std::size_t i = 0; i < m_size; ++i) {
m_aligned[i].T::~T();
}
::std::free(m_unaligned);
m_size = 0;
m_unaligned = 0;
m_aligned = 0;
}
}
};