I am trying to use the vector::data() pointer with cudaMalloc, cudaMemcpy, and cublasSgemm, but I can't seem to get it to work. If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector, so it should be equivalent to having a T* aArray pointer to an array of type T stored in memory. Using the latter does work, but not the data() pointer.
Here is the code I am working on:
Matrix<T> Matrix<T>::cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
{
C = Matrix<T>(A.height, B.width); //resizing of the vector of elements for Matrix C
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;
T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! CUBLAS initialization error\n";
}
status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! kernel execution error.\n";
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! shutdown error (A)\n";
}
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
The GetPointer() member function returns vector::data() of the vector of elements for that Matrix object, and size is the size of the element data in bytes (height*width*sizeof(T)).
The element vector of Matrix C comes back all zeros when I use the data() pointer, but it contains the product of Matrix A and B when I use plain T* aArray pointers without vectors.
Is it actually possible to use vectors to store the array of elements and then use the data() pointer to initialize the device copy of the array, or am I forced to use C-style array storage on the host? I have also tried thrust::device_vector, and that works, but I would like to stay away from raw_pointer_cast.
Thanks for your help!
Edit:
For those having trouble with copy and pasting, here is the complete example:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include <cublas_v2.h>
#include <vector>
#include <iostream>
using namespace std;
template<typename T> class Matrix
{
public:
~Matrix();
Matrix();
Matrix(int rows, int columns);
int width;
int height;
int stride;
size_t size;
T &GetElement(int row, int column);
void SetElement(int row, int column, T value);
void SetElements(vector<T> value);
vector<T>& GetElements();
T* GetPointer();
Matrix<T> cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C);
private:
vector<T> elements;
T* firstElement;
};
template<typename T>
Matrix<T>::~Matrix()
{
}
template<typename T>
Matrix<T>::Matrix()
{
}
template<typename T>
Matrix<T>::Matrix(int rows, int columns)
{
height = rows;
width = columns;
stride = columns; //in row major order this is equal to the # of columns
elements.resize(rows*columns);
firstElement = elements.data();
size = height*width*sizeof(T);
}
template<typename T>
T &Matrix<T>::GetElement(int row, int column)
{
return elements[row*width + column]; //row major order return
}
template<typename T>
vector<T>& Matrix<T>::GetElements()
{
return elements; //row major order return
}
template<typename T>
void Matrix<T>::SetElement(int row, int column, T value)
{
elements[row*width + column] = value; //row major order return
}
template<typename T>
void Matrix<T>::SetElements(vector<T> value)
{
elements = value;
}
template<typename T>
T* Matrix<T>::GetPointer()
{
return firstElement;
}
template<typename T>
//Matrix Multiplication using CUDA
Matrix<T> Matrix<T>::cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
{
C = Matrix<T>(A.height, B.width);
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;
//Thrust usage
/*thrust::device_vector<T> d_A = A.GetElements();
T* d_a = thrust::raw_pointer_cast(&d_A[0]);
thrust::device_vector<T> d_B = B.GetElements();
T* d_b = thrust::raw_pointer_cast(&d_B[0]);
thrust::device_vector<T> d_C = C.GetElements();
T* d_c = thrust::raw_pointer_cast(&d_C[0]);*/
T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_c,C.GetPointer(),C.size,cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! CUBLAS initialization error\n";
}
status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! kernel execution error.\n";
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! shutdown error (A)\n";
}
//thrust::copy(d_C.begin(), d_C.end(), C.GetElements().begin());
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return C;
}
int main()
{
Matrix<float> A(2,2);
Matrix<float> B(2,2);
Matrix<float> C;
vector<float> aE(4,2);
vector<float> bE(4,4);
A.SetElements(aE);
B.SetElements(bE);
C = C.cudaProd(A, B, C); //function call to cudaProd()
for(int row = 0; row < A.height; ++row)
{
for(int col = 0; col < A.width; ++col)
{
cout<<A.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
for(int row = 0; row < B.height; ++row)
{
for(int col = 0; col < B.width; ++col)
{
cout<<B.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
for(int row = 0; row < C.height; ++row)
{
for(int col = 0; col < C.width; ++col)
{
cout<<C.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
}
If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector so it should be the same as having a T* aArray pointer to an array of type T stored in memory.
The std::vector class is an owning resource class, which means that trying to manage the underlying resource yourself through the data() pointer will lead you into a world of pain.
For this very same reason:
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
and:
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
and:
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
cannot possibly work.
From the std::vector::data documentation, data() returns either a const-qualified or a non-const-qualified pointer, depending on whether the vector itself is const-qualified. Quoting the documentation:
If the vector object is const-qualified, the function returns a pointer to const value_type. Otherwise, it returns a pointer to value_type.
Accordingly, using
firstElement = elements.data();
in the Matrix constructor is fine to read/write the data.
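For example (a small illustration, not part of the original post):
std::vector<float> v(4);
float* p = v.data();                  // non-const vector -> float*
const std::vector<float>& cv = v;
const float* cp = cv.data();          // const vector -> const float*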
The main problem with your code is that you declare C in main, pass a reference to C into the cudaProd method, and then internally use
C = Matrix<T>(A.height, B.width);
which replaces C with a freshly constructed Matrix. The implicitly generated copy assignment copies firstElement from that temporary, so once the temporary is destroyed, C.GetPointer() returns a dangling pointer instead of a pointer into C's own element vector.
If you change the definition of the cudaProd method to
template<typename T>
void cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
remove the
return C;
statement and allocate space for C in the main as
Matrix<float> C(2,2);
vector<float> cE(4,10);
C.SetElements(cE);
your code should work correctly.
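With those changes, the call site in main would look roughly like this (a sketch assembled from the snippets above; it assumes cudaProd is now a free function template returning void):
Matrix<float> A(2,2);
Matrix<float> B(2,2);
Matrix<float> C(2,2);
vector<float> aE(4,2);
vector<float> bE(4,4);
vector<float> cE(4,10);
A.SetElements(aE);
B.SetElements(bE);
C.SetElements(cE);
cudaProd(A, B, C);   // C now holds the product; no assignment, no return value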
Related
I am looking for a convenient design in order to be able to use a class on the device which has unknown compile-time size.
Only one instance of this class needs to be sent to the device, for which there should be a single call to cudaMalloc and cudaMemcpy (ideally).
The host version of the class would look like this:
class A {
public:
A(int size) : table(size) {
// some useful initialization of table
}
double get(int i) const {
// return some processed element from table
}
private:
std::vector<int> table;
};
The kernel:
__global__ void kernel(const A *a){
int idx = threadIdx.x + blockDim.x * blockIdx.x;
a->get(idx); // do something useful with it
}
So far, this is how I would design the device version of the class:
const int sizeMax = 1000;
class A {
public:
A(int size) {
// size checking + some useful initialization of table
}
__host__ __device__
double get(int i) const {
//
}
private:
int table[sizeMax];
};
And the client code:
A a(128);
A* da;
cudaMalloc((void**)&da, sizeof(A));
cudaMemcpy(da, &a, sizeof(A), cudaMemcpyHostToDevice);
kernel<<<1, 32>>>(da);
cudaDeviceSynchronize();
cudaFree(da);
This is rather ugly because:
it wastes bandwidth by having to use too large a sizeMax in order to be on the safe side
the class is not closed for modification; the value of sizeMax will inevitably need to be raised at some point
Is there any other way to achieve the same thing in a cleaner way without negative performance impact? To be clear, I only need the device version of the class, the first version is just the equivalent non-CUDA code to illustrate the fact that the table size should be dynamic.
In my comment, I said:
1. Separate host and device storage for table, contained in the class, both of which are allocated dynamically.
2. Dynamic allocation of table storage size in the constructor, rather than in your client code. This could also include resizing if necessary.
3. Differentiation in class methods to use either the host copy of the data or the device copy (i.e. pointer) to the data, depending on whether the method is being executed in host or device code.
4. A method to copy data from host to device or vice versa, as the class context is moved from host to device or vice versa.
Here's an example of what I had in mind:
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <iostream>
template <typename T>
class gpuvec{
private:
T *h_vec = NULL;
T *d_vec = NULL;
size_t vsize = 0;
bool iscopy;
public:
__host__ __device__
T * data(){
#ifndef __CUDA_ARCH__
return h_vec;
#else
return d_vec;
#endif
}
__host__ __device__
T& operator[](size_t i) {
assert(i < vsize);
return data()[i];}
void to_device(){
assert(cudaMemcpy(d_vec, h_vec, vsize*sizeof(T), cudaMemcpyHostToDevice) == cudaSuccess);}
void to_host(){
assert(cudaMemcpy(h_vec, d_vec, vsize*sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess);}
gpuvec(gpuvec &o){
h_vec = o.h_vec;
d_vec = o.d_vec;
vsize = o.vsize;
iscopy = true;}
void copy(gpuvec &o){
free();
iscopy = false;
vsize = o.vsize;
h_vec = (T *)malloc(vsize*sizeof(T));
assert(h_vec != NULL);
assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);
memcpy(h_vec, o.h_vec, vsize*sizeof(T));
assert(cudaMemcpy(d_vec, o.d_vec, vsize*sizeof(T), cudaMemcpyDeviceToDevice) == cudaSuccess);}
gpuvec(size_t ds) {
assert(ds > 0);
iscopy = false;
vsize = ds;
h_vec = (T *)malloc(vsize*sizeof(T));
assert(h_vec != NULL);
assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);}
gpuvec(){
iscopy = false;
}
~gpuvec(){
if (!iscopy) free();}
void free(){
if (d_vec != NULL) cudaFree(d_vec);
d_vec = NULL;
if (h_vec != NULL) ::free(h_vec);
h_vec = NULL;}
__host__ __device__
size_t size() {
return vsize;}
};
template <typename T>
__global__ void test(gpuvec<T> d){
for (int i = 0; i < d.size(); i++){
d[i] += 1;
}
}
int main(){
size_t ds = 10;
gpuvec<int> A(ds);
A.to_device();
test<<<1,1>>>(A);
A.to_host();
for (size_t i = 0; i < ds; i++)
std::cout << A[i];
std::cout << std::endl;
gpuvec<int> B;
B.copy(A);
A.free();
B.to_device();
test<<<1,1>>>(B);
B.to_host();
for (size_t i = 0; i < ds; i++)
std::cout << B[i];
std::cout << std::endl;
B.free();
}
I'm sure quite a few criticisms could be made. This may not adhere to any particular opinion of what "vector syntax" should be. Furthermore I'm sure there are use cases it does not cover, and it may contain outright defects. To create a robust host/device vector realization may require as much work and complexity as thrust host and device vectors. I'm not suggesting that thrust vectors are a drop-in answer for what the question seems to be asking, however.
Based on Robert Crovella's answer, here is a simplified (device only, so ignoring points 3 & 4) working solution:
class A {
public:
A(int size) : table(size) {
// some useful initialization of table
cudaMalloc((void**)&dTable, sizeof(int) * size);
cudaMemcpy(dTable, &table[0], sizeof(int) * size, cudaMemcpyHostToDevice);
}
~A() {
cudaFree(dTable);
}
__device__
double get(int i) const {
// return some processed element of dTable
}
private:
std::vector<int> table;
int *dTable;
};
Kernel and client code stay exactly the same.
I'm trying to overload the addition operator, but I keep getting a segmentation fault even though I'm passing in the argument by value after its memory has been deallocated. Does anybody have any idea what I could be doing wrong? Also, once I get this running properly after fixing the overloaded addition, I need to use vectors instead of pointers to arrays, which I have no idea how to declare in a manner equivalent to what I've written for arrays.
RowAray.h
#ifndef ROWARAY_H // if constant ROWARAY_H not defined do not execute
#define ROWARAY_H // defines constant ROWARAY_H
#include <new> // Needed for bad_alloc exception
#include <cstdlib> // Needed for the exit function
template <class T>
class RowAray{
private:
int size;
T *rowData;
void memError(); // Handles memory allocation errors
void subError(); // Handles subscripts out of range
public:
RowAray(T); //used to construct row Array object
~RowAray(){delete [] rowData;} //used to deallocate dynamically allocated memory from Row array
int getSize(){return size;} //inline accessor member function used to return length of Row array
void setData(int row, T value);
T getData(int i){return (( i >=0&& i < size)?rowData[i]:0);} //
T &operator[](const int &);
};
template <class T>
RowAray<T>::RowAray(T colSize){
size =colSize>1?colSize:1;
// Allocate memory for the array.
try
{
rowData = new T [size];
}
catch (bad_alloc)
{
memError();
}
// Initialize the array.
for (int count = 0; count < size; count++){
T value = rand()%90+10;
setData(count, value);
}
}
template <class T>
void RowAray<T>::memError()
{
cout << "ERROR:Cannot allocate memory.\n";
exit(EXIT_FAILURE);
}
template <class T>
void RowAray<T>::subError()
{
cout << "ERROR: Subscript out of range.\n";
exit(EXIT_FAILURE);
}
template <class T>
T &RowAray<T>::operator[](const int &sub)
{
if (sub < 0 || sub >= size)
subError();
else
return rowData[sub];
}
template <class T>
void RowAray<T>::setData(int row, T value){
//used to fill array with random 2 digit #s
*(rowData + row) = value;
}
#endif /* ROWARAY_H */
Table.h
#ifndef TABLE_H
#define TABLE_H
#include "RowAray.h"
template <class T>
class Table{
private:
int szRow;
int szCol;
RowAray<T> **records;
public:
Table(int,int); //used to construct Table object
Table(const Table &);
~Table(); //used to deallocate dynamically allocated memory from Table object
int getSzRow() const{return szRow;} //used to return row size
int getSzCol()const {return szCol;}
Table operator+(const Table &);
T getRec(int, int) const; //used to return inserted random numbers of 2d arrays
};
template <class T>
Table<T>::Table(int r, int c ){
//Set the row size
this->szRow = r;
//Declare the record array
records = new RowAray<T>*[this->szRow];
//Size each row
this->szCol = c;
//Create the record arrays
for(int i=0;i<this->szRow;i++){
records[i]=new RowAray<T>(this->szCol);
}
}
template <class T>
Table<T>::Table(const Table &Tab){
szRow=Tab.getSzRow();
szCol=Tab.getSzCol();
records = new RowAray<T>*[szCol];
for(int i = 0; i < this->szCol; i++){
records[i] = new RowAray<T>(szRow);
}
//set elements = to random value
for(int row = 0; row < szRow; row++){
for(int col = 0; col < this->szCol; col++){
int value = Tab.getRec(row, col);
records[col]->setData(row,value);
}
}
}
template <class T>
T Table<T>::getRec(int row, int col) const{
//if else statement used to return randomly generated numbers of array
if(row >= 0 && row < this->szRow && col >= 0 && col < this->szCol){
return records[row]->getData(row);
}else{
return 0;
}
}
template <class T>
Table<T>::~Table(){
//Delete each record
for(int i=0;i<this->szRow;i++){
delete records[i];
}
delete []records;
}
template <class T>
Table<T> Table<T>::operator+(const Table &Tab){
Table temp(Tab.getSzRow(), Tab.getSzCol());
//set elements = to random value for operation to
for(int row=0; row < szRow; row++){
for(int col=0; col < szCol; col++){
int value = getRec(row, col) + Tab.getRec(row, col);
temp.records[col]->setData(row,value);
}
}
return temp;
}
#endif /* TABLE_H */
main.cpp
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <iomanip>
using namespace std;
//User Libraries
#include "Table.h"
//Global Constants
//Function Prototype
template<class T>
void prntRow(T *,int);
template<class T>
void prntTab(const Table<T> &);
//Execution Begins Here!
int main(int argc, char** argv) {
//Initialize the random seed
srand(static_cast<unsigned int>(time(0)));
//Declare Variables
int rows=3,cols=4;
//Test out the Row with integers and floats
RowAray<int> a(3);
RowAray<float> b(4);
cout<<"Test the Integer Row "<<endl;
prntRow(&a,3);
cout<<"Test the Float Row "<<endl;
prntRow(&b,4);
//Test out the Table with a float
Table<float> tab1(rows,cols);
Table<float> tab2(tab1);
Table<float> tab3 = tab1 + tab2;
cout<<"Float Table 3 size is [row,col] = Table 1 + Table 2 ["
<<rows<<","<<cols<<"]";
prntTab(tab3);
//Exit Stage Right
return 0;
}
template<class T>
void prntRow(T *a,int perLine){
cout<<fixed<<setprecision(1)<<showpoint<<endl;
for(int i=0;i<a->getSize();i++){
cout<<a->getData(i)<<" ";
if(i%perLine==(perLine-1))cout<<endl;
}
cout<<endl;
}
template<class T>
void prntTab(const Table<T> &a){
cout<<fixed<<setprecision(1)<<showpoint<<endl;
for(int row=0;row<a.getSzRow();row++){
for(int col=0;col<a.getSzCol();col++){
cout<<setw(8)<<a.getRec(row,col);
}
cout<<endl;
}
cout<<endl;
}
Your error stems from line 85 of Table.h, in your + operator overload:
temp.records[col]->setData(row,value);
should really be
temp.records[row]->setData(col,value);
Apart from that, I spotted another error in the same file in line 62 (method Table<T>::getRec(int row, int col)):
return records[row]->getData(row);
should be
return records[row]->getData(col);
Apart from those problems, I would highly advise you to rethink and restructure your code (maybe you could also try a code review), since a large part of it is superfluous in my opinion, and some things could become problematic as the project grows. For example, RowAray's constructor takes a T colSize as its parameter, where you should probably use an int or, better, a size_t; the code as it is will not work when T is not implicitly convertible to an integer type.
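Regarding the second part of your question, here is a minimal sketch (my illustration, not a drop-in replacement) of what RowAray could look like using std::vector and a size_t size, as suggested above:
#include <vector>
#include <cstddef>   // std::size_t
#include <cstdlib>   // std::rand

template <class T>
class RowAray {
private:
    std::vector<T> rowData;
public:
    explicit RowAray(std::size_t colSize) : rowData(colSize > 1 ? colSize : 1) {
        // fill with random two-digit numbers, as in the original constructor
        for (std::size_t i = 0; i < rowData.size(); ++i)
            rowData[i] = static_cast<T>(std::rand() % 90 + 10);
    }
    std::size_t getSize() const { return rowData.size(); }
    void setData(std::size_t i, T value) { rowData[i] = value; }
    T getData(std::size_t i) const { return i < rowData.size() ? rowData[i] : T(); }
    T& operator[](std::size_t i) { return rowData.at(i); }   // throws on out-of-range access
};
// no destructor, copy constructor, or assignment operator needed:
// std::vector manages the memory for you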
In my code, I have a statically allocated array in global memory (i.e., allocated using __device__) that I want to sort with thrust::sort, but it isn't working. All of the examples on this topic use arrays allocated through the CUDA runtime (using cudaMalloc). Is there any way I can sort a statically allocated array?
I guess it has something to do with statically allocated memory not being accessible from the host. With cudaMalloc-allocated arrays it works fine. However, I want to avoid that type of allocation, since static allocation allows for easier access to the data from device code (doesn't it?).
Minimal (not-) working example:
#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#define N 4
typedef struct element {
int key;
int value;
__host__ __device__ bool operator<(element e) const
{ return key > e.key; }
} element;
__device__ element array[N];
__global__ void init() {
for (int i = 0; i < N; ++i) {
array[N - i - 1].key = i;
}
}
__global__ void print_array() {
for (int i = 0; i < N; ++i) {
printf("%d ", array[i].key);
}
printf("\n");
}
int main(void) {
thrust::device_ptr<element> array_first(array);
init<<<1,1>>>();
printf("unsorted: ");
print_array<<<1, 1>>>();
cudaDeviceSynchronize();
thrust::sort(array_first, array_first + N);
printf("sorted: ");
print_array<<<1, 1>>>();
cudaDeviceSynchronize();
}
Use cudaGetSymbolAddress to take the address of the array variable from a __host__ function:
void* array_ptr = 0;
cudaGetSymbolAddress(&array_ptr, array);
thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));
Here's the complete program:
#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#define N 4
typedef struct element {
int key;
int value;
__host__ __device__ bool operator<(element e) const
{ return key > e.key; }
} element;
__device__ element array[N];
__global__ void init() {
for (int i = 0; i < N; ++i) {
array[N - i - 1].key = i;
}
}
__global__ void print_array() {
for (int i = 0; i < N; ++i) {
printf("%d ", array[i].key);
}
printf("\n");
}
int main(void) {
cudaError_t error;
void* array_ptr = 0;
if(error = cudaGetSymbolAddress(&array_ptr, array))
{
throw thrust::system_error(error, thrust::cuda_category());
}
thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));
init<<<1,1>>>();
printf("unsorted: ");
print_array<<<1, 1>>>();
if(error = cudaDeviceSynchronize())
{
throw thrust::system_error(error, thrust::cuda_category());
}
thrust::sort(array_first, array_first + N);
if(error = cudaDeviceSynchronize())
{
throw thrust::system_error(error, thrust::cuda_category());
}
printf("sorted: ");
print_array<<<1, 1>>>();
if(error = cudaDeviceSynchronize())
{
throw thrust::system_error(error, thrust::cuda_category());
}
return 0;
}
Here's the output on my system:
$ nvcc test.cu -run
unsorted: 3 2 1 0
sorted: 3 2 1 0
The sorted output is the same as the unsorted output, but I guess that is intentional given the way the data is generated and the definition of element::operator<.
This:
__device__ element array[N];
...
thrust::device_ptr<element> array_first(array);
is illegal. In host code, array is a host address and can't be passed to device code. Do something like this instead:
element* array_d;
cudaGetSymbolAddress((void **)&array_d, array);
thrust::device_ptr<element> array_first(array_d);
i.e. you need to use cudaGetSymbolAddress to read the address from the GPU context at runtime, then you can use the result of that call in GPU code.
I'm trying to implement a row-major array, which is basically a single dimension representation of a 2D array.
This is my class definition
class RMA{
public:
RMA(){
size_=0;
row_=0;
column_=0;
arr_ = new double[size_];
}
RMA(int n, int m){
size_ = n*m;
column_ = m;
row_ = n;
if(size_== 0) arr_ = 0;
else arr_ = new double[size_];
}
RMA(const RMA& arr) {
size_ = arr.size_;
if(this != &arr){
delete [] arr_;
arr_ = new double[size_];
for(int i=0; i<size_; i++){
arr_[i] = arr.arr_[i];
}
}
return *this;
}
const double& operator() (int n, int m) const{
return arr_[n*column_+m];
}
double& operator()(int n, int m){
return arr_[n*column_+m];
}
~RMA(){delete[] arr_ ;}
private:
int size_;
int column_;
int row_;
double* arr_;
}
I've a calling function which creates the array.
RMA create_array() {
RMA arr;
arr = RMA(N, M);
std::cout<<"success";
return arr;
}
And this is my client
int main(int argc, char* argv[]) {
RMA arr = create_array();
return 0;
}
I end up getting a segmentation fault. What am I doing wrong?
You are using operations that take a shallow copy of the object instead of cloning the array, so when the destructors run, they both try to release the same memory block.
Implement the following operations:
RMA::RMA(const RMA&); // copy constructor - clone buffer
RMA& operator=(const RMA&); // assignment - clone buffer, release old
Also instead of:
RMA rma;
rma = RMA(a,b);
Use:
RMA rma = RMA(a,b); // or simply: RMA rma(a,b);
Edit: constructor code:
RMA::RMA(const RMA &rma) : size_(0), row_(0), column_(0), arr_(0)
{
*this = rma;
}
RMA& RMA::operator=(const RMA &rma)
{
double *old = arr_;
size_ = rma.size_;
row_ = rma.row_;
column_ = rma.column_;
arr_ = new double[size_];
memcpy(arr_, rma.arr_, sizeof(arr_[0]) * size_); // memcpy comes from <cstring>
delete []old;
return *this;
}
The best solution is to get rid of all the new/delete, copy-constructors, and fluff. Use a private member variable to manage the memory, and follow the Rule of Zero. Like this:
struct RMA
{
RMA(size_t r = 0, size_t c = 0)
: row(r), column(c), arr(r * c) {}
const double& operator() (int n, int m) const
{ return arr[n * column + m]; }
double& operator() (int n, int m)
{ return arr[n * column + m]; }
private:
std::vector<double> arr;
size_t row, column;
};
That's it. You should not write any copy constructor, assignment operator, or move operations, because the default-generated ones already do the right thing.
NB: row is actually redundant in my example too; you could remove it and calculate it when needed as arr.size() / column.
You could use .at() instead of [] on the vector in order to throw an exception for out-of-bounds access, instead of causing undefined behaviour.
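For example, the non-const accessor could become (a sketch):
double& operator() (int n, int m)
{ return arr.at(n * column + m); }   // throws std::out_of_range on bad indices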
I want to create a function that returns a contiguous 2D array in C++.
It is not a problem to create the array using the command:
int (*v)[cols] = new (int[rows][cols]);
However, I am not sure how to return this array as a general type for a function. The function is:
NOT_SURE_WHAT_TYPE create_array(int rows, int cols)
{
int (*v)[cols] = new (int[rows][cols]);
return v;
}
I tried double*[] and double** and both don't work. I wouldn't want to use double*, since I want to access this array from outside as a 2D array.
Related question: How do I declare a 2d array in C++ using new?
If you want to create an array where the data is contiguous and you don't want a 1-dimensional array (i.e. you want to use the [][] syntax), then the following should work. It creates an array of pointers, and each pointer points to a position into a pool of memory.
#include <iostream>
#include <exception>
template <typename T>
T** create2DArray(unsigned nrows, unsigned ncols, const T& val = T())
{
if (nrows == 0)
throw std::invalid_argument("number of rows is 0");
if (ncols == 0)
throw std::invalid_argument("number of columns is 0");
T** ptr = nullptr;
T* pool = nullptr;
try
{
ptr = new T*[nrows]; // allocate pointers (can throw here)
pool = new T[nrows*ncols]{val}; // allocate pool (can throw here)
// now point the row pointers to the appropriate positions in
// the memory pool
for (unsigned i = 0; i < nrows; ++i, pool += ncols )
ptr[i] = pool;
// Done.
return ptr;
}
catch (std::bad_alloc& ex)
{
delete [] ptr; // either this is nullptr or it was allocated
throw ex; // memory allocation error
}
}
template <typename T>
void delete2DArray(T** arr)
{
delete [] arr[0]; // remove the pool
delete [] arr; // remove the pointers
}
int main()
{
try
{
double **dPtr = create2DArray<double>(10,10);
dPtr[0][0] = 10; // for example
delete2DArray(dPtr); // free the memory
}
catch(std::bad_alloc& ex)
{
std::cout << "Could not allocate array";
}
}
Note that only 2 allocations are done. Not only is this more efficient due to the smaller number of allocations, it also gives us a better chance of rolling back the allocated memory if an allocation fails, unlike the "traditional" way of allocating a 2D array in non-contiguous memory:
// The "traditional" non-contiguous allocation of a 2D array (assume N x M)
T** ptr;
ptr = new T*[N];
for (int i = 0; i < N; ++i)
ptr[i] = new T [M]; // <<-- What happens if new[] throws at some iteration?
If new[] throws an exception somewhere during the operation of the for loop, you have to roll back all of the successful calls to new[] that happened previously -- that requires more code and adds complexity.
Note how you deallocate the memory in the contiguous version -- just two calls to delete[] when allocated contiguously instead of a loop calling delete[] for each row.
Also, since the data is in contiguous memory, algorithms, functions, etc. that assume that the data is in contiguous memory, just like a one-dimensional array, can now be used by specifying the start and end range for the M*N matrix:
[&array[0][0], &array[M-1][N])
For example:
std::sort(&myArray[0][0], &myArray[M-1][N]);
will sort the entire matrix in ascending order, starting from index [0][0] up until the last index [M-1][N-1].
You can improve on the design by making this a true class instead of having allocation / deallocation as 2 separate functions.
Edit: The class is not RAII-like, just as the comment says. I leave that as an exercise for the reader. One thing missing from the code above is the check that nRows and nCols are > 0 when creating such an array.
Edit 2: Added a try-catch to ensure a proper roll back of the memory allocation is done if a std::bad_alloc exception is thrown attempting to allocate memory.
Edit: For a 3 dimensional array example of code similar to the above see this answer. Included is code to roll back allocations if the allocation fails.
Edit: Rudimentary RAII class added:
template <typename T>
class Array2D
{
T** data_ptr;
unsigned m_rows;
unsigned m_cols;
T** create2DArray(unsigned nrows, unsigned ncols, const T& val = T())
{
T** ptr = nullptr;
T* pool = nullptr;
try
{
ptr = new T*[nrows]; // allocate pointers (can throw here)
pool = new T[nrows*ncols]{ val }; // allocate pool (can throw here)
// now point the row pointers to the appropriate positions in
// the memory pool
for (unsigned i = 0; i < nrows; ++i, pool += ncols)
ptr[i] = pool;
// Done.
return ptr;
}
catch (std::bad_alloc& ex)
{
delete[] ptr; // either this is nullptr or it was allocated
throw ex; // memory allocation error
}
}
public:
typedef T value_type;
T** data() {
return data_ptr;
}
unsigned get_rows() const {
return m_rows;
}
unsigned get_cols() const {
return m_cols;
}
Array2D() : data_ptr(nullptr), m_rows(0), m_cols(0) {}
Array2D(unsigned rows, unsigned cols, const T& val = T())
{
if (rows == 0)
throw std::invalid_argument("number of rows is 0");
if (cols == 0)
throw std::invalid_argument("number of columns is 0");
data_ptr = create2DArray(rows, cols, val);
m_rows = rows;
m_cols = cols;
}
~Array2D()
{
if (data_ptr)
{
delete[] data_ptr[0]; // remove the pool
delete[] data_ptr; // remove the pointers
}
}
Array2D(const Array2D& rhs) : m_rows(rhs.m_rows), m_cols(rhs.m_cols)
{
data_ptr = create2DArray(m_rows, m_cols);
std::copy(&rhs.data_ptr[0][0], &rhs.data_ptr[m_rows-1][m_cols], &data_ptr[0][0]);
}
Array2D(Array2D&& rhs) noexcept
{
data_ptr = rhs.data_ptr;
m_rows = rhs.m_rows;
m_cols = rhs.m_cols;
rhs.data_ptr = nullptr;
}
Array2D& operator=(Array2D&& rhs) noexcept
{
if (&rhs != this)
{
swap(rhs, *this);
rhs.data_ptr = nullptr;
}
return *this;
}
void swap(Array2D& left, Array2D& right)
{
std::swap(left.data_ptr, right.data_ptr);
std::swap(left.m_cols, right.m_cols);
std::swap(left.m_rows, right.m_rows);
}
Array2D& operator = (const Array2D& rhs)
{
if (&rhs != this)
{
Array2D temp(rhs);
swap(*this, temp);
}
return *this;
}
T* operator[](unsigned row)
{
return data_ptr[row];
}
const T* operator[](unsigned row) const
{
return data_ptr[row];
}
void create(unsigned rows, unsigned cols, const T& val = T())
{
*this = Array2D(rows, cols, val);
}
};
int main()
{
try
{
Array2D<double> dPtr(10, 10);
std::cout << dPtr[0][0] << " " << dPtr[1][1] << "\n";
}
catch (std::exception& ex)
{
std::cout << ex.what();
}
}
Unless the size of the two dimensions is known at compile time, you don't have much choice: allocate a single rows*cols array of ints, and roll your own 2D indexing with integer multiplication and addition. Wrapping this in a class can produce nice-looking syntax for accessing array elements with the square-bracket operator. Since your array is 2D, you will need to use proxy (AKA "surrogate") objects for the first level of data access.
Here is a small sample code that uses std::vector<T> for maintaining a contiguous memory region in dynamic memory:
#include <vector>
using std::vector;   // the sample below uses unqualified vector<T>
template<class T>
class Array2D {
vector<T> data;
size_t cols;
public:
// This is the surrogate object for the second-level indexing
template <class U>
class Array2DIndexer {
size_t offset;
vector<U> &data;
public:
Array2DIndexer(size_t o, vector<U> &dt) : offset(o), data(dt) {}
// Second-level indexing is done in this function
T& operator[](size_t index) {
return data[offset+index];
}
};
Array2D(size_t r, size_t c) : data (r*c), cols(c) {}
// First-level indexing is done in this function.
Array2DIndexer<T> operator[](size_t index) {
return Array2DIndexer<T>(index*cols, data);
}
};
You can now use Array2D<int> as if it were a built-in C++ array:
Array2D<int> a2d(10, 20);
for (int r = 0 ; r != 10 ; r++) {
for (int c = 0 ; c != 20 ; c++) {
a2d[r][c] = r+2*c+1;
}
}
Since you're using C++ and not C, I would recommend using a single vector instead of messing around with new/delete.
You can define one contiguous block of memory like this:
std::vector<int> my_matrix(rows*cols);
And now you can access this vector in a 2D-array-like way with the formula i*n + j, where i is the row index, j the column index, and n the length of a row (here, cols):
my_matrix[i*n + j];
That's the same as accessing a 2D array with array[i][j]. But now you have the advantage of one contiguous block of memory: you don't need to bother with new/delete, and you can easily share and return this vector object to and from functions.
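Putting that together, the function from the question could look roughly like this (a sketch):
#include <vector>

std::vector<int> create_array(int rows, int cols)
{
    return std::vector<int>(rows * cols);   // one contiguous block, value-initialized to 0
}

// element (i, j) of a rows x cols matrix stored row-major:
//     my_matrix[i * cols + j]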
Handling raw memory resources is often icky. Your best shot is a simple wrapper such as:
struct array2D : private std::vector<int>
{
typedef std::vector<int> base_type;
array2D() : base_type(), height_(0), width_(0) {}
array2D(std::size_t h, std::size_t w) : base_type(h*w), height_(h), width_(w) {}
int operator()(std::size_t i, std::size_t j) const
{
return base_type::operator[](i+j*height_);
}
int& operator()(std::size_t i, std::size_t j)
{
return base_type::operator[](i+j*height_);
}
std::size_t rows() const { return height_; }
std::size_t cols() const { return width_; }
private:
std::size_t height_, width_;
};
Private inheritance lets you grab all the goodies from vector; just add your 2D constructor. Resource management comes for free, as the vector ctor/dtor will do their magic. Obviously, the i + j*height_ indexing can be changed to whatever storage order you want.
vector< vector< int > > is 2D but won't be contiguous in memory.
Your function then becomes:
array2D create_array(int rows, int cols)
{
return array2D(rows, cols); // height = rows, width = cols
}
EDIT:
You can also expose other parts of the vector interface, such as begin/end or size, with using declarations, to make the privately inherited member functions public again.
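For example (a sketch of what that could look like inside array2D):
struct array2D : private std::vector<int>
{
    typedef std::vector<int> base_type;
    using base_type::begin;   // re-expose selected parts of the vector interface
    using base_type::end;
    using base_type::size;
    // ... constructors and operator() as above ...
};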
None of the ways of defining a 2D dynamic array in standard C++ are entirely satisfactory in my opinion.
You end up having to roll your own solution. Luckily, Boost already provides one in boost::multi_array:
#include "boost/multi_array.hpp"
template<typename T>
boost::multi_array<T, 2> create_array(int rows, int cols) {
auto dims = boost::extents[rows][cols];
return boost::multi_array<T, 2>(dims);
}
int main() {
auto array = create_array<int>(4, 3);
array[3][2] = 0;
}
The "Rudimentary RAll" class provided by PaulMcKenzie is an excellent solution. In my use of it I did find a memory leak which is fixed in the version shown below.
The memory leak was due to an issue with
Array2D& operator=(Array2D&& rhs) noexcept.
The statement rhs.m_dataPtr = nullPtr needed to be removed in order to allow the rhs destructor to delete the original data (pool and pointers) swapped from lhs.
Here is the corrected code for the "Rudimentary RAll" class provided by PaulMcKenzie
template <typename T>
class Array2D
{
T** data_ptr;
unsigned m_rows;
unsigned m_cols;
T** create2DArray(unsigned nrows, unsigned ncols, const T& val = T())
{
T** ptr = nullptr;
T* pool = nullptr;
try
{
ptr = new T*[nrows]; // allocate pointers (can throw here)
pool = new T[nrows*ncols]{ val }; // allocate pool (can throw here)
// now point the row pointers to the appropriate positions in
// the memory pool
for (unsigned i = 0; i < nrows; ++i, pool += ncols)
ptr[i] = pool;
// Done.
return ptr;
}
catch (std::bad_alloc& ex)
{
delete[] ptr; // either this is nullptr or it was allocated
throw ex; // memory allocation error
}
}
public:
typedef T value_type;
T** data() {
return data_ptr;
}
unsigned get_rows() const {
return m_rows;
}
unsigned get_cols() const {
return m_cols;
}
Array2D() : data_ptr(nullptr), m_rows(0), m_cols(0) {}
Array2D(unsigned rows, unsigned cols, const T& val = T())
{
if (rows == 0)
throw std::invalid_argument("number of rows is 0");
if (cols == 0)
throw std::invalid_argument("number of columns is 0");
data_ptr = create2DArray(rows, cols, val);
m_rows = rows;
m_cols = cols;
}
~Array2D()
{
if (data_ptr)
{
delete[] data_ptr[0]; // remove the pool
delete[] data_ptr; // remove the pointers
}
}
Array2D(const Array2D& rhs) : m_rows(rhs.m_rows), m_cols(rhs.m_cols)
{
data_ptr = create2DArray(m_rows, m_cols);
std::copy(&rhs.data_ptr[0][0], &rhs.data_ptr[m_rows-1][m_cols], &data_ptr[0][0]);
}
Array2D(Array2D&& rhs) noexcept
{
data_ptr = rhs.data_ptr;
m_rows = rhs.m_rows;
m_cols = rhs.m_cols;
rhs.data_ptr = nullptr;
}
Array2D& operator=(Array2D&& rhs) noexcept
{
if (&rhs != this)
{
swap(rhs, *this);
}
return *this;
}
void swap(Array2D& left, Array2D& right)
{
std::swap(left.data_ptr, right.data_ptr);
std::swap(left.m_cols, right.m_cols);
std::swap(left.m_rows, right.m_rows);
}
Array2D& operator = (const Array2D& rhs)
{
if (&rhs != this)
{
Array2D temp(rhs);
swap(*this, temp);
}
return *this;
}
T* operator[](unsigned row)
{
return data_ptr[row];
}
const T* operator[](unsigned row) const
{
return data_ptr[row];
}
void create(unsigned rows, unsigned cols, const T& val = T())
{
*this = Array2D(rows, cols, val);
}
};
int main()
{
try
{
Array2D<double> dPtr(10, 10);
std::cout << dPtr[0][0] << " " << dPtr[1][1] << "\n";
}
catch (std::exception& ex)
{
std::cout << ex.what();
}
}
I think you should write a simple class to wrap a 1-dimensional array. Then you can implement a 2-dimensional array with operator() overloading for element access, and a destructor to release the memory. Code as below:
#include <assert.h>
template <typename T>
class Array_2D
{
private:
T *data_inside;
public:
int size[2];
Array_2D(int row, int column);
~Array_2D();
// return a reference so elements can be read and written through operator()
T& operator()(int index1, int index2){
return data_inside[get_index(index1, index2)];
}
int get_index(int index1, int index2){
if(index1 >= 0 && index1 < size[0] && index2 >= 0 && index2 < size[1]){
// row-major layout: row index * number of columns + column index
return index1*size[1] + index2;
}else{
assert("wrong index for array!" == "True"); // always fails, flags an out-of-range access
return -1; // not reached when assertions are enabled
}
}
};
template <typename T>
Array_2D<T>::Array_2D(int row, int column)
{
size[0] = row;
size[1] = column;
data_inside = new T[row*column];
}
template <typename T>
Array_2D<T>::~Array_2D()
{
// the destructor automatically releases the memory
delete[] data_inside;
}
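A short usage sketch (my addition, assuming operator() returns a reference as above):
int main()
{
    Array_2D<double> a(3, 4);   // 3 rows, 4 columns
    a(1, 2) = 5.0;              // write through the reference returned by operator()
    return a(1, 2) == 5.0 ? 0 : 1;
}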