c++ cuda: cudaMallocManaged access outside of constructor

c++ cuda: cudaMallocManaged access outside of constructor - c++

I have a c++ class that uses cudaMallocManaged like so:
// Constructor: records the matrix dimensions and padding, then requests
// new_rows * new_cols floats of CUDA managed memory.
// NOTE(review): the address returned by cudaMallocManaged is written into the
// LOCAL pointer `data` declared inside this function. That local has the same
// name as the class member `data`, so the member itself is never assigned here.
MyMatrix::MyMatrix(int new_rows, int new_cols, int padrr, int padcc)
{
rows = new_rows;
cols = new_cols;
padr = padrr;
padc = padcc;
cout << "allocating memory" << endl;
// Local variable; it goes out of scope when the constructor returns.
float *data;
cudaError_t cudaStatus = cudaMallocManaged(&data, new_rows*new_cols*sizeof(float));
// Any allocation failure prints the numeric status code and terminates the program.
if (cudaStatus != cudaSuccess){
cout << cudaStatus << endl << flush;
exit(1);
}
cudaDeviceSynchronize();
cout << "allocating memory successful:" << cudaStatus << endl;
// I CAN ACCESS DATA HERE
//data[15] = 5.5; //fine
}
// Destructor: hands the class member `data` to cudaFree.
// NOTE(review): this is only valid if the constructor stored the allocation in
// the member; the constructor shown with this snippet stores it in a local.
MyMatrix::~MyMatrix(void)
{
cudaFree(data); // release the buffer that the member `data` points to
}
I have a header .h file too:
// Matrix of floats whose element buffer is obtained in the constructor and
// released with cudaFree in the destructor.
//
// NOTE(review): the class owns a raw pointer but declares no copy constructor
// or copy assignment, so copying a MyMatrix would make two objects free the
// same buffer. Avoid copies (pass by reference/pointer) unless that is added.
class MyMatrix
{
public:
    // new_rows / new_cols: matrix dimensions; the buffer holds
    //                      new_rows * new_cols floats.
    // padr / padc:         stored as-is in the members of the same name
    //                      (presumably row/column padding -- confirm with the
    //                      code that consumes them).
    MyMatrix(int new_rows, int new_cols, int padr, int padc);
    ~MyMatrix(void);

    // Element buffer. Starts out null so that the destructor's cudaFree is
    // harmless even if the constructor never manages to assign it.
    float *data = nullptr;
    int padr;
    int padc;
    int rows;
    int cols;
};  // the terminating semicolon is required after a class definition
I can access the data array fine within this constructor.
However, as soon as I try to access it (read or write) outside of it, I get terminated by signal SIGSEGV (Address boundary error). E.g.,:
MyMatrix *newmat = new MyMatrix(totalr, totalc, padr, padc);
cout << (*newmat).data[0] << endl;
Or
MyMatrix newmat = new MyMatrix(totalr, totalc, padr, padc);
cout << newmat.data[0] << endl;
How can I "persist" this pointer?

In your constructor you create a local variable called data:
float *data;
After allocating the cuda memory and assigning the value to the local variable data, the memory address does not seem to be stored anywhere permanently. So, your memory becomes unreachable.
You access newmat.data, but in the constructor you never assigned any value to the member data. You only used the local variable with the same name.

Your problem is that in
// The constructor as posted in the question, quoted unchanged.
MyMatrix::MyMatrix(int new_rows, int new_cols, int padrr, int padcc)
{
rows = new_rows;
cols = new_cols;
padr = padrr;
padc = padcc;
cout << "allocating memory" << endl;
// This local declaration hides the class member of the same name for the rest
// of the function body.
float *data;
// &data therefore refers to the local above, not to the member.
cudaError_t cudaStatus = cudaMallocManaged(&data, new_rows*new_cols*sizeof(float));
if (cudaStatus != cudaSuccess){
cout << cudaStatus << endl << flush;
exit(1);
}
cudaDeviceSynchronize();
cout << "allocating memory successful:" << cudaStatus << endl;
// I CAN ACCESS DATA HERE
//data[15] = 5.5; //fine
}
the data you use in cudaMallocManaged(&data, new_rows*new_cols*sizeof(float)) is the float *data; you declared in the line above, not the data member of your class. You just need to get rid of the local float *data; so you use the data class member like
// Constructs a new_rows x new_cols matrix backed by CUDA managed memory.
//
// The allocation is written directly into the class member `data` (there is
// no local variable of that name any more), so the buffer stays reachable
// through the object after the constructor returns.
//
// new_rows, new_cols: matrix dimensions; both must be positive.
// padrr, padcc:       stored unchanged in the members padr / padc.
//
// On any failure a diagnostic is printed and the process exits with status 1.
MyMatrix::MyMatrix(int new_rows, int new_cols, int padrr, int padcc)
{
    rows = new_rows;
    cols = new_cols;
    padr = padrr;
    padc = padcc;
    data = nullptr;  // well-defined value for the destructor on every path

    cout << "allocating memory" << endl;

    // Reject sizes that cannot describe a matrix before they are turned into
    // a byte count.
    if (new_rows <= 0 || new_cols <= 0) {
        cout << "invalid matrix size: " << new_rows << " x " << new_cols
             << endl << flush;
        exit(1);
    }

    // Do the multiplication in size_t: new_rows * new_cols evaluated in int
    // overflows for large matrices before sizeof(float) widens it.
    const size_t bytes = static_cast<size_t>(new_rows) *
                         static_cast<size_t>(new_cols) * sizeof(float);

    cudaError_t cudaStatus = cudaMallocManaged(&data, bytes);
    if (cudaStatus != cudaSuccess) {
        // Print the readable description next to the numeric code.
        cout << cudaStatus << " (" << cudaGetErrorString(cudaStatus) << ")"
             << endl << flush;
        exit(1);
    }

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        cout << cudaStatus << " (" << cudaGetErrorString(cudaStatus) << ")"
             << endl << flush;
        exit(1);
    }

    cout << "allocating memory successful:" << cudaStatus << endl;
    // `data` is the class member, so it is usable here and through the object:
    //data[15] = 5.5f;
}

Related

C++ Deep and Shallow Copy

I need to include shallow copy constructor and I'm completely lost. I thought that the compiler provided a default shallow copy constructor but I have to provide one as well but I'm not sure how to write it. I tried writing it similar to the WrapArrayDeep copy constructor without the pointers but that didn't work. After altering the array both arrays for WrapArrayShallow should be empty.
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <string>
#pragma warning(disable: 4996)
using namespace std;
// Wrapper around a heap-allocated char array with deep-copy semantics: every
// WrapArrayDeep owns its own buffer, so changing one object never affects a
// copy made from it.
class WrapArrayDeep
{
public:
    // Creates a 5-element array holding the characters with codes 97..101
    // ('a'..'e' in ASCII).
    WrapArrayDeep()
    {
        capacity = 5;
        pca = new char[capacity];
        for (int i = 0; i < capacity; i++)
            pca[i] = static_cast<char>(97 + i);
    } //ends default constructor

    // Deep copy: allocates a fresh buffer and copies every element of `wad`.
    WrapArrayDeep(const WrapArrayDeep& wad)
    {
        capacity = wad.getCapacity();
        pca = new char[capacity];
        for (int i = 0; i < capacity; i++)
            pca[i] = wad.pca[i];
    } //ends copy constructor

    // Announces itself on cout, then releases the owned buffer.
    ~WrapArrayDeep()
    {
        cout << "destructor for WrapArrayDeep!\n";
        delete [] pca;
    } //ends destructor

    // Prints every element followed by a space, then a newline.
    void printArr() const
    {
        for (int i = 0; i < capacity; i++)
            cout << pca[i] << " ";
        cout << endl;
    } //ends print

    // Overwrites element i with the character whose code is 123 + i.
    void alterArr()
    {
        for (int i = 0; i < capacity; i++)
            pca[i] = static_cast<char>(123 + i);
    }

    // Number of elements in the owned array.
    int getCapacity() const
    {
        return capacity;
    }

    // Deep-copy assignment. Returns *this.
    WrapArrayDeep& operator =(const WrapArrayDeep& wad)
    {
        if (this == &wad)
            return *this;  // nothing to copy when assigning to itself

        if (capacity != wad.capacity)
        {
            // Allocate the replacement BEFORE releasing the old buffer: if
            // new[] throws, this object still owns a valid array instead of
            // a dangling pointer that the destructor would delete again.
            char *replacement = new char[wad.capacity];
            delete [] pca;
            pca = replacement;
            capacity = wad.capacity;
        }
        for (int i = 0; i < capacity; i++)
            pca[i] = wad.pca[i];
        return *this;
    } //end of = operator overload

private:
    int capacity;  // number of elements in pca
    char *pca;     // owned array, released in the destructor
};
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Wrapper around a heap-allocated char array that declares no copy
// constructor of its own, so copies are produced by the compiler-generated
// member-wise copy: the copy's pointer refers to the same array as the
// original's.
class WrapArrayShallow
{
public:
    // Allocates five chars and fills them with the codes 97, 98, ... 101.
    WrapArrayShallow()
        : capacity(5), pca(new char[5])
    {
        char code = 97;
        for (char *slot = pca; slot != pca + capacity; ++slot)
            *slot = code++;
    } //ends default constructor

    // Prints a notice, then frees the array this object points to.
    ~WrapArrayShallow()
    {
        cout << "destructor for WrapArrayShallow!\n";
        delete [] pca;
    } //ends destructor

    // Writes each element followed by one space (no trailing newline).
    void printArr()
    {
        int idx = 0;
        while (idx < capacity)
        {
            cout << pca[idx] << " ";
            ++idx;
        }
    }

    // Replaces element i with the character whose code is 123 + i.
    void alterArr()
    {
        for (int offset = 0; offset != capacity; ++offset)
            *(pca + offset) = (123 + offset);
    }

    // Element count of the array.
    int getCapacity() const
    {
        return capacity;
    }

private:
    int capacity;
    char *pca;
};
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Demo driver.
// Part 1 prints an int, a pointer to it and a pointer to that pointer.
// Part 2 copies a WrapArrayDeep and a WrapArrayShallow, alters the originals
// and prints all four objects so the two copy behaviours can be compared.
int main()
{
    //~~~~~~~~Part 1~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    cout << "Part 1\n";
    int i = 7;
    int *pi;
    pi = &i;
    cout << "i = " << i << endl << "pi = " << pi << endl << "*pi = " << *pi << endl;
    cout << "address of i = " << &i << endl << "address of pi = " << &pi << endl;
    int **ppi;
    // Was rendered as "ppi = π": the text "&pi;" had been swallowed as an
    // HTML entity. The statement takes the address of the pointer pi.
    ppi = &pi;
    cout << "ppi = " << ppi << endl << "*ppi = " << *ppi << endl;
    cout << "address of ppi = " << &ppi << endl << "**ppi = " <<**ppi <<endl;
    cout << endl << "~~~~~~~~~~~~~~~~~~~~~~~~~~~";

    //~~~~~~~~Part 2~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    cout << "\nPart 2\n";
    cout << "This section instantiates a wrapper class for a dynamic array of 5 elements. \n";
    WrapArrayDeep wad1, *wad2;
    cout <<"WrapArray Deep 1\n";
    wad1.printArr();
    //deep copy of wad1
    wad2 = new WrapArrayDeep(wad1);
    cout << "WrapArrayDeep 2 ~ copy constructor \n";
    wad2->printArr();
    wad1.alterArr();
    cout << "after altering WrapArrayDeep1, 1 and 2 = \n";
    wad1.printArr();
    wad2->printArr();

    WrapArrayShallow was1, *was2;
    cout << "WrapArrayShallow1\n";
    was1.printArr();
    //shallow copy of was1 (uses whatever copy constructor WrapArrayShallow provides)
    was2 = new WrapArrayShallow(was1);
    cout << "\nWrapArrayShallow2\n";
    was2->printArr();
    was1.alterArr();
    cout << "\nafter altering WrapArrayShallow1, 1 and 2 = \n";
    was1.printArr();
    cout << endl;
    was2->printArr();
    cout << endl;

    delete wad2;
    // NOTE(review): if *was2 shares its array with was1 (member-wise copy),
    // this delete frees the array that was1's destructor frees again when
    // main returns. Left as posted because that behaviour is the subject of
    // the exercise.
    delete was2;
    cout << endl;
    system("pause");
    return 0;
}

At a high level: a deep copy duplicates every member, including dynamically allocated storage, so the destination object gets its own independent copy, while a shallow copy only copies the pointer values, leaving both objects pointing at the same dynamic storage. You might have a good read
here

When you copy a WrapArrayShallow, there are now two pointers pointing to the same array, and when either WrapArrayShallow is destroyed, you delete[] the array.
Your program exhibits undefined behaviour, it uses a pointer that has been invalidated, when the second WrapArrayShallow is destroyed, and you delete[] pca.
You need to co-ordinate between the different WrapArrayShallow objects such that the last survivor delete[]s the array when it is destroyed. The simplest way is to use a std::shared_ptr<char[]> instead of a char *

Wrong output for an array of a class

So, I have been provided the following classes along with the member variables and methods ( the constructors included . NO extra variables and methods should be added ). I wrote the following codes for the methods provided:
Container Class
# define INTEGER 1
# define INT_ARRAY 2
# define INT_MATRIX 3
// Holds exactly one of: a single int, an int array, or an int matrix.
// The pointer that is in use is non-NULL and the other two are NULL;
// storedType records which one it is (INTEGER, INT_ARRAY, INT_MATRIX, or -1
// when the container is empty).
class Container{
// Heap-allocated single value (used when storedType == INTEGER).
int *value;
// Heap-allocated array of firstDim ints (used when storedType == INT_ARRAY).
int *valueArray;
// firstDim row pointers, each to secondDim ints (used when storedType == INT_MATRIX).
int **valueMatrix;
int firstDim, secondDim;
int storedType;
// Frees whichever of the three buffers is non-NULL and returns the object to
// the empty state (all pointers NULL, dimensions 0, storedType -1).
void reset(){
if (value != NULL){
delete value;
value=NULL;
}
if (valueArray != NULL){
delete[] valueArray;
valueArray=NULL;
}
if (valueMatrix != NULL){
// Rows are freed first, then the array of row pointers.
for(int i=0;i<firstDim;i++){
delete[] valueMatrix[i];
}
delete[] valueMatrix;
valueMatrix=NULL;
}
firstDim = 0;
secondDim = 0;
storedType = -1;
}
public:
// Creates an empty container and prints a two-line banner.
Container(){
cout << "Constructing Container with empty parameter" << endl;
cout << "___________________________________________" << endl;
value = NULL;
valueArray = NULL;
valueMatrix = NULL;
firstDim = 0;
secondDim = 0;
storedType = -1;
}
// Creates a container holding a copy of the single integer `val`.
Container (int val){
cout << "Constructing Container with a single integer parameter" << endl;
cout << "______________________________________________________" << endl;
value=new int(val);
valueArray = NULL;
valueMatrix = NULL;
firstDim = 0;
secondDim = 0;
storedType = INTEGER;
}
// Creates a container holding its own copy of the `len` ints at `valArr`.
Container (int *valArr, int len){
cout << "Constructing Container with integer array parameter" << endl;
cout << "___________________________________________________" << endl;
valueArray=new int[len];
value=NULL;
valueMatrix=NULL;
for(int i=0;i<len;i++) valueArray[i]=valArr[i];
firstDim = len;
secondDim = 0;
storedType = INT_ARRAY;
}
// Creates a container holding its own copy of the r x c matrix at `valMat`.
Container (int **valMat, int r, int c){
cout << "Constructing Container with integer matrix parameter" << endl;
cout << "____________________________________________________" << endl;
valueMatrix=new int*[r];
value=NULL;
valueArray=NULL;
for(int i=0;i<r;i++){
valueMatrix[i]=new int[c];
for(int j=0;j<c;j++){
valueMatrix[i][j]=valMat[i][j];
}
}
firstDim=r;
secondDim=c;
storedType=INT_MATRIX;
}
// Copy constructor.
// NOTE(review): each `new` result below is immediately overwritten by the
// corresponding pointer taken from `obj`. The freshly allocated memory is
// therefore lost, and this object ends up pointing at the same storage as
// `obj`, which both destructors will later try to free.
Container(const Container &obj){
cout << "Calling copy constructor of Container" << endl;
cout << "_____________________________________" << endl;
firstDim=obj.firstDim;
secondDim=obj.secondDim;
storedType=obj.storedType;
value=new int;
// Pointer assignment, not a copy of the pointed-to int.
value=obj.value;
valueArray=new int[firstDim];
// Pointer assignment, not an element-wise copy.
valueArray=obj.valueArray;
valueMatrix=new int*[firstDim];
for(int k=0;k<obj.firstDim;k++){
valueMatrix[k]=new int[secondDim];
}
// Pointer assignment; the rows allocated in the loop above become unreachable.
valueMatrix=obj.valueMatrix;
}
// Replaces the current content with a copy of the single integer `val`.
void setItem (int val){
reset();
value=new int(val);
firstDim = 0;
secondDim = 0;
storedType = INTEGER;
valueArray=NULL;
valueMatrix=NULL;
}
// Replaces the current content with a copy of the `len` ints at `valArr`.
void setItem(int *valArr, int len){
reset();
valueArray=new int[len];
for(int i=0;i<len;i++) valueArray[i]=valArr[i];
firstDim = len;
secondDim = 0;
storedType = INT_ARRAY;
value=NULL;
valueMatrix=NULL;
}
// Replaces the current content with a copy of the r x c matrix at `valMat`.
void setItem(int **valMat, int r, int c){
reset();
valueMatrix=new int*[r];
for(int i=0;i<r;i++){
valueMatrix[i]=new int[c];
for(int j=0;j<c;j++){
valueMatrix[i][j]=valMat[i][j];
}
}
firstDim=r;
secondDim=c;
storedType=INT_MATRIX;
value=NULL;
valueArray=NULL;
}
// Returns the first non-NULL buffer (checked in the order value, valueArray,
// valueMatrix) as an untyped pointer, or NULL when the container is empty.
// The caller casts it according to getStoredType().
void * getItem(){
if (value != NULL) return value;
if (valueArray != NULL) return valueArray;
if (valueMatrix != NULL) return valueMatrix;
return NULL;
}
int getFirstDim(){
return firstDim;
}
int getSecondDim(){
return secondDim;
}
int getStoredType(){
return storedType;
}
// Prints a description of the stored content followed by the values; which
// branch runs is decided by which pointer is non-NULL, not by storedType.
void print(){
if (value != NULL){
cout << "There is only an integer value in the container object" << endl;
cout << "The value is: " << *value << endl;
}
else if (valueArray != NULL){
cout << "There is an integer array in the container object" << endl;
cout << "The values stored in the array are:" << endl;
for (int i=0; i<firstDim; i++){
cout << valueArray[i] << " ";
}
cout << endl;
}
else if (valueMatrix != NULL){
cout << "There is an integer matrix in the container object" << endl;
cout << "The values stored in the matrix are:" << endl;
for (int i=0; i<firstDim; i++){
for (int j=0; j<secondDim; j++){
cout << valueMatrix[i][j] << " ";
}
cout << endl;
}
}
else{
cout << "The object has no elements" << endl;
}
}
// Frees every non-NULL buffer, printing one line per buffer freed, then
// prints the closing banner.
~Container(){
if (value != NULL){
cout << "Freeing allocated memory for a single integer" << endl;
delete value;
value = NULL;
}
if (valueArray != NULL){
cout << "Freeing allocated memory for integer array" << endl;
delete[] valueArray;
valueArray = NULL;
}
if (valueMatrix != NULL){
cout << "Freeing allocated memory for integer matrix" << endl;
for(int i=0;i<firstDim;i++){
delete[] valueMatrix[i];
}
delete[] valueMatrix;
valueMatrix = NULL;
}
firstDim = 0;
secondDim = 0;
storedType = -1;
cout << "_____________________" << endl;
cout << "Destructing Container" << endl;
}
};
ContainerArray Class
// Fixed-size array of Container objects allocated with new[].
class ContainerArray{
    Container *arrayOfContainers;  // owned array, NULL when nothing is allocated
    int allocatedSize;             // number of elements in arrayOfContainers
public:
    // Creates an array with no storage.
    ContainerArray(){
        allocatedSize = 0;
        arrayOfContainers = NULL;
    }

    // Creates `size` default-constructed (empty) containers.
    ContainerArray(int size){
        allocatedSize=size;
        arrayOfContainers=new Container[size];
    }

    // Discards the current elements and allocates `sz` empty containers.
    void setAllocatedSize(int sz){
        // delete[] on NULL is a no-op, so no guard is needed. Freeing
        // unconditionally also releases the zero-length array created by
        // ContainerArray(0), which a test on allocatedSize would skip.
        delete[] arrayOfContainers;
        allocatedSize=sz;
        arrayOfContainers=new Container[sz];
    }

    int getAllocatedSize(){
        return allocatedSize;
    }

    // Returns a copy of the element at `index`.
    // An index outside [0, allocatedSize) prints a message and ends the program.
    Container getItemAt(int index){
        // Negative indices are rejected as well; they would read memory in
        // front of the array.
        if (index < 0 || index >= allocatedSize){
            cout << "Cannot get item, Exception: Container Array index out of bound";
            exit(0);
        }
        return arrayOfContainers[index];
    }

    // Stores a copy of the content of `p` in slot `x`.
    // An index outside [0, allocatedSize) only prints a message. A container
    // whose stored type is none of 1, 2, 3 leaves the slot untouched.
    void setItemAt(Container p,int x){
        if(x < 0 || x >= allocatedSize) cout << "Exception: Container Array index out of bound";
        else{
            const int type = p.getStoredType();
            // 1 = single integer, 2 = integer array, 3 = integer matrix
            if(type == 1) arrayOfContainers[x].setItem(*(int*)p.getItem());
            else if(type == 2) arrayOfContainers[x].setItem((int*)p.getItem(),p.getFirstDim());
            else if(type == 3) arrayOfContainers[x].setItem((int**)p.getItem(),p.getFirstDim(),p.getSecondDim());
        }
    }

    // Destroys every element and releases the array.
    ~ContainerArray(){
        delete[] arrayOfContainers;
        allocatedSize=0;
        arrayOfContainers=NULL;
    }
};
main() function
// Demo driver: builds one container of each kind, stores some of them in two
// ContainerArray objects and prints the stored elements.
int main()
{
    Container a;
    Container b(100);

    int *arr = new int[3];
    arr[0] = 10;
    arr[1] = 20;
    arr[2] = 30;
    Container c(arr, 3);

    int **mat = new int*[2];
    mat[0] = new int[3];
    mat[0][0] = 1;
    mat[0][1] = 2;
    mat[0][2] = 3;
    mat[1] = new int[3];
    mat[1][0] = 4;
    mat[1][1] = 5;
    mat[1][2] = 6;
    Container d(mat, 2, 3);

    int firstObjArraySize = 3, secondObjArraySize = 4;
    ContainerArray containerArray1;
    ContainerArray containerArray2(secondObjArraySize);
    cout << secondObjArraySize << " constructors with empty parameters are called" << endl;
    containerArray1.setAllocatedSize(firstObjArraySize);
    cout << firstObjArraySize << " constructors with empty parameters are called" << endl;

    containerArray1.setItemAt(a, 0);
    containerArray1.setItemAt(b, 2);
    containerArray1.setItemAt(c, 1);
    containerArray2.setItemAt(c, 0);
    containerArray2.setItemAt(d, 1);

    for (int i=0; i<3; i++){
        cout << i << "-th element of 1st container array:" << endl;
        containerArray1.getItemAt(i).print();
    }
    for (int i=0; i<2; i++){
        cout << i << "-th element of 2nd container array:" << endl;
        containerArray2.getItemAt(i).print();
    }

    // The array/matrix constructors of Container copy the values element by
    // element, so these input buffers still belong to main and were
    // previously leaked.
    delete[] arr;
    delete[] mat[0];
    delete[] mat[1];
    delete[] mat;
    return 0;
}
The Problem:
The object containerArray1 prints correctly. But the array initialized in the main() function is used again in the object containerArray2. Because the destructor frees the memory of the previous object, the array's contents are not printed as expected, although the other contents are. I suspect the problem is in my dynamic memory allocation, but I can't work out where. Any help would be appreciated.
Thanks ...
EXPECTED OUTPUT
Output for the containerArray1 object :
0-th element of 1st container array:
Calling copy constructor of Container
_____________________________________
The object has no elements
_____________________
Destructing Container
1-th element of 1st container array:
Calling copy constructor of Container
_____________________________________
There is an integer array in the container object
The values stored in the array are:
10 20 30
Freeing allocated memory for integer array
_____________________
Destructing Container
2-th element of 1st container array:
Calling copy constructor of Container
_____________________________________
There is only an integer value in the container object
The value is: 100
Freeing allocated memory for a single integer
_____________________
Destructing Container
Output for containerArray2 object :
0-th element of 2nd container array:
Calling copy constructor of Container
_____________________________________
There is an integer array in the container object
The values stored in the array are:
10 20 30
Freeing allocated memory for integer array
_____________________
Destructing Container
1-th element of 2nd container array:
Calling copy constructor of Container
_____________________________________
There is an integer matrix in the container object
The values stored in the matrix are:
1 2 3
4 5 6
Freeing allocated memory for integer matrix
_____________________
Destructing Container
The problem lies in the 0th element of the 2nd container array: the array prints garbage values.

Your copy-constructor does not copy the values but just bends the pointers.
You allocated the memory but then you overwrite the pointer to the allocated memory.
So instead of this
value=new int;
value=obj.value;
valueArray=new int[firstDim];
valueArray=obj.valueArray;
valueMatrix=new int*[firstDim];
for(int k=0;k<obj.firstDim;k++){
valueMatrix[k]=new int[secondDim];
}
valueMatrix=obj.valueMatrix;
You need to do something like this:
// Start with every pointer null so that only the member matching storedType
// ends up owning memory.
value = nullptr;
valueArray = nullptr;
valueMatrix = nullptr;
storedType = obj.storedType;
// NOTE(review): firstDim and secondDim are read below but not assigned in
// this fragment; it assumes the constructor has already copied them from obj.
switch (storedType)
{
case INTEGER:
value = new int;
// Copy the pointed-to int, not the pointer.
*value = *(obj.value);
break;
case INT_ARRAY:
valueArray = new int[firstDim];
// Element-wise copy into the newly allocated array.
std::copy(obj.valueArray, obj.valueArray + firstDim, valueArray);
break;
case INT_MATRIX:
valueMatrix = new int* [firstDim];
// Allocate and fill each row separately.
for (int k = 0; k < obj.firstDim; k++) {
valueMatrix[k] = new int[secondDim];
std::copy(obj.valueMatrix[k], obj.valueMatrix[k] + secondDim, valueMatrix[k]);
}
break;
}
Since you are only dealing with int values, you could use memcpy instead of std::copy, but I thought I'd mention the general case.

increase performance of opencl

I am trying to implement an image processing algorithm using OpenCL. But as I see it, when I use OpenCL it takes around 0.5 ms to complete one process, i.e. one frame. Is there a way to initialize the OpenCL parameters only once, when the class object is declared, and then only call a function that runs the main kernel? I tried to do this by creating a class, but as far as I can tell the context and device can't be declared and used separately and need to be created each time.
#include <CL/cl.hpp>
#include <chrono>
#include <iostream>
#include <string>
#include <vector>
using namespace std::chrono;
using namespace std;
// ANSI terminal colour helpers: streaming a Modifier writes the escape
// sequence ESC '[' <number> 'm' for the code it was built with.
namespace Color {

// Numeric codes placed between "\033[" and "m".
enum Code {
    FG_RED = 31,
    FG_GREEN = 32,
    FG_BLUE = 34,
    FG_DEFAULT = 39,
    BG_RED = 41,
    BG_GREEN = 42,
    BG_BLUE = 44,
    BG_DEFAULT = 49
};

// Stream manipulator carrying a single Code.
class Modifier {
public:
    // Implicit conversion from Code is kept on purpose.
    Modifier(Code pCode) : selected(pCode) {}

    // Emits the escape sequence for `mod` and returns the stream.
    friend std::ostream& operator<<(std::ostream& os, const Modifier& mod) {
        os << "\033[";
        os << static_cast<int>(mod.selected);  // the enum is printed as its number
        os << "m";
        return os;
    }

private:
    Code selected;
};

} // namespace Color
// Bundles the OpenCL objects used by the demo. As posted in the question it
// holds the platform/device selection, the kernel source, a kernel object and
// one buffer. The context, program, queue and the other buffers are created
// as locals inside backgroundSub().
class useOpenCL {
public:
// Number of int elements processed per call.
int size = 294400;
std::vector<cl::Platform> all_platforms;
std::vector<cl::Device> all_devices;
// First entry of all_platforms / all_devices, chosen in the constructor.
cl::Platform default_platform;
cl::Device default_device;
// Source list handed to cl::Program; it refers to kernel_code's characters.
cl::Program::Sources sources;
std::string kernel_code;
// NOTE(review): no code shown with this class assigns a kernel built from a
// program to this member before it is used.
cl::Kernel kernel_add;
// NOTE(review): backgroundSub() declares a local of the same name, so this
// member is not the buffer that function uses.
cl::Buffer buffer_A;
useOpenCL();
~useOpenCL() {}
void backgroundSub();
};
// Selects the first OpenCL platform and its first device, reports both on
// stdout, and registers the source text of the simple_add kernel.
// Terminates the program with status 1 when no platform or device exists.
useOpenCL::useOpenCL() {
    const Color::Modifier green(Color::FG_GREEN);
    const Color::Modifier red(Color::FG_RED);
    const Color::Modifier def(Color::FG_DEFAULT);

    // Enumerate the installed platforms (drivers).
    cl::Platform::get(&all_platforms);
    if (all_platforms.empty()) {
        std::cout << red << " No platforms found. Check OpenCL installation!"
                  << def << std::endl;
        exit(1);
    }
    default_platform = all_platforms.front();
    std::cout << green << "Using platform: " << def
              << default_platform.getInfo<CL_PLATFORM_NAME>() << std::endl;

    // Enumerate every device type of the chosen platform.
    default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
    if (all_devices.empty()) {
        std::cout << red << " No devices found. Check OpenCL installation!"
                  << def << std::endl;
        exit(1);
    }
    default_device = all_devices.front();
    std::cout << green << "Using device: " << def
              << default_device.getInfo<CL_DEVICE_NAME>() << std::endl;

    // Kernel text: for each work-item index, C = A + B.
    kernel_code =
        " void kernel simple_add(global const int* A, global const int* B, "
        "global int* C){ "
        " C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; "
        " "
        " } "
        " ";

    // The source list keeps a pointer into kernel_code, which is a member and
    // therefore stays alive as long as this object does.
    const char* text = kernel_code.c_str();
    const size_t text_length = kernel_code.length();
    sources.push_back({text, text_length});
}
void useOpenCL::backgroundSub() {
int A[size], B[size];
for (int i = 0; i < size; i++) {
A[i] = i;
B[i] = i + 1;
}
auto start1 = high_resolution_clock::now();
cl::Context context({default_device});
cl::Program program(context, sources);
if (program.build({default_device}) != CL_SUCCESS) {
std::cout << " Error building: "
<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)
<< "\n";
exit(1);
}
// create buffers on the device
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * size);
cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * size);
cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * size);
// create queue to which we will push commands for the device.
cl::CommandQueue queue(context, default_device);
// write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);
// run the kernel
/*cl::KernelFunctor
simple_add(cl::Kernel(program,"simple_add"),queue,cl::NullRange,cl::NDRange(10),cl::NullRange);
simple_add(buffer_A,buffer_B,buffer_C);*/
// alternative way to run the kernel
kernel_add.setArg(0, buffer_A);
kernel_add.setArg(1, buffer_B);
kernel_add.setArg(2, buffer_C);
queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(size),
cl::NullRange);
queue.finish();
int C[size];
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * size, C);
/*std::cout<<" result: \n";
for(int i=0;i<size;i++){
std::cout<<C[i]<<"\t";
}*/
auto stop1 = high_resolution_clock::now();
auto duration1 = duration_cast<microseconds>(stop1 - start1);
auto FPS = 1000000.0 / duration1.count();
cout << "Segmentation FPS=" << FPS << "\t"
<< "Execution Time(sec)=" << duration1.count() / 1000000.0 << endl;
}
// Creates the OpenCL helper once, then processes frames in an endless loop.
int main() {
    useOpenCL img;
    for (;;) {
        img.backgroundSub();
    }
    return 0;  // never reached; the loop above has no exit
}
It is giving me below results:
Segmentation FPS=13.2557 Execution Time(sec)=0.075439
Segmentation FPS=15.7602 Execution Time(sec)=0.063451
Segmentation FPS=14.3872 Execution Time(sec)=0.069506
Segmentation FPS=12.7525 Execution Time(sec)=0.078416
Which is not good since fps is only 12, 13 fps. So how can i make this program faster?

Put the initialization part that you only need to call once in the beginning in the constructor. This initialization should contain ALL memory allocation, OpenCL C code compilation and any initial memory transfers from host to device:
// One-time OpenCL setup. Everything expensive happens here exactly once:
// platform/device selection, context creation, program compilation, kernel
// creation, command-queue creation, device-buffer allocation, the initial
// host->device transfers and the kernel-argument binding.
// backgroundSub() then only has to enqueue the kernel and read the result.
//
// Requires context, program, queue, buffer_A, buffer_B and buffer_C to be
// members of useOpenCL. Terminates the program with status 1 if no
// platform/device is found or the kernel fails to build.
useOpenCL::useOpenCL() {
    Color::Modifier green(Color::FG_GREEN);
    Color::Modifier red(Color::FG_RED);
    Color::Modifier def(Color::FG_DEFAULT);

    // get all platforms (drivers)
    cl::Platform::get(&all_platforms);
    if (all_platforms.size() == 0) {
        std::cout << red << " No platforms found. Check OpenCL installation!" << def
                  << endl;
        exit(1);
    }
    default_platform = all_platforms[0];
    std::cout << green << "Using platform: " << def
              << default_platform.getInfo<CL_PLATFORM_NAME>() << std::endl;

    // get default device of the default platform
    default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
    if (all_devices.size() == 0) {
        std::cout << red << " No devices found. Check OpenCL installation!" << def
                  << endl;
        exit(1);
    }
    default_device = all_devices[0];
    std::cout << green << "Using device: " << def
              << default_device.getInfo<CL_DEVICE_NAME>() << std::endl;

    // kernel calculates for each element C=A+B
    kernel_code =
        " void kernel simple_add(global const int* A, global const int* B, "
        "global int* C){ "
        " C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; "
        " "
        " } "
        " ";
    sources.push_back({kernel_code.c_str(), kernel_code.length()});

    context = cl::Context({default_device});
    program = cl::Program(context, sources);
    if (program.build({default_device}) != CL_SUCCESS) {
        std::cout << " Error building: "
                  << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)
                  << "\n";
        exit(1);
    }

    // The kernel object has to be created from the built program before any
    // setArg / enqueue call. Without this line kernel_add stays a
    // default-constructed cl::Kernel.
    kernel_add = cl::Kernel(program, "simple_add");

    // create queue to which we will push commands for the device.
    queue = cl::CommandQueue(context, default_device);

    // Host-side input data. std::vector keeps the arrays (`size` ints each)
    // off the stack; a result array is not needed during setup.
    std::vector<int> A(size), B(size);
    for (int i = 0; i < size; i++) {
        A[i] = i;
        B[i] = i + 1;
    }

    // create buffers on the device
    buffer_A = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);
    buffer_B = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);
    buffer_C = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);

    // Blocking writes (CL_TRUE): A and B are fully copied to the device before
    // the vectors go out of scope at the end of the constructor.
    queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A.data());
    queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B.data());

    // Bind the buffers once; the binding is reused by every later launch.
    kernel_add.setArg(0, buffer_A);
    kernel_add.setArg(1, buffer_B);
    kernel_add.setArg(2, buffer_C);
}
Therefore make context, program, queue, buffer_A, buffer_B, buffer_C member variables of your class useOpenCL. Especially the memory allocation and compilation take a long time, so do them only once and reuse the buffers.
// OpenCL helper in which every object that is expensive to create is a
// member, so it is built once in the constructor and reused by each
// backgroundSub() call.
class useOpenCL {
public:
// Number of int elements processed per call.
int size = 294400;
std::vector<cl::Platform> all_platforms;
std::vector<cl::Device> all_devices;
// First entry of all_platforms / all_devices, chosen in the constructor.
cl::Platform default_platform;
cl::Device default_device;
// Source list handed to cl::Program; it refers to kernel_code's characters.
cl::Program::Sources sources;
std::string kernel_code;
cl::Kernel kernel_add;
// Device buffers: A and B are the inputs, C receives the result.
cl::Buffer buffer_A;
cl::Buffer buffer_B;
cl::Buffer buffer_C;
// Promoted from locals of backgroundSub() to members so they outlive a single call.
cl::Context context;
cl::Program program;
cl::CommandQueue queue;
useOpenCL();
~useOpenCL() {}
void backgroundSub();
};
Then only the kernel call and eventually memory transfers host<->device remain for every frame calculation:
void useOpenCL::backgroundSub() {
auto start1 = high_resolution_clock::now();
// write arrays A and B to the device (ONLY IF NECESSARY FOR EVERY FRAME)
//queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
//queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);
// run the kernel
queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(size),
cl::NullRange);
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * size, C);
queue.finish();
auto stop1 = high_resolution_clock::now();
auto duration1 = duration_cast<microseconds>(stop1 - start1);
auto FPS = 1000000.0 / duration1.count();
cout << "Segmentation FPS=" << FPS << "\t"
<< "Execution Time(sec)=" << duration1.count() / 1000000.0 << endl;
}
The latter code can be called over and over again and should be much faster than re-initializing everything for every frame. Also make sure that size is large enough; otherwise the GPU might not be utilized to its full potential, and the latencies of the host<->device memory transfers will make every frame disproportionately slower.

Set array dimension at runtime

I have a struct, which, depending on user inputs at runtime, will either require a 1D array or a 3D array. It will never need both. Right now, I have it set up like in the sample code below, with separate variables that can point to either a 1D array, or a 3D array. I would like to have just one variable in the struct that can point to either a 1D array or a 3D array, where the dimension is set at runtime. I have intermediate knowledge of C, and am a beginner with C++. I'd be willing to accept an answer based on C++ concepts but only if there is no slowdown (or negligible slowdown) compared to using C when iterating over the values. If it's a 3D array, then the for loops that access and change the array's values are the biggest bottleneck in my code. Once the array is set up, I won't need to change the dimension or size of the array.
Is there a way to do this, or should I just settle for always having an extraneous variable in my struct?
#include <iostream>
using namespace std;
// Holds either a 1D or a 3D int array; `dim` says which pointer is in use.
// Declared with the C-style `typedef struct` form.
typedef struct {
// Number of dimensions chosen at run time (1 or 3 in the code using this struct).
int dim;
// Used when dim == 1; the code using this struct sets it to NULL otherwise.
int *one_d_arr;
// Used when dim == 3; the code using this struct sets it to NULL otherwise.
int ***three_d_arr;
} Struct;
// Builds a 2-element 1D array and a 2x2x2 3D array, asks the user which
// dimensionality to use, attaches the matching array to a Struct and prints
// its contents.
int main() {
    int count = 0;

    int *arr1 = (int*) malloc(2 * sizeof(int));
    arr1[0] = 0;
    arr1[1] = 1;

    // Three levels of allocation: plane pointers, row pointers, then ints.
    int ***arr3 = (int***) malloc(2 * sizeof(int**));
    for (int i=0; i<2; i++) {
        arr3[i] = (int**) malloc(2 * sizeof(int*));
        for (int j=0; j<2; j++) {
            arr3[i][j] = (int*) malloc(2 * sizeof(int));
            for (int k=0; k<2; k++) {
                arr3[i][j][k] = count++;
            }
        }
    }

    Struct s;
    s.dim = 0;  // defined value even before the read below happens
    s.one_d_arr = NULL;
    s.three_d_arr = NULL;

    cout << "Enter number of dimensions: ";
    cin >> s.dim;
    if (s.dim==1) {
        s.one_d_arr = arr1;
        cout << s.one_d_arr[0] << ", " << s.one_d_arr[1] << endl;
    }
    else if (s.dim==3) {
        s.three_d_arr = arr3;
        cout << s.three_d_arr[0][0][0] << ", " << s.three_d_arr[0][0][1] << endl;
        cout << s.three_d_arr[0][1][0] << ", " << s.three_d_arr[0][1][1] << endl;
        cout << s.three_d_arr[1][0][0] << ", " << s.three_d_arr[1][0][1] << endl;
        cout << s.three_d_arr[1][1][0] << ", " << s.three_d_arr[1][1][1] << endl;
    }
    else {
        cout << "Must enter 1 or 3" << endl;
    }

    // Release everything in the reverse order of allocation. Both arrays were
    // allocated regardless of the user's choice and were previously leaked.
    for (int i=0; i<2; i++) {
        for (int j=0; j<2; j++) {
            free(arr3[i][j]);
        }
        free(arr3[i]);
    }
    free(arr3);
    free(arr1);
    return 0;
}

My recommendation is to use two different types here, instead of a single struct. Using an abstract base class, you can make both subclasses conform to a single interface, but they would have different underlying behavior. A very basic example:
// Common interface for the 1D and 3D array wrappers. Code that only holds an
// ArrayBase* can print either kind without knowing which one it has.
class ArrayBase {
    int dim = 0;
public:
    // Virtual so that `delete` through an ArrayBase* runs the destructor of
    // the derived class; without it that delete is undefined behaviour.
    virtual ~ArrayBase() {}

    // This function is pure virtual, which means it's impossible to
    // instantiate an instance of ArrayBase. Any class that inherits from
    // ArrayBase must implement printArray().
    virtual void printArray() = 0;
};  // a class definition must end with a semicolon

// One-dimensional variant.
class Array1D : public ArrayBase {
    int* array = nullptr;
    // Reached through ArrayBase::printArray(), which is public.
    void printArray() override {
        // some code to print this one-dimensional array
    }
};

// Three-dimensional variant.
class Array3D : public ArrayBase {
    int*** array = nullptr;
    // Reached through ArrayBase::printArray(), which is public.
    void printArray() override {
        // some code to print this three-dimensional array
    }
};
Later, when you need to use the array, you can dynamically allocate the type you need, like this:
ArrayBase* inputArray;
// if the user wants a 1D array
inputArray = new Array1D();
// if the user wants a 3D array
inputArray = new Array3D();
// this will call the appropriate function to print the array
inputArray->printArray();
If you really want to have a single type, using boost::any is one way to condense your two array pointers into one. I would not recommend this approach, but it would work.

One of the juicy things about the C/C++ pointers is the existence of void pointers. A void pointer can point to anything you want, from int to int ***.
So you can simply use the following code:
#define CAST1(arr) ((int *)arr)
#define CAST3(arr) ((int ***)arr)
#define CAST(arr,i) CAST##i(arr)
// Single-pointer variant: `arr` is untyped and is cast to int* or int***
// according to `dim` (the code using this struct does that with the CAST macros).
typedef struct {
// Number of dimensions chosen at run time (1 or 3 in the code using this struct).
int dim;
// Points to the 1D or the 3D array; the pointee type is not recorded here.
void *arr;
} Struct;
// Reads the desired dimensionality, allocates either a 2-element 1D array or
// a 2x2x2 3D array behind the single void* member, prints it and frees it.
int main()
{
    Struct s;
    s.dim = 0;     // defined value even before the read below happens
    s.arr = NULL;  // nothing allocated yet
    cin >> s.dim;
    int count = 0;

    if (s.dim == 1){
        s.arr = malloc(2 * sizeof(int));
        CAST(s.arr, 1)[0] = 0;
        CAST(s.arr, 1)[1] = 1;
    }
    else if (s.dim == 3){
        // Each level is sized by the type of the ELEMENTS it stores: the top
        // level holds int**, the middle level int*, the innermost level int.
        // The original used a pointer type one level too deep at every step,
        // which over-allocated the innermost arrays wherever pointers are
        // wider than int.
        s.arr = malloc(2 * sizeof(int **));
        for (int i = 0; i < 2; i++){
            CAST(s.arr, 3)[i] = (int **) malloc(2 * sizeof(int *));
            for (int j = 0; j < 2; j++){
                CAST(s.arr, 3)[i][j] = (int *) malloc(2 * sizeof(int));
                for (int k = 0; k < 2; k++){
                    CAST(s.arr, 3)[i][j][k] = count++;
                }
            }
        }
    }

    if (s.dim == 1) {
        cout << CAST(s.arr, 1)[0] << ", " << CAST(s.arr, 1)[1] << endl;
    }
    else if (s.dim == 3) {
        cout << CAST(s.arr, 3)[0][0][0] << ", " << CAST(s.arr, 3)[0][0][1] << endl;
        cout << CAST(s.arr, 3)[0][1][0] << ", " << CAST(s.arr, 3)[0][1][1] << endl;
        cout << CAST(s.arr, 3)[1][0][0] << ", " << CAST(s.arr, 3)[1][0][1] << endl;
        cout << CAST(s.arr, 3)[1][1][0] << ", " << CAST(s.arr, 3)[1][1][1] << endl;
    }
    else {
        cout << "Must enter 1 or 3" << endl;
    }

    // Release whatever was allocated, innermost level first.
    if (s.dim == 3) {
        for (int i = 0; i < 2; i++){
            for (int j = 0; j < 2; j++){
                free(CAST(s.arr, 3)[i][j]);
            }
            free(CAST(s.arr, 3)[i]);
        }
    }
    free(s.arr);  // free(NULL) is a no-op when nothing was allocated

    system("pause");
    return 0;
}

C++: the function collapses at the 4th element of the array

Could you have a look at what I've faced: http://sdrv.ms/WgafvN
And another screenshot: http://sdrv.ms/UZIp6H
The text of my function is:
// Prints a table with the id and coordinates of every point in `pointer`.
// Returns false (after printing a notice) when `pointer` is NULL or
// is_array_empty() reports it as empty; returns true otherwise.
// The element count is derived from the allocation size reported by _msize.
bool print_all_points(POINT** pointer)
{
    // is_array_empty() is only evaluated when the pointer is not NULL.
    const bool nothing_to_print = (pointer == NULL) || is_array_empty(pointer);
    if (nothing_to_print)
    {
        cout << "The array of points is empty." << endl << endl;
        return false;
    }

    // Size of the allocation in bytes divided by the size of one element.
    int n = _msize(pointer)/sizeof(pointer[0]);

    cout << "The list of points: " << endl<< endl;
    cout << "id (x, y)" << endl;
    cout << "----------" << endl;

    int row = 0;
    while (row < n)
    {
        cout << pointer[row]->id << " (" << pointer[row]->x << ", " << pointer[row]->y << ")" << endl;
        ++row;
    }
    return true;
}
This function is expected to print out all the points in an array. My problem is that it perfectly prints the array of 3 points rather than that of 4 points. At the 4th point it bites the dust.
I can't catch what the trouble is.
From the picture it is visible that:
1. All 4 elements of the array are present.
2. It is correctly determined that there 4 of them.
What is the problem?
Could you give me a kick here?
ADDED LATER.
The function which calls this:
// Reads one point from the console, appends it to a copy of the array
// `pointer` and returns the new array of `occup` POINT pointers.
//   pointer: existing array of point pointers (only copied when occup != 1)
//   occup:   element count of the NEW array; also used as the new point's id
POINT** new_point(POINT** pointer, int occup)
{
// Each coordinate is read as a single character.
char x;
char y;
system("cls");
cout << "INPUT A NEW POINT" << endl << endl;
cout << "Input x: ";
cin >> x;
cout << "Input y: ";
cin >> y;
// m is assigned below but never read afterwards.
size_t m;
if (pointer != NULL)
{
m = _msize(pointer);
}
POINT * tmp_point = new POINT();
(*tmp_point).id = occup;
// 48 is the character code of '0': converts a digit character to its value.
(*tmp_point).x = x-48;
(*tmp_point).y = y-48;
POINT** pn = new POINT * [occup];
// NOTE(review): hard-codes 4 bytes per pointer; sizeof(POINT*) would be the
// portable element size.
int necessary_memory = occup * 4; // ???? 4 is the size of a pointer.
if (occup !=1)
{
memcpy(pn, pointer, necessary_memory);
}
POINT ** tmp = new POINT * [occup];
pn[occup - 1] = tmp_point;
// NOTE(review): tmp and pn hold `occup` POINT* elements, but the byte count
// here uses sizeof(POINT), the size of a whole point. If that is larger than
// a pointer, this reads and writes past the end of both arrays.
memcpy(tmp, pn, occup * sizeof(POINT));
delete[] pn;
pn = tmp;
// n is assigned but never read afterwards.
size_t n = _msize(pn);
cout << endl;
print_all_points(pn);
// The old array `pointer` is not deleted in this function.
return pn;
}

several problems:
not copying enough data in 64-bit
int necessary_memory = occup * 4;
should be
int necessary_memory = occup * sizeof(POINT*);
copying too much data
memcpy(tmp, pn, occup * sizeof(POINT));
should be:
memcpy(tmp, pn, occup * sizeof(POINT*));
Someone else can chime in, but I am not sure _msize should be used on memory allocated by new. Is that right? http://msdn.microsoft.com/en-us/library/z2s077bc(v=vs.80).aspx
fucntion in the title should be function
You're welcome. You owe me a beer.
Oh yea, I found my shoes... where would you like it?

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

c++ cuda: cudaMallocManaged access outside of constructor - c++

Related

C++ Deep and Shallow Copy

Wrong output for an array of a class

increase performance of opencl

Set array dimension at runtime

C++: the function collapses at the 4th element of the array

Categories

Resources