Device pointer in a device class (CUDA C++)

I would like to implement a device-side vector class which encapsulates a pointer to the elements of the container.
After I instantiate an object of this class, I have no access to the pointer inside it. It always fails with 'Access violation writing location (some device memory address)'.
My code is the following:
#include <iostream>
#include <cuda_runtime.h>
template <typename T>
class DeviceVector
{
private:
T* m_bValues;
std::size_t m_bSize;
public:
__host__
void* operator new(std::size_t size)
{
DeviceVector<T>* object = nullptr;
cudaMalloc((void**)&object, size);
return object;
}
__host__
void operator delete(void* object)
{
cudaFree(object);
}
__host__
DeviceVector(std::size_t size = 1)
{
cudaMemcpy(&m_bSize, &size, sizeof(std::size_t), cudaMemcpyHostToDevice);
// At this cudaMalloc I get Access violation writing location...
cudaMalloc((void**)&m_bValues, size * sizeof(T));
// An alternative solution:
T* ptr;
cudaMalloc((void**)&ptr, size * sizeof(T));
cudaMemcpy(&m_bValues, &ptr, sizeof(T*), cudaMemcpyHostToDevice);
// The memory is allocated
// But I can't access it through m_bValues pointer
// It is also Access violation writing location...
}
__host__
~DeviceVector()
{
// Access violation here if I use the second solution in the constructor
cudaFree(m_bValues);
}
};
int main()
{
DeviceVector<int>* vec = new DeviceVector<int>();
delete vec;
return 0;
}
Note:
I have access to the size attribute.
So my questions are:
How to allocate memory for this class to get access to the pointer inside?
Is this even possible to encapsulate a pointer into a class on the device?

This line is illegal:
cudaMalloc((void**)&m_bValues, size * sizeof(T));
because your new operator allocated the object on the device:
cudaMalloc((void**)&object, size);
return object;
and the constructor was called to operate on that allocation. Therefore &m_bValues is taking the address of a device variable in host code which is illegal in CUDA. If you do that, and then attempt to use it in host code (i.e. the cudaMalloc operation), you're going to get a seg fault. cudaMalloc creates a device allocation of a particular size, and then stores the device pointer to that allocation in a variable that is expected to be resident on the host. If you pass it a device address to store that pointer into instead, cudaMalloc will segfault trying to write the pointer value.
Your alternative solution is a somewhat better approach, and is the general idea when it's necessary to copy a pointer to a device allocation to a variable resident on the device.
But you've still basically made the allocation that m_bValues points to inaccessible from the host. (ptr, being a temporary variable, won't help, and creating another variable in the class to hold a value like ptr won't help either because the entire class is allocated and resident on the device.) For the same reason that you're not allowed to use &m_bValues in the previous cudaMalloc operation, you won't be able to use it directly in any other host code (except as the target for cudaMempcy host->device when copying the pointer value itself).
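To make the pattern concrete, here is a minimal sketch of the constructor body with the extra bookkeeping spelled out (the host-side stash mentioned in the comments is my own addition, not part of the question's class):
// Inside the __host__ constructor of the device-resident object:
T* ptr = nullptr;                            // host-resident temporary
cudaMalloc((void**)&ptr, size * sizeof(T));  // legal: &ptr is a host address
cudaMemcpy(&m_bValues, &ptr, sizeof(T*),
           cudaMemcpyHostToDevice);          // write the pointer value into the device object
// ptr must also be stashed somewhere host-accessible (e.g. a separate
// host-side bookkeeping structure, not a member of this class, since all
// members live on the device), or the allocation can never be freed or
// copied to from host code again.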
I don't think there are any simple fixes for this. I suggest re-crafting the object to live on the host, providing appropriate host- and device-side allocations for the corresponding pointers and parameters (like size).
It also seems like you're re-inventing the wheel. You might want to investigate thrust device vectors (which are easily usable with ordinary CUDA code).
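For reference, a thrust::device_vector already does what the question is after, and it interoperates with hand-written kernels; a minimal sketch (my_kernel is a made-up name):
#include <thrust/device_vector.h>

thrust::device_vector<int> vec(10);               // allocation lives on the device
int* raw = thrust::raw_pointer_cast(vec.data());  // raw pointer for ordinary kernels
my_kernel<<<1, 10>>>(raw, vec.size());            // hypothetical kernel launch
// the device memory is freed automatically when vec goes out of scope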
Anyway, this was the closest I could come up with:
#include <iostream>
#include <cuda_runtime.h>
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
template <typename T>
class DeviceVector
{
private:
T* m_bValues;
std::size_t m_bSize;
std::size_t eleSize;
public:
__host__
void* operator new(std::size_t size)
{
DeviceVector<T>* object = nullptr;
object = (DeviceVector<T> *)malloc(size); // operator new already receives the full object size
return object;
}
__host__
void operator delete(void* object)
{
free(object);
}
__host__
DeviceVector(std::size_t size = 1)
{
m_bSize = size;
eleSize = sizeof(T);
cudaMalloc(&m_bValues, m_bSize*sizeof(T));
cudaCheckErrors("constructor cudaMalloc fail");
cudaMemset(m_bValues, 0, m_bSize*sizeof(T));
}
__host__
~DeviceVector()
{
cudaFree(m_bValues);
cudaCheckErrors("destructor cudaFree fail");
}
__host__
T* getDevPtr() { return m_bValues; }
__host__
std::size_t getSize() { return m_bSize; }
__host__
std::size_t geteleSize() { return eleSize; }
};
int main()
{
DeviceVector<int>* vec = new DeviceVector<int>();
cudaMemset(vec->getDevPtr(), 0xFF, vec->getSize()*vec->geteleSize());
cudaCheckErrors("vector fill fail");
delete vec;
return 0;
}
You've shown very little about how you want to interact with an object of this class, so I'm just guessing here.
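For example, one plausible way to interact with it is to hand the raw pointer to an ordinary kernel (a sketch under that assumption; init_kernel is a name I made up):
__global__ void init_kernel(int* data, std::size_t n) {
    std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] = (int)i;   // fill each element with its index
}

DeviceVector<int>* vec = new DeviceVector<int>(256);
init_kernel<<<1, 256>>>(vec->getDevPtr(), vec->getSize());
cudaCheckErrors("init_kernel fail");
delete vec;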

Related

Tensorflow GPU new op memory allocation

I am trying to create a new tensorflow GPU op following the instructions on their website.
Looking at their example, it seems they feed a C++ pointer directly into the CUDA kernel without allocating device memory and copying the contents of the host pointer to the device pointer.
From what I understand of CUDA you always have to allocate memory on the device and then use device pointers inside the kernels.
What am I missing? I checked that input_tensor.flat<T>().data() should return a regular C++ pointer. Here is a copy of the code I am referring to:
// kernel_example.cu.cc
#ifdef GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "example.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
using namespace tensorflow;
using GPUDevice = Eigen::GpuDevice;
// Define the CUDA kernel.
template <typename T>
__global__ void ExampleCudaKernel(const int size, const T* in, T* out) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
i += blockDim.x * gridDim.x) {
out[i] = 2 * ldg(in + i);
}
}
// Define the GPU implementation that launches the CUDA kernel.
template <typename T>
void ExampleFunctor<GPUDevice, T>::operator()(
const GPUDevice& d, int size, const T* in, T* out) {
// Launch the cuda kernel.
//
// See core/util/cuda_kernel_helper.h for example of computing
// block count and thread_per_block count.
int block_count = 1024;
int thread_per_block = 20;
ExampleCudaKernel<T>
<<<block_count, thread_per_block, 0, d.stream()>>>(size, in, out);
}
// Explicitly instantiate functors for the types of OpKernels registered.
template struct ExampleFunctor<GPUDevice, float>;
template struct ExampleFunctor<GPUDevice, int32>;
#endif // GOOGLE_CUDA
When you look at these lines of code on https://www.tensorflow.org/extend/adding_an_op, you will see that the allocation is done in kernel_example.cc:
void Compute(OpKernelContext* context) override {
// Grab the input tensor
const Tensor& input_tensor = context->input(0);
// Create an output tensor
Tensor* output_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
&output_tensor));
// Do the computation.
OP_REQUIRES(context, input_tensor.NumElements() <= tensorflow::kint32max,
errors::InvalidArgument("Too many elements in tensor"));
ExampleFunctor<Device, T>()(
context->eigen_device<Device>(),
static_cast<int>(input_tensor.NumElements()),
input_tensor.flat<T>().data(),
output_tensor->flat<T>().data());
}
In context->allocate_output(....) they hand over a reference to the output Tensor, which is then allocated. The context knows whether it is running on GPU or CPU and allocates the tensor on the host or device accordingly. The pointer handed over to CUDA then points directly to the actual data within the Tensor class.
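In plain CUDA terms, the GPU path of allocate_output conceptually boils down to something like this (a rough sketch of the idea, not TensorFlow's actual allocator code; num_elements is a placeholder):
// What the context effectively does for you when the op runs on the GPU:
float* out_data = nullptr;
cudaMalloc(&out_data, num_elements * sizeof(float)); // device allocation
// output_tensor->flat<T>().data() then returns out_data, which is why
// the kernel in kernel_example.cu.cc can dereference it directly.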

munmap_chunk: invalid pointer when changing location of data storage with std::vector

I'm writing a program for a microcontroller where I need to place the data of a vector at a specific location in memory (store it in flash).
#include <iostream>
#include <vector>
struct struct_gauss {
int mean;
int sigma;
};
std::vector<struct_gauss> makeVector(size_t size, void* address, void* &endAddress) {
std::vector<struct_gauss> dummy;
struct_gauss **dummyPointer = (struct_gauss **) &dummy; // Address to metavalue of std::vector dummy (exists of 3 pointers)
*dummyPointer = (struct_gauss *) address; // Point data of dummy to requested address
*(dummyPointer+1) = ((*dummyPointer)+size);
*(dummyPointer+2) = *(dummyPointer+1);
endAddress = (void*) &(*dummy.end());
return dummy;
}
int main()
{
void* dummyPointer1 = malloc(1);
void* dummyPointer2;
auto vector1 = makeVector(10, (void*) dummyPointer1, dummyPointer2);
auto vector2 = makeVector(10, (void*) dummyPointer2, dummyPointer2);
vector1[9].mean = 10;
vector2[0].mean = 5;
std::cout<<"address of vector2 begin = "<< &(*vector2.begin())<<std::endl;
std::cout<<"vector1[9].mean = "<<vector1[9].mean<<"; vector2[0].mean = "<<vector2[0].mean<<std::endl;
return 0;
}
This creates two vectors of 10 elements each, back to back, starting from pointer dummyPointer1. However, when running this I get the following error:
address of vector2 begin = 0xf42c70
vector1[9].mean = 10; vector2[0].mean = 5
*** Error in `/home/a.out': munmap_chunk(): invalid pointer: 0x0000000000f42c70 ***
Aborted
The program aborts when exiting the main() function because it can't deallocate vector2.
Why is that? How can I fix this?
Is there a better way?
P.S. I can keep the pointers of the vectors (the meta-values) in RAM, but I can also write them to flash if that is better. Flash is used because I'm RAM-limited and these vectors are written only when loading a new model from an external source.
There are a number of things that look horrible, and I apologize for the bluntness, but I really do mean 'horrible' in this context.
std::vector<struct_gauss> makeVector(size_t size, void* address, void* &endAddress) {
std::vector<struct_gauss> dummy;
struct_gauss **dummyPointer = (struct_gauss **) &dummy; // Address to metavalue of std::vector dummy (exists of 3 pointers)
*dummyPointer = (struct_gauss *) address; // Point data of dummy to requested address
*(dummyPointer+1) = ((*dummyPointer)+size);
*(dummyPointer+2) = *(dummyPointer+1);
endAddress = (void*) &(*dummy.end());
return dummy;
}
Nothing about this code looks safe or advisable. There seems to be a number of assumptions being made about how the std::vector object is constructed in memory, and even if your assumptions are correct, you've still failed to properly construct the std::vector object in a standard-compliant manner.
int main()
{
void* dummyPointer1 = malloc(1);
void* dummyPointer2;
auto vector1 = makeVector(10, (void*) dummyPointer1, dummyPointer2);
auto vector2 = makeVector(10, (void*) dummyPointer2, dummyPointer2);
This is just flat-out wrong. You're allocating a single byte and treating it as the backing storage for 10 elements, and nowhere have you actually allocated space for the underlying arrays.
In short, I think you've hit an XY problem, full stop. The problem you're actually trying to solve is "how do I make std::vector allocate memory from flash memory?"
The way you normally solve these kinds of problems is with custom allocators.
template<typename T>
struct allocator {
typedef size_t size_type;
typedef ptrdiff_t difference_type;
typedef T * pointer;
typedef T const& const_pointer;
typedef T& reference;
typedef T const& const_reference;
typedef T value_type;
pointer allocate(size_t size) {
void * mem = micro_controller_api::allocate_flash_memory(size); //I don't know what your API looks like
//You'll need something else if you're not able to throw exceptions in your code.
if(!mem) throw std::bad_alloc();
//On its own, this would be unsafe, but std::vector uses placement new with its memory,
//so you don't need to worry that the cast here would risk some undefined behavior.
return static_cast<pointer>(mem);
}
void deallocate(pointer p, size_t) noexcept { micro_controller_api::free_flash_memory(static_cast<void*>(p)); }
allocator() = default;
template<typename U>
allocator(allocator<U> const&) {}
pointer address(reference r) const {return addressof(r);}
const_pointer address(const_reference r) const {return addressof(r);}
bool operator==(allocator const&) const {return true;} //All allocators are the same
bool operator!=(allocator const&) const {return false;}
};
int main() {
std::vector<struct_gauss, allocator<struct_gauss>> vector1(10);
std::vector<struct_gauss, allocator<struct_gauss>> vector2(10);
//Do whatever; vector1 and vector2 are, as far as anyone is concerned, perfectly valid vectors
}
Mind you, I've not written allocators myself before, and the one I'm providing here is based on a template shown on a C++ reference page (Edit: I've also added some stuff by referencing this page), so it's possible I've made mistakes. But hopefully all of this is enough to solve your issue.
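As a side note, since C++11 an allocator only needs value_type, allocate, and deallocate; most of the typedefs above are optional boilerplate. A minimal sketch of the same idea (micro_controller_api is the same hypothetical flash API as above):
template <typename T>
struct flash_allocator {
    using value_type = T;

    flash_allocator() = default;
    template <typename U>
    flash_allocator(const flash_allocator<U>&) noexcept {}

    T* allocate(std::size_t n) {
        void* mem = micro_controller_api::allocate_flash_memory(n * sizeof(T));
        if (!mem) throw std::bad_alloc();
        return static_cast<T*>(mem);
    }
    void deallocate(T* p, std::size_t) noexcept {
        micro_controller_api::free_flash_memory(p);
    }
};

template <typename T, typename U>
bool operator==(const flash_allocator<T>&, const flash_allocator<U>&) { return true; }
template <typename T, typename U>
bool operator!=(const flash_allocator<T>&, const flash_allocator<U>&) { return false; }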

Access violation when using alloca

My stackAlloc function looks like this:
void* stackAlloc(size_t size) {
if (size > maxStackAllocation)
return malloc(size);
else
return _alloca(size);
}
void stackAllocFree(void *ptr, size_t size) {
if (size > maxStackAllocation) {
free(ptr);
}
}
If I change the stackAlloc function so that it always uses malloc instead of _alloca, everything works.
I changed the function to a macro, and now it's working as expected:
#define maxStackAllocation 1024
#define stackAlloc(size) \
( \
(size > maxStackAllocation)? \
malloc(size): \
_alloca(size) \
)
#define stackAllocFree(ptr, size) \
( \
(size > maxStackAllocation)? \
free(ptr): \
void() \
)
Assuming you're running on Windows, since your code calls _alloca(), per the MSDN documentation:
_alloca allocates size bytes from the program stack. The allocated space is automatically freed when the calling function exits
Note that the memory is freed when the calling function exits - which I'm assuming also means the calling function returns.
Your code:
void* stackAlloc(size_t size) {
if (size > maxStackAllocation)
return malloc(size);
else
return _alloca(size);
}
returns, thus freeing the memory obtained via _alloca().
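In other words, the pointer dangles the instant stackAlloc returns; a sketch of the failure mode (illustrative only):
void* p = stackAlloc(100);  // 100 <= maxStackAllocation, so _alloca is used
// _alloca carved the bytes out of stackAlloc's own stack frame, which was
// torn down when stackAlloc returned; p now points into dead stack space.
memset(p, 0, 100);          // access violation / undefined behavior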
From the man page,
This temporary space is automatically freed
when the function that called alloca() returns to its caller.
So whenever your stackAlloc function returns, it will automatically free the memory.
This works, but I'd advise against using it in production:
#include <iostream>
#include <alloca.h>
auto stackAlloc(const size_t size)
{
return [size](){ return alloca(size); };
}
int main() {
char *ch = (char *)stackAlloc(40000)();
ch[39999] = '\0';
return 0;
}
Counter-check: if I decrease stackAlloc's parameter, it doesn't work (which is the expected behaviour here)
Feel free to add the check, etc. in stackAlloc (either by returning different lambdas or having the lambda do the check).

Wrapper function for cudaMalloc and cudaMemcpy

I was sick of looking at all the boilerplate cuda code for copying data to the device so I wrote this wrapper function:
void allocateAndCopyToDevice(void* device_array, const void* host_array, const size_t &count)
{
gpuErrchk(cudaMalloc((void**)&device_array, count));
gpuErrchk(cudaMemcpy(device_array, host_array, count, cudaMemcpyHostToDevice));
}
but for some reason this resulted in an out-of-bounds memory access whenever I used an array initialized this way. The initialization code I used looked like this:
cuDoubleComplex *d_cmplx;
allocateAndCopyToDevice(d_cmplx,cmplx,size*sizeof(cuDoubleComplex));
Could anyone explain why this doesn't work?
After seeing immibis's comment I realized that cudaMalloc expects a pointer to a pointer, so instead I now pass the pointer-to-pointer in by value:
void allocateAndCopyToDevice(void** device_array, const void* host_array, const size_t &count)
{
gpuErrchk(cudaMalloc(device_array, count));
gpuErrchk(cudaMemcpy(*device_array, host_array, count, cudaMemcpyHostToDevice));
}
and the initialization now looks like this:
cuDoubleComplex *d_cmplx;
allocateAndCopyToDevice((void **)&d_cmplx,cmplx,size*sizeof(cuDoubleComplex));
It works, but I'm still wondering if there is a better way of doing this? How do other people handle memory transfers in cuda code?
I would do something like
template <typename T>
T* allocateAndCopyToDevice(const T* host_array, std::size_t count)
{
// some static_assert for allowed types: pod and built-in.
T* device_array = nullptr;
gpuErrchk(cudaMalloc(&device_array, count * sizeof(T)));
gpuErrchk(cudaMemcpy(device_array, host_array, count * sizeof(T), cudaMemcpyHostToDevice));
return device_array;
}
and use it:
cuDoubleComplex *d_cmplx = allocateAndCopyToDevice(cmplx, size);
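Going the other direction, a matching helper can copy back and release in one call; a sketch along the same lines (the name is my own):
template <typename T>
void copyToHostAndFree(T* host_array, T* device_array, std::size_t count)
{
    gpuErrchk(cudaMemcpy(host_array, device_array, count * sizeof(T),
                         cudaMemcpyDeviceToHost));
    gpuErrchk(cudaFree(device_array));
}

// usage:
copyToHostAndFree(cmplx, d_cmplx, size);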

Temporary CUDA Device Arrays

Having been playing around with this grand CUDA experiment for a few months now, I find myself experimenting more and trying to pull away from the tutorial examples.
My question is this: if I want to just use arrays on the GPU for something like temporary storage, without copying them back to the host for display/output, can I just create a device array with __device__ double array[numpoints];? Then, for anything I do want to take back from the GPU, I need to do the whole cudaMalloc/cudaMemcpy spiel, right? Additionally, is there any difference between the two methods? I thought they both create arrays in global memory.
See this description of the __device__ qualifier. If you declare the array __device__, you cannot access it from the host through plain cudaMemcpy, but there are other functions mentioned in the link (such as cudaMemcpyToSymbol) that can reach it.
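Concretely, a __device__ array is reached from the host through the symbol APIs rather than plain cudaMemcpy; a minimal sketch (assuming a fixed-size array):
__device__ double dev_array[16];   // statically-sized device array

double host_array[16];
// cudaMemcpy(dev_array, ...) from host code is illegal, but the symbol APIs work:
cudaMemcpyToSymbol(dev_array, host_array, sizeof(host_array));    // host -> device
cudaMemcpyFromSymbol(host_array, dev_array, sizeof(host_array));  // device -> host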
Instead, what you can do is declare an ordinary pointer (i.e., without __device__) in host code and allocate it using cudaMalloc. You can then use the same pointer to copy the result back to the host with cudaMemcpy.
You can create, fill, and use global memory arrays without using cudaMemcpy to copy data from the host for initialization, if that is what you are asking. In the following simple example, I create a global memory array that is initialized directly on the device and then released when it is no longer needed.
#include<stdio.h>
__global__ void init_temp_data(float* temp_data) {
temp_data[threadIdx.x] = 3.f;
}
__global__ void copy_global_data(float* temp_data, float* d_data) {
d_data[threadIdx.x] = temp_data[threadIdx.x];
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main() {
float* data = (float*)malloc(16*sizeof(float));
float* d_data; gpuErrchk(cudaMalloc((void**)&d_data,16*sizeof(float)));
float* temp_data; gpuErrchk(cudaMalloc((void**)&temp_data,16*sizeof(float)));
init_temp_data<<<1,16>>>(temp_data);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
copy_global_data<<<1,16>>>(temp_data,d_data);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaFree(temp_data));
gpuErrchk(cudaMemcpy(data,d_data,16*sizeof(float),cudaMemcpyDeviceToHost));
for (int i=0; i<16; i++) printf("Element number %i is equal to %f\n",i,data[i]);
getchar();
return 0;
}