I was sick of looking at all the boilerplate cuda code for copying data to the device so I wrote this wrapper function:
// Allocates `count` bytes on the device and copies `count` bytes from
// `host_array` into that allocation.
//
// NOTE(review): `device_array` is passed by value, so cudaMalloc stores the new
// device address in this function's local copy only. The caller's pointer
// variable is not modified by this call.
void allocateAndCopyToDevice(void* device_array, const void* host_array, const size_t &count)
{
// &device_array is the address of the local parameter, not of the caller's variable.
gpuErrchk(cudaMalloc((void**)&device_array, count));
gpuErrchk(cudaMemcpy(device_array, host_array, count, cudaMemcpyHostToDevice));
}
but for some reason this resulted in an out of bounds memory access whenever using an array initialized in this way. The initialization code that I used looked like this:
// d_cmplx is declared without an initializer and is passed by value below, so
// the call cannot change what it holds.
cuDoubleComplex *d_cmplx;
allocateAndCopyToDevice(d_cmplx,cmplx,size*sizeof(cuDoubleComplex));
Could anyone explain why this doesn't work?
After seeing immibis's comment I realized that cudaMalloc expects a pointer to a pointer, so instead I'm passing by value the pointer to the pointer:
// Allocates `count` bytes of device memory, stores the resulting address in
// `*device_array`, then copies `count` bytes from `host_array` into it.
void allocateAndCopyToDevice(void** device_array, const void* host_array, const size_t &count)
{
    gpuErrchk(cudaMalloc(device_array, count));

    // Read the address that cudaMalloc just stored through the out-parameter.
    void* const device_dst = *device_array;
    gpuErrchk(cudaMemcpy(device_dst, host_array, count, cudaMemcpyHostToDevice));
}
and the initialization now looks like this:
// The address of d_cmplx is passed so the callee can store the device address in it.
cuDoubleComplex *d_cmplx;
allocateAndCopyToDevice((void **)&d_cmplx,cmplx,size*sizeof(cuDoubleComplex));
It works, but I'm still wondering if there is a better way of doing this? How do other people handle memory transfers in cuda code?
I would do something like
// Allocates device storage for `count` elements of type T, copies the elements
// from `host_array` into it, and returns the device pointer.
// The returned pointer comes from cudaMalloc; nothing in this function frees it.
template <typename T>
T* allocateAndCopyToDevice(const T* host_array, std::size_t count)
{
    // some static_assert for allowed types: pod and built-in.
    const std::size_t byte_count = count * sizeof(T);

    T* device_array = nullptr;
    gpuErrchk(cudaMalloc(&device_array, byte_count));
    gpuErrchk(cudaMemcpy(device_array, host_array, byte_count, cudaMemcpyHostToDevice));
    return device_array;
}
and use it:
// The second argument is an element count here, not a byte count.
cuDoubleComplex *d_cmplx = allocateAndCopyToDevice(cmplx, size);
Related
I have the following code which has a function VariadicWrite and this function accepts a pointer that it can modify by incrementing it so that it points to a different memory location depending on how much data is written to it:
#include <iostream>
#include <cstring>
// Copies one T out of the buffer at `ptr`, advances `ptr` by sizeof(T) bytes,
// and returns the copied value.
template<typename T>
T Read(void* &ptr) noexcept
{
    T* const source = static_cast<T*>(ptr);

    T value;
    std::memcpy(&value, source, sizeof(T));

    // Step past the element that was just consumed.
    ptr = source + 1;
    return value;
}
// Copies `result` into the buffer at `ptr` and advances `ptr` by sizeof(T)
// bytes so the next write lands directly after this one.
template<typename T>
void Write(void* &ptr, T result) noexcept
{
    T* const slot = static_cast<T*>(ptr);
    std::memcpy(slot, &result, sizeof(T));

    // Step past the element that was just stored.
    ptr = slot + 1;
}
// Calls Write(ptr, arg) once for each argument, left to right, using a
// comma fold expression. `ptr` is taken by reference and forwarded to Write()
// by name, so whatever Write() does to the pointer is visible to the caller.
// Always returns 0.
template<typename... Args>
auto VariadicWrite(void*& ptr, Args&&... args)
{
(Write(ptr, args), ...); //unpack Args
return 0;
}
// Demo: writes three ints into a zeroed buffer and then reads three ints back.
int main()
{
// 1024-byte scratch buffer, zero-filled. NOTE(review): the malloc result is not checked.
void* ptr = malloc(1024);
memset(ptr, 0, 1024);
// Remember the start of the buffer; ptr is passed by reference below and free() needs the original address.
void* temp = ptr;
VariadicWrite(ptr, 1, 2, 3);
// NOTE(review): ptr is not reset to temp before these reads, so they continue
// from wherever the writes left the pointer.
std::cout<<Read<int>(ptr)<<"\n";
std::cout<<Read<int>(ptr)<<"\n";
std::cout<<Read<int>(ptr)<<"\n";
free(temp);
return 0;
}
The problem here is that the code prints out 0, 0, 0, 0 if I use: void*& ptr.
If I do void* ptr it prints out 0, 1, 2, 3 but the ptr pointer is never incremented.
How can I modify the ptr pointer of VariadicWrite? I thought that void*& would have worked but in this case it doesn't :S
The problem is your VariadicWrite() call will modify ptr to point to the end of your written data. Then you call Read() without resetting the pointer back to the start, so you read zeros from the uninitialized portion of your buffer that follows the data already written.
Insert ptr = temp; between the write and read and see if that fixes it.
The reason void* ptr does not work is that each call to Write(ptr, ...) will increment the local copy of the argument in the scope of the Write() function. The variable ptr in VariadicWrite() does not change after a call to Write() so the next call will use the same value.
If you change to VariadicWrite(void* ptr, …) and Write(void*& ptr, …), you might get the behavior you want. But I would suggest this is a bad idea.
As we can see from the bug in your example, knowing if the function will modify the pass-by-reference parameter or not is of critical importance, yet not readily apparent from the code using the function. This tends to invite bugs just like the one you have created here. An inconsistent interface, where VariadicWrite() does not modify its argument but Write() does, will only make it doubly hard to avoid this kind of bug.
Generally, it's better to avoid non-const references because they often result in bugs like this. I suggest returning the new pointer instead of modifying the argument.
// Copies `arg` into the buffer at `ptr` and returns the address immediately
// after the stored value, ready to be passed to the next Write() call.
// The caller's pointer is not modified; the new position is the return value.
template<typename T>
void* Write(void* ptr, const T& arg)
{
    // Store the value first; without this copy the function would only
    // advance the pointer and leave the buffer untouched.
    std::memcpy(ptr, &arg, sizeof(T));
    return static_cast<T*>(ptr) + 1;
}
// Writes every argument in order by calling Write() once per argument, feeding
// the pointer returned by each call into the next one. Returns the pointer
// returned by the last Write(), or `ptr` unchanged when there are no arguments.
template<typename... Args>
void* WriteV(void* ptr, Args&&... args)
{
    void* cursor = ptr;
    ((cursor = Write(cursor, args)), ...);
    return cursor;
}
I am trying to create a new tensorflow GPU op following the instructions on their website.
Looking at their example, it seems they feed a C++ pointer directly into the CUDA kernel without allocating device memory and copying the contents of the host pointer to the device pointer.
From what I understand of CUDA you always have to allocate memory on the device and then use device pointers inside the kernels.
What am I missing? I checked that input_tensor.flat<T>().data() should return a regular C++ pointer. Here is a copy of the code I am referring to:
// kernel_example.cu.cc
#ifdef GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "example.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
using namespace tensorflow;
using GPUDevice = Eigen::GpuDevice;
// Define the CUDA kernel.
// Writes out[i] = 2 * in[i] for every i in [0, size). Each thread starts at its
// global index and then advances by the total number of launched threads, so
// the whole range is covered whatever the launch configuration is.
template <typename T>
__global__ void ExampleCudaKernel(const int size, const T* in, T* out) {
  // Total number of threads in the launch.
  const unsigned int stride = blockDim.x * gridDim.x;
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  while (i < size) {
    // NOTE(review): ldg() is not defined in this file; presumably a load helper
    // from the included cuda_kernel_helper.h - confirm.
    out[i] = 2 * ldg(in + i);
    i += stride;
  }
}
// Define the GPU implementation that launches the CUDA kernel.
// The kernel is launched on the stream returned by d.stream() with no dynamic
// shared memory; `in` and `out` are passed to the kernel unchanged.
template <typename T>
void ExampleFunctor<GPUDevice, T>::operator()(
    const GPUDevice& d, int size, const T* in, T* out) {
  // Launch the cuda kernel.
  //
  // See core/util/cuda_kernel_helper.h for example of computing
  // block count and thread_per_block count.
  constexpr int kBlockCount = 1024;
  constexpr int kThreadsPerBlock = 20;
  ExampleCudaKernel<T>
      <<<kBlockCount, kThreadsPerBlock, 0, d.stream()>>>(size, in, out);
}
// Explicitly instantiate functors for the types of OpKernels registered.
template struct ExampleFunctor<GPUDevice, float>;
template struct ExampleFunctor<GPUDevice, int32>;
#endif // GOOGLE_CUDA
If you look at the code on https://www.tensorflow.org/extend/adding_an_op, you will see that the allocation is done in kernel_example.cc:
// Op entry point: reads input 0, requests output 0 with the same shape as the
// input, then invokes ExampleFunctor on the raw element pointers of both tensors.
// NOTE(review): OP_REQUIRES_OK / OP_REQUIRES are defined outside this file;
// presumably they record an error on `context` and return early - confirm.
void Compute(OpKernelContext* context) override {
// Grab the input tensor
const Tensor& input_tensor = context->input(0);
// Create an output tensor
// output_tensor is filled in by allocate_output through the pointer passed below.
Tensor* output_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
&output_tensor));
// Do the computation.
// The element count is narrowed to int for the functor call, so reject
// tensors whose element count does not fit.
OP_REQUIRES(context, input_tensor.NumElements() <= tensorflow::kint32max,
errors::InvalidArgument("Too many elements in tensor"));
ExampleFunctor<Device, T>()(
context->eigen_device<Device>(),
static_cast<int>(input_tensor.NumElements()),
input_tensor.flat<T>().data(),
output_tensor->flat<T>().data());
}
In context->allocate_output(....) they hand over the address of the output Tensor pointer, and the tensor is then allocated. The context knows whether it is running on GPU or CPU and allocates the tensor on the device or on the host accordingly. The pointer handed over to CUDA then simply points to the actual data within the Tensor class.
I'm writing a program for a micro controller where I need to allocate the data of a vector to a specific location in memory (store it in Flash).
#include <iostream>
#include <vector>
// Plain aggregate of two ints.
// NOTE(review): presumably the mean and standard deviation of a Gaussian - confirm units/scaling.
struct struct_gauss {
int mean;
int sigma;
};
// Returns a std::vector<struct_gauss> whose internal pointers have been
// overwritten so that it appears to hold `size` elements starting at `address`,
// and stores the resulting end address in `endAddress`.
//
// NOTE(review): this writes directly over the bytes of the vector object and
// assumes its representation is three consecutive pointers. Nothing in this
// file guarantees that layout, and no storage is allocated for the elements
// here - `address` must already refer to enough memory.
std::vector<struct_gauss> makeVector(size_t size, void* address, void* &endAddress) {
std::vector<struct_gauss> dummy;
struct_gauss **dummyPointer = (struct_gauss **) &dummy; // Address of the vector object's internal state (assumed to consist of 3 pointers)
*dummyPointer = (struct_gauss *) address; // Point data of dummy to requested address
// Second slot: `size` elements past the first slot's value.
*(dummyPointer+1) = ((*dummyPointer)+size);
// Third slot: same value as the second slot.
*(dummyPointer+2) = *(dummyPointer+1);
endAddress = (void*) &(*dummy.end());
return dummy;
}
// Demo: lays two 10-element vectors back to back starting at dummyPointer1,
// writes one element in each, and prints them.
int main()
{
// NOTE(review): only 1 byte is allocated, yet 2 x 10 struct_gauss are laid over it below.
void* dummyPointer1 = malloc(1);
void* dummyPointer2;
// vector1 starts at dummyPointer1; dummyPointer2 receives vector1's end address.
auto vector1 = makeVector(10, (void*) dummyPointer1, dummyPointer2);
// vector2 starts where vector1 ended; dummyPointer2 is then overwritten with vector2's end address.
auto vector2 = makeVector(10, (void*) dummyPointer2, dummyPointer2);
vector1[9].mean = 10;
vector2[0].mean = 5;
std::cout<<"address of vector2 begin = "<< &(*vector2.begin())<<std::endl;
std::cout<<"vector1[9].mean = "<<vector1[9].mean<<"; vector2[0].mean = "<<vector2[0].mean<<std::endl;
return 0;
}
This creates 2 vectors of 10 values in size that are back to back starting from pointer dummyPointer1. However when running this I get the following error:
address of vector2 begin = 0xf42c70
vector1[9].mean = 10; vector2[0].mean = 5
*** Error in `/home/a.out': munmap_chunk(): invalid pointer: 0x0000000000f42c70 ***
Aborted
The program aborts when exiting the main() function because it can't deallocate vector2.
Why is that? How can I fix this?
Is there a better way?
P.S. I can keep the pointers of the vectors (the meta-values) in RAM but also write them to flash if that is better. Flash is used because I'm RAM limited and these vectors are written only when loading a new model from externally.
There are a number of things that look horrible, and I apologize for the bluntness, but I really do mean 'horrible' in this context.
// Quoted from the question: overwrites the internal pointers of a default-
// constructed vector so that it appears to hold `size` elements at `address`.
// NOTE(review): relies on the vector object being laid out as three
// consecutive pointers, which nothing in this file guarantees.
std::vector<struct_gauss> makeVector(size_t size, void* address, void* &endAddress) {
std::vector<struct_gauss> dummy;
struct_gauss **dummyPointer = (struct_gauss **) &dummy; // Address of the vector object's internal state (assumed to consist of 3 pointers)
*dummyPointer = (struct_gauss *) address; // Point data of dummy to requested address
*(dummyPointer+1) = ((*dummyPointer)+size);
*(dummyPointer+2) = *(dummyPointer+1);
endAddress = (void*) &(*dummy.end());
return dummy;
}
Nothing about this code looks safe or advisable. There seems to be a number of assumptions being made about how the std::vector object is constructed in memory, and even if your assumptions are correct, you've still failed to properly construct the std::vector object in a standard-compliant manner.
int main()
{
void* dummyPointer1 = malloc(1);
void* dummyPointer2;
auto vector1 = makeVector(10, (void*) dummyPointer1, dummyPointer2);
auto vector2 = makeVector(10, (void*) dummyPointer2, dummyPointer2);
This is just flat-out wrong. You're allocating a single byte to store the vector inside, expecting the object to store 10 bytes, and nowhere have you actually allocated the space for the underlying array.
In short, I think you've hit an XY problem, full stop. The problem you're actually trying to solve is "how do I make std::vector allocate memory from flash memory?"
The way you normally solve these kinds of problems is with custom allocators.
// Minimal allocator that makes standard containers obtain their storage from
// the micro-controller's flash allocation API instead of the default heap.
// The allocator holds no state, so every instance compares equal to every other.
// Requires <cstddef>, <memory> (std::addressof) and <new> (std::bad_alloc).
template<typename T>
struct allocator {
    typedef size_t size_type;
    typedef ptrdiff_t difference_type;
    typedef T * pointer;
    // Pointer-to-const, not reference-to-const: address(const_reference) below
    // returns this type from std::addressof.
    typedef T const * const_pointer;
    typedef T& reference;
    typedef T const& const_reference;
    typedef T value_type;

    // Returns uninitialized storage for `size` objects of type T.
    // Throws std::bad_alloc if the request overflows or the API returns null.
    pointer allocate(size_t size) {
        // Containers pass an element count; reject counts whose byte size
        // would wrap around before multiplying.
        if (size > static_cast<size_t>(-1) / sizeof(T)) throw std::bad_alloc();
        // NOTE(review): assumes allocate_flash_memory takes a byte count - confirm
        // against the real API.
        void * mem = micro_controller_api::allocate_flash_memory(size * sizeof(T)); //I don't know what your API looks like
        //You'll need something else if you're not able to throw exceptions in your code.
        if(!mem) throw std::bad_alloc();
        //On its own, this would be unsafe, but std::vector uses placement new with its memory,
        //so you don't need to worry that the cast here would risk some undefined behavior.
        return static_cast<pointer>(mem);
    }

    // Hands storage previously obtained from allocate() back to the flash API.
    void deallocate(pointer p, size_t) noexcept { micro_controller_api::free_flash_memory(static_cast<void*>(p)); }

    allocator() = default;

    // Converting constructor so containers can rebind to another value type;
    // there is no state to copy.
    template<typename U>
    allocator(allocator<U> const&) noexcept {}

    // Qualified std::addressof: an unqualified call is only found when T
    // happens to live in namespace std.
    pointer address(reference r) const {return std::addressof(r);}
    const_pointer address(const_reference r) const {return std::addressof(r);}

    bool operator==(allocator const&) const {return true;} //All allocators are the same
    bool operator!=(allocator const&) const {return false;}
};
// Usage example: two vectors of 10 struct_gauss each, both using the custom
// allocator for their element storage.
int main() {
std::vector<struct_gauss, allocator<struct_gauss>> vector1(10);
std::vector<struct_gauss, allocator<struct_gauss>> vector2(10);
//Do whatever; vector1 and vector2 are, as far as anyone is concerned, perfectly valid vectors
}
Mind you, I've not written allocators myself before, and the one I'm providing here is based on a template shown on a C++ reference page, (Edit: I've also added some stuff by referencing this page) so it's possible I've made mistakes. But hopefully all of this is enough to solve your issue.
I have a macro:
// Passes a pointer-valued expression to checkPointer together with the source
// file and line of the call site, and evaluates to checkPointer's result.
// Deliberately has no trailing semicolon so it can be used inside a larger
// expression (function argument, comparison, initializer, ...).
#define checkAlloc(ans) checkPointer((ans), __FILE__, __LINE__)
which is used to wrap around any pointer allocation to check it is valid (used for checking memory allocations on a GPU device side).
The macro is used as follows:
SomeObject* myObj = checkAlloc(new SomeObject());
and the checkPointer function is implemented as:
// Device-side allocation check: returns `pointer` unchanged. The null branch
// is the hook for error logging using `file` and `line`.
inline __device__ SomeObject* checkPointer(SomeObject* pointer, char *file, int line)
{
    const bool allocationFailed = (pointer == nullptr);
    if (allocationFailed)
    {
        // do error logging
    }
    return pointer;
}
Now, it is very inconvenient to create a new version of the function for each type of object I might allocate to. Templates are also not an option since I would like syntax to be clear - i.e. just putting checkAlloc(…) around each allocation rather than checkAlloc<SomeObject>(…) which is ugly and hopefully unnecessary.
Ideally I would like to change the checkPointer to be:
inline __device__ auto checkPointer(auto pointer, char *file, int line)
but I understand auto cannot be used for a function parameter yet. The GPU code supports C++14 so lambdas could be used as a potential workaround from what I can read at https://stackoverflow.com/a/29945034/7283981 but I am not familiar enough with lambdas to know how to do this. Any ideas?
This is the perfect case for a template, I don't understand why you are trying to avoid it:
// Device-side allocation check for any pointee type: returns `pointer`
// unchanged. T is deduced from the argument, so call sites never spell it out.
// The null branch is the hook for error logging using `file` and `line`.
template < typename T >
inline __device__ T* checkPointer(T* pointer, const char *file, int line)
{
    if (!pointer)
    {
        // do error logging
    }
    return pointer;
}
This is very clear and clean and you can use it as if there was an auto there:
// Usage example: the template argument of checkPointer is deduced from the
// pointer expression, so the call site needs no explicit type.
// NOTE(review): checkPointer is declared __device__ only while main is host
// code - confirm where this call is really meant to live.
int main(int argc, char** argv)
{
SomeObject* myObj = checkAlloc(new SomeObject());
}
As you can see, there are automatic argument deduction capabilities that don't even require any type specification...
Oh, and notice that I changed char * file to const char * file as C++11 doesn't allow conversion of literals to char *
I would like to implement a device side vector class which encapsulates a pointer to the elements of the container.
After I instantiate an object of this class I have no access to the inside pointer. It always says 'Access violation writing location some device memory address'.
My code is the following:
#include <iostream>
#include <cuda_runtime.h>
// Attempted device-side vector: the object itself is placed in device memory
// by the class-specific operator new, and the constructor (host code) then
// tries to fill in its members.
template <typename T>
class DeviceVector
{
private:
T* m_bValues;
std::size_t m_bSize;
public:
// Allocates the storage for the object with cudaMalloc, so `this` refers to
// device memory inside the constructor and destructor below.
__host__
void* operator new(std::size_t size)
{
DeviceVector<T>* object = nullptr;
cudaMalloc((void**)&object, size);
return object;
}
__host__
void operator delete(void* object)
{
cudaFree(object);
}
__host__
DeviceVector(std::size_t size = 1)
{
// Members live in device memory, so the size is copied host -> device.
cudaMemcpy(&m_bSize, &size, sizeof(std::size_t), cudaMemcpyHostToDevice);
// At this cudaMalloc I get Access violation writing location...
// NOTE(review): &m_bValues is an address inside the cudaMalloc'ed object, and
// cudaMalloc is asked to store its result there from host code.
cudaMalloc((void**)&m_bValues, size * sizeof(T));
// It's an alternative solution here
// Allocate into a host-side temporary, then copy the pointer value itself to the member.
T* ptr;
cudaMalloc((void**)&ptr, size * sizeof(T));
cudaMemcpy(&m_bValues, &ptr, sizeof(T*), cudaMemcpyHostToDevice);
// The memory is allocated
// But I can't access it through m_bValues pointer
// It is also Access violation writing location...
}
__host__
~DeviceVector()
{
// Access violation here if I use the second solution in the constructor
// NOTE(review): m_bValues is read here from host code although the object was placed with cudaMalloc.
cudaFree(m_bValues);
}
};
// Creates one DeviceVector<int> with the default size through the class's own
// operator new, then destroys it through the class's operator delete.
int main()
{
DeviceVector<int>* vec = new DeviceVector<int>();
delete vec;
return 0;
}
Note:
I have access to the size attribute.
So my questions are:
How to allocate memory for this class to get access to the pointer inside?
Is this even possible to encapsulate a pointer into a class on the device?
This line is illegal:
cudaMalloc((void**)&m_bValues, size * sizeof(T));
because your new operator allocated the object on the device:
cudaMalloc((void**)&object, size);
return object;
and the constructor was called to operate on that allocation. Therefore &m_bValues is taking the address of a device variable in host code which is illegal in CUDA. If you do that, and then attempt to use it in host code (i.e. the cudaMalloc operation), you're going to get a seg fault. cudaMalloc creates a device allocation of a particular size, and then stores the device pointer to that allocation in a variable that is expected to be resident on the host. If you pass it a device address to store that pointer into instead, cudaMalloc will segfault trying to write the pointer value.
Your alternative solution is a somewhat better approach, and is the general idea when it's necessary to copy a pointer to a device allocation to a variable resident on the device.
But you've still basically made the allocation that m_bValues points to inaccessible from the host. (ptr, being a temporary variable, won't help, and creating another variable in the class to hold a value like ptr won't help either because the entire class is allocated and resident on the device.) For the same reason that you're not allowed to use &m_bValues in the previous cudaMalloc operation, you won't be able to use it directly in any other host code (except as the target for cudaMemcpy host->device when copying the pointer value itself).
I don't think there are any simple fixes for this. I suggest re-crafting the object to live on the host, and provide appropriate host- and device-side allocations for corresponding pointers and parameters (like size).
It also seems like you're re-inventing the wheel. You might want to investigate thrust device vectors (which are easily usable with ordinary CUDA code.)
Anyway, this was the closest I could come up with:
#include <cstdlib>
#include <iostream>
#include <new>
#include <stdio.h>
#include <cuda_runtime.h>
// Checks the CUDA runtime's last-error state. If it is not cudaSuccess, prints
// `msg`, the CUDA error string and the call site's file/line to stderr, then
// terminates the process with exit code 1.
// cudaGetLastError() also resets the stored error, so each use reports only
// errors raised since the previous check.
// The local is named err_ because identifiers containing a double underscore
// are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t err_ = cudaGetLastError(); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Host-resident owner of a device array of T.
//
// The object itself lives in ordinary host memory; only the element storage
// that m_bValues points to is allocated on the device. The constructor
// allocates and zero-fills that storage, the destructor releases it.
// Any CUDA failure is reported through cudaCheckErrors, which terminates the
// process.
template <typename T>
class DeviceVector
{
private:
    T* m_bValues;          // device pointer to the element storage
    std::size_t m_bSize;   // number of elements
    std::size_t eleSize;   // sizeof(T), in bytes

public:
    // Copying would leave two objects calling cudaFree on the same device
    // allocation, so a copy could never be destroyed cleanly. Forbid it.
    DeviceVector(const DeviceVector&) = delete;
    DeviceVector& operator=(const DeviceVector&) = delete;

    // Allocates host storage for the object.
    // `size` is already a byte count, so it is passed to malloc unchanged.
    // Throws std::bad_alloc on failure: a throwing operator new must not
    // return null, otherwise the constructor would run on a null `this`.
    __host__
    void* operator new(std::size_t size)
    {
        void* object = std::malloc(size);
        if (object == NULL)
        {
            throw std::bad_alloc();
        }
        return object;
    }

    __host__
    void operator delete(void* object)
    {
        std::free(object);
    }

    // Allocates device storage for `size` elements and sets every byte to zero.
    __host__
    DeviceVector(std::size_t size = 1)
        : m_bValues(NULL), m_bSize(size), eleSize(sizeof(T))
    {
        cudaMalloc(&m_bValues, m_bSize*sizeof(T));
        cudaCheckErrors("constructor cudaMalloc fail");
        cudaMemset(m_bValues, 0, m_bSize*sizeof(T));
        cudaCheckErrors("constructor cudaMemset fail");
    }

    // Releases the device storage.
    __host__
    ~DeviceVector()
    {
        cudaFree(m_bValues);
        cudaCheckErrors("destructor cudaFree fail");
    }

    // Device pointer to the first element; must not be dereferenced on the host.
    __host__
    T* getDevPtr() const {
        return m_bValues;}

    // Number of elements.
    __host__
    std::size_t getSize() const {
        return m_bSize;}

    // Size of one element in bytes.
    __host__
    std::size_t geteleSize() const {
        return eleSize;}
};
// Demo: creates a default-sized DeviceVector<int>, sets every byte of its
// device storage to 0xFF from the host, checks for CUDA errors, and destroys it.
int main()
{
    DeviceVector<int>* deviceInts = new DeviceVector<int>();

    // Total size of the device storage in bytes: element count * element size.
    const std::size_t byteCount = deviceInts->getSize() * deviceInts->geteleSize();
    int* const devicePtr = deviceInts->getDevPtr();

    cudaMemset(devicePtr, 0xFF, byteCount);
    cudaCheckErrors("vector fill fail");

    delete deviceInts;
    return 0;
}
You've shown very little about how you want to interact with an object of this class, so I'm just guessing here.