Use data allocated dynamically in CUDA kernel on host - c++

I am trying to build a container class on the device which manages some memory.
This memory is allocated dynamically and filled during object construction in the kernel.
According to the documentation, this can be done with a simple new[] in the kernel (using CUDA 8.0 with compute capability 5.0 in Visual Studio 2012).
Afterwards I want to access the data inside the containers in host code (e.g. for testing if all values are correct).
A minimal version of the DeviceContainer class looks like this:
class DeviceContainer
{
public:
    __device__ DeviceContainer(unsigned int size);
    __host__ __device__ ~DeviceContainer();
    __host__ __device__ DeviceContainer(const DeviceContainer & other);
    __host__ __device__ DeviceContainer & operator=(const DeviceContainer & other);

    __host__ __device__ unsigned int getSize() const { return m_sizeData; }
    __device__ int * getDataDevice() const { return mp_dev_data; }
    __host__ int * getDataHost() const;

private:
    int * mp_dev_data;
    unsigned int m_sizeData;
};
__device__ DeviceContainer::DeviceContainer(unsigned int size) :
    m_sizeData(size), mp_dev_data(nullptr)
{
    mp_dev_data = new int[m_sizeData];
    for(unsigned int i = 0; i < m_sizeData; ++i) {
        mp_dev_data[i] = i;
    }
}

__host__ __device__ DeviceContainer::DeviceContainer(const DeviceContainer & other) :
    m_sizeData(other.m_sizeData)
{
#ifndef __CUDA_ARCH__
    cudaSafeCall( cudaMalloc((void**)&mp_dev_data, m_sizeData * sizeof(int)) );
    cudaSafeCall( cudaMemcpy(mp_dev_data, other.mp_dev_data, m_sizeData * sizeof(int), cudaMemcpyDeviceToDevice) );
#else
    mp_dev_data = new int[m_sizeData];
    memcpy(mp_dev_data, other.mp_dev_data, m_sizeData * sizeof(int));
#endif
}

__host__ __device__ DeviceContainer::~DeviceContainer()
{
#ifndef __CUDA_ARCH__
    cudaSafeCall( cudaFree(mp_dev_data) );
#else
    delete[] mp_dev_data;
#endif
    mp_dev_data = nullptr;
}

__host__ __device__ DeviceContainer & DeviceContainer::operator=(const DeviceContainer & other)
{
    m_sizeData = other.m_sizeData;
#ifndef __CUDA_ARCH__
    cudaSafeCall( cudaMalloc((void**)&mp_dev_data, m_sizeData * sizeof(int)) );
    cudaSafeCall( cudaMemcpy(mp_dev_data, other.mp_dev_data, m_sizeData * sizeof(int), cudaMemcpyDeviceToDevice) );
#else
    mp_dev_data = new int[m_sizeData];
    memcpy(mp_dev_data, other.mp_dev_data, m_sizeData * sizeof(int));
#endif
    return *this;
}

__host__ int* DeviceContainer::getDataHost() const
{
    int * pDataHost = new int[m_sizeData];
    cudaSafeCall( cudaMemcpy(pDataHost, mp_dev_data, m_sizeData * sizeof(int), cudaMemcpyDeviceToHost) );
    return pDataHost;
}
It just manages the array mp_dev_data.
The array is created and filled with consecutive values during construction, which should only be possible on the device. (Note that in reality the size of the containers might be different from each other.)
I think I need to provide a copy constructor and an assignment operator since I don't know any other way to fill the array in the kernel. (See question No. 3 below.)
Since copying and deletion can also happen on the host, __CUDA_ARCH__ is used to determine which execution path we're compiling for. On the host cudaMemcpy and cudaFree are used; on the device we can just use memcpy and delete[].
The kernel for object creation is rather simple:
__global__ void createContainer(DeviceContainer * pContainer, unsigned int numContainer, unsigned int containerSize)
{
    unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
    if(offset < numContainer)
    {
        pContainer[offset] = DeviceContainer(containerSize);
    }
}
Each thread in a one-dimensional grid that is in range creates a single container object.
The main function then allocates arrays for the containers (90000 in this case) on the device and host, calls the kernel, and attempts to use the objects:
int main()
{
    const unsigned int numContainer = 90000;
    const unsigned int containerSize = 5;

    DeviceContainer * pDevContainer;
    cudaSafeCall( cudaMalloc((void**)&pDevContainer, numContainer * sizeof(DeviceContainer)) );

    dim3 blockSize(1024, 1, 1);
    dim3 gridSize((numContainer + blockSize.x - 1) / blockSize.x, 1, 1);

    createContainer<<<gridSize, blockSize>>>(pDevContainer, numContainer, containerSize);
    cudaCheckError();

    DeviceContainer * pHostContainer = (DeviceContainer *)malloc(numContainer * sizeof(DeviceContainer));
    cudaSafeCall( cudaMemcpy(pHostContainer, pDevContainer, numContainer * sizeof(DeviceContainer), cudaMemcpyDeviceToHost) );

    for(unsigned int i = 0; i < numContainer; ++i)
    {
        const DeviceContainer & dc = pHostContainer[i];
        int * pData = dc.getDataHost();
        for(unsigned int j = 0; j < dc.getSize(); ++j)
        {
            std::cout << pData[j];
        }
        std::cout << std::endl;
        delete[] pData;
    }

    free(pHostContainer);
    cudaSafeCall( cudaFree(pDevContainer) );
    return 0;
}
I have to use malloc for the array creation on the host, since I don't want to have a default constructor for DeviceContainer.
I try to access the data inside a container via getDataHost() which internally just calls cudaMemcpy.
cudaSafeCall and cudaCheckError are simple macros that evaluate the cudaError returned by the function or actively poll the last error. For the sake of completeness:
#define cudaSafeCall(error) __cudaSafeCall(error, __FILE__, __LINE__)
#define cudaCheckError() __cudaCheckError(__FILE__, __LINE__)

inline void __cudaSafeCall(cudaError error, const char *file, const int line)
{
    if (error != cudaSuccess)
    {
        std::cerr << "cudaSafeCall() returned:" << std::endl;
        std::cerr << "\tFile: " << file << ",\nLine: " << line << " - CudaError " << error << ":" << std::endl;
        std::cerr << "\t" << cudaGetErrorString(error) << std::endl;
        system("PAUSE");
        exit( -1 );
    }
}

inline void __cudaCheckError(const char *file, const int line)
{
    cudaError error = cudaDeviceSynchronize();
    if (error != cudaSuccess)
    {
        std::cerr << "cudaCheckError() returned:" << std::endl;
        std::cerr << "\tFile: " << file << ",\tLine: " << line << " - CudaError " << error << ":" << std::endl;
        std::cerr << "\t" << cudaGetErrorString(error) << std::endl;
        system("PAUSE");
        exit( -1 );
    }
}
I have 3 problems with this code:
1. If it is executed as presented here, I receive an "unspecified launch failure" of the kernel. The Nsight debugger stops me on the line mp_dev_data = new int[m_sizeData]; (either in the constructor or the assignment operator) and reports several access violations on global memory. The number of violations appears to be random, between 4 and 11, and they occur in non-consecutive threads but always near the upper end of the grid (blocks 85 and 86).
2. If I reduce numContainer to 10, the kernel runs smoothly; however, the cudaMemcpy in getDataHost() fails with an invalid argument error, even though mp_dev_data is not 0. (I suspect that the assignment is faulty and the memory has already been deleted by another object.)
3. Even though I would like to know how to correctly implement the DeviceContainer with proper memory management, in my case it would also be sufficient to make it non-copyable and non-assignable. However, I don't know how to properly fill the container array in the kernel then. Maybe something like

DeviceContainer dc(5);
memcpy(&pContainer[offset], &dc, sizeof(DeviceContainer));

But that would lead to problems with deleting mp_dev_data in the destructor: I would need to manage memory deletion manually, which feels rather dirty.
I also tried to use malloc and free in the kernel code instead of new and delete, but the results were the same.
I am sorry that I wasn't able to frame my question in a shorter manner.
TL;DR: How do I implement a class that dynamically allocates memory in a kernel and can also be used in host code? How can I initialize an array in a kernel with objects that cannot be copied or assigned?
Any help is appreciated. Thank You.

Apparently the answer is: what I am trying to do is more or less impossible.
Memory allocated with new or malloc in a kernel is not placed in global memory but in a separate device heap that is inaccessible from the host.
The only way to access all of the data on the host is to first allocate an array in global memory that is big enough to hold all the elements on the heap and then write a kernel that copies the elements from the heap into global memory.
The access violations are caused by the limited heap size, which can be changed with cudaDeviceSetLimit(cudaLimitMallocHeapSize, size).
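A minimal sketch of that workaround, using the DeviceContainer from above (the kernel name gatherContainer and the staging buffer pGlobalData are my own; all containers are assumed to hold containerSize ints):

// Each thread copies one container's heap allocation into an ordinary
// cudaMalloc'ed buffer that the host can then read back with cudaMemcpy.
__global__ void gatherContainer(const DeviceContainer * pContainer,
                                unsigned int numContainer,
                                unsigned int containerSize,
                                int * pGlobalData)
{
    unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
    if(offset < numContainer)
    {
        const int * src = pContainer[offset].getDataDevice();
        for(unsigned int j = 0; j < containerSize; ++j)
            pGlobalData[offset * containerSize + j] = src[j];
    }
}

// Host side: enlarge the device heap before the allocating kernel runs,
// then gather into global memory and copy back as usual.
cudaSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024) );
// ... createContainer<<<gridSize, blockSize>>>(...) as above ...
int * pGlobalData;
cudaSafeCall( cudaMalloc((void**)&pGlobalData, numContainer * containerSize * sizeof(int)) );
gatherContainer<<<gridSize, blockSize>>>(pDevContainer, numContainer, containerSize, pGlobalData);
cudaCheckError();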

Related

Wrong pixel values when using padded local buffer OpenCL

I'm facing an unexpected result when I use a local buffer to copy data in an OpenCL kernel. The code presented here is quite simple (and useless, since I don't need a local buffer for such an operation), but it is a first step towards convolution-like processing.
Here is my code:
std::string implementCopyFromLocalKernel()
{
    return BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void copyFromLocal_knl(__global const float* in,
                                        const ulong sizeX, const ulong sizeY,
                                        const int filterRadiusX, const int filterRadiusY,
                                        __local float* localImage,
                                        const ulong localSizeX, const ulong localSizeY,
                                        __global float* out)
        {
            // Store each work-item's unique row and column
            const int x = get_global_id(0);
            const int y = get_global_id(1);

            // Group size
            int groupSizeX = get_local_size(0);
            int groupSizeY = get_local_size(1);

            // Determine the size of the work group output region
            int groupIdX = get_group_id(0);
            int groupIdY = get_group_id(1);

            // Determine the local ID of each work item
            int localX = get_local_id(0);
            int localY = get_local_id(1);

            // Padding
            int paddingX = filterRadiusX;
            int paddingY = filterRadiusY;

            // Cache the data to local memory:
            // copy the data for the current coordinates
            localImage[localX + localY*localSizeX] = in[x + y * sizeX];
            barrier(CLK_LOCAL_MEM_FENCE);

            out[x + y * sizeX] = localImage[localX + localY*localSizeX];
            return;
        }
    );
}
void copyLocalBuffer(const boost::compute::context& context, boost::compute::command_queue& queue, const boost::compute::buffer& bufIn, boost::compute::buffer& bufOut, const size_t sizeX, const size_t sizeY)
{
    const size_t nbPx = sizeX * sizeY;
    const size_t maxSize = (sizeX > sizeY ? sizeX : sizeY);

    // Prepare to launch the kernel
    std::string kernel_src = implementCopyFromLocalKernel();
    boost::compute::program program;
    try {
        program = boost::compute::program::create_with_source(kernel_src, context);
        program.build();
    }
    catch (const boost::compute::opencl_error& e) {
        std::cout << "Error building program from source : " << std::endl << e.what() << std::endl
                  << program.build_log() << std::endl;
        return;
    }

    boost::compute::kernel kernel;
    try {
        kernel = program.create_kernel("copyFromLocal_knl");
    }
    catch (const boost::compute::opencl_error& e) {
        std::cout << "Error creating kernel : " << std::endl << e.what() << std::endl;
        return;
    }

    try {
        int localSizeX = 16;
        int localSizeY = 16;
        int paddingPixelsX = 2; // 0; // <- Changing to 0 works
        int paddingPixelsY = paddingPixelsX;
        int localWidth = localSizeX + 2 * paddingPixelsX;
        int localHeight = localSizeY + 2 * paddingPixelsY;
        boost::compute::buffer localImage(context, localWidth*localHeight * sizeof(float));
        kernel.set_arg(0, bufIn);
        kernel.set_arg(1, sizeX);
        kernel.set_arg(2, sizeY);
        kernel.set_arg(3, paddingPixelsX);
        kernel.set_arg(4, paddingPixelsY);
        kernel.set_arg(5, localImage);
        kernel.set_arg(6, localWidth);
        kernel.set_arg(7, localHeight);
        kernel.set_arg(8, bufOut);
    }
    catch (const boost::compute::opencl_error& e) {
        std::cout << "Error setting kernel arguments: " << std::endl << e.what() << std::endl;
        return;
    }

    try {
        size_t origin[2] = { 0, 0 };
        size_t region[2] = { 256, 256 }; // { sizeX, sizeY };
        size_t localSize[2] = { 16, 16 };
        queue.enqueue_nd_range_kernel(kernel, 2, origin, region, localSize);
    }
    catch (const boost::compute::opencl_error& e) {
        std::cout << "Error executing kernel : " << std::endl << e.what() << std::endl;
        return;
    }
}
I reduced the code to simply copy the pixel corresponding to each work item into the associated local coordinate of the local image. Hence, the local image buffer must have 2*paddingPixelsX unused columns on each line and 2*paddingPixelsY unused lines.
It works if I don't add padding data (paddingPixelsX and paddingPixelsY = 0), but with padding it seems that some work items don't read the data from the input buffer, or don't write it into the output buffer (or the local buffer?) at the correct place. Moreover, when I run my program several times, I never get the same result.
This is an example of the result I get (right) for the mandrill image as input (left):
I ensure that the threads are synchronized with barrier(CLK_LOCAL_MEM_FENCE);, and each work item reads and writes a specific datum, so if my code is buggy, I don't understand why the version without padding gives no errors.
Does someone have an idea?
Thanks,
As already confirmed, the problem was that the dynamically allocated local buffer passed to the kernel was only created for one work group.
One solution is to create the local buffer statically inside the kernel, for example:

__local float localImage[16*16];

If the size of the buffer cannot be hard-coded, it can be set via the preprocessor:

__local float localImage[SIZE_X*SIZE_Y];

where these parameters are passed during the kernel build.
From what I remember, using build parameters to define the size of a static local buffer may not work on every GPU (compilation will fail).
I'm not familiar with Boost.Compute, but I presume something similar can be achieved by passing parameters to implementCopyFromLocalKernel(), which would then be converted into values during stringizing.
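For example, the build-time defines mentioned above might be passed like this with Boost.Compute (a sketch; program.build accepts an OpenCL options string, and SIZE_X/SIZE_Y survive BOOST_COMPUTE_STRINGIZE_SOURCE untouched because they are not defined on the host):

// Pass the local buffer dimensions as OpenCL build options so the kernel
// can declare: __local float localImage[SIZE_X*SIZE_Y];
std::string options = "-DSIZE_X=" + std::to_string(localSizeX) +
                      " -DSIZE_Y=" + std::to_string(localSizeY);
program.build(options);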
Thanks to @doqtor, I understood that the issue came from the buffer passed as a kernel parameter: because of that, all work groups used the same buffer.
Since I don't know the padding size I will need for convolution operations, I need this buffer as a parameter. I modified the kernel parametrization so that a different buffer is used by each work group:

kernel.set_arg(5, localWidth*localHeight*sizeof(float), NULL);

I had missed the important part when I read the documentation of clSetKernelArg:
If the argument is declared with the __local qualifier, the arg_value entry must be NULL.

Why does 'new' fail in class initialization

win7
gcc 6.4.0
cygwin 2.9.0
The following code fails in the function g_block during class initialization, but not when used from main. The failure is in the 'for' loop where I attempt to initialize the memory (initialization is a side issue here). In both cases the allocation seems successful, but when used in a class, I can't use the memory allocated.
# include <iostream>
# include <iomanip>

using namespace std;

typedef struct { // gsl allocation 'block' description
    size_t size;  // block byte size
    double* data; // pointer to the first byte of the block
} gsl_block;

typedef struct { // matrix definition
    size_t size1;     // number of rows
    size_t size2;     // number of columns
    size_t tda;       // number of elements in a row (stride between rows)
    double* data;     // pointer to matrix[0][0]
    gsl_block* block; // pointer to the gsl_matrix block
    int owner;        // 1: deallocation permitted
} gsl_matrix;

class X {
public:
    inline static gsl_matrix& g_matrix(size_t row, size_t col)
    { return g_matrix(row, col, g_block(row * col)); };

    static gsl_block& g_block(size_t size) {
        double* ptr = new double(size);
        cout << "size " << setw(5) << size << " addr range "
             << hex << setfill('0') << ptr << " - " << (ptr + size*sizeof(double))
             << dec << setfill(' ') << endl;
        for(size_t ndx = 0; ndx < size; ndx++) ptr[ndx] = 0.0;
        return * new gsl_block{size, ptr};
    };

    static gsl_matrix& g_matrix(size_t row, size_t col, gsl_block& block) {
        return * new gsl_matrix{row, col, col, block.data, &block, 0};
    }

    gsl_matrix& g_mat;
    X() : g_mat(g_matrix(92, 92)) {}
}; // class X

int main(int argc, char** argv) {
    gsl_matrix& mat = X::g_matrix(92, 92);
    X* x = new X();
    return 0;
}
double* ptr = new double(size);

This line creates a single double on the free store, initialized with the value size, and returns a pointer to it. (You presumably meant new double[size], which allocates an array of size doubles.)

for(size_t ndx = 0; ndx < size; ndx++) ptr[ndx] = 0.0;

This line then invokes undefined behavior by attempting to write to memory that your program does not own.
You should really use std::vector instead of raw pointers. As your program stands, it has significant potential to leak memory. If you made gsl_block::data a std::vector<double>, your classes would get proper copy and move semantics for free, and you wouldn't need to use new directly anywhere in your code.
EDIT:
Now that you've mentioned you're using the GNU Scientific Library, you should probably just use the functions the library provides for allocating and freeing matrices: gsl_matrix_alloc and gsl_matrix_free. I would rewrite your X class to just contain a std::unique_ptr with gsl_matrix_free as its deleter:
struct X
{
    struct free_matrix
    {
        void operator()(gsl_matrix* mat)
        {
            gsl_matrix_free(mat);
        }
    };

    std::unique_ptr<gsl_matrix, free_matrix> g_mat;

    X(std::size_t rows, std::size_t cols)
        : g_mat(gsl_matrix_alloc(rows, cols))
    {}
};
You could even go further and completely wrap gsl_matrix in a more C++-like interface, with member functions that call gsl_matrix_get/gsl_matrix_set or gsl_matrix_ptr to provide simple access to the matrix elements.
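Such a wrapper might look like this (a sketch building on the X above; gsl_matrix_get and gsl_matrix_set are the standard GSL accessors, while the Matrix name is illustrative):

// Thin element-access layer on top of the RAII wrapper.
struct Matrix : X
{
    using X::X; // inherit X(rows, cols)

    double get(std::size_t i, std::size_t j) const
    {
        return gsl_matrix_get(g_mat.get(), i, j);
    }
    void set(std::size_t i, std::size_t j, double value)
    {
        gsl_matrix_set(g_mat.get(), i, j, value);
    }
};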

Transferring an array pointer into CUDA memory via separate class

I have a class named "Coordinate" which consists of an int array pointer and a bool variable. I want to send this pointer into CUDA, modify it, and then use it back in CPU memory.
Here is Coordinate.h :
#ifndef __COORDINATE_H
#define __COORDINATE_H

#include <stdlib.h>
#include <cuda.h>

using namespace std;

class Coordinate {
public:
    int *array_pointer;
    bool flag;

    Coordinate() { flag = false; }

    Coordinate(int array_length) {
        flag = false;
        array_pointer = new int[array_length];
        for (int i = 0; i < array_length; i++) {
            array_pointer[i] = -1;
        }
    }
};
#endif
#endif
I have made two __global__ functions in cudamain.cu, check1 and check2, both of which take a Coordinate as argument. check1 changes only the boolean flag, while check2 changes the boolean flag and also modifies the array.
Here is cudamain.cu :
#include <iostream>
#include <cuda.h>
#include "Coordinate.h"

using namespace std;

__global__ void check1(Coordinate *c) {
    c->flag = true;
}

__global__ void check2(Coordinate *c) {
    c->flag = true;
    for (int i = 0; i < 10; i++) {
        c->array_pointer[i] = i;
    }
}

int main() {
    Coordinate *d_a, *d_b, a, b;
    a = Coordinate(10); b = Coordinate(10);

    size_t size = sizeof(Coordinate);
    cudaMalloc((void**)&d_a, size); cudaMalloc((void**)&d_b, size);
    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice); cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

    check1 <<<1, 1>>> (d_a);
    cudaMemcpy(&a, d_a, size, cudaMemcpyDeviceToHost);
    cout << "d_a result-> " << a.flag << " " << a.array_pointer[9] << endl;

    check2 <<<1, 1>>> (d_b);
    cudaMemcpy(&b, d_b, size, cudaMemcpyDeviceToHost);
    cout << "d_b result-> " << b.flag << " " << b.array_pointer[9] << endl;

    return 0;
}
I made two separate Coordinate objects, a and b; a goes with check1 and b with check2. Both a and b are initialized the same way.
The result I get is
d_a result-> 1 -1
d_b result-> 0 -1
Expected result:
d_a result-> 1 -1
d_b result-> 1 9
Different Coordinate objects may have different array lengths, so I can't use a fixed-size array in the Coordinate class.
You cannot access host memory from a CUDA kernel by dereferencing it, unless that piece of memory was specially allocated to allow this, e.g. using cudaMallocManaged(). So your program cannot work. Read this Parallel4All post on accessing the same memory from both the host and the device. Another alternative is the one @RobertCrovella linked to, involving allocating device-side memory.
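For reference, the managed-memory route might look like this (a sketch using the question's check2 kernel, assuming a device that supports unified memory; error checking omitted):

// Allocate both the Coordinate object and its array with cudaMallocManaged
// so host and device dereference the same pointers.
Coordinate * c;
cudaMallocManaged(&c, sizeof(Coordinate));
c->flag = false;
cudaMallocManaged(&c->array_pointer, 10 * sizeof(int));
for (int i = 0; i < 10; i++) c->array_pointer[i] = -1;

check2<<<1, 1>>>(c);
cudaDeviceSynchronize(); // required before the host touches managed memory again
cout << c->flag << " " << c->array_pointer[9] << endl; // expected: 1 9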
But, frankly, I doubt either of these two options is what you should go for in this case, since a class named Coordinate does not seem like something that needs a variable-size array of integers. Are you sure something like

template <unsigned NumDimensions>
class Coordinate {
    std::array<int, NumDimensions> a;
    // etc. etc.
};
won't do?
(Note that the std::array class itself cannot really be used in device code, like most of the standard library. But you can easily clone std::array and then use your cuda::array class on both the host and the device side.)
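A minimal clone might look like this (a sketch; the name cuda_array and the member set are illustrative):

// Fixed-size array usable on both host and device; a stripped-down
// stand-in for std::array.
template <typename T, unsigned N>
struct cuda_array {
    T data[N];

    __host__ __device__ T& operator[](unsigned i)             { return data[i]; }
    __host__ __device__ const T& operator[](unsigned i) const { return data[i]; }
    __host__ __device__ unsigned size() const                 { return N; }
};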
Even if dynamic allocation of memory is required for some reason, it is not a good idea to have a class that, it seems, will be instantiated many times allocate its own memory. Consider using a pre-allocated buffer and having your Coordinates just advance an offset into it (although this would require synchronization for thread safety, or making the buffer thread-local), as in the sketch below.
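A sketch of that idea (poolAlloc, pool, and poolOffset are illustrative names):

// Carve per-thread slices out of one pre-allocated global buffer instead of
// calling new/malloc per Coordinate. atomicAdd hands each caller a disjoint
// offset, so the allocation itself needs no further locking.
__device__ int * poolAlloc(int * pool, unsigned int * poolOffset, unsigned int n)
{
    unsigned int start = atomicAdd(poolOffset, n);
    return pool + start;
}

__global__ void initCoordinates(Coordinate * cs, int * pool,
                                unsigned int * poolOffset, unsigned int len)
{
    cs[threadIdx.x].array_pointer = poolAlloc(pool, poolOffset, len);
    for (unsigned int i = 0; i < len; ++i)
        cs[threadIdx.x].array_pointer[i] = -1;
}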

Vector push_back of one object results in a vector of enormous size

I'm working on a project in C++, and I have a vector of objects where I want to push_back an object onto the existing vector. However, when checking the size before and after the object is added, the size goes from 0 to 12297829382473034412, which puzzles me greatly. The code in question is the addCommodity function below. (I have created a smaller example of the same problem further down, so skip to "SMALL PROBLEM".)
void Instance::addCommodity(std::vector<std::string> & tokens) {
    /*if(tokens.size()!=5){
        std::cerr << "Error in commodity data format"<< std::endl;
        exit(-1);
    }*/
    // size_t so = std::atoi(tokens[1].c_str());
    // size_t si = std::atoi(tokens[2].c_str());
    // size_t demand = std::atoi(tokens[3].c_str());
    // size_t ti = std::atoi(tokens[4].c_str());
    std::cout << "size: " << this->_commodities->size() << "\n";
    this->_commodities->push_back(Commodity(1,2,3,4)); // ???
    std::cout << "size: " << this->_commodities->size() << "\n";
}
Here I have commented out the parts of the code that read data from a string which was loaded from a file. Commodity is defined as follows:
#include "commodity.h"
Commodity::Commodity(size_t so, size_t si, size_t d, size_t ti):
_source(so),
_sink(si),
_demand(d),
_maxTime(ti)
{}
Commodity::~Commodity(){}
size_t Commodity::getSource() const{
return _source;
}
size_t Commodity::getSink() const {
return _sink;
}
size_t Commodity::getDemand() const {
return _demand;
}
size_t Commodity::getTime() const {
return _maxTime;
}
Where Instance is initialised as:
Instance::Instance(std::shared_ptr<Param> p, size_t n):
    _params(p),
    _nNodes(n)
{
    this->_commodities.reset(new std::vector<Commodity>());
    this->_arcs.reset(new std::vector<Arc>());
}
As mentioned before, my issue lies in the addCommodity code when trying to push_back a Commodity. Hopefully this is enough code to identify any stupid mistakes I have made. I left out most of the other code for this project, as it doesn't seem to have an impact on the addCommodity function.
The output received when calling the function is:
size: 0
size: 12297829382473034412
SMALL PROBLEM
Instead of showing all the code, I have run the push_back on the vector in main:
#include <iostream>
#include <memory>
#include <sys/time.h>
#include <vector>

#include "commodity.h"

int main(int argc, char* argv[]) {
    std::shared_ptr<std::vector<Commodity>> commodities;
    commodities.reset(new std::vector<Commodity>());

    std::cout << "size: " << commodities->size() << "\n";

    size_t a = 1;
    size_t b = 2;
    size_t c = 3;
    size_t d = 4;
    commodities->emplace_back(Commodity(a,b,c,d));

    std::cout << "size: " << commodities->size() << std::endl;
    return 0;
}
This is basically a smaller instance of the same code. The commodity .cpp and .h files are as follows:
#include "commodity.h"
Commodity::Commodity(size_t so, size_t si, size_t d, size_t ti):
_source(so),
_sink(si),
_demand(d),
_maxTime(ti)
{}
Commodity::~Commodity(){}
size_t Commodity::getSource() const{
return _source;
}
size_t Commodity::getSink() const {
return _sink;
}
size_t Commodity::getDemand() const {
return _demand;
}
size_t Commodity::getTime() const {
return _maxTime;
}
The header file:
#ifndef CG_MCF_COMMODITY_H
#define CG_MCF_COMMODITY_H

#include <stdlib.h>

class Commodity {
public:
    Commodity(size_t so, size_t si, size_t d, size_t t);
    ~Commodity();

    size_t getSource() const;
    size_t getSink() const;
    size_t getDemand() const;
    size_t getTime() const;

private:
    size_t _source;
    size_t _sink;
    size_t _demand;
    size_t _maxTime;
};
#endif /*CG_MCF_COMMODITY_H*/
The output received when calling the function is:
size: 0
size: 12297829382473034412
Your Commodity class violates the rule of 0/3/5.
Your code (inexplicably) does this:
commodities->emplace_back(Commodity(a,b,c,d));

This is really strange. Presumably you're calling emplace_back to avoid having to construct a separate Commodity and copy it into the vector. But you force exactly that to happen by explicitly constructing a separate Commodity as the parameter to emplace_back.
That invokes Commodity's copy constructor to construct the Commodity in the vector as a copy of the one you explicitly created. Except Commodity doesn't have one. Most likely the real Commodity class needs one, since it has a destructor.
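Since Commodity holds only size_t members, the simplest fix is likely the rule of zero: drop the user-declared destructor and let the compiler generate all the special members (a sketch):

#include <stdlib.h>

class Commodity {
public:
    Commodity(size_t so, size_t si, size_t d, size_t t)
        : _source(so), _sink(si), _demand(d), _maxTime(t) {}
    // No user-declared destructor, copy, or move: the defaults are correct.

    size_t getSource() const { return _source; }
    size_t getSink()   const { return _sink; }
    size_t getDemand() const { return _demand; }
    size_t getTime()   const { return _maxTime; }

private:
    size_t _source;
    size_t _sink;
    size_t _demand;
    size_t _maxTime;
};

and construct the element in place instead of copying a temporary:

commodities->emplace_back(a, b, c, d);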

How to copy variables from a custom class array on host into a float array on device in CUDA

I am using CUDA. I have the following class on the host:
class Particle{
public:
    float x;
    float v;
    // several other variables
};
Then I have a vector of particles
vector <Particle> p_all(512);
On the GPU, I want to operate on an array of all the x's (taken from all the Particles), and want to copy the data from the Particles array into a float array on the device. I have a hunch that cudaMemcpy2D can be used, and I tried the following code, but it gives an invalid pitch error.
cudaMalloc( (void**) &pos_dev, sizeof(float)*512);
cudaMemcpy2D( (void*) &pos_dev, sizeof(float), (void*)&p_all[0].x, sizeof(Particle), sizeof(Particle), 512*sizeof(float), cudaMemcpyHostToDevice);
Is it at all possible to do so? Of course, the backup solution is to create an array of x's using a for loop and then copy it to the device. But I am looking for a more efficient solution.
Thanks.
FULL CODE BELOW.
#include <cuda_runtime.h>
#include <iostream>
#include <vector>

using namespace std;

// This will output the proper error string when calling cudaGetLastError
void getLastCudaError(string s=""){
    string errMessage = s;
    cudaError_t err = cudaGetLastError();
    if( err != cudaSuccess){
        cerr << __FILE__ << "(" << __LINE__ << ") : Last Cuda Error - " << errMessage
             << " (" << int(err) << "): " << cudaGetErrorString(err) << ".\n";
        exit(-1);
    }
}

class Particle{
public:
    float x;
    float v;
    int a;
    char c;
    short b;

    Particle(){
        a=1988; c='a'; v=5.56; x=1810; b=1.66;
    }
};

template <class T>
void printVec(vector <T> &v, string name = "v"){
    cout << name << " = ";
    for (int i=0; i<v.size(); ++i) cout << v[i] << " " ;
    cout << '\n';
}

int main(){
    const int N = 512;
    vector <float> pos(N,5);
    vector <Particle> p_all(N);

    float * pos_dev;
    float * vel_dev;
    cudaMalloc( (void**) &pos_dev, sizeof(float)*N);

    printVec(pos, "pos");

    cudaMemcpy2D( (void*) &pos_dev, sizeof(float), (void*)&(p_all[0].x), sizeof(Particle), sizeof(float), N, cudaMemcpyHostToDevice);
    getLastCudaError("HtoD");

    cudaMemcpy( (void*) &pos[0], (void*)&pos_dev, N*sizeof(float), cudaMemcpyDeviceToHost);
    getLastCudaError("DtoH");

    printVec(pos, "pos_new");

    return 0;
}
Your cudaMemcpy2D call is set up incorrectly. Check the documentation.
Try this instead:

cudaMemcpy2D( (void*) pos_dev, sizeof(float), (void*)&(p_all[0].x), sizeof(Particle), sizeof(float), 512, cudaMemcpyHostToDevice);

Multiple parameters needed to be modified, but the invalid pitch error came about because the requested width of the transfer in bytes (you had sizeof(Particle)) was wider than the destination pitch (sizeof(float), which is correct).
EDIT: In addition, although you didn't ask about it, the final cudaMemcpy operation in the code you have now posted is also incorrect. The following changes should help:

cudaMemcpy( (void*) &(pos[0]), (void*)pos_dev, N*sizeof(float), cudaMemcpyDeviceToHost);
You are allocating your data as an "array of structures", like

class Particle{
public:
    float x;
    float v;
};

Particle foo[N];
which will lead to coalescing issues due to the data interleaving; this is why you are trying to use cudaMemcpy2D. A more convenient solution in terms of bandwidth exploitation is allocating the data as a "structure of arrays":

class Particle{
public:
    float x[N];
    float v[N];
};

Particle foo;
In this way, you will be able to avoid cudaMemcpy2D and copy the data from host to device with a simple cudaMemcpy.
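A sketch of what that looks like end to end (N and the host-side fill are illustrative):

// Structure of arrays: each field is contiguous, so a plain cudaMemcpy
// suffices and device reads of x[] coalesce.
const int N = 512;

class Particle{
public:
    float x[N];
    float v[N];
};

Particle p_host;            // fill p_host.x / p_host.v on the host
float * pos_dev;
cudaMalloc((void**)&pos_dev, N * sizeof(float));
cudaMemcpy(pos_dev, p_host.x, N * sizeof(float), cudaMemcpyHostToDevice);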