Copying a dynamically allocated 2D array from host to device in CUDA - c++

I want to copy a dynamically allocated 2D array from host to device to get its Discrete Fourier Transform.
I'm using below code to copy the array to the device
cudaMalloc((void**)&array_d, sizeof(cufftComplex)*NX*(NY/2+1));
cudaMemcpy(array_d, array_h, sizeof(float)*NX*NY, cudaMemcpyHostToDevice);
This works fine with static arrays, i get the intended output from my fft.
But it doesn't work with dynamic arrays. After little bit searching I learnt I can not copy dynamic arrays like this from host to device. So I found this solution.
cudaMalloc((void**)&array_d, sizeof(cufftComplex)*NX*(NY/2+1));
for(int i=0; i<NX; ++i){
cudaMemcpy(array_d+ i*NY, array_h[i], sizeof(float)*NY, cudaMemcpyHostToDevice);
}
But it's also not doing the task properly since I get wrong values from my fft.
Given below is my fft code.
cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_R2C,BATCH);
cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
cufftExecR2C(plan, (cufftReal*)data, data);
cudaThreadSynchronize();
cudaMemcpy(c, data, sizeof(float)*NX*NY, cudaMemcpyDeviceToHost);
How can I overcome this problem ?
EDIT
given below is the code
#define NX 4
#define NY 5
#define NRANK 2
#define BATCH 10
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iostream>
int check();
// Demonstration program from the question: uploads a 4x5 real matrix to the
// device and runs an in-place real-to-complex cuFFT on it.
// None of the CUDA / cuFFT return codes are checked in this listing.
int main()
{
// static array
float b[NX][NY] ={
{0.7943 , 0.6020 , 0.7482 , 0.9133 , 0.9961},
{0.3112 , 0.2630 , 0.4505 , 0.1524 , 0.0782},
{0.5285 , 0.6541 , 0.0838 , 0.8258 , 0.4427},
{0.1656 , 0.6892 , 0.2290 , 0.5383 , 0.1067}
};
// dynamic array: a table of NX pointers, each to its own heap-allocated row,
// so the rows are not one contiguous NX*NY block.
// NOTE(review): a is filled below but never read or deleted in this listing.
float **a = new float*[NX];
for (int r = 0; r < NX; ++r)
{
a[r] = new float[NY];
for (int c = 0; c < NY; ++c)
{
a[r][c] = b[r][c];
}
}
// array to store the results - host side
float c[NX][NY] = { 0 };
cufftHandle plan;
cufftComplex *data;
int n[NRANK] = {NX, NY};
// Device buffer sized for the complex output: NX * (NY/2 + 1) cufftComplex values.
cudaMalloc((void**)&data, sizeof(cufftComplex)*NX*(NY/2+1));
// This listing uploads the static array b (contiguous), not the dynamic array a.
cudaMemcpy(data, b, sizeof(float)*NX*NY, cudaMemcpyHostToDevice);
/* Create a 2D FFT plan. */
// NOTE(review): the plan requests BATCH (10) transforms, while the buffer above
// is sized for a single NX x NY transform - confirm the intended batch count.
cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_R2C,BATCH);
cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
// In-place transform: the same buffer is passed as real input and complex output.
// NOTE(review): cuFFT documents a padded row layout for in-place R2C input - confirm.
cufftExecR2C(plan, (cufftReal*)data, data);
cudaThreadSynchronize();
// Copies back NX*NY floats only; the device buffer holds NX*(NY/2+1) complex values.
cudaMemcpy(c, data, sizeof(float)*NX*NY, cudaMemcpyDeviceToHost);
cufftDestroy(plan);
cudaFree(data);
return 0;
}

data is a pointer to cufftComplex, which is a series of typedefs eventually resulting in float2. That means data + n will advance data by n objects of type float2, or by 2 * n objects of type float. This makes your "dynamic array" copying incorrect; you have to halve the increment of data.
EDIT
Looking at the parameter types of cufftExecR2C(), I think this should work:
for(int i=0; i<NX; ++i){
cudaMemcpy(reinterpret_cast<float*>(data) + i*NY, a[i], sizeof(float)*NY, cudaMemcpyHostToDevice);
}
Side note: you don't actually have a dynamic 2D array (that would be new float[NX * NY]). What you have is a dynamic array of pointers to dynamic arrays of floats. I believe it would make more sense for you to use a true 2D array instead, which would allow you to keep the static-case copy code as well.
And since you've tagged this C++, you should seriously consider using std::vector instead of managing your dynamic memory manually. That is, change a like this:
std::vector<float> a(NX * NY);
And while you're at it, I'd suggest turning NX, NY etc. from macros to constants:
const size_t NX = 4;
const size_t NY = 5;
etc.

Related

Passing a Constant Integer in a CUDA Kernel [duplicate]

This question already has answers here:
allocating shared memory
(5 answers)
Closed 5 years ago.
I am having a problem with the following code. In the global kernel, loop_d, M has an integer value of 84. When I try to create a shared array, temp, and use M as the size of the array, I get the following error:
error: expression must have a constant value
I am not sure why that is. I know that if I declare M as a global variable, then it works, but the problem is that I get the value of M by calling the function d_two in a different Fortran program, so I am not sure how to get around that. I know that if I replace temp[M] with temp[84], then my program runs perfectly, but that is not very practical, since different problems might have different values of M. Thank you for your help!
The program
// Parallelized 2D Three-Point Gaussian Quadrature Numerical Integration Method
// The following program is part of two linked programs, Integral_2D_Cuda.f.
// This is a CUDA kernel that could be called in the Integral_2D_Cuda.f Fortran code to compute
// the integral of a given 2D-function
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
// The following is a definition for the atomicAddd function that is called in the loop_d kernel
// This is needed because the "regular" atomicAdd function only works for floats and integers
// Atomically performs *address += val on a double by retrying a 64-bit
// compare-and-swap until no other thread has modified the word in between.
// Returns the value that was stored at address before this thread's addition.
__device__ double atomicAddd(double* address, double val)
{
    // atomicCAS works on integer words, so the double is accessed through its bit pattern.
    unsigned long long int* const word = (unsigned long long int*)address;
    unsigned long long int observed = *word;
    for (;;) {
        const unsigned long long int expected = observed;
        const double updated = val + __longlong_as_double(expected);
        observed = atomicCAS(word, expected, __double_as_longlong(updated));
        // The swap took effect only if the word still held the value we based the sum on.
        // The comparison is done on the integer bit patterns, not on the doubles.
        if (observed == expected) {
            break;
        }
    }
    return __longlong_as_double(observed);
}
// GPU kernel that computes the function of interest. This is good for a two dimensional problem.
// Kernel from the question. Each thread multiplies one (x, y) pair of node/weight
// products into a per-block shared array; the thread with y index 0 then sums the
// M entries and atomically adds the block's partial sum to *e_ans0.
//   a_sx, b_swx : arrays indexed by idxX (guarded by idxX < N)
//   c_sy, d_swy : arrays indexed by idxY (guarded by idxY < M)
//   e_ans0      : single accumulator updated through atomicAddd
// The host wrapper d_two_ launches it with N blocks of 1 x M threads.
__global__ void loop_d(double *a_sx, double *b_swx, double *c_sy, double *d_swy, double *e_ans0, int N, int M)
{
// Declaring a shared array that threads of the same block have access to
// This is the line the compiler rejects ("expression must have a constant value"):
// M is a run-time kernel argument, and a statically sized __shared__ array needs a
// compile-time constant. The remedy given in the answer is dynamic shared memory
// (extern __shared__ double temp[]; plus the byte size as third launch parameter).
__shared__ double temp[M];
int idxX = blockIdx.x * blockDim.x + threadIdx.x; // Thread indices responsible for the swx and sx arrays
int idxY = threadIdx.y; // Thread indices responsible for the swy and sy arrays
// Computing the multiplication of elements
if (idxX < N && idxY < M)
{
temp[idxY] = a_sx[idxX] * b_swx[idxX] * c_sy[idxY] * d_swy[idxY];
}
// synchronizing all threads before summing all the multiplied elements in the temp array
// (the barrier sits outside the conditional, so every thread of the block reaches it)
__syncthreads();
// Allowing the 0th thread of y to do the summation of the multiplied elements in the temp array of one block
if (0 == idxY)
{
double sum = 0.00;
for(int k = 0; k < M; k++)
{
sum = sum + temp[k];
}
// Adding the result of this instance of calculation to the final answer, ans0
atomicAddd(e_ans0, sum);
}
}
// Host entry point with C linkage (the surrounding text says it is called from a
// Fortran program, which is why the scalar counts arrive as pointers).
//   sx, swx : N doubles each, N = *nptx
//   sy, swy : M doubles each, M = *npty
//   ans0    : accumulator; its current value is uploaded, updated by loop_d,
//             and written back on return
// CUDA return codes are not inspected here.
extern "C" void d_two_(double *sx, double *swx, int *nptx, double *sy, double *swy, int *npty, double *ans0)
{
    const int N = *nptx;
    const int M = *npty;

    // Byte counts shared by the allocations and the uploads.
    const size_t xBytes = sizeof(double) * N;
    const size_t yBytes = sizeof(double) * M;
    const size_t ansBytes = sizeof(double);

    // Launch shape: N blocks, each 1 thread wide in x and M threads tall in y.
    dim3 blockShape(1, M);
    dim3 gridShape(N);

    // Device-side mirrors of the five host buffers.
    double *sx_d, *swx_d, *sy_d, *swy_d, *ans0_d;
    cudaMalloc((void **)&sx_d, xBytes);
    cudaMalloc((void **)&swx_d, xBytes);
    cudaMalloc((void **)&sy_d, yBytes);
    cudaMalloc((void **)&swy_d, yBytes);
    cudaMalloc((void **)&ans0_d, ansBytes);

    // Upload inputs and the starting value of the accumulator.
    cudaMemcpy(sx_d, sx, xBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(swx_d, swx, xBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(sy_d, sy, yBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(swy_d, swy, yBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(ans0_d, ans0, ansBytes, cudaMemcpyHostToDevice);

    loop_d<<< gridShape, blockShape >>>(sx_d, swx_d, sy_d, swy_d, ans0_d, N, M);

    // Blocking copy: fetches the accumulated result once the kernel has finished.
    cudaMemcpy(ans0, ans0_d, ansBytes, cudaMemcpyDeviceToHost);

    cudaFree(sx_d);
    cudaFree(swx_d);
    cudaFree(sy_d);
    cudaFree(swy_d);
    cudaFree(ans0_d);
}
The compiler needs M to be a compile-time constant. At compile time it cannot determine what M is actually going to be (it doesn't know you will just pass it 84 eventually).
When you want to use shared memory of size you only know at runtime, you use dynamic shared memory.
See this example here on the site or Using Shared Memory in CUDA on the Parallel4All blog.

cudaMallocPitch and cudaMemcpy2D

I have an error when transfering C++ 2D array into CUDA 1D array.
Let me show my source code.
// Listing from the question: allocates a pitched device buffer and tries to copy a
// 1024 x 256 float matrix into it. Return codes are not checked.
int main(void)
{
// 1024 * 256 floats held in automatic storage.
float h_arr[1024][256];
float *d_arr;
// --- Some codes to populate h_arr
// --- cudaMallocPitch
size_t pitch;
// NOTE(review): the width argument (256) is a byte count; a row of 256 floats
// occupies 256 * sizeof(float) bytes - confirm the intended row width.
cudaMallocPitch((void**)&d_arr, &pitch, 256, 1024);
// --- Copy array to device
// The source pitch and the copy width (both 256 here) are byte counts as well,
// whereas each host row is 256 * sizeof(float) bytes long.
cudaMemcpy2D(d_arr, pitch, h_arr, 256, 256, 1024, cudaMemcpyHostToDevice);
}
I tried to run the code, but it pops up an error.
How to use cudaMallocPitch() and cudaMemcpy2D() properly?
Talonmies has already satisfactorily answered this question. Here, some further explanation that could be useful to the Community.
When accessing 2D arrays in CUDA, memory transactions are much faster if each row is properly aligned.
CUDA provides the cudaMallocPitch function to “pad” 2D matrix rows with extra bytes so to achieve the desired alignment. Please, refer to the “CUDA C Programming Guide”, Sections 3.2.2 and 5.3.2, for more information.
Assuming that we want to allocate a 2D padded array of floating point (single precision) elements, the syntax for cudaMallocPitch is the following:
cudaMallocPitch(&devPtr, &devPitch, Ncols * sizeof(float), Nrows);
where
devPtr is an output pointer to float (float *devPtr).
devPitch is a size_t output variable denoting the length, in bytes, of the padded row.
Nrows and Ncols are size_t input variables representing the matrix size.
Recalling that C/C++ and CUDA store 2D matrices by row, cudaMallocPitch will allocate a memory space of size, in bytes, equal to Nrows * devPitch. However, only the first Ncols * sizeof(float) bytes of each row will contain the matrix data. Accordingly, cudaMallocPitch consumes more memory than strictly necessary for the 2D matrix storage, but this cost is repaid by more efficient memory accesses.
CUDA provides also the cudaMemcpy2D function to copy data from/to host memory space to/from device memory space allocated with cudaMallocPitch. Under the above hypotheses (single precision 2D matrix), the syntax is the following:
cudaMemcpy2D(devPtr, devPitch, hostPtr, hostPitch, Ncols * sizeof(float), Nrows, cudaMemcpyHostToDevice)
where
devPtr and hostPtr are input pointers to float (float *devPtr and float *hostPtr) pointing to the (destination) device and (source) host memory spaces, respectively;
devPitch and hostPitch are size_t input variables denoting the length, in bytes, of the padded rows for the device and host memory spaces, respectively;
Nrows and Ncols are size_t input variables representing the matrix size.
Note that cudaMemcpy2D allows also for pitched memory allocation on the host side. If the host memory has no pitch, then hostPitch = Ncols * sizeof(float). Furthermore, cudaMemcpy2D is bidirectional. For the above example, we are copying data from host to device. If we want to copy data from device to host, then the above line changes to
cudaMemcpy2D(hostPtr, hostPitch, devPtr, devPitch, Ncols * sizeof(float), Nrows, cudaMemcpyDeviceToHost)
The access to elements of a 2D matrix allocated by cudaMallocPitch can be performed as in the following example:
int tidx = blockIdx.x*blockDim.x + threadIdx.x;
int tidy = blockIdx.y*blockDim.y + threadIdx.y;
if ((tidx < Ncols) && (tidy < Nrows))
{
float *row_a = (float *)((char*)devPtr + tidy * pitch);
row_a[tidx] = row_a[tidx] * tidx * tidy;
}
In such an example, tidx and tidy are used as column and row indices, respectively (remember that, in CUDA, x-threads span the columns and y-threads span the rows to favor coalescence). The pointer to the first element of a row is calculated by offsetting the initial pointer devPtr by the row length tidy * pitch in bytes (char * is a pointer to bytes and sizeof(char) is 1 byte), where the length of each row is computed by using the pitch information.
Below, I'm providing a fully worked example to show these concepts.
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<conio.h>
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16
#define Nrows 3
#define Ncols 5
/*****************/
/* CUDA MEMCHECK */
/*****************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA runtime call and optionally terminates the program.
//   code  : status returned by the wrapped CUDA call
//   file  : source file of the call site (the gpuErrchk macro passes __FILE__)
//   line  : line number of the call site (the gpuErrchk macro passes __LINE__)
//   abort : when true, wait for a key press (getch) and exit with `code`
// Does nothing when code == cudaSuccess.
// `file` is const because __FILE__ expands to a string literal.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
    {
        return;
    }
    // The message ends with a real newline; the original "%dn" had lost its backslash.
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) { getch(); exit(code); }
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Integer division rounded up: returns hostPtr / b, plus one when the division
// leaves a remainder. Used to compute how many blocks of size b cover hostPtr items.
int iDivUp(int hostPtr, int b)
{
    int quotient = hostPtr / b;
    if (hostPtr % b != 0)
    {
        ++quotient;
    }
    return quotient;
}
/******************/
/* TEST KERNEL 2D */
/******************/
// Scales every element of a pitched Nrows x Ncols float matrix by
// (column index * row index), in place.
//   devPtr : base address of the pitched allocation
//   pitch  : row stride in bytes, as reported by cudaMallocPitch
// Launched on a 2D grid: x covers columns, y covers rows.
__global__ void test_kernel_2D(float *devPtr, size_t pitch)
{
    const int col = blockIdx.x*blockDim.x + threadIdx.x;
    const int row = blockIdx.y*blockDim.y + threadIdx.y;

    // Threads that fall outside the matrix have nothing to do.
    if (col >= Ncols || row >= Nrows)
    {
        return;
    }

    // The pitch is in bytes, so step through a char* before viewing the row as floats.
    char *rowBytes = (char*)devPtr + row * pitch;
    float *rowData = (float *)rowBytes;
    rowData[col] = rowData[col] * col * row;
}
/********/
/* MAIN */
/********/
// Worked example: fills an Nrows x Ncols host matrix with ones, copies it into a
// pitched device allocation, runs test_kernel_2D on it, copies the result back
// and prints every element. Every CUDA call is wrapped in gpuErrchk.
int main()
{
    float hostPtr[Nrows][Ncols];
    float *devPtr;
    size_t pitch;

    for (int i = 0; i < Nrows; i++)
        for (int j = 0; j < Ncols; j++) {
            hostPtr[i][j] = 1.f;
            //printf("row %i column %i value %f \n", i, j, hostPtr[i][j]);
        }

    // --- 2D pitched allocation and host->device memcopy
    // The host rows are tightly packed, so the host pitch equals the row width in bytes.
    gpuErrchk(cudaMallocPitch(&devPtr, &pitch, Ncols * sizeof(float), Nrows));
    gpuErrchk(cudaMemcpy2D(devPtr, pitch, hostPtr, Ncols*sizeof(float), Ncols*sizeof(float), Nrows, cudaMemcpyHostToDevice));

    // x spans the columns and y spans the rows, for the grid and the block alike.
    // (The block used to be built as (BLOCKSIZE_y, BLOCKSIZE_x), which only worked
    // because both sizes happen to be equal.)
    dim3 gridSize(iDivUp(Ncols, BLOCKSIZE_x), iDivUp(Nrows, BLOCKSIZE_y));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y);

    test_kernel_2D<<<gridSize, blockSize>>>(devPtr, pitch);
    gpuErrchk(cudaPeekAtLastError());     // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());   // errors raised while the kernel ran

    gpuErrchk(cudaMemcpy2D(hostPtr, Ncols * sizeof(float), devPtr, pitch, Ncols * sizeof(float), Nrows, cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nrows; i++)
        for (int j = 0; j < Ncols; j++)
            printf("row %i column %i value %f \n", i, j, hostPtr[i][j]);

    // Release the pitched device allocation before exiting.
    gpuErrchk(cudaFree(devPtr));
    return 0;
}
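The cudaMallocPitch call you have written is syntactically fine (although note that its width argument is also a byte count, so a row of 256 floats needs 256*sizeof(float) there as well), but this: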
cudaMemcpy2D(d_arr, pitch, h_arr, 256, 256, 1024, cudaMemcpyHostToDevice);
is incorrect. Quoting from the documentation
Copies a matrix (height rows of width bytes each) from the memory area
pointed to by src to the memory area pointed to by dst, where kind is
one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
cudaMemcpyDeviceToHost, or cudaMemcpyDeviceToDevice, and specifies the
direction of the copy. dpitch and spitch are the widths in memory in
bytes of the 2D arrays pointed to by dst and src, including any
padding added to the end of each row. The memory areas may not
overlap. width must not exceed either dpitch or spitch. Calling
cudaMemcpy2D() with dst and src pointers that do not match the
direction of the copy results in an undefined behavior. cudaMemcpy2D()
returns an error if dpitch or spitch exceeds the maximum allowed.
So the source pitch and width to copy must be specified in bytes. Your host matrix has a pitch of sizeof(float) * 256 bytes, and because the source pitch and the width of the source you will copy are the same, this means your cudaMemcpy2D call should look like:
cudaMemcpy2D(d_arr, pitch, h_arr, 256*sizeof(float),
256*sizeof(float), 1024, cudaMemcpyHostToDevice);

Why is my CUDA code not working properly for zero filling a large matrix?

It is a simple CUDA code for initializing a big matrix (filling in zeros).
I output the first 1*3 matrix, if the code works. It should be all zeros.
If I set the matrix size to be small, then the program works properly. But when I make the size larger (> 43200 * 2400), what is inside the matrix are all garbage.
I had cudaDeviceSynchronize() append at the end of each CUDA functions already.
I am using NVIDIA Quadro K4200, Xeon E5-2630 with Ubuntu 14.04.
Thanks for anyone helping me here.
Attached below is my full code.
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <cuComplex.h>
#define BLOCK_SIZE 16 // change it to 16 to get maximum performance
// populate the matrix using first row
// Writes 0 + 0i into the first N elements of Mat, one element per thread,
// using a 1-D launch. Cols is accepted but not used by the body.
__global__ void RepmatKernel (cuComplex *Mat, const unsigned int N, const unsigned int Cols)
{
    // blockIdx, blockDim and threadIdx components are already unsigned int.
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to clear.
    if (idx >= N)
    {
        return;
    }

    cuComplex *cell = Mat + idx;
    cell->x = 0;
    cell->y = 0;
}
// main routine
// Test driver from the question: allocates a Rows x Cols complex matrix on the
// device, prints three sample entries, zero-fills the matrix with RepmatKernel
// and prints the same entries again. Apart from the cudaMalloc test below, no
// CUDA return code is inspected.
int main ()
{
const unsigned int Rows = 43200;
const unsigned int Cols = 2400;
const unsigned int Num_thrd = 256; // max threads per block
unsigned int Mat_size = Rows * Cols; // size of array
cuComplex *vec; // supposedly the input
cuComplex *mat_debug; // for debug
vec = new cuComplex [Cols];
mat_debug = new cuComplex [Rows*Cols];
cuComplex *mat_in_d; // device array
//input in host array
for(unsigned int i = 0; i < Cols; i++)
{
vec[i].x = 3*i+4;
vec[i].y = 0.2*i+1;
}
// Size of the whole matrix in bytes.
// NOTE(review): the result is stored in an unsigned int; confirm it cannot
// exceed that type's range for the matrix sizes of interest.
const unsigned int size_mat_d = Rows * Cols * sizeof(cuComplex);
//create device array cudaMalloc ( (void **)&array_name, sizeofmatrixinbytes) ;
// A failed allocation is only reported; execution continues with mat_in_d unset.
if (cudaMalloc((void **) &mat_in_d , size_mat_d) != cudaSuccess) std::cout<<"Error allocating GPU";
cudaDeviceSynchronize() ;
//copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )
// The third argument is a byte count: this copies Cols bytes, not Cols elements
// (Cols * sizeof(cuComplex) bytes would be needed for the whole of vec).
cudaMemcpy ( mat_in_d , vec , Cols , cudaMemcpyHostToDevice ) ;
cudaDeviceSynchronize() ;
// ========================================================================
// Debug read-back before the kernel: everything past the bytes copied above has
// not been written yet at this point.
cudaMemcpy(mat_debug , mat_in_d , size_mat_d , cudaMemcpyDeviceToHost) ;
cudaDeviceSynchronize() ;
std::cout<<"before repmat="<<std::endl;
std::cout<<"[";
// Prints the first element of rows 0, 1 and 2.
for(unsigned int i = 0; i < 3; i++)
{
std::cout<< mat_debug[i * Cols].x <<"+"<<mat_debug[i * Cols].y <<"i, ";
std::cout<<";"<<std::endl;
}
std::cout<<"]"<<std::endl;
// ==========================================================================
// 1-D launch: ceil(Mat_size / Num_thrd) blocks of Num_thrd threads each.
// No launch error is queried afterwards.
// NOTE(review): the maximum grid x-dimension depends on the target architecture
// the code is compiled for - confirm this block count is within it.
RepmatKernel<<<(unsigned int)ceil((float)(Mat_size)/(float)(Num_thrd)),
(Num_thrd)>>>(mat_in_d,
Mat_size,
Cols);
cudaDeviceSynchronize();
// ========================================================================
// Debug read-back after the kernel.
cudaMemcpy(mat_debug , mat_in_d , size_mat_d , cudaMemcpyDeviceToHost) ;
cudaDeviceSynchronize() ;
std::cout<<"after repmat="<<std::endl;
std::cout<<"[";
for(unsigned int i = 0; i < 3; i++)
{
std::cout<< mat_debug[i * Cols].x <<"+"<<mat_debug[i * Cols].y <<"i, ";
std::cout<<";"<<std::endl;
}
std::cout<<"]"<<std::endl;
// ==========================================================================
cudaFree(mat_in_d);
delete [] vec;
delete [] mat_debug;
return 0;
}
Your call to cudaMalloc states that there is a problem, but doesn't actually terminate the computation. You should put a
if (cudaMalloc((void **) &mat_in_d , size_mat_d) != cudaSuccess)
{
std::cout<<"Error allocating GPU\n";
return 1;
}
so that the computation actually stops when you overflow the memory, rather than attempt to work anyway with only a warning to std::cout. Even better would be to use an error handling macro.
Another problem is here:
cudaMemcpy ( mat_in_d , vec , Cols , cudaMemcpyHostToDevice );
First, mat_in_d is size Rows * Cols * sizeof(cuComplex), but you are only copying Cols bytes into it. Even if you only wanted to copy vec into the first part of the mat_in_d vector, you'd need to change this to
cudaMemcpy ( mat_in_d , vec , Cols*sizeof(cuComplex) , cudaMemcpyHostToDevice );
At this point, you'd expect the first Cols entries of your matrix to be reasonable, and the rest to be garbage. (Making the suggested change shows that this is indeed the case; why you would want to do this is a better question).
Next comes your kernel call, whose entire goal is to set the entries of Mat to zero. This should be done with cudaMemset, i.e., just use
cudaMemset(mat_in_d, 0, Mat_size*sizeof(cuComplex));
We could look more carefully at the execution configuration to see what went wrong with your kernel call, but for now this fixes your problem.
For debugging CUDA errors; I find a header from samples, helper_cuda.h, quite convenient. I almost always include this header, which is located in the common directory of samples, in my projects.
Then, wrapping all CUDA calls with checkCudaErrors(), like checkCudaErrors(cudaMalloc((void **) &mat_in_d , size_mat_d)); gives explicit error messages.
In my case, since just mat_in_d is close to 1 GB and my GPU's memory is only 512 MB, it failed for sure and threw cudaErrorMemoryAllocation. However, an NVIDIA Quadro K4200 should not fail that easily!
Did you check the actual available memory information using cudaMemGetInfo ?

How to get the real and imaginary parts of a complex matrix separately in CUDA?

I'm trying to get the fft of a 2D array. The input is a NxM real matrix, therefore the output matrix is also a NxM matrix (2xNxM output matrix which is complex is saved in a NxM matrix using the property Hermitian symmetry).
So I want to know whether there is a method in CUDA to extract the real and imaginary matrices separately. In OpenCV, the split function does this job. So I'm looking for a similar function in CUDA, but I couldn't find one yet.
Given below is my complete code
#define NRANK 2
#define BATCH 10
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iostream>
#include <vector>
using namespace std;
// Listing from the question: uploads a 4x5 real matrix row by row, runs an
// in-place real-to-complex cuFFT and copies the result back into a float matrix.
// No CUDA / cuFFT return code is checked.
int main()
{
const size_t NX = 4;
const size_t NY = 5;
// Input array - host side
float b[NX][NY] ={
{0.7943 , 0.6020 , 0.7482 , 0.9133 , 0.9961},
{0.3112 , 0.2630 , 0.4505 , 0.1524 , 0.0782},
{0.5285 , 0.6541 , 0.0838 , 0.8258 , 0.4427},
{0.1656 , 0.6892 , 0.2290 , 0.5383 , 0.1067}
};
// Output array - host side
float c[NX][NY] = { 0 };
cufftHandle plan;
cufftComplex *data; // Holds both the input and the output - device side
int n[NRANK] = {NX, NY};
// Allocate memory and copy from host to device
// The buffer is sized for NX * (NY/2 + 1) complex values.
cudaMalloc((void**)&data, sizeof(cufftComplex)*NX*(NY/2+1));
for(int i=0; i<NX; ++i){
// Row-by-row copy is used because my actual array is dynamically allocated,
// but here I've replaced it with a static 2D array to make it simple.
// The destination is addressed as float*, so row i starts i*NY floats into the buffer.
cudaMemcpy(reinterpret_cast<float*>(data) + i*NY, b[i], sizeof(float)*NY, cudaMemcpyHostToDevice);
}
// Perform the fft
// NOTE(review): BATCH (10) transforms are planned while the buffer above is sized
// for one NX x NY transform - confirm the intended batch count.
cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_R2C,BATCH);
cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
// In-place transform: the same buffer is both real input and complex output.
cufftExecR2C(plan, (cufftReal*)data, data);
cudaThreadSynchronize();
// Copies NX*NY floats back; the device buffer holds NX*(NY/2+1) complex values.
cudaMemcpy(c, data, sizeof(float)*NX*NY, cudaMemcpyDeviceToHost);
// Here c is a NxM matrix. I want to split it into 2 separate NxM matrices, each
// holding the imaginary and real component of the output
// Here c is in
cufftDestroy(plan);
cudaFree(data);
return 0;
}
EDIT
As suggested by JackOLanter, I modified the code as below. But still the problem is not solved.
float real_vec[NX][NY] = {0}; // host vector, real part
float imag_vec[NX][NY] = {0}; // host vector, imaginary part
cudaError cudaStat1 = cudaMemcpy2D (real_vec, sizeof(real_vec[0]), data, sizeof(data[0]),NY*sizeof(float2), NX, cudaMemcpyDeviceToHost);
cudaError cudaStat2 = cudaMemcpy2D (imag_vec, sizeof(imag_vec[0]),data + 1, sizeof(data[0]),NY*sizeof(float2), NX, cudaMemcpyDeviceToHost);
The error i get is 'invalid pitch argument error'. But i can't understand why. For the destination I use a pitch size of 'float' while for the source i use size of 'float2'
Your question and your code do not make much sense to me.
You are performing a batched FFT (BATCH is 10), but it seems you are not allocating enough memory for either the input or the output data;
The output of cufftExecR2C is a NX*(NY/2+1) float2 matrix, which can be interpreted as a NX*(NY+2) float matrix. Accordingly, you are not allocating enough space for c (which is only NX*NY float) for the last cudaMemcpy. You would need still one complex memory location for the continuous component of the output;
Your question does not seem to be related to the cufftExecR2C command, but is much more general: how can I split a complex NX*NY matrix into 2 NX*NY real matrices containing the real and imaginary parts, respectively.
If I correctly interpret your question, then the solution proposed by #njuffa at
Copying data to “cufftComplex” data struct?
could be a good clue to you.
EDIT
In the following, a small example on how "assembling" and "disassembling" the real and imaginary parts of complex vectors when copying them from/to host to/from device. Please, add your own CUDA error checking.
#include <stdio.h>
#define N 16
// Example: splits an interleaved complex (float2) device vector into separate real
// and imaginary host arrays with strided cudaMemcpy2D copies, then reassembles it
// on the device the same way and reads it back.
// CUDA error checking is deliberately left out, as stated in the surrounding text.
int main() {
    // Declaring, allocating and initializing a complex host vector
    float2* b = (float2*)malloc(N*sizeof(float2));
    printf("ORIGINAL DATA\n");
    for (int i=0; i<N; i++) {
        b[i].x = (float)i;
        b[i].y = 2.f*(float)i;
        printf("%f %f\n",b[i].x,b[i].y);
    }
    printf("\n\n");

    // Declaring and allocating a complex device vector
    float2 *data; cudaMalloc((void**)&data, sizeof(float2)*N);
    // Copying the complex host vector to device
    cudaMemcpy(data, b, N*sizeof(float2), cudaMemcpyHostToDevice);

    // Declaring and allocating space on the host for the real and imaginary parts of the complex vector
    float* cr = (float*)malloc(N*sizeof(float));
    float* ci = (float*)malloc(N*sizeof(float));

    /*******************************************************************/
    /* DISASSEMBLING THE COMPLEX DATA WHEN COPYING FROM DEVICE TO HOST */
    /*******************************************************************/
    // View the float2 vector as 2*N floats: real parts at even offsets, imaginary at odd.
    float* tmp_d = (float*)data;
    // Each "row" is one float wide; the source pitch of two floats skips the other component.
    cudaMemcpy2D(cr, sizeof(float), tmp_d, 2*sizeof(float), sizeof(float), N, cudaMemcpyDeviceToHost);
    cudaMemcpy2D(ci, sizeof(float), tmp_d+1, 2*sizeof(float), sizeof(float), N, cudaMemcpyDeviceToHost);
    printf("DISASSEMBLED REAL AND IMAGINARY PARTS\n");
    for (int i=0; i<N; i++)
        printf("cr[%i] = %f; ci[%i] = %f\n",i,cr[i],i,ci[i]);
    printf("\n\n");

    /******************************************************************************/
    /* REASSEMBLING THE REAL AND IMAGINARY PARTS WHEN COPYING FROM HOST TO DEVICE */
    /******************************************************************************/
    cudaMemcpy2D(tmp_d, 2*sizeof(float), cr, sizeof(float), sizeof(float), N, cudaMemcpyHostToDevice);
    cudaMemcpy2D(tmp_d + 1, 2*sizeof(float), ci, sizeof(float), sizeof(float), N, cudaMemcpyHostToDevice);

    // Copying the complex device vector to host
    // (destination b is host memory and source data is device memory, so the
    // direction must be device-to-host; it used to be cudaMemcpyHostToDevice)
    cudaMemcpy(b, data, N*sizeof(float2), cudaMemcpyDeviceToHost);
    printf("REASSEMBLED DATA\n");
    for (int i=0; i<N; i++)
        printf("%f %f\n",b[i].x,b[i].y);
    printf("\n\n");

    getchar();

    // Release device and host buffers.
    cudaFree(data);
    free(cr);
    free(ci);
    free(b);
    return 0;
}

Copying structure containing 2d pointer to device

I have a question-related to copying structure containing 2D pointer to the device from the host, my code is as follow
// Fragment from the question: five structs, each holding a 4x4 matrix built from a
// table of row pointers, are staged in host memory and then copied to the device.
struct mymatrix
{
// NOTE(review): the type `matrix` is not declared in this fragment; the
// (float**) cast used below suggests it is a float** typedef - confirm.
matrix m;
int x;
};
size_t pitch;
mymatrix m_h[5];
// Host side: each m is a table of 4 row pointers, each row a separate malloc of 4 floats.
for(int i=0; i<5;i++){
m_h[i].m = (float**) malloc(4 * sizeof(float*));
for (int idx = 0; idx < 4; ++idx)
{
m_h[i].m[idx] = (float*)malloc(4 * sizeof(float));
}
}
// Staging copy: memcpy duplicates the struct bytes only, so m_hh[i].m initially
// holds the same host pointers as m_h[i].m.
mymatrix *m_hh = (mymatrix*)malloc(5*sizeof(mymatrix));
memcpy(m_hh,m_h,5*sizeof(mymatrix));
for(int i=0 ; i<5 ;i++)
{
// Overwrites m_hh[i].m with the address of a pitched device allocation.
// pitch is overwritten on every iteration; only the last value survives the loop.
cudaMallocPitch((void**)&(m_hh[i].m),&pitch,4*sizeof(float),4);
// The source argument m_h[i].m is the table of row pointers, not contiguous float data.
cudaMemcpy2D(m_hh[i].m, pitch, m_h[i].m, 4*sizeof(float), 4*sizeof(float),4,cudaMemcpyHostToDevice);
}
// Upload the array of structs (which now carry the device addresses) to the device.
mymatrix *m_d;
cudaMalloc((void**)&m_d,5*sizeof(mymatrix));
cudaMemcpy(m_d,m_hh,5*sizeof(mymatrix),cudaMemcpyHostToDevice);
distance_calculation_begins<<<1,16>>>(m_d,pitch);
Problem
With this code I am unable to access 2D pointer elements of the structure, but I can access x from that structure in device. e.g. such as I have receive m_d with pointer mymatrix* m if I initialize
m[0].m[0][0] = 5;
and printing this value such as
cuPrintf("The value is %f",m[0].m[0][0]);
in the device, I get no output. Means I am unable to use 2D pointer, but if I try to access
m[0].x = 5;
then I am able to print this. I think my initializations are correct, but I am unable to figure out the problem. Help from anyone will be greatly appreciated.
In addition to the issues that #RobertCrovella noted on your code, also note:
You are only getting a shallow copy of your structure with the memcpy that copies m_h to m_hh.
You are assuming that pitch is the same in all calls to cudaMemcpy2D() (you overwrite the pitch and use only the latest copy at the end). I think that might be safe assumption for now but it could change in the future.
You are using cudaMemcpy2D() with cudaMemcpyHostToDevice to copy to m_hh, which is on the host, not the device.
Using many small buffers and tables of pointers is not efficient in CUDA. The small allocations and deallocations can end up taking a lot of time. Also, using tables of pointers cause extra memory transactions because the pointers must be retrieved from memory before they can be used as bases for indexing. So, if you consider a construct such as this:
a[10][20][30] = 3
The pointer at a[10] must first be retrieved from memory, causing your warp to be put on hold for a long time (up to around 600 cycles on Fermi). Then, the same thing happens for the second pointer, adding another 600 cycles. In addition, these requests are unlikely to be coalesced causing even more memory transactions.
As Robert mentioned, the solution is to flatten your memory structures. I've included an example for this, which you may be able to use as a basis for your program. As you can see, the code is overall much simpler. The part that does become a bit more complex is the index calculations. Also, this approach assumes that your matrixes are all of the same size.
I have added error checking as well. If you had added error checking in your code, you would have found at least a couple of the bugs without any extra effort.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
typedef float* mymatrix;
const int n_matrixes(5);
const int w(4);
const int h(4);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA runtime call on stderr and optionally exits.
//   code  : status returned by the wrapped CUDA call
//   file  : source file of the call site (the gpuErrchk macro passes __FILE__)
//   line  : line number of the call site (the gpuErrchk macro passes __LINE__)
//   abort : when true, terminate the process with `code` as exit status
// Does nothing when code == cudaSuccess.
// `file` is const because __FILE__ expands to a string literal.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
    {
        return;
    }
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}
// Prints one element of the flattened, pitched matrix stack.
//   m_d          : base of the pitched device allocation (rows of all matrices stacked)
//   pitch_floats : row stride expressed in floats, not bytes
__global__ void test(mymatrix m_d, size_t pitch_floats)
{
    // Print the value at [2][3][4].
    // Flat row = 2 * h + 4 (third matrix plus 4 rows), column = 3; with h == 4 this is
    // flat row 12, the same element the host code writes before the upload.
    const size_t flatRow = 2 * h + 4;
    const size_t column = 3;
    printf("%f ", m_d[column + flatRow * pitch_floats]);
}
// Flattened alternative to the pointer-table layout: all n_matrixes matrices live
// in one contiguous pinned host buffer and one pitched device buffer whose rows
// are the rows of every matrix stacked on top of each other.
int main()
{
    const int total_floats = n_matrixes * w * h;

    mymatrix m_h;
    gpuErrchk(cudaMallocHost(&m_h, n_matrixes * w * sizeof(float) * h));

    // Give the buffer defined contents before it is uploaded.
    for (int i = 0; i < total_floats; ++i)
    {
        m_h[i] = 0.0f;
    }

    // Set the value at [2][3][4].
    m_h[2 * (w * h) + 3 + 4 * w] = 5.0f;

    // Create a device copy of the matrix.
    mymatrix m_d;
    size_t pitch;
    gpuErrchk(cudaMallocPitch((void**)&m_d, &pitch, w * sizeof(float), n_matrixes * h));
    gpuErrchk(cudaMemcpy2D(m_d, pitch, m_h, w * sizeof(float), w * sizeof(float), n_matrixes * h, cudaMemcpyHostToDevice));

    // The kernel indexes in floats, so convert the byte pitch before passing it.
    test<<<1,1>>>(m_d, pitch / sizeof(float));
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // Release the device buffer and the pinned host buffer.
    gpuErrchk(cudaFree(m_d));
    gpuErrchk(cudaFreeHost(m_h));
    return 0;
}
Your matrix m class/struct member appears to be some sort of double pointer based on how you are initializing it on the host:
m_h[i].m = (float**) malloc(4 * sizeof(float*));
Copying an array of structures with embedded pointers between host and device is somewhat complicated. Copying a data structure that is pointed to by a double pointer is also complicated.
For an array of structures with embedded pointers, refer to this posting.
For copying a 2D array (double pointer, i.e. **), refer to this posting. We don't use cudaMallocPitch/cudaMemcpy2D to accomplish this. (Note that cudaMemcpy2D takes single pointer * arguments, you are passing it double pointer ** arguments e.g. m_h[i].m)
Instead of the above approaches, it's recommended that you flatten your data so that it can all be referenced with single pointer referencing, with no embedded pointers.