Calculate matrix determinants with cublas device API - c++

I am trying to evaluate a scalar function f(x), where x is a k-dimensional vector (i.e. f:R^k->R). During the evaluation, I have to perform many matrix operations: inversion, multiplication and finding matrix determinants and traces for matrices of moderate sizes (most of them are less than 30x30). Now I want to evaluate the function at many different xs at the same time by using different threads on the GPU. That is why I need the device api.
I have written the following codes to test calculating matrix determinants by the cublas device API, cublasSgetrfBatched, where I first find the LU decomposition of the matrix and calculate the product of all the diagonal elements in the U matrix. I have done this on both the GPU thread and CPU using the result returned by cublas. But the result from the GPU does not make any sense while the result on the CPU is correct. I have used cuda-memcheck, but found no errors. Could someone help shed some light on this issue? Many thanks.
cat test2.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
__host__ __device__ unsigned int IDX(unsigned int i,unsigned int j,unsigned int ld){return j*ld+i;}
#define PERR(call) \
if (call) {\
fprintf(stderr, "%s:%d Error [%s] on "#call"\n", __FILE__, __LINE__,\
cudaGetErrorString(cudaGetLastError()));\
exit(1);\
}
#define ERRCHECK \
if (cudaPeekAtLastError()) { \
fprintf(stderr, "%s:%d Error [%s]\n", __FILE__, __LINE__,\
cudaGetErrorString(cudaGetLastError()));\
exit(1);\
}
__device__ float
det_kernel(float *a_copy,unsigned int *n,cublasHandle_t *hdl){
int *info = (int *)malloc(sizeof(int));info[0]=0;
int batch=1;int *p = (int *)malloc(*n*sizeof(int));
float **a = (float **)malloc(sizeof(float *));
*a = a_copy;
cublasStatus_t status=cublasSgetrfBatched(*hdl, *n, a, *n, p, info, batch);
unsigned int i1;
float res=1;
for(i1=0;i1<(*n);++i1)res*=a_copy[IDX(i1,i1,*n)];
return res;
}
__global__ void runtest(float *a_i,unsigned int n){
cublasHandle_t hdl;cublasCreate_v2(&hdl);
printf("det on GPU:%f\n",det_kernel(a_i,&n,&hdl));
cublasDestroy_v2(hdl);
}
int
main(int argc, char **argv)
{
float a[] = {
1, 2, 3,
0, 4, 5,
1, 0, 0};
cudaSetDevice(1);//GTX780Ti on my machine,0 for GTX1080
unsigned int n=3,nn=n*n;
printf("a is \n");
for (int i = 0; i < n; ++i){
for (int j = 0; j < n; j++) printf("%f, ",a[IDX(i,j,n)]);
printf("\n");}
float *a_d;
PERR(cudaMalloc((void **)&a_d, nn*sizeof(float)));
PERR(cudaMemcpy(a_d, a, nn*sizeof(float), cudaMemcpyHostToDevice));
runtest<<<1, 1>>>(a_d,n);
cudaDeviceSynchronize();
ERRCHECK;
PERR(cudaMemcpy(a, a_d, nn*sizeof(float), cudaMemcpyDeviceToHost));
float res=1;
for (int i = 0; i < n; ++i)res*=a[IDX(i,i,n)];
printf("det on CPU:%f\n",res);
}
nvcc -arch=sm_35 -rdc=true -o test test2.cu -lcublas_device -lcudadevrt
./test
a is
1.000000, 0.000000, 1.000000,
2.000000, 4.000000, 0.000000,
3.000000, 5.000000, 0.000000,
det on GPU:0.000000
det on CPU:-2.000000

cublas device calls are asynchronous.
That means that they return control to the calling thread before the cublas call is finished.
If you want the calling thread to be able to process the results directly (as you are doing here to compute res), you must force a synchronization to wait for the results, before beginning computation.
You don't see this in the host side computation, because there is implicit synchronization of any device activity (including cublas device dynamic parallelism), before the parent kernel terminates.
So if you add add a synchronization after the device cublas call, like this:
cublasStatus_t status=cublasSgetrfBatched(*hdl, *n, a, *n, p, info, batch);
cudaDeviceSynchronize(); // add this line
I think you'll see a match between the device computation and the host computation, as you expect.

Related

nvcc cuda from command prompt not using gpu

Trying to run a CUDA program from command prompt using nvcc, but it seems like GPU code is not running as expected. The exact same code runs successfully on Visual Studio and outputs the expected output.
nvcc -arch=sm_60 -std=c++11 -o test.cu test.exe
test.exe
Environment:
Windows 10,
NVIDIA Quadro k4200,
CUDA 10.2
Source Code
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
/* this is the vector addition kernel.
:inputs: n -> Size of vector, integer
a -> constant multiple, float
x -> input 'vector', constant pointer to float
y -> input and output 'vector', pointer to float */
__global__ void saxpy(int n, float a, const float x[], float y[])
{
int id = threadIdx.x + blockDim.x*blockIdx.x; /* Performing that for loop */
// check to see if id is greater than size of array
if(id < n){
y[id] += a*x[id];
}
}
int main()
{
int N = 256;
//create pointers and device
float *d_x, *d_y;
const float a = 2.0f;
//allocate and initializing memory on host
std::vector<float> x(N, 1.f);
std::vector<float> y(N, 1.f);
//allocate our memory on GPU
cudaMalloc(&d_x, N*sizeof(float));
cudaMalloc(&d_y, N*sizeof(float));
//Memory Transfer!
cudaMemcpy(d_x, x.data(), N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y.data(), N*sizeof(float), cudaMemcpyHostToDevice);
//Launch the Kernel! In this configuration there is 1 block with 256 threads
//Use gridDim = int((N-1)/256) in general
saxpy<<<1, 256>>>(N, a, d_x, d_y);
//Transfering Memory back!
cudaMemcpy(y.data(), d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
std::cout << y[0] << std::endl;
cudaFree(d_x);
cudaFree(d_y);
return 0;
}
Output
1
Expected Output
3
Things I tried
When I first tried to compile with nvcc, it had the same error as discussed here.
Cuda compilation error: class template has already been defined
So I tried the suggested solution
"now: D:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.22.27905\bin\Hostx64\x64"
and now it compiles and runs but the output is not as expected.
"Also, -arch=sm_60 is an incorrect arch specification for a Quadro K4200. It should be -arch=sm_30" by Robert Crovella

Multi-GPU batched 1D FFTs: only a single GPU seems to work

I have three Tesla V100s on RHEL 8 with CUDA toolkit version 10.2.89.
I'm attempting to compute a batch of 1D FFTs of the columns of a row-major matrix. In the example below, the matrix is 16x8, so with three GPUs I'd expect GPU 0 to perform the FFTs of the first 3 columns, GPU 1 to perform FFTs of the next 3, and GPU 2 to perform FFTs of the final 2.
The plan created in the example works as expected on a single GPU, but when running on three only the first three columns are computed (correctly), the remainder are untouched.
When I inspect the descriptor that is filled by cufftXtMalloc, I see that it has allocated space for 123 elements on GPUs 0 and 1, and 122 on GPU 2. This seems weird: I would expect 48=16*3 on GPUs 0 and 1 and 32=16*2 on GPU 2. Indeed this is the size of the workspaces filled by cufftMakePlanMany. When I inspect the data that was copied, elements 0-122 are in the buffer on GPU 0, and elements 123-127 are at the beginning of the buffer on GPU 1. The remainder of that buffer and the buffer on GPU 2 are junk.
In addition, when I increase the number of rows to 1024, I get a SIGABRT on the cufftXtFree call with the message 'free(): corrupted unsorted chunks'.
#include "cufft.h"
#include "cufftXt.h"
#include <vector>
#include <cuComplex.h>
#include <cassert>
#define CUDA_CHECK(x) assert(x == cudaSuccess)
#define CUFFT_CHECK(x) assert(x == CUFFT_SUCCESS)
int main() {
static const int numGPUs = 3;
int gpus[numGPUs] = {0, 1, 2};
int nr = 16;
int nc = 8;
// Fill with junk data
std::vector<cuFloatComplex> h_x(nr * nc);
for (int i = 0; i < nr * nc; ++i) {
h_x[i].x = static_cast<float>(i);
}
cufftHandle plan;
CUFFT_CHECK(cufftCreate(&plan));
CUFFT_CHECK(cufftXtSetGPUs(plan, numGPUs, gpus));
std::vector<size_t> workSizes(numGPUs);
int n[] = {nr};
CUFFT_CHECK(cufftMakePlanMany(plan,
1, // rank
n, // n
n, // inembed
nc, // istride
1, // idist
n, // onembed
nc, // ostride
1, // odist
CUFFT_C2C,
nc,
workSizes.data()));
cudaLibXtDesc *d_x;
CUFFT_CHECK(cufftXtMalloc(plan, &d_x, CUFFT_XT_FORMAT_INPLACE));
CUFFT_CHECK(cufftXtMemcpy(plan, d_x, (void *)h_x.data(), CUFFT_COPY_HOST_TO_DEVICE));
CUFFT_CHECK(cufftXtExecDescriptorC2C(plan, d_x, d_x, CUFFT_FORWARD));
std::vector<cuFloatComplex> h_out(nr * nc);
CUFFT_CHECK(cufftXtMemcpy(plan, (void *)h_out.data(), d_x, CUFFT_COPY_DEVICE_TO_HOST));
CUFFT_CHECK(cufftXtFree(d_x));
CUFFT_CHECK(cufftDestroy(plan));
CUDA_CHECK(cudaDeviceReset());
return 0;
}
Thanks to #RobertCrovella for the answer:
As of CUDA 10.2.89 according to the documentation strided input and output are not supported for multi-GPU transforms.

CUDA kernel returns nothing

I'm using CUDA Toolkit 8 with Visual Studio Community 2015. When I try simple vector addition from NVidia's PDF manual (minus error checking which I don't have the *.h's for) it always comes back as undefined values, which means the output array was never filled. When I pre-fill it with 0's, that's all I get at the end.
Others have had this problem and some people are saying it's caused by compiling for the wrong compute capability. However, I am using an NVidia GTX 750 Ti, which is supposed to be Compute Capability 5. I have tried compiling for Compute Capability 2.0 (the minimum for my SDK) and 5.0.
I also cannot make any of the precompiled examples work, such as vectoradd.exe which says, "Failed to allocate device vector A (error code initialization error)!" And oceanfft.exe says, "Error unable to find GLSL vertex and fragment shaders!" which doesn't make sense because GLSL and fragment shading are very basic features.
My driver version is 361.43 and other apps such as Blender Cycles in CUDA mode and Stellarium work perfectly.
Here is the code that should work:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <algorithm>
#define N 10
__global__ void add(int *a, int *b, int *c) {
int tid = blockIdx.x; // handle the data at this index
if (tid < N)
c[tid] = a[tid] + b[tid];
}
int main(void) {
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
// allocate the memory on the GPU
cudaMalloc((void**)&dev_a, N * sizeof(int));
cudaMalloc((void**)&dev_b, N * sizeof(int));
cudaMalloc((void**)&dev_c, N * sizeof(int));
// fill the arrays 'a' and 'b' on the CPU
for (int i = 0; i<N; i++) {
a[i] = -i;
b[i] = i * i;
}
// copy the arrays 'a' and 'b' to the GPU
cudaMemcpy(dev_a, a, N * sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * sizeof(int),cudaMemcpyHostToDevice);
add << <N, 1 >> >(dev_a, dev_b, dev_c);
// copy the array 'c' back from the GPU to the CPU
cudaMemcpy(c, dev_c, N * sizeof(int),cudaMemcpyDeviceToHost);
// display the results
for (int i = 0; i<N; i++) {
printf("%d + %d = %d\n", a[i], b[i], c[i]);
}
// free the memory allocated on the GPU
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
I'm trying to develop CUDA apps so any help would be greatly appreciated.
This was apparently caused by using an incompatible driver version with the CUDA 8 toolkit. Installing the driver distributed with the version 8 toolkit solved thr problem.
[Answer assembled from comments and added as a community wiki entry to get the question off the unanswered queue for the CUDA tag]

How can I skip the fourth element in a float4 when using cublas sgemv?

Part of the code I am working on requires to perform a Matrix vector multiplication as fast as possible, i.e using an optimized third party library like cublas (although the same principle applies to any cpu blas).
The problem is that there is a kind of stride between elements in the vector like so:
The matrix is stored as a 3Nx3N 1D array of floats.
The vector is stored as a N 1D array of float4s, but only the first three elements of each float4 are to be used, the fourth should be ignored.
N is in the order of millions.
If the vector were stored as float3 instead of float4 I could just cast the pointer to float, like in this working example:
//Compile with nvcc test.cu -O3 -lcublas -o test
/*
Multiply a 3Nx3N float matrix, M, by a vector, X, of N float3 elements
The result, Y, is a 3N float vector
-----------------------
What if X is a vector of N float4?
How can I tell cublas to skip the forth element?
*/
#include<iostream>
#include<thrust/device_vector.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>
using namespace std;
int main(){
int N = 3;
thrust::device_vector<float3> X(N);
thrust::device_vector<float> Y(3*N);
for(int i=0; i<N; i++)
X[i] = make_float3(1,1,1); //make_float4(1,1,1,0); //in the case of float4 i.e., The result should be the same
thrust::device_vector<float> M(3*N*3*N, 1);
cublasHandle_t handle;
cublasCreate(&handle);
float beta = 0.0f;
float alpha = 1.0f;
cublasSgemv(handle, CUBLAS_OP_T,
3*N, 3*N,
&alpha,
thrust::raw_pointer_cast(&M[0]), 3*N,
(float*) thrust::raw_pointer_cast(&X[0]), 1,
&beta,
thrust::raw_pointer_cast(&Y[0]), 1);
cout<<"Performed Y = M·X\n\tX = ";
for(int i=0; i<N; i++){
float3 Xi = X[i];
cout<<Xi.x<<" "<<Xi.y<<" "<<Xi.z<<" ";
}
cout<<"\n\tY = ";
for(int i=0; i<3*N; i++){
cout<<Y[i]<<" ";
}
cout<<endl;
return 0;
}
But, how can I perform this operation if the X vector is stored as float4 s?
Given that float4* can be interpreted as a float* with 4 times more elements, the question could be more general (although I am only interested in the float4 case);
If there is a stride between each 3 "useful" elements. I want to say to cublas that the array is not coalescent in memory. But something like: There is 3 elements at the start, the next three are "stride" elements after that, etc.
Similar to what you can do in OpenGL whith vertex array objects.
EDIT:
The answers suggested that the most viable method is to just copy the strided array into a temporal, transformed, float3 array that cublas understands.
The two options at the moment to do so are:
1. Use cudaMemcpy2D
2. Use a thrust transformation
3. Use a custom copy kernel
I wrote this code to test the three cases:
//Compile with Compile with: nvcc test.cu -O3 -lcublas -o test
#include<iostream>
#include<thrust/device_vector.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>
using namespace std;
struct Timer{
cudaEvent_t start, stop;
float time;
void tic(){
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
}
float toc(){
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return time;
}
};
struct copy_functor{
copy_functor(){}
__device__ float3 operator() (const float4& X4){
return make_float3(X4.x, X4.y, X4.z);
}
};
__global__ void copy_kernel(const float4* __restrict__ X4, float3* __restrict__ X3, int N){
int id = blockIdx.x*blockDim.x + threadIdx.x;
if(id < N){
float4 x4 = X4[id];
X3[id] = make_float3(x4.x, x4.y, x4.z);
}
}
int main(){
int N = 1000000;
int Ntest = 1000;
Timer t;
thrust::device_vector<float3> X3(N, make_float3(0,0,0));
thrust::device_vector<float4> X4(N, make_float4(1,1,1,10));
/*************************CUDAMEMCPY2D*******************/
t.tic();
for(int i= 0; i<Ntest; i++){
cudaMemcpy2DAsync(thrust::raw_pointer_cast(&X3[0]),
3*sizeof(float),
thrust::raw_pointer_cast(&X4[0]),
4*sizeof(float),
3*sizeof(float),
N,
cudaMemcpyDeviceToDevice);
cudaDeviceSynchronize();
}
printf ("Time for cudaMemcpy2DAsync: %f ms\n", t.toc()/(float)Ntest);
/************************THRUST***********************/
t.tic();
for(int i= 0; i<Ntest; i++){
transform(X4.begin(), X4.end(), X3.begin(), copy_functor());
cudaDeviceSynchronize();
}
printf ("Time for thrust transformation: %f ms\n", t.toc()/(float)Ntest);
/*********************COPY KERNEL*****************************/
t.tic();
for(int i= 0; i<Ntest; i++){
copy_kernel<<< N/128 + 1, 128 >>>(thrust::raw_pointer_cast(&X4[0]),
thrust::raw_pointer_cast(&X3[0]), N);
cudaDeviceSynchronize();
}
printf ("Time for copy kernel: %f ms\n", t.toc()/(float)Ntest);
return 0;
}
Notice that I am performing the mean of 1000 copies.
The output of this code in a GTX 980 is the following:
Time for cudaMemcpy2DAsync: 1.465522 ms
Time for thrust transformation: 0.178745 ms
Time for copy kernel: 0.168507 ms
cudaMemcpy2D is an order of magnitude more slow than the rest.
thrust and copy kernel are very similar and the fastest way
This behavior appears to remain with any number of elements.
EDIT2:
Other answers suggest that GEMM could be used to communicate the stride. Without the need for a temporal array.
Interpreting the Matrix vector mul. as a Matrix Matrix mul. would be done like so:
cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T,
3*N, 1 /*m*/, 3*N,
&alpha,
thrust::raw_pointer_cast(&M[0]), 3*N,
(float*) thrust::raw_pointer_cast(&X3[0]), 1 /*ldb*/,
&beta,
thrust::raw_pointer_cast(&Y[0]), 3*N);
However, at this point, I do not know how to pass X4 instead of X3. The solution appears to be in the m and ldb parameters.
You could treat your 1-D float4 vector as a Nx3 2-D float matrix with a row stride of 4, and use cudaMemcpy2DAsync to change the stride from 4 to 3 with
cudaMemcpy2DAsync(dst,
3*sizeof(float),
src,
4*sizeof(float),
3*sizeof(float),
N,
cudaMemcpyDeviceToDevice);
Then the dst can be treated as a 3N 1-D float vector and passed to gemv() directly.
Given the scale of your N, the time of copying is not noticeable compared to gemv().
EDIT
Benchmark result from #Apo shows that it is better to use a copy kernel instead of cudaMemcpy2DAsync. I was over-expected on cudaMemcpy2DAsync and thought it would be well optimized and have best performance for all cases.

How to get the real and imaginary parts of a complex matrix separately in CUDA?

I'm trying to get the fft of a 2D array. The input is a NxM real matrix, therefore the output matrix is also a NxM matrix (2xNxM output matrix which is complex is saved in a NxM matrix using the property Hermitian symmetry).
So i want to know whether there is method to extract in cuda to extract real and complex matrices separately ? In opencv split function does the duty. So I'm looking for a similar function in cuda, but I couldn't find it yet.
Given below is my complete code
#define NRANK 2
#define BATCH 10
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iostream>
#include <vector>
using namespace std;
int main()
{
const size_t NX = 4;
const size_t NY = 5;
// Input array - host side
float b[NX][NY] ={
{0.7943 , 0.6020 , 0.7482 , 0.9133 , 0.9961},
{0.3112 , 0.2630 , 0.4505 , 0.1524 , 0.0782},
{0.5285 , 0.6541 , 0.0838 , 0.8258 , 0.4427},
{0.1656 , 0.6892 , 0.2290 , 0.5383 , 0.1067}
};
// Output array - host side
float c[NX][NY] = { 0 };
cufftHandle plan;
cufftComplex *data; // Holds both the input and the output - device side
int n[NRANK] = {NX, NY};
// Allocated memory and copy from host to device
cudaMalloc((void**)&data, sizeof(cufftComplex)*NX*(NY/2+1));
for(int i=0; i<NX; ++i){
// Uses this because my actual array is a dynamically allocated.
// but here I've replaced it with a static 2D array to make it simple.
cudaMemcpy(reinterpret_cast<float*>(data) + i*NY, b[i], sizeof(float)*NY, cudaMemcpyHostToDevice);
}
// Performe the fft
cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_R2C,BATCH);
cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
cufftExecR2C(plan, (cufftReal*)data, data);
cudaThreadSynchronize();
cudaMemcpy(c, data, sizeof(float)*NX*NY, cudaMemcpyDeviceToHost);
// Here c is a NxM matrix. I want to split it to 2 seperate NxM matrices with each
// having the complex and real component of the output
// Here c is in
cufftDestroy(plan);
cudaFree(data);
return 0;
}
EDIT
As suggested by JackOLanter, I modified the code as below. But still the problem is not solved.
float real_vec[NX][NY] = {0}; // host vector, real part
float imag_vec[NX][NY] = {0}; // host vector, imaginary part
cudaError cudaStat1 = cudaMemcpy2D (real_vec, sizeof(real_vec[0]), data, sizeof(data[0]),NY*sizeof(float2), NX, cudaMemcpyDeviceToHost);
cudaError cudaStat2 = cudaMemcpy2D (imag_vec, sizeof(imag_vec[0]),data + 1, sizeof(data[0]),NY*sizeof(float2), NX, cudaMemcpyDeviceToHost);
The error i get is 'invalid pitch argument error'. But i can't understand why. For the destination I use a pitch size of 'float' while for the source i use size of 'float2'
Your question and your code do not make much sense to me.
You are performing a batched FFT, but it seems you are not foreseeing enough memory space neither for the input, nor for the output data;
The output of cufftExecR2C is a NX*(NY/2+1) float2 matrix, which can be interpreted as a NX*(NY+2) float matrix. Accordingly, you are not allocating enough space for c (which is only NX*NY float) for the last cudaMemcpy. You would need still one complex memory location for the continuous component of the output;
Your question does not seem to be related to the cufftExecR2C command, but is much more general: how can I split a complex NX*NY matrix into 2 NX*NY real matrices containing the real and imaginary parts, respectively.
If I correctly interpret your question, then the solution proposed by #njuffa at
Copying data to “cufftComplex” data struct?
could be a good clue to you.
EDIT
In the following, a small example on how "assembling" and "disassembling" the real and imaginary parts of complex vectors when copying them from/to host to/from device. Please, add your own CUDA error checking.
#include <stdio.h>
#define N 16
int main() {
// Declaring, allocating and initializing a complex host vector
float2* b = (float2*)malloc(N*sizeof(float2));
printf("ORIGINAL DATA\n");
for (int i=0; i<N; i++) {
b[i].x = (float)i;
b[i].y = 2.f*(float)i;
printf("%f %f\n",b[i].x,b[i].y);
}
printf("\n\n");
// Declaring and allocating a complex device vector
float2 *data; cudaMalloc((void**)&data, sizeof(float2)*N);
// Copying the complex host vector to device
cudaMemcpy(data, b, N*sizeof(float2), cudaMemcpyHostToDevice);
// Declaring and allocating space on the host for the real and imaginary parts of the complex vector
float* cr = (float*)malloc(N*sizeof(float));
float* ci = (float*)malloc(N*sizeof(float));
/*******************************************************************/
/* DISASSEMBLING THE COMPLEX DATA WHEN COPYING FROM DEVICE TO HOST */
/*******************************************************************/
float* tmp_d = (float*)data;
cudaMemcpy2D(cr, sizeof(float), tmp_d, 2*sizeof(float), sizeof(float), N, cudaMemcpyDeviceToHost);
cudaMemcpy2D(ci, sizeof(float), tmp_d+1, 2*sizeof(float), sizeof(float), N, cudaMemcpyDeviceToHost);
printf("DISASSEMBLED REAL AND IMAGINARY PARTS\n");
for (int i=0; i<N; i++)
printf("cr[%i] = %f; ci[%i] = %f\n",i,cr[i],i,ci[i]);
printf("\n\n");
/******************************************************************************/
/* REASSEMBLING THE REAL AND IMAGINARY PARTS WHEN COPYING FROM HOST TO DEVICE */
/******************************************************************************/
cudaMemcpy2D(tmp_d, 2*sizeof(float), cr, sizeof(float), sizeof(float), N, cudaMemcpyHostToDevice);
cudaMemcpy2D(tmp_d + 1, 2*sizeof(float), ci, sizeof(float), sizeof(float), N, cudaMemcpyHostToDevice);
// Copying the complex device vector to host
cudaMemcpy(b, data, N*sizeof(float2), cudaMemcpyHostToDevice);
printf("REASSEMBLED DATA\n");
for (int i=0; i<N; i++)
printf("%f %f\n",b[i].x,b[i].y);
printf("\n\n");
getchar();
return 0;
}