I have this piece of code in OpenCL:
std::string src = "__kernel void dot_product(__global float* weights,"
"__global float* values,"
"__global float* result,"
"__const unsigned int sz){"
"float dot = 0.f;"
"unsigned int i;"
"int current_idx = get_global_id(0);"
"unsigned int offset = current_idx * sz;"
"for( i = 0; i < sz; ++i )"
"{"
"dot += weights[ offset + i ] * values[ offset + i ];"
"}"
"result[current_idx] = dot;"
"}";
It gets stuck on result[current_idx] = dot; and if I comment out that line, everything works well. I don't get why it should get stuck.
The relevant C++ code is here:
using namespace cl;
std::array< float, CONST_INPUTS_NUMBER * CONST_NEURONS_NUMBER > in_weights;
std::array< float, CONST_INPUTS_NUMBER * CONST_NEURONS_NUMBER > in_values;
// Create a command queue and use the first device
const std::size_t size = in_weights.size();
std::vector< Device > devices = m_context.getInfo< CL_CONTEXT_DEVICES >();
Buffer weights(m_context, CL_MEM_READ_ONLY, size * sizeof(float));
Buffer values(m_context, CL_MEM_READ_ONLY, size * sizeof(float));
Buffer product(m_context, CL_MEM_WRITE_ONLY, CONST_NEURONS_NUMBER * sizeof(float));
std::cout << __FILE__ << __LINE__ << std::endl;
// Set arguments to kernel
m_kernel.setArg(0, weights);
m_kernel.setArg(1, values);
m_kernel.setArg(2, product);
m_kernel.setArg(3, CONST_INPUTS_NUMBER);
CommandQueue queue(m_context, devices[0]);
try {
    std::vector< float > dotProducts(CONST_NEURONS_NUMBER);
    // Fill the input buffers
    for(std::size_t i = 0; i < CONST_NEURONS_NUMBER; ++i) {
        for(std::size_t j = 0; j < CONST_INPUTS_NUMBER; ++j) {
            const std::size_t index = i * CONST_INPUTS_NUMBER + j;
            in_weights[index] = m_internal[i][j].weight;
            in_values[index] = m_internal[i][j].value;
        }
    }
    queue.enqueueWriteBuffer(weights,
                             CL_TRUE,
                             0,
                             in_weights.size() * sizeof(float),
                             in_weights.data());
    queue.enqueueWriteBuffer(values,
                             CL_TRUE,
                             0,
                             in_values.size() * sizeof(float),
                             in_values.data());
    for(std::size_t offset = 0; offset < CONST_NEURONS_NUMBER; ++offset) {
        queue.enqueueNDRangeKernel(m_kernel,
                                   cl::NDRange(offset),
                                   cl::NDRange(CONST_INPUTS_NUMBER));
    }
    std::cout << __FILE__ << __LINE__ << std::endl;
    queue.enqueueReadBuffer(product,
                            CL_TRUE,
                            0,
                            CONST_NEURONS_NUMBER * sizeof(float),
                            dotProducts.data());
    std::cout << __FILE__ << __LINE__ << std::endl;
    for(std::size_t i = 0; i < CONST_NEURONS_NUMBER; ++i) {
        std::cout << __FILE__ << __LINE__ << std::endl;
        m_internal[i].calculateOutput(dotProducts.begin(),
                                      dotProducts.end());
    }
} catch(const cl::Error& e) {
    cl_int err;
    cl::STRING_CLASS buildlog =
        m_program.getBuildInfo< CL_PROGRAM_BUILD_LOG >(devices[0], &err);
    std::cout << "Building error! Log: " << buildlog << std::endl;
}
When you comment out the line where the calculated results are written to the output buffer, all the calculations are quite likely removed by the optimizer, leaving your kernel empty.
I think the problem is here:
for(std::size_t offset = 0; offset < CONST_NEURONS_NUMBER; ++offset) {
    queue.enqueueNDRangeKernel(m_kernel, cl::NDRange(offset), cl::NDRange(CONST_INPUTS_NUMBER));
}
Specifically, you loop over enqueueing many kernels that all work on the same output buffer, which makes each kernel fight for access to that one buffer, where the results get overwritten anyway.
You need to enqueue the kernel only once, without an offset and with CONST_NEURONS_NUMBER global work items:
queue.enqueueNDRangeKernel(m_kernel, cl::NullRange, cl::NDRange(CONST_NEURONS_NUMBER));
CONST_INPUTS_NUMBER is already passed as a kernel argument.
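Put together, the launch-and-read part of the host code then becomes something like this (a minimal sketch reusing the names from the code above):

queue.enqueueNDRangeKernel(m_kernel, cl::NullRange, cl::NDRange(CONST_NEURONS_NUMBER)); // one work item per neuron
queue.enqueueReadBuffer(product, CL_TRUE, 0,
                        CONST_NEURONS_NUMBER * sizeof(float),
                        dotProducts.data()); // blocking read waits for the kernel to finish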
I'm trying to write one function that can deinterleave 8/16/24/32 bit audio data, given that the audio data naturally arrives in an 8 bit buffer.
I have this working for 8 bit, and it works for 16/24/32, but only for the first channel (channel 0). I have tried so many + and * and other operators that I'm just guessing at this point. I cannot find the magic formula. I am using C++ but would also accept a memcpy into the vector if that's easiest.
Check out the code. If you change the demux call to another bitrate you will see the problem. There is an easy math solution here I am sure, I just cannot get it.
#include <vector>
#include <map>
#include <iostream>
#include <iomanip>
#include <string>
#include <string.h>
#include <cstdint> // for uint8_t

const int bitrate = 8;
const int channel_count = 5;
const int audio_size = bitrate * channel_count * 4;
uint8_t audio_ptr[audio_size];
const int bytes_per_channel = audio_size / channel_count;

void Demux(int bitrate){
    int byterate = bitrate/8;
    std::map<int, std::vector<uint8_t> > channel_audio;
    for(int i = 0; i < channel_count; i++){
        std::vector<uint8_t> audio;
        audio.reserve(bytes_per_channel);
        for(int x = 0; x < bytes_per_channel; x += byterate){
            for(int z = 0; z < byterate; z++){
                // What is the magic formula!
                audio.push_back(audio_ptr[(x * channel_count) + i + z]);
            }
        }
        channel_audio.insert(std::make_pair(i, audio));
    }

    int remapsize = 0;
    std::cout << "\nRemapped Audio";
    std::map<int, std::vector<uint8_t> >::iterator it;
    for(it = channel_audio.begin(); it != channel_audio.end(); ++it){
        std::cout << "\nChannel" << it->first << " ";
        std::vector<uint8_t> v = it->second;
        remapsize += v.size();
        for(size_t i = 0; i < v.size(); i++){
            std::cout << "0x" << std::hex << std::setfill('0') << std::setw(2) << +v[i] << " ";
            if(i && (i + 1) % 32 == 0){
                std::cout << std::endl;
            }
        }
    }
    std::cout << "Total remapped audio size is " << std::dec << remapsize << std::endl;
}

int main()
{
    // External data
    std::cout << "Raw Audio\n";
    for(int i = 0; i < audio_size; i++){
        audio_ptr[i] = i;
        std::cout << "0x" << std::hex << std::setfill('0') << std::setw(2) << +audio_ptr[i] << " ";
        if(i && (i + 1) % 32 == 0){
            std::cout << std::endl;
        }
    }
    std::cout << "Total raw audio size is " << std::dec << audio_size << std::endl;

    Demux(8);
    //Demux(16);
    //Demux(24);
    //Demux(32);
}
You're actually pretty close. But the code is confusing: specifically the variable names and what actual values they represent. As a result, you appear to be just guessing the math. So let's go back to square one and determine what exactly it is we need to do, and the math will very easily fall out of it.
First, just imagine we have one sample covering each of the five channels. This is called an audio frame for that sample. The frame looks like this:
[channel0][channel1][channel2][channel3][channel4]
The width of a sample in one channel is called byterate in your code, but I don't like that name. I'm going to call it bytes_per_sample instead. You can easily see the width of the entire frame is this:
int bytes_per_frame = bytes_per_sample * channel_count;
It should be equally obvious that to find the starting offset for channel c within a single frame, you multiply as follows:
int sample_offset_in_frame = bytes_per_sample * c;
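For example (taking 16-bit samples, so bytes_per_sample is 2, with the question's five channels): bytes_per_frame is 2 * 5 = 10, and within any frame the sample for channel 3 starts at byte 2 * 3 = 6.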
That's just about all you need! The last bit is your z loop which covers each byte in a single sample for one channel. I don't know what z is supposed to represent, apart from being a random single-letter identifier you chose, but hey let's just keep it.
Putting all this together, you get the absolute offset of sample s in channel c and then you copy individual bytes out of it:
int sample_offset = bytes_per_frame * s + bytes_per_sample * c;
for (int z = 0; z < bytes_per_sample; ++z) {
    audio.push_back(audio_ptr[sample_offset + z]);
}
This does actually assume you're looping over the number of samples, not the number of bytes in your channel. So let's show all the loops, for completeness' sake:
const int bytes_per_sample = bitrate / 8;
const int bytes_per_frame = bytes_per_sample * channel_count;
const int num_samples = audio_size / bytes_per_frame;

for (int c = 0; c < channel_count; ++c)
{
    int sample_offset = bytes_per_sample * c;
    for (int s = 0; s < num_samples; ++s)
    {
        for (int z = 0; z < bytes_per_sample; ++z)
        {
            audio.push_back(audio_ptr[sample_offset + z]);
        }
        // Skip to next frame
        sample_offset += bytes_per_frame;
    }
}
You'll see here that I split the math up so that it's doing fewer multiplications in the loops. This is mostly for readability, but it might also help a compiler understand what's happening when it tries to optimize. Concerns over optimization are secondary (and in your case, there are much more expensive worries going on with those vectors and the map).
The most important thing is you have readable code with reasonable variable names that makes logical sense.
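For reference, here is roughly how those loops slot into the question's Demux function (a sketch; the printing code stays exactly as in the question):

void Demux(int bitrate){
    const int bytes_per_sample = bitrate / 8;
    const int bytes_per_frame = bytes_per_sample * channel_count;
    const int num_samples = audio_size / bytes_per_frame;
    std::map<int, std::vector<uint8_t> > channel_audio;
    for(int c = 0; c < channel_count; ++c){
        std::vector<uint8_t> audio;
        audio.reserve(num_samples * bytes_per_sample);
        int sample_offset = bytes_per_sample * c; // channel c's first sample
        for(int s = 0; s < num_samples; ++s){
            for(int z = 0; z < bytes_per_sample; ++z){
                audio.push_back(audio_ptr[sample_offset + z]);
            }
            sample_offset += bytes_per_frame; // skip to the next frame
        }
        channel_audio.insert(std::make_pair(c, audio));
    }
    // ... print channel_audio exactly as in the question ...
}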
I am running google benchmark for some basic cache testing and I get the following error:
terminate called after throwing an instance of 'std::length_error'
what(): cannot create std::vector larger than max_size()
However, I am printing the max_size and the actual size (see below), and while max_size equals 2^60-1, it breaks at 2^28. What am I missing?
The benchmark code is below; it is compiled with Clang 11 using C++20.
static void bm_std_vector_in_cache_double(benchmark::State& state)
{
    auto constexpr d{3.1415};
    auto const bytes = (2 << state.range(0)) * 1024;
    auto data = std::vector<double>(bytes / sizeof(double), d);
    std::cout << data.max_size() << '\n';
    std::cout << data.size() << '\n';
    for (auto _ : state){
        auto sum = 0.0;
        for(auto j = 0; j < data.size(); ++j)
            benchmark::DoNotOptimize(sum += data[j] * data[j]);
    }
    state.SetBytesProcessed(state.iterations() * data.size());
}
BENCHMARK(bm_std_vector_in_cache_double)->DenseRange(1, 20);
The issue here was that the type of bytes was int:
auto const bytes = (2 << state.range(0)) * 1024;
Changing it to
auto const bytes = (2 << state.range(0)) * 1024L;
makes it a long, allowing for longer vectors; better still is unsigned long long:
auto const bytes = (2 << state.range(0)) * 1024ULL;
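To see why the int version breaks (a worked example, assuming the usual 32-bit int): with DenseRange(1, 20), the largest case computes (2 << 20) * 1024 = 2^21 * 2^10 = 2^31, which does not fit in a signed 32-bit int. In practice the overflow wraps to a negative value, and converting that to the vector's unsigned size type yields an enormous request, hence the length_error at exactly the step where the vector would have held 2^31 / 8 = 2^28 doubles. With the 1024L or 1024ULL suffix the whole expression is evaluated in a 64-bit type and stays exact.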
So here is my almost complete code. The first kernel, the normal global histogram, works correctly, but I get the error "an illegal memory access was encountered (77)" at the final memcpy, after calculating the shared histogram. I don't know what is wrong with the code; it seems like the shared histogram somehow changes the size of d_hist2. I also checked whether bin_count changes, but it doesn't. So is my shared_histog kernel wrong, or am I making a mistake in the memcpy?
Note: w * h * nc is the size of my input image.
__global__ void histog( int *img, int *hist, int bin_count, int n)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    if (x >= n) return;

    unsigned char value = img[x];
    int bin = value % bin_count;
    atomicAdd(&hist[bin], 1);
}

__global__ void shared_histog( int *img, int *hist, int n)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int indx = threadIdx.x;
    if (x > n) return;

    __shared__ int shHist[256];
    if (indx < 256)
        shHist[indx] = 0;
    __syncthreads();

    unsigned char value = img[x];
    __syncthreads();
    atomicAdd( (int*)&shHist[value], 1 );
    __syncthreads();
    atomicAdd( (int*)&(hist[indx]), shHist[indx] );
}
int main(int argc, char **argv)
{
    cudaDeviceSynchronize(); CUDA_CHECK;

    int *imgval = new int[(size_t)w*h*nc];
    for (int i = 0; i < w*h*nc; i++)
        imgval[i] = (imgIn[i])*256 + 1;

    int bin_count = 256;
    int *Histogram = new int[bin_count];
    int *Histogram2 = new int[bin_count];
    for (int i = 0; i < bin_count; i++)
        Histogram2[i] = 0;

    Timer timer; timer.start();
    for (int i = 0; i < bin_count; i++)
        Histogram[i] = 0;
    for (int i = 0; i < w*h*nc; i++)
        Histogram[(imgval[i])]++;
    showHistogram256("CPU_Histo", Histogram, 100 + w + 40, 100);
    timer.end(); float t = timer.get(); // elapsed time in seconds
    cout << "CPU time: " << t*1000 << " ms" << endl;

    int *d_img = NULL;
    int nbytes = w * h * nc * sizeof(int);
    cudaMalloc(&d_img, nbytes); CUDA_CHECK;
    cudaMemcpy(d_img, imgval, nbytes, cudaMemcpyHostToDevice); CUDA_CHECK;

    int *d_hist = NULL;
    cudaMalloc(&d_hist, bin_count * sizeof(int)); CUDA_CHECK;
    cudaMemset(d_hist, 0, bin_count * sizeof(int)); CUDA_CHECK;

    int *d_hist2 = NULL;
    cudaMalloc(&d_hist2, bin_count * sizeof(int)); CUDA_CHECK;
    cudaMemset(d_hist2, 0, bin_count * sizeof(int)); CUDA_CHECK;

    dim3 block = dim3(1024, 1, 1);
    dim3 grid = dim3((w*h*nc + block.x - 1) / block.x, 1, 1);

    Timer timer2; timer2.start();
    histog<<<grid, block>>>(d_img, d_hist, bin_count, nbytes); CUDA_CHECK;
    timer2.end(); float t2 = timer2.get(); // elapsed time in seconds
    cout << "GPU time: " << t2*1000 << " ms" << endl;

    cudaMemcpy(Histogram, d_hist, bin_count * sizeof(int), cudaMemcpyDeviceToHost); CUDA_CHECK;
    showHistogram256("GPU_Histo", Histogram, 100 + w + 40, 100 + h/2 + 10);

    Timer timer3; timer3.start();
    shared_histog<<<grid, block>>>(d_img, d_hist2, nbytes); CUDA_CHECK;
    timer3.end(); float t3 = timer3.get(); // elapsed time in seconds
    cout << "Shared time: " << t3*1000 << " ms" << endl;

    // *** here comes the error ***
    cudaMemcpy(Histogram2, d_hist2, 256 * sizeof(int), cudaMemcpyDeviceToHost); CUDA_CHECK;
    showHistogram256("GPU_Histo_Shared", Histogram2, 100 + w + 40, 100 + h + 10);

    return 0;
}
You're using __syncthreads() after a conditional statement:
if(x>n) return;
that may prevent all threads in the block from reaching it. That is not correct usage:
__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.
But it is probably not connected to the illegal memory access.
You are launching this kernel with 1024 threads per block:
dim3 block = dim3(1024,1,1);
which means in the kernel, your indx variable:
int indx = threadIdx.x;
will go from 0..1023 depending on the thread, which means that this line:
atomicAdd( (int*)&(hist[indx]), shHist[indx] );
                        ^^^^           ^^^^
will attempt to index both hist and shHist out-of-bounds for threads whose indx value is greater than 255, since both hist and shHist are only allocated with 256 elements.
You can probably fix this by adding a conditional statement:
if (indx < 256)
atomicAdd( (int*)&(hist[indx]), shHist[indx] );
If you compile with -lineinfo and use cuda-memcheck, you can actually have cuda-memcheck pinpoint the line of source code that is generating the out-of-bounds access.
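Putting both points together, a fixed version of the kernel might look like this (a sketch: the guard on the final atomicAdd is the essential fix, and the early return is replaced by a guard around the load so that every thread in the block reaches the __syncthreads() calls):

__global__ void shared_histog( int *img, int *hist, int n)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int indx = threadIdx.x;

    __shared__ int shHist[256];
    if (indx < 256)
        shHist[indx] = 0;
    __syncthreads();              // reached by all threads in the block

    if (x < n) {                  // guard instead of an early return
        unsigned char value = img[x];
        atomicAdd(&shHist[value], 1);
    }
    __syncthreads();

    if (indx < 256)               // only the first 256 threads flush the block's histogram
        atomicAdd(&hist[indx], shHist[indx]);
}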
I seem to be having some difficulty in the use of texture objects in CUDA. I took the code from here, simplified it, and fleshed it out a bit. When I go to build it I get the error "type name is not allowed". It occurs on line 18 in my code; does anyone have any idea why that is?
#include <cuda_runtime.h>
#include <texture_fetch_functions.h>
#include <cuda_texture_types.h>
#include <texture_indirect_functions.h>
#include <cuda.h>
#include "device_launch_parameters.h"
#include <vector>
#include <iostream>
#include <cstdlib>
#include <cstring>

#define L 16384

__global__ void read(cudaTextureObject_t t, float *b){
    float offset = blockIdx.x + 0.5f;
    b[blockIdx.x] = tex2D<float>(t, offset, 0.5f);
}

int main(){
    // device memory and host memory allocation
    cudaChannelFormatDesc channelFormat = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *dev_buff_a;
    float *dev_buff_b, *hst_buff, *print_buff;
    hst_buff = (float *)malloc(L * sizeof(float));
    print_buff = (float *)malloc(L * sizeof(float));
    cudaMallocArray(&dev_buff_a, &channelFormat, L, 1);
    cudaMalloc(&dev_buff_b, L * sizeof(float));
    for(int i = 0; i < L; i++){
        hst_buff[i] = 1.0f;
    }
    cudaMemcpyToArray(dev_buff_a, 0, 0, hst_buff, L * sizeof(float), cudaMemcpyHostToDevice);

    // creating the texture object
    // start with the resource descriptor
    cudaResourceDesc resource;
    memset(&resource, 0, sizeof(resource));
    resource.resType = cudaResourceTypeArray;
    resource.res.array.array = dev_buff_a;
    /*resource.res.linear.desc.f = cudaChannelFormatKindFloat; //channel format
    resource.res.linear.desc.x = 32; //bits per channel
    resource.res.linear.sizeInBytes = L * sizeof(float);*/

    // next is the texture descriptor
    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.readMode = cudaReadModeElementType;

    // to create the actual texture object
    cudaTextureObject_t tObj = 0;
    cudaCreateTextureObject(&tObj, &resource, &texDesc, NULL);

    // perform reading function
    dim3 block(1, 0, 0);
    dim3 grid(16384, 0, 0);
    read<<<grid, block>>>(tObj, dev_buff_b);

    // copy stuff over from dev_buff_b to print
    cudaMemcpy(print_buff, dev_buff_b, L * sizeof(float), cudaMemcpyDeviceToHost);

    // print out the arrays and compare
    std::cout << "the original array was:\n";
    for(int i = 0; i < L; i++){
        std::cout << "element " << i << "is: " << hst_buff[i] << "\n";
    }
    std::cout << "the new array is:\n";
    for(int i = 0; i < L; i++){
        std::cout << "element " << i << "is: " << print_buff[i] << "\n";
    }

    // destroy the texture object
    cudaDestroyTextureObject(tObj);
    // free device memory
    cudaFreeArray(dev_buff_a);
    cudaFree(dev_buff_b);
    return 0;
}
You need to make sure of these two things:
You're using CUDA 5.0 or newer
Compiler settings compile only for devices of compute capability 3.0 or better (the -arch=compute_30 flag for nvcc)
Texture objects are only available on these newer devices.
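For example, an nvcc invocation along these lines (the file name here is just a placeholder) targets compute capability 3.0:

nvcc -arch=compute_30 -o texture_read texture_read.cu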
I'm trying to find the fft of a dynamically allocated array. The input array is copied from host to device using cudaMemcpy2D. Then the fft is taken (cufftExecR2C) and the results are copied back from device to host.
So my initial problem was how to use the pitch information in the fft. Then I found an answer here - CUFFT: How to calculate fft of pitched pointer?
But unfortunately it doesn't work. The results I get are garbage values. Given below is my code.
#define NRANK 2
#define BATCH 10

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iomanip>
#include <iostream>
#include <vector>

using namespace std;

const size_t NX = 4;
const size_t NY = 6;

int main()
{
    // Input array (static) - host side
    float h_in_data_static[NX][NY] = {
        {0.7943 , 0.6020 , 0.7482 , 0.9133 , 0.9961 , 0.9261},
        {0.3112 , 0.2630 , 0.4505 , 0.1524 , 0.0782 , 0.1782},
        {0.5285 , 0.6541 , 0.0838 , 0.8258 , 0.4427 , 0.3842},
        {0.1656 , 0.6892 , 0.2290 , 0.5383 , 0.1067 , 0.1712}
    };
    // --------------------------------
    // Input array (dynamic) - host side
    float *h_in_data_dynamic = new float[NX*NY];
    // Set the values
    size_t h_ipitch;
    for (int r = 0; r < NX; ++r) // this can be also done on GPU
    {
        for (int c = 0; c < NY; ++c)
        { h_in_data_dynamic[NY*r + c] = h_in_data_static[r][c]; }
    }
    // --------------------------------
    // Output array - host side
    float2 *h_out_data_temp = new float2[NX*(NY/2+1)];

    // Input and output array - device side
    cufftHandle plan;
    cufftReal *d_in_data;
    cufftComplex *d_out_data;
    int n[NRANK] = {NX, NY};

    // Copy input array from host to device
    size_t ipitch;
    cudaError cudaStat1 = cudaMallocPitch((void**)&d_in_data, &ipitch, NY*sizeof(cufftReal), NX);
    cout << cudaGetErrorString(cudaStat1) << endl;
    cudaError cudaStat2 = cudaMemcpy2D(d_in_data, ipitch, h_in_data_dynamic, NY*sizeof(float), NY*sizeof(float), NX, cudaMemcpyHostToDevice);
    cout << cudaGetErrorString(cudaStat2) << endl;

    // Allocate memory for output array - device side
    size_t opitch;
    cudaError cudaStat3 = cudaMallocPitch((void**)&d_out_data, &opitch, (NY/2+1)*sizeof(cufftComplex), NX);
    cout << cudaGetErrorString(cudaStat3) << endl;

    // Perform the fft
    int rank = 2;                 // 2D fft
    int istride = 1, ostride = 1; // Stride lengths
    int idist = 1, odist = 1;     // Distance between batches
    int inembed[] = {ipitch, NX}; // Input size with pitch
    int onembed[] = {opitch, NX}; // Output size with pitch
    int batch = 1;
    cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch);
    //cufftPlan2d(&plan, NX, NY, CUFFT_R2C);
    cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
    cufftExecR2C(plan, d_in_data, d_out_data);
    cudaThreadSynchronize();

    // Copy d_out_data back from device to host
    cudaError cudaStat4 = cudaMemcpy2D(h_out_data_temp, (NY/2+1)*sizeof(float2), d_out_data, opitch, (NY/2+1)*sizeof(cufftComplex), NX, cudaMemcpyDeviceToHost);
    cout << cudaGetErrorString(cudaStat4) << endl;

    // Print the results
    for (int i = 0; i < NX; i++)
    {
        for (int j = 0; j < NY/2 + 1; j++)
            printf(" %f + %fi", h_out_data_temp[i*(NY/2+1) + j].x, h_out_data_temp[i*(NY/2+1) + j].y);
        printf("\n");
    }

    cudaFree(d_in_data);
    return 0;
}
I think the problem is in cufftPlanMany. How can I solve this issue?
You may want to study the advanced data layout section of the documentation carefully.
I think the previous question that was linked is somewhat confusing, because that question passes the width and height parameters in the reverse order from what I would expect for a cufft 2D plan. However, the answer then mimics that order, so it is at least consistent.
Secondly, you missed in the previous question that the "pitch" parameters that are being passed in inembed and onembed are not the same as the pitch parameters that you would receive from a cudaMallocPitch operation. They have to be scaled by the number of bytes per data element in the input and output data sets. I'm actually not entirely sure this is the intended use of the inembed and onembed parameters, but it seems to work.
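Concretely, the change is to express each pitch in elements rather than bytes before it goes into the embed arrays (this matches the full code below):

int inembed[] = {NX, ipitch/sizeof(cufftReal)};    // pitch in elements, not bytes
int onembed[] = {NX, opitch/sizeof(cufftComplex)};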
When I adjust your code to account for the above two changes, I seem to get valid results; at least they appear to be in a reasonable range. You've posted several questions now about 2D FFTs where you've said the results are not correct. I can't do these 2D FFTs in my head, so I suggest that in the future you indicate what data you are expecting.
Here is the code with the changes I made:
#define NRANK 2
#define BATCH 10

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iomanip>
#include <iostream>
#include <vector>

using namespace std;

const size_t NX = 4;
const size_t NY = 6;

int main()
{
    // Input array (static) - host side
    float h_in_data_static[NX][NY] = {
        {0.7943 , 0.6020 , 0.7482 , 0.9133 , 0.9961 , 0.9261},
        {0.3112 , 0.2630 , 0.4505 , 0.1524 , 0.0782 , 0.1782},
        {0.5285 , 0.6541 , 0.0838 , 0.8258 , 0.4427 , 0.3842},
        {0.1656 , 0.6892 , 0.2290 , 0.5383 , 0.1067 , 0.1712}
    };
    // --------------------------------
    // Input array (dynamic) - host side
    float *h_in_data_dynamic = new float[NX*NY];
    // Set the values
    size_t h_ipitch;
    for (int r = 0; r < NX; ++r) // this can be also done on GPU
    {
        for (int c = 0; c < NY; ++c)
        { h_in_data_dynamic[NY*r + c] = h_in_data_static[r][c]; }
    }
    // --------------------------------
    int owidth = (NY/2) + 1;
    // Output array - host side
    float2 *h_out_data_temp = new float2[NX*owidth];

    // Input and output array - device side
    cufftHandle plan;
    cufftReal *d_in_data;
    cufftComplex *d_out_data;
    int n[NRANK] = {NX, NY};

    // Copy input array from host to device
    size_t ipitch;
    cudaError cudaStat1 = cudaMallocPitch((void**)&d_in_data, &ipitch, NY*sizeof(cufftReal), NX);
    cout << cudaGetErrorString(cudaStat1) << endl;
    cudaError cudaStat2 = cudaMemcpy2D(d_in_data, ipitch, h_in_data_dynamic, NY*sizeof(float), NY*sizeof(float), NX, cudaMemcpyHostToDevice);
    cout << cudaGetErrorString(cudaStat2) << endl;

    // Allocate memory for output array - device side
    size_t opitch;
    cudaError cudaStat3 = cudaMallocPitch((void**)&d_out_data, &opitch, owidth*sizeof(cufftComplex), NX);
    cout << cudaGetErrorString(cudaStat3) << endl;

    // Perform the fft
    int rank = 2;                 // 2D fft
    int istride = 1, ostride = 1; // Stride lengths
    int idist = 1, odist = 1;     // Distance between batches
    int inembed[] = {NX, ipitch/sizeof(cufftReal)};    // Input size with pitch (in elements)
    int onembed[] = {NX, opitch/sizeof(cufftComplex)}; // Output size with pitch (in elements)
    int batch = 1;
    if ((cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch)) != CUFFT_SUCCESS) cout << "cufft error 1" << endl;
    //cufftPlan2d(&plan, NX, NY, CUFFT_R2C);
    if ((cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE)) != CUFFT_SUCCESS) cout << "cufft error 2" << endl;
    if ((cufftExecR2C(plan, d_in_data, d_out_data)) != CUFFT_SUCCESS) cout << "cufft error 3" << endl;
    cudaDeviceSynchronize();

    // Copy d_out_data back from device to host
    cudaError cudaStat4 = cudaMemcpy2D(h_out_data_temp, owidth*sizeof(float2), d_out_data, opitch, owidth*sizeof(cufftComplex), NX, cudaMemcpyDeviceToHost);
    cout << cudaGetErrorString(cudaStat4) << endl;

    // Print the results
    for (int i = 0; i < NX; i++)
    {
        for (int j = 0; j < owidth; j++)
            printf(" %f + %fi", h_out_data_temp[i*owidth + j].x, h_out_data_temp[i*owidth + j].y);
        printf("\n");
    }

    cudaFree(d_in_data);
    return 0;
}