How to use linear indexes on cv::cuda::PtrStepSzf data - c++

I'm working with OpenCV 3.1 cv::cuda template matching, but the cv::cuda::minMaxLoc() function is too slow for my case. My match results have a minimum size of 128x128 and a maximum size of up to 512x512. On average, minMaxLoc() takes 1.65 ms for a 128x128 result and up to 25 ms for something like 350x350, which is too long since this is done hundreds of times.
I understand that my match sizes are maybe too small for what you would usually run on a GPU. But I want to test along the lines of what Robert Crovella did at thrust::max_element slow in comparison cublasIsamax - More efficient implementation? to see if I can get better performance.
My problem is that in all those reductions the data is read using linear indexes, and cv::cuda::PtrStepSzf does not allow this (at least I did not find how). I tried to reshape my match result, but I cannot do that because the data is not contiguous. Do I need to go toward cudaMallocPitch and cudaMemcpy2D? If that is the case, how do I do that with a cv::cuda::GpuMat read as a cv::cuda::PtrStepSzf object?
__global__ void minLoc(const cv::cuda::PtrStepSzf data,
                       float* minVal,
                       int* minValLoc)
{
    int dsize = data.cols * data.rows;
    __shared__ volatile float vals[nTPB];
    __shared__ volatile int idxs[nTPB];
    __shared__ volatile int last_block;
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    last_block = 0;
    float my_val = FLOAT_MAX;
    int my_idx = -1;
    // sweep from global memory
    while (idx < dsize)
    {
        // data(idx) is an illegal call; the legal one is data(row, col).
        // How do I do it?
        if (data(idx) < my_val)
        {
            my_val = data(idx);
            my_idx = idx;
        }
        idx += blockDim.x * gridDim.x;
    }
    // ... rest of the kernel
}
void callMinLocKernel(cv::InputArray _input,
                      cv::Point& minValLoc,
                      float& minVal,
                      cv::cuda::Stream _stream)
{
    const cv::cuda::GpuMat input = _input.getGpuMat();
    dim3 cthreads(32, 32);
    dim3 cblocks(
        static_cast<int>(std::ceil(input.size().width /
            static_cast<double>(cthreads.x))),
        static_cast<int>(std::ceil(input.size().height /
            static_cast<double>(cthreads.y))));
    // code that creates and uploads d_min, d_minLoc
    float h_min = 9999;
    int h_minLoc = -1;
    float* d_min = 0;
    int* d_minLoc = 0;
    // gpuErrchk is defined elsewhere
    gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
    gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );
    gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );
    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
    minLoc<<<cblocks, cthreads, 0, stream>>>(input, d_min, d_minLoc);
    gpuErrchk( cudaGetLastError() );
    // code to read the answer; wait for the kernel on this stream first
    gpuErrchk( cudaStreamSynchronize(stream) );
    gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );
    // cv::Point is (x, y), i.e. (col, row)
    minValLoc = cv::Point(h_minLoc % input.cols, h_minLoc / input.cols);
    minVal = h_min;
    gpuErrchk( cudaFree(d_min) );
    gpuErrchk( cudaFree(d_minLoc) );
}
int main()
{
    // read background and template
    cv::Mat input = cv::imread("cat.jpg", 0);
    cv::Mat templ = cv::imread("catNose.jpg", 0);
    // convert to floats
    cv::Mat float_input, float_templ;
    input.convertTo(float_input, CV_32FC1);
    templ.convertTo(float_templ, CV_32FC1);
    // upload background and template to gpu
    cv::cuda::GpuMat d_src, d_templ, d_match;
    cv::Size size = float_input.size();
    d_src.upload(float_input);
    d_templ.upload(float_templ);
    double min_val, max_val;
    cv::Point min_loc, max_loc;
    cv::Ptr<cv::cuda::TemplateMatching> alg =
        cv::cuda::createTemplateMatching(d_src.type(), CV_TM_SQDIFF);
    alg->match(d_src, d_templ, d_match);
    cv::cuda::normalize(d_match, d_match, 0, 1, cv::NORM_MINMAX, -1);
    // Too slow:
    // cv::cuda::minMaxLoc(d_match, &min_val, &max_val, &min_loc, &max_loc);
    float kernel_min = 0.f; // float output for the custom kernel (minMaxLoc above wants doubles)
    callMinLocKernel(d_match, min_loc, kernel_min, cv::cuda::Stream::Null());
    return 0;
}

I did not find a way to actually use linear indexes with cv::cuda::PtrStepSzf; I am not sure there is one. It looks like when this format is used, the data can only be accessed with two subscripts.
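One workaround that keeps cv::cuda::PtrStepSzf is to derive the two subscripts from the linear index inside the grid-stride sweep. Just a sketch of that idea (not what I ended up using), dropped into the loop of the kernel above:
while (idx < dsize)
{
    // split the linear index into the two subscripts PtrStepSzf expects
    int row = idx / data.cols;
    int col = idx % data.cols;
    if (data(row, col) < my_val)
    {
        my_val = data(row, col);
        my_idx = idx;
    }
    idx += blockDim.x * gridDim.x;
}
Instead, I used the raw pointer obtained with ptr() on the cv::cuda::GpuMat input variable in the kernel wrapper, as follows: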
#define nTPB 1024
#define FLOAT_MAX 9999.0f
void callMinLocKernel(cv::InputArray _input,
                      cv::Point& minValLoc,
                      float& minVal,
                      cv::cuda::Stream _stream)
{
    const cv::cuda::GpuMat input = _input.getGpuMat();
    const float* linSrc = input.ptr<float>();
    size_t step = input.step;
    dim3 cthreads(nTPB);
    dim3 cblocks(
        static_cast<int>(std::ceil(input.size().width * input.size().height /
            static_cast<double>(nTPB))));
    // code that creates and uploads d_min, d_minLoc
    float h_min = FLOAT_MAX;
    int h_minLoc = -1;
    float* d_min = 0;
    int* d_minLoc = 0;
    // gpuErrchk is defined elsewhere
    gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
    gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );
    gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );
    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
    minLoc<<<cblocks, cthreads, 0, stream>>>(linSrc, step, input.size(), d_min, d_minLoc);
    gpuErrchk( cudaGetLastError() );
    // code to read the answer; wait for the kernel on this stream first
    gpuErrchk( cudaStreamSynchronize(stream) );
    gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );
    // cv::Point is (x, y), i.e. (col, row)
    minValLoc = cv::Point(h_minLoc % input.cols, h_minLoc / input.cols);
    minVal = h_min;
    gpuErrchk( cudaFree(d_min) );
    gpuErrchk( cudaFree(d_minLoc) );
}
And inside the kernel:
__global__ void minLoc(const float* data,
                       const size_t step,
                       cv::Size dataSz,
                       float* minVal,
                       int* minValLoc)
{
    __shared__ volatile float vals[nTPB];
    __shared__ volatile int idxs[nTPB];
    __shared__ volatile int last_block;
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const int dsize = dataSz.height * dataSz.width;
    last_block = 0;
    float my_val = FLOAT_MAX;
    int my_idx = -1;
    // sweep from global memory
    while (idx < dsize)
    {
        int row = idx / dataSz.width;
        int id = (row * step / sizeof(float)) + idx % dataSz.width;
        if (data[id] < my_val)
        {
            my_val = data[id];
            my_idx = idx;
        }
        idx += blockDim.x * gridDim.x;
    }
    // ... rest of the kernel
}
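The part elided above as // ... rest of the kernel can be finished in more than one way. A minimal sketch of the per-block step, following the reduction pattern from the linked answer (blockVals and blockIdxs here are hypothetical extra kernel parameters, a float* and an int* of length gridDim.x; the final pick over the per-block winners can then be done on the host, since there are only a few blocks):
    // each thread publishes its candidate, then a tree reduction in shared memory
    vals[threadIdx.x] = my_val;
    idxs[threadIdx.x] = my_idx;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (threadIdx.x < s && vals[threadIdx.x + s] < vals[threadIdx.x])
        {
            vals[threadIdx.x] = vals[threadIdx.x + s];
            idxs[threadIdx.x] = idxs[threadIdx.x + s];
        }
        __syncthreads();
    }
    // thread 0 writes this block's winner to the hypothetical per-block arrays
    if (threadIdx.x == 0)
    {
        blockVals[blockIdx.x] = vals[0];
        blockIdxs[blockIdx.x] = idxs[0];
    }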
The step is in bytes, so it needs to be divided by sizeof(float) (or whatever the element type is) when computing the element offset.
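The same arithmetic can be wrapped in a small device helper (just a sketch; pitchedAt is a made-up name for illustration):
__device__ inline float pitchedAt(const float* base, size_t step, int row, int col)
{
    // step counts bytes, so offset the row pointer in bytes first,
    // then index the column in float elements
    const char* rowPtr = reinterpret_cast<const char*>(base) + row * step;
    return reinterpret_cast<const float*>(rowPtr)[col];
}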
I hope this helps!

Related

Image subtraction with CUDA and textures

My goal is to use C++ with CUDA to subtract a dark frame from a raw image. I want to use textures for acceleration. The input images are cv::Mat with the type CV_8UC4 (I use the pointer to the data of the cv::Mat). This is the kernel I came up with, but I have no idea how to eventually subtract the textures from each other:
__global__ void DarkFrameSubtractionKernel(uchar4* outputImage, size_t pitchOutputImage,
    cudaTextureObject_t inputImage, cudaTextureObject_t darkImage, int width, int height)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    const float tx = (x + 0.5f);
    const float ty = (y + 0.5f);
    if (x >= width || y >= height) return;
    uchar4 inputImageTemp = tex2D<uchar4>(inputImage, tx, ty);
    uchar4 darkImageTemp = tex2D<uchar4>(darkImage, tx, ty);
    outputImage[y * pitchOutputImage + x] = inputImageTemp - darkImageTemp; // this line will throw an error
}
This is the function that calls the kernel (you can see that I create the textures from unsigned char):
void subtractDarkImage(unsigned char* inputImage, size_t pitchInputImage, unsigned char* outputImage,
    size_t pitchOutputImage, unsigned char* darkImage, size_t pitchDarkImage, int width, int height,
    cudaStream_t stream)
{
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.width = width;
    resDesc.res.pitch2D.height = height;
    resDesc.res.pitch2D.devPtr = inputImage;
    resDesc.res.pitch2D.pitchInBytes = pitchInputImage;
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
    cudaTextureDesc texDesc = {};
    texDesc.readMode = cudaReadModeElementType;
    texDesc.addressMode[0] = cudaAddressModeBorder;
    texDesc.addressMode[1] = cudaAddressModeBorder;
    cudaTextureObject_t imageInputTex, imageDarkTex;
    CUDA_CHECK(cudaCreateTextureObject(&imageInputTex, &resDesc, &texDesc, 0));
    resDesc.res.pitch2D.devPtr = darkImage;
    resDesc.res.pitch2D.pitchInBytes = pitchDarkImage;
    CUDA_CHECK(cudaCreateTextureObject(&imageDarkTex, &resDesc, &texDesc, 0));
    dim3 block(32, 8);
    dim3 grid = paddedGrid(block.x, block.y, width, height);
    DarkFrameSubtractionKernel<<<grid, block, 0, stream>>>(reinterpret_cast<uchar4*>(outputImage),
        pitchOutputImage / sizeof(uchar4), imageInputTex, imageDarkTex, width, height);
    CUDA_CHECK(cudaDestroyTextureObject(imageInputTex));
    CUDA_CHECK(cudaDestroyTextureObject(imageDarkTex));
}
The code does not compile because I cannot subtract a uchar4 from another one (in the kernel). Is there an easy way of doing the subtraction here?
Help is very much appreciated.
Is there an easy way of subtraction here?
There are no arithmetic operators defined for CUDA built-in vector types. If you replace
outputImage[y * pitchOutputImage + x] = inputImageTemp - darkImageTemp;
with
uchar4 val;
val.x = inputImageTemp.x - darkImageTemp.x;
val.y = inputImageTemp.y - darkImageTemp.y;
val.z = inputImageTemp.z - darkImageTemp.z;
val.w = inputImageTemp.w - darkImageTemp.w;
outputImage[y * pitchOutputImage + x] = val;
things will work. If this offends you, I suggest writing a small library of helper functions to hide the mess.
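For example, a minimal sketch of such a helper (assuming wrap-around on underflow is acceptable; clamp with max() if it is not):
__host__ __device__ inline uchar4 operator-(uchar4 a, uchar4 b)
{
    // component-wise subtraction; each result is truncated back to 8 bits
    return make_uchar4(static_cast<unsigned char>(a.x - b.x),
                       static_cast<unsigned char>(a.y - b.y),
                       static_cast<unsigned char>(a.z - b.z),
                       static_cast<unsigned char>(a.w - b.w));
}
With this overload in scope, the original subtraction line compiles as written.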

Unexpected CPU utilization with OpenCL

I've written a simple OpenCL kernel to calculate the cross-correlation of two images on the GPU. However, when I execute the kernel with enqueueNDRangeKernel, the CPU usage of one core rises to 100%, even though the host code does nothing except wait for the enqueued command to finish. Is this normal behavior of an OpenCL program? What is going on there?
OpenCL kernel (if relevant):
kernel void cross_correlation(global double *f,
                              global double *g,
                              global double *res) {
    // This work item will compute the cross-correlation value for pixel w
    const int2 w = (int2)(get_global_id(0), get_global_id(1));
    // Main loop
    int xy_index = 0;
    int xy_plus_w_index = w.x + w.y * X;
    double integral = 0;
    for ( int y = 0; y + w.y < Y; ++y ) {
        for ( int x = 0; x + w.x < X; ++x, ++xy_index, ++xy_plus_w_index ) {
            // xy_index is equal to x + y * X
            // xy_plus_w_index is equal to (x + w.x) + (y + w.y) * X
            integral += f[xy_index] * g[xy_plus_w_index];
        }
        xy_index += w.x;
        xy_plus_w_index += w.x;
    }
    res[w.x + w.y * X] = integral;
}
The images f, g, res have a size of X times Y pixels, where X and Y are set at compile time. I'm testing the above kernel with X = 2048 and Y = 2048.
Additional info: I am running the kernel on a Nvidia GPU with OpenCL version 1.2. The C++ program is written using the OpenCL C++ Wrapper API and executed on Debian using optirun from the bumblebee package.
As requested, here is a minimal working example:
#include <CL/cl.hpp>
#include <sstream>
#include <fstream>
using namespace std;

int main ( int argc, char **argv ) {
    const int X = 2048;
    const int Y = 2048;
    // Create context
    cl::Context context ( CL_DEVICE_TYPE_GPU );
    // Read kernel from file
    ifstream kernel_file ( "cross_correlation.cl" );
    stringstream buffer;
    buffer << kernel_file.rdbuf ( );
    string kernel_code = buffer.str ( );
    // Build kernel
    cl::Program::Sources sources;
    sources.push_back ( { kernel_code.c_str ( ), kernel_code.length ( ) } );
    cl::Program program ( context, sources );
    program.build ( " -DX=2048 -DY=2048" );
    // Allocate buffer memory
    cl::Buffer fbuf ( context, CL_MEM_READ_WRITE, X * Y * sizeof(double) );
    cl::Buffer gbuf ( context, CL_MEM_READ_WRITE, X * Y * sizeof(double) );
    cl::Buffer resbuf ( context, CL_MEM_WRITE_ONLY, X * Y * sizeof(double) );
    // Create command queue
    cl::CommandQueue queue ( context );
    // Create kernel
    cl::Kernel kernel ( program, "cross_correlation" );
    kernel.setArg ( 0, fbuf );
    kernel.setArg ( 1, gbuf );
    kernel.setArg ( 2, resbuf );
    // Set input arguments
    double *f = new double[X*Y];
    double *g = new double[X*Y];
    for ( int i = 0; i < X * Y; i++ )
        f[i] = g[i] = 0.001 * i;
    queue.enqueueWriteBuffer ( fbuf, CL_TRUE, 0, X * Y * sizeof(double), f );
    queue.enqueueWriteBuffer ( gbuf, CL_TRUE, 0, X * Y * sizeof(double), g );
    // Execute kernel
    queue.enqueueNDRangeKernel ( kernel, cl::NullRange, cl::NDRange ( X, Y ), cl::NullRange, NULL, NULL );
    queue.finish ( );
    return 0;
}
You don't say how you call enqueueNDRangeKernel - which is the critical bit. As I understand it, for NVidia, the call is blocking (although I don't think it's part of the standard that it should be so.)
You can get around this by having a separate thread invoke enqueueNDRangeKernel and letting that thread block on it whilst your other threads continue; the blocking thread can signal an event when it completes.
There's a discussion on it here - and it raises some caveats about having multiple calls to the enqueue occurring in parallel.
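A sketch of that idea (my illustration, assuming the same cl.hpp wrapper as the question; runKernelAsync is a made-up name):
#include <CL/cl.hpp>
#include <future>

// Run the enqueue + finish on a worker thread so the calling thread stays
// free; waiting on the returned future signals completion.
std::future<void> runKernelAsync ( cl::CommandQueue &queue, cl::Kernel &kernel,
                                   int X, int Y ) {
    return std::async ( std::launch::async, [&queue, &kernel, X, Y] ( ) {
        queue.enqueueNDRangeKernel ( kernel, cl::NullRange, cl::NDRange ( X, Y ), cl::NullRange, NULL, NULL );
        queue.finish ( ); // any busy-wait burns this worker's core, not the main thread's
    } );
}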

OPENCV : CUDA context initialization for different methods

I'm working on a simple C++ program to evaluate the performance of some OpenCV GPU methods (cv::cuda).
I am using OpenCV 3.1 on Ubuntu 15 (with CUDA 7.5) with a GeForce 770.
I previously read that we need to initialize the CUDA environment to avoid a slow first call. So, I start my program with cv::cuda::getDevice() and setDevice().
Then, I test 2 methods:
cv::cuda::resize() (factor 0.5)
and cv::cuda::meanStdDev.
Initialization takes 400ms. Then, resizing takes 2 or 3 ms, that's OK.
But... meanStdDev takes 476ms!
If I run 2 successive meanStdDev, the second one is much faster (3ms).
I really don't understand why the initialization has an effect on resize() but not on meanStdDev().
I compiled OpenCV with -DCUDA_ARCH_BIN=3.0. I tried with -DCUDA_ARCH_PTX="" but the problem is still the same.
#include <opencv2/opencv.hpp>
#include <opencv2/cudaimgproc.hpp>
#include "opencv2/cudawarping.hpp"
#include "opencv2/cudaarithm.hpp"
using namespace std;

int main(int argc, char *argv[])
{
    double t_init_cuda = (double)cv::getTickCount();
    int CudaDevice;
    if(cv::cuda::getCudaEnabledDeviceCount()==0)
    {
        cerr<<endl<<"ERROR: NO CudaEnabledDevice"<<endl;
        exit(2);
    }
    else
    {
        CudaDevice = cv::cuda::getDevice();
        cv::cuda::setDevice(CudaDevice);
    }
    t_init_cuda = ((double)cv::getTickCount() - t_init_cuda)/cv::getTickFrequency() * 1000;
    cout<<endl<<"\t*T_INIT_CUDA="<<t_init_cuda<<"ms\n";

    cv::Mat src = cv::imread(argv[1], 0);
    if (!src.data) exit(1);
    cv::cuda::GpuMat d_src(src);

    // CV::CUDA::RESIZE
    cv::cuda::GpuMat d_dst;
    double factor = 0.5;
    double t_gpu_resize = (double)cv::getTickCount();
    cv::cuda::resize(d_src, d_dst, cv::Size( (int) ((float) (d_src.cols)*factor) , (int) ((float) (d_src.rows)*factor)), 0, 0, CV_INTER_AREA);
    t_gpu_resize = ((double)cv::getTickCount() - t_gpu_resize)/cv::getTickFrequency() * 1000;
    cout<<endl<<"D_SRC="<<d_src.rows<<"x"<<d_src.cols<<" => D_DST="<<d_dst.rows<<"x"<<d_dst.cols<<endl;
    cout<<endl<<"\t*T_GPU_RESIZE="<<t_gpu_resize<<"ms\n";

    // CV::CUDA::MEANSTDDEV
    double t_meanstddev = (double)cv::getTickCount();
    cv::Scalar mean, stddev;
    std::vector<cv::cuda::GpuMat> d_src_split;
    cv::cuda::split(d_src, d_src_split);
    cv::cuda::meanStdDev (d_src_split[0], mean, stddev);
    t_meanstddev = ((double)cv::getTickCount() - t_meanstddev)/cv::getTickFrequency() * 1000.0;
    cout<<endl<<"mean="<<mean.val[0]<<" | stddev="<<stddev.val[0]<<endl;
    cout<<endl<<"\t*T_GPU_MEANSTDDEV="<<t_meanstddev<<"ms\n";
    return 0;
}
My friend, when you call the same function twice:
1- The first time, new memory is allocated on the device for the result (according to the OpenCV wiki).
2- The second time, the already-allocated memory is reused, so it is fast.
I pulled that function from the OpenCV source for you so you can understand why it behaves that way:
void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream)
{
    if (!deviceSupports(FEATURE_SET_COMPUTE_13))
        CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");

    const GpuMat src = getInputMat(_src, stream);
    CV_Assert( src.type() == CV_8UC1 );

    GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream);

    NppiSize sz;
    sz.width = src.cols;
    sz.height = src.rows;

    int bufSize;
#if (CUDA_VERSION <= 4020)
    nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
#else
    nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
#endif

    BufferPool pool(stream);
    GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1); // <--- this line creates a new GpuMat

    NppStreamHandler h(StreamAccessor::getStream(stream));
    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dst.ptr<Npp64f>(), dst.ptr<Npp64f>() + 1) );

    syncOutput(dst, _dst, stream);
}
this function
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
}
I hope this will help you.
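A practical consequence (my suggestion, not from the OpenCV source above): do a warm-up call on a throwaway image at startup, so the timed path no longer pays for the buffer allocation and library initialization:
// warm-up pass so the first timed meanStdDev is not charged for setup
cv::cuda::GpuMat warmup(512, 512, CV_8UC1);
warmup.setTo(cv::Scalar::all(0));
cv::Scalar m, s;
cv::cuda::meanStdDev(warmup, m, s); // first call allocates the pool buffer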

CUDA -- simple code but some of my warps don't run

EDIT: As I was re-reading this question after posting it, I figured it out.
The root of the problem is most likely that I didn't allocate enough memory. I will try to think about this and do it correctly, and then answer my question. Silly me. :-[ It doesn't explain the warps not showing up in stdout though...
Original question
I created a templated kernel in CUDA in which I iterate over sections of grayscale image data in global memory (shared memory optimizations are due when I get this working) to achieve morphological operations with disc-shaped structure elements. Each thread corresponds to a pixel of the image. When the data type is char, everything works as expected, all my threads do what they should. When I change it to unsigned short, it starts acting up and only computes the upper half of my image. When I put in some printfs (my device has 2.0 CC), I found out that some of the warps that should run aren't even computed.
Here's the relevant code.
From my main.cpp I call gcuda::ErodeGpuGray8(img, radius); and gcuda::ErodeGpuGray16(img, radius); which are the following functions:
// gcuda.h
…
i3d::Image3d<i3d::GRAY8> ErodeGpuGray8(i3d::Image3d<i3d::GRAY8> img, const unsigned int radius);
i3d::Image3d<i3d::GRAY16> ErodeGpuGray16(i3d::Image3d<i3d::GRAY16> img, const unsigned int radius);
…

// gcuda.cu
…
// call this from outside
Image3d<GRAY8> ErodeGpuGray8(Image3d<GRAY8> img, const unsigned int radius) {
    return ErodeGpu<GRAY8>(img, radius);
}

// call this from outside
Image3d<GRAY16> ErodeGpuGray16(Image3d<GRAY16> img, const unsigned int radius) {
    return ErodeGpu<GRAY16>(img, radius);
}
…
The library I'm using defines GRAY8 as char and GRAY16 as unsigned short.
Here's how I call the kernel (blockSize is a const int set to 128 in the relevant namespace):
// gcuda.cu
template<typename T> Image3d<T> ErodeGpu(Image3d<T> img, const unsigned int radius) {
    unsigned int width = img.GetWidth();
    unsigned int height = img.GetHeight();
    unsigned int w = nextHighestPower2(width);
    unsigned int h = nextHighestPower2(height);
    const size_t n = width * height;
    const size_t N = w * h;
    Image3d<T>* rslt = new Image3d<T>(img);
    T *vx = rslt->GetFirstVoxelAddr();

    // kernel parameters
    dim3 dimBlock( blockSize );
    dim3 dimGrid( ceil( N / (float)blockSize) );

    // source voxel array on device (orig)
    T *vx_d;
    // result voxel array on device (for result of erosion)
    T *vxr1_d;

    // allocate memory on device
    gpuErrchk( cudaMalloc( (void**)&vx_d, n ) );
    gpuErrchk( cudaMemcpy( vx_d, vx, n, cudaMemcpyHostToDevice ) );
    gpuErrchk( cudaMalloc( (void**)&vxr1_d, n ) );
    gpuErrchk( cudaMemcpy( vxr1_d, vx_d, n, cudaMemcpyDeviceToDevice ) );

    ErodeGpu<T><<<dimGrid, dimBlock>>>(vx_d, vxr1_d, n, width, radius);

    gpuErrchk( cudaMemcpy( vx, vxr1_d, n, cudaMemcpyDeviceToHost ) );

    // free device memory
    gpuErrchk( cudaFree( vx_d ) );
    gpuErrchk( cudaFree( vxr1_d ) );

    // for debug purposes
    rslt->SaveImage("../erodegpu.png");
    return *rslt; // note: returns a copy; rslt itself is never deleted
}
The dimensions of my testing image are 82x82, so n = 82*82 = 6724 and N = 128*128 = 16384.
This is my kernel:
// gcuda.cu
// CUDA Kernel -- used for image erosion with a circular structure element of radius "erosionR"
template<typename T> __global__ void ErodeGpu(const T *in, T *out, const unsigned int n, const int width, const int erosionR)
{
    ErodeOrDilateCore<T>(ERODE, in, out, n, width, erosionR);
}

// The core of erosion or dilation. Operation is determined by the first parameter
template<typename T> __device__ void ErodeOrDilateCore(operation_t operation, const T *in, T *out, const unsigned int n, const int width, const int radius) {
    // get thread number, this method is overkill for my purposes but generally should be bulletproof, right?
    int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
    int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    int tx = threadId;

    if (tx >= n) {
        printf("[%d > %d]", tx, n);
        return;
    } else {
        printf("{%d}", tx);
    }
    … (erosion implementation, stdout is the same when this is commented out so it's probably not the root of the problem)
}
To my understanding, this code should write a randomly sorted set of [X > N] and {X} strings to stdout, where X = thread ID and there should be n curly-bracketed numbers (i.e. the output of threads with the index < n) and N - n of the rest, but when I run it and count the curly-bracketed numbers using a regex, I find out that I only get 256 of them. Furthermore, they seem to occur in 32-member groups, which tells me that some warps are run and some are not.
I am really baffled by this. It doesn't help that when I don't comment out the erosion implementation part, the GRAY8 erosion works and the GRAY16 erosion doesn't, even though the stdout output is exactly the same in both cases (could be input-dependent, I only tried this with 2 images).
What am I missing? What could be the cause of this? Is there some memory-management mistake on my part or is it fine that some warps don't run and the erosion stuff is possibly just a bug in the image library that only occurs with the GRAY16 type?
So this was just a stupid malloc mistake.
Instead of
const size_t n = width * height;
const size_t N = w * h;
I used
const int n = width * height;
const int N = w * h;
and instead of the erroneous
gpuErrchk( cudaMalloc( (void**)&vx_d, n ) );
gpuErrchk( cudaMemcpy( vx_d, vx, n, cudaMemcpyHostToDevice ) );
gpuErrchk( cudaMalloc( (void**)&vxr1_d, n ) );
gpuErrchk( cudaMemcpy( vxr1_d, vx_d, n, cudaMemcpyDeviceToDevice ) );
…
gpuErrchk( cudaMemcpy( vx, vxr1_d, n, cudaMemcpyDeviceToHost ) );
I used
gpuErrchk( cudaMalloc( (void**)&vx_d, n * sizeof(T) ) );
gpuErrchk( cudaMemcpy( vx_d, vx, n * sizeof(T), cudaMemcpyHostToDevice ) );
gpuErrchk( cudaMalloc( (void**)&vxr1_d, n * sizeof(T) ) );
gpuErrchk( cudaMemcpy( vxr1_d, vx_d, n * sizeof(T), cudaMemcpyDeviceToDevice ) );
…
gpuErrchk( cudaMemcpy( vx, vxr1_d, n * sizeof(T), cudaMemcpyDeviceToHost ) );
and the erosion is working correctly now, which was the main problem I was trying to solve. I'm still not getting the stdout output I'm expecting though, so if someone could shed some light on that, please do so.
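As for the missing stdout output, one possible explanation (an assumption on my part, not something confirmed in the post) is that device-side printf writes into a fixed-size FIFO buffer, so output from some threads can be silently dropped. The buffer can be enlarged before the kernel launch:
// enlarge the device-side printf ring buffer (default is fairly small)
// before launching the kernel, then check whether more {X} lines appear
gpuErrchk( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 64 * 1024 * 1024) );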

fftw - Access violation error

I implemented an fftw (fftw.org) example to use Fast Fourier transforms...
This is the code....
I load an image that I convert from uint8_t to double (this code works fine...).
string bmpFileNameImage = "files/testDummyFFTWWithWisdom/onechannel_image.bmp";
BMPImage bmpImage(bmpFileNameImage);
vector<double>pixelColors;
vector<uint8_t> image = bmpImage.copyBits();
toDouble(image,pixelColors,256,256, 1);
int width = bmpImage.width();
int height = bmpImage.height();
I use wisdom files to improve the performance
FILE * file = fopen("wisdom.fftw", "r");
if (file) {
    fftw_import_wisdom_from_file(file);
    fclose(file);
}

/* fftw variables */
fftw_complex *out;
double *wisdomInput = (double *) fftw_malloc(sizeof(double)*width*2*(height/2 +1 ));
const fftw_plan forward = fftw_plan_dft_r2c_2d(width, height, wisdomInput, reinterpret_cast<fftw_complex *>(wisdomInput), FFTW_PATIENT);
const fftw_plan inverse = fftw_plan_dft_c2r_2d(width, height, reinterpret_cast<fftw_complex *>(wisdomInput), wisdomInput, FFTW_PATIENT);

file = fopen("wisdom.fftw", "w");
if (file) {
    fftw_export_wisdom_to_file(file);
    fclose(file);
}
Finally, I execute the fftw library... I receive an Access violation error with the first function (fftw_execute_dft_r2c) and I don't know why... I read this tutorial:
http://www.fftw.org/fftw3_doc/Multi_002dDimensional-DFTs-of-Real-Data.html#Multi_002dDimensional-DFTs-of-Real-Data.
I do a malloc with (ny/2+1) as it is explained there... I don't understand why it is not working... I am testing different sizes...
out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * width *(height / 2 + 1));
double *result =(double *)fftw_malloc(width * (height+2) * sizeof(double));
fftw_execute_dft_r2c(forward,&pixelColors[0],out);
fftw_execute_dft_c2r(inverse,out,result);
Regards.
This is the corrected code.
It had a few mistakes:
It was reading a wrong wisdom.fftw file (from some old test...). Now it always creates a new fftw_plan and a new file.
I misunderstood how the fftw library works with in-place and out-of-place parameters. I had to change the mallocs to the correct padding for in-place transforms (I added +2 in the malloc functions).
In order to restore the image, I had to divide each value by the transform size (width * height), as explained in this link.
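For reference, the padding rule being applied in the second point (my restatement of the FFTW documentation, not part of the original post): an in-place r2c transform over rows of width reals needs each row padded to 2*(width/2+1) real slots, which equals width+2 when width is even:
/* in-place r2c layout: height rows, each padded from width reals
   to 2*(width/2 + 1) reals (= width + 2 for even width) */
double *wisdomInput = (double *) fftw_malloc(sizeof(double) * height * 2 * (width/2 + 1));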
/* load image */
string bmpFileNameImage = "files/polyp.bmp";
BMPImage bmpImage(bmpFileNameImage);
int width = bmpImage.width();
int height = bmpImage.height();
vector<double> pixelColors;
vector<uint8_t> image = bmpImage.copyBits();
// get one channel from the image
Uint8ToDouble(image, pixelColors, bmpImage.width(), bmpImage.height(), 1);

// We don't reuse an old wisdom.fftw... it can be corrupt
/*
FILE * file = fopen("wisdom.fftw", "r");
if (file) {
    fftw_import_wisdom_from_file(file);
    fclose(file);
} */

double *wisdomInput = (double *) fftw_malloc(sizeof(double)*height*(width+2));
const fftw_plan forward = fftw_plan_dft_r2c_2d(width, height, wisdomInput, reinterpret_cast<fftw_complex *>(wisdomInput), FFTW_PATIENT);
const fftw_plan inverse = fftw_plan_dft_c2r_2d(width, height, reinterpret_cast<fftw_complex *>(wisdomInput), wisdomInput, FFTW_PATIENT);

double *bitsColors = (double *)fftw_malloc((width) * height * sizeof(double));
for (int y = 0; y < height; y++) {
    for (int x = 0; x < width+2; x++) {
        if (x < width) {
            int currentIndex = ((y * width) + (x));
            bitsColors[currentIndex] = (static_cast<double>(result[y * (width+2) + x])) / (height*width);
        }
    }
}

fftw_free(wisdomInput);
fftw_free(out);
fftw_free(result);
fftw_free(bitsColors);
fftw_destroy_plan(forward);
fftw_destroy_plan(inverse);
fftw_cleanup();
fftw_execute_dft_r2c(forward,&pixelColors[0],out);
What are you doing here? The array is already a pointer.
Change it to fftw_execute_dft_r2c(forward, pixelColors[0], out); it should work now.
Maybe the problem is here (http://www.fftw.org/doc/New_002darray-Execute-Functions.html):
[...] that the following conditions are met:
The input and output arrays are the same (in-place) or different (out-of-place) if the plan was originally created to be in-place or
out-of-place, respectively.
In the plan you are using in-place transform parameters (with a bad allocation, BTW, since:
double *wisdomInput = (double *) fftw_malloc(sizeof(double)*width*2*(height/2 +1 ));
should be:
double *wisdomInput = (double *) fftw_malloc(sizeof(fftw_complex)*width*(height/2 +1 ));
to be suitable for the output too).
But you're calling the fftw_execute_dft_r2c function with out-of-place parameters.
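A sketch of the consistent out-of-place variant (my illustration of the fix implied above, assuming FFTW 3.3+ for fftw_alloc_real/fftw_alloc_complex; it keeps the plan and the execute call in agreement, and the aligned buffer also satisfies the new-array execute alignment condition):
/* create the plan out-of-place so that executing it on distinct
   input/output arrays matches how the plan was created */
double *in = fftw_alloc_real((size_t)width * height);
fftw_complex *freq = fftw_alloc_complex((size_t)width * (height/2 + 1));
fftw_plan fwd = fftw_plan_dft_r2c_2d(width, height, in, freq, FFTW_PATIENT);
/* copy the pixel data from the question into the FFTW-allocated buffer */
memcpy(in, pixelColors.data(), sizeof(double) * width * height);
fftw_execute_dft_r2c(fwd, in, freq);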