OpenCV: CUDA context initialization for different methods - C++

I'm working on a simple C++ program to evaluate the performance of some OpenCV GPU methods (cv::cuda).
I am using OpenCV 3.1 on Ubuntu 15 (with CUDA 7.5) with a GeForce 770.
I previously read that we need to initialize the CUDA environment to avoid a slow first call. So I initialize my program with cv::cuda::getDevice() and setDevice().
Then, I test two methods:
cv::cuda::resize() (factor 0.5)
and cv::cuda::meanStdDev().
Initialization takes 400 ms. Then resizing takes 2 or 3 ms; that's OK.
But... meanStdDev takes 476 ms!
If I run two successive meanStdDev calls, the second one is much faster (3 ms).
I really don't understand why the initialization has an effect on resize() but not on meanStdDev().
I compiled OpenCV with -DCUDA_ARCH_BIN=3.0. I tried with -DCUDA_ARCH_PTX="" but the problem is still the same.
#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/cudaimgproc.hpp>
#include "opencv2/cudawarping.hpp"
#include "opencv2/cudaarithm.hpp"

using namespace std;

int main(int argc, char *argv[])
{
    double t_init_cuda = (double)cv::getTickCount();
    int CudaDevice;
    if(cv::cuda::getCudaEnabledDeviceCount()==0)
    {
        cerr<<endl<<"ERROR: NO CudaEnabledDevice"<<endl;
        exit(2);
    }
    else
    {
        CudaDevice = cv::cuda::getDevice();
        cv::cuda::setDevice(CudaDevice);
    }
    t_init_cuda = ((double)cv::getTickCount() - t_init_cuda)/cv::getTickFrequency() * 1000;
    cout<<endl<<"\t*T_INIT_CUDA="<<t_init_cuda<<"ms\n";

    cv::Mat src = cv::imread(argv[1], 0);
    if (!src.data) exit(1);
    cv::cuda::GpuMat d_src(src);

    //CV::CUDA::RESIZE
    cv::cuda::GpuMat d_dst;
    double factor = 0.5;
    double t_gpu_resize = (double)cv::getTickCount();
    cv::cuda::resize(d_src, d_dst, cv::Size( (int) ((float) (d_src.cols)*factor) , (int) ((float) (d_src.rows)*factor)), 0, 0, CV_INTER_AREA);
    t_gpu_resize = ((double)cv::getTickCount() - t_gpu_resize)/cv::getTickFrequency() * 1000;
    cout<<endl<<"D_SRC="<<d_src.rows<<"x"<<d_src.cols<<" => D_DST="<<d_dst.rows<<"x"<<d_dst.cols<<endl;
    cout<<endl<<"\t*T_GPU_RESIZE="<<t_gpu_resize<<"ms\n";

    //CV::CUDA::MEANSTDDEV
    double t_meanstddev = (double)cv::getTickCount();
    cv::Scalar mean, stddev;
    std::vector<cv::cuda::GpuMat> d_src_split;
    cv::cuda::split(d_src, d_src_split);
    cv::cuda::meanStdDev(d_src_split[0], mean, stddev);
    t_meanstddev = ((double)cv::getTickCount() - t_meanstddev)/cv::getTickFrequency() * 1000.0;
    cout<<endl<<"mean="<<mean.val[0]<<" | stddev="<<stddev.val[0]<<endl;
    cout<<endl<<"\t*T_GPU_MEANSTDDEV="<<t_meanstddev<<"ms\n";

    return 0;
}

My friend, when you call the same function twice:
1. The first time, new device memory is allocated for the result ("according to the OpenCV wiki").
2. The second time, the already-allocated memory is reused, so the call is fast.
Here is the relevant function from the OpenCV source so you can see why:
void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream)
{
    if (!deviceSupports(FEATURE_SET_COMPUTE_13))
        CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");

    const GpuMat src = getInputMat(_src, stream);

    CV_Assert( src.type() == CV_8UC1 );

    GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream);

    NppiSize sz;
    sz.width  = src.cols;
    sz.height = src.rows;

    int bufSize;
#if (CUDA_VERSION <= 4020)
    nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
#else
    nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
#endif

    BufferPool pool(stream);
    GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1); // <--- this line creates a new GpuMat

    NppStreamHandler h(StreamAccessor::getStream(stream));

    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dst.ptr<Npp64f>(), dst.ptr<Npp64f>() + 1) );

    syncOutput(dst, _dst, stream);
}
And this is the function it calls to get that buffer:
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
}
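As a practical workaround, here is a minimal sketch (not from the answer above; it assumes a CV_8UC1 image, as the assertion above requires): run the call once right after setDevice() as a warm-up, so the one-time allocation and setup cost lands outside the timed path.
// Sketch: warm-up call so the first timed cv::cuda::meanStdDev does not pay
// the one-time buffer/library setup cost (assumes a CV_8UC1 input).
void warmUpMeanStdDev(const cv::cuda::GpuMat& d_gray)
{
    CV_Assert(d_gray.type() == CV_8UC1);
    cv::Scalar mean, stddev;
    cv::cuda::meanStdDev(d_gray, mean, stddev); // result intentionally discarded
}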
I hope this will help you.

Related

What causes this segmentation fault (core dumped) error at cudaMemcpy when copying to GPU?

I have been trying to fix segmentation fault (core dumped) error messages with a toy program when calling cudaMemcpy. It works for small images, but for bigger images it normally fails; I say normally, because it has sometimes succeeded when debugging with valgrind (more about that below). I have looked at similar questions, but have been unable to find the answer; sorry if this is a duplicate! I am just starting out (and following Programming Massively Parallel Processors).
Here is my code, cleaned up:
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include "opencv2/imgproc/imgproc.hpp"
#include <cuda.h>
#include <iostream>
#include <cuda_runtime.h>

using namespace cv;
using namespace std;

__global__ void
colorToGreyKernel(unsigned char* outPic, unsigned char* inPic, unsigned int width, unsigned int height){
    // printf("trying \n" );
    int Col = blockDim.x * blockIdx.x + threadIdx.x;
    int Row = blockDim.y * blockIdx.y + threadIdx.y;
    if( Col < width && Row < height){
        int greyOffset = Row * width + Col;
        int rgbOffset = greyOffset * 3;
        unsigned char b = inPic[rgbOffset];
        unsigned char g = inPic[rgbOffset +1];
        unsigned char r = inPic[rgbOffset +2];
        outPic[greyOffset] = 0.21f*r + 0.71f*g + 0.07f*b;
    }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    bool test = code == cudaSuccess;
    // cout << "code " << std::boolalpha<< test;
    if (code != cudaSuccess)
    {
        // const char *errorStr = NULL;
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

int main(int argc, char** argv )
{
    if ( argc != 2 )
    {
        printf("usage: DisplayImage.out <Image_Path>\n");
        return -1;
    }

    Mat image;
    unsigned int imSize[2] = {400,400};
    unsigned char* inPic = NULL;
    unsigned char* outPic = NULL;
    gpuErrchk(cudaMalloc(&inPic, imSize[0] * imSize[1] * 3 * sizeof(CV_8U)));
    gpuErrchk(cudaMalloc(&outPic, imSize[0] * imSize[1] * sizeof(CV_8U)));

    image = imread( argv[1], IMREAD_COLOR );
    resize(image, image, Size(imSize[0],imSize[1]));
    Mat greyImg(image.rows, image.cols, CV_8U, Scalar(125));
    size_t size = image.cols * image.rows * image.channels() * sizeof(CV_8U);

    // This is where it always fails for bigger images
    gpuErrchk(cudaMemcpy(inPic,(void*) &image.data[0], size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(outPic, (void*)&greyImg.data[0], size/3, cudaMemcpyHostToDevice));

    dim3 dimGrid(ceil(image.rows/16.0),ceil(image.cols/16.0),1);
    dim3 dimBlock(16,16,1);
    colorToGreyKernel<<<dimGrid, dimBlock>>>(outPic, inPic,(int) image.rows,(int) image.cols);
    cudaDeviceSynchronize();
    gpuErrchk(cudaGetLastError());

    gpuErrchk(cudaMemcpy(greyImg.data, outPic, size / 3, cudaMemcpyDeviceToHost));

    namedWindow("Display Image", WINDOW_AUTOSIZE );
    imshow("Display Image", greyImg);
    waitKey(0);

    cudaFree(&inPic[0]);
    cudaFree(&outPic[0]);
    return 0;
}
I'm able to allocate on the device, but the copying fails for bigger images. I've tried it using OpenCV's cuda module, and I can load any picture and do cvtColor on the device without resizing, so I conclude it's not a memory problem (nvidia-smi suggests the same).
When I run it under valgrind, I get a lot of "Invalid write of size 8" errors around this point, all referencing libcuda. I know it's this particular memcpy that's the problem, because I isolated it. Sometimes it also works under valgrind, but I've gathered that this is normal. I don't have experience with valgrind yet, but the memory issues don't make sense to me (I'm trying to copy to the device, so why a segmentation fault related to the host?).
My question is simple: where does the error come from, and how do I fix it?
NVCC = 11.1
GPU = GeForce GTX 960M (not a lot, but that shouldn't matter)
Again, I am new to programming in CUDA, but I have tried what I can think of and cannot isolate the problem! Thanks for your help.
The problem here relates to your usage of OpenCV. An item like CV_8U is not a type, it is a compiler #define. Therefore sizeof(CV_8U) is not doing what you think it is doing. Your intended usage is to capture the size of the underlying type (e.g. unsigned char, i.e. a type size of 1). However, sizeof(CV_8U) evidently returns the size of an integer, which is 4.
As a result of that, your calculation of size is wrong (4x too large). As a result of that, when the cudaMemcpy operation attempts to access &image.data[0] for size bytes, it attempts to copy past the end of the buffer. For small images, the overrun doesn't trigger the runtime check/limit. For a large enough size calculation (i.e. a large enough image) you will hit a seg fault. Although the failure is triggered within a CUDA call, the origin of the error is outside of CUDA.
One possible solution is to replace your usage of sizeof(CV_8U) with something like sizeof(unsigned char). Since that size is 1, you can also just delete the multiplication by sizeof(CV_8U) and get the same behavior.
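For illustration, a minimal sketch of the corrected line (the element type behind CV_8U is unsigned char, whose size is 1):
// Corrected: CV_8U elements are unsigned char (1 byte each), so do not
// multiply by sizeof(CV_8U), which is sizeof(int) == 4.
size_t size = image.cols * image.rows * image.channels() * sizeof(unsigned char);
// Equivalently, let OpenCV compute the byte count for you:
size_t sizeBytes = image.total() * image.elemSize();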
You can also avoid this sort of allocation and let OpenCV do the allocation (and host-device data copying) work for you, as demonstrated in the answers here and here.
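As a rough sketch of that OpenCV-managed approach (assuming OpenCV is built with the cudaimgproc module; the file name is hypothetical), GpuMat takes care of the device allocation and the host/device copies:
// Sketch: GpuMat manages device memory and host<->device copies; cvtColor
// then runs the colour-to-grey conversion on the GPU.
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudaimgproc.hpp>

cv::Mat bgr = cv::imread("input.jpg", cv::IMREAD_COLOR); // hypothetical input file
cv::cuda::GpuMat d_bgr, d_grey;
d_bgr.upload(bgr);                                       // host -> device
cv::cuda::cvtColor(d_bgr, d_grey, cv::COLOR_BGR2GRAY);   // runs on the GPU
cv::Mat grey;
d_grey.download(grey);                                   // device -> host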

pass Mat object C++ to Unity

I'd like to return a Mat object to Unity from C++ code. However, I get an access violation error in the C++ part, like this:
Unity Editor [version: Unity 2017.3.0f3_a9f86dcd79df]
SaliencyCV.dll caused an Access Violation (0xc0000005)
in module SaliencyCV.dll at 0033:270027f0.
Error occurred at 2018-03-06_235212.
C:\Program Files\Unity\Editor\Unity.exe, run by Dilara.
43% memory in use.
16266 MB physical memory [9199 MB free].
18698 MB paging file [9861 MB free].
134217728 MB user address space [134185466 MB free].
Read from location 990d0000 caused an access violation.
Here is the C++ code:
uchar* cppMethod(uchar* frameData, int WIDTH, int HEIGHT, int* rows, int* cols)
{
    Mat img(HEIGHT, WIDTH, CV_8UC3);
    img.data = frameData;
    flip(img, img, 0);

    Mat result = calculateSaliency(img);

    *rows = result.rows;
    *cols = result.cols;

    int length = result.rows * result.cols * 3;
    uchar* tmpArr = result.data;
    uchar* resultArray = new uchar[length];
    for (int i = 0; i < length; i++)
    {
        resultArray[i] = tmpArr[i];
    }
    return resultArray;
}
Can someone help me?
You should call the correct Mat constructor, the one that accepts an external data pointer, so that the object does not release/destruct the memory location that data points to. You can read about this behaviour in Mat::release().
The problem with your code is that:
Mat img(HEIGHT, WIDTH, CV_8UC3) allocates a memory block of type CV_8UC3 and size HEIGHT*WIDTH which is never used (because you change the data member to point to a different memory location anyway), and
at function exit, img is destructed, which results in a call to release(), which in turn destructs frameData. This is not the intended behaviour.
Change your first two lines to read
Mat img(HEIGHT, WIDTH, CV_8UC3, frameData);
And if you are passing resultArray to C#, where you are most likely not managing the pointed-to memory's lifetime, you will most likely have memory leaks. @Programmer has already suggested in his answer to your previous question that you should allocate the memory in C#, pass it to C++, and write in place on the C++ side.
In short, you should have something like:
#include <algorithm>

void cppMethod(uchar *frameData, uchar *out, const int WIDTH, const int HEIGHT,
               int *rows, int *cols) {
    /* this constructor will not manage frameData's lifetime */
    Mat img(HEIGHT, WIDTH, CV_8UC3, frameData);

    /* in-place operation */
    flip(img, img, 0);

    /* local variable --- it will be destructed properly */
    Mat result = calculateSaliency(img);

    /* well-defined if rows and cols are scalars passed by reference */
    *rows = result.rows;
    *cols = result.cols;

    /* make sure length will not overflow */
    int length = result.rows * result.cols * 3;

    /* you don't need this */
    // uchar *tmpArr = result.data;

    /* you should NOT do this */
    // uchar *resultArray = new uchar[length];

    // use std::copy from <algorithm>
    // for (int i = 0; i < length; i++) {
    //     resultArray[i] = tmpArr[i];
    // }
    std::copy(result.data, result.data + length, out);

    // return resultArray;
}
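For a quick sanity check on the C++ side, here is a minimal sketch of how a caller could drive this version (calculateSaliency, the file names, and the assumption that the saliency result keeps the input size are all hypothetical); the caller owns both buffers, which mirrors what the C# side would do:
#include <vector>
#include <opencv2/opencv.hpp>

int main()
{
    cv::Mat frame = cv::imread("frame.png", cv::IMREAD_COLOR); // hypothetical test frame
    int rows = 0, cols = 0;
    std::vector<uchar> out(frame.total() * 3);                 // caller-owned output buffer
    cppMethod(frame.data, out.data(), frame.cols, frame.rows, &rows, &cols);
    cv::Mat result(rows, cols, CV_8UC3, out.data());           // wraps out, no copy, no ownership
    cv::imwrite("saliency.png", result);
    return 0;
}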

How to use linear indexes on cv::cuda::PtrStepSzf data

I'm working with OpenCV 3.1 cv::cuda template matching, but the cv::cuda::minMaxLoc() function is too slow for my case. My match results have a minimum size of 128x128 and a maximum size of up to 512x512. On average, minMaxLoc() takes 1.65 ms for 128x128 and up to 25 ms for something like 350x350, which is too long since this is done hundreds of times.
I understand that my match sizes are maybe too small for what you would usually use on a GPU. But I want to test along the lines of what Robert Crovella did at "thrust::max_element slow in comparison cublasIsamax - More efficient implementation?" to see if I can get better performance.
My problem is that in all those reductions the data is read using linear indexes, and cv::cuda::PtrStepSzf does not allow this (at least I did not find how). I tried to reshape my match result, but I cannot do that because the data is not contiguous. Do I need to go toward cudaMallocPitch and cudaMemcpy2D? If that is the case, how do I do it with a cv::cuda::GpuMat read as a cv::cuda::PtrStepSzf object?
__global__ void minLoc(const cv::cuda::PtrStepSzf data,
                       float* minVal,
                       float* minValLoc
                       )
{
    int dsize = data.cols*data.rows;
    __shared__ volatile T vals[nTPB];
    __shared__ volatile int idxs[nTPB];
    __shared__ volatile int last_block;
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    last_block = 0;
    T my_val = FLOAT_MIN;
    int my_idx = -1;
    // sweep from global memory
    while (idx < dsize)
    {
        //data(idx) is an illegal call; the legal one is data(x,y)
        // How do I do it?
        if (data(idx) > my_val)
        {
            my_val = data(idx); my_idx = idx;
        }
        idx += blockDim.x*gridDim.x;
    }
    // ... rest of the kernel
}
void callMinLocKernel(cv::InputArray _input,
                      cv::Point minValLoc,
                      float minVal,
                      cv::cuda::Stream _stream)
{
    const cv::cuda::GpuMat input = _input.getGpuMat();
    dim3 cthreads(32, 32);
    dim3 cblocks(
        static_cast<int>(std::ceil(input.size().width /
            static_cast<double>(cthreads.x))),
        static_cast<int>(std::ceil(input.size().height /
            static_cast<double>(cthreads.y))));

    // code that creates and uploads d_min, d_minLoc
    float h_min = 9999;
    int h_minLoc = -1;
    float* d_min = 0;
    int* d_minLoc = 0;
    //gpuErrchk is defined elsewhere
    gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
    gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );
    gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );

    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
    minLoc<<<cblocks, cthreads, 0, stream>>>(input, d_min, d_minLoc);
    gpuErrchk(cudaGetLastError());

    //code to read the answer
    gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );

    minValLoc = cv::Point(h_minLoc / input.cols, h_minLoc % input.cols);
    minVal = h_min;
}
int main()
{
    //read background and template
    cv::Mat input = imread("cat.jpg", 0);
    cv::Mat templ = imread("catNose.jpg", 0);

    //convert to floats
    cv::Mat float_input, float_templ;
    input.convertTo(float_input, CV_32FC1);
    templ.convertTo(float_templ, CV_32FC1);

    //upload background and template to gpu
    cv::cuda::GpuMat d_src, d_templ, d_match;
    Size size = float_input.size();
    d_src.upload(float_input);
    d_templ.upload(float_templ);

    double min_val, max_val;
    Point min_loc, max_loc;

    Ptr<cv::cuda::TemplateMatching> alg = cuda::createTemplateMatching(d_src.type(), CV_TM_SQDIFF);
    alg->match(d_src, d_templ, d_match);
    cv::cuda::normalize(d_match, d_match, 0, 1, cv::NORM_MINMAX, -1);

    //Too slow
    //cv::cuda::minMaxLoc(d_match, &min_val, &max_val, &min_loc, &max_loc);
    callMinLocKernel(d_match, min_val, min_loc);
    return 0;
}
I did not find a way to actually use linear indexes with cv::cuda::PtrStepSzf, and I am not sure there is one. It looks like when this format is used, the data can only be accessed with two subscripts. Instead, I used the pointer ptr of the cv::cuda::GpuMat input variable in the kernel wrapper, as follows:
#define nTPB 1024
#define FLOAT_MAX 9999.0f

void callMinLocKernel(cv::InputArray _input,
                      cv::Point minValLoc,
                      float minVal,
                      cv::cuda::Stream _stream)
{
    const cv::cuda::GpuMat input = _input.getGpuMat();
    const float* linSrc = input.ptr<float>();
    size_t step = input.step;

    dim3 cthreads(nTPB);
    dim3 cblocks(
        static_cast<int>(std::ceil(input.size().width * input.size().height /
            static_cast<double>(nTPB))));

    // code that creates and uploads d_min, d_minLoc
    float h_min = 9999;
    int h_minLoc = -1;
    float* d_min = 0;
    int* d_minLoc = 0;
    //gpuErrchk is defined elsewhere
    gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
    gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );
    gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );

    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
    minLoc<<<cblocks, cthreads, 0, stream>>>(linSrc, step, input.size(), d_min, d_minLoc);
    gpuErrchk(cudaGetLastError());

    //code to read the answer
    gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );

    minValLoc = cv::Point(h_minLoc / input.cols, h_minLoc % input.cols);
    minVal = h_min;
}
And inside the kernel:
__global__ void minLoc(const float* data,
                       const size_t step,
                       cv::Size dataSz,
                       float* minVal,
                       int* minValLoc
                       )
{
    __shared__ volatile float vals[nTPB];
    __shared__ volatile int idxs[nTPB];
    __shared__ volatile int last_block;

    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    const int dsize = dataSz.height*dataSz.width;
    last_block = 0;
    float my_val = FLOAT_MAX;
    int my_idx = -1;
    // sweep from global memory
    while (idx < dsize)
    {
        int row = idx / dataSz.width;
        int id  = ( row*step / sizeof( float ) ) + idx % dataSz.width;
        if ( data[id] < my_val )
        {
            my_val = data[id];
            my_idx = idx;
        }
        idx += blockDim.x*gridDim.x;
    }
    // ... rest of the kernel
}
The step is in bytes, so it needs to be divided by sizeof(float) (the element type) before being used as an element offset.
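Alternatively, here is a sketch that is not part of the answer above (it assumes the same CV_32FC1 GpuMat named input and the gpuErrchk macro from the question): the pitched rows can be repacked into a contiguous device buffer with cudaMemcpy2D, after which plain linear indexing works directly.
// Sketch: repack a pitched GpuMat into a contiguous device buffer so a
// reduction kernel can use simple linear indexes.
float* d_linear = nullptr;
size_t rowBytes = input.cols * sizeof(float);
gpuErrchk( cudaMalloc(&d_linear, rowBytes * input.rows) );
gpuErrchk( cudaMemcpy2D(d_linear, rowBytes,                // dst, packed dst pitch
                        input.ptr<float>(), input.step,    // src, src pitch in bytes
                        rowBytes, input.rows,              // row width in bytes, row count
                        cudaMemcpyDeviceToDevice) );
// d_linear[idx] is now valid for idx in [0, input.rows * input.cols)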
I hope this helps!

NVIDIA Visual profiler does not generate a timeline

My question is almost the same as a question [asked here at SO before][1]. But no answer has been provided to it, so I am asking a separate question.
I am using the CUDA 7.0 toolkit on Windows 7, with VS 2013.
I tried to generate the timeline of the vector addition sample program and it worked. But when I follow exactly the same steps to generate a timeline of my own code, it keeps showing the message "Running application to generate timeline". I know that the kernel gets called and everything is working.
A cudaDeviceReset() call is also there, after everything related to CUDA is finished.
Program: I have changed my original question to provide a minimal working example which can reproduce the same problem. The following code does not generate a timeline with nvvp, irrespective of where I put cudaDeviceReset().
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
//OpenCV
#include <opencv2/highgui.hpp>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <stdio.h>
using namespace cv;
__global__ void colorTransformation_kernel(int numChannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst)
{
// Calculate our pixel's location
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
// Operate only if we are in the correct boundaries
if (x >= 0 && x < iw && y >= 0 && y < ih)
{
ptr_dst[numChannels* (iw*y + x) + 0] = ptr_source[numChannels* (iw*y + x) + 0];
ptr_dst[numChannels* (iw*y + x) + 1] = ptr_source[numChannels* (iw*y + x) + 1];
ptr_dst[numChannels* (iw*y + x) + 2] = ptr_source[numChannels* (iw*y + x) + 2];
}
}
int main()
{
while (1)
{
Mat image(400, 400, CV_8UC3, Scalar(0, 0, 255));
unsigned char *h_src = image.data;
size_t numBytes = image.rows * image.cols * 3;
int numChannels = 3;
unsigned char *dev_src, *dev_dst, *h_dst;
//Allocate memomry at device for SOURCE and DESTINATION and get their pointers
cudaMalloc((void**)&dev_src, numBytes * sizeof(unsigned char));
cudaMalloc((void**)&dev_dst, numBytes * sizeof(unsigned char));
////Copy the source image to the device i.e. GPU
cudaMemcpy(dev_src, h_src, numBytes * sizeof(unsigned char), cudaMemcpyHostToDevice);
////KERNEL
dim3 numOfBlocks(3 * (image.cols / 20), 3 * (image.rows / 20)); //multiplied by 3 because we have 3 channel image now
dim3 numOfThreadsPerBlocks(20, 20);
colorTransformation_kernel << <numOfBlocks, numOfThreadsPerBlocks >> >(numChannels, image.cols, image.rows, dev_src, dev_dst);
cudaDeviceSynchronize();
//Get the processed image
Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3);
h_dst = org_dijSDK_img.data;
cudaMemcpy(h_dst, dev_dst, numBytes * sizeof(unsigned char), cudaMemcpyDeviceToHost);
//DISPLAY PROCESSED IMAGE
imshow("Processed dijSDK image", org_dijSDK_img);
waitKey(33);
}
cudaDeviceReset();
return 0;
}
Very important clue: If I comment out the while(1) line and hence run the code only once, then nvvp generates a timeline. But in my original project I cannot get the timeline profile that way, because it contains multi-threading and other stuff due to which there is no image to process during the first run. So I need some way to generate the timeline with code containing an infinite while loop.
The problem in my code was the endless while loop, due to which cudaDeviceReset() was never called. There are two possible ways to deal with such situations:
If you are interested in timeline profiling only, just comment out your while loop and nvvp will be able to reach the cudaDeviceReset() present at the end of main().
There might be a situation where you must keep a loop inside your program. For example, in my original project containing multi-threading, there is no image to process during the initial 180 runs of the while loop. To deal with such situations, replace the while loop with a for loop which runs a limited number of times. For example, the following code helped me get a timeline profiling of 4 runs. I am posting only the modified main().
int main()
{
    cudaStream_t stream_one;
    cudaStream_t stream_two;
    cudaStream_t stream_three;

    //while (1)
    for (int i = 0; i < 4; i++)
    {
        cudaStreamCreate(&stream_one);
        cudaStreamCreate(&stream_two);
        cudaStreamCreate(&stream_three);

        Mat image = imread("DijSDK_test_image.jpg", 1);
        //Mat image(1080, 1920, CV_8UC3, Scalar(0,0,255));
        size_t numBytes = image.rows * image.cols * 3;
        int numChannels = 3;
        int iw = image.rows;
        int ih = image.cols;
        size_t totalMemSize = numBytes * sizeof(unsigned char);
        size_t oneThirdMemSize = totalMemSize / 3;

        unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst;

        //Allocate memory at device for SOURCE and DESTINATION and get their pointers
        cudaMalloc((void**)&dev_src_1, (totalMemSize) / 3);
        cudaMalloc((void**)&dev_src_2, (totalMemSize) / 3);
        cudaMalloc((void**)&dev_src_3, (totalMemSize) / 3);
        cudaMalloc((void**)&dev_dst_1, (totalMemSize) / 3);
        cudaMalloc((void**)&dev_dst_2, (totalMemSize) / 3);
        cudaMalloc((void**)&dev_dst_3, (totalMemSize) / 3);

        //Get the processed image
        Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3, Scalar(0, 0, 255));
        h_dst = org_dijSDK_img.data;

        //copy new data of image to the host pointer
        h_src = image.data;

        //Copy the source image to the device i.e. GPU
        cudaMemcpyAsync(dev_src_1, h_src, (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_one);
        cudaMemcpyAsync(dev_src_2, h_src + oneThirdMemSize, (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_two);
        cudaMemcpyAsync(dev_src_3, h_src + (2 * oneThirdMemSize), (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_three);

        //KERNEL--stream-1
        callMultiStreamingCudaKernel(dev_src_1, dev_dst_1, numChannels, iw, ih, &stream_one);
        //KERNEL--stream-2
        callMultiStreamingCudaKernel(dev_src_2, dev_dst_2, numChannels, iw, ih, &stream_two);
        //KERNEL--stream-3
        callMultiStreamingCudaKernel(dev_src_3, dev_dst_3, numChannels, iw, ih, &stream_three);

        //RESULT copy: GPU to CPU
        cudaMemcpyAsync(h_dst, dev_dst_1, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_one);
        cudaMemcpyAsync(h_dst + oneThirdMemSize, dev_dst_2, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_two);
        cudaMemcpyAsync(h_dst + (2 * oneThirdMemSize), dev_dst_3, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_three);

        // wait for results
        cudaStreamSynchronize(stream_one);
        cudaStreamSynchronize(stream_two);
        cudaStreamSynchronize(stream_three);

        //Assign the processed data to the display image.
        org_dijSDK_img.data = h_dst;

        //DISPLAY PROCESSED IMAGE
        imshow("Processed dijSDK image", org_dijSDK_img);
        waitKey(33);
    }
    cudaDeviceReset();
    return 0;
}
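A further option worth noting (a sketch, not part of the answer above): the region to profile can be marked explicitly with the CUDA profiler API. With profiling-from-start disabled in the profiler settings, only the marked iterations are captured, so even a program that never leaves its loop can produce a timeline for that region.
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

// Sketch: capture a bounded profiling region inside an otherwise endless loop.
// Assumes the profiler is launched with profiling-from-start turned off.
void processForever()
{
    for (int i = 0; ; ++i)
    {
        if (i == 180) cudaProfilerStart();   // begin collecting once real frames arrive
        // ... per-frame cudaMemcpyAsync calls and kernel launches go here ...
        if (i == 184) cudaProfilerStop();    // stop collecting after a few iterations
    }
}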

running opencv code with matlab via mex fails while on VisualStudio it works

I want to extract some Harris corners from an image and get FREAK descriptors. Here is how I try to do it:
(The passed variables are globally defined.)
void computeFeatures(cv::Mat &src, std::vector<cv::KeyPoint> &keypoints, cv::Mat &desc ) {
    cv::Mat featureSpace;
    featureSpace = cv::Mat::zeros( src.size(), CV_32FC1 );

    //- Detector parameters
    int blockSize = 3;
    int apertureSize = 3;
    double k = 0.04;

    //- Detecting corners
    cornerHarris( src, featureSpace, blockSize, apertureSize, k, cv::BORDER_DEFAULT );

    //- Thresholding featureSpace
    keypoints.clear();
    nonMaximumSuppression(featureSpace, keypoints, param.nms_n);

    //- compute FREAK-descriptor
    cv::FREAK freak(false, false, 22.0f, 4);
    freak.compute(src, keypoints, desc);
}
I can compile it with Visual Studio 12 as well as with Matlab R2013b via mex. When I run it as a "stand-alone" (.exe) it works just fine. When I try to execute it via Matlab, it fails with this message:
A buffer overrun has occurred in MATLAB.exe which has corrupted the
program's internal state. Press Break to debug the program or Continue
to terminate the program.
I mexed with the debug option '-g' and attached Visual Studio to Matlab to get closer to the error:
After nonMaximumSuppression() the size of keypoints is 233; when I jump into freak.compute(), the size is suddenly 83, with "random" values stored.
The actual error then occurs in KeyPointsFilter::runByKeypointSize, when keypoints should be erased,
in keypoint.cpp line 256:
void KeyPointsFilter::runByKeypointSize( vector<KeyPoint>& keypoints, float minSize, float maxSize )
{
    CV_Assert( minSize >= 0 );
    CV_Assert( maxSize >= 0);
    CV_Assert( minSize <= maxSize );

    keypoints.erase( std::remove_if(keypoints.begin(), keypoints.end(), SizePredicate(minSize, maxSize)),
                     keypoints.end() );
}
Is there some error I'm making with passing the keyPoint-vector? Has anybody run into a similar problem?
EDIT:
Here is the mex file, with the additional header "opencv_matlab.hpp" taken from MATLAB Central:
#include "opencv_matlab.hpp"
void mexFunction (int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[]) {
// read command
char command[128];
mxGetString(prhs[0],command,128);
if (!strcmp(command,"push") || !strcmp(command,"replace")) {
// check arguments
if (nrhs!=1+1 && nrhs!=1+2)
mexErrMsgTxt("1 or 2 inputs required (I1=left image,I2=right image).");
if (!mxIsUint8(prhs[1]) || mxGetNumberOfDimensions(prhs[1])!=2)
mexErrMsgTxt("Input I1 (left image) must be a uint8_t matrix.");
// determine input/output image properties
const int *dims1 = mxGetDimensions(prhs[1]);
const int nDims1 = mxGetNumberOfDimensions(prhs[1]);
const int rows1 = dims1[0];
const int cols1 = dims1[1];
const int channels1 = (nDims1 == 3 ? dims1[2] : 1);
// Allocate, copy, and convert the input image
// #note: input is double
cv::Mat I1_ = cv::Mat::zeros(cv::Size(cols1, rows1), CV_8UC(channels1));
om::copyMatrixToOpencv<uchar>((unsigned char*)mxGetPr(prhs[1]), I1_);
// push back single image
if (nrhs==1+1) {
// compute features and put them to ring buffer
pushBack(I1_,!strcmp(command,"replace"));
// push back stereo image pair
} else {
if (!mxIsUint8(prhs[2]) || mxGetNumberOfDimensions(prhs[2])!=2)
mexErrMsgTxt("Input I2 (right image) must be a uint8_t matrix.");
// determine input/output image properties
const int *dims2 = mxGetDimensions(prhs[2]);
const int nDims2 = mxGetNumberOfDimensions(prhs[2]);
const int rows2 = dims2[0];
const int cols2 = dims2[1];
const int channels2 = (nDims2 == 3 ? dims2[2] : 1);
// Allocate, copy, and convert the input image
// #note: input is double
cv::Mat I2_ = cv::Mat::zeros(cv::Size(cols2, rows2), CV_8UC(channels2));
om::copyMatrixToOpencv<uchar>((unsigned char*)mxGetPr(prhs[2]), I2_);
// check image size
if (dims1_[0]!=dims2_[0] || dims1_[1]!=dims2_[1])
mexErrMsgTxt("Input I1 and I2 must be images of same size.");
// compute features and put them to ring buffer
pushBack(I1_,I2_,!strcmp(command,"replace"));
}
}else {
mexPrintf("Unknown command: %s\n",command);
}
}
And here is an additional part of the main cpp project.
std::vector<cv::KeyPoint> k1c1, k2c1, k1p1, k2p1; //KeyPoints
cv::Mat d1c1, d2c1, d1p1, d2p1; //descriptors

void pushBack (cv::Mat &I1,cv::Mat &I2,const bool replace) {
    // sanity check
    if (I1.empty()) {
        cerr << "ERROR: Image empty!" << endl;
        return;
    }

    if (replace) {
        //if (!k1c1.empty())
        k1c1.clear(); k2c1.clear();
        d1c1.release(); d2c1.release();
    } else {
        k1p1.clear(); k2p1.clear();
        d1p1.release(); d2p1.release();
        k1p1 = k1c1; k2p1 = k2c1;
        d1c1.copyTo(d1p1); d2c1.copyTo(d2p1);
        k1c1.clear(); k2c1.clear();
        d1c1.release(); d2c1.release();
    }

    // compute new features for current frame
    computeFeatures(I1,k1c1,d1c1);
    if (!I2.empty())
        computeFeatures(I2,k2c1,d2c1);
}
And here is how I call the mex-file from Matlab
I1p = imread('\I1.bmp');
I2p = imread('\I2.bmp');
harris_freak('push',I1p,I2p);
Hope this helps...
I hope this is the correct way to give an answer to my own question.
After a couple of days I found a kind of workaround. Instead of building the mex file in Matlab, which gives the above-mentioned error, I built it in Visual Studio with instructions taken from here.
Now everything works just fine.
It kind of bothers me to not know how to do it with matlab, but hey, maybe someone still has an idea.
Thanks to the commenters for taking the time to look through my question!
If you have the Computer Vision System Toolbox then you do not need mex. It includes the detectHarrisFeatures function for detecting Harris corners, and the extractFeatures function, which can compute FREAK descriptors.