Efficient way to generate a three-channel LUT mask - c++

I have some code to create an HSV mask that looks approximately like this (taken from this Japanese-language page):
void colorExtraction(const cv::gpu::GpuMat &src,
cv::gpu::GpuMat *dst)
{
cv::Mat lut(256, 1, CV_8UC3);
for (int i = 0; i < 256; i++)
{
cv::Vec3b thisHSV;
thisHSV[0] = inHRange(i) ? 255 : 0;
thisHSV[1] = inSRange(i) ? 255 : 0;
thisHSV[2] = inVRange(i) ? 255 : 0;
lut.at<cv::Vec3b>(i) = thisHSV;
}
/* apply LUT to input image */
cv::gpu::GpuMat extracted(src.size(), CV_8UC3);
cv::gpu::LUT(src, lut, extracted);
/* divide image into each channel */
std::vector<cv::gpu::GpuMat> channels;
cv::gpu::split(extracted, channels);
/* create mask */
cv::gpu::bitwise_and(channels[0], channels[1], *dst);
cv::gpu::bitwise_and(*dst, channels[2], *dst);
}
This works, but despite the operations being mostly in the GPU, it is slower than I would like, perhaps due to a number of intermediate GpuMats. I suspect there might be a nice way to fold this all up into just one or two calls, but I don't know what it could be. Writing my own CUDA kernel is, of course, an option, but I want to check to see if I don't need to reinvent the wheel.

To self-answer, I ended up coding my own kernel to do the LUT(), split(), and two bitwise_and()s in a single call:
__global__ void colorExtractionKernel(cv::gpu::PtrStepSz<uchar3> const src,
cv::gpu::PtrStepSz<unsigned char> dst,
cv::gpu::PtrStepSz<uchar3> const lut)
{
unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
// Extract post-LUT hsv flags
uchar3 srcHSV = src(row, col);
unsigned char h = lut(srcHSV.x, 0).x;
unsigned char s = lut(srcHSV.y, 0).y;
unsigned char v = lut(srcHSV.z, 0).z;
// Result pixel is the AND of the pixels
dst(row, col) = (h & s & v);
}
void colorExtraction_cuda(const cv::gpu::GpuMat &src, // input HSV image
cv::gpu::GpuMat &dst, // specified color extracted binarized image
cv::Mat const &lut) // Look-up thresholds
{
cudaStream_t thisStream;
gpuErrChk(cudaStreamCreate(&thisStream));
dim3 Threads(32, 16);
dim3 Blocks((src.cols + Threads.x - 1)/Threads.x, (src.rows + Threads.y - 1)/Threads.y);
cv::gpu::GpuMat gpuLut(lut);
colorExtractionKernel<<<Blocks, Threads, 0, thisStream>>>(src, dst, gpuLut);
gpuErrChk(cudaStreamSynchronize(thisStream));
gpuErrChk(cudaStreamDestroy(thisStream));
gpuErrChk(cudaGetLastError());
}

Related

OpenCV image stacking - Alignment is off

I have some code that takes multiple images, aligns them, and stacks them together. For some reason, the alignment is off. A simplified version of the code is below
void stackImages(uint8_t **pixels, uint32_t width, uint32_t height, size_t len)
{
cv:: Mat firstImg;
cv::Mat stacked;
for (int i = 0; i < len; i++)
{
// Transformation matrix
cv::Mat1f M = cv::Mat1f(cv::Mat::eye(3, 3, CV_8UC1));
// Convert pixels (4 channel RGBA) to Mat
cv::Mat pixels = cv::Mat(height, width, CV_8UC4, pixels[i]);
cv::Mat gray;
cv::cvtColor(pixels, gray, cv::COLOR_RGBA2GRAY);
// skip the reference image
if(!i) {
firstImg = gray;
stacked = gray;
continue;
}
cv::Mat warped;
// create size struct
cv::Size size;
size.width = width;
size.height = height;
// create the transformation matrix
cv::findTransformECC(firstImg, gray, M, cv::MOTION_HOMOGRAPHY);
// warp the image according ot the transformation matrix
cv::warpPerspective(gray, warped, M, size);
// stack the image
stacked += warped;
}
// write the image
cv::imwrite("stacked.jpg", stacked);
}
I've tested this code with three images taken in rapid succession and the results are below. This is my first foray into image processing, so I'm mostly following online documentation.

OpenCV Histogram Mat to Bitmap for Picturebox

I use a FLIR Camera (Grasshopper3) and the SDK (Spinnaker) to take an image (Mono8). After converting the image, I wuold like to compute the Histogram and display it in my GUI in a picturebox (C++ CLR/CLI .net environment). For this, I need to convert it, but I guess there is a mistake in the color conversion or the BitMap creation.
Here is the code:
Spinnaker::ImagePtr convertedImage_MONO = Grasshopper3.pResultImage_MONO->Convert(Spinnaker::PixelFormat_Mono8, Spinnaker::NO_COLOR_PROCESSING); // Raw image is converted to Mono8
unsigned int XPadding = convertedImage_MONO->GetXPadding();
unsigned int YPadding = convertedImage_MONO->GetYPadding();
unsigned int rowsize = convertedImage_MONO->GetWidth();
unsigned int colsize = convertedImage_MONO->GetHeight();
//image data contains padding. When allocating Mat container size, you need to account for the X,Y image data padding.
cv::Mat cvimg_Mono = cv::Mat(colsize + YPadding, rowsize + XPadding, CV_8UC1, convertedImage_MONO->GetData(), convertedImage_MONO->GetStride());
cvtColor(cvimg_Mono, cvimg_Mono, cv::COLOR_BGR2BGRA);
// Histogram
int bins = 256;
int histSize[] = { bins };
// Set ranges for histogram bins
float lranges[] = { 0, 256 };
const float* ranges[] = { lranges };
// create matrix for histogram
cv::Mat hist;
int channels[] = { 0 };
// create matrix for histogram visualization
int const hist_height = 256;
cv::Mat3b hist_image = cv::Mat3b::zeros(hist_height, bins);
cv::calcHist(&cvimg_Mono, 1, channels, cv::Mat(), hist, 1, histSize, ranges, true, false);
double max_val = 0;
minMaxLoc(hist, 0, &max_val);
// visualize each bin
for (int b = 0; b < bins; b++)
{
float const binVal = hist.at<float>(b);
int const height = cvRound(binVal*hist_height / max_val);
cv::line(hist_image, cv::Point(b, hist_height - height), cv::Point(b, hist_height), cv::Scalar::all(255));
}
cv::Mat Histogram_Mono = hist_image;
cv::resize(Histogram_Mono, Histogram_Mono, cv::Size(pictureBox_Mono->Width, pictureBox_Mono->Height), cv::INTER_AREA);
hBit_Mono = CreateBitmap(Histogram_Mono.cols, Histogram_Mono.rows, 1, 32, Histogram_Mono.data); // hBit_Mono was created global
bmp_Mono = Bitmap::FromHbitmap((IntPtr)hBit_Mono); // bmp_Mono was created as a global Bitmap
pictureBox_Mono->Image = bmp_Mono;

how to use cv::Mat imgbuf created by pointer to video data

I am reading the raw video data from the read buffer using
cv::Mat imgbuf(Size(640, 480), CV_8UC3, &mem[0], (640*3));
This variable imgbuf I am passing to face detection algorithm which detects the face & draws the rectangle around the face. after that I am getting output something like
I tried with below code where I am performing resize operation before pass to face detection algorithm. by using this method it is working fine. but without resizing function I am getting noticeable output with rectangle around the face.
while(1)
{
unsigned char *mem = (unsigned char*)mmap(NULL, page_offset + len,
PROT_READ |PROT_WRITE, MAP_PRIVATE, fd, page_base);
cv::Mat imgbuf(Size(640, 480), CV_8UC3, &mem[0], (640*3));
cv::resize(imgbuf,imgbuf,(640,480)); //Dummy function to get the right output.
auto result = v->facedetection(imgbuf);
for (const auto &r : result.rects) {
cv::rectangle(imgbuf,cv::Rect{ cv::Point(r.x * imgbuf.cols, r.y *
imgbuf.rows),cv::Size{(int)(r.width * imgbuf.cols), (int)(r.height *
imgbuf.rows) } },0xff);
}
imshow("face-detection", imgbuf);
waitKey(1);
can anybody help be to sort out this problem
Test this method:
unsigned char *mem = (unsigned char*)mmap(NULL, page_offset + len,
PROT_READ |PROT_WRITE, MAP_PRIVATE, fd, page_base);
cv::Mat imgbuf(480,640, CV_8UC3, &mem[0]);
cv::Mat img_2, img_3;
cv::resize(imgbuf,img_2,cv::Size(640,480));
img_2.copyTo(img_3);
auto result = v->facedetection(img_2);
for (const auto &r : result.rects)
{
cv::Rect myR = cv::Rect(r.x * img_2.cols, r.y * img_2.rows, (int)(r.width * img_2.cols),
(int)(r.height * img_2.rows));
cv::rectangle(img_3,myR,Scalar(0, 0, 255), 1);
}
imshow("Result", img_3);
waitKey(0);
After getting a valid result you can optimize this and use less of "Mat"s.

CUDA error with processing the image

I'm trying to get black and white image as the output with color image as the input. I'm using an OpenCV to get the image and write the output, and CUDA to make the image black and white in kernel. I tried the same code, but without OpenCV, and it worked fine. But now the output is slightly different from what I really expect to get.
I think that CUDA code needs some modification to work with OpenCV. I worked a bit with it but failed to find the way to do that. Maybe somebody can give me an advice or modify my code, please? I'm really confused with this problem.
__global__ void addMatrix(uchar4 *DataIn, unsigned char *DataOut)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
DataOut[idx] = (DataIn[idx].x + DataIn[idx].y + DataIn[idx].z)/3;
}
int main()
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
char* c = "";
printf("Input source of image\n Example of right directory file: E:\henrik-evensen-castle-valley-v03.jpg\n Your turn:\n");
char *tbLEN;
tbLEN = new char [1024];
cin.getline(tbLEN,1024);
cout<< endl << "Your image: " << tbLEN << endl;
//Data for input image
IplImage* image;
image = cvLoadImage(tbLEN, 1);
int height = image->height;
int width = image->width;
int step = image->widthStep;
int SizeIn = (step*height);
printf("\nProcessing image\n");
//Data for output image
IplImage *image2 = cvCreateImage(cvSize(width, height), IPL_DEPTH_8U, 1);
int step2 = image2->widthStep;
int SizeOut = step2 * height;
//GPU
uchar4* DatIn = (uchar4*)image->imageData;
unsigned char* DatOut = (unsigned char*)image2->imageData;
uchar4 *datIndev;
unsigned char *datOutdev;
printf("Allocating memory on Device\n");
/* Allocate memory on Device */
cudaMalloc(&datIndev, SizeIn * sizeof(unsigned char));
cudaMalloc(&datOutdev, SizeOut * sizeof(unsigned char));
printf("Copy data on Device\n");
/* Copy data on Device */
cudaMemcpy(datIndev, DatIn, SizeIn * sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaMemcpy(datOutdev, DatOut, SizeOut * sizeof(unsigned char), cudaMemcpyHostToDevice);
int NumThreadsX = deviceProp.maxThreadsPerBlock;
int NumBlocksX = (width * height)/NumThreadsX;
dim3 blocks(NumBlocksX, 1, 1);
dim3 threads(NumThreadsX, 1, 1);
addMatrix <<< blocks, threads >>> (datIndev, datOutdev);
cudaMemcpy(DatOut, datOutdev, SizeOut * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cvNamedWindow("Imagecolor");
cvShowImage("Imagecolor", image);
cvNamedWindow("Gray");
cvShowImage("Gray", image2);
const char* filename1 = "CcPwSwMW4AELPUc.jpg";
printf("Saving an output image\n");
cvSaveImage( filename1, image2 );
cudaFree(datOutdev);
cudaFree(datIndev);
cvWaitKey(0);
return 0;
}
There are several problems here:
Your assumption about four channel data is incorrect. Your code will load a three channel BGR image into memory from file. So you need to change references from uchar4 to ucharand then have each thread load three bytes from the source image within your kernel
Your kernel itself contains a potential arithmetic error. The sum of three unsigned char pixel values can overflow an unsigned char intermediate result and produce an incorrect average. You should use a larger type for the calculation.
Taken together, your kernel should look something like this:
__global__ void addMatrix(unsigned char *DataIn, unsigned char *DataOut)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int b = DataIn[3*idx];
int g = DataIn[3*idx+1];
int r = DataIn[3*idx+2];
DataOut[idx] = (unsigned char)((b + r + g)/3);
}
Then you might find your image looks correct.

OpenCV replacing specific pixel values with another value

I want to detect a specific pixel value (let's say 128 in a unsigned 8 bit 1-channel image) in a cv::Mat image and replace the value of all the pixels with that specific value with another value (replacing each 128 with 120). Is there any efficient way of doing this? Or should I do the search and assertion operations pixel by pixel?
I started coding but could not completed. Here is the part of my code:
cv::Mat source;
unsigned oldValue = 128;
unsigned newValue = 120;
cv::Mat temp = (source == oldValue);
You can use setTo, using a mask:
Mat src;
// ... src is somehow initialized
int oldValue = 128;
int newValue = 120;
src.setTo(newValue, src == oldValue);
not sure whether it is more efficient than .setTo , but you could use a look-up-table (especially if you have multiple values you want to replace and you have to replace the same values in multiple images (e.g. in each image of a video stream)).
int main()
{
cv::Mat input = cv::imread("../inputData/Lenna.png");
cv::Mat gray;
cv::cvtColor(input,gray,CV_BGR2GRAY);
// prepare this once:
cv::Mat lookUpTable(1, 256, CV_8U);
uchar* p = lookUpTable.data;
for( int i = 0; i < 256; ++i)
{
p[i] = i;
}
// your modifications
p[128] = 120;
// now you can use LUT efficiently
cv::Mat result;
cv::LUT(gray, lookUpTable, result);
cv::imshow("result", result);
cv::imwrite("../outputData/LUT.png", result);
cv::waitKey(0);
return 0;
}
According to http://docs.opencv.org/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#the-core-function this is very efficient in special scenarios.