Related
This is a simple program to change the contrast and brightness of an image. I have noticed that there is another program with one simple difference: saturate_cast is added to the code.
I don't understand the reason for doing this: there seems to be no need to convert to unsigned char (uchar), because both versions of the code (with saturate_cast<uchar> and without it) output the same result. I would appreciate it if anyone could help.
Here is the code:
#include "opencv2/imgcodecs.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <iostream>
#include "Source.h"
using namespace cv;
double alpha;
int beta;
/// Applies the linear transform new = alpha * old + beta to every channel of
/// every pixel of the image named in argv[1], then shows the original and the
/// transformed image until a key is pressed.
/// Returns 0 on success and 1 when no readable image was supplied.
int main(int argc, char** argv)
{
    if (argc < 2)
    {
        std::cerr << "Usage: <program> <image>" << std::endl;
        return 1;
    }

    /// Read image given by user
    Mat image = imread(argv[1]);
    if (image.empty())
    {
        // Without this check the pixel loop below would run on an empty matrix.
        std::cerr << "Could not read image: " << argv[1] << std::endl;
        return 1;
    }
    Mat image2 = Mat::zeros(image.size(), image.type());

    /// Initialize values
    std::cout << " Basic Linear Transforms " << std::endl;
    std::cout << "-------------------------" << std::endl;
    std::cout << "* Enter the alpha value [1.0-3.0]: "; std::cin >> alpha;
    std::cout << "* Enter the beta value [0-100]: "; std::cin >> beta;

    /// Transform every pixel. alpha * value + beta is a double that can leave
    /// [0, 255], so saturate_cast clamps it instead of letting it wrap around.
    for (int row = 0; row < image.rows; row++)
    {
        for (int col = 0; col < image.cols; col++)
        {
            for (int c = 0; c < 3; c++)
            {
                image2.at<Vec3b>(row, col)[c] =
                    saturate_cast<uchar>(alpha * (image.at<Vec3b>(row, col)[c]) + beta);
            }
        }
    }

    /// Create Windows
    namedWindow("Original Image", 1);
    namedWindow("New Image", 1);

    /// Show stuff
    imshow("Original Image", image);
    imshow("New Image", image2);

    /// Wait until user press some key
    waitKey();
    return 0;
}
Since the result of your expression may go outside the valid range for uchar, i.e. [0,255], you'd better always use saturate_cast.
In your case, the result of the expression: alpha*(image.at<Vec3b>(x, y)[c]) + beta is a double, so it's safer to use saturate_cast<uchar> to clamp values correctly.
Also, this improves readability, since it's easy to see that you want a uchar out of an expression.
Without using saturate_cast you may have unexpected values:
uchar u1 = 257; // u1 = 1, why a very bright value is set to almost black?
uchar u2 = saturate_cast<uchar>(257); // u2 = 255, a very bright value is set to white
/// Converts a double to an 8-bit unsigned value with saturation.
///
/// The value is rounded by adding 0.5 and truncating; results below 0 are
/// clamped to 0 and results above 255 are clamped to 255. NaN maps to 0.
///
/// @param val value to convert
/// @return the rounded value clamped to [0, 255]
inline unsigned char saturate_cast_uchar(double val) {
    val += 0.5; // so that the truncating cast below rounds to nearest
    // Written as a negated >= so that NaN also takes this branch; casting NaN
    // to an integer type would be undefined behaviour.
    if (!(val >= 0.0))
        return 0;
    if (val > 0xff)
        return 0xff;
    return static_cast<unsigned char>(val);
}
If val lies between 0 and 255, this function returns the rounded value;
if val lies outside the range [0, 255], it returns the lower or upper boundary value.
I have implemented Neural network using OpenCV ANN Library. I am newbie in this field and I learn everything about it online (Mostly StackOverflow).
I am using this ANN for detection of number plate. I did segmentation part using OpenCV image processing library and it is working good. It performs character segmentation and gives it to the NN part of the project. NN is going to recognize the number plate.
I have sample images of 20x30, therefore I have 600 neurons in input layer. As there are 36 possibilities (0-9,A-Z) I have 36 output neurons. I kept 100 neurons in hidden layer. The predict function of OpenCV is giving me the same output for every segmented image. That output is also showing some large negative(< -1). I have used cv::ml::ANN_MLP::SIGMOID_SYM as an activation function.
Please don't mind as there is lot of code wrongly commented (I am doing trial and error).
I need to find out what is the output of predict function. Thank you for your help.
#include <opencv2/opencv.hpp>
// Network configuration for the character classifier.
// NOTE(review): this is the interior of a function whose opening line is not
// part of the visible code; the matching closing brace appears further down.

// These three values are only referenced by commented-out sample code below.
int inputLayerSize = 1;
int outputLayerSize = 1;
int numSamples = 2;
// Layer sizes as a 3x1 column of ints: input, hidden, output.
Mat layers = Mat(3, 1, CV_32S);
layers.row(0) =Scalar(600) ;
// NOTE(review): the hidden layer is set to 20 here, while layerSizes below
// lists 100 -- confirm which value is intended.
layers.row(1) = Scalar(20);
layers.row(2) = Scalar(36);
// Not passed to the network in the code shown; setLayerSizes receives `layers`.
vector<int> layerSizes = { 600,100,36 };
Ptr<ml::ANN_MLP> nnPtr = ml::ANN_MLP::create();
// Unused in the code shown.
vector <int> n;
//nnPtr->setLayerSizes(3);
nnPtr->setLayerSizes(layers);
nnPtr->setTrainMethod(ml::ANN_MLP::BACKPROP);
// Termination criteria: COUNT | EPS with a count of 1000 and an epsilon of 0.00001.
nnPtr->setTermCriteria(TermCriteria(cv::TermCriteria::COUNT | cv::TermCriteria::EPS, 1000, 0.00001f));
// Symmetric sigmoid activation with both of its parameters set to 1.
nnPtr->setActivationFunction(cv::ml::ANN_MLP::SIGMOID_SYM, 1, 1);
// Back-propagation weight scale and momentum scale, both 0.5.
nnPtr->setBackpropWeightScale(0.5f);
nnPtr->setBackpropMomentumScale(0.5f);
/*CvANN_MLP_TrainParams params = CvANN_MLP_TrainParams(
// terminate the training after either 1000
// iterations or a very small change in the
// network wieghts below the specified value
cvTermCriteria(CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 1000, 0.000001),
// use backpropogation for training
CvANN_MLP_TrainParams::BACKPROP,
// co-efficents for backpropogation training
// (refer to manual)
0.1,
0.1);*/
/* Mat samples(Size(inputLayerSize, numSamples), CV_32F);
samples.at<float>(Point(0, 0)) = 0.1f;
samples.at<float>(Point(0, 1)) = 0.2f;
Mat responses(Size(outputLayerSize, numSamples), CV_32F);
responses.at<float>(Point(0, 0)) = 0.2f;
responses.at<float>(Point(0, 1)) = 0.4f;
*/
//reading chaos image
// we will read the classification numbers into this variable as though it is a vector
// close the traning images file
/*vector<int> layerInfo;
layerInfo=nnPtr->get;
for (int i = 0; i < layerInfo.size(); i++) {
cout << "size of 0" <<layerInfo[i] << endl;
}*/
// Training step: converts the sample/response matrices to 32-bit float,
// prints their dimensions, displays them and trains the network on them.
// NOTE(review): matTrainingImagesAsFlattenedFloats, matSamples and
// matClassificationInts are declared outside the visible code; presumably they
// are filled by gen() before this point -- confirm against the missing part.
cv::imshow("chaos", matTrainingImagesAsFlattenedFloats);
// cout <<abc << endl;
matTrainingImagesAsFlattenedFloats.convertTo(matTrainingImagesAsFlattenedFloats, CV_32F);
//matClassificationInts.reshape(1, 496);
matClassificationInts.convertTo(matClassificationInts, CV_32F);
matSamples.convertTo(matSamples, CV_32F);
// Debug output: rows and columns of each of the three matrices.
std::cout << matClassificationInts.rows << " " << matClassificationInts.cols << " ";
std::cout << matTrainingImagesAsFlattenedFloats.rows << " " << matTrainingImagesAsFlattenedFloats.cols << " ";
std::cout << matSamples.rows << " " << matSamples.cols;
imshow("Samples", matSamples);
// Shows the training matrix again in the same "chaos" window, now after conversion.
imshow("chaos", matTrainingImagesAsFlattenedFloats);
// One sample per row; matSamples is passed as the response matrix.
Ptr<ml::TrainData> trainData = ml::TrainData::create(matTrainingImagesAsFlattenedFloats, ml::SampleTypes::ROW_SAMPLE, matSamples);
// The return value of train() is ignored; success is read back via isTrained().
nnPtr->train(trainData);
bool m = nnPtr->isTrained();
if (m)
std::cout << "training complete\n\n";
// cv::Mat matCurrentChar = Mat(cv::Size(matTrainingImagesAsFlattenedFloats.cols, matTrainingImagesAsFlattenedFloats.rows), CV_32F);
// cout << "samples:\n" << samples << endl;
//cout << "\nresponses:\n" << responses << endl;
/* if (!nnPtr->train(trainData))
return 1;*/
/* cout << "\nweights[0]:\n" << nnPtr->getWeights(0) << endl;
cout << "\nweights[1]:\n" << nnPtr->getWeights(1) << endl;
cout << "\nweights[2]:\n" << nnPtr->getWeights(2) << endl;
cout << "\nweights[3]:\n" << nnPtr->getWeights(3) << endl;*/
//predicting
std::vector <cv::String> filename;
cv::String folder = "./plate/";
cv::glob(folder, filename);
if (filename.empty()) { // if unable to open image
std::cout << "error: image not read from file\n\n"; // show error message on command line
return(0); // and exit program
}
String strFinalString;
for (int i = 0; i < filename.size(); i++) {
cv::Mat matTestingNumbers = cv::imread(filename[i]);
cv::Mat matGrayscale; //
cv::Mat matBlurred; // declare more image variables
cv::Mat matThresh; //
cv::Mat matThreshCopy;
cv::Mat matCanny;
//
cv::cvtColor(matTestingNumbers, matGrayscale, CV_BGR2GRAY); // convert to grayscale
matThresh = cv::Mat(cv::Size(matGrayscale.cols, matGrayscale.rows), CV_8UC1);
for (int i = 0; i < matGrayscale.cols; i++) {
for (int j = 0; j < matGrayscale.rows; j++) {
if (matGrayscale.at<uchar>(j, i) <= 130) {
matThresh.at<uchar>(j, i) = 255;
}
else {
matThresh.at<uchar>(j, i) = 0;
}
}
}
// blur
cv::GaussianBlur(matThresh, // input image
matBlurred, // output image
cv::Size(5, 5), // smoothing window width and height in pixels
0); // sigma value, determines how much the image will be blurred, zero makes function choose the sigma value
// filter image from grayscale to black and white
/* cv::adaptiveThreshold(matBlurred, // input image
matThresh, // output image
255, // make pixels that pass the threshold full white
cv::ADAPTIVE_THRESH_GAUSSIAN_C, // use gaussian rather than mean, seems to give better results
cv::THRESH_BINARY_INV, // invert so foreground will be white, background will be black
11, // size of a pixel neighborhood used to calculate threshold value
2); */ // constant subtracted from the mean or weighted mean
// cv::imshow("thresh" + std::to_string(i), matThresh);
matThreshCopy = matThresh.clone();
std::vector<std::vector<cv::Point> > ptContours; // declare a vector for the contours
std::vector<cv::Vec4i> v4iHierarchy;// make a copy of the thresh image, this in necessary b/c findContours modifies the image
cv::Canny(matBlurred, matCanny, 20, 40, 3);
/*std::vector<std::vector<cv::Point> > ptContours; // declare a vector for the contours
std::vector<cv::Vec4i> v4iHierarchy; // declare a vector for the hierarchy (we won't use this in this program but this may be helpful for reference)
cv::findContours(matThreshCopy, // input image, make sure to use a copy since the function will modify this image in the course of finding contours
ptContours, // output contours
v4iHierarchy, // output hierarchy
cv::RETR_EXTERNAL, // retrieve the outermost contours only
cv::CHAIN_APPROX_SIMPLE); // compress horizontal, vertical, and diagonal segments and leave only their end points
/*std::vector<std::vector<cv::Point> > contours_poly(ptContours.size());
std::vector<cv::Rect> boundRect(ptContours.size());
for (int i = 0; i < ptContours.size(); i++)
{
approxPolyDP(cv::Mat(ptContours[i]), contours_poly[i], 3, true);
boundRect[i] = cv::boundingRect(cv::Mat(contours_poly[i]));
}*/
/*for (int i = 0; i < ptContours.size(); i++) { // for each contour
ContourWithData contourWithData; // instantiate a contour with data object
contourWithData.ptContour = ptContours[i]; // assign contour to contour with data
contourWithData.boundingRect = cv::boundingRect(contourWithData.ptContour); // get the bounding rect
contourWithData.fltArea = cv::contourArea(contourWithData.ptContour); // calculate the contour area
allContoursWithData.push_back(contourWithData); // add contour with data object to list of all contours with data
}
for (int i = 0; i < allContoursWithData.size(); i++) { // for all contours
if (allContoursWithData[i].checkIfContourIsValid()) { // check if valid
validContoursWithData.push_back(allContoursWithData[i]); // if so, append to valid contour list
}
}
//sort contours from left to right
std::sort(validContoursWithData.begin(), validContoursWithData.end(), ContourWithData::sortByBoundingRectXPosition);
// std::string strFinalString; // declare final string, this will have the final number sequence by the end of the program
*/
/*for (int i = 0; i < validContoursWithData.size(); i++) { // for each contour
// draw a green rect around the current char
cv::rectangle(matTestingNumbers, // draw rectangle on original image
validContoursWithData[i].boundingRect, // rect to draw
cv::Scalar(0, 255, 0), // green
2); // thickness
cv::Mat matROI = matThresh(validContoursWithData[i].boundingRect); // get ROI image of bounding rect
cv::Mat matROIResized;
cv::resize(matROI, matROIResized, cv::Size(RESIZED_IMAGE_WIDTH, RESIZED_IMAGE_HEIGHT)); // resize image, this will be more consistent for recognition and storage
*/
cv::Mat matROIFloat;
cv::resize(matThresh, matThresh, cv::Size(RESIZED_IMAGE_WIDTH, RESIZED_IMAGE_HEIGHT));
matThresh.convertTo(matROIFloat, CV_32FC1, 1.0 / 255.0); // convert Mat to float, necessary for call to find_nearest
cv::Mat matROIFlattenedFloat = matROIFloat.reshape(1, 1);
cv::Point maxLoc = { 0,0 };
cv::Point minLoc;
cv::Mat output = cv::Mat(cv::Size(36, 1), CV_32F);
vector<float>output2;
// cv::Mat output2 = cv::Mat(cv::Size(36, 1), CV_32F);
nnPtr->predict(matROIFlattenedFloat, output2);
// float max = output.at<float>(0, 0);
int fo = 0;
float m = output2[0];
imshow("predicted input", matROIFlattenedFloat);
// float b = output.at<float>(0, 0);
// cout <<"\n output0,0:"<<b<<endl;
// minMaxLoc(output, 0, 0, &minLoc, &maxLoc, Mat());
// cout << "\noutput:\n" << maxLoc.x << endl;
for (int j = 1; j < 36; j++) {
float value =output2[j];
if (value > m) {
m = value;
fo = j;
}
}
float * p = 0;
p = &m;
cout << "j value in output " << fo << " Max value " << p << endl;
//imshow("output image" + to_string(i), output);
// cout << "\noutput:\n" << minLoc.x << endl;
//float fltCurrentChar = (float)maxLoc.x;
output.release();
m = 0;
fo = 0;
}
// strFinalString = strFinalString + char(int(fltCurrentChar)); // append current char to full string
// cv::imshow("Predict output", output);
/*cv::Point maxLoc = {0,0};
Mat output=Mat (cv::Size(matSamples.cols,matSamples.rows),CV_32F);
nnPtr->predict(matTrainingImagesAsFlattenedFloats, output);
minMaxLoc(output, 0, 0, 0, &maxLoc, 0);
cout << "\noutput:\n" << maxLoc.x << endl;*/
// getchar();
/*for (int i = 0; i < 10;i++) {
for (int j = 0; j < 36; j++) {
if (matCurrentChar.at<float>(i, j) >= 0.6) {
cout << " "<<j<<" ";
}
}
}*/
waitKey(0);
return(0);
}
void gen() {
std::string dir, filepath;
int num, imgArea, minArea;
int pos = 0;
bool f = true;
struct stat filestat;
cv::Mat imgTrainingNumbers;
cv::Mat imgGrayscale;
cv::Mat imgBlurred;
cv::Mat imgThresh;
cv::Mat imgThreshCopy;
cv::Mat matROIResized=cv::Mat (cv::Size(RESIZED_IMAGE_WIDTH,RESIZED_IMAGE_HEIGHT),CV_8UC1);
cv::Mat matROI;
std::vector <cv::String> filename;
std::vector<std::vector<cv::Point> > ptContours;
std::vector<cv::Vec4i> v4iHierarchy;
int count = 0, contoursCount = 0;
matSamples = cv::Mat(cv::Size(36, 496), CV_32FC1);
matTrainingImagesAsFlattenedFloats = cv::Mat(cv::Size(600, 496), CV_32FC1);
for (int j = 0; j <= 35; j++) {
int tmp = j;
cv::String folder = "./Training Data/" + std::to_string(tmp);
cv::glob(folder, filename);
for (int k = 0; k < filename.size(); k++) {
count++;
// If the file is a directory (or is in some way invalid) we'll skip it
// if (stat(filepath.c_str(), &filestat)) continue;
//if (S_ISDIR(filestat.st_mode)) continue;
imgTrainingNumbers = cv::imread(filename[k]);
imgArea = imgTrainingNumbers.cols*imgTrainingNumbers.rows;
// read in training numbers image
minArea = imgArea * 50 / 100;
if (imgTrainingNumbers.empty()) {
std::cout << "error: image not read from file\n\n";
//return(0);
}
cv::cvtColor(imgTrainingNumbers, imgGrayscale, CV_BGR2GRAY);
//cv::equalizeHist(imgGrayscale, imgGrayscale);
imgThresh = cv::Mat(cv::Size(imgGrayscale.cols, imgGrayscale.rows), CV_8UC1);
/*cv::adaptiveThreshold(imgGrayscale,
imgThresh,
255,
cv::ADAPTIVE_THRESH_GAUSSIAN_C,
cv::THRESH_BINARY_INV,
3,
0);
*/
for (int i = 0; i < imgGrayscale.cols; i++) {
for (int j = 0; j < imgGrayscale.rows; j++) {
if (imgGrayscale.at<uchar>(j, i) <= 130) {
imgThresh.at<uchar>(j, i) = 255;
}
else {
imgThresh.at<uchar>(j, i) = 0;
}
}
}
// cv::imshow("imgThresh"+std::to_string(count), imgThresh);
imgThreshCopy = imgThresh.clone();
cv::GaussianBlur(imgThreshCopy,
imgBlurred,
cv::Size(5, 5),
0);
cv::Mat imgCanny;
// cv::Canny(imgBlurred,imgCanny,20,40,3);
cv::findContours(imgBlurred,
ptContours,
v4iHierarchy,
cv::RETR_EXTERNAL,
cv::CHAIN_APPROX_SIMPLE);
for (int i = 0; i < ptContours.size(); i++) {
if (cv::contourArea(ptContours[i]) > MIN_CONTOUR_AREA) {
contoursCount++;
cv::Rect boundingRect = cv::boundingRect(ptContours[i]);
cv::rectangle(imgTrainingNumbers, boundingRect, cv::Scalar(0, 0, 255), 2); // draw red rectangle around each contour as we ask user for input
matROI = imgThreshCopy(boundingRect); // get ROI image of bounding rect
std::string path = "./" + std::to_string(contoursCount) + ".JPG";
cv::imwrite(path, matROI);
// cv::imshow("matROI" + std::to_string(count), matROI);
cv::resize(matROI, matROIResized, cv::Size(RESIZED_IMAGE_WIDTH, RESIZED_IMAGE_HEIGHT)); // resize image, this will be more consistent for recognition and storage
std::cout << filename[k] << " " << contoursCount << "\n";
//cv::imshow("matROI", matROI);
//cv::imshow("matROIResized"+std::to_string(count), matROIResized);
// cv::imshow("imgTrainingNumbers" + std::to_string(contoursCount), imgTrainingNumbers);
int intChar;
if (j<10)
intChar = j + 48;
else {
intChar = j + 55;
}
/*if (intChar == 27) { // if esc key was pressed
return(0); // exit program
}*/
// if (std::find(intValidChars.begin(), intValidChars.end(), intChar) != intValidChars.end()) { // else if the char is in the list of chars we are looking for . . .
// append classification char to integer list of chars
cv::Mat matImageFloat;
matROIResized.convertTo(matImageFloat,CV_32FC1);// now add the training image (some conversion is necessary first) . . .
//matROIResized.convertTo(matImageFloat, CV_32FC1); // convert Mat to float
cv::Mat matImageFlattenedFloat = matImageFloat.reshape(1, 1);
//matTrainingImagesAsFlattenedFloats.push_back(matImageFlattenedFloat);// flatten
try {
//matTrainingImagesAsFlattenedFloats.push_back(matImageFlattenedFloat);
std::cout << matTrainingImagesAsFlattenedFloats.rows << " " << matTrainingImagesAsFlattenedFloats.cols;
//unsigned char* re;
int ii = 0; // Current column in training_mat
for (int i = 0; i<matImageFloat.rows; i++) {
for (int j = 0; j < matImageFloat.cols; j++) {
matTrainingImagesAsFlattenedFloats.at<float>(contoursCount-1, ii++) = matImageFloat.at<float>(i,j);
}
}
}
catch (std::exception &exc) {
f = false;
exc.what();
}
if (f) {
matClassificationInts.push_back((float)intChar);
matSamples.at<float>(contoursCount-1, j) = 1.0;
}
f = true;
// add to Mat as though it was a vector, this is necessary due to the
// data types that KNearest.train accepts
} // end if
//} // end if
} // end for
}//end i
}//end j
}
Output of predict function
Unfortunately, I don't have the necessary time to really review the code, but I can say off the top that to train a model that performs well for prediction with 36 classes, you will need several things:
A large number of good quality images. Ideally, you'd want thousands of images for each class. Of course, you can see somewhat decent results with less than that, but if you only have a few images per class, it's never going to be able to generalize adequately.
You need a model that is large and sophisticated enough to provide the necessary expressiveness to solve the problem. For a problem like this, a plain old multi-layer perceptron with one hidden layer with 100 units may not be enough. This is actually a problem that would benefit from using a Convolutional Neural Net (CNN) with a couple layers just to extract useful features first. But assuming you don't want to go down that path, you may at least want to tweak the size of your hidden layer.
To even get to a point where the training process converges, you will probably need to experiment and crucially, you need an effective way to test the accuracy of the ANN after each experiment. Ideally, you want to observe the loss as the training is proceeding, but I'm not sure whether that's possible using OpenCV's ML functionality. At a minimum, you should fully expect to have to play around with the various so-called "hyper-parameters" and run many experiments before you have a reasonable model.
Anyway, the most important thing is to make sure you have a solid mechanism for validating the accuracy of the model after training. If you aren't already doing so, set aside some images as a separate test set, and after each experiment, use the trained ANN to predict each test image to see the accuracy.
One final general note: what you're trying to do is complex. You will save yourself a huge number of headaches if you take the time early and often to refactor your code. No matter how many experiments you run, if there's some defect causing (for example) your training data to be fundamentally different in some way than your test data, you will never see good results.
Good luck!
EDIT: I should also point out that seeing the same result for every input image is a classic sign that training failed. Unfortunately, there are many reasons why that might happen and it will be very difficult for anyone to isolate that for you without some cleaner code and access to your image data.
I have solved the issue with the output of predict. The problem was that the input Mat used for training (i.e. matTrainingImagesAsFlattenedFloats) contained the value 255.0 for a white pixel, because I had not used convertTo() properly. You need to call convertTo(outputImage, CV_32FC1, 1.0 / 255.0); this maps every pixel value of 255.0 to 1.0, and after that change I get the correct output.
Thank you for all the help.
This is too broad to be in one question. Sorry for the bad news. I tried this over and over and couldn't find a solution. I recommend that you implement a simple AND, OR or XOR first just to make sure that the learning part is working and that you are getting better results the more passes you do. Also I suggest to try the Tangent Hyperbolic as a Transfer Function instead of Sigmoid. And Good luck!
Here is some of my own posts that might help you:
Exact results as yours: HERE
Some codes: HERE
I don't want to say this, but several professors I met said that backpropagation just doesn't work, and they (and I) had to implement our own methods of teaching the network.
I am attempting to find pedestrians/people in images with the help of a cascade classifier which uses HOG as features.
The problem I'm trying to solve is in the initial stage, feature generation.
Where the HOG values in certain areas of the images are too low and hence the classifier fails.
The images below were captured using a Basler aca640-100gc Camera.
The visualization of the HOG was borrowed from the code in the webpage. Code also attached in the end of the question.
This first image here and its HOG is what I'm trying to achieve.
A realistic outdoor scene which can be used to generate features and hopefully find people. This is not what I have captured using my camera.
Captured Outdoor Images results
The images below are what I have captured with the camera. I have tried all the basic variations, playing with the brightness and focus, but this still yields a poor result in an outdoor scene, where I am inside the car and the camera is attached close to the windscreen.
On the contrary, when the same camera was used to record an indoor scene, it works fine. Why it works in an indoor situation but not in an outdoor scene is something I can't understand.
Captured Indoor Images results
As seen in the images below same configuration works for an indoor scene.
Desired results
Ideally I would like results of the out door recordings to look like so.
Could anyone give me insight why this happens?
or How I can over come this issue to generate reliable HOGs for detection?
Code to visualize HOG
// Load the test image in colour; imread delivers the channels in B, G, R order.
Mat img_raw = imread("C:\\testimg.png", 1);
// Scale to 64x128 (width x height), the window size of a default-constructed HOGDescriptor.
resize(img_raw, img_raw, Size(64,128) );
Mat img;
// BGR, not RGB: with CV_RGB2GRAY the red and blue weights would be swapped.
cvtColor(img_raw, img, CV_BGR2GRAY);
// Default-constructed descriptor. According to the OpenCV reference its defaults are:
// Size(64,128), //winSize
// Size(16,16), //blocksize
// Size(8,8), //blockStride,
// Size(8,8), //cellSize,
// 9 //nbins
HOGDescriptor d;
// void HOGDescriptor::compute(const Mat& img, vector<float>& descriptors,
// Size winStride, Size padding,
// const vector<Point>& locations) const
vector<float> descriptorsValues;
vector<Point> locations;
// Window stride 8x8, padding 8x8, empty location list.
d.compute( img, descriptorsValues, Size(8,8), Size(8,8), locations);
cout << "HOG descriptor size is " << d.getDescriptorSize() << endl;
cout << "img dimensions: " << img.cols << " width x " << img.rows << "height" << endl;
cout << "Found " << descriptorsValues.size() << " descriptor values" << endl;
cout << "Nr of locations specified : " << locations.size() << endl;
/// Draws a visualisation of a HOG descriptor on an enlarged copy of an image.
///
/// The descriptor is read as a sequence of blocks (x-major, then y), each block
/// holding 4 cells of 9 orientation bins. Bin values are accumulated per cell,
/// averaged over the number of blocks that touched the cell, and drawn as one
/// line per non-zero bin through the cell centre.
///
/// @param origImg          image the descriptor was computed from
/// @param descriptorValues descriptor values; must contain at least
///                         (cells_x-1) * (cells_y-1) * 4 * 9 entries
/// @param winSize          detection window size in pixels
/// @param cellSize         cell size in pixels
/// @param scaleFactor      enlargement factor of the returned image
/// @param viz_factor       multiplier for the length of the drawn lines
/// @return origImg resized by scaleFactor with the cell grid and gradient lines
///         drawn on it; only the resized image if no cell fits in the window
Mat get_hogdescriptor_visual_image(Mat& origImg,
                                   vector<float>& descriptorValues,
                                   Size winSize,
                                   Size cellSize,
                                   int scaleFactor,
                                   double viz_factor)
{
    Mat visual_image;
    resize(origImg, visual_image, Size(origImg.cols*scaleFactor, origImg.rows*scaleFactor));

    const int gradientBinSize = 9;
    // dividing 180 degrees into 9 bins, how large (in rad) is one bin?
    const float radRangeForOneBin = (float)(CV_PI / gradientBinSize);

    // Nothing to draw when the cell size is invalid or no cell fits in the window.
    if (cellSize.width <= 0 || cellSize.height <= 0)
        return visual_image;
    const int cells_in_x_dir = winSize.width / cellSize.width;
    const int cells_in_y_dir = winSize.height / cellSize.height;
    if (cells_in_x_dir <= 0 || cells_in_y_dir <= 0)
        return visual_image;

    // prepare data structure: 9 orientation / gradient strengths for each cell.
    // Vectors release their storage automatically, also when an OpenCV call throws.
    vector<vector<vector<float> > > gradientStrengths(
        cells_in_y_dir,
        vector<vector<float> >(cells_in_x_dir, vector<float>(gradientBinSize, 0.0f)));
    vector<vector<int> > cellUpdateCounter(cells_in_y_dir, vector<int>(cells_in_x_dir, 0));

    // nr of blocks = nr of cells - 1
    // since there is a new block on each cell (overlapping blocks!) but the last one
    const int blocks_in_x_dir = cells_in_x_dir - 1;
    const int blocks_in_y_dir = cells_in_y_dir - 1;

    // The loops below read 4 cells x 9 bins per block; refuse descriptors that
    // are too short instead of reading past the end of the vector.
    const size_t expectedValues =
        (size_t)blocks_in_x_dir * (size_t)blocks_in_y_dir * 4 * (size_t)gradientBinSize;
    CV_Assert(descriptorValues.size() >= expectedValues);

    // compute gradient strengths per cell
    int descriptorDataIdx = 0;
    for (int blockx=0; blockx<blocks_in_x_dir; blockx++)
    {
        for (int blocky=0; blocky<blocks_in_y_dir; blocky++)
        {
            // 4 cells per block ...
            for (int cellNr=0; cellNr<4; cellNr++)
            {
                // compute corresponding cell nr:
                // 0 = (x, y), 1 = (x, y+1), 2 = (x+1, y), 3 = (x+1, y+1)
                int cellx = blockx;
                int celly = blocky;
                if (cellNr==1) celly++;
                if (cellNr==2) cellx++;
                if (cellNr==3)
                {
                    cellx++;
                    celly++;
                }

                for (int bin=0; bin<gradientBinSize; bin++)
                {
                    gradientStrengths[celly][cellx][bin] += descriptorValues[ descriptorDataIdx ];
                    descriptorDataIdx++;
                } // for (all bins)

                // note: overlapping blocks lead to multiple updates of this sum!
                // we therefore keep track how often a cell was updated,
                // to compute average gradient strengths
                cellUpdateCounter[celly][cellx]++;
            } // for (all cells)
        } // for (all block y pos)
    } // for (all block x pos)

    // compute average gradient strengths
    for (int celly=0; celly<cells_in_y_dir; celly++)
    {
        for (int cellx=0; cellx<cells_in_x_dir; cellx++)
        {
            const int updates = cellUpdateCounter[celly][cellx];
            if (updates == 0)
                continue; // untouched cell: keep zeros rather than dividing by zero
            const float NrUpdatesForThisCell = (float)updates;

            // compute average gradient strengths for each gradient bin direction
            for (int bin=0; bin<gradientBinSize; bin++)
            {
                gradientStrengths[celly][cellx][bin] /= NrUpdatesForThisCell;
            }
        }
    }

    cout << "descriptorDataIdx = " << descriptorDataIdx << endl;

    // draw cells
    for (int celly=0; celly<cells_in_y_dir; celly++)
    {
        for (int cellx=0; cellx<cells_in_x_dir; cellx++)
        {
            int drawX = cellx * cellSize.width;
            int drawY = celly * cellSize.height;

            int mx = drawX + cellSize.width/2;
            int my = drawY + cellSize.height/2;

            rectangle(visual_image,
                      Point(drawX*scaleFactor,drawY*scaleFactor),
                      Point((drawX+cellSize.width)*scaleFactor,
                            (drawY+cellSize.height)*scaleFactor),
                      CV_RGB(100,100,100),
                      1);

            // draw in each cell all 9 gradient strengths
            for (int bin=0; bin<gradientBinSize; bin++)
            {
                float currentGradStrength = gradientStrengths[celly][cellx][bin];

                // no line to draw?
                if (currentGradStrength==0)
                    continue;

                // direction of the bin centre
                float currRad = bin * radRangeForOneBin + radRangeForOneBin/2;

                float dirVecX = cos( currRad );
                float dirVecY = sin( currRad );
                float maxVecLen = cellSize.width/2;
                float scale = viz_factor; // just a visualization scale,
                                          // to see the lines better

                // compute line coordinates
                float x1 = mx - dirVecX * currentGradStrength * maxVecLen * scale;
                float y1 = my - dirVecY * currentGradStrength * maxVecLen * scale;
                float x2 = mx + dirVecX * currentGradStrength * maxVecLen * scale;
                float y2 = my + dirVecY * currentGradStrength * maxVecLen * scale;

                // draw gradient visualization
                line(visual_image,
                     Point(x1*scaleFactor,y1*scaleFactor),
                     Point(x2*scaleFactor,y2*scaleFactor),
                     CV_RGB(0,0,255),
                     1);
            } // for (all bins)
        } // for (cellx)
    } // for (celly)

    return visual_image;
}
I am making an application that uses OCR and I am using OpenCV to threshold the image to improve the OCR results, I have gotten pretty good results but I want to know if anyone has any suggestions for improvement.
Here is what I've done so far:
// Convert to grayscale.
cv::cvtColor(cvMat, cvMat, CV_RGB2GRAY);
// Apply adaptive threshold.
cv::adaptiveThreshold(cvMat, cvMat, 255, CV_ADAPTIVE_THRESH_GAUSSIAN_C, CV_THRESH_BINARY, 3, 5);
// Sharpen with an unsharp mask: 1.5 * image - 0.5 * blurred(image).
// The blur must go into a separate matrix. Blurring cvMat in place and then
// blending cvMat with itself (1.5 - 0.5 = 1.0) just returns the blurred image.
cv::Mat blurred;
cv::GaussianBlur(cvMat, blurred, cv::Size(0, 0), 3);
cv::addWeighted(cvMat, 1.5, blurred, -0.5, 0, cvMat);
Let me know if you have any suggestions to improve results, thanks.
Sample Images:
After:
One of the best algorithms for the thresholding problem in the OCR field is the Sauvola method. You can use the code below.
#ifndef _THRESHOLDER
#define _THRESHOLDER
#include <cv.h>
#include "type.h"
using namespace cv;
/// Binarisation algorithms that can be requested from BhThresholder::doThreshold.
enum class BhThresholdMethod
{
    OTSU,
    NIBLACK,
    SAUVOLA,
    WOLFJOLION
};

/// Entry point for binarising an image with one of the BhThresholdMethod algorithms.
class BhThresholder
{
public:
    /// Binarises src into dst using the algorithm selected by method.
    void doThreshold(InputArray src, OutputArray dst, const BhThresholdMethod& method);
};
#endif //_THRESHOLDER
thresholder.cpp
#include "stdafx.h"
// Element accessors meant to be appended to a Mat, e.g. `im.uget(x, y)`.
// They take (x, y) = (column, row) and forward to Mat::at<T>(row, column):
// uget/uset read/write unsigned char elements, fget/fset read/write float elements.
// NOTE(review): uset and fset already end in ';', so a call written as
// `m.fset(x, y, v);` expands to an assignment followed by an empty statement.
#define uget(x,y) at<unsigned char>(y,x)
#define uset(x,y,v) at<unsigned char>(y,x)=v;
#define fget(x,y) at<float>(y,x)
#define fset(x,y,v) at<float>(y,x)=v;
// *************************************************************
// glide a window across the image and
// create two maps: mean and standard deviation.
// *************************************************************
//#define BINARIZEWOLF_VERSION "2.3 (February 26th, 2013)"
// Slides a win_x by win_y window over the image and records, for every position
// where the window fits completely, the mean (map_m) and the standard deviation
// (map_s) of the pixels under it, stored at the window's centre.
//
// im      source image, read as unsigned char
// map_m   receives the local means (written as float)
// map_s   receives the local standard deviations (written as float)
// win_x   window width in pixels
// win_y   window height in pixels
// returns the largest standard deviation encountered
double calcLocalStats (Mat &im, Mat &map_m, Mat &map_s, int win_x, int win_y) {
    const int half_w = win_x / 2;
    const int half_h = win_y / 2;
    const int first_row = half_h;
    const int last_row = im.rows - half_h - 1;
    const double area = win_x * win_y;

    double largest_dev = 0;
    double acc = 0;     // running sum of the pixels in the window
    double acc_sq = 0;  // running sum of their squares

    // Turns the running sums into mean/deviation and stores them at (col, row).
    auto store = [&](int col, int row) {
        const double mean = acc / area;
        const double dev = sqrt ((acc_sq - (acc*acc)/area)/area);
        if (dev > largest_dev)
            largest_dev = dev;
        map_m.fset(col, row, mean);
        map_s.fset(col, row, dev);
    };

    for (int row = first_row; row <= last_row; ++row) {
        const int top = row - half_h;

        // Sum the full window at the left edge of this line.
        acc = 0;
        acc_sq = 0;
        for (int dy = 0; dy < win_y; ++dy) {
            for (int dx = 0; dx < win_x; ++dx) {
                const double px = im.uget(dx, top + dy);
                acc += px;
                acc_sq += px*px;
            }
        }
        store(half_w, row);

        // Move right one pixel at a time: drop the column that left the window
        // and add the column that entered it.
        const int last_shift = im.cols - win_x;
        for (int shift = 1; shift <= last_shift; ++shift) {
            const int leaving = shift - 1;
            const int entering = shift + win_x - 1;
            for (int dy = 0; dy < win_y; ++dy) {
                const double gone = im.uget(leaving, top + dy);
                acc -= gone;
                acc_sq -= gone*gone;
                const double fresh = im.uget(entering, top + dy);
                acc += fresh;
                acc_sq += fresh*fresh;
            }
            store(shift + half_w, row);
        }
    }
    return largest_dev;
}
// Binarises _src into _dst with a locally adaptive threshold.
// A per-pixel threshold surface is built from the local mean and standard
// deviation (see calcLocalStats), extended to the image borders, and each
// source pixel is set to 255 if it is >= its threshold and to 0 otherwise.
//
// _src     source image, read as unsigned char
// _dst     destination image, written as unsigned char. It is not allocated
//          here -- presumably the caller must pass a matrix of src's size.
// version  NIBLACK, SAUVOLA or WOLFJOLION; any other value prints an error
//          and terminates the program with exit(1)
// winx     window width, winy window height
// k        weighting factor used in all three threshold formulas
// dR       divisor of the standard deviation in the SAUVOLA formula
void NiblackSauvolaWolfJolion (InputArray _src, OutputArray _dst,const BhThresholdMethod &version,int winx, int winy, double k, double dR) {
Mat src = _src.getMat();
Mat dst = _dst.getMat();
double m, s, max_s;
double th=0;
double min_I, max_I;
int wxh = winx/2;
int wyh = winy/2;
// First/last column and row at which a complete window fits around the pixel.
int x_firstth= wxh;
int x_lastth = src.cols-wxh-1;
int y_lastth = src.rows-wyh-1;
int y_firstth= wyh;
// mx and my are not used in this function.
int mx, my;
// Create local statistics and store them in float matrices
Mat map_m = Mat::zeros (src.size(), CV_32FC1);
Mat map_s = Mat::zeros (src.size(), CV_32FC1);
max_s = calcLocalStats (src, map_m, map_s, winx, winy);
minMaxLoc(src, &min_I, &max_I);
Mat thsurf (src.size(), CV_32FC1);
// Create the threshold surface, including border processing
// ----------------------------------------------------
for (int j = y_firstth ; j<=y_lastth; j++) {
// NORMAL, NON-BORDER AREA IN THE MIDDLE OF THE WINDOW:
for (int i=0 ; i <= src.cols-winx; i++) {
m = map_m.fget(i+wxh, j);
s = map_s.fget(i+wxh, j);
// Calculate the threshold
switch (version) {
case BhThresholdMethod::NIBLACK:
th = m + k*s;
break;
case BhThresholdMethod::SAUVOLA:
th = m * (1 + k*(s/dR-1));
break;
case BhThresholdMethod::WOLFJOLION:
// NOTE(review): divides by max_s, which is 0 when every window has zero
// deviation (e.g. a uniform image) -- confirm whether that case can occur.
th = m + k * (s/max_s-1) * (m-min_I);
break;
default:
cerr << "Unknown threshold type in ImageThresholder::surfaceNiblackImproved()\n";
exit (1);
}
thsurf.fset(i+wxh,j,th);
// The first threshold of a line is replicated into the left border.
// The inner loops below declare their own `i`, shadowing the outer one.
if (i==0) {
// LEFT BORDER
for (int i=0; i<=x_firstth; ++i)
thsurf.fset(i,j,th);
// LEFT-UPPER CORNER
if (j==y_firstth)
for (int u=0; u<y_firstth; ++u)
for (int i=0; i<=x_firstth; ++i)
thsurf.fset(i,u,th);
// LEFT-LOWER CORNER
if (j==y_lastth)
for (int u=y_lastth+1; u<src.rows; ++u)
for (int i=0; i<=x_firstth; ++i)
thsurf.fset(i,u,th);
}
// UPPER BORDER
if (j==y_firstth)
for (int u=0; u<y_firstth; ++u)
thsurf.fset(i+wxh,u,th);
// LOWER BORDER
if (j==y_lastth)
for (int u=y_lastth+1; u<src.rows; ++u)
thsurf.fset(i+wxh,u,th);
}
// From here on `th` still holds the last threshold computed for this line;
// it is replicated into the right border.
// RIGHT BORDER
for (int i=x_lastth; i<src.cols; ++i)
thsurf.fset(i,j,th);
// RIGHT-UPPER CORNER
if (j==y_firstth)
for (int u=0; u<y_firstth; ++u)
for (int i=x_lastth; i<src.cols; ++i)
thsurf.fset(i,u,th);
// RIGHT-LOWER CORNER
if (j==y_lastth)
for (int u=y_lastth+1; u<src.rows; ++u)
for (int i=x_lastth; i<src.cols; ++i)
thsurf.fset(i,u,th);
}
cerr << "surface created" << endl;
// Apply the surface: 255 where the pixel reaches its threshold, 0 elsewhere.
for (int y=0; y<src.rows; ++y)
for (int x=0; x<src.cols; ++x)
{
if (src.uget(x,y) >= thsurf.fget(x,y))
{
dst.uset(x,y,255);
}
else
{
dst.uset(x,y,0);
}
}
}
/**
 * Binarize `_src` into `_dst` with the requested method and invert the result.
 *
 * `_dst` is (re)created as CV_8UC1 with the size of the source. A 5x5 Gaussian
 * blur of the source is written into it first; OTSU thresholds that blurred
 * image in place, while the local methods (NIBLACK, SAUVOLA, WOLFJOLION) are
 * handed the unblurred source together with the destination. The destination
 * is finally inverted with bitwise_not().
 *
 * @param _src   Source image.
 * @param _dst   Destination image, created here.
 * @param method Thresholding method. Values not listed in the switch leave the
 *               blurred image untouched before the final inversion.
 */
void BhThresholder::doThreshold(InputArray _src ,OutputArray _dst,const BhThresholdMethod &method)
{
    Mat src = _src.getMat();
    float optK=0.5;

    // Derive the window size from the image size.
    int winy = (int) (2.0 * src.rows - 1)/3;
    int winx = (int) src.cols-1 < winy ? src.cols-1 : winy;
    // if the window is too big, then we assume that the image
    // is not a single text box, but a document page: set
    // the window size to a fixed constant.
    if (winx > 100)
        winx = winy = 40;

    // Threshold
    _dst.create(src.size(), CV_8UC1);
    Mat dst = _dst.getMat();
    //medianBlur(src,dst,5);
    GaussianBlur(src,dst,Size(5,5),0);
//#define _BH_SHOW_IMAGE
#ifdef _BH_DEBUG
#define _BH_SHOW_IMAGE
#endif
    //medianBlur(src,dst,7);
    switch (method)
    {
    case BhThresholdMethod::OTSU :
        threshold(dst,dst,128,255,CV_THRESH_OTSU);
        break;
    // NIBLACK is implemented by NiblackSauvolaWolfJolion as well, so route it
    // there instead of silently skipping the binarization.
    case BhThresholdMethod::NIBLACK :
    case BhThresholdMethod::SAUVOLA :
    case BhThresholdMethod::WOLFJOLION :
        NiblackSauvolaWolfJolion (src, dst, method, winx, winy, optK, 128);
        break;
    default:
        break;
    }
    bitwise_not(dst,dst);
#ifdef _BH_SHOW_IMAGE
#undef _BH_SHOW_IMAGE
#endif
}
Here is a comparison table for thresholding methods: http://clweb.csa.iisc.ernet.in/rahulsharma/binarize/set1.php?id=set1%2Fimage00b
A few thoughts:
Since you're starting with a rectangular object that may be viewed at a non-normal angle, use an affine transform to warp the image so that it appears rectangular with right angle corners.
Before the affine transform, you should probably remove barrel distortion (the curviness of the card edges).
Consider using an adaptive threshold rather than a simple global binarization threshold.
If you can find a proper OCR algorithm that doesn't require binary images, use that. Although binarization will work well for black text on a white background, in general binarization presents a lot of problems if you want to achieve high accuracy (i.e., character recognition approaching 98%+ for arbitrary strings of characters)
Try to sample with better resolution.
I am new to OpenCV and I am trying to write some simple code to get the mean of each block of a given size in an image. I wrote the following code; it builds fine, but when I debug it I get an unhandled exception at a memory location. The exception is raised at the following line:
mean_img.at<double>(i/block_size, j/block_size) = mean_img.at<double>(i/block_size,j/block_size) + new_img.at<double>(i + x, j + y) / (mean);
So, I would be grateful if anyone could give me some hints. Thanks in advance; here is the whole code:
#include "opencv2/highgui/highgui.hpp" // Include Libs for OpenCV and Image Processing
#include <opencv2/opencv.hpp> // check that
#include "opencv2/core/core.hpp" // check that
#include <iostream> // Include Libs for C++
#include "opencv2/imgproc/imgproc.hpp" // Include Libs for OpenCV and Image Processing
#include <math.h>
using namespace cv; // namespace parameters not important in OpenCV2.4.6
using namespace std; // namespace parameters not important in OpenCV2.4.6
int main( int argc, const char** argv )
{
/*This part is to compute the parameters(block size, resize parameter) of the new_img*/
int resize_parameter; // resize parameter must be multiplication of 2
resize_parameter = 500;
int block_size; // block parameter must be divisable by of block size
block_size = 50;
if ((resize_parameter % 2) != 0) resize_parameter = resize_parameter - (resize_parameter % 2);
while ((resize_parameter % block_size) != 0) block_size = block_size - 1;
int mean_size = resize_parameter/block_size; // this is the size of the mean matrix
int mean = block_size * block_size; // this no is ti get the mean of every element in the matrix
//int mean_img [mean_size][mean_size] = {}; // the mean image matrix initialized by zero
/*This part is to allocate the array with dynamic size*/
//int** mean_img = new int*[mean_size];
//for(int x = 0; x < mean_size; x++)
//mean_img[x] = new int[mean_size];
/*Then we can use the array*/
/*This part is to fill all the elements of the mean matrix with zeros*/
//memset(mean_img, 0, sizeof(mean_img[0][0]) * mean_size * mean_size);
/*This part is the definition of the matrices that are used for the images*/
Mat mean_img = Mat(mean_size,mean_size,CV_64FC4, cv::Scalar(0)); // define a new matrix with meansize*meansize elements to compute the mean
Mat mean_img_full = Mat(resize_parameter,resize_parameter,CV_64FC4, cv::Scalar(0)); // define a new matrix with resizeparameter*resizeparameter elements to compute the mean
Mat new_img = Mat(resize_parameter,resize_parameter,CV_64FC4); // define a new matrix with resize_parameter*resize_parameter elements
Mat original_img = imread("Desert.JPG", CV_LOAD_IMAGE_GRAYSCALE); //define a new matrix and read the image data in the file "Desert.JPG" and store it in 'original_img'
// notes: the location of the image must be in the same directory of the C++ file
if (original_img.empty()) //check whether the image is loaded or not
{
cout << "Error : Image cannot be loaded..!!" << endl;
//system("pause"); //wait for a key press
return -1;
}
// explicitly specify dsize=dst.size(); fx and fy will be computed from that.
// resize( src matrix, dst matrix, dst.size to get the size of the dst matrix, 0, 0 "to deal with the dst matrix size, may be 0.5 or any fraction from the src size, "AREA,CUBIC,LINEAR")
resize(original_img, new_img, new_img.size(), 0, 0, CV_INTER_AREA);
/*This part is to compute the mean of each block*/
for ( int i = 0; i < resize_parameter; i = i + block_size) // i represents the index of the raw
{
for ( int j = 0; j < resize_parameter; j = j + block_size) // for the blocks in the same raw with different columns
{
for ( int x = 0; x < block_size; x++) // x represents the index of the raw
{
for ( int y = 0; y < block_size; y++) // y represents the index of the column
{
//cout << i ; //cout << "\n"; //cout << j ; //cout << "\n"; //cout << x ; //cout << "\n"; //cout << y ; //cout << "\n";
mean_img.at<double>(i/block_size, j/block_size) = mean_img.at<double>(i/block_size,j/block_size) + new_img.at<double>(i + x, j + y) / (mean);
}
}
}
}
/*This is the end of the part to compute the mean of each block*/
/*This part is to fill all the resize matrix with the mean value*/
for ( int x = 0; x < resize_parameter/block_size; x++) // x represents the index of the raw in the mean matrix
{
for ( int y = 0; y < resize_parameter/block_size; y++) // y represents the index of the column in the mean matrix
{
for ( int i = 0; i < block_size; i++) // i represents the index of the raw in the mean_full matrix
{
for ( int j = 0; j < block_size; j++) // j represents the index of the column in the mean_full matrix
{
mean_img_full.at<double>((x*block_size)+i,(y*block_size)+j) = mean_img.at<double>(x,y);
}
}
}
}
//cout << cv::getBuildInformation() << endl;
/*This is the end of the part to fill all the resize matrix with the mean value*/
namedWindow("OriginalImage", CV_WINDOW_AUTOSIZE); //create a window with the name "OriginalImage"
imshow("OriginalImage", original_img); //display the image which is stored in the 'original_img' in the "OriginalImage" window
namedWindow("NewImage", CV_WINDOW_AUTOSIZE); //create a window with the name "NewImage"
imshow("NewImage", new_img); //display the image which is stored in the 'new_img' in the "NewImage" window
namedWindow("MeanImage", CV_WINDOW_AUTOSIZE); //create a window with the name "MeanImage"
imshow("MeanImage", mean_img); //display the image which is stored in the 'mean_img' in the "MeanImage" window
namedWindow("MeanFullImage", CV_WINDOW_AUTOSIZE); //create a window with the name "MeanFullImage"
imshow("MeanFullImage", mean_img_full); //display the image which is stored in the 'mean_img_full' in the "MeanFullImage" window
waitKey(0); //wait infinite time for a keypress
destroyWindow("OriginalImage"); //destroy the window with the name, "OriginalImage"
destroyWindow("NewImage"); //destroy the window with the name, "NewImage"
destroyWindow("MeanImage"); //destroy the window with the name, "MeanImage"
destroyWindow("MeanFullImage"); //destroy the window with the name, "MeanImage"
return 0;
}
The problem was in the declared type of each matrix: it has to be 8-bit unsigned char. It is working now. Thanks a lot.