Improving accuracy in Python Tesseract OCR - django

I am using pytesseract along with OpenCV in a simple Django application to extract Bengali text from image files. I have a form that lets you upload an image; clicking the submit button sends it to the server side via a jQuery AJAX call, which extracts the text from the image to serve the purpose of OCR (Optical Character Recognition).
Template part:
<div style="text-align: center;">
<div id="result" class="text-center"></div>
<form enctype="multipart/form-data" id="ocrForm" action="{% url 'process_image' %}" method="post"> <!-- Do not forget to add: enctype="multipart/form-data" -->
{% csrf_token %}
{{ form }}
<button type="submit" class="btn btn-success">OCRzed</button>
</form>
<br><br><hr>
<div id="content" style="width: 50%; margin: 0 auto;">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){
function submitFile(){
var fd = new FormData();
fd.append('file', getFile())
$("#result").html('<span class="wait">Please wait....</span>');
$('#content').html('');
$.ajax({
url: "{% url 'process_image' %}",
type: "POST",
data: fd,
processData: false,
contentType: false,
success: function(data){
// console.log(data.content);
$("#result").html('');
if(data.content){
$('#content').html(
"<p>" + data.content + "</p>"
)
}
}
})
}
function getFile(){
var fp = $("#file_id")
var item = fp[0].files
return item[0]
}
// Submit the file for OCRization
$("#ocrForm").on('submit', function(event){
event.preventDefault();
submitFile()
})
});
</script>
The urls.py file has:
from django.urls import path, re_path
from .views import *

urlpatterns = [
    path('process_image', OcrView.process_image, name='process_image'),
]
The view part:
from django.contrib.auth.models import User
from django.shortcuts import render, redirect, get_object_or_404
from .forms import NewTopicForm, UploadForm  # UploadForm is used by OcrView below
from .models import Board, Topic, Post
from django.http import HttpResponse
from django.http import Http404
from django.http import JsonResponse
from django.views.generic import FormView
from django.views.decorators.csrf import csrf_exempt
import json
import cv2
import numpy as np
import pytesseract  # ======= > Add
try:
    from PIL import Image
except ImportError:
    import Image


def ocr(request):
    return render(request, 'ocr.html')
    # {'board': board, 'form': form})


# get grayscale image
def get_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


# noise removal
def remove_noise(image):
    return cv2.medianBlur(image, 5)


# thresholding
def thresholding(image):
    return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]


# dilation
def dilate(image):
    kernel = np.ones((5, 5), np.uint8)
    return cv2.dilate(image, kernel, iterations=1)


# erosion
def erode(image):
    kernel = np.ones((5, 5), np.uint8)
    return cv2.erode(image, kernel, iterations=1)


# opening - erosion followed by dilation
def opening(image):
    kernel = np.ones((5, 5), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)


# canny edge detection
def canny(image):
    return cv2.Canny(image, 100, 200)


# skew correction
def deskew(image):
    coords = np.column_stack(np.where(image > 0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated


# template matching
def match_template(image, template):
    return cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)


class OcrView(FormView):
    form_class = UploadForm
    template_name = 'ocr.html'
    success_url = '/'

    @csrf_exempt
    def process_image(request):
        if request.method == 'POST':
            response_data = {}
            upload = request.FILES['file']
            filestr = request.FILES['file'].read()
            # convert string data to numpy array
            npimg = np.fromstring(filestr, np.uint8)  # np.frombuffer is the non-deprecated equivalent
            image = cv2.imdecode(npimg, cv2.IMREAD_UNCHANGED)
            # image = Image.open(upload)
            gray = get_grayscale(image)
            thresh = thresholding(gray)
            opening1 = opening(gray)
            canny1 = canny(gray)
            pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
            # content = pytesseract.image_to_string(Image.open(upload), lang='ben')
            # content = pytesseract.image_to_string(image, lang='ben')
            content = pytesseract.image_to_string(image, lang='eng+ben')
            # data_ben = process_image("test_ben.png", "ben")
            response_data['content'] = content
            return JsonResponse(response_data)
I am attaching a sample image just below; when I give it as the input file, the text extracted from it is not accurate to any satisfactory level. The input image is:
I am also attaching a screenshot of the extracted text with the wrong words underlined in red. Note that the spaces and indentation are not preserved there. The screenshot of the extracted text is:
In the above code snippet, I have done the image processing with the following code lines:
gray = get_grayscale(image)
thresh = thresholding(gray)
opening1 = opening(gray)
canny1 = canny(gray)
After that, I have fed Tesseract the processed image in the following line:
content = pytesseract.image_to_string(image, lang='eng+ben')
But my point of confusion is that I have nowhere saved the image before or after processing. So when I use the above line, I am not sure whether the processed or unprocessed image is supplied to the Tesseract engine.
Q1) Do I need to save the image after processing it and then supply it to the Tesseract engine? If yes, how do I do that?
Q2) What other steps should I take to improve the accuracy?
NB: Even if you are not familiar with the Bengali language, I think this won't be a problem, as you can just look at the red-underlined words and make a comparison.
EDIT:
TL;DR:
You can just look at the code in the views.py and urls.py files and skip the template code for easier understanding.

Q1) No need to save the image. The image is stored in your variable image.
Q2) You are not actually running OCR on the image that the post-processing functions were applied to, i.e. the variable canny1. The code below applies the processing steps to image one after another and then runs OCR on the post-processed image stored in canny1.
gray = get_grayscale(image)
thresh = thresholding(gray)
opening1 = opening(thresh)
canny1 = canny(opening1)
content = pytesseract.image_to_string(canny1, lang='eng+ben')
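If you still want to write the intermediate images to disk to see exactly what Tesseract receives (Q1), cv2.imwrite is enough; and for Q2, Tesseract's page-segmentation and engine modes are often worth tuning as well. Below is a minimal sketch; the debug file names and the --psm/--oem values are assumptions to experiment with, not part of the original code:
import cv2
import pytesseract

# Write the intermediate results to disk purely for inspection
# (the file names here are arbitrary).
cv2.imwrite('debug_gray.png', gray)
cv2.imwrite('debug_thresh.png', thresh)

# --psm 6 assumes a single uniform block of text; --oem 3 lets Tesseract
# pick the best available OCR engine.
custom_config = r'--oem 3 --psm 6'
content = pytesseract.image_to_string(thresh, lang='ben', config=custom_config)
Since cv2.Canny produces an edge map rather than dark text on a clean background, the thresholded image is often the better OCR input in practice; it is cheap to try both and compare.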

Related

How to include images in xhtml2pdf generated pdf files?

I am running a Streamlit app which generates reports containing images and dataframes. I have used Jinja2 to generate the HTML file from a template. I would now like to convert it to a PDF file with xhtml2pdf so it can be downloaded.
How do I do that?
import io

from jinja2 import Environment, FileSystemLoader
from xhtml2pdf import pisa


def convert_html_to_pdf(source_html, output_filename="temp/report.pdf"):
    result_file = io.BytesIO()
    pdf = pisa.CreatePDF(
        source_html,
        dest=result_file)
    # pisa.CreatePDF returns a status object; the PDF bytes are in result_file
    return result_file.getvalue()


def load_template():
    env = Environment(loader=FileSystemLoader('templates'))
    template = env.get_template('catAnalysisTemplate.html')
    return template


def render_report(data, filename="report"):
    template = load_template()
    html = template.render(data)
    # with open(f'temp/{filename}.html', 'w') as f:
    #     f.write(html)
    pdf = convert_html_to_pdf(html)
    return [html, pdf]
This works fine except that the images are not included in the PDF file. My static images are stored in:
img/
    logo.png
and the charts I generate in memory, like this:
import io

import matplotlib.pyplot as plt


def plot_co_attainment(qp):
    buf = io.BytesIO()
    data = qp.co_attainment()[["Level", "Perc_Attainment"]]
    plt.figure(dpi=150)
    plt.bar(data["Level"], data["Perc_Attainment"], width=0.5, color=colors)  # colors is not defined in this snippet
    for i, val in enumerate(data["Perc_Attainment"].values):
        plt.text(i, val, str(val) + "%",
                 horizontalalignment='center',
                 verticalalignment='bottom',
                 fontdict={'fontweight': 500, 'size': 20})
    plt.xlabel("Course Outcomes")
    plt.ylabel("Percentage of Attainment")
    plt.ylim((0, 110))
    plt.savefig(buf, format='jpg')
    return buf
How do I connect the dots and get the images in my pdf file?
I was having the same issue. The way I solved it was to use a link_callback and return the data as a data: URI containing the PNG image data.
This example takes the src attribute and uses it to generate a square image in that color, which is embedded in the PDF. Sadly this doesn't let you modify the image tag itself, so you can't change the sizes/classes or anything else.
Using something like this opens the way to embedding just about anything without having to add it to your template directly.
from base64 import b64encode
from io import BytesIO

from PIL import Image
from xhtml2pdf import pisa

html_src = """
<body>
    <div>
        <img src="red"/>
        <img src="green"/>
        <img src="blue"/>
    </div>
</body>
"""


def link_callback(src_attr, *args):
    """
    Returns the image data for use by the pdf renderer
    """
    img_out = BytesIO()
    img = Image.new("RGB", (100, 100), src_attr)
    img.save(img_out, "png")
    # decode() so the base64 payload is a str, not a bytes repr
    return f"data:image/png;base64,{b64encode(img_out.getvalue()).decode()}"


def main():
    with open("one.pdf", "wb") as f:
        pizza = pisa.CreatePDF(
            html_src,
            dest=f,
            link_callback=link_callback,
        )


if __name__ == "__main__":
    main()
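For the static logo under img/ in the question, the same link_callback hook can map the src URI to a real path on disk instead of generating image data. Below is a minimal sketch, assuming the img/ directory sits next to the script; the directory layout and the function name static_link_callback are assumptions, not part of the answer above:
import io
import os

from xhtml2pdf import pisa

BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def static_link_callback(uri, rel):
    # Turn 'img/logo.png'-style src values into absolute filesystem paths
    # that the PDF renderer can open; leave anything else untouched.
    candidate = os.path.join(BASE_DIR, uri)
    return candidate if os.path.isfile(candidate) else uri


def convert_html_to_pdf(source_html):
    result_file = io.BytesIO()
    pisa.CreatePDF(source_html, dest=result_file, link_callback=static_link_callback)
    return result_file.getvalue()
For the charts built in memory, the data: URI approach from the answer works as well: base64-encode buf.getvalue() and put the resulting string straight into the img src via the template context.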

Failing to display images

I am writing a Django program which is a clone of Craigslist but displays images of the searched products. The issue is that I am failing to display the actual image on the card; I am only getting the image icon at the top left corner of the card.
import requests
from bs4 import BeautifulSoup
from django.shortcuts import render
from urllib.parse import quote_plus
from . import models

BASE_CRAIGSLIST_URL = 'https://losangeles.craigslist.org/d/services/search/bbb?query={}'
BASE_IMAGE_URL = 'https://images.craigslist.org/{}_300x300.jpg'


# Create your views here.
def home(request):
    return render(request, 'base.html')


def new_search(request):
    search = request.POST.get('search')
    models.Search.objects.create(search=search)
    final_url = BASE_CRAIGSLIST_URL.format(quote_plus(search))
    response = requests.get(final_url)
    data = response.text
    soup = BeautifulSoup(data, features='html.parser')
    post_listings = soup.find_all('li', {'class': 'result-row'})
    final_postings = []
    for post in post_listings:
        post_title = post.find(class_='result-title').text
        post_url = post.find('a').get('href')
        if post.find(class_='result-price'):
            post_price = post.find(class_='result-price').text
        else:
            post_price = 'N/A'
        if post.find(class_='result-image').get('data-ids'):
            post_image_id = post.find(class_='result-image').get('data-ids').split(',')[0].split(':')
            post_image_url = BASE_IMAGE_URL.format(post_image_id)
            print(post_image_url)
        else:
            post_image_url = 'https://craigslist.org/images/peace.jpg'
        final_postings.append((post_title, post_url, post_price, post_image_url))
    stuff_for_frontend = {
        'search': search,
        'final_postings': final_postings,
    }
    return render(request, 'my_app/new_search.html', stuff_for_frontend)
So I have figured it out. I was trying to access a single image, yet the attribute held a whole slideshow of images, so I had to select the first image and display that one, like this:
post_image_id = post.find(class_='result-image').get('data-ids').split(',')[0].split(':')[1]
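For illustration, here is how the extra [1] changes what gets interpolated into BASE_IMAGE_URL; the data-ids value below is made up, not real Craigslist data:
BASE_IMAGE_URL = 'https://images.craigslist.org/{}_300x300.jpg'

# Hypothetical data-ids attribute value for one listing
data_ids = '3:00A0A_example1,3:00B0B_example2'

without_index = data_ids.split(',')[0].split(':')      # ['3', '00A0A_example1']
first_image_id = data_ids.split(',')[0].split(':')[1]  # '00A0A_example1'

print(BASE_IMAGE_URL.format(without_index))   # the whole list lands in the URL -> broken image
print(BASE_IMAGE_URL.format(first_image_id))  # https://images.craigslist.org/00A0A_example1_300x300.jpg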

Matplotlib image not displaying in Django template

I have a seaborn barplot image that I would like to display inside a django template.
I have looked at a few solutions and I managed to display the image with HttpResponse. However, that displays the image on its own, as the whole page. What I want is the plot shown as an image inside a webpage where you can still click the navbar, the home button, etc.
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg
from io import BytesIO
from django.template.loader import render_to_string
In views.py
def top_causal(request):
    # code for data manipulation
    f = plt.figure(figsize=(10, 5))
    ax = sns.barplot(data)
    # labels, title etc...
    FigureCanvasAgg(f)
    buf = BytesIO()
    plt.savefig(buf, format='png')
    plt.close(f)

    # this works
    response = HttpResponse(buf.getvalue(), content_type='image/png')
    return response

    # What I want, which is to display the image inside a template (does not work)
    buffer = buf.getvalue()
    content_type = "image/png"
    graphic = (buffer, content_type)
    rendered = render_to_string('top_causal.html', {'graphic': graphic})
    return HttpResponse(rendered)
in top_causal.html
<img src="data:image/png;base64,{{ graphic|safe }}" alt="Not working.">
The HttpResponse version displays the image as its own page. The render_to_string version returns this instead of an image inside the webpage (I can still see the navbar):
\x06\x1d\x00\x80kT\xb7n]eee\x95\x18\xff\xe1\x87\x1f$I\xf5\xea\xd5\xb3\x19\xcf\xce\xce.\x11\x9b\x9d\x9d-WWW999U\xb8~\xa7N\x9d$\xfd\xf6\x02\xb2\xb0\xb00IR\x9b6mt\xe4\xc8\x91\x12\xb1G\x8e\x1c\x91\xc5b\xb9\xea\xe7\xe1\xda\xb5k\xa7\xf7\xdf\x7f_\xc5\xc5\xc5:|\xf8\xb0V\xadZ\xa5\xe7\x9f\x7f^\xb5j\xd5\xd2\xd4\xa9S\xafx\xee\xdbo\xbf\xad\x81\x03\x07j\xc9\x92%6\xe3\xb9\xb9\xb9\xe5\xba\x9e\xbau\xeb\xcab\xb1h\xc2\x84\t\x9a0aB\xa91M\x9b6-W\xae\xab\xa9W\xaf\x9e\xaaU\xab\xa6\xbd{\xf7\xaaj\xd5\xaa%\xe6]\\\\\xecR\xe7\xb2;\xef\xbcSqqq\x92~{Q\xde\xbb\xef\xbe\xaby\xf3\xe6\xa9\xb8\xb8X\x8b\x16-\xb2k-\x00\x80\xe3b\x07\x1d\x00\x80k\xd4\xb7o_\xa5\xa4\xa4\x94x\x13w||\xbc\xaaT\xa9\xa2\x90\x90\x10\x9b\xf1\xf7\xdf\x7f\xdfz\x8b\xb5$\x9d;wN[\xb7n-\x11W^\x97o\x8do\xd6\xac\x99ul\xc8\x90!JIIQJJ\x8au\xec\xd7 ...
You need to convert the image to a byte array, encode it to base64, and embed it in your template. Here is how you do it:
<!DOCTYPE html>
<!--my_template.html-->
<html>
    <div class='section'>
        <div class='flex column plot mt15 mlr15 justify-left align-left'>
            <img src='{{ imuri }}'/>
        </div>
    </div>
</html>
import base64
import io
import urllib.parse

import matplotlib.pyplot as plt
from django.template.loader import get_template


def top_causal(request):
    # .......................
    # Get your plot ready ...
    # .......................
    figure = plt.gcf()
    buf = io.BytesIO()
    figure.savefig(buf, format='png', transparent=True, quality=100, dpi=200)
    buf.seek(0)
    imsrc = base64.b64encode(buf.read())
    imuri = 'data:image/png;base64,{}'.format(urllib.parse.quote(imsrc))
    # The context key has to match the template variable name ({{ imuri }})
    context = {'imuri': imuri}
    # Now embed that into your template
    template = get_template('my_template.html')
    html = template.render(context=context)
    # Your template is ready to go...
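The snippet above stops at template.render(); the view still has to return an HTTP response. Here is a minimal, self-contained sketch of the whole round trip, assuming the same my_template.html; Django's render() shortcut does the template loading and response construction in one call, and the placeholder plot is only for illustration:
import base64
import io
import urllib.parse

import matplotlib.pyplot as plt
from django.shortcuts import render


def top_causal(request):
    plt.figure()
    plt.plot([1, 2, 3], [4, 5, 6])  # placeholder plot

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)
    imuri = 'data:image/png;base64,{}'.format(
        urllib.parse.quote(base64.b64encode(buf.read())))

    # render() fills {{ imuri }} in my_template.html and wraps it in an HttpResponse
    return render(request, 'my_template.html', {'imuri': imuri})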
I discarded the idea of using matplotlib and used Highcharts (https://www.highcharts.com/) instead to plot my graph and display it on the webpage.
The data it displays is retrieved using Django's ORM and query API.

Store matplotlib plots in a Django model BinaryField then render them directly from the database

How can I store a matplotlib plot in a Django BinaryField then render it directly to a template?
These are the commands I use to save a matplotlib image to a BinaryField type:
The field (I haven't seen anything saying storing binary in a separate table is good practice):
class Blob(models.Model):
    blob = models.BinaryField(blank=True, null=True, default=None)
To generate and save the image:
import io
import matplotlib.pyplot as plt
import numpy as np
from myapp.models import Blob
# Any old code to generate a plot - NOTE THIS MATPLOTLIB CODE IS NOT THREADSAFE, see http://stackoverflow.com/questions/31719138/matplotlib-cant-render-multiple-contour-plots-on-django
t = np.arange(0.0, gui_val_in, gui_val_in/200)
s = np.sin(2*np.pi*t)
plt.figure(figsize=(7, 6), dpi=300, facecolor='w')
plt.plot(t, s)
plt.xlabel('time (n)')
plt.ylabel('temp (c)')
plt.title('A sample matplotlib graph')
plt.grid(True)
# Save it into a BytesIO type then use BytesIO.getvalue()
f = io.BytesIO() # StringIO if Python <3
plt.savefig(f)
b = Blob(blob=f.getvalue())
b.save()
To display it, I create the following in myapp/views.py:
def image(request, blob_id):
    b = Blob.objects.get(id=blob_id)
    response = HttpResponse(b.blob)
    response['Content-Type'] = "image/png"
    response['Cache-Control'] = "max-age=0"
    return response
Add to myapp/urls.py:
url(r'^image/(?P<blob_id>\d+)/$', views.image, name='image'),
And in the template:
<img src="{% url 'myapp:image' item.blob_id %}" alt="{{ item.name }}" />

Embedding local video using Django

I'm using Django to create a webpage which displays a video clip that will be created by the visitors themselves. The following is my code:
from django.shortcuts import render
from django.http import HttpResponse
from django import forms
import numpy as np
from math import e
from matplotlib import pyplot as plt
from matplotlib import animation
from wsgiref.util import FileWrapper


class NumForm(forms.Form):
    n1 = forms.FloatField(label='Slope_Upper')
    n2 = forms.FloatField(label='Slope_Lower')


fig = plt.figure()
ax = plt.axes(xlim=(-3, 3), ylim=(0, 1))
ax.grid()
line, = ax.plot([], [], lw=2)


def psych(x, y, z):
    return 1/(1 + e**(-1*y*(x - z)))


def init():
    line.set_data([], [])
    return line,


def animate(i, lo, up):
    x = np.linspace(-3, 3, 1000)
    dif = up - lo
    y = psych(x, .001*(up - lo)*i, 0)
    line.set_data(x, y)
    return line,


def graph(request):
    if request.method == 'POST':
        form = NumForm(request.POST)
        if form.is_valid():
            sl_u = form.cleaned_data['n1']
            sl_l = form.cleaned_data['n2']
            anim = animation.FuncAnimation(fig, animate, init_func=init,
                                           fargs=(sl_l, sl_u), frames=300, interval=20, blit=True)
            anim.save(filename='video.mp4', fps=30, extra_args=['-vcodec', 'libx264'])
            return render(request, 'plotting/video2.html')
    else:
        form = NumForm()
    return render(request, 'plotting/retry.html', {'form': form})
And 'video2.html' is like this:
<html>
    <body>
        <embed src="file:///c:/plot/video.mp4">
    </body>
</html>
When I run the local server and visit the webpage, I simply can't play the video. (The play button is not activated.) The question is: How could I play the video 'video.mp4', which is in my local folder, on the webpage I've created using Django?
The "controls" attribute adds video controls, like play, pause, and volume.
<video width="320" height="240" controls>
    <source src="{{ MEDIA_URL }}movie.mp4" type="video/mp4">
</video>
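On the question's side, the underlying problem is that browsers will not load file:///c:/... URLs from a page served over HTTP, so the clip needs to be written somewhere Django serves as media. Below is a minimal sketch of that step, assuming MEDIA_ROOT and MEDIA_URL are configured in settings.py and media serving is enabled in development; save_and_render is a hypothetical helper you would call from graph() once anim has been built:
import os

from django.conf import settings
from django.shortcuts import render


def save_and_render(request, anim):
    """Save a finished FuncAnimation under MEDIA_ROOT and render the player page."""
    video_path = os.path.join(settings.MEDIA_ROOT, 'video.mp4')
    anim.save(filename=video_path, fps=30, extra_args=['-vcodec', 'libx264'])

    # The template can then use an HTTP URL instead of a file:// path, e.g.
    # <video controls><source src="{{ video_url }}" type="video/mp4"></video>
    return render(request, 'plotting/video2.html',
                  {'video_url': settings.MEDIA_URL + 'video.mp4'})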