I have written a content-based recommender system in Python 3 using data from a MySQL database. Now I need to move it to Django for production so that I don't have to re-run it manually every time new articles are added to the database. I will connect the database with Django's database connections, but I am really confused about how to restructure this code for Django.
my_recommender_system
import pandas as pd
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from string import punctuation
import functools
from matplotlib import pyplot as plt
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords =True)
from tqdm import tqdm_notebook
tqdm_notebook().pandas()
import numpy as np
import math
from sklearn.metrics.pairwise import linear_kernel
#import text
from collections import Counter
df = pd.read_csv('target.csv')
df = df.loc[:,['id','combined_text']].astype(str)
df["combined_text"] = df["combined_text"].apply(lambda x: ' '.join(pd.unique(x.split())))
df.combined_text = df.combined_text.apply(lambda x: x.lower())
df.combined_text = df.combined_text.str.replace('[^\w\s]',' ')
df['combined_text'] = df['combined_text'].str.replace('\d+', ' ')
df.combined_text = df.combined_text.str.replace('nbsp?' , ' ')
#df.combined_text = df.combined_text.str.replace('nan?' , ' ')
df.combined_text = df.combined_text.str.replace('value?' , ' ')
df = df.dropna(subset = ['combined_text'])
df.combined_text = df.combined_text.str.replace('\s+', ' ')
#df.combined_text.map(len).hist(figsize=(15, 5), bins=100)
df = df[(df.combined_text.map(len) > 600)]
df.reset_index(inplace=True, drop=True)
#df1 = df[(df.combined_text.map(len) > 7500)]
stop_words = []
f = open('stopwords.txt', 'r')
for l in f.readlines():
    stop_words.append(l.replace('\n', ''))
additional_stop_words = ['t','aah','aap','don','doesn','isn','ve','ll','add', 'ndash','will','nan','q','article','lsquo','rsquo','ldquo','rdquo','personalised','please','read','download','app','here','more','experience','based','explore','bull','fact','myth','ndash','middot','lifestage','entire','collection','articles','reading','website','android','phone','a','zero']
stop_words += additional_stop_words
stop_words = list(filter(None, stop_words))
#print(len(stop_words))
def _removeNonAscii(s):
return "".join(i for i in s if ord(i)<128)
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = text.replace('(ap)', '')
    text = re.sub(r"\'s", " is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r"\"", "", text)
    text = re.sub('[^a-zA-Z ?!]+', '', text)
    text = _removeNonAscii(text)
    text = text.strip()
    return text
def tokenizer(text):
    text = clean_text(text)
    tokens = [word_tokenize(sent) for sent in sent_tokenize(text)]
    tokens = list(functools.reduce(lambda x, y: x + y, tokens))
    tokens = list(filter(lambda token: token not in (stop_words + list(punctuation)), tokens))
    return tokens
#df['combined_text'] = df['combined_text'].map(lambda d: str.encode(d.decode('utf-8')))
df['tokens'] = ''
df['tokens'] = df['combined_text'].progress_map(lambda d: tokenizer(d))
df['text_stemmed']=df['tokens'].apply(lambda x : [stemmer.stem(y) for y in x])
df['text_stemmed_sentence']=df['text_stemmed'].apply(lambda x : " ".join(x))
df['stemmed_tokens'] = df['text_stemmed_sentence'].progress_map(lambda d: tokenizer(d))
df = df[['id','text_stemmed_sentence','stemmed_tokens']]
# =============================================================================
# for descripition, tokens in zip(df['combined_text'].head(5), df['tokens'].head(5)):
# print('description:', descripition)
# print('tokens:', tokens)
# print()
#
# =============================================================================
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=5, analyzer='word', ngram_range=(1, 2), stop_words='english')
vz = vectorizer.fit_transform(list(df['stemmed_tokens'].map(lambda tokens: ' '.join(tokens))))
cosine_similarities = linear_kernel(vz,vz)
articlesRecommend = pd.DataFrame(cosine_similarities, columns = df.id, index = df.id)
y = np.array([articlesRecommend[c].nlargest(10).index.values for c in articlesRecommend])
articles_df = pd.DataFrame(data = y, index = articlesRecommend.columns)
The complete answer to this question would be lengthy, but it can be wrapped up simply as follows:
First, make a virtualenv and install Django in it. You will also need to install all the Python packages you used in your program, like pandas etc.
Run django-admin startproject <project_name>. Next, run django-admin startapp <app_name>; this creates an app inside the Django project, since a Django project can contain many apps.
Open <project_name>/<project_name>/settings.py and add the name of your app to the INSTALLED_APPS list.
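For example, if your app were called recommender (a placeholder name, not something from your project), the relevant part of settings.py would look roughly like this:

# settings.py (sketch; 'recommender' stands in for your app's name)
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'recommender',
]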
You will put essentially the same code in your app's views.py, but there should be at least one function taking a request argument that performs the same task.
Something like this:
import pandas # and import other libs
def some_func(request):
    ## your code
Next you will have to map this function to a URL in urls.py; that is something you can find here: mapping the urls to functions in views.py.
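As a rough sketch (the app name and function name are placeholders for your own), the mapping could look like this:

# urls.py (sketch; adjust the import and names to your project)
from django.urls import path
from myapp import views

urlpatterns = [
    path('recommend/', views.some_func, name='recommend'),
]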
Of course, you will have to run the server using python manage.py runserver; you'll then be able to reach your project at 127.0.0.1:8000.
Honestly, if you understand the basic architecture of Django, this is a very easy task to do. This documentation can be of help to you.
Coming to the crux of your question:
You explained that you'll be suggesting the most related articles on the basis of an already existing article. The source of the data (your Laravel project) should send the data in JSON format; you can read that data in a function in views.py, run your already working code on it, and then send the information about the most related articles (their ids, for example) back through some URL. For this you can either use Django's REST framework or simply return a JsonResponse from your function.
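A minimal sketch of such a view, assuming a hypothetical Article model with id and combined_text fields mapped to your MySQL table and reusing the TF-IDF/cosine-similarity part of your script (the cleaning and stemming steps are elided), could look like this:

# views.py (sketch; the Article model and field names are assumptions)
import pandas as pd
from django.http import JsonResponse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from .models import Article  # hypothetical model for your articles table

def recommend(request, article_id):
    # Build the DataFrame from the database instead of target.csv
    rows = Article.objects.values('id', 'combined_text')
    df = pd.DataFrame(list(rows)).astype(str)

    # ... apply the same cleaning/tokenizing/stemming steps from your script here ...

    vectorizer = TfidfVectorizer(min_df=5, analyzer='word',
                                 ngram_range=(1, 2), stop_words='english')
    vz = vectorizer.fit_transform(df['combined_text'])
    cosine_similarities = linear_kernel(vz, vz)

    sims = pd.DataFrame(cosine_similarities, columns=df.id, index=df.id)
    top_ids = sims[str(article_id)].nlargest(10).index.tolist()
    return JsonResponse({'recommended_ids': top_ids})

In practice you would compute the similarity matrix once (for example in a management command or a scheduled job) and only look it up in the view, rather than refitting the vectorizer on every request.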
Related
I am trying to deploy my custom container to a Vertex AI endpoint for predictions. The contents of the application are as follows.
Flask - app.py
import pandas as pd
from flask import Flask, jsonify,request
import tensorflow
import pre_process
import post_process
app = Flask(__name__)
@app.route('/predict', methods=['POST'])
def predict():
    req = request.json.get('instances')
    input_data = req[0]['email']

    # preprocessing
    text = pre_process.preprocess(input_data)
    vector = pre_process.preprocess_tokenizing(text)

    model = tensorflow.keras.models.load_model('model')

    # predict
    prediction = model.predict(vector)

    # postprocessing
    value = post_process.postprocess(list(prediction[0]))

    return jsonify({'output': {'doc_class': value}})

if __name__ == '__main__':
    app.run(host='0.0.0.0')
Dockerfile
FROM python:3.7
WORKDIR /app
COPY . /app
RUN pip install --trusted-host pypi.python.org -r requirements.txt
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app"]
EXPOSE 5050
pre_process.py
#import
import pandas as pd
import pickle
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
def preprocess(text):
"""Do all the Preprocessing as shown above and
return a tuple contain preprocess_email,preprocess_subject,preprocess_text for that Text_data"""
#After you store it in the list, Replace those sentances in original text by space.
text = re.sub("(Subject:).+"," ",text,re.I)
#Delete all the sentances where sentence starts with "Write to:" or "From:".
text = re.sub("((Write to:)|(From:)).+","",text,re.I)
#Delete all the tags like "< anyword >"
text = re.sub("<[^><]+>","",text)
#Delete all the data which are present in the brackets.
text = re.sub("\([^()]+\)","",text)
#Remove all the newlines('\n'), tabs('\t'), "-", "".
text = re.sub("[\n\t\\-]+","",text)
#Remove all the words which ends with ":".
text = re.sub("(\w+:)","",text)
#Decontractions, replace words like below to full words.
lines = re.sub(r"n\'t", " not", text)
lines = re.sub(r"\'re", " are", lines)
lines = re.sub(r"\'s", " is", lines)
lines = re.sub(r"\'d", " would", lines)
lines = re.sub(r"\'ll", " will", lines)
lines = re.sub(r"\'t", " not", lines)
lines = re.sub(r"\'ve", " have", lines)
lines = re.sub(r"\'m", " am", lines)
text = lines
#replace numbers with spaces
text = re.sub("\d+"," ",text)
# remove _ from the words starting and/or ending with _
text = re.sub("(\s_)|(_\s)"," ",text)
#remove 1 or 2 letter word before _
text = re.sub("\w{1,2}_","",text)
#convert all letters to lowercase and remove the words which are greater
#than or equal to 15 or less than or equal to 2.
text = text.lower()
text =" ".join([i for i in text.split() if len(i)<15 and len(i)>2])
#replace all letters except A-Z,a-z,_ with space
preprocessed_text = re.sub("\W+"," ",text)
return preprocessed_text
def preprocess_tokenizing(text):
    # from tf.keras.preprocessing.text import Tokenizer
    # from tf.keras.preprocessing.sequence import pad_sequences
    tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))
    max_length = 1019
    tokenizer.fit_on_texts([text])
    encoded_docs = tokenizer.texts_to_sequences([text])
    text_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    return text_padded
post_process.py
def postprocess(vector):
    index = vector.index(max(vector))
    classes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
    return classes[index]
requirements.txt
gunicorn
pandas==1.3.3
numpy==1.19.5
flask
flask-cors
h5py==3.1.0
scikit-learn==0.24.2
tensorflow==2.6.0
model
tokenizer.pkl
I am following this blog (vertex ai deployment) for the gcloud console commands to containerise and deploy the model to an endpoint. But the model takes forever to deploy and ultimately fails to get deployed.
When I run the container on localhost it works as expected, but it does not get deployed to the Vertex AI endpoint. I don't understand whether the problem is in the Flask app.py, in the Dockerfile, or somewhere else.
I was able to resolve this issue by adding a health route to the HTTP server. I added the following piece of code in my Flask app:
@app.route('/healthz')
def healthz():
    return "OK"
I am trying to make a Friday-like virtual assistant using this code:
import os
from gtts import gTTS
import time
import playsound
import speech_recognition as sr
while True:
    def speak(text):
        tts = gTTS(text=text, lang="en")
        filename = "voice.mp3"
        tts.save(filename)
        playsound.playsound(filename)

    def get_audio():
        r = sr.Recognizer()
        with sr.Microphone() as source:
            audio = r.listen(source)
            said = ""
            try:
                said = r.recognize_google(audio)
                print(said)
            except Exception as e:
                print("Exception: " + str(e))
        return said

    text = get_audio()

    if "who are you" in text:
        speak(" I am Monday the virtual assistant")
And I was wondering how to add Wolfram Alpha to it, so I could say "search for ..." and it would then speak the answer from Wolfram Alpha.
Any help would be amazing :)
Install wolframalpha
Then add the following to your code:
import wolframalpha
if 'search for ' in text:
    text = text.replace("search for ", "")
    client = wolframalpha.Client(app_id)
    res = client.query(text)
    print(next(res.results).text)
    speak(next(res.results).text)
To use the API, you have to go to the homepage, sign up for an account, create an app and get an app id.
To avoid getting any errors, keep the indentation in your 'speak' function uniform.
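Putting it together, the Wolfram Alpha check can sit right after text = get_audio() in your while loop, alongside the existing "who are you" check. This is only a sketch, not a drop-in file, and app_id below is a placeholder for the ID you get from your Wolfram Alpha account:

import wolframalpha

app_id = "YOUR-APP-ID"  # placeholder: paste the app id from your Wolfram Alpha account
client = wolframalpha.Client(app_id)

# inside the while True loop, after text = get_audio():
if "who are you" in text:
    speak("I am Monday the virtual assistant")
elif "search for" in text:
    query = text.replace("search for", "").strip()
    res = client.query(query)
    try:
        answer = next(res.results).text
    except StopIteration:
        answer = "I could not find an answer for that."
    speak(answer)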
I'm trying to speed up a web-scraping process by sending raw data to Python instead of correctly formatted data.
The data currently arrives as an Excel file, formatted as:
26 EXAMPLE RD EXAMPLEVILLE SA 5000
The data is formatted in Excel via macros to:
Replace all spaces with hyphen
Change all text to lower-case
Paste text onto end of http://example.com/property/
Formatted data is http://www.example.com/property/26-example-rd-exampleville-sa-5000
What I'm trying to accomplish:
Get Python to read the Excel sheet, follow the formatting rules listed above, then pass the records to the scraper.
Here is the code I have been trying to put together - please go easy, I am VERY new.
Any advice or reading sources related to Python formatting would be appreciated.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import csv
from lxml import html
import xlrd
# URL_BUILDER
# Source File for UNFORMATTED DATA
file_location = "C:\Python27\Projects\REA_SCRAPER\NewScraper\ScrapeFile.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_name('((PythonScraperDNC))')
# REA_SCRAPER
# Pass Data from URL_BUILDER to URL_LIST []
URL_LIST = []
# Search Phrase to capture suitable URL's for Scraping
text2search = \
'''<p class="property-value__title">
RECENTLY SOLD
</p>'''
# Write Sales .CSV file
with open('Results.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for (index, url) in enumerate(URL_LIST):
        page = requests.get(url)
        print '<Scanning Url For Sale>'
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title, ) = (x.text_content() for x in tree.xpath('//title'))
            (price, ) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold, ) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])
        else:
            writer.writerow(['No Sale'])
If you're just trying to figure out how to do the formatting in Python:
text = '26 EXAMPLE RD EXAMPLEVILLE SA 5000'
url = 'http://example.com/property/' + text.replace(' ', '-').lower()
print(url)
# Output:
# http://example.com/property/26-example-rd-exampleville-sa-5000
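If you also want Python to read the raw addresses straight out of the spreadsheet with xlrd and build URL_LIST for the scraper, a rough sketch (assuming the addresses sit in the first column of the '((PythonScraperDNC))' sheet) would be:

# Sketch: build URL_LIST from the spreadsheet (the column index is an assumption)
import xlrd

file_location = "C:\\Python27\\Projects\\REA_SCRAPER\\NewScraper\\ScrapeFile.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_name('((PythonScraperDNC))')

URL_LIST = []
for row in range(sheet.nrows):
    raw = sheet.cell_value(row, 0)  # assumes the address is in column A
    slug = str(raw).strip().replace(' ', '-').lower()
    URL_LIST.append('http://www.example.com/property/' + slug)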
I've read through the other Stack Overflow questions regarding this, but they don't answer my issue, so down vote away. It's version 2.7.
All I want to do is use Python to convert a PDF to a Word doc, or at minimum convert it to text so I can copy and paste into a Word doc.
This is the code I have so far. All it prints is the female gender symbol.
Is my code wrong? Am I approaching this wrong? Do some PDFs just not work with PDFMiner? Do you know of any other alternatives to accomplish my goal of converting a PDF to Word, besides using PyPDF2 or PDFMiner?
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file('Bottom Dec.pdf', 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text
print convert_pdf_to_txt(1)
from pdf2docx import Converter
pdf_file = r'E:\Muhammad UMER LAR.pdf'
doc_file = r'E:\Lari.docx'
c = Converter(pdf_file)
c.convert(doc_file)
c.close()
Another alternative is the Aspose.Words Cloud SDK for Python, which you can install from pip, for PDF-to-DOC conversion.
import asposewordscloud
import asposewordscloud.models.requests
api_client = asposewordscloud.ApiClient()
api_client.configuration.host = 'https://api.aspose.cloud'
# Get AppKey and AppSID from https://dashboard.aspose.cloud/
api_client.configuration.api_key['api_key'] = 'xxxxxxxxxxxxxxxxxxxxx' # Put your appKey here
api_client.configuration.api_key['app_sid'] = 'xxxxxxxxx-xxxx-xxxxx-xxxx-xxxxxxxxxx' # Put your appSid here
words_api = asposewordscloud.WordsApi(api_client)
filename = '02_pages.pdf'
remote_name = 'TestPostDocumentSaveAs.pdf'
dest_name = 'TestPostDocumentSaveAs.doc'
#upload PDF file to storage
request_storage = asposewordscloud.models.requests.UploadFileRequest(filename, remote_name)
response = words_api.upload_file(request_storage)
#Convert PDF to DOC and save to storage
save_options = asposewordscloud.SaveOptionsData(save_format='doc', file_name=dest_name)
request = asposewordscloud.models.requests.SaveAsRequest(remote_name, save_options)
result = words_api.save_as(request)
print("Result {}".format(result))
I'm developer evangelist at Aspose.
I am creating an application in Python that parses weather data from yr.no. It works fine with regular ASCII strings, but fails when I use unicode.
def GetYRNOWeatherData(country, province, place):
    # Parse the XML file
    wtree = ET.parse(urllib.urlopen("http://www.yr.no/place/" + string.replace(country, ' ', '_').encode('utf-8') + "/" + string.replace(province, ' ', '_').encode('utf-8') + "/" + string.replace(place, ' ', '_').encode('utf-8') + "/forecast.xml"))
For example, when I try
GetYRNOWeatherData("France", "Île-de-France", "Paris")
I get this error
'charmap' codec can't encode character u'\xce' in position 0: character maps to <undefined>
Is it true that urllib doesn't handle unicode very well? Since I am using Tkinter as a frontend to this function, would that be the source of the problem (does the Tkinter Entry widget handle unicode well?)
You can handle this by keeping every string as a unicode right up until you actually make the urllib.urlopen request, at which point you encode to utf-8:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# This import makes all literal strings in the file default to
# type 'unicode' rather than type 'str'. You don't need to use this,
# but you'd need to do u"France" instead of just "France" below, and
# everywhere else you have a string literal.
from __future__ import unicode_literals
import urllib
import xml.etree.ElementTree as ET
def do_format(*args):
    ret = []
    for arg in args:
        ret.append(arg.replace(" ", "_"))
    return ret

def GetYRNOWeatherData(country, province, place):
    country, province, place = do_format(country, province, place)
    url = "http://www.yr.no/place/{}/{}/{}/forecast.xml".format(country, province, place)
    wtree = ET.parse(urllib.urlopen(url.encode('utf-8')))
    return wtree

if __name__ == "__main__":
    GetYRNOWeatherData("France", "Île-de-France", "Paris")