Custom container deployment in Vertex AI - Flask

I am trying to deploy my custom container to a Vertex AI endpoint for predictions. The contents of the application are as follows.
Flask - app.py
import pandas as pd
from flask import Flask, jsonify,request
import tensorflow
import pre_process
import post_process
app = Flask(__name__)
@app.route('/predict', methods=['POST'])
def predict():
    req = request.json.get('instances')
    input_data = req[0]['email']
    # preprocessing
    text = pre_process.preprocess(input_data)
    vector = pre_process.preprocess_tokenizing(text)
    model = tensorflow.keras.models.load_model('model')
    # predict
    prediction = model.predict(vector)
    # postprocessing
    value = post_process.postprocess(list(prediction[0]))
    return jsonify({'output': {'doc_class': value}})

if __name__ == '__main__':
    app.run(host='0.0.0.0')
Dockerfile
FROM python:3.7
WORKDIR /app
COPY . /app
RUN pip install --trusted-host pypi.python.org -r requirements.txt
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app"]
EXPOSE 5050
pre_process.py
#import
import pandas as pd
import pickle
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
def preprocess(text):
    """Do all the preprocessing as shown above and
    return a tuple containing preprocess_email, preprocess_subject, preprocess_text for that Text_data"""
    # After you store it in the list, replace those sentences in the original text by a space.
    text = re.sub("(Subject:).+", " ", text, re.I)
    # Delete all the sentences where the sentence starts with "Write to:" or "From:".
    text = re.sub("((Write to:)|(From:)).+", "", text, re.I)
    # Delete all the tags like "< anyword >"
    text = re.sub("<[^><]+>", "", text)
    # Delete all the data which are present in the brackets.
    text = re.sub("\([^()]+\)", "", text)
    # Remove all the newlines('\n'), tabs('\t'), "-", "".
    text = re.sub("[\n\t\\-]+", "", text)
    # Remove all the words which end with ":".
    text = re.sub("(\w+:)", "", text)
    # Decontractions: replace contractions like below with full words.
    lines = re.sub(r"n\'t", " not", text)
    lines = re.sub(r"\'re", " are", lines)
    lines = re.sub(r"\'s", " is", lines)
    lines = re.sub(r"\'d", " would", lines)
    lines = re.sub(r"\'ll", " will", lines)
    lines = re.sub(r"\'t", " not", lines)
    lines = re.sub(r"\'ve", " have", lines)
    lines = re.sub(r"\'m", " am", lines)
    text = lines
    # Replace numbers with spaces.
    text = re.sub("\d+", " ", text)
    # Remove _ from words starting and/or ending with _.
    text = re.sub("(\s_)|(_\s)", " ", text)
    # Remove 1- or 2-letter words before _.
    text = re.sub("\w{1,2}_", "", text)
    # Convert all letters to lowercase and remove words whose length is
    # greater than or equal to 15 or less than or equal to 2.
    text = text.lower()
    text = " ".join([i for i in text.split() if len(i) < 15 and len(i) > 2])
    # Replace all characters except A-Z, a-z, _ with a space.
    preprocessed_text = re.sub("\W+", " ", text)
    return preprocessed_text

def preprocess_tokenizing(text):
    # from tf.keras.preprocessing.text import Tokenizer
    # from tf.keras.preprocessing.sequence import pad_sequences
    tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))
    max_length = 1019
    tokenizer.fit_on_texts([text])
    encoded_docs = tokenizer.texts_to_sequences([text])
    text_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    return text_padded
post_process.py
def postprocess(vector):
    index = vector.index(max(vector))
    classes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
    return classes[index]
requirements.txt
gunicorn
pandas==1.3.3
numpy==1.19.5
flask
flask-cors
h5py==3.1.0
scikit-learn==0.24.2
tensorflow==2.6.0
model
tokenizer.pkl
I am following this blog vertex ai deployment for the gcloud console commands to containerise the model and deploy it to an endpoint. But the model takes forever to deploy and ultimately fails to deploy.
After running the container on localhost it behaves as expected, but it does not get deployed to the Vertex AI endpoint. I don't understand whether the problem is in the Flask app.py, in the Dockerfile, or somewhere else.

I was able to resolve this issue by adding a health route to the HTTP server. I added the following piece of code to my Flask app:
@app.route('/healthz')
def healthz():
    return "OK"

Related

Django for production of my recommender system

I have written a content-based recommender system in Python 3 using data from a MySQL database. Now I have to use Django for production so that I do not need to take input each time new articles are added to the database. How do I convert this Python code for Django production? I will connect the database with Django's database connections. I am really confused about how to write this code in Django.
my_recommender_system
import pandas as pd
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from string import punctuation
import functools
from matplotlib import pyplot as plt
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords =True)
from tqdm import tqdm_notebook
tqdm_notebook().pandas()
import numpy as np
import math
from sklearn.metrics.pairwise import linear_kernel
#import text
from collections import Counter
df = pd.read_csv('target.csv')
df = df.loc[:,['id','combined_text']].astype(str)
df["combined_text"] = df["combined_text"].apply(lambda x: ' '.join(pd.unique(x.split())))
df.combined_text = df.combined_text.apply(lambda x: x.lower())
df.combined_text = df.combined_text.str.replace('[^\w\s]',' ')
df['combined_text'] = df['combined_text'].str.replace('\d+', ' ')
df.combined_text = df.combined_text.str.replace('nbsp?' , ' ')
#df.combined_text = df.combined_text.str.replace('nan?' , ' ')
df.combined_text = df.combined_text.str.replace('value?' , ' ')
df = df.dropna(subset = ['combined_text'])
df.combined_text = df.combined_text.str.replace('\s+', ' ')
#df.combined_text.map(len).hist(figsize=(15, 5), bins=100)
df = df[(df.combined_text.map(len) > 600)]
df.reset_index(inplace=True, drop=True)
#df1 = df[(df.combined_text.map(len) > 7500)]
stop_words = []
f = open('stopwords.txt', 'r')
for l in f.readlines():
    stop_words.append(l.replace('\n', ''))
additional_stop_words = ['t','aah','aap','don','doesn','isn','ve','ll','add', 'ndash','will','nan','q','article','lsquo','rsquo','ldquo','rdquo','personalised','please','read','download','app','here','more','experience','based','explore','bull','fact','myth','ndash','middot','lifestage','entire','collection','articles','reading','website','android','phone','a','zero']
stop_words += additional_stop_words
stop_words = list(filter(None, stop_words))
#print(len(stop_words))
def _removeNonAscii(s):
    return "".join(i for i in s if ord(i) < 128)

def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = text.replace('(ap)', '')
    text = re.sub(r"\'s", " is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r"\"", "", text)
    text = re.sub('[^a-zA-Z ?!]+', '', text)
    text = _removeNonAscii(text)
    text = text.strip()
    return text

def tokenizer(text):
    text = clean_text(text)
    tokens = [word_tokenize(sent) for sent in sent_tokenize(text)]
    tokens = list(functools.reduce(lambda x, y: x + y, tokens))
    tokens = list(filter(lambda token: token not in (stop_words + list(punctuation)), tokens))
    return tokens
#df['combined_text'] = df['combined_text'].map(lambda d: str.encode(d.decode('utf-8')))
df['tokens'] = ''
df['tokens'] = df['combined_text'].progress_map(lambda d: tokenizer(d))
df['text_stemmed']=df['tokens'].apply(lambda x : [stemmer.stem(y) for y in x])
df['text_stemmed_sentence']=df['text_stemmed'].apply(lambda x : " ".join(x))
df['stemmed_tokens'] = df['text_stemmed_sentence'].progress_map(lambda d: tokenizer(d))
df = df[['id','text_stemmed_sentence','stemmed_tokens']]
# =============================================================================
# for descripition, tokens in zip(df['combined_text'].head(5), df['tokens'].head(5)):
# print('description:', descripition)
# print('tokens:', tokens)
# print()
#
# =============================================================================
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=5, analyzer='word', ngram_range=(1, 2), stop_words='english')
vz = vectorizer.fit_transform(list(df['stemmed_tokens'].map(lambda tokens: ' '.join(tokens))))
cosine_similarities = linear_kernel(vz,vz)
articlesRecommend = pd.DataFrame(cosine_similarities, columns = df.id, index = df.id)
y = np.array([articlesRecommend[c].nlargest(10).index.values for c in articlesRecommend])
articles_df = pd.DataFrame(data = y, index = articlesRecommend.columns)
The complete answer to this question would be lengthy, but I can wrap it up simply as follows:
First, make a virtualenv and install Django. You will also need to install all the Python packages you used in your Python program, like pandas etc.
Run django-admin startproject <project_name>. Next, run django-admin startapp <app_name>; this creates an app inside the Django project, since a Django project can contain many apps.
Open source/source/settings.py and add the name of your app to the INSTALLED_APPS list.
You will need to put the same code in views.py, but there should be at least one function with a request argument which achieves the same task.
Something like this:
import pandas  # and import other libs

def some_func(request):
    ## your code
Next, you will have to map this function to a URL in urls.py; that is something you can find here: mapping the urls to functions in views.py
Of course, you will have to run the server using python manage.py runserver; you'll be able to reach your project at 127.0.0.1:8000.
Honestly, if you understand the basic architecture of Django, this is a very easy task to do. This documentation can be of help to you.
Coming to the crux of your question:
You explained that you'll be suggesting the most related articles on the basis of an already existing article. The source of the data from your Laravel project should send the data in JSON format; you can read that data in your views.py functions. Once you have read the data and run your already working code, you should be able to send the most related articles' information (their ids, for example) back through some URL. For this purpose you can either use Django's REST framework or simply return a JsonResponse from your function.
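A rough sketch of the view-plus-URL part described above (the app name recommender, the view name related_articles and the /related/ URL are placeholders, not from the question; path() is the Django 2+ style, older versions use url()):
# recommender/views.py
from django.http import JsonResponse

def related_articles(request):
    article_id = request.GET.get('id')   # e.g. GET /related/?id=42
    # Run (or import) the already-working pandas/TF-IDF code here and look up
    # the most similar ids for article_id from articles_df.
    related_ids = []                      # placeholder result
    return JsonResponse({'article': article_id, 'related': related_ids})

# project/urls.py
from django.urls import path
from recommender import views

urlpatterns = [
    path('related/', views.related_articles),
]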

Replace whitespace with hyphen, then create URL

I'm trying to speed up a web-scraping process by sending raw data to Python in lieu of correctly formatted data.
The data is currently received as an Excel file, with records formatted as:
26 EXAMPLE RD EXAMPLEVILLE SA 5000
The data is formatted in Excel via macros to:
Replace all spaces with hyphen
Change all text to lower-case
Paste text onto end of http://example.com/property/
The formatted data is http://www.example.com/property/26-example-rd-exampleville-sa-5000
What I'm trying to accomplish:
Get Python to go into the Excel sheet, follow the formatting rules listed above, and then pass the records to the scraper.
Here is the code I have been trying to compile - please go easy, I am VERY new.
Any advice or reading sources related to Python formatting would be appreciated.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import csv
from lxml import html
import xlrd
# URL_BUILDER
# Source File for UNFORMATTED DATA
file_location = "C:\Python27\Projects\REA_SCRAPER\NewScraper\ScrapeFile.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_name('((PythonScraperDNC))')
# REA_SCRAPER
# Pass Data from URL_BUILDER to URL_LIST []
URL_LIST = []
# Search Phrase to capture suitable URL's for Scraping
text2search = \
'''<p class="property-value__title">
RECENTLY SOLD
</p>'''
# Write Sales .CSV file
with open('Results.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for (index, url) in enumerate(URL_LIST):
        page = requests.get(url)
        print '<Scanning Url For Sale>'
        if text2search in page.text:
            tree = html.fromstring(page.content)
            (title, ) = (x.text_content() for x in tree.xpath('//title'))
            (price, ) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold, ) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])
        else:
            writer.writerow(['No Sale'])
If you're just trying to figure out how to do the formatting in Python:
text = '26 EXAMPLE RD EXAMPLEVILLE SA 5000'
url = 'http://example.com/property/' + text.replace(' ', '-').lower()
print(url)
# Output:
# http://example.com/property/26-example-rd-exampleville-sa-5000
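To tie this into the scraper from the question, the same one-liner can be applied while reading the spreadsheet that was already opened with xlrd; a sketch, assuming the raw addresses sit in the first column (column 0) of that sheet:
# Build URL_LIST from the sheet opened above with xlrd
URL_LIST = []
for row_idx in range(sheet.nrows):
    raw = str(sheet.cell_value(row_idx, 0)).strip()   # raw address, e.g. "26 EXAMPLE RD ..."
    if raw:
        URL_LIST.append('http://www.example.com/property/'
                        + raw.replace(' ', '-').lower())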

Google Dataflow seems to drop 1000th record

I have set up a small test using Google Dataflow (apache-beam). The use case for the experiment is to take a (csv) file and write a selected column to a (txt) file.
The code for the experiment is listed below:
from __future__ import absolute_import
import argparse
import logging
import re
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class EmitColDoFn(beam.DoFn):
    first = True
    header = ""

    def __init__(self, i):
        super(EmitColDoFn, self).__init__()
        self.line_count = Metrics.counter(self.__class__, 'lines')
        self.i = i

    def process(self, element):
        if self.first:
            self.header = element
            self.first = False
        else:
            self.line_count.inc()
            cols = re.split(',', element)
            return (cols[self.i],)

def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='/users/sms/python_beam/data/MOCK_DATA (4).csv',
                        # default='gs://dataflow-samples/shakespeare/kinglear.txt',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        default="/users/sms/python_beam/data/",
                        # required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    column = (lines
              | 'email col' >> (beam.ParDo(EmitColDoFn(3)))
              | "col file" >> WriteToText(known_args.output, ".txt", shard_name_template="SS_Col"))

    result = p.run()
    result.wait_until_finish()

    if (not hasattr(result, 'has_job')    # direct runner
            or result.has_job):           # not just a template creation
        lines_filter = MetricsFilter().with_name('lines')
        query_result = result.metrics().query(lines_filter)
        if query_result['counters']:
            lines_counter = query_result['counters'][0]
            print "Lines committed", lines_counter.committed

run()
The last few lines of sample 1 are shown below:
990,Corabel,Feldbau,cfeldbaurh@deliciousdays.com,Female,84.102.162.190,DJ
991,Kiley,Rottcher,krottcherri@stanford.edu,Male,91.97.155.28,CA
992,Glenda,Clist,gclistrj@state.gov,Female,24.98.253.127,UA
993,Ingunna,Maher,imaherrk@army.mil,Female,159.31.127.19,PL
994,Megan,Giacopetti,mgiacopettirl@instagram.com,Female,115.6.63.52,RU
995,Briny,Dutnall,bdutnallrm@xrea.com,Female,102.81.33.24,SE
996,Jan,Caddan,jcaddanrn@jalbum.net,Female,115.142.222.106,PL
Running this produces the expected output of:
/usr/local/bin/python2.7
/Users/sms/Library/Preferences/PyCharmCE2017.1/scratches/scratch_4.py
No handlers could be found for logger "oauth2client.contrib.multistore_file"
Lines committed 996
Process finished with exit code 0
Now for the strange results. In the next run, the number of lines is increased to 1000.
994,Megan,Giacopetti,mgiacopettirl@instagram.com,Female,115.6.63.52,RU
995,Briny,Dutnall,bdutnallrm@xrea.com,Female,102.81.33.24,SE
996,Jan,Caddan,jcaddanrn@jalbum.net,Female,115.142.222.106,PL
997,Shannen,Gaisford,sgaisfordr7@rediff.com,Female,167.255.222.92,RU
998,Lorianna,Slyne,lslyner8@cbc.ca,Female,54.169.60.13,CN
999,Franklin,Yaakov,fyaakovr9@latimes.com,Male,122.1.92.236,CN
1000,Wilhelmine,Cariss,wcarissra@creativecommons.org,Female,237.48.113.255,PL
But this time the output is:
/usr/local/bin/python2.7
/Users/sms/Library/Preferences/PyCharmCE2017.1/scratches/scratch_4.py
No handlers could be found for logger "oauth2client.contrib.multistore_file"
Lines committed 999
Process finished with exit code 0
Inspection of the output file shows that the last line was NOT processed.
bdutnallrm@xrea.com
jcaddanrn@jalbum.net
sgaisfordr7@rediff.com
lslyner8@cbc.ca
fyaakovr9@latimes.com
Any ideas what is going on here?
EmitColDoFn skips the first line, assuming there is one instance of it for each file. When you have more than 1000 lines, the DirectRunner creates two bundles: 1000 lines in the first one, and 1 line in the second. In a Beam application, the input might be split into multiple bundles for processing in parallel, and there is no correlation between the number of files and the number of bundles. The same application can process terabytes of data spread across many files.
ReadFromText has an option skip_header_lines, which you can set to 1 in order to skip the header line in each of your input files.
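Concretely, that suggestion amounts to the sketch below (based on the pipeline in the question): drop the first/header bookkeeping from the DoFn and let the source skip the header instead.
class EmitColDoFn(beam.DoFn):
    def __init__(self, i):
        super(EmitColDoFn, self).__init__()
        self.line_count = Metrics.counter(self.__class__, 'lines')
        self.i = i

    def process(self, element):
        # every element is now a data row, so count and emit unconditionally
        self.line_count.inc()
        yield element.split(',')[self.i]

# the header is dropped once per input file at the source
lines = p | 'read' >> ReadFromText(known_args.input, skip_header_lines=1)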

Python web scraper using BeautifulSoup 4

I wanted to create a database of commonly used words. Right now, when I run this script it works fine, but my biggest issue is that I need all of the words to be in one column. I feel like what I did was more of a hack than a real fix. Using BeautifulSoup, can you print everything in one column without having extra blank lines?
import requests
import re
from bs4 import BeautifulSoup
#Website you want to scrap info from
res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
# Getting just the content using bs4
soup = BeautifulSoup(res.content, "lxml")
# Creating the CSV file
commonFile = open('common_words.csv', 'wb')
# Grabbing the lines you want
for node in soup.findAll("tr"):
    # Getting just the text and removing the html
    words = ''.join(node.findAll(text=True))
    # Removing the extra lines
    ID = re.sub(r'[\t\r\n]', '', words)
    # Needed to add a break in the line to make the rows
    update = ''.join(ID) + '\n'
    # Now we add this to the file
    commonFile.write(update)
commonFile.close()
How about this?
import requests
import csv
from bs4 import BeautifulSoup
f = csv.writer(open("common_words.csv", "w"))
f.writerow(["common_words"])
#Website you want to scrap info from
res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
# Getting just the content using bs4
soup = BeautifulSoup(res.content, "lxml")
words = soup.select('div[class=file] tr')
for i in range(len(words)):
    word = words[i].text
    f.writerow([word.replace('\n', '')])
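If blank rows still slip through (some table rows in the GitHub page markup contain no word), a small variation of the same approach filters them out before writing; this sketch assumes Python 3 and the same page structure the code above relies on:
import csv
import requests
from bs4 import BeautifulSoup

res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
soup = BeautifulSoup(res.content, "lxml")

# newline="" stops the csv module from adding extra blank lines on Windows
with open("common_words.csv", "w", newline="") as fh:
    writer = csv.writer(fh)
    writer.writerow(["common_words"])
    for row in soup.select("div[class=file] tr"):   # same selector as above
        word = row.get_text(strip=True)
        if word:                                    # skip rows with no text
            writer.writerow([word])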

How to look up the numbers next to a character using Python

This is just part of a long Python script. There is a file called aqfile and it has many parameters. I would like to extract what is next to "OWNER" and "NS".
Note:
OWNER = text
NS = numbers
I could extract what is next to OWNER, because it is just text:
for line in aqfile.readlines():
    if string.find(line, "OWNER") > 0:
        print line
        m = re.search('<(.*)>', line)
        owner = incorp(m.group(1))
        break
But when I try to modify the script to extract the numbers:
for line in aqfile.readlines():
    if string.find(line, "NS") > 0:
        print line
        m = re.search('<(.*)>', line)
        ns = incorp(m.group(1))
        break
it doesn't work anymore.
Can anyone help me?
This is the whole script:
# Make a CSV file of dataset names, pulseprog and, if available, (part of) the title
# Note: the whole file tree is read into memory!!! Do not start too high in the tree!!!
import os
import os.path
import fnmatch
import re
import string
max=20000
outfiledesc=0
def incorp(c):
    # Replace " with """ and CR/LF with blanks
    c = c.replace('"', '"""')
    c = c.replace("\r", " ")
    c = c.replace("\n", " ")
    return "\"%s\"" % (c)

def process(arg, root, files):
    global max
    global outfiledesc
    # Get name, expno, procno from the root
    if "proc" in files:
        procno = incorp(os.path.basename(root))
        oneup = os.path.dirname(root)
        oneup = os.path.dirname(oneup)
        aqdir = oneup
        expno = incorp(os.path.basename(oneup))
        oneup = os.path.dirname(oneup)
        dsname = incorp(os.path.basename(oneup))
        # Read the title file, if any
        if (os.path.isfile(root + "/title")):
            f = open(root + "/title", "r")
            title = incorp(f.read(max))
            f.close()
        else:
            title = ""
        # Grab the pulse program name from the acqus parameter
        aqfile = open(aqdir + "/acqus")
        for line in aqfile.readlines():
            if string.find(line, "PULPROG") > 0:
                print line
                m = re.search('<(.*)>', line)
                pulprog = incorp(m.group(1))
                break
        towrite = "%s;%s;%s;%s;%s\n" % (dsname, expno, procno, pulprog, title)
        outfiledesc.write(towrite)
#Main program
dialogline1="Starting point of the search"
dialogline2="Maximum length of the title"
dialogline3="output CSV file"
def1="/opt/topspin3.2/data/nmrafd/nmr"
def2="20000"
def3="/home/nmrafd/filelist.csv"
result = INPUT_DIALOG("CSV file creator","Create a CSV list",[dialogline1,dialogline2,dialogline3],[def1,def2,def3])
start=result[0]
tlength=int(result[1])
outfile=result[2]
#Search for procs files. They should be in any dataset.
outfiledesc = open(outfile,"w")
print start
os.path.walk(start,process,"")
outfiledesc.close()