Django function for views takes too long - django

I'm currently using a Docker & Django setup. I have to fill a database with data from API requests. I was hoping to do this everytime you went on a certain page (pretty easy: just have your views.py call the function that fills the database and voila).
But the problem is, the function takes a long time, several minutes from within django (and about half the time with Spyder).
So I usually just get a TimeOut and the page never loads (I admit I have a lot of API requests being made).
I've read some stuff on using Celery but am not quite sure how it's supposed to work.
Anyone know how I could get around this to be able to load the database?
Edit: some code
Views.py
def index(request):
fill_db()
context = {}
context['segment'] = 'index'
html_template = loader.get_template( 'index.html' )
return HttpResponse(html_template.render(context, request))
fill_db function
def fill_db():
fill_agencies()
fill_companies()
fill_contracts()
fill_orders()
fill_projects()
fill_resources()
Example of a fill function:
r = pip._vendor.requests.get(BASE_URL+EXTENSION,auth=(USER,PASS))
data0 = json.loads(r.text)
conn = sqlite3.connect('/app/database.sqlite3')
c = conn.cursor()
for client in data0['data']:
BoondID = client['id']
name = client['attributes']['name']
expertiseArea = client['attributes']['expertiseArea']
town = client['attributes']['town']
country = client['attributes']['country']
mainManager = client['relationships']['mainManager']['data']['id']
values = (BoondID, name, expertiseArea, town, country, mainManager)
c.execute("INSERT OR REPLACE INTO COMPANIES (BoondID,name,expertiseArea,town,country,mainManager) VALUES (?,?,?,?,?,?);", values)
conn.commit()
conn.close()

Solved.
I used python's threading library.
I defined
agencies_thread = threading.Thread(target=fill_agencies, name="Database Updater")
and called agencies_thread.start() inside my views function.
This works fine.

Related

Wagtail add functions to models.py

i'm trying to make a custom plotly-graphic on a wagtail homepage.
I got this far. I'm overriding the wagtail Page-model by altering the context returned to the template. Am i doing this the right way, is this possible in models.py ?
Thnx in advanced.
from django.db import models
from wagtail.models import Page
from wagtail.fields import RichTextField
from wagtail.admin.panels import FieldPanel
import psycopg2
from psycopg2 import sql
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import plot
class CasPage(Page):
body = RichTextField(blank=True)
content_panels = Page.content_panels + [
FieldPanel('body'),
]
def get_connection(self):
try:
return psycopg2.connect(
database="xxxx",
user="xxxx",
password="xxxx",
host="xxxxxxxxxxxxx",
port=xxxxx,
)
except:
return False
conn = get_connection()
cursor = conn.cursor()
strquery = (f'''SELECT t.datum, t.grwaarde - LAG(t.grwaarde,1) OVER (ORDER BY datum) AS
gebruiktgas
FROM XXX
''')
data = pd.read_sql(strquery, conn)
fig1 = go.Figure(
data = data,
layout=go.Layout(
title="Gas-verbruik",
yaxis_title="aantal M3")
)
output = plotly.plot(fig1, output_type='div', include_plotlyjs=False)
# https://stackoverflow.com/questions/32626815/wagtail-views-extra-context
def get_context(self, request):
context = super(CasPage, self).get_context(request)
context['output'] = output
return context
Kind of the right track. You should move all the plot code into its own method though. At the moment, it runs the plot code when the site initialises then stays stored in memory.
There's three usual ways to get the plot to the rendered page then.
As you've done with context
As a property or method of the page class
As a template tag called from the template
The first two have more or less the same effect, except the 2nd makes the property available anywhere, not just the template. The context method runs before the page starts rendering, the other two happen during that process. I guess the only real difference there is that if you're using template caching, the context will always run each time the page is loaded, the other two only run when the cache is invalid, or if the code is escaped out of the cache (for fragment caching).
To call the plot as a property of your page class, you'd just pull out the code into a def with the #property decorator:
class CasPage(Page):
....
#property
def plot(self):
try:
conn = psycopg2.connect(
database="xxxx",
user="xxxx",
password="xxxx",
host="xxxxxxxxxxxxx",
port=xxxxx,
)
cursor = conn.cursor()
strquery = (f'''SELECT t.datum, t.grwaarde - LAG(t.grwaarde,1) OVER (ORDER BY datum) AS
gebruiktgas FROM XXX''')
data = pd.read_sql(strquery, conn)
fig1 = go.Figure(
data = data,
layout=go.Layout(
title="Gas-verbruik",
yaxis_title="aantal M3")
)
return plotly.plot(fig1, output_type='div', include_plotlyjs=False)
except Exception as e:
print(f"{type(e).__name__} at line {e.__traceback__.tb_lineno} of {__file__}: {e}")
return None
^ I haven't tried this code ... it should work as is, but no guarantees I didn't make a typo ;)
Now you can access your plot with {{ self.plot }} in the template.
If you want to stick with context, then you'd stay with the def above but just amend your output line to
context['output'] = self.plot
Template tags are more useful when they're being used in StructBlocks and not part of a page class like this, or where you have code that you want to re-use in multiple templates.
Then you'd move all that plot code into a template tag file, register it and call it in the template with {% plot %}. Wagtail template tags work the same as Django: https://docs.djangoproject.com/en/4.1/howto/custom-template-tags/
Is the plot data outside of the site database? If not, you could probably get the data via the ORM if it was defined as a model. If so, it's probably worth writing a view (or stored procedure if you want to pass parameters) on the db server and calling that rather than hard coding the SQL into your python.
The other consideration is the page load time - if the dataset is big, this could take a while and prevent the page from loading. You'd probably want a front-end solution in that case.

Django asyncio for saving (big amount of) objects - Nothing saved

I want to fetch categories from a Magento API and display them in a template. In the same time, I want to save them in DB for an ulterior use.
Categories are too many and the render of the template takes more than 30 sec.
I start to learn using asyncio but couldn't get my way with it. I surely missed something.
First, my URL leads to the function that retrieves the categories
#login_required
def get_categories(request):
Category.objects.all().delete()
try:
cats = fetch_categories()
tree = cats['children_data']
except:
print('erreur : impossible de récupérer les catégories (fetch_categories)')
asyncio.run(parse_categories(tree))
return render(request, 'categories/categories_list.html', {'tree': tree})
When I get the "categories tree", I send it to
async def parse_categories(tree):
for lvl1 in tree:
all_tasks = []
asyncio.create_task(save_cat(lvl1))
# main products categories (turbo, injectors ...)
for lvl2 in lvl1['children_data']:
asyncio.create_task(save_cat(lvl2))
# sub categories like RENAULT, DACIA
for lvl3 in lvl2['children_data']:
asyncio.create_task(save_cat(lvl3))
for lvl4 in lvl3['children_data']:
asyncio.create_task(save_cat(lvl4))
for lvl5 in lvl4['children_data']:
asyncio.create_task(save_cat(lvl5))
My save() function is async. I'm not sure it should be. Before I started using async, it was working.
async def save_cat(cat):
cat_id = cat['id']
new_cat = Category()
new_cat.id = cat_id
new_cat.name = cat.get('name', None)
new_cat.parent = cat.get('parent_id', None)
new_cat.url = cat.get('path', None)
new_cat.is_active = cat.get('is_active', None)
new_cat.position = cat.get('position', None)
new_cat.level = cat.get('level', None)
new_cat.save()
When I run, no error. The context is well sent to the template and displays well. But no category is saved.
I also tried to make a task list with asyncio.create_task in each level and execute the loop at the end of parse_categories() like said in this thread, without success.
all_tasks.append(asyncio.create_task(save_cat(lvl1)))
[...]
responses = asyncio.gather(*all_tasks, return_exceptions=True)
loop = asyncio.get_event_loop()
loop.run_until_complete(responses)
loop.close()
Any clue to solve my case will be welcome

Python scrapy working (only half of the time)

I created a python scrapy project to extract the prices of some google flights.
I configured the middleware to use PhantomJS instead of a normal browser.
class JSMiddleware(object):
def process_request(self, request, spider):
driver = webdriver.PhantomJS()
try:
driver.get(request.url)
time.sleep(1.5)
except e:
raise ValueError("request url failed - \n url: {},\n error:
{}").format(request.url, e)
body = driver.page_source
#encoding='utf-8' - add to html response if necessary
return HtmlResponse(driver.current_url, body=body,encoding='utf-8',
request=request)
In the settings.py i added:
DOWNLOADER_MIDDLEWARES = {
# key path intermediate class, order value of middleware
'scraper_module.middlewares.middleware.JSMiddleware' : 543 ,
# prohibit the built-in middleware
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware' : None , } `
I also created the following spider class:
import scrapy
from scrapy import Selector
class Gspider(scrapy.Spider):
name = "google_spider"
def __init__(self):
self.start_urls = ["https://www.google.pt/flights/#search;f=LIS;t=POR;d=2017-06-18;r=2017-06-22"]
self.prices = []
self.links = []
def clean_price(self, part):
#part received as a list
#the encoding is utf-8
part = part[0]
part = part.encode('utf-8')
part = filter(str.isdigit, part)
return part
def clean_link(self, part):
part = part[0]
part = part.encode('utf-8')
return part
def get_part(self, var_holder, response, marker, inner_marker, amount = 1):
selector = Selector(response)
divs = selector.css(marker)
for n, div in enumerate(divs):
if n < amount:
part = div.css(inner_marker).extract()
if inner_marker == '::text':
part = self.clean_price(part)
else:
part = self.clean_link(part)
var_holder.append(part)
else:
break
return var_holder
def parse(self, response):
prices, links = [], []
prices = self.get_part(prices, response, 'div.OMOBOQD-d-Ab', '::text')
print prices
links = self.get_part(links, response, 'a.OMOBOQD-d-X', 'a::attr(href)')
print links
The problem is, I run the code in the shell, and around half of the times I successfully get the prices and links requested, but another half of the time, the final vectors which should contain the extracted data, are empty.
I do not get any errors during execution.
Does anyone have any idea about why this is happening?
here are the logs from the command line:
Google has a very strict policy in terms of crawling. (Pretty hypocritical when you know that they constently crawl all the web...)
You should either find an API, as said previously in the comments or maybe use proxies. An easy way is to use Crawlera. It manages thousands of proxies so you don't have to bother. I personnaly use it to crawl google and it works perfectly. The downside is that it is not free.

get() in Google Datastore doesn't work as intended

I'm building a basic blog from the Web Development course by Steve Hoffman on Udacity. This is my code -
import os
import webapp2
import jinja2
from google.appengine.ext import db
template_dir = os.path.join(os.path.dirname(__file__), 'templates')
jinja_env = jinja2.Environment(loader = jinja2.FileSystemLoader(template_dir), autoescape = True)
def datetimeformat(value, format='%H:%M / %d-%m-%Y'):
return value.strftime(format)
jinja_env.filters['datetimeformat'] = datetimeformat
def render_str(template, **params):
t = jinja_env.get_template(template)
return t.render(params)
class Entries(db.Model):
title = db.StringProperty(required = True)
body = db.TextProperty(required = True)
created = db.DateTimeProperty(auto_now_add = True)
class MainPage(webapp2.RequestHandler):
def get(self):
entries = db.GqlQuery('select * from Entries order by created desc limit 10')
self.response.write(render_str('mainpage.html', entries=entries))
class NewPost(webapp2.RequestHandler):
def get(self):
self.response.write(render_str('newpost.html', error=""))
def post(self):
title = self.request.get('title')
body = self.request.get('body')
if title and body:
e = Entries(title=title, body=body)
length = db.GqlQuery('select * from Entries order by created desc').count()
e.put()
self.redirect('/newpost/' + str(length+1))
else:
self.response.write(render_str('newpost.html', error="Please type in a title and some content"))
class Permalink(webapp2.RequestHandler):
def get(self, id):
e = db.GqlQuery('select * from Entries order by created desc').get()
self.response.write(render_str('permalink.html', id=id, entry = e))
app = webapp2.WSGIApplication([('/', MainPage),
('/newpost', NewPost),
('/newpost/(\d+)', Permalink)
], debug=True)
In the class Permalink, I'm using the get() method on the query than returns all records in the descending order of creation. So, it should return the most recently added record. But when I try to add a new record, permalink.html (it's just a page with shows the title, the body and the date of creation of the new entry) shows the SECOND most recently added. For example, I already had three records, so when I added a fourth record, instead of showing the details of the fourth record, permalink.html showed me the details of the third record. Am I doing something wrong?
I don't think my question is a duplicate of this - Read delay in App Engine Datastore after put(). That question is about read delay of put(), while I'm using get(). The accepted answer also states that get() doesn't cause any delay.
This is because of eventual consistency used by default for GQL queries.
You need to read:
https://cloud.google.com/appengine/docs/python/datastore/data-consistency
https://cloud.google.com/appengine/docs/python/datastore/structuring_for_strong_consistency
https://cloud.google.com/datastore/docs/articles/balancing-strong-and-eventual-consistency-with-google-cloud-datastore/
search & read on SO and other source about strong & eventual consistency in Google Cloud Datastore.
You can specify read_policy=STRONG_CONSISTENCY for your query but it has associated costs that you should be aware of and take into account.

mongoengine know when to delete document

New to django. I'm doing my best to implement CRUD using Django, mongodb, and mongoengine. I'm able to query the database and render my page with the correct information from the database. I'm also able to change some document fields using javascript and do an Ajax POST back to the original Django View class with the correct csrf token.
The data payload I'm sending back and forth is a list of each Document Model (VirtualPageModel) serialized to json (each element contains ObjectId string along with the other specific fields from the Model.)
This is where it starts getting murky. In order to update the original document in my View Class post function I do an additional query using the object id and loop through the dictionary items, setting the respective fields each time. I then call save and any new data is pushed to the Mongo collection correctly.
I'm not sure if what I'm doing to update existing documents is correct or in the spirit of django's abstracted database operations. The deeper I get the more I feel like I'm not using some fundamental facility earlier on (provided by either django or mongoengine) and because of this I'm having to make things up further downstream.
The way my code is now I would not be able to create a new document (although that's easy enough to fix). However what I'm really curious about is how I would know when to delete a document which existed in the initial query, but was removed by the user/javascript code? Am I overthinking things and the contents of my POST should contain a list of ObjectIds to delete (sounds like a security risk although this would be an internal tool.)
I was assuming that my View Class might maintain either the original document objects (or simply ObjectIds) it queried and I could do my comparisions off of that set, but I can't seem to get that information to persist (as a class variable in VolumeSplitterView) from its inception to when I received the POST at the end.
I would appreciate if anyone could take a look at my code. It really seems like the "ease of use" facilities of Django start to break when paired with Mongo and/or a sufficiently complex Model schema which needs to be directly available to javascript as opposed to simple Forms.
I was going to use this dev work to become django battle-hardened in order to tackle a future app which will be much more complicated and important. I can hack on this thing all day and make it functional, but what I'm really interested in is anyone's experience in using Django + MongoDB + MongoEngine to implement CRUD on a Database Schema which is not vary Form-centric (think more nested metadata).
Thanks.
model.py: uses mongoengine Field types.
class MongoEncoder(JSONEncoder):
def default(self, o):
if isinstance(o, VirtualPageModel):
data_dict = (o.to_mongo()).to_dict()
if isinstance(data_dict.get('_id'), ObjectId):
data_dict.update({'_id': str(data_dict.get('_id'))})
return data_dict
else:
return JSONEncoder.default(self, o)
class SubTypeModel(EmbeddedDocument):
filename = StringField(max_length=200, required=True)
page_num = IntField(required=True)
class VirtualPageModel(Document):
volume = StringField(max_length=200, required=True)
start_physical_page_num = IntField()
physical_pages = ListField(EmbeddedDocumentField(SubTypeModel),
default=list)
error_msg = ListField(StringField(),
default=list)
def save(self, *args, **kwargs):
print('In save: {}'.format(kwargs))
for k, v in kwargs.items():
if k == 'physical_pages':
self.physical_pages = []
for a_page in v:
tmp_pp = SubTypeModel()
for p_k, p_v in a_page.items():
setattr(tmp_pp, p_k, p_v)
self.physical_pages.append(tmp_pp)
else:
setattr(self, k, v)
return super(VirtualPageModel, self).save(*args, **kwargs)
views.py: My attempt at a view
class VolumeSplitterView(View):
#initial = {'key': 'value'}
template_name = 'click_model/index.html'
vol = None
start = 0
end = 20
def get(self, request, *args, **kwargs):
self.vol = self.kwargs.get('vol', None)
records = self.get_records()
records = records[self.start:self.end]
vp_json_list = []
img_filepaths = []
for vp in records:
vp_json = json.dumps(vp, cls=MongoEncoder)
vp_json_list.append(vp_json)
for pp in vp.physical_pages:
filepath = get_file_path(vp, pp.filename)
img_filepaths.append(filepath)
data_dict = {
'img_filepaths': img_filepaths,
'vp_json_list': vp_json_list
}
return render_to_response(self.template_name,
{'data_dict': data_dict},
RequestContext(request))
def get_records(self):
return VirtualPageModel.objects(volume=self.vol)
def post(self, request, *args, **kwargs):
if request.is_ajax:
vp_dict_list = json.loads(request.POST.get('data', []))
for vp_dict in vp_dict_list:
o_id = vp_dict.pop('_id')
original_doc = VirtualPageModel.objects.get(id=o_id)
try:
original_doc.save(**vp_dict)
except Exception:
print(traceback.format_exc())