get() in Google Datastore doesn't work as intended - python-2.7

I'm building a basic blog from the Web Development course by Steve Huffman on Udacity. This is my code -
import os
import webapp2
import jinja2
from google.appengine.ext import db

template_dir = os.path.join(os.path.dirname(__file__), 'templates')
jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir),
                               autoescape=True)

def datetimeformat(value, format='%H:%M / %d-%m-%Y'):
    return value.strftime(format)

jinja_env.filters['datetimeformat'] = datetimeformat

def render_str(template, **params):
    t = jinja_env.get_template(template)
    return t.render(params)

class Entries(db.Model):
    title = db.StringProperty(required=True)
    body = db.TextProperty(required=True)
    created = db.DateTimeProperty(auto_now_add=True)

class MainPage(webapp2.RequestHandler):
    def get(self):
        entries = db.GqlQuery('select * from Entries order by created desc limit 10')
        self.response.write(render_str('mainpage.html', entries=entries))

class NewPost(webapp2.RequestHandler):
    def get(self):
        self.response.write(render_str('newpost.html', error=""))

    def post(self):
        title = self.request.get('title')
        body = self.request.get('body')
        if title and body:
            e = Entries(title=title, body=body)
            length = db.GqlQuery('select * from Entries order by created desc').count()
            e.put()
            self.redirect('/newpost/' + str(length + 1))
        else:
            self.response.write(render_str('newpost.html',
                                           error="Please type in a title and some content"))

class Permalink(webapp2.RequestHandler):
    def get(self, id):
        e = db.GqlQuery('select * from Entries order by created desc').get()
        self.response.write(render_str('permalink.html', id=id, entry=e))

app = webapp2.WSGIApplication([('/', MainPage),
                               ('/newpost', NewPost),
                               ('/newpost/(\d+)', Permalink)
                               ], debug=True)
In the class Permalink, I'm using the get() method on the query that returns all records in descending order of creation, so it should return the most recently added record. But when I add a new record, permalink.html (just a page that shows the title, the body, and the creation date of the new entry) shows the SECOND most recently added one. For example, I already had three records; when I added a fourth, instead of showing the details of the fourth record, permalink.html showed me the details of the third. Am I doing something wrong?
I don't think my question is a duplicate of this - Read delay in App Engine Datastore after put(). That question is about read delay of put(), while I'm using get(). The accepted answer also states that get() doesn't cause any delay.

This is because of eventual consistency used by default for GQL queries.
You need to read:
https://cloud.google.com/appengine/docs/python/datastore/data-consistency
https://cloud.google.com/appengine/docs/python/datastore/structuring_for_strong_consistency
https://cloud.google.com/datastore/docs/articles/balancing-strong-and-eventual-consistency-with-google-cloud-datastore/
Also search for and read other material on SO and elsewhere about strong & eventual consistency in Google Cloud Datastore.
You can specify read_policy=STRONG_CONSISTENCY for your query but it has associated costs that you should be aware of and take into account.
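For the blog above, one common way to get a strongly consistent read with the old db API is an ancestor query: store every Entries entity under a shared parent key and filter on that ancestor. A minimal sketch, where the 'Blog'/'default' parent key is my own invention, not something from the question:
# Sketch only: all entries share one parent key, so ancestor queries
# against that key are strongly consistent.
blog_key = db.Key.from_path('Blog', 'default')

# When creating, put the entity in that entity group:
#   Entries(parent=blog_key, title=title, body=body).put()
# Ancestor queries then reflect the latest writes in the group:
latest = db.GqlQuery("SELECT * FROM Entries WHERE ANCESTOR IS :1 "
                     "ORDER BY created DESC", blog_key).get()
A simpler fix for the redirect itself is to use the id of the entity you just wrote (self.redirect('/newpost/' + str(e.key().id()))) and fetch it by key in Permalink, since gets by key are strongly consistent. Note also that a single entity group is limited to roughly one write per second, which is fine for a small blog.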

Django function for views takes too long

I'm currently using a Docker & Django setup. I have to fill a database with data from API requests, and I was hoping to do this every time someone went to a certain page (pretty easy: just have your views.py call the function that fills the database, and voilà).
But the problem is, the function takes a long time: several minutes from within Django (and about half that time with Spyder).
So I usually just get a timeout and the page never loads (I admit I make a lot of API requests).
I've read some stuff on using Celery but am not quite sure how it's supposed to work.
Does anyone know how I could get around this to be able to load the database?
Edit: some code
Views.py
def index(request):
    fill_db()
    context = {}
    context['segment'] = 'index'
    html_template = loader.get_template('index.html')
    return HttpResponse(html_template.render(context, request))
fill_db function
def fill_db():
    fill_agencies()
    fill_companies()
    fill_contracts()
    fill_orders()
    fill_projects()
    fill_resources()
Example of a fill function:
r = pip._vendor.requests.get(BASE_URL + EXTENSION, auth=(USER, PASS))
data0 = json.loads(r.text)
conn = sqlite3.connect('/app/database.sqlite3')
c = conn.cursor()
for client in data0['data']:
    BoondID = client['id']
    name = client['attributes']['name']
    expertiseArea = client['attributes']['expertiseArea']
    town = client['attributes']['town']
    country = client['attributes']['country']
    mainManager = client['relationships']['mainManager']['data']['id']
    values = (BoondID, name, expertiseArea, town, country, mainManager)
    c.execute("INSERT OR REPLACE INTO COMPANIES (BoondID,name,expertiseArea,town,country,mainManager) VALUES (?,?,?,?,?,?);", values)
conn.commit()
conn.close()
Solved.
I used Python's threading library.
I defined
agencies_thread = threading.Thread(target=fill_agencies, name="Database Updater")
and called agencies_thread.start() inside my view function.
This works fine.
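Putting the pieces together, a minimal sketch of that pattern (assuming fill_db is importable from views.py; other names as in the question):
import threading

from django.http import HttpResponse
from django.template import loader

def index(request):
    # Run the slow API calls in a background thread so the request
    # returns immediately instead of timing out.
    updater = threading.Thread(target=fill_db, name="Database Updater")
    updater.start()
    context = {'segment': 'index'}
    html_template = loader.get_template('index.html')
    return HttpResponse(html_template.render(context, request))
Be aware that each request spawns a new thread and nothing prevents two fills from running concurrently; for anything beyond a dev tool, a task queue such as Celery is the more robust route.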

Flask app-builder how to make REST API with file items

I'm making a REST API through which files can be uploaded, based on a model view in flask-appbuilder, like this.
But I don't know how to call the REST API (POST /File).
I tried several different ways, but none of them worked.
Let me know the correct way, or an alternative.
[client code]
file = {'file': open('test.txt', 'rb'), 'description': 'test'}
requests.post(url, headers=headers, files=file)
==> Failed
model.py
class Files(Model):
    __tablename__ = "project_files"
    id = Column(Integer, primary_key=True)
    file = Column(FileColumn, nullable=False)
    description = Column(String(150))

    def download(self):
        return Markup(
            '<a href="'
            + url_for("ProjectFilesModelView.download", filename=str(self.file))
            + '">Download</a>'
        )

    def file_name(self):
        return get_file_original_name(str(self.file))
view.py
class FileApi(ModelRestApi):
    resource_name = "File"
    datamodel = SQLAInterface(Files)
    allow_browser_login = True

appbuilder.add_api(FileApi)
FileColumn is only a string field that saves the file name in the database. The actual file is saved to config['UPLOAD_FOLDER'].
This is taken care of by flask_appbuilder.filemanager.FileManager.
Furthermore, ModelRestApi assumes that you are POSTing JSON data. In order to upload files, I followed Flask's documentation, which suggests sending a multipart/form-data request. Because of this, one needs to override ModelRestApi.post_headless().
This is my solution, where I also make sure that when a Files database row is deleted, the corresponding file is removed from the filesystem as well.
from flask_appbuilder.models.sqla.interface import SQLAInterface
from flask_appbuilder.api import ModelRestApi
from flask_appbuilder.const import API_RESULT_RES_KEY
from flask_appbuilder.filemanager import FileManager
from flask import current_app, request
from marshmallow import ValidationError
from sqlalchemy.exc import IntegrityError

from app.models import Files


class FileApi(ModelRestApi):
    resource_name = "file"
    datamodel = SQLAInterface(Files)

    def post_headless(self):
        if not request.form or not request.files:
            msg = "No data"
            current_app.logger.error(msg)
            return self.response_400(message=msg)

        file_obj = request.files.getlist('file')
        if len(file_obj) != 1:
            msg = ("More than one file provided.\n"
                   "Please upload exactly one file at a time")
            current_app.logger.error(msg)
            return self.response_422(message=msg)
        else:
            file_obj = file_obj[0]

        fm = FileManager()
        uuid_filename = fm.generate_name(file_obj.filename, file_obj)
        form = request.form.to_dict(flat=True)
        # Add the unique filename provided by FileManager, which will
        # be saved to the database. The original filename can be
        # retrieved using
        # flask_appbuilder.filemanager.get_file_original_name()
        form['file'] = uuid_filename

        try:
            item = self.add_model_schema.load(
                form,
                session=self.datamodel.session)
        except ValidationError as err:
            current_app.logger.error(err)
            return self.response_422(message=err.messages)

        # Save file to filesystem
        fm.save_file(file_obj, item.file)

        try:
            self.datamodel.add(item, raise_exception=True)
            return self.response(
                201,
                **{API_RESULT_RES_KEY: self.add_model_schema.dump(
                    item, many=False),
                   "id": self.datamodel.get_pk_value(item),
                   },
            )
        except IntegrityError as e:
            # Delete file from filesystem if the db record cannot be
            # created
            fm.delete_file(item.file)
            current_app.logger.error(e)
            return self.response_422(message=str(e.orig))

    def pre_delete(self, item):
        """
        Delete file from filesystem before removing the record from the
        database
        """
        fm = FileManager()
        current_app.logger.info(f"Deleting {item.file} from filesystem")
        fm.delete_file(item.file)
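With this override in place, the client call from the question works once the file goes in files= and the other fields go in data=. A hedged example; the URL and auth header are assumptions for a local deployment, not from the question:
import requests

with open('test.txt', 'rb') as f:
    resp = requests.post(
        'http://localhost:8080/api/v1/file/',
        headers={'Authorization': 'Bearer <access_token>'},
        files={'file': f},             # the file part of the multipart body
        data={'description': 'test'},  # plain form fields
    )
print(resp.status_code, resp.json())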
You can use this.
from app.models import Project, ProjectFiles

class DataFilesModelView(ModelView):
    datamodel = SQLAInterface(ProjectFiles)
    label_columns = {"file_name": "File Name", "download": "Download"}
    add_columns = ["file", "description", "project"]
    edit_columns = ["file", "description", "project"]
    list_columns = ["file_name", "download"]
    show_columns = ["file_name", "download"]
Lastly, add the view to the menu.
appbuilder.add_view(DataFilesModelView, "File View")

How to update existing data and create new records in a Django management command?

I am trying to store data from a JSON file. Adding the data is not a problem, but every time I run the command again, it duplicates the data and creates new copies of the same records, which I don't want. I want it to update existing records and, if there is new data in the JSON file, add it to the model.
Here is my Django management command code.
from django.core.management.base import BaseCommand
import requests

from demo.models import CoronaAge, CoronaSex, CoronaComorbidity


class Command(BaseCommand):
    def handle(self, *args, **kwargs):
        url = 'https://api.the2019ncov.com/api/fatality-rate'
        r = requests.get(url)
        titles = r.json()
        print(titles)
        # By age bracket
        for title in titles['byAge'] or []:
            CoronaAge.objects.update_or_create(
                age=title['age'],
                rate=title['rate']
            )
        context = {'titles': CoronaAge.objects.all()}
        # By sex (male and female)
        for title in titles['bySex'] or []:
            CoronaSex.objects.update_or_create(
                sex=title['sex'],
                rate=title['rate']
            )
        context = {'titles': CoronaSex.objects.all()}
        # By pre-existing condition
        for title in titles['byComorbidity'] or []:
            CoronaComorbidity.objects.update_or_create(
                condition=title['preExistingCondition'],
                rate=title['rate']
            )
        context = {'titles': CoronaComorbidity.objects.all()}
This is how I would solve it. Get a queryset of the existing data. Then, for each new entry, check whether it already exists in the db: if not, create a new object, add it to a list, and at the end run bulk_create to insert all of them in one hit. If it does exist, update the fields you want and run bulk_update at the end.
corona_ages = CoronaAge.objects.all()
new_ages = []
existing_ages = []
for title in titles['byAge'] or []:
    entry = corona_ages.filter(age=title['age']).first()
    if not entry:
        new_data = CoronaAge(**title)
        new_ages.append(new_data)
    else:
        # Update fields on the model instance; saved in bulk below
        entry.rate = title['rate']
        existing_ages.append(entry)
CoronaAge.objects.bulk_create(new_ages)
CoronaAge.objects.bulk_update(existing_ages, ['rate'])
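For what it's worth, the duplicates in the original command come from passing every field as a lookup argument to update_or_create: once rate changes, the lookup matches nothing and a new row is created. If age uniquely identifies a row, moving the mutable field into defaults makes the call idempotent; a minimal sketch:
for title in titles['byAge'] or []:
    CoronaAge.objects.update_or_create(
        age=title['age'],                  # lookup field identifying the row
        defaults={'rate': title['rate']},  # updated if found, set on create
    )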

Python scrapy working (only half of the time)

I created a Python Scrapy project to extract the prices of some Google Flights results.
I configured the middleware to use PhantomJS instead of a normal browser.
import time

from selenium import webdriver
from scrapy.http import HtmlResponse


class JSMiddleware(object):
    def process_request(self, request, spider):
        driver = webdriver.PhantomJS()
        try:
            driver.get(request.url)
            time.sleep(1.5)
        except Exception as e:
            raise ValueError("request url failed - \n url: {},\n error: "
                             "{}".format(request.url, e))
        body = driver.page_source
        # encoding='utf-8' - add to html response if necessary
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8',
                            request=request)
In settings.py I added:
DOWNLOADER_MIDDLEWARES = {
    # key: path to the middleware class, value: middleware order
    'scraper_module.middlewares.middleware.JSMiddleware': 543,
    # disable the built-in user-agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
I also created the following spider class:
import scrapy
from scrapy import Selector

class Gspider(scrapy.Spider):
    name = "google_spider"

    def __init__(self):
        self.start_urls = ["https://www.google.pt/flights/#search;f=LIS;t=POR;d=2017-06-18;r=2017-06-22"]
        self.prices = []
        self.links = []

    def clean_price(self, part):
        # part received as a list; the encoding is utf-8
        part = part[0]
        part = part.encode('utf-8')
        part = filter(str.isdigit, part)
        return part

    def clean_link(self, part):
        part = part[0]
        part = part.encode('utf-8')
        return part

    def get_part(self, var_holder, response, marker, inner_marker, amount=1):
        selector = Selector(response)
        divs = selector.css(marker)
        for n, div in enumerate(divs):
            if n < amount:
                part = div.css(inner_marker).extract()
                if inner_marker == '::text':
                    part = self.clean_price(part)
                else:
                    part = self.clean_link(part)
                var_holder.append(part)
            else:
                break
        return var_holder

    def parse(self, response):
        prices, links = [], []
        prices = self.get_part(prices, response, 'div.OMOBOQD-d-Ab', '::text')
        print prices
        links = self.get_part(links, response, 'a.OMOBOQD-d-X', 'a::attr(href)')
        print links
The problem is, when I run the code in the shell, around half of the time I successfully get the prices and links requested, but the other half of the time the final lists which should contain the extracted data are empty.
I do not get any errors during execution.
Does anyone have any idea why this is happening?
Here are the logs from the command line:
Google has a very strict policy on crawling. (Pretty hypocritical when you know that they constantly crawl the whole web...)
You should either find an API, as said previously in the comments, or maybe use proxies. An easy way is to use Crawlera. It manages thousands of proxies so you don't have to bother. I personally use it to crawl Google and it works perfectly. The downside is that it is not free.
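If you go the Crawlera route, the scrapy-crawlera plugin wires in as another downloader middleware. A sketch of the relevant settings.py entries; the setting names are from that plugin's docs, so verify them against the version you install:
# pip install scrapy-crawlera
DOWNLOADER_MIDDLEWARES = {
    'scraper_module.middlewares.middleware.JSMiddleware': 543,
    'scrapy_crawlera.CrawleraMiddleware': 610,
}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<your api key>'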

mongoengine know when to delete document

New to Django. I'm doing my best to implement CRUD using Django, MongoDB, and MongoEngine. I'm able to query the database and render my page with the correct information from the database. I'm also able to change some document fields using JavaScript and do an Ajax POST back to the original Django view class with the correct CSRF token.
The data payload I'm sending back and forth is a list of each Document Model (VirtualPageModel) serialized to json (each element contains ObjectId string along with the other specific fields from the Model.)
This is where it starts getting murky. In order to update the original document in my View Class post function I do an additional query using the object id and loop through the dictionary items, setting the respective fields each time. I then call save and any new data is pushed to the Mongo collection correctly.
I'm not sure if what I'm doing to update existing documents is correct or in the spirit of django's abstracted database operations. The deeper I get the more I feel like I'm not using some fundamental facility earlier on (provided by either django or mongoengine) and because of this I'm having to make things up further downstream.
The way my code is now, I would not be able to create a new document (although that's easy enough to fix). However, what I'm really curious about is how I would know when to delete a document which existed in the initial query but was removed by the user/JavaScript code. Am I overthinking things, and should the contents of my POST contain a list of ObjectIds to delete? (That sounds like a security risk, although this would be an internal tool.)
I was assuming that my view class might maintain either the original document objects (or simply their ObjectIds) it queried, and I could do my comparisons against that set, but I can't seem to get that information to persist (as a class variable in VolumeSplitterView) from its inception to when I receive the POST at the end.
I would appreciate if anyone could take a look at my code. It really seems like the "ease of use" facilities of Django start to break when paired with Mongo and/or a sufficiently complex Model schema which needs to be directly available to javascript as opposed to simple Forms.
I was going to use this dev work to become Django battle-hardened in order to tackle a future app which will be much more complicated and important. I can hack on this thing all day and make it functional, but what I'm really interested in is anyone's experience using Django + MongoDB + MongoEngine to implement CRUD on a database schema which is not very Form-centric (think more nested metadata).
Thanks.
model.py: uses mongoengine Field types.
from json import JSONEncoder

from bson import ObjectId
from mongoengine import (Document, EmbeddedDocument, EmbeddedDocumentField,
                         IntField, ListField, StringField)


class MongoEncoder(JSONEncoder):
    def default(self, o):
        if isinstance(o, VirtualPageModel):
            data_dict = (o.to_mongo()).to_dict()
            if isinstance(data_dict.get('_id'), ObjectId):
                data_dict.update({'_id': str(data_dict.get('_id'))})
            return data_dict
        else:
            return JSONEncoder.default(self, o)


class SubTypeModel(EmbeddedDocument):
    filename = StringField(max_length=200, required=True)
    page_num = IntField(required=True)


class VirtualPageModel(Document):
    volume = StringField(max_length=200, required=True)
    start_physical_page_num = IntField()
    physical_pages = ListField(EmbeddedDocumentField(SubTypeModel),
                               default=list)
    error_msg = ListField(StringField(),
                          default=list)

    def save(self, *args, **kwargs):
        print('In save: {}'.format(kwargs))
        for k, v in kwargs.items():
            if k == 'physical_pages':
                self.physical_pages = []
                for a_page in v:
                    tmp_pp = SubTypeModel()
                    for p_k, p_v in a_page.items():
                        setattr(tmp_pp, p_k, p_v)
                    self.physical_pages.append(tmp_pp)
            else:
                setattr(self, k, v)
        return super(VirtualPageModel, self).save(*args, **kwargs)
views.py: My attempt at a view
import json
import traceback

from django.shortcuts import render_to_response
from django.template import RequestContext
from django.views.generic import View

# MongoEncoder and VirtualPageModel come from the model module above;
# get_file_path is a project helper whose definition isn't shown.


class VolumeSplitterView(View):
    # initial = {'key': 'value'}
    template_name = 'click_model/index.html'
    vol = None
    start = 0
    end = 20

    def get(self, request, *args, **kwargs):
        self.vol = self.kwargs.get('vol', None)
        records = self.get_records()
        records = records[self.start:self.end]
        vp_json_list = []
        img_filepaths = []
        for vp in records:
            vp_json = json.dumps(vp, cls=MongoEncoder)
            vp_json_list.append(vp_json)
            for pp in vp.physical_pages:
                filepath = get_file_path(vp, pp.filename)
                img_filepaths.append(filepath)
        data_dict = {
            'img_filepaths': img_filepaths,
            'vp_json_list': vp_json_list
        }
        return render_to_response(self.template_name,
                                  {'data_dict': data_dict},
                                  RequestContext(request))

    def get_records(self):
        return VirtualPageModel.objects(volume=self.vol)

    def post(self, request, *args, **kwargs):
        if request.is_ajax():  # note: is_ajax is a method, not an attribute
            vp_dict_list = json.loads(request.POST.get('data', []))
            for vp_dict in vp_dict_list:
                o_id = vp_dict.pop('_id')
                original_doc = VirtualPageModel.objects.get(id=o_id)
                try:
                    original_doc.save(**vp_dict)
                except Exception:
                    print(traceback.format_exc())
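As for knowing when to delete: one sketch of the diff idea floated above is to re-query the volume inside post() and remove whatever the payload no longer contains, so the client never has to send an explicit delete list. Placement and names below are assumptions, not tested code:
# Inside post(), before the update loop pops '_id' from each dict:
posted_ids = {vp_dict['_id'] for vp_dict in vp_dict_list}
for doc in VirtualPageModel.objects(volume=self.kwargs.get('vol')):
    if str(doc.id) not in posted_ids:
        doc.delete()  # present in the db but absent from the POST payload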