How to have a Scrapy spider run on a Flask app form submit?

I'm setting up a Flask app that lets me input a string and passes that string as an argument to my spider to scrape a page. I'm having difficulty getting the spider to run when the form is submitted (i.e. integrating Scrapy and Flask).
I've looked at the following code snippet solutions to no avail:
Run Scrapy from Flask,
Running Scrapy spiders in a Celery task,
Scrapy and celery `update_state`
There clearly are several ways to complete the task. However, none of the code snippets above is working for me.
routes.py
from flask import render_template, flash, redirect, url_for, session, jsonify
from flask import request
from flask_login import login_required
from flask_login import logout_user
from app import app, db
from app.forms import LoginForm
from flask_login import current_user, login_user
from app.models import User
from werkzeug.urls import url_parse
from app.forms import RegistrationForm, SearchForm
#from app.tasks import scrape_async_job
import pprint
import requests
import json
@app.route('/')
@app.route('/index', methods=['GET', 'POST'])
@login_required
def index():
    jobvisuals = [
        {
            'Job': 'Example',
            'Desc': 'This job requires a degree...',
            'link': 'fakelink',
            'salary': '10$/hr',
            'applied': 'Boolean',
            'interview': 'Boolean'}]
    params = {
        'spider_name': 'Indeedspider',
        'start_requests': True
    }
    response = requests.get('http://localhost:9080/crawl.json', params).json()
    data = response
    pprint.pprint(data)
    form = SearchForm()
    if request.method == 'GET':
        return render_template('index.html', title='home', jobvisuals=jobvisuals, form=form, search=session.get('search', ''))
    job_find = request.form['search']
    session['search'] = job_find
    if form.validate_on_submit():
        print('Working on this feature :D')
        flash('Searching for job {}'.format(form.search.data))
    return render_template('index.html', title='Home', jobvisuals=jobvisuals, form=form)
spider
import scrapy


class IndeedSpider(scrapy.Spider):
    name = 'indeedspider'
    allowed_domains = ['indeed.com']

    def __init__(self, job='', **kwargs):
        self.start_urls = ['http://www.indeed.com/jobs?q={}&l=San+Marcos%2C+CA'.format(job)]
        super().__init__(**kwargs)

    def parse(self, response):
        for item in response.xpath("//div[contains(@class,'jobsearch-SerpJobCard unifiedRow row result clickcard')]"):
            yield {
                'title': item.xpath(".//div[contains(@class,'title')]/text()").get(default='None'),
                'desc': item.xpath(".//div[contains(@class,'summary')]/text()").get(default='None'),
                'link': item.xpath(".//div[contains(@class,'title')]/@href").get(default='None'),
                'location': item.xpath(".//span[contains(@class,'location')]/text()").get(default='None'),
                'salary': item.xpath(".//div[contains(@class,'salarySnippet')]/text()").get(default='None')
            }
Expected:
I type the job into an input box; on submit, the job gets passed to the spider, the spider scrapes indeed.com (the first page only), and that data is returned on the index page.
Actual:
Unsure of where to start.
Can anyone point me in the right direction?
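For reference, one common way to wire this up (similar in spirit to the linked "Run Scrapy from Flask" answers) is to run the crawl in-process with crochet, which keeps the Twisted reactor alive alongside Flask. This is only a minimal, untested sketch: it assumes crochet is installed, the import path for IndeedSpider is a guess, and the form posts a field named search (matching SearchForm above).

import crochet
crochet.setup()  # starts the Twisted reactor in a background thread

from flask import Flask, request, jsonify
from scrapy import signals
from scrapy.crawler import CrawlerRunner

from app.spiders.indeed import IndeedSpider  # hypothetical import path

app = Flask(__name__)
scraped_items = []  # simple shared buffer; fine for a single-user demo only


@crochet.wait_for(timeout=60.0)
def run_spider(job):
    """Start the crawl on the reactor thread and block until it finishes."""
    runner = CrawlerRunner()
    crawler = runner.create_crawler(IndeedSpider)
    # Collect every scraped item as it is produced.
    crawler.signals.connect(
        lambda item, response, spider: scraped_items.append(dict(item)),
        signal=signals.item_scraped,
    )
    return runner.crawl(crawler, job=job)


@app.route('/search', methods=['POST'])
def search():
    scraped_items.clear()
    run_spider(request.form['search'])  # the search field from the form
    return jsonify(items=scraped_items)

The same idea applies to the ScrapyRT call already in routes.py: the submitted value just needs to reach the spider as its job argument instead of being dropped.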

Related

How can I send scrapy result to django views so that frontend can get scrapy result by axios?

I am planning to build a backend using Django + Scrapy. My goal is this:
The frontend (React) sends a GET request via axios to a Django views endpoint.
This triggers Scrapy to start crawling (the spiders).
The scraping result is sent back to the Django view.
The frontend gets the JSON result (the scraped data, not a job id or a log file).
from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapyApp.items import ScrapyappItem
from scrapy.utils.project import get_project_settings
from django.http import HttpResponse


class MySpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'https://www.google.com',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        item = ScrapyappItem()
        item['title'] = response.css('title::text').get()
        yield item


def show1(request):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(MySpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
    return HttpResponse({"result": d})
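One issue with the view above is that d is a Deferred, not the scraped data, so HttpResponse({"result": d}) can never contain the items. A minimal, untested sketch of one way to return the items themselves; note that reactor.run() still blocks the request and can only be started once per process, so this only works as a one-shot demo:

from django.http import JsonResponse
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor


def show1(request):
    items = []
    runner = CrawlerRunner()
    crawler = runner.create_crawler(MySpider)
    # Collect every scraped item as it is produced.
    crawler.signals.connect(
        lambda item, response, spider: items.append(dict(item)),
        signal=signals.item_scraped,
    )
    d = runner.crawl(crawler)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # blocks until the crawl finishes
    return JsonResponse({"result": items})

For repeated requests, running the reactor via crochet or delegating the crawl to a Celery worker (as in the questions linked at the top) avoids the one-shot reactor limitation.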

mailgun and flask forms

I keep getting the following error when I run my code:
TypeError: 'Response' object is not callable
Here is my code...
from flask import Flask, render_template, flash
import os
import requests
import forms

app = Flask(__name__)
app.secret_key = 'jfdsjajfjds'
mg_key = os.environ['MAILGUN_API_KEY']


@app.route("/", methods=('GET', 'POST'))
def landing():
    form = forms.OptinForm()
    if form.validate_on_submit():
        return requests.post(
            "https://api.mailgun.net/v3/lists/test_list@sandbox.mailgun.org/members",
            auth=('api', 'mg_key'),
            data={'subscribed': True,
                  'address': form.email.data,
                  'name': form.first_name.data})
    flash("Thanks! Check your email.")
    return render_template('landing.html', form=form)


if __name__ == "__main__":
    app.run()
I figured out the problem: I was passing my API key as a string literal. I fixed it by changing 'mg_key' to mg_key.
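For reference, a sketch of the corrected handler body. Besides using the mg_key variable, it stores the Mailgun response instead of returning it, since returning a requests.Response from a Flask view is itself a likely source of the "'Response' object is not callable" error:

if form.validate_on_submit():
    resp = requests.post(
        "https://api.mailgun.net/v3/lists/test_list@sandbox.mailgun.org/members",
        auth=('api', mg_key),  # the variable, not the string 'mg_key'
        data={'subscribed': True,
              'address': form.email.data,
              'name': form.first_name.data})
    resp.raise_for_status()  # surface Mailgun errors instead of ignoring them
    flash("Thanks! Check your email.")
return render_template('landing.html', form=form)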

python flask mini-application global name not defined

I keep getting the error NameError: global name 'NameForm' is not defined
Here is my views.py
from flask import Flask, render_template
from flask_bootstrap import Bootstrap
from app import forms

app = Flask(__name__)
bootstrap = Bootstrap(app)


@app.route('/', methods=['GET', 'POST'])
def index():
    name = None
    form = NameForm()
    if form.validate_on_submit():
        name = form.name.data
        form.name.data = ''
    return render_template('index.html', form=form, name=name)


if __name__ == '__main__':
    app.run(debug=True)
And here is my forms.py
from flask_wtf import Form
from wtforms import StringField, SubmitField
from wtforms.validators import Required


class NameForm(Form):
    """docstring for ClassName"""
    name = StringField('Your name please', validators=[Required()])
    submit = SubmitField('Submit')
Any ideas on what I might be doing wrong?
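The NameError comes from the import: views.py imports the forms module but then uses NameForm unqualified. A minimal sketch of the two usual fixes, assuming forms.py lives in the app package as imported above:

# Option 1: import the class directly
from app.forms import NameForm
form = NameForm()

# Option 2: keep the module import and qualify the name
from app import forms
form = forms.NameForm()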

Emails won't send after upgrading from Django 1.6.x to Django > 1.7.x

I am currently using Django Allauth and a modified version of Django Invitations (https://github.com/bee-keeper/django-invitations). The only thing added is a field for which group to add the user to, and the application works perfectly when Django 1.6.x is being used. I would like to upgrade to Django 1.7.x or 1.8 but this somehow breaks the emailing feature.
The specific piece of code is here:
import datetime
from django.db import models
from django.utils.translation import ugettext_lazy as _
from django.utils import timezone
from django.utils.crypto import get_random_string
from django.utils.encoding import python_2_unicode_compatible
from django.contrib.sites.models import Site
from django.core.urlresolvers import reverse
from allauth.account.adapter import DefaultAccountAdapter
from allauth.account.adapter import get_adapter
from .managers import InvitationManager
from . import app_settings
from . import signals

# ...(other code)

    def send_invitation(self, request, **kwargs):
        current_site = (kwargs['site'] if 'site' in kwargs
                        else Site.objects.get_current())
        invite_url = reverse('invitations:accept-invite',
                             args=[self.key])
        invite_url = request.build_absolute_uri(invite_url)
        ctx = {
            'invite_url': invite_url,
            'current_site': current_site,
            'email': self.email,
            'key': self.key,
        }
        email_template = 'invitations/email/email_invite'
        get_adapter().send_mail(email_template,
                                self.email,
                                ctx)
        self.sent = timezone.now()
        self.save()
        signals.invite_url_sent.send(
            sender=self.__class__,
            instance=self,
            invite_url_sent=invite_url)
found here (https://github.com/bee-keeper/django-invitations/blob/master/invitations/models.py)
This also references the code from allauth here:
from __future__ import unicode_literals

import re
import warnings
import json

from django.conf import settings
from django.http import HttpResponse
from django.template.loader import render_to_string
from django.template import TemplateDoesNotExist
from django.contrib.sites.models import Site
from django.core.mail import EmailMultiAlternatives, EmailMessage
from django.utils.translation import ugettext_lazy as _
from django import forms
from django.contrib import messages

try:
    from django.utils.encoding import force_text
except ImportError:
    from django.utils.encoding import force_unicode as force_text

from ..utils import (import_attribute, get_user_model,
                     generate_unique_username,
                     resolve_url)
from . import app_settings

USERNAME_REGEX = re.compile(r'^[\w.@+-]+$', re.UNICODE)

# ........ (other code)

    def render_mail(self, template_prefix, email, context):
        """
        Renders an e-mail to `email`.  `template_prefix` identifies the
        e-mail that is to be sent, e.g. "account/email/email_confirmation"
        """
        subject = render_to_string('{0}_subject.txt'.format(template_prefix),
                                   context)
        # remove superfluous line breaks
        subject = " ".join(subject.splitlines()).strip()
        subject = self.format_email_subject(subject)
        bodies = {}
        for ext in ['html', 'txt']:
            try:
                template_name = '{0}_message.{1}'.format(template_prefix, ext)
                bodies[ext] = render_to_string(template_name,
                                               context).strip()
            except TemplateDoesNotExist:
                if ext == 'txt' and not bodies:
                    # We need at least one body
                    raise
        if 'txt' in bodies:
            msg = EmailMultiAlternatives(subject,
                                         bodies['txt'],
                                         settings.DEFAULT_FROM_EMAIL,
                                         [email])
            if 'html' in bodies:
                msg.attach_alternative(bodies['html'], 'text/html')
        else:
            msg = EmailMessage(subject,
                               bodies['html'],
                               settings.DEFAULT_FROM_EMAIL,
                               [email])
            msg.content_subtype = 'html'  # Main content is now text/html
        return msg

    def send_mail(self, template_prefix, email, context):
        msg = self.render_mail(template_prefix, email, context)
        msg.send()
found at (allauth/account/adapter.py)
The form always saves an invitation record to the database but breaks at the line that sends the email (all of the information stored is correct, so that isn't what breaks it). If the email sending is removed, all the code afterwards runs fine. I have even tried to send a basic email in its place, like this:
from django.core.mail import EmailMessage
msg = EmailMessage("TEST", "HELLO", my_email, [some_email])
msg.send()
but this, too, does not send emails.
I am hoping this is super simple, but any help would be appreciated.
I had the same problem: execution just hung when running this code in a Django shell (Django 1.7):
from django.core.mail import send_mail
send_mail('Subject here', 'Here is the message.', 'from@example.com',
          ['to@example.com'], fail_silently=False)
Following the Django docs on email settings, I set this in settings.py:
EMAIL_USE_TLS = False
EMAIL_USE_SSL = True
EMAIL_PORT = 465
This worked.
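For completeness, a fuller SMTP configuration usually looks something like the sketch below; the host, credentials and addresses are placeholders, not values from the original answer:

# settings.py -- example SMTP setup over SSL (all values are placeholders)
EMAIL_BACKEND = 'django.core.mail.backends.smtp.EmailBackend'
EMAIL_HOST = 'smtp.example.com'
EMAIL_PORT = 465
EMAIL_USE_TLS = False
EMAIL_USE_SSL = True          # use one of TLS/SSL, never both
EMAIL_HOST_USER = 'user@example.com'
EMAIL_HOST_PASSWORD = 'app-password'
DEFAULT_FROM_EMAIL = EMAIL_HOST_USER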

Cannot login again after resuming crawl. Cookies are not sticky after resuming scrapy

I have a CrawlSpider; the code is below. I use Tor through tsocks.
When I start my spider, everything works fine. Using init_request I can log in to the site and crawl with sticky cookies.
But a problem occurs when I stop and resume the spider: the cookies are no longer sticky.
Here is the output from Scrapy:
=======================INIT_REQUEST================
2013-01-30 03:03:58+0300 [my] INFO: Spider opened
2013-01-30 03:03:58+0300 [my] INFO: Resuming crawl (675 requests scheduled)
............ And here crawling began
So... callback=self.login_url in init_request is not fired!
I thought the Scrapy engine didn't want to send the request to the login page again. Before resuming I changed login_page (I can log in from any page on the site) to a different one not covered by restrict_xpaths.
The result: after resuming I cannot log in and the previous cookies are lost.
Does anyone have any suggestions?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import TakeFirst, MapCompose, Join, Identity
from beles_com_ua.items import Product
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.markup import remove_entities
from django.utils.html import strip_tags
from datetime import datetime
from scrapy import log
import re
from scrapy.http import Request, FormRequest


class ProductLoader(XPathItemLoader):
    # .... some code is here ...


class MySpider(CrawlSpider):
    name = 'my'
    login_page = 'http://test.com/index.php?section=6&type=12'
    allowed_domains = ['test.com']
    start_urls = [
        'http://test.com/index.php?section=142',
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=('.',), restrict_xpaths=('...my xpath...')),
             callback='parse_item', follow=True),
    )

    def start_requests(self):
        return self.init_request()

    def init_request(self):
        print '=======================INIT_REQUEST================'
        return [Request(self.login_page, callback=self.login_url)]

    def login_url(self, response):
        print '=======================LOGIN======================='
        """Generate a login request."""
        return FormRequest.from_response(response,
                                         formdata={'login': 'mylogin', 'pswd': 'mypass'},
                                         callback=self.after_login)

    def after_login(self, response):
        print '=======================AFTER_LOGIN ...======================='
        if "images/info_enter.png" in response.body:
            print "==============Bad times :(==============="
        else:
            print "=========Successfully logged in.========="
            for url in self.start_urls:
                yield self.make_requests_from_url(url)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        entry = hxs.select("//div[@class='price']/text()").extract()
        l = ProductLoader(Product(), hxs)
        if entry:
            name = hxs.select("//div[@class='header_box']/text()").extract()[0]
            l.add_value('name', name)
        # ... some code is here ...
        return l.load_item()
init_request(self) is only available when you subclass InitSpider, not CrawlSpider.
You need to subclass your spider from InitSpider, like this:
class WorkingSpider(InitSpider):
    login_page = 'http://www.example.org/login.php'

    def init_request(self):
        """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)
But remember that you can't define rules in InitSpider, as they are only available in CrawlSpider; you need to extract the links manually, for example as in the sketch below.
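A minimal sketch of that pattern (not from the original answer; the URLs, form field names and the logged-in check are placeholders, and on old Scrapy versions the import path is scrapy.contrib.spiders.init instead):

from scrapy.spiders.init import InitSpider
from scrapy.http import Request, FormRequest


class WorkingSpider(InitSpider):
    name = 'working'
    login_page = 'http://www.example.org/login.php'
    start_urls = ['http://www.example.org/index.php?section=142']

    def init_request(self):
        # Called once before crawling starts: visit the login page first.
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        return FormRequest.from_response(
            response,
            formdata={'login': 'mylogin', 'pswd': 'mypass'},
            callback=self.check_login)

    def check_login(self, response):
        if b'logout' in response.body:  # placeholder logged-in check
            # Hand control back to InitSpider, which then crawls start_urls.
            return self.initialized()
        self.logger.error('Login failed')

    def parse(self, response):
        # No CrawlSpider rules available here, so follow links manually.
        for href in response.xpath('//a/@href').getall():
            yield response.follow(href, callback=self.parse)
        # ... extract item fields from the page here ...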