Scrapy: Login with Selenium webdriver, transfer cookies to spider object?

I was just wondering if there's any reasonable way to pass authentication cookies from a webdriver.Firefox() instance to the spider itself? It would be helpful to perform some webdriver work and then go about scraping "business as usual". Something to the effect of:
def __init__(self):
    BaseSpider.__init__(self)
    self.selenium = webdriver.Firefox()

def __del__(self):
    self.selenium.quit()
    print self.verificationErrors

def parse(self, response):
    # Initialize the webdriver, get login page
    sel = self.selenium
    sel.get(response.url)
    sleep(3)
    ##### Transfer (sel) cookies to (self) and crawl normally??? #####
    ...
    ...

Transfer Cookies from Selenium to Scrapy Spider
Selenium script:
import json

from selenium import webdriver

driver = webdriver.Firefox()
data = driver.get_cookies()

# write the cookies to a temp file
with open('cookie.json', 'w') as outputfile:
    json.dump(data, outputfile)

driver.close()
...
Spider:
import json
import os

if os.stat("cookie.json").st_size > 2:
    with open('./cookie.json', 'r') as inputfile:
        self.cookie = json.load(inputfile)

You can try to override the BaseSpider.start_requests method to attach the needed cookies to the starting requests, using scrapy.http.cookies.CookieJar.
See also: Scrapy - how to manage cookies/sessions
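For example, a minimal sketch of such a start_requests override (my own illustration, not from the answer; it assumes the cookies were dumped to cookie.json as shown above and passes them via the Request cookies argument, which Scrapy's CookiesMiddleware picks up, rather than building a CookieJar by hand):
import json

from scrapy.http import Request


def start_requests(self):
    # load the cookies exported by the Selenium script
    with open('cookie.json') as f:
        selenium_cookies = json.load(f)
    # keep only the keys Scrapy cares about
    cookies = [{'name': c['name'], 'value': c['value']} for c in selenium_cookies]
    for url in self.start_urls:
        yield Request(url, cookies=cookies, callback=self.parse)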

This works with ChromeDriver but not Firefox (tested OK).
Refer to https://christopher.su/2015/selenium-chromedriver-ubuntu/ for installation.
import os
import pickle

import scrapy
from scrapy.spiders.init import InitSpider
from scrapy.http import Request
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class HybridSpider(InitSpider):
    name = 'hybrid'

    def init_request(self):
        # log in through the browser
        driver = webdriver.Chrome()
        driver.get('https://example.com')
        driver.find_element_by_id('js-login').click()
        driver.find_element_by_id('email').send_keys('mymail@example.net')
        driver.find_element_by_id('password').send_keys('mypassword', Keys.ENTER)
        # save the session cookies, then reload them for the Scrapy requests
        pickle.dump(driver.get_cookies(), open(os.getenv("HOME") + "/my_cookies", "wb"))
        cookies = pickle.load(open(os.getenv("HOME") + "/my_cookies", "rb"))
        FH = open(os.getenv("HOME") + "/my_urls", 'r')
        for url in FH.readlines():
            yield Request(url.strip(), cookies=cookies, callback=self.parse)

    def parse(self, response):
        pass
I haven't tried directly passing the cookies, like
yield Request(url, cookies=driver.get_cookies(), callback=self.parse)
but it might work too.

driver = webdriver.Chrome()
Then perform the login or interact with the page through the browser. Now, when making the request in Scrapy, set the cookies parameter:
request = Request(URL, cookies=driver.get_cookies(), callback=self.mycallback)
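Note that Selenium's get_cookies() returns a list of dicts with keys like name, value, domain and path; Scrapy's Request also accepts a plain name-to-value dict, so you could flatten the list first if you prefer (a small illustration of my own, not from the original answer):
cookies = {c['name']: c['value'] for c in driver.get_cookies()}
request = Request(URL, cookies=cookies, callback=self.mycallback)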

Related

How can I send scrapy result to django views so that frontend can get scrapy result by axios?

I am planning to build the backend using Django + Scrapy. My goal is this:
The frontend (React) sends a GET request via axios to a Django views endpoint.
This activates Scrapy to start crawling (spiders).
The scraping result is sent back to the Django view.
The frontend gets the JSON result (the scraped result, not a job id or log file).
from twisted.internet import reactor
import scrapy
from django.http import HttpResponse
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapyApp.items import ScrapyappItem
from scrapy.utils.project import get_project_settings


class MySpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'https://www.google.com',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        item = ScrapyappItem()
        item['title'] = response.css('title::text').get()
        yield item


def show1(request):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(MySpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
    return HttpResponse({"result": d})
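One thing to note about the code above: d is the Deferred returned by runner.crawl(), not the scraped data, so it is not something the frontend can use. A minimal sketch (my own illustration, not from the thread) of one way to hand the items back to the view is to collect them with Scrapy's item_scraped signal and return them as JSON once the crawl finishes; the view and spider names follow the question's code, everything else is an assumption:
from django.http import JsonResponse
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor


def show1(request):
    results = []

    def collect_item(item, response, spider):
        # called for every item the spider yields
        results.append(dict(item))

    runner = CrawlerRunner()
    crawler = runner.create_crawler(MySpider)
    crawler.signals.connect(collect_item, signal=signals.item_scraped)
    d = runner.crawl(crawler)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # blocks until the crawl finishes; the reactor cannot be restarted afterwards
    return JsonResponse({"result": results})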

signal only works in main thread: scrapy

I am making an API which returns a JsonResponse containing the text scraped by Scrapy. When I run the script on its own it works perfectly, but when I try to integrate the Scrapy script with Django I do not get the output.
What I want is simply to return the response to the request (which in my case is a POSTMAN POST request).
Here is the code I am trying:
from django.http import HttpResponse, JsonResponse
from django.views.decorators.csrf import csrf_exempt
import scrapy
from scrapy.crawler import CrawlerProcess


@csrf_exempt
def some_view(request, username):
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_ENABLED': 'false'
    })
    process_test = process.crawl(QuotesSpider)
    process.start()
    return JsonResponse({'return': process_test})


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/random',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        return response.css('.text::text').extract_first()
I am very new to Python and Django. Any kind of help would be much appreciated.
In your code, process_test is the Deferred returned by process.crawl(), not the output from the crawling.
You need additional configuration to make your spider store its output "somewhere". See this SO Q&A about writing a custom pipeline.
If you just want to synchronously retrieve and parse a single page, you may be better off using requests to retrieve the page, and parsel to parse it.
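For instance, a rough sketch of that synchronous approach (my own illustration; the .text::text selector simply reuses the rule from the spider above):
import requests
from parsel import Selector
from django.http import JsonResponse


def some_view(request, username):
    # fetch and parse a single page synchronously, no reactor involved
    resp = requests.get('http://quotes.toscrape.com/random')
    selector = Selector(text=resp.text)
    quote = selector.css('.text::text').get()
    return JsonResponse({'return': quote})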

python.failure.Failure OpenSSL.SSL.Error in Scrapy (version 1.0.4)

I'm working on a data scraping project and my code uses Scrapy (version 1.0.4) and Selenium (version 2.47.1).
import time

from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.spiders import CrawlSpider
from selenium import webdriver


class TradesySpider(CrawlSpider):
    name = 'tradesy'
    start_urls = ['My Start url', ]

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        while True:
            tradesy_urls = Selector(response).xpath('//div[@id="right-panel"]')
            data_urls = tradesy_urls.xpath('div[@class="item streamline"]/a/@href').extract()
            for link in data_urls:
                url = 'My base url' + link
                yield Request(url=url, callback=self.parse_data)
            time.sleep(10)
            try:
                data_path = self.driver.find_element_by_xpath('//*[@id="page-next"]')
            except:
                break
            data_path.click()
            time.sleep(10)

    def parse_data(self, response):
        'Scrapy Operations...'
When I execute my code, I get the expected output for some URLs, but for others I get the following error:
2016-01-19 15:45:17 [scrapy] DEBUG: Retrying <GET MY_URL> (failed 1 times): [<twisted.python.failure.Failure OpenSSL.SSL.Error: [('SSL routines', 'SSL3_READ_BYTES', 'ssl handshake failure')]>]
Please provide a solution for this query.
According to this reported issue, you can create your own ContextFactory to handle SSL.
context.py:
from OpenSSL import SSL
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory


class CustomContextFactory(ScrapyClientContextFactory):
    """
    Custom context factory that allows SSL negotiation.
    """

    def __init__(self):
        # Use SSLv23_METHOD so we can use protocol negotiation
        self.method = SSL.SSLv23_METHOD

settings.py:
DOWNLOADER_CLIENTCONTEXTFACTORY = 'yourproject.context.CustomContextFactory'
Using Scrapy 1.5.0 I was running into this error:
Error downloading: https://my.website.com>: [<twisted.python.failure.Failure OpenSSL.SSL.Error: [('SSL routines', 'tls12_check_peer_sigalg', 'wrong curve')]>]
What ended up working was updating my version of Twisted (from 17.9.0 -> 19.10.0). I also updated Scrapy to 2.4.0, and a few others:
cryptography==2.2.2 -> 2.3
parsel==1.4.0 -> 1.5.0
pyOpenSSL==17.5.0 -> 19.0.0
urllib3==1.22 -> 1.24.3
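If it helps, those upgrades can be applied with pip; the pins below simply mirror the versions listed above:
pip install --upgrade Twisted==19.10.0 Scrapy==2.4.0 cryptography==2.3 parsel==1.5.0 pyOpenSSL==19.0.0 urllib3==1.24.3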
Update as of 7/27/2022:
As mentioned here, you can set DOWNLOADER_CLIENT_TLS_METHOD to a value of 'TLSv1.2' in the settings.py module. Other versions can be tried as well; tweak until you find one that works.
Just set the DOWNLOADER_CLIENT_TLS_METHOD property to 'TLSv1.2' in the settings.py of your project. There is no longer any need to use the custom context factory to solve this problem.
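For example, in your project's settings.py:
# settings.py
# Force the downloader to negotiate TLS 1.2 (other values such as 'TLSv1.1' can also be tried)
DOWNLOADER_CLIENT_TLS_METHOD = 'TLSv1.2'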
A variation on eLRuLL's answer that does not require additional files. It 'decorates' the init method of the ScrapyClientContextFactory class.
from OpenSSL import SSL
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory

init = ScrapyClientContextFactory.__init__

def init2(self, *args, **kwargs):
    init(self, *args, **kwargs)
    self.method = SSL.SSLv23_METHOD

ScrapyClientContextFactory.__init__ = init2

Django Lettuce built-in server 500 response

I am running the Lettuce built-in server to test that it returns a given response; however, it shows a 500 response.
My features file:
Feature: home page loads
  Scenario: Check that home page loads with header
    Given I access the home url
    then the home page should load with the title "Movies currently showing"
My steps file:
from time import sleep

from lettuce import step, world
from lettuce.django import django_url


@step(u'Given I access the home url')
def given_i_access_the_home_url(step):
    world.response = world.browser.get(django_url('/'))
    sleep(10)


@step(u'then the home page should load with the title "([^"]*)"')
def then_the_home_page_should_load_with_the_title_group1(step, group1):
    assert group1 in world.response
My terrains file:
from django.core.management import call_command
from django.test.simple import DjangoTestSuiteRunner
from lettuce import before, after, world
from logging import getLogger
from selenium import webdriver

try:
    from south.management.commands import patch_for_test_db_setup
except ImportError:
    pass

logger = getLogger(__name__)
logger.info("Loading the terrain file...")


@before.runserver
def setup_database(actual_server):
    '''
    This will setup your database, sync it, and run migrations if you are using South.
    It does this before the Test Django server is set up.
    '''
    logger.info("Setting up a test database...")
    # Uncomment if you are using South
    # patch_for_test_db_setup()
    world.test_runner = DjangoTestSuiteRunner(interactive=False)
    DjangoTestSuiteRunner.setup_test_environment(world.test_runner)
    world.created_db = DjangoTestSuiteRunner.setup_databases(world.test_runner)
    call_command('syncdb', interactive=False, verbosity=0)
    # Uncomment if you are using South
    # call_command('migrate', interactive=False, verbosity=0)


@after.runserver
def teardown_database(actual_server):
    '''
    This will destroy your test database after all of your tests have executed.
    '''
    logger.info("Destroying the test database ...")
    DjangoTestSuiteRunner.teardown_databases(world.test_runner, world.created_db)


@before.all
def setup_browser():
    world.browser = webdriver.Firefox()


@after.all
def teardown_browser(total):
    world.browser.quit()
What could be the problem with the server? Why the 500 response error?
I managed to find the problem: the migrations were not being run on syncdb.
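Concretely (my reading of that fix, based on the South lines that are commented out in the terrain file above), setup_database needs the South hooks enabled so migrations actually run against the test database:
@before.runserver
def setup_database(actual_server):
    logger.info("Setting up a test database...")
    patch_for_test_db_setup()  # South: make the test DB setup migration-aware
    world.test_runner = DjangoTestSuiteRunner(interactive=False)
    DjangoTestSuiteRunner.setup_test_environment(world.test_runner)
    world.created_db = DjangoTestSuiteRunner.setup_databases(world.test_runner)
    call_command('syncdb', interactive=False, verbosity=0)
    call_command('migrate', interactive=False, verbosity=0)  # run the migrations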

Cannot login again after resuming crawl. Cookies are not sticky after resuming scrapy

I have a CrawlSpider; the code is below. I use Tor through tsocks.
When I start my spider, everything works fine. Using init_request I can log in on the site and crawl with sticky cookies.
But a problem occurred when I stopped and resumed the spider: the cookies were no longer sticky.
Here is the output from Scrapy:
=======================INIT_REQUEST================
2013-01-30 03:03:58+0300 [my] INFO: Spider opened
2013-01-30 03:03:58+0300 [my] INFO: Resuming crawl (675 requests scheduled)
............ And here crawling began
So... callback=self.login_url in init_request is not fired!
I thought the Scrapy engine didn't want to send a request to the login page again, so before resuming I changed login_page (I can log in from every page on the site) to a different one not included in restrict_xpaths.
The result: after resuming, I cannot log in and the previous cookies are lost.
Does anyone have any ideas?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import TakeFirst, MapCompose, Join, Identity
from beles_com_ua.items import Product
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.markup import remove_entities
from django.utils.html import strip_tags
from datetime import datetime
from scrapy import log
import re
from scrapy.http import Request, FormRequest


class ProductLoader(XPathItemLoader):
    .... some code is here ...


class MySpider(CrawlSpider):
    name = 'my'
    login_page = 'http://test.com/index.php?section=6&type=12'
    allowed_domains = ['test.com']
    start_urls = [
        'http://test.com/index.php?section=142',
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=('.',), restrict_xpaths=('...my xpath...')), callback='parse_item', follow=True),
    )

    def start_requests(self):
        return self.init_request()

    def init_request(self):
        print '=======================INIT_REQUEST================'
        return [Request(self.login_page, callback=self.login_url)]

    def login_url(self, response):
        print '=======================LOGIN======================='
        """Generate a login request."""
        return FormRequest.from_response(response,
            formdata={'login': 'mylogin', 'pswd': 'mypass'},
            callback=self.after_login)

    def after_login(self, response):
        print '=======================AFTER_LOGIN ...======================='
        if "images/info_enter.png" in response.body:
            print "==============Bad times :(==============="
        else:
            print "=========Successfully logged in.========="
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        entry = hxs.select("//div[@class='price']/text()").extract()
        l = ProductLoader(Product(), hxs)
        if entry:
            name = hxs.select("//div[@class='header_box']/text()").extract()[0]
            l.add_value('name', name)
        ... some code is here ...
        return l.load_item()
init_request() is available only when you subclass InitSpider, not CrawlSpider.
You need to subclass your spider from InitSpider, like this:
class WorkingSpider(InitSpider):
    login_page = 'http://www.example.org/login.php'

    def init_request(self):
        #"""This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)
But remember that you can't define rules in InitSpider, as they are only available in CrawlSpider; you need to extract the links manually.
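A rough sketch of what that manual extraction could look like in a parse callback (my own illustration, reusing the SgmlLinkExtractor and the '...my xpath...' placeholder from the question's code; not part of the original answer):
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request


def parse(self, response):
    # do by hand what the CrawlSpider Rule used to do
    extractor = SgmlLinkExtractor(allow=('.',), restrict_xpaths=('...my xpath...'))
    for link in extractor.extract_links(response):
        yield Request(link.url, callback=self.parse_item)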