I have written the following code:
The spiders/test.py code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from wscraper.items import WscraperItem

class MySpider(BaseSpider):
    name = "ExampleSpider"
    allowed_domains = ["timeanddate.com"]
    start_urls = ["https://www.timeanddate.com/worldclock/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("/html/body/div[1]/div[8]/section[2]/div[1]/table/tbody").extract()
        #for titles in titles:
            #title = titles.select("a/text()").extract()
            #link = titles.select("a/@href").extract()
        print title
The code for wscraper/items.py is:
from scrapy.item import Item, Field

class WscraperItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field()
    pass
I'm getting the following error on running the command "scrapy crawl ExampleSpider":
[boto] ERROR: Caught exception reading instance data
Traceback (most recent call last):
  File "/usr/lib/python2.7/dist-packages/boto/utils.py", line 210, in retry_url
    r = opener.open(req, timeout=timeout)
  File "/usr/lib/python2.7/urllib2.py", line 429, in open
    response = self._open(req, data)
  File "/usr/lib/python2.7/urllib2.py", line 447, in _open
    '_open', req)
  File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
    result = func(*args)
  File "/usr/lib/python2.7/urllib2.py", line 1228, in http_open
    return self.do_open(httplib.HTTPConnection, req)
  File "/usr/lib/python2.7/urllib2.py", line 1198, in do_open
    raise URLError(err)
URLError: <urlopen error [Errno 101] Network is unreachable>
[boto] ERROR: Unable to read instance data, giving up
[scrapy] ERROR: Error downloading <GET https://www.timeanddate.com/worldclock/>
Traceback (most recent call last):
  File "/usr/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 45, in mustbe_deferred
    result = f(*args, **kw)
  File "/usr/lib/python2.7/dist-packages/scrapy/core/downloader/handlers/__init__.py", line 41, in download_request
    return handler(request, spider)
  File "/usr/lib/python2.7/dist-packages/scrapy/core/downloader/handlers/http11.py", line 44, in download_request
    return agent.download_request(request)
    d = super(CachingThreadedResolver, self).getHostByName(name, timeout)
  File "/home/priyanka/.local/lib/python2.7/site-packages/twisted/internet/base.py", line 276, in getHostByName
    timeoutDelay = sum(timeout)
TypeError: 'float' object is not iterable
[scrapy] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 1,
'downloader/exception_type_count/exceptions.TypeError': 1,
'downloader/request_bytes': 228,
'log_count/DEBUG': 2,
'log_count/ERROR': 3,
'log_count/INFO': 7,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
The spider name has to be a str, not a list, so:
class ExampleSpider(BaseSpider):
    name = "timeandzone"
otherwise the Scrapy spider loader fails to load it.
Basically, I had a compatibility issue. So I installed Scrapy 1.3.3, which resolved the issue, and yes, as mentioned in the answer above, the spider name should be a string.
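For reference, on Scrapy 1.3+ the deprecated BaseSpider and HtmlXPathSelector imports can be dropped in favour of scrapy.Spider and the response's built-in selectors. A minimal sketch of the same spider against the newer API; the XPath is taken from the question, everything else is just the current interface, not the original poster's code:

import scrapy

from wscraper.items import WscraperItem


class MySpider(scrapy.Spider):
    name = "ExampleSpider"  # must be a plain string, not a list
    allowed_domains = ["timeanddate.com"]
    start_urls = ["https://www.timeanddate.com/worldclock/"]

    def parse(self, response):
        # response.xpath() replaces HtmlXPathSelector(response).select()
        table = response.xpath("/html/body/div[1]/div[8]/section[2]/div[1]/table/tbody")
        for link in table.xpath(".//a"):
            item = WscraperItem()
            item['title'] = link.xpath("text()").extract_first()
            yield item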
Related
I am learning Django REST framework, but here's the problem: I get the following error. What am I doing wrong? I've done a lot of research trying to resolve the JSONDecodeError, but I'm not finding a solution.
import requests
import json

URL = "http://127.0.0.1:8000/studentapi/"

def get_data(id=None):
    data = {}
    if id is not None:
        data = {'id': id}
    json_data = json.dumps(data)
    r = requests.get(url=URL, data=json_data)
    data = r.json()
    print(data)

get_data()
The error:
python myapp.py
Traceback (most recent call last):
File "C:\Users\P.ARYAPRAKASH\Documents\Djangovs\fun_api_view\myapp.py", line 15, in <module>
get_data()
File "C:\Users\P.ARYAPRAKASH\Documents\Djangovs\fun_api_view\myapp.py", line 12, in get_data
data = r.json()
File "C:\Users\P.ARYAPRAKASH\AppData\Roaming\Python\Python39\site-packages\requests\models.py", line 910, in json
return complexjson.loads(self.text, **kwargs)
File "C:\Python39\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "C:\Python39\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Python39\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
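A JSONDecodeError at line 1, column 1 usually means the body that came back is not JSON at all (often an HTML error page). A minimal sketch that surfaces what the server actually returned before decoding, assuming the same local endpoint; note that sending the id as query parameters via params= is more conventional for a GET than a JSON request body:

import requests

URL = "http://127.0.0.1:8000/studentapi/"

def get_data(id=None):
    params = {'id': id} if id is not None else {}
    r = requests.get(URL, params=params)
    print(r.status_code)   # a 4xx/5xx here explains a non-JSON body
    print(r.text[:200])    # inspect the raw payload before decoding
    if r.headers.get('Content-Type', '').startswith('application/json'):
        print(r.json())

get_data()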
I'm attempting to write a unit test for a URL in my application. I used Django's Client class to simulate a get() request and compare the response's status code.
Here's the test I'm running:
from unittest.mock import patch

from django.shortcuts import reverse
from django.test import Client, TestCase


class DashboardViewTest(TestCase):
    @patch("ordering.mixins.OrderingAppPermissionRequired.handle_not_logged_in")
    @patch("ordering.mixins.OrderingAppPermissionRequired.handle_no_profile")
    @patch("ordering.mixins.OrderingAppPermissionRequired.handle_no_id")
    def test_order_list_view(self, *mocks):
        client = Client()
        response = client.get(reverse('ordering:list'))
        self.assertEqual(response.status_code, 200)
I'm facing the following error (path redacted for privacy):
Traceback (most recent call last):
File "[python_root]\python\python37\Lib\unittest\mock.py", line 1191, in patched
return func(*args, **keywargs)
File "[project_root]\ordering\tests\test_dashboard.py", line 20, in test_order_list_view
response = client.get(reverse('ordering:list'))
File "[virtual_env_root]\lib\site-packages\django\test\client.py", line 527, in get
response = super().get(path, data=data, secure=secure, **extra)
File "[virtual_env_root]\lib\site-packages\django\test\client.py", line 339, in get
**extra,
File "[virtual_env_root]\lib\site-packages\django\test\client.py", line 414, in generic
return self.request(**r)
File "[virtual_env_root]\lib\site-packages\django\test\client.py", line 495, in request
raise exc_value
File "[virtual_env_root]\lib\site-packages\django\core\handlers\exception.py", line 34, in inner
response = get_response(request)
File "[virtual_env_root]\lib\site-packages\django\utils\deprecation.py", line 93, in __call__
response = self.process_response(request, response)
File "[virtual_env_root]\lib\site-packages\django\contrib\sessions\middleware.py", line 45, in process_response
patch_vary_headers(response, ('Cookie',))
File "[virtual_env_root]\lib\site-packages\django\utils\cache.py", line 266, in patch_vary_headers
vary_headers = cc_delim_re.split(response['Vary'])
TypeError: expected string or bytes-like object
Help is appreciated. Thank you.
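The TypeError is raised by Django's session middleware when it tries to split the response's Vary header, which suggests the value reaching patch_vary_headers is not a string. One plausible cause (an assumption, since the view code isn't shown) is that one of the patched permission hooks gets called and its MagicMock return value ends up being treated as the response. A sketch that sidesteps the mocks by authenticating a real test user instead; the credentials are placeholders, and a profile/id may also need to be attached depending on what OrderingAppPermissionRequired checks:

from django.contrib.auth import get_user_model
from django.shortcuts import reverse
from django.test import Client, TestCase


class DashboardViewTest(TestCase):
    def test_order_list_view(self):
        user = get_user_model().objects.create_user(
            username="tester", password="secret"  # placeholder credentials
        )
        client = Client()
        client.force_login(user)  # satisfies the login check without mocking
        response = client.get(reverse('ordering:list'))
        self.assertEqual(response.status_code, 200)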
I'm getting acquainted with Scrapy and heavily relying on the interactive debugger, inspect_response().
I'm starting by scraping events off a calendar. Step 1 is to get the overview, step 2 is to get the details:
|-- Calendar List Page
| `-- Calendar Event Details Page
Here is my implementation:
import scrapy
from scrapy.shell import inspect_response


class QuotesSpider(scrapy.Spider):
    name = "my first spider"

    def start_requests(self):
        urls = ['https://www.ticketfly.com/venue/7337-output/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_overview)

    def parse_overview(self, response):
        for event in response.css('.list-view-item'):
            # This works fine: inspect_response(response, self)
            details = event.css('span.ticket-link.primary-link a::attr(href)').extract_first()
            if details is not None:
                yield scrapy.Request(response.urljoin(details),
                                     callback=self.parse_url)

    def parse_url(self, response):
        inspect_response(response, self)  # This breaks
inspect_response() works fine in step 1, but fails in step 2:
2017-04-22 10:30:47 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.ticketfly.com/purchase/event/1465896/tfly> (referer: https://www.ticketfly.com/venue/7337-output/)
Traceback (most recent call last):
File "//anaconda/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/me/test/scrapy/test/test/spiders/my_spider.py", line 37, in parse_url
inspect_response(response, self)
File "//anaconda/lib/python2.7/site-packages/scrapy/shell.py", line 167, in inspect_response
Shell(spider.crawler).start(response=response)
File "//anaconda/lib/python2.7/site-packages/scrapy/shell.py", line 81, in start
banner=self.vars.pop('banner', ''))
File "//anaconda/lib/python2.7/site-packages/scrapy/utils/console.py", line 82, in start_python_console
shell(namespace=namespace, banner=banner)
File "//anaconda/lib/python2.7/site-packages/scrapy/utils/console.py", line 22, in wrapper
banner1=banner, user_ns=namespace, config=config)
File "//anaconda/lib/python2.7/site-packages/traitlets/config/configurable.py", line 416, in instance
'%s are being created.' % cls.__name__
MultipleInstanceError: Multiple incompatible subclass instances of InteractiveShellEmbed are being created.
2017-04-22 10:30:47 [root] DEBUG: Using default logger
2017-04-22 10:30:47 [root] DEBUG: Using default logger
2017-04-22 10:30:47 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.ticketfly.com/purchase/event/1457966/tfly> (referer: https://www.ticketfly.com/venue/7337-output/)
(identical traceback as above, ending in the same MultipleInstanceError)
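The MultipleInstanceError is raised by traitlets because a second embedded IPython shell is created in the same process after the first inspect_response() session. One workaround (an assumption on my part, not an official fix) is to clear the IPython singleton before re-entering the shell, or to fall back to the plain Python shell by setting the SCRAPY_PYTHON_SHELL=python environment variable. A sketch of the first option, as a replacement body for parse_url:

from scrapy.shell import inspect_response

def parse_url(self, response):
    # Drop the stale embedded-IPython singleton so inspect_response()
    # can create a fresh shell for this response as well.
    try:
        from IPython.terminal.embed import InteractiveShellEmbed
        InteractiveShellEmbed.clear_instance()
    except ImportError:
        pass  # plain Python shell in use; nothing to clear
    inspect_response(response, self)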
When I run my spider from the terminal like this:
scrapy crawl GeneralSpider --set JOBDIR=will
everything works fine and I can see the JOBDIR folder.
However, when I try to set it programmatically, like this:
from scrapy.utils.project import get_project_settings
myS = get_project_settings()
myS.set(myS, 'JOBDIR', "folder")
I get the following error:
Unhandled error in Deferred:
CRITICAL:twisted:Unhandled error in Deferred:
2016-02-15 17:49:35 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 57, in run
self.crawler_process.crawl(spname, **opts.spargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/crawler.py", line 153, in crawl
d = crawler.crawl(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py", line 1274, in unwindGenerator
return _inlineCallbacks(None, gen, Deferred())
--- <exception caught here> ---
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py", line 1128, in _inlineCallbacks
result = g.send(result)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/crawler.py", line 70, in crawl
self.spider = self._create_spider(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in _create_spider
return self.spidercls.from_crawler(self, *args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spiders/__init__.py", line 50, in from_crawler
spider = cls(*args, **kwargs)
File "/bla bla bla spider.py", line 47, in __init__
myS.set(myS, 'JOBDIR', "myfolder")
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/settings/__init__.py", line 94, in set
priority = SETTINGS_PRIORITIES[priority]
exceptions.KeyError: 'myfolder'
CRITICAL:twisted:
2016-02-15 17:49:35 [twisted] CRITICAL:
The Settings API set method has this signature:
set(name, value, priority='project')
You're passing the settings object as the setting name and "myfolder" as the priority, which is not what the method expects.
Try doing this instead:
from scrapy.utils.project import get_project_settings
...
myS = get_project_settings()
myS.set('JOBDIR', "folder")
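If the goal is to run the spider from a script with that setting applied, one way (a sketch, assuming the project settings can be loaded and the spider is registered under the name GeneralSpider) is to pass the modified settings to a CrawlerProcess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('JOBDIR', 'will')  # same effect as --set JOBDIR=will on the CLI
process = CrawlerProcess(settings)
process.crawl('GeneralSpider')
process.start()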
I took this example from scrapy.org. It worked fine until I tried to save everything in an items object. The items.append(item) is apparently invalid syntax, but all other examples on this website have the same assignment.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem


class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items
Error is:
computerito#computerito-the-great ~/SHITSHOW/tutorial $ scrapy crawl dmoz
2015-03-10 22:00:40-0700 [scrapy] INFO: Scrapy 0.14.4 started (bot: tutorial)
2015-03-10 22:00:40-0700 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, MemoryUsage, SpiderState
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 132, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 97, in _run_print_help
func(*a, **kw)
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 139, in _run_command
cmd.run(args, opts)
File "/usr/lib/python2.7/dist-packages/scrapy/commands/crawl.py", line 43, in run
spider = self.crawler.spiders.create(spname, **opts.spargs)
File "/usr/lib/python2.7/dist-packages/scrapy/command.py", line 34, in crawler
self._crawler.configure()
File "/usr/lib/python2.7/dist-packages/scrapy/crawler.py", line 36, in configure
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 37, in from_crawler
return cls.from_settings(crawler.settings)
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 33, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 23, in __init__
for module in walk_modules(name):
File "/usr/lib/python2.7/dist-packages/scrapy/utils/misc.py", line 65, in walk_modules
submod = __import__(fullpath, {}, {}, [''])
File "/home/computerito/SHITSHOW/tutorial/tutorial/spiders/dmoz_spider.py", line 25
items.append(item)
^
SyntaxError: invalid syntax
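For what it's worth, the code as posted compiles once the indentation is consistent, so a plausible culprit (an assumption, since no accepted fix is shown here) is mixed tabs and spaces or an unbalanced bracket on the line above items.append(item). A quick standard-library check on Python 2.7, with the file path taken from the traceback:

import py_compile

# Raises py_compile.PyCompileError pointing at the offending line if the
# file really contains a syntax error; compiles silently otherwise.
py_compile.compile('tutorial/spiders/dmoz_spider.py', doraise=True)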