python scrapy crawler with append() issue - python-2.7

I took this example from scrapy.org. It worked fine until I tried to save everything in an items object. The items.append(item) line is apparently invalid syntax, but all the other examples on this site use the same statement.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem

class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items
Error is:
computerito@computerito-the-great ~/SHITSHOW/tutorial $ scrapy crawl dmoz
2015-03-10 22:00:40-0700 [scrapy] INFO: Scrapy 0.14.4 started (bot: tutorial)
2015-03-10 22:00:40-0700 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, MemoryUsage, SpiderState
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 132, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 97, in _run_print_help
func(*a, **kw)
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 139, in _run_command
cmd.run(args, opts)
File "/usr/lib/python2.7/dist-packages/scrapy/commands/crawl.py", line 43, in run
spider = self.crawler.spiders.create(spname, **opts.spargs)
File "/usr/lib/python2.7/dist-packages/scrapy/command.py", line 34, in crawler
self._crawler.configure()
File "/usr/lib/python2.7/dist-packages/scrapy/crawler.py", line 36, in configure
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 37, in from_crawler
return cls.from_settings(crawler.settings)
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 33, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 23, in __init__
for module in walk_modules(name):
File "/usr/lib/python2.7/dist-packages/scrapy/utils/misc.py", line 65, in walk_modules
submod = __import__(fullpath, {}, {}, [''])
File "/home/computerito/SHITSHOW/tutorial/tutorial/spiders/dmoz_spider.py", line 25
items.append(item)
^
SyntaxError: invalid syntax

Related

Unable to run Scrapy code

I have written the following code:
spiders.test.py code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from wscraper.items import WscraperItem

class MySpider(BaseSpider):
    name = "ExampleSpider"
    allowed_domains = ["timeanddate.com"]
    start_urls = ["https://www.timeanddate.com/worldclock/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("/html/body/div[1]/div[8]/section[2]/div[1]/table/tbody").extract()
        #for titles in titles:
            #title = titles.select("a/text()").extract()
            #link = titles.select("a/@href").extract()
        print title
The code for scraper.items is:
from scrapy.item import Item, Field

class WscraperItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field()
    pass
I'm getting the following error on running the command "scrapy crawl ExampleSpider":
[boto] ERROR: Caught exception reading instance data
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/boto/utils.py", line 210, in
retry_url
r = opener.open(req, timeout=timeout)
File "/usr/lib/python2.7/urllib2.py", line 429, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 447, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1228, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1198, in do_open
raise URLError(err)
URLError: <urlopen error [Errno 101] Network is unreachable>
[boto] ERROR: Unable to read instance data, giving up
[scrapy] ERROR: Error downloading <GET
https://www.timeanddate.com/worldclock/>
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 45,
in mustbe_deferred
result = f(*args, **kw)
File "/usr/lib/python2.7/dist-
packages/scrapy/core/downloader/handlers/__init__.py", line 41, in
download_request
return handler(request, spider)
File "/usr/lib/python2.7/dist-
packages/scrapy/core/downloader/handlers/http11.py", line 44, in
download_request
return agent.download_request(request)
d = super(CachingThreadedResolver, self).getHostByName(name, timeout)
File "/home/priyanka/.local/lib/python2.7/site-
packages/twisted/internet/base.py", line 276, in getHostByName
timeoutDelay = sum(timeout)
TypeError: 'float' object is not iterable
[scrapy] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 1,
'downloader/exception_type_count/exceptions.TypeError': 1,
'downloader/request_bytes': 228,
'log_count/DEBUG': 2,
'log_count/ERROR': 3,
'log_count/INFO': 7,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
The spider name has to be a str, not a list, so:
class ExampleSpider(BaseSpider):
    name = "timeandzone"
otherwise the Scrapy spider loader fails to load it.
Basically, I had a compatibility issue. So I installed Scrapy 1.3.3 and this resolved the issue, and yes, as mentioned in the answer above, the spider name should be a string.
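For reference, a minimal sketch of how this spider might look on Scrapy 1.3.x using the newer scrapy.Spider / response.xpath() style instead of BaseSpider and HtmlXPathSelector; the table XPath and the single item field here are simplified assumptions, not the exact selectors from the question:

import scrapy
from wscraper.items import WscraperItem

class ExampleSpider(scrapy.Spider):
    name = "timeandzone"          # a plain string, not a list
    allowed_domains = ["timeanddate.com"]
    start_urls = ["https://www.timeanddate.com/worldclock/"]

    def parse(self, response):
        # response.xpath() replaces HtmlXPathSelector(response).select()
        for row in response.xpath('//table//tr'):
            item = WscraperItem()
            item['title'] = row.xpath('.//a/text()').extract_first()
            yield item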

Scrapy Inspect_Response in second-level crawls: MultipleInstanceError errors for iPython

I'm getting acquainted with Scrapy and heavily relying on the interactive debugger, inspect_response().
I'm starting by scraping events off a calendar. Step 1 is to get the overview, step 2 is to get the details:
|-- Calendar List Page
| `-- Calendar Event Details Page
Here is my implementation:
import scrapy
from scrapy.shell import inspect_response

class QuotesSpider(scrapy.Spider):
    name = "my first spider"

    def start_requests(self):
        urls = ['https://www.ticketfly.com/venue/7337-output/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_overview)

    def parse_overview(self, response):
        for event in response.css('.list-view-item'):
            # This works fine: inspect_response(response, self)
            details = event.css('span.ticket-link.primary-link a::attr(href)').extract_first()
            if details is not None:
                yield scrapy.Request(response.urljoin(details),
                                     callback=self.parse_url)

    def parse_url(self, response):
        inspect_response(response, self)  # This breaks
inspect_response() works fine in step 1, but fails in step 2:
2017-04-22 10:30:47 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.ticketfly.com/purchase/event/1465896/tfly> (referer: https://www.ticketfly.com/venue/7337-output/)
Traceback (most recent call last):
File "//anaconda/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/me/test/scrapy/test/test/spiders/my_spider.py", line 37, in parse_url
inspect_response(response, self)
File "//anaconda/lib/python2.7/site-packages/scrapy/shell.py", line 167, in inspect_response
Shell(spider.crawler).start(response=response)
File "//anaconda/lib/python2.7/site-packages/scrapy/shell.py", line 81, in start
banner=self.vars.pop('banner', ''))
File "//anaconda/lib/python2.7/site-packages/scrapy/utils/console.py", line 82, in start_python_console
shell(namespace=namespace, banner=banner)
File "//anaconda/lib/python2.7/site-packages/scrapy/utils/console.py", line 22, in wrapper
banner1=banner, user_ns=namespace, config=config)
File "//anaconda/lib/python2.7/site-packages/traitlets/config/configurable.py", line 416, in instance
'%s are being created.' % cls.__name__
MultipleInstanceError: Multiple incompatible subclass instances of InteractiveShellEmbed are being created.
2017-04-22 10:30:47 [root] DEBUG: Using default logger
2017-04-22 10:30:47 [root] DEBUG: Using default logger
2017-04-22 10:30:47 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.ticketfly.com/purchase/event/1457966/tfly> (referer: https://www.ticketfly.com/venue/7337-output/)
Traceback (most recent call last):
File "//anaconda/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/me/test/scrapy/test/test/spiders/my_spider.py", line 37, in parse_url
inspect_response(response, self)
File "//anaconda/lib/python2.7/site-packages/scrapy/shell.py", line 167, in inspect_response
Shell(spider.crawler).start(response=response)
File "//anaconda/lib/python2.7/site-packages/scrapy/shell.py", line 81, in start
banner=self.vars.pop('banner', ''))
File "//anaconda/lib/python2.7/site-packages/scrapy/utils/console.py", line 82, in start_python_console
shell(namespace=namespace, banner=banner)
File "//anaconda/lib/python2.7/site-packages/scrapy/utils/console.py", line 22, in wrapper
banner1=banner, user_ns=namespace, config=config)
File "//anaconda/lib/python2.7/site-packages/traitlets/config/configurable.py", line 416, in instance
'%s are being created.' % cls.__name__
MultipleInstanceError: Multiple incompatible subclass instances of InteractiveShellEmbed are being created.

scrapy how to set JOBDIR from the code not cmd

When I run my spider from the terminal like this:
scrapy crawl GeneralSpider --set JOBDIR=will
everything works fine and I can see the JOBDIR folder.
However, when I try to set it programmatically, like this:
from scrapy.utils.project import get_project_settings
myS = get_project_settings()
myS.set(myS, 'JOBDIR', "folder")
I get the following error:
Unhandled error in Deferred:
CRITICAL:twisted:Unhandled error in Deferred:
2016-02-15 17:49:35 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 57, in run
self.crawler_process.crawl(spname, **opts.spargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/crawler.py", line 153, in crawl
d = crawler.crawl(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py", line 1274, in unwindGenerator
return _inlineCallbacks(None, gen, Deferred())
--- <exception caught here> ---
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py", line 1128, in _inlineCallbacks
result = g.send(result)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/crawler.py", line 70, in crawl
self.spider = self._create_spider(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in _create_spider
return self.spidercls.from_crawler(self, *args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spiders/__init__.py", line 50, in from_crawler
spider = cls(*args, **kwargs)
File "/bla bla bla spider.py", line 47, in __init__
myS.set(myS, 'JOBDIR', "myfolder")
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/settings/__init__.py", line 94, in set
priority = SETTINGS_PRIORITIES[priority]
exceptions.KeyError: 'myfolder'
CRITICAL:twisted:
2016-02-15 17:49:35 [twisted] CRITICAL:
The Settings API set method has this signature:
set(name, value, priority='project')
You're passing a settings object as the setting name, and "myfolder" as the priority, which is not expected.
Try doing this instead:
from scrapy.utils.project import get_project_settings
...
myS = get_project_settings()
myS.set('JOBDIR', "folder")

Scrapy dmoz tutorial: __init__() takes at most 2 arguments (3 given)

PS C:\users\steve\tutorial> scrapy crawl dmoz
Traceback (most recent call last):
File "c:\python27\scripts\scrapy-script.py", line 9, in <module>
load_entry_point('scrapy==1.0.3', 'console_scripts', 'scrapy')()
File "C:\Python27\lib\site-packages\scrapy-1.0.3-py2.7.egg\scrapy\cmdline.py",
cmd.crawler_process = CrawlerProcess(settings)
File "C:\Python27\lib\site-packages\scrapy-1.0.3-py2.7.egg\scrapy\crawler.py",
super(CrawlerProcess, self).__init__(settings)
File "C:\Python27\lib\site-packages\scrapy-1.0.3-py2.7.egg\scrapy\crawler.py",
self.spider_loader = _get_spider_loader(settings)
File "C:\Python27\lib\site-packages\scrapy-1.0.3-py2.7.egg\scrapy\crawler.py",
return loader_cls.from_settings(settings.frozencopy())
File "C:\Python27\lib\site-packages\scrapy-1.0.3-py2.7.egg\scrapy\spiderloader.
return cls(settings)
File "C:\Python27\lib\site-packages\scrapy-1.0.3-py2.7.egg\scrapy\spiderloader.
for module in walk_modules(name):
File "C:\Python27\lib\site-packages\scrapy-1.0.3-py2.7.egg\scrapy\utils\misc.py
submod = import_module(fullpath)
File "C:\Python27\lib\importlib\__init__.py", line 37, in import_module
__import__(name)
File "C:\users\steve\tutorial\tutorial\spiders\dmoz.py", line 4, in <module>
class dmozspider(spiders):
TypeError: Error when calling the metaclass bases module.__init__() takes at most 2 arguments (3 given)
My dmoz spider Python script is here:
from scrapy import spiders

class dmozspider(spiders):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        filename = response.url.split("/")[-2] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
The problem is that you're importing "spiders" and using it as your base class. "spiders" is the module that contains the spider classes, including the Spider class itself. Import the class instead:
from scrapy.spiders import Spider

class dmozspider(Spider):
    ...  # Rest of your code
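Putting that fix into the spider from the question, the corrected file would look roughly like this (only the import and the base class change):

from scrapy.spiders import Spider

class dmozspider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        # save each listing page to a local HTML file
        filename = response.url.split("/")[-2] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)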

Attribute 'type': The QName value '{http://www.w3.org/2001/XMLSchema}EmailString' does not resolve to a(n) type definition., line 4

I want to extend the spyne Unicode field with a regex to ensure it is a valid e-mail format. But even when copy-pasting the basic example from the spyne documentation (http://spyne.io/docs/2.10/manual/03_types.html), I get the above error (see title) when visiting localhost/my-url-endpoint?wsdl.
I use Django 1.6 and Spyne 2.10.10 on Windows 8 64-bit.
Any suggestion why it fails?
The code:
from django.views.decorators.csrf import csrf_exempt
from spyne.protocol.soap import Soap11
from spyne.interface import Wsdl11
from spyne.service import ServiceBase
from spyne.decorator import srpc, rpc
from spyne.model.primitive import Unicode, Integer, Mandatory
from spyne.model.complex import Iterable
from spyne.application import Application
from spyne.server.django import DjangoApplication

class EmailString(Unicode):
    __type_name__ = 'EmailString'

    class Attributes(Unicode.Attributes):
        max_length = 128
        pattern = '[^@]+@[^@]+'

class MyService(ServiceBase):
    @rpc(EmailString, _returns=Unicode)
    def my_function(ctx, my_email):
        return "Your email is %s" % my_email

application = Application(
    [
        MyService
    ],
    tns="http://tempuri.org",
    interface=Wsdl11(),
    in_protocol=Soap11(validator='lxml'),
    out_protocol=Soap11()
)

myServiceApp = csrf_exempt(DjangoApplication(application))
myServiceApp is then pointed to in urls.py:
urlpatterns += patterns('',
    (r'^my-url-endpoint$', 'myapp.views.myServiceApp'),
)
Stack trace:
Internal Server Error: /en/wsCRMService
Traceback (most recent call last):
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\django-1.6.1-py2.7.egg\django\core\handlers\base.py", line 101, in get_response
resolver_match = resolver.resolve(request.path_info)
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\django-1.6.1-py2.7.egg\django\core\urlresolvers.py", line 320, in resolve
sub_match = pattern.resolve(new_path)
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\django-1.6.1-py2.7.egg\django\core\urlresolvers.py", line 320, in resolve
sub_match = pattern.resolve(new_path)
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\django-1.6.1-py2.7.egg\django\core\urlresolvers.py", line 320, in resolve
sub_match = pattern.resolve(new_path)
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\django-1.6.1-py2.7.egg\django\core\urlresolvers.py", line 222, in resolve
return ResolverMatch(self.callback, args, kwargs, self.name)
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\django-1.6.1-py2.7.egg\django\core\urlresolvers.py", line 229, in callback
self._callback = get_callable(self._callback_str)
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\django-1.6.1-py2.7.egg\django\utils\functional.py", line 32, in wrapper
result = func(*args)
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\django-1.6.1-py2.7.egg\django\core\urlresolvers.py", line 96, in get_callable
mod = import_module(mod_name)
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\django-1.6.1-py2.7.egg\django\utils\importlib.py", line 40, in import_module
__import__(name)
File "E:\my_project\myapp\views.py", line 146, in <module>
out_protocol=Soap11()
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\spyne\application.py", line 104, in __init__
self.in_protocol.set_app(self)
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\spyne\protocol\xml\_base.py", line 413, in set_app
xml_schema.build_validation_schema()
File "C:\Users\Anze\Virtual Environments\my_project\lib\site-packages\spyne\interface\xml_schema\_base.py", line 189, in build_validation_schema
self.validation_schema = etree.XMLSchema(etree.parse(f))
File "xmlschema.pxi", line 102, in lxml.etree.XMLSchema.__init__ (src\lxml\lxml.etree.c:154067)
XMLSchemaParseError: element decl. '{http://tempuri.org}my_email', attribute 'type': The QName value '{http://www.w3.org/2001/XMLSchema}EmailString' does not resolve to a(n) type definition., line 4
Please help.
It will work if you do:
EmailString = Unicode(128, pattern='[^@]+@[^@]+', type_name="EmailStringType")
That pattern is just an example though, you can find better ones out there.
After hours of research I found out that it also works if you do:
class EmailString(Unicode(pattern='[^@]+@[^@]+')):
    __namespace__ = 'tempuri.org'
    __type_name__ = 'EmailString'
It seems like the meta class Attributes of EmailString should not be overwritten; then it works. Instead, you must put your customization in the constructor of the extended class (Unicode in this case).
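Putting the first workaround together with the service from the question gives a sketch like this (the pattern is still only an illustrative example):

from spyne.decorator import rpc
from spyne.model.primitive import Unicode
from spyne.service import ServiceBase

# customize via the Unicode constructor instead of subclassing Attributes
EmailString = Unicode(128, pattern='[^@]+@[^@]+', type_name="EmailStringType")

class MyService(ServiceBase):
    @rpc(EmailString, _returns=Unicode)
    def my_function(ctx, my_email):
        return "Your email is %s" % my_email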