I've tried everything but I can't seem to figure it out how I can call the next page in the parse_category.
I've tried LinkExtractor as I do when I go directly to a catergory page but this didn't work.
import scrapy.selector
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from msh_final.items import CrawlerMshFinalItem
def complete_url(string):
return "http://www.mediamarkt.be" + string
def get_base_url(url):
if url != "":
u = urlparse.urlparse(url)
return "%s://%s" % (u.scheme, u.netloc)
else:
return ""
def encode(str):
return str.encode('utf8', 'ignore')
class msh_finalSpider(CrawlSpider):
name = 'msh_final'
start_urls = ['http://www.mediamarkt.be/mcs/productlist/_Telefoon-Navigatie,98952,509451.html?langId=-17']
def parse(self, response):
items = response.xpath('//ul[#class="infield cf"]//div[#class="infield-wrapper"]/h2/a/#href')
for item in items:
link = item.extract()
yield Request(complete_url(link), callback=self.parse_category)
def parse_category(self, response):
items = response.xpath("//ul[#class='products-list']/li/div")
for item in items:
msh = CrawlerMshFinalItem()
msh['item_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0])
msh['item_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
yield msh
You should inherite your spider from Spider instead of CrawlSpider and use following code:
class msh_finalSpider(Spider):
name = 'msh_final'
start_urls = ['http://www.mediamarkt.be/mcs/productlist/_Telefoon-Navigatie,98952,509451.html?langId=-17']
def parse(self, response):
items = response.xpath('//ul[#class="infield cf"]//div[#class="infield-wrapper"]/h2/a/#href')
for item in items:
link = item.extract()
yield Request(complete_url(link), callback=self.parse_category)
def parse_category(self, response):
items = response.xpath("//ul[#class='products-list']/li/div")
for item in items:
msh = CrawlerMshFinalItem()
msh['item_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0])
msh['item_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
yield msh
new_link = response.xpath('//li[#class="pagination-next"]/a/#href').extract()[0]
yield Request(
complete_url(new_link),
callback=self.parse_category
)
i have different adresse for crawling, i want to put them in an array and crawl all of them.
i tried this:
class Myclass(CrawlSpider):
reload(sys)
pageNumber = 0
cmt = 0
sys.setdefaultencoding('utf8')
name = 'myclass'
allowed_domains = ["amazon.fr"]
firstPage = True
rules = [
Rule(LinkExtractor(restrict_xpaths=('//div[#id="mainResults"]//h3[#class="newaps"]/a',)),
callback='parse_page1', follow=True),
Rule(LinkExtractor(restrict_xpaths=('//div[#id="bottomBar"]/div[#id="pagn"]/span[#class="pagnLink"]/a',)),
follow=True),
Rule(LinkExtractor(restrict_xpaths=(
'//div[#class="s-item-container"]//a[#class="a-link-normal s-access-detail-page a-text-normal"]',)),
callback='parse_page', follow=True),
]
arrayCategories = []
pageCrawled = []
fileNumbers = 0
first = 0
start_urls = ['https://www.amazon.fr/s/ref=sr_nr_p_6_0?fst=as%3Aoff&rh=n%3A197861031%2Cn%3A!197862031%2Cn%3A212130031%2Cn%3A3008171031%2Cp_76%3A211708031%2Cp_6%3AA1X6FK5RDHNB96&bbn=3008171031&ie=UTF8&qid=1463074601&rnid=211045031'
,'https://www.amazon.fr/s/ref=sr_nr_p_6_0?fst=as%3Aoff&rh=n%3A197861031%2Cn%3A!197862031%2Cn%3A212130031%2Cn%3A3008171031%2Cp_76%3A211708031%2Cp_6%3AA1X6FK5RDHNB96&bbn=3008171031&ie=UTF8&qid=1463074601&rnid=211045031',
'https://www.amazon.fr/s/ref=sr_nr_n_1/275-0316831-3563928?fst=as%3Aoff&rh=n%3A197861031%2Cn%3A%21197862031%2Cn%3A212130031%2Cn%3A3008171031%2Cp_76%3A211708031%2Cp_6%3AA1X6FK5RDHNB96%2Cn%3A212136031&bbn=3008171031&ie=UTF8&qid=1463075247&rnid=3008171031',
]
def __init__(self, idcrawl=None, iddrive=None, idrobot=None, proxy=None, *args, **kwargs):
super(Myclass, self).__init__(*args, **kwargs)
def start_requests(self):
for i in range (0, len(self.start_urls)):
yield Request(self.start_urls[i], callback=self.parse)
def parse(self, response):
yield Request(response.url, callback = self.parse_produit)
hxs = HtmlXPathSelector(response)
try:
nextPageLink = hxs.select("//a[#id='pagnNextLink']/#href").extract()[0]
nextPageLink = urlparse.urljoin(response.url, nextPageLink)
self.log('\nGoing to next search page: ' + nextPageLink + '\n', log.DEBUG)
yield Request(nextPageLink, callback=self.parse)
except:
self.log('Whole category parsed: ', log.DEBUG)
def parse_produit(self,response):
print self.pageNumber
body = response.css('body').extract_first()
hxs = HtmlXPathSelector(response)
body = response.css('body').extract_first()
f = io.open('./amazon/page%s' % str(self.pageNumber), 'w+', encoding='utf-8')
f.write(body)
f.close()
self.pageNumber = self.pageNumber + 1
I Have 2 problems,
the First one, i can't crawl 3 urls,
and the second one i can't call parse_produit
what's wrong with my code? why i have not the result of " print self.pageNumber" in my consol?
Non-coder here. I have an activity I'm editing for beginning students. It was created a year ago by someone else. The project is pre-created for students. They are supposed deploy it, create a bucket, and upload some files to the bucket from within the app. When I try it, I get this error:
Traceback (most recent call last):
File "/base/data/home/runtimes/python27/python27_lib/versions/third_party/webapp2-2.5.2/webapp2.py", line 1535, in __call__
rv = self.handle_exception(request, response, e)
File "/base/data/home/runtimes/python27/python27_lib/versions/third_party/webapp2-2.5.2/webapp2.py", line 1529, in __call__
rv = self.router.dispatch(request, response)
File "/base/data/home/runtimes/python27/python27_lib/versions/third_party/webapp2-2.5.2/webapp2.py", line 1278, in default_dispatcher
return route.handler_adapter(request, response)
File "/base/data/home/runtimes/python27/python27_lib/versions/third_party/webapp2-2.5.2/webapp2.py", line 1102, in __call__
return handler.dispatch()
File "/base/data/home/runtimes/python27/python27_lib/versions/third_party/webapp2-2.5.2/webapp2.py", line 572, in dispatch
return self.handle_exception(e, self.app.debug)
File "/base/data/home/runtimes/python27/python27_lib/versions/third_party/webapp2-2.5.2/webapp2.py", line 570, in dispatch
return method(*args, **kwargs)
File "/base/data/home/apps/s~lively-armor-126415/1.391710117126333360/main.py", line 205, in get
for imagefile in gcs.listbucket(bucket_path(), delimiter='/'):
File "/base/data/home/apps/s~lively-armor-126415/1.391710117126333360/main.py", line 114, in bucket_path
return '/' + bucket_name + '/'
TypeError: cannot concatenate 'str' and 'NoneType' objects
Here is (part of) main.py:
#!/usr/bin/env python
import webapp2
import sys
import os
import logging
import urllib
import zipfile
import StringIO
import jinja2
import datetime
import mimetypes
import json
from google.appengine.api import users
from google.appengine.api import mail
from google.appengine.api import xmpp
from google.appengine.api import channel
from google.appengine.api import app_identity
from google.appengine.api import images
from google.appengine.api import memcache
from google.appengine.api import taskqueue
from google.appengine.api import search
from google.appengine.ext import ndb
from google.appengine.datastore.datastore_query import Cursor
sys.path.insert(0, 'libs')
libpath = os.path.join(os.path.dirname(__file__), 'lib')
sys.path.append(libpath)
from wtforms import Form
from wtforms import StringField,TextAreaField,SelectField,DecimalField
from wtforms import FileField
from wtforms import SubmitField
from wtforms import validators
import cloudstorage as gcs
from datamodels import Product, ProductCategory
JINJA_ENV = jinja2.Environment(
loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
extensions=['jinja2.ext.autoescape'],
autoescape=True
)
# Add custom filter for currency output in JINJA2
def currencyformat(value):
template = "${:.2f}"
currency_string = template.format(value)
return currency_string
JINJA_ENV.filters['currencyformat'] = currencyformat
PRODUCT_GROUPS = [
('1','Bathroom'),
('2','Decor'),
('3','Lumber'),
('4','Materials'),
('5','Outdoors'),
('6','Tools')]
def login_html():
# Load differently based on whether logged in to Google account
user = users.get_current_user()
if user:
url = users.create_logout_url('/')
username = user.nickname()
else:
url = users.create_login_url('/')
username = ''
template_values = {
'url': url,
'username': username
}
greeting_template = JINJA_ENV.get_template('html/greeting.htm')
greeting_html = greeting_template.render(template_values)
return greeting_html
def store_product(prodcode, title, price, category, description):
logging.info('Add product %s to category %s in database', title, category)
category_key = ndb.Key(ProductCategory, category)
product = Product(
parent=category_key,
id=prodcode,
title=title,
price=price,
category=category,
desc=description
)
product.put()
try:
# Create a searchable document to use with Search API
document = search.Document(
doc_id = prodcode,
fields=[
search.TextField(name='title', value=title),
search.TextField(name='category', value=category),
search.HtmlField(name='desc', value=description),
search.NumberField(name='price', value=float(price)),
])
index = search.Index(name="ProductIndex")
index.put(document)
except:
logging.exception("Unable to store search document for " + prodcode)
def file_extension(filename):
return os.path.splitext(filename)[-1]
def bucket_path():
bucket_name = app_identity.get_default_gcs_bucket_name()
return '/' + bucket_name + '/'
class EditProductForm(Form):
# Test and message for currency format
cur_regex = '^\s*(?=.*[1-9])\d*(?:\.\d{1,2})?\s*$'
pricemsg = 'Enter a price with up to two decimal places (no dollar symbol)'
prodcode = StringField(
'* Product Code:',
[validators.Length(min=1, max=10)])
price = StringField(
'* Product Price:',
[validators.Regexp(cur_regex, message=pricemsg)])
title = StringField(
'* Product Title:',
[validators.Length(min=1, max=500)])
category = SelectField(
'* Product Group:',
choices=PRODUCT_GROUPS,
default='Hardware')
description = TextAreaField(
'* Product Description:',
[validators.Required()])
submitbtn = SubmitField('Save Product')
class EditImagesForm(Form):
image = FileField('File to Upload:')
submitbtn = SubmitField('Upload')
class BucketImageHandler(webapp2.RequestHandler):
# Return image from cloud storage
def get(self, image_file):
self.response.headers['Content-Type'] = 'image/png'
# Get complete file name
filename = bucket_path() + image_file
cache_name = 'productimages:{}'.format(image_file)
# Get image data from memcache
filedata = memcache.get(cache_name)
if filedata is None:
try:
# Get image from cloud storage
gcs_file = gcs.open(filename)
filedata = gcs_file.read()
memcache.add(cache_name, filedata, 3600)
except:
# Get placeholder image from static images
self.redirect('/images/image_placeholder.png')
self.response.out.write(filedata)
class UploadHandler(webapp2.RequestHandler):
# Display upload page
def get(self):
# Allow only for admin users
if users.is_current_user_admin():
# Delete image if one is passed in
# (in finished site, add a prompt to confirm)
image_filename = self.request.get('del')
if image_filename != '':
datastore_filename = bucket_path() + image_filename
logging.info('>>> DELETED FILE %s', image_filename)
try:
gcs.delete(datastore_filename)
except:
pass
# Gather image data to pass in to HTML template
MAX_IMAGES = 10
image_count = 0
reached_end = True
last_image = 1
start = self.request.get('s')
if start is '':
first_image = 1
else:
first_image = int(start)
if first_image < 1:
first_image = 1
# Get images from Cloud Storage
image_gallery = []
for imagefile in gcs.listbucket(bucket_path(), delimiter='/'):
image_count += 1
reached_first_image = (image_count >= first_image)
reached_last_image = (image_count >= first_image + MAX_IMAGES)
if reached_first_image and not reached_last_image:
# Files to show for this page
filename = imagefile.filename.split('/')[-1]
if file_extension(filename) == '.png':
this_image = dict(
name=filename,
size=imagefile.st_size,
safename=urllib.quote_plus(filename)
)
image_gallery.append(this_image)
last_image = image_count
back_start_index = first_image - MAX_IMAGES
next_start_index = last_image + 1
# Prepare image edit form for HTML template
new_images_form = EditImagesForm()
# Populate batch upload page
template_values = {
'admin_mode': users.is_current_user_admin(),
'greeting_html': login_html(),
'editform': new_images_form,
'gallery': image_gallery,
'start_image_index': first_image,
'end_image_index': last_image,
'image_count': image_count,
'back_start_index': back_start_index,
'next_start_index': next_start_index
}
image_mgr_template = JINJA_ENV.get_template('html/uploadmgr.htm')
image_mgr_html = image_mgr_template.render(template_values)
self.response.write(image_mgr_html)
else:
# Unauthorized user - raise an error
self.abort(401)
# Post new image or batch update to the gallery
def post(self):
# Allow batch upload only for admin users
if users.is_current_user_admin():
file_data = self.request.get('image')
upload_filename = ''
try:
upload_filename = os.path.basename(self.request.POST['image'].filename)
except:
logging.info('NO FILE SPECIFIED')
self.redirect('/upload')
upload_file_extension = file_extension(upload_filename)
datastore_filename = bucket_path() + upload_filename
logging.info('Store file to %s', datastore_filename)
if upload_file_extension == '.png':
# Write image to cloud storage
if len(file_data) > 0:
gcs_file = gcs.open(
datastore_filename,
'w',content_type='image/png')
file_data = images.resize(file_data, 400, 400)
gcs_file.write(file_data)
gcs_file.close()
# Upload done -- return to gallery
self.redirect('/upload')
elif upload_file_extension == '.zip':
# Save uploaded Zip file to Google Cloud Storage
gcs_file = gcs.open(
datastore_filename,
'w',content_type='application/zip')
gcs_file.write(file_data)
gcs_file.close()
logging.info('>>> STORED ZIP FILE %s', datastore_filename)
# Start background task to extract the Zip file
client_id = 'bgmsg-' + users.get_current_user().user_id()
email_address = users.get_current_user().email()
taskqueue.add(
url='/processuploads',
method="POST",
params={'zipfile': datastore_filename,
'address': email_address,
'clientid': client_id,
'starttime': datetime.datetime.now() }
)
# Upload done -- return to gallery
self.redirect('/upload')
else:
# Ignore other file types
self.redirect('/upload')
else:
# Unauthorized user - raise an error
self.abort(401)
class BatchProcessBackgroundHandler(webapp2.RequestHandler):
def post(self):
# Task queue handler - Extract and process uploaded Zip file
# Check header to ensure request came from inside App Engine platform
if 'X-AppEngine-TaskName' in self.request.headers:
zip_file_name = self.request.get('zipfile')
address = self.request.get('address')
client_id = self.request.get('clientid')
start_time = self.request.get('starttime')
# logging.info('>>> EXTRACTING ZIP FILE %s', zip_file_name)
# Get zip data from cloud storage
gcs_file = gcs.open(zip_file_name)
gcs_data = gcs_file.read()
zip_data = StringIO.StringIO(gcs_data)
# Open the archive for reading
zip_file = zipfile.ZipFile(zip_data, 'r')
# Extract each file in the archive and process based on extension
for extracted_file_name in zip_file.namelist():
extracted_file_extension = file_extension(extracted_file_name)
if extracted_file_extension == '.png':
# Read Zip file data as StringIO
extracted_image_data = zip_file.read(extracted_file_name)
# Resize images no wider or taller than 400 pixels
extracted_image_data = images.resize(
extracted_image_data,
400,
400)
datastore_filename = bucket_path() + extracted_file_name
gcs_file = gcs.open(
datastore_filename,
'w',
content_type='image/png')
gcs_file.write(extracted_image_data)
gcs_file.close()
elif extracted_file_extension == '.txt':
extracted_data = zip_file.read(extracted_file_name)
lines = extracted_data.split('\r\n')
for line in lines:
if line:
line_values = line.split('\t')
category = line_values[0]
prodcode = line_values[1]
title = line_values[2]
price = line_values[3]
description = line_values[4]
store_product(
prodcode,
title,
price,
category,
description)
# Close the Zip file
zip_file.close()
# Delete the Zip file when done
gcs.delete(zip_file_name)
# Compose success message
notify_title = 'Batch Update Successfully Completed'
message_body = 'Batch file ' + zip_file_name + '\n'
message_body += 'Started at ' + start_time + '\n'
message_body += 'Finished at ' + str(datetime.datetime.now()) + '\n'
message_body += 'Refresh your browser to see the product updates.\n'
# Send message by email
mail.send_mail(
sender = 'WW Admin <admin#wwheelhouse.com>',
to = address,
subject = notify_title,
body = message_body
)
# Send message by XMPP
user_address = address
chat_message_sent = False
msg = message_body
status_code = xmpp.send_message(user_address, msg)
chat_message_sent = (status_code == xmpp.NO_ERROR)
# Send message to web client via channel API
channel.send_message(
client_id,
msg
)
else:
# Report forbidden operation
self.error(403)
class DeleteProductHandler(webapp2.RequestHandler):
def get(self):
if users.is_current_user_admin():
# Get product code from query string passed in to page
prodcode = self.request.get('edit')
category = self.request.get('cat')
logging.info('>>> GET prodcode=%s and cat=%s', prodcode, category)
try:
# Get product from the datastore
parent_key = ndb.Key('ProductCategory', category)
product = Product.get_by_id(prodcode, parent=parent_key)
# Delete the entity
product.key.delete()
except:
pass
# Redirect back to main product view
self.redirect('/?cat=' + category)
else:
# Report forbidden operation
self.error(403)
class SearchHandler(webapp2.RequestHandler):
def post(self):
# Process search
search_text = self.request.get('q')
search_category = self.request.get('scat')
query_string = "title:" + search_text
query_string += " OR desc:" + search_text
if search_category != '' and search_category != '0':
query_string += " AND category=" + search_category
found_products = ''
num_found = 0
if search_text != '':
index = search.Index(name="ProductIndex")
found_products = index.search(query_string)
num_found = found_products.number_found
# Populate search results page
template_values = {
'admin_mode': users.is_current_user_admin(),
'greeting_html': login_html(),
'prod_categories': PRODUCT_GROUPS,
'selected': search_category,
'search_text': search_text,
'product_list': found_products,
'num_found': num_found
}
results_template = JINJA_ENV.get_template('html/searchresults.htm')
search_html = results_template.render(template_values)
self.response.write(search_html)
class MainHandler(webapp2.RequestHandler):
def get(self):
editcode = self.request.get('edit')
prod_category = self.request.get('cat',default_value='0')
in_edit = (prod_category and editcode)
if in_edit:
product = Product.query_get_product(prod_category, editcode)
new_product_form = EditProductForm(
prodcode=editcode,
title=product.title,
price=product.price,
category=prod_category,
description=product.desc
)
else:
# Produce empty product editing form
new_product_form = EditProductForm()
self.response.write(self.catalog_html(new_product_form))
# logging.info("ENVIRONMENT: %s", os.environ)
def post(self):
if users.is_current_user_admin():
# Get data submitted in form and validate user input
prodcode = self.request.get('prodcode')
title = self.request.get('title')
price = self.request.get('price')
category = self.request.get('category')
description = self.request.get('description')
new_product_form = EditProductForm(
prodcode=prodcode,
title=title,
price=price,
category=category,
description=description
)
if new_product_form.validate():
store_product(prodcode, title, price, category, description)
self.redirect('/?cat='+category+'&viewproduct='+prodcode)
else:
html = self.catalog_html(new_product_form)
self.response.write(html)
else:
# Unauthorized user -- raise an error
self.abort(401)
def catalog_html(self, editform):
""" Return HTML for the product catalog """
PRODUCTS_PER_PAGE = 4
viewcode = self.request.get('viewproduct')
editcode = self.request.get('edit')
category = self.request.get('cat', default_value='0')
in_edit = (category and editcode) # Show Edit mode only if category and editcode provided
in_one_product_view = viewcode != ''
# If one product view or in edit, show single product
if in_one_product_view or in_edit:
# RETURN SINGLE PRODUCT VIEW
if in_edit:
# Query to get the product specified for editing
product = Product.query_get_product(category, editcode)
else:
# Query to get the product specified for viewing
product = Product.query_get_product(category, viewcode)
# Populate catalog page
template_values = {
'admin_mode': users.is_current_user_admin(),
'greeting_html': login_html(),
'prod_categories': PRODUCT_GROUPS,
'selected': category,
'product': product,
'editform': editform
}
one_product_template = JINJA_ENV.get_template('html/oneproduct.htm')
one_product_html = one_product_template.render(template_values)
return one_product_html
else:
# MULTIPLE PRODUCT VIEW
if category == '0':
# Show all products in all categories
q_forward = Product.query_all_categories_sort_newest()
q_backward = Product.query_all_categories_sort_oldest()
else:
# Show products in one category
q_forward = Product.query_by_category_sort_newest(category)
q_backward = Product.query_by_category_sort_oldest(category)
page_nav = ''
products = None
num_products_in_query = q_forward.count()
num_pages_to_show = num_products_in_query / PRODUCTS_PER_PAGE
if (num_products_in_query % PRODUCTS_PER_PAGE) > 0:
num_pages_to_show += 1
cursor_was_passed_in = self.request.get('cursor') is not ''
page_num = self.request.get('pg',default_value='1')
prev_cur = ''
next_cur = ''
if num_products_in_query > 0:
# Read the cursor passed in from the previous page
cursor = Cursor(urlsafe=self.request.get('cursor'))
# Fetch a forward cursor
products, next_cursor, more_after = q_forward.fetch_page(
PRODUCTS_PER_PAGE,
start_cursor=cursor )
# Fetch a backward cursor
prev_products, prev_cursor, more_before = q_backward.fetch_page(
PRODUCTS_PER_PAGE,
start_cursor=cursor.reversed() )
if cursor_was_passed_in and prev_cursor:
prev_cur = prev_cursor.urlsafe()
if more_after and next_cursor:
next_cur = next_cursor.urlsafe()
# Populate catalog page
template_values = {
'admin_mode': users.is_current_user_admin(),
'greeting_html': login_html(),
'catalog': products,
'prod_categories': PRODUCT_GROUPS,
'selected': category,
'editform': editform,
'prev_cur': prev_cur,
'next_cur': next_cur,
'page_num': page_num,
'page_num_prev': int(page_num)-1,
'page_num_next': int(page_num)+1,
'num_pages_to_show': num_pages_to_show
}
catalog_template=JINJA_ENV.get_template('html/catalog.htm')
return catalog_template.render(template_values)
...
Please help. I don't have a clue how to fix this, but I need to get this project working for the students to use.
Thanks so much,
Chrys
That's because there's no default bucket attached to your current project.
You can create a bucket with gsutil or the cloud console and then hardcode the value in your bucket_path function.
Also you can keep your current bucket_path helper function that returns the bucket_name from app_identity.get_default_gcs_bucket_name() and then create a default bucket in the new console by visiting your app engine's application settings in the cloud console https://console.cloud.google.com/appengine/settings?project=
I am using Scrapy for a project, in this project I am extracting the information from the xml.
In the xml document the format where I would like to implement the for loop:
<relatedPersonsList>
<relatedPersonInfo>...</relatedPersonInfo>
<relatedPersonInfo>
<relatedPersonName>
<firstName>Mark</firstName>
<middleName>E.</middleName>
<lastName>Lucas</lastName>
</relatedPersonName>
<relatedPersonAddress>
<street1>1 IMATION WAY</street1>
<city>OAKDALE</city>
<stateOrCountry>MN</stateOrCountry>
<stateOrCountryDescription>MINNESOTA</stateOrCountryDescription>
<zipCode>55128</zipCode>
</relatedPersonAddress>
<relatedPersonRelationshipList>
<relationship>Executive Officer</relationship>
<relationship>Director</relationship>
</relatedPersonRelationshipList>
<relationshipClarification/>
</relatedPersonInfo>
<relatedPersonInfo>...</relatedPersonInfo>
<relatedPersonInfo>...</relatedPersonInfo>
<relatedPersonInfo>...</relatedPersonInfo>
<relatedPersonInfo>...</relatedPersonInfo>
<relatedPersonInfo>...</relatedPersonInfo>
</relatedPersonsList>
As you can see in the <relatedPersonsList>, you can have multiple <relatedPersonInfo>, and when I try to make a for loop, I still only get the information of the first person.
This is my actual code:
for person in xxs.select('./relatedPersonsList/relatedPersonInfo'):
item = Myform() #even if get rid of it I get the same result
item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
item["middleName"] = person.select('./relatedPersonName/middleName/text()')
if item["middleName"]:
item["middleName"] = item["middleName"].extract()[0]
else:
item["middleName"] = "NA"
here is the code that I used on my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request
import urlparse
from formds.items import SecformD
class SecDform(CrawlSpider):
name = "DFORM"
allowed_domain = ["http://www..gov"]
start_urls = [
""
]
rules = (
Rule(
SgmlLinkExtractor(restrict_xpaths=["/html/body/div/table/tr/td[3]/a[2]"]),
callback='parse_formd',
#follow= True no need of follow thing
),
Rule(
SgmlLinkExtractor(restrict_xpaths=('/html/body/div/center[1]/a[contains(., "[NEXT]")]')),
follow=True
),
)
def parse_formd(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//*[#id="formDiv"]/div/table/tr[3]/td[3]/a/#href').extract()
for site in sites:
yield Request(url=urlparse.urljoin(response.url, site), callback=self.parse_xml_document)
def parse_xml_document(self, response):
xxs = XmlXPathSelector(response)
item = SecformD()
item["stateOrCountryDescription"] = xxs.select('./primaryIssuer/issuerAddress/stateOrCountryDescription/text()').extract()[0]
item["zipCode"] = xxs.select('./primaryIssuer/issuerAddress/zipCode/text()').extract()[0]
item["issuerPhoneNumber"] = xxs.select('./primaryIssuer/issuerPhoneNumber/text()').extract()[0]
for person in xxs.select('./relatedPersonsList//relatedPersonInfo'):
#item = SecDform()
item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
item["middleName"] = person.select('./relatedPersonName/middleName/text()')
if item["middleName"]:
item["middleName"] = item["middleName"].extract()[0]
else:
item["middleName"] = "NA"
return item
I extract the information to a .json file using this command:
scrapy crawl DFORM -o tes4.json -t json
Try something like this:
def parse_xml_document(self, response):
xxs = XmlXPathSelector(response)
items = []
# common field values
stateOrCountryDescription = xxs.select('./primaryIssuer/issuerAddress/stateOrCountryDescription/text()').extract()[0]
zipCode = xxs.select('./primaryIssuer/issuerAddress/zipCode/text()').extract()[0]
issuerPhoneNumber = xxs.select('./primaryIssuer/issuerPhoneNumber/text()').extract()[0]
for person in xxs.select('./relatedPersonsList//relatedPersonInfo'):
# instantiate one item per loop iteration
item = SecformD()
# save common parameters
item["stateOrCountryDescription"] = stateOrCountryDescription
item["zipCode"] = zipCode
item["issuerPhoneNumber"] = issuerPhoneNumber
item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
item["middleName"] = person.select('./relatedPersonName/middleName/text()')
if item["middleName"]:
item["middleName"] = item["middleName"].extract()[0]
else:
item["middleName"] = "NA"
items.append(item)
return items
Is there a different widget or argument that will allow django to only show/take the year and month input instead of year, month and day?
Currently using SelectDateWidget.
There's a snippet here, which sets the day to 1 (presuming you've got a DateField that this value will end up in, you'll need to get some kind of day).
The code is like this (just in case Django snippets disappears):
import datetime
import re
from django.forms.widgets import Widget, Select
from django.utils.dates import MONTHS
from django.utils.safestring import mark_safe
__all__ = ('MonthYearWidget',)
RE_DATE = re.compile(r'(\d{4})-(\d\d?)-(\d\d?)$')
class MonthYearWidget(Widget):
"""
A Widget that splits date input into two <select> boxes for month and year,
with 'day' defaulting to the first of the month.
Based on SelectDateWidget, in
django/trunk/django/forms/extras/widgets.py
"""
none_value = (0, '---')
month_field = '%s_month'
year_field = '%s_year'
def __init__(self, attrs=None, years=None, required=True):
# years is an optional list/tuple of years to use in the "year" select box.
self.attrs = attrs or {}
self.required = required
if years:
self.years = years
else:
this_year = datetime.date.today().year
self.years = range(this_year, this_year+10)
def render(self, name, value, attrs=None):
try:
year_val, month_val = value.year, value.month
except AttributeError:
year_val = month_val = None
if isinstance(value, basestring):
match = RE_DATE.match(value)
if match:
year_val, month_val, day_val = [int(v) for v in match.groups()]
output = []
if 'id' in self.attrs:
id_ = self.attrs['id']
else:
id_ = 'id_%s' % name
month_choices = MONTHS.items()
if not (self.required and value):
month_choices.append(self.none_value)
month_choices.sort()
local_attrs = self.build_attrs(id=self.month_field % id_)
s = Select(choices=month_choices)
select_html = s.render(self.month_field % name, month_val, local_attrs)
output.append(select_html)
year_choices = [(i, i) for i in self.years]
if not (self.required and value):
year_choices.insert(0, self.none_value)
local_attrs['id'] = self.year_field % id_
s = Select(choices=year_choices)
select_html = s.render(self.year_field % name, year_val, local_attrs)
output.append(select_html)
return mark_safe(u'\n'.join(output))
def id_for_label(self, id_):
return '%s_month' % id_
id_for_label = classmethod(id_for_label)
def value_from_datadict(self, data, files, name):
y = data.get(self.year_field % name)
m = data.get(self.month_field % name)
if y == m == "0":
return None
if y and m:
return '%s-%s-%s' % (y, m, 1)
return data.get(name, None)
A Python 3 widget sample here https://djangosnippets.org/snippets/10522/.
Example usage :
class myForm(forms.Form):
# ...
date = forms.DateField(
required=False,
widget=MonthYearWidget(years=xrange(2004,2010))
)
I came across the same problem today and solved it by removing the day field via a css property and setting 1 as value for the day on clean up.
#id_my_date_field_day-button {
display: none;
}
I used a ModelForm with an UpdateView and therefore had initial data in my fields which made life a bit simpler because I always had a valid value for the day of my_date_field.
I've written a simpler version (https://djangosnippets.org/snippets/10943/) inheriting from django built-in SelectDateWidget.
In widgets.py:
import calendar
import datetime
from django.forms.widgets import HiddenInput, SelectDateWidget
from django.utils import datetime_safe
from django.utils.formats import get_format
class MonthYearWidget(SelectDateWidget):
def __init__(self, last_day=False, *args, **kwargs):
self.last_day = last_day
return super().__init__(*args, **kwargs)
def get_context(self, name, value, attrs):
context = super().get_context(name, value, attrs)
day_name = self.day_field % name
day_subwidget = HiddenInput().get_context(
name=day_name,
value=1,
attrs={**context["widget"]["attrs"], "id": "id_%s" % day_name},
)
context["widget"]["subwidgets"][0] = day_subwidget["widget"]
return context
def value_from_datadict(self, data, files, name):
value = super().value_from_datadict(data, files, name)
if self.last_day is True:
y = data.get(self.year_field % name)
m = data.get(self.month_field % name)
if y is not None and m is not None:
input_format = get_format("DATE_INPUT_FORMATS")[0]
monthrange = calendar.monthrange(int(y), int(m))
date_value = datetime.date(int(y), int(m), monthrange[1])
date_value = datetime_safe.new_date(date_value)
return date_value.strftime(input_format)
return value
kwargs:
last_day : if set to True, returns the last day of the month in the generated date, otherwise returns a date starting from the 1st day of the month
Usage example:
# models.py
from django.db import models
from django.utils.translation import gettext_lazy as _
class MyModel(models.Model):
start = models.DateField(
_("Start date"),
)
end = models.DateField(
_("End date"),
)
class Meta:
verbose_name = _("My model")
# forms.py
from django import forms
from .models import MyModel
from .widgets import MonthYearWidget
class MyModelForm(forms.ModelForm):
class Meta:
model = MyModel
exclude = []
widgets = {
"start": MonthYearWidget(),
"end": MonthYearWidget(last_day=True),
}