How to access the values inside the 'files' field in scrapy - python-2.7

I have downloaded some files using the files pipeline and I want to get the values of the files field. I tried to print item['files'] and it gives me a KeyError. Why is this so and how can I do it?
class testspider2(CrawlSpider):
    name = 'genspider'
    URL = 'flu-card.com'
    URLhttp = 'http://www.flu-card.com'
    allowed_domains = [URL]
    start_urls = [URLhttp]
    rules = (
        [Rule(LxmlLinkExtractor(allow=(), restrict_xpaths=('//a'), unique=True), callback='parse_page', follow=True), ]
    )

    def parse_page(self, response):
        List = response.xpath('//a/@href').extract()
        item = GenericspiderItem()
        date = strftime("%Y-%m-%d %H:%M:%S")  # get date&time dd-mm-yyyy hh:mm:ss
        MD5hash = ''  # store as part of the item; some links crawled are not file links so they do not have values on these fields
        fileSize = ''
        newFilePath = ''
        File = open('c:/users/kevin123/desktop//ext.txt', 'a')
        for links in List:
            if re.search('http://www.flu-card.com', links) is None:
                responseurl = re.sub('\/$', '', response.url)
                url = urljoin(responseurl, links)
            else:
                url = links
            #File.write(url+'\n')
            filename = url.split('/')[-1]
            fileExt = ''.join(re.findall('.{3}$', filename))
            if fileExt != '':
                blackList = ['tml', 'pdf', 'com', 'php', 'aspx', 'xml', 'doc']
                for word in blackList:
                    if any(x in fileExt for x in blackList):
                        pass  # url is blacklisted
                    else:
                        item['filename'] = filename
                        item['URL'] = url
                        item['date'] = date
                        print item['files']
                        File.write(fileExt+'\n')
                        yield GenericspiderItem(
                            file_urls=[url]
                        )
                        yield item

It is not possible to access item['files'] in your spider. That is because the files are downloaded by the FilesPipeline, and items only reach pipelines after they leave your spider.
You first yield the item, then it gets to the FilesPipeline, then the files are downloaded, and only then is the files field populated with the info you want. To access it, you have to write a pipeline of your own and schedule it after the FilesPipeline. Inside that pipeline, you can access the files field.
Also note that, in your spider, you are yielding two different kinds of items!
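A minimal sketch of such a pipeline, assuming the default FILES_RESULT_FIELD ('files') and a project module called myproject (both placeholders, not from the question):
# pipelines.py -- scheduled after FilesPipeline, so item['files'] is already populated
class FilesInfoPipeline(object):
    def process_item(self, item, spider):
        # each entry is a dict like {'url': ..., 'path': ..., 'checksum': ...}
        for file_info in item.get('files', []):
            spider.logger.info('downloaded %s to %s' % (file_info['url'], file_info['path']))
        return item

# settings.py -- a higher number means the pipeline runs later
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
    'myproject.pipelines.FilesInfoPipeline': 300,
}
For this to work, the item class must also declare file_urls and files fields, which are where the FilesPipeline reads the URLs from and writes its results to.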

Related

Getting a variable from a field using arcpy

I am creating a toolbox tool using a Python script that creates maps based on user input. I have created a map template that gets saved and altered with Python. I am struggling with how to update some text in text boxes in the layout view using arcpy. I was able to do it with dynamic text and data driven pages, but I couldn't find any Python code to get data driven pages to refresh, so I decided to try to update the text with Python directly. With data driven pages, the dynamic text was pulling the text from an attribute table. I'm fairly new to Python, so I am struggling with how to pull values from a table to use as part of the text. I am able to update text as long as I have the variable defined somewhere else (not from a table), but the only method I found to pull data from a table was a search cursor, and that returns a list rather than a value, so I get an error. The feature classes with the text values I want only have one row in them, so it is a list of one. How can I convert that list to a value? I am only including the applicable parts of the script. I also removed the actual paths from the code.
import arcpy
import os
ID = arcpy.GetParameterAsText(1)
city = arcpy.GetParameterAsText(3)
WS = os.path.join('path to gdb', "WS")
dfield = 'name'
datefield = 'date'
cfield = "county"
#Use SearchCursor - these features only have one row, but these are my problem because they are lists
wsname = [row[0] for row in arcpy.da.SearchCursor(WS, dfield)]
wsdate = [row[0] for row in arcpy.da.SearchCursor(WS, datefield)]
county = [row[0] for row in arcpy.da.SearchCursor(overview, cfield)]
#update text
for elm in arcpy.mapping.ListLayoutElements(mxd, "TEXT_ELEMENT"):
    elm.text = elm.text.replace('WS', wsname)  # this doesn't work because wsname is a list
    elm.text = elm.text.replace('City', city)  # this works
    elm.text = elm.text.replace('text2', "words" + ID + " -more words")  # This works
    elm.text = elm.text.replace('Name', county)  # this doesn't work because county is a list
    elm.text = elm.text.replace('Date', wsdate)  # this doesn't work because wsdate is a list
arcpy.RefreshActiveView()
mxd.save()
This code will work when run from an ArcGIS toolbox script tool.
# define the aprx file and the layout in the project
import arcpy
aprx = arcpy.mp.ArcGISProject(r'path\to\the\arcgis\aprxfile.aprx')
# index [0] returns the first layout in the layout list, if there is more than one layout
aprxLayout = aprx.listLayouts()[0]
# get the attribute values to use for the layout text elements
fieldNames = ['FieldName1', 'FieldName2']
with arcpy.da.SearchCursor(r'path\to.gdb\featureclass', fieldNames) as sc:
    for row in sc:
        if row[0] is not None:
            field1Value = row[0]
        else:
            field1Value = 'Null'
        if row[1] is not None:
            field2Value = row[1]
        else:
            field2Value = 'Null'
# Assign the attribute values to the layout text elements
for textElem in aprxLayout.listElements('TEXT_ELEMENT'):
    if textElem.name == 'name of first layout text element in the element properties':
        textElem.text = field1Value
    if textElem.name == 'name of second layout text element in the element properties':
        textElem.text = field2Value
aprx.saveACopy(r'path/to/folder/projectname')
del aprx
I was able to tweak armedwiththeword's code to come up with this.
import arcpy
mxd = arcpy.mapping.MapDocument(path_to_mxd)
fieldNames = ['name', 'date']
with arcpy.da.SearchCursor(WS, fieldNames) as sc:
    for row in sc:
        if row[0] is not None:
            field1Value = row[0]
        if row[0] is None:
            field1Value = 'Null'
        if row[1] is not None:
            field2Value = row[1]
        if row[1] is None:
            field2Value = 'Null'
fieldName = ['CTY_NAME']
with arcpy.da.SearchCursor(overview, fieldName) as sc:
    for row in sc:
        if row[0] is not None:
            field3Value = row[0]
        if row[0] is None:
            field3Value = 'Null'
# Assign the attribute values to the layout text elements
for textElem in arcpy.mapping.ListLayoutElements(mxd, 'TEXT_ELEMENT'):
    if textElem.name == 'title':
        textElem.text = field1Value + " words"
    if textElem.name == 'subtitle':
        textElem.text = "WS -0" + ID + " -more words"
    if textElem.name == 'city':
        textElem.text = city
    if textElem.name == 'county':
        textElem.text = field3Value
    if textElem.name == 'date':
        textElem.text = field2Value
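Since each of these feature classes has only one row, another option (a sketch reusing the WS, dfield and datefield names from the question) is to take just the first row from the cursor, or index the single-item list, instead of passing a whole list to replace():
# grab the first (and only) row from the cursor and take its first field
wsname = next(arcpy.da.SearchCursor(WS, dfield))[0]
# or keep the list comprehension and index the one-item list
wsdate = [row[0] for row in arcpy.da.SearchCursor(WS, datefield)][0]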

Why does the render_template keep on showing the old value of the flask form?

I've been searching for an answer for hours. I apologise if I missed something.
I'm using the same form multiple times in order to add rows to my database.
Each time, I check an Excel file to pre-fill some of the WTForms StringFields with known information that the user may want to change.
The thing is: I change form.whatever.data, and when I print it, it shows the new value. But when I render the template, it keeps showing the old value.
I tried to do form.hours_estimate.data = "" before assigning it a new value just in case but it didn't work.
I will attach here the route I'm talking about. The important bit is after # Get form ready for next service. If there's more info needed please let me know.
Thank you very much.
@coordinator_bp.route("/coordinator/generate-order/<string:pev>", methods=['GET', 'POST'])
@login_required
def generate_order_services(pev):
    if not (current_user.is_coordinator or current_user.is_manager):
        return redirect(url_for('public.home'))
    # Get the excel URL
    f = open("./app/database/datafile", 'r')
    filepath = f.read()
    f.close()
    error = None
    if GenerateServicesForm().submit1.data and GenerateServicesForm().validate():
        # First screen submit (validate the data -> first Service introduction)
        form = FillServiceForm()
        next_service_row = get_next_service_row(filepath)
        if next_service_row is None:
            excel_info = excel_get_pev(filepath)
            error = "Excel error. Service code not found. If you get this error please report the exact way you did it."
            return render_template('coordinator/get_pev_form.html', form=GetPevForm(), error=error, info=excel_info)
        service_info = get_service_info(filepath, next_service_row)
        service_code = service_info[0]
        start_date = service_info[1]
        time_estimate = service_info[2]
        objects = AssemblyType.get_all()
        assembly_types = []
        for assembly_type in objects:
            assembly_types.append(assembly_type.type)
        form.service_code.data = service_code
        form.start_date.data = start_date
        form.hours_estimate.data = time_estimate
        return render_template('coordinator/fill_service_form.html', form=form, error=error, assembly_types=assembly_types)
    if FillServiceForm().submit2.data:
        if not FillServiceForm().validate():
            objects = AssemblyType.get_all()
            assembly_types = []
            for assembly_type in objects:
                assembly_types.append(assembly_type.type)
            return render_template('coordinator/fill_service_form.html', form=FillServiceForm(), error=error,
                                   assembly_types=assembly_types)
        # Service screen submits
        # Here we save the data of the last submit and ready the next one or end the generation process
        # Ready the form
        form = FillServiceForm()
        next_service_row = get_next_service_row(filepath)
        if next_service_row is None:
            excel_info = excel_get_pev(filepath)
            error = "Excel error. Service code not found. If you get this error please report the exact way you did it."
            return render_template('coordinator/get_pev_form.html', form=GetPevForm(), error=error, info=excel_info)
        service_info = get_service_info(filepath, next_service_row)
        service_code = service_info[0]
        form.service_code.data = service_code
        # create the service (this deletes the service code from the excel)
        service = create_service(form, filepath)
        if isinstance(service, str):
            return render_template('coordinator/fill_service_form.html', form=form, error=service)
        # Get next service
        next_service_row = get_next_service_row(filepath)
        if next_service_row is None:
            # This means there are no more services pending
            return "ALL DONE"
        else:
            # Get form ready for next service
            service_info = get_service_info(filepath, next_service_row)
            service_code = service_info[0]
            start_date = service_info[1]
            time_estimate = service_info[2]
            print("time_estimate")
            print(time_estimate)  # I get the new value.
            objects = AssemblyType.get_all()
            assembly_types = []
            for assembly_type in objects:
                assembly_types.append(assembly_type.type)
            form.service_code.data = service_code
            form.start_date.data = start_date
            form.hours_estimate.data = time_estimate
            print(form.hours_estimate.data)  # Here I get the new value. Everything should be fine.
            # In the html, the old value keeps showing up.
            return render_template('coordinator/fill_service_form.html', form=form, error=error,
                                   assembly_types=assembly_types)
    number_of_services = excel_get_services(filepath=filepath, selected_pev=pev)
    # Get the number of the first excel row of the selected pev
    first_row = excel_get_row(filepath, pev)
    if first_row is None:
        excel_info = excel_get_pev(filepath)
        error = "Excel error. PEV not found. If you get this error please report the exact way you did it."
        return render_template('coordinator/get_pev_form.html', form=GetPevForm(), error=error, info=excel_info)
    service_code = []
    start_date = []
    time_estimate_code = []
    quantity = []
    # Open the excel
    wb = load_workbook(filepath)
    # grab the active worksheet
    ws = wb.active
    for idx in range(number_of_services):
        # Append the data to the lists
        service_code.append(ws.cell(row=first_row + idx, column=12).value)
        start_date.append(str(ws.cell(row=first_row + idx, column=5).value)[:10])
        time_estimate_code.append(ws.cell(row=first_row + idx, column=7).value)
        quantity.append(ws.cell(row=first_row + idx, column=9).value)
    wb.close()
    return render_template('coordinator/generate_services_form.html',
                           form=GenerateServicesForm(),
                           pev=pev,
                           service_code=service_code,
                           start_date=start_date,
                           time_estimate_code=time_estimate_code,
                           quantity=quantity)
Well, I found a workaround: I send the data outside the form, like this:
return render_template('coordinator/fill_service_form.html', form=form, error=error,
                       assembly_types=assembly_types,
                       service_code=service_code,
                       start_date=start_date,
                       time_estimate=time_estimate)
And replace the Jinja form field with this:
<input class="form-control" placeholder="2021-04-23" name="start_date" type="text" value="{{start_date}}">
I'm still using the form (name= is the form field name) and at the same time I inject the value externally.
I hope this helps somebody.
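The likely underlying cause (hedged, based on how WTForms renders bound fields) is that when the form is instantiated during a POST, each submitted field keeps its raw submitted value, and rendering prefers that raw value over whatever you later assign to field.data. A sketch of an alternative fix is to build the form for the next service without binding it to the incoming request data, so the assigned values are what the template shows:
# sketch: inside the "Get form ready for next service" branch
form = FillServiceForm(formdata=None)  # do not bind the submitted POST data
form.service_code.data = service_code
form.start_date.data = start_date
form.hours_estimate.data = time_estimate
return render_template('coordinator/fill_service_form.html', form=form, error=error,
                       assembly_types=assembly_types)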

How should I be formatting my yield requests?

My scrapy spider is very confused, or I am, but one of us is not working as intended. My spider pulls start URLs from a file and is supposed to: start on an Amazon search page, crawl the page and grab the URL of each search result, follow the link to the item's page, crawl the item's page for information on the item, and once all items on the first page have been crawled, follow pagination up to page X, rinse and repeat.
I am using ScraperAPI and scrapy-user-agent to randomize my middlewares. I have formatted my start_requests with a priority based on their index in the file, so they should be crawled in order. I have checked and ensured that I AM receiving a successful 200 HTML response with the actual HTML from the Amazon page. Here is the code for the spider:
class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon_spider'
    page_number = 2
    current_keyword = 0
    keyword_list = []
    payload = {'api_key': 'mykey', 'url': 'https://httpbin.org/ip'}
    r = requests.get('http://api.scraperapi.com', params=payload)
    print(r.text)

    #/////////////////////////////////////////////////////////////////////
    def start_requests(self):
        with open("keywords.txt") as f:
            for index, line in enumerate(f):
                try:
                    keyword = line.strip()
                    AmazonSpiderSpider.keyword_list.append(keyword)
                    formatted_keyword = keyword.replace(' ', '+')
                    url = "http://api.scraperapi.com/?api_key=mykey&url=https://www.amazon.com/s?k=" + formatted_keyword + "&ref=nb_sb_noss_2"
                    yield scrapy.Request(url, meta={'priority': index})
                except:
                    continue

    #/////////////////////////////////////////////////////////////////////
    def parse(self, response):
        print("========== starting parse ===========")
        for next_page in response.css("h2.a-size-mini a").xpath("@href").extract():
            if next_page is not None:
                if "https://www.amazon.com" not in next_page:
                    next_page = "https://www.amazon.com" + next_page
                yield scrapy.Request('http://api.scraperapi.com/?api_key=mykey&url=' + next_page, callback=self.parse_dir_contents)
        second_page = response.css('li.a-last a').xpath("@href").extract_first()
        if second_page is not None and AmazonSpiderSpider.page_number < 3:
            AmazonSpiderSpider.page_number += 1
            yield scrapy.Request('http://api.scraperapi.com/?api_key=mykey&url=' + second_page, callback=self.parse_pagination)
        else:
            AmazonSpiderSpider.current_keyword = AmazonSpiderSpider.current_keyword + 1

    #/////////////////////////////////////////////////////////////////////
    def parse_pagination(self, response):
        print("========== starting pagination ===========")
        for next_page in response.css("h2.a-size-mini a").xpath("@href").extract():
            if next_page is not None:
                if "https://www.amazon.com" not in next_page:
                    next_page = "https://www.amazon.com" + next_page
                yield scrapy.Request(
                    'http://api.scraperapi.com/?api_key=mykey&url=' + next_page,
                    callback=self.parse_dir_contents)
        second_page = response.css('li.a-last a').xpath("@href").extract_first()
        if second_page is not None and AmazonSpiderSpider.page_number < 3:
            AmazonSpiderSpider.page_number += 1
            yield scrapy.Request(
                'http://api.scraperapi.com/?api_key=mykey&url=' + second_page,
                callback=self.parse_pagination)
        else:
            AmazonSpiderSpider.current_keyword = AmazonSpiderSpider.current_keyword + 1

    #/////////////////////////////////////////////////////////////////////
    def parse_dir_contents(self, response):
        items = ScrapeAmazonItem()
        print("============= parsing page ==============")
        temp = response.css('#productTitle::text').extract()
        product_name = ''.join(temp)
        product_name = product_name.replace('\n', '')
        product_name = product_name.strip()
        temp = response.css('#priceblock_ourprice::text').extract()
        product_price = ''.join(temp)
        product_price = product_price.replace('\n', '')
        product_price = product_price.strip()
        temp = response.css('#SalesRank::text').extract()
        product_score = ''.join(temp)
        product_score = product_score.strip()
        product_score = re.sub(r'\D', '', product_score)
        product_ASIN = response.css('li:nth-child(2) .a-text-bold+ span').css('::text').extract()
        keyword = AmazonSpiderSpider.keyword_list[AmazonSpiderSpider.current_keyword]
        items['product_keyword'] = keyword
        items['product_ASIN'] = product_ASIN
        items['product_name'] = product_name
        items['product_price'] = product_price
        items['product_score'] = product_score
        yield items
For the FIRST start URL, it will crawl three or four items and then jump to the SECOND start URL. It skips processing the remaining items and pagination pages, going directly to the second start URL. For the second URL, it again crawls three or four items, then skips to the THIRD start URL. It continues in this way, grabbing three or four items and then skipping to the next URL, until it reaches the final start URL, which it gathers completely. Sometimes the spider COMPLETELY SKIPS the first or second starting URL. This happens infrequently, but I have no idea what could cause it.
My code for following result item URLs works fine, but I never get the print statement for "starting pagination", so it is not correctly following pages. Also, there is something odd with the middlewares: it begins parsing before it has assigned a middleware.
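One thing worth noting about the ordering: meta={'priority': index} only stores a value in the request's meta dict; Scrapy's scheduler looks at the Request's own priority argument (higher runs first). A sketch of start_requests that sets the priority and carries the keyword on each request instead of relying on the shared class counter (the negated index, used to preserve file order, is my assumption):
def start_requests(self):
    with open("keywords.txt") as f:
        for index, line in enumerate(f):
            keyword = line.strip()
            if not keyword:
                continue
            formatted_keyword = keyword.replace(' ', '+')
            url = ("http://api.scraperapi.com/?api_key=mykey&url="
                   "https://www.amazon.com/s?k=" + formatted_keyword + "&ref=nb_sb_noss_2")
            # higher priority is scheduled earlier, so negate the index to keep file order
            yield scrapy.Request(url, priority=-index, meta={'keyword': keyword})
Downstream callbacks could then read response.meta['keyword'] instead of AmazonSpiderSpider.current_keyword, which drifts once requests finish out of order.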

Scrapy crawler not recursively crawling next page

I am trying to build this crawler to get housing data from craigslist,
but the crawler stops after fetching the first page and does not go to the next page.
Here is the code. It works for the first page, but for the love of god I don't understand why it does not get to the next page. Any insight is really appreciated. I followed this part from the scrapy tutorial.
import scrapy
import re
from scrapy.linkextractors import LinkExtractor

class QuotesSpider(scrapy.Spider):
    name = "craigslistmm"
    start_urls = [
        "https://vancouver.craigslist.ca/search/hhh"
    ]

    def parse_second(self, response):
        # need all the info in a dict
        meta_dict = response.meta
        for q in response.css("section.page-container"):
            meta_dict["post_details"] = {
                "location": {
                    "longitude": q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-longitude)").extract(),
                    "latitude": q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-latitude)").extract()
                },
                "detailed_info": ' '.join(q.css('section#postingbody::text').extract()).strip()
            }
        return meta_dict

    def parse(self, response):
        pattern = re.compile("\/([a-z]+)\/([a-z]+)\/.+")
        for q in response.css("li.result-row"):
            post_urls = q.css("p.result-info a::attr(href)").extract_first()
            mm = re.match(pattern, post_urls)
            neighborhood = q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
            next_url = "https://vancouver.craigslist.ca/" + post_urls
            request = scrapy.Request(next_url, callback=self.parse_second)
            #next_page = response.xpath('.//a[@class="button next"]/@href').extract_first()
            #follow_url = "https://vancouver.craigslist.ca/" + next_page
            #request1 = scrapy.Request(follow_url, callback=self.parse)
            #yield response.follow(next_page, callback=self.parse)
            request.meta['id'] = q.css("li.result-row::attr(data-pid)").extract_first()
            request.meta['pricevaluation'] = q.css("p.result-info span.result-meta span.result-price::text").extract_first()
            request.meta["information"] = q.css("p.result-info span.result-meta span.housing::text").extract_first()
            request.meta["neighborhood"] = q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
            request.meta["area"] = mm.group(1)
            request.meta["adtype"] = mm.group(2)
            yield request
            #yield scrapy.Request(follow_url, callback=self.parse)
        next_page = LinkExtractor(allow="s=\d+").extract_links(response)[0]
        # = "https://vancouver.craigslist.ca/" + next_page
        yield response.follow(next_page.url, callback=self.parse)
The problem seems to be with the next_page extraction using LinkExtractor. If you look in the log, you'll see duplicate requests being filtered. There are more links on the page that satisfy your extraction rule, and maybe they are not extracted in any particular order (or not in the order you wish).
I think the better approach is to extract exactly the link you want; try it with this:
next_page = response.xpath('//span[@class="buttons"]//a[contains(., "next")]/@href').extract_first()
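Putting that together at the end of parse (a sketch; response.follow resolves the relative href, as in the question's commented-out attempt):
next_page = response.xpath('//span[@class="buttons"]//a[contains(., "next")]/@href').extract_first()
if next_page:
    yield response.follow(next_page, callback=self.parse)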

Adding XHR links to scraped category hrefs - missing scheme error

I have built a spider which gets data from one category. The method it follows: the category page is specified in start_urls, and start_requests handles pagination by iterating over the link used by the XHR request. Since I wanted to get all the categories at once, I wrote the code like this. My logic was to first get all the category links, append to each of them the XHR query string that is the same for every category (?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu), and pass these appended URLs to start_requests, iterating over them for pagination and item parsing. But I am not able to run the spider, because it throws a missing scheme error: in start_requests I have not provided the http://. I am stuck on how to solve this issue, please help.
class JabcatSpider(scrapy.Spider):
    name = "jabcat"
    allowed_domains = ["trendin.com"]
    start_urls = [
        'http://www.trendin.com',
    ]
    max_pages = 400

    def parse(self, response):
        urls = response.xpath('//div[@class = "men"]//@href').extract()
        for url in urls:
            urljoin = (url + "/" "?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu")
            #yield scrapy.Request(urljoin, callback=self.start_requests)
            print urljoin

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('?from=%d&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu' % i, callback=self.parse)

    def parse(self, response):
        for href in response.xpath('//*[@id="product_rows"]/div/div/div/a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        for sel in response.xpath('//*[@id="catalog-product"]/section[2]'):
            item = Jabongo()
            item['title'] = response.xpath('//*[@id="product-details-wrapper"]/div[1]/div[2]/div/div[1]/h1/span[2]/text()').extract()
            # item['price'] = response.xpath('//*[@id="pdp-price-info"]/span[2]/text()').extract()
            # item['image'] = response.xpath('//*[@class="content"]/h1/span[2]/text()').extract()
            # # item['color'] = sel.xpath('//ul/li/label[.="Color"]/following-sibling::Span/text()').extract()
            # return item
            #pattern = response.xpath('//*[@class="content"]/h1/span[2]/text()').extract
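The missing scheme error comes from yielding a request whose URL is only the query string ('?from=...&ajax=true&...'), which contains no http://. A sketch of one way to restructure this along the lines described above: start from the category page, build absolute URLs with response.urljoin, and paginate from parse instead of start_requests. The parse_category name, the 24-per-page offset, and yielding a plain dict instead of the Jabongo item are my assumptions; the XPaths are the ones from the question:
import scrapy

class JabcatSpider(scrapy.Spider):
    name = "jabcat"
    allowed_domains = ["trendin.com"]
    start_urls = ['http://www.trendin.com']
    max_pages = 400

    def parse(self, response):
        # collect the category links, then request each page of each category
        # by appending the shared XHR query string with an increasing offset
        for category in response.xpath('//div[@class = "men"]//@href').extract():
            for page in range(self.max_pages):
                url = response.urljoin(
                    category + "/?from=" + str(page * 24) +
                    "&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu")
                yield scrapy.Request(url, callback=self.parse_category)

    def parse_category(self, response):
        # product links on the paginated category response
        for href in response.xpath('//*[@id="product_rows"]/div/div/div/a/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        # yield a plain dict here; swap in the Jabongo item from the question if preferred
        yield {
            'title': response.xpath(
                '//*[@id="product-details-wrapper"]/div[1]/div[2]/div/div[1]/h1/span[2]/text()').extract(),
        }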