Getting IndexError: list index out of range - python-2.7

from BeautifulSoup import BeautifulSoup
import mechanize
import re

def price_walmart_match(keyword):
    url = "http://www.walmart.com/search/?query=" + keyword
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(url)
    html = br.response().read()
    result_soup = BeautifulSoup(html)
    found_results = result_soup.findAll('div', attrs={'class': 'js-tile tile-landscape'})
    all_results = []
    for result in found_results:
        title = result.find('a', attrs={'class': 'js-product-title'})
        links = result.find('a', href=True)
        # The two lines below raise IndexError when the price markup is missing,
        # because split() then returns a single-element list and [1] does not exist.
        before_decimal = str(result.find('span', attrs={'class': 'price price-display'})).split('<span class="sup">$</span>')[1].split('<span class="visuallyhidden">')[0]
        after_decimal = str(result.find('span', attrs={'class': 'price price-display'})).split('</span><span class="sup">')[1].split('</span>')[0]
        prices = before_decimal + '.' + after_decimal
        inArray = [float(prices), "http://www.walmart.com" + links['href']]
        all_results.append(inArray)
    print(all_results)
Sorry, this is the full code where I get the error.

That's because the mentioned class is not present on the page for every result, so the split on the price markup returns fewer pieces than expected and indexing [1] fails.
Try to target a part of the markup that is constant across results.
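If you want the scrape to survive results where that markup is missing, one option is to check each piece before indexing. A minimal sketch, assuming the same BeautifulSoup 3 result tiles as above (extract_price is a hypothetical helper, not part of the original code):

def extract_price(result):
    # Hypothetical helper: return the price as a float, or None when the
    # expected price markup is missing, instead of raising IndexError.
    price_span = result.find('span', attrs={'class': 'price price-display'})
    if price_span is None:
        return None
    markup = str(price_span)
    dollar_parts = markup.split('<span class="sup">$</span>')
    cent_parts = markup.split('</span><span class="sup">')
    if len(dollar_parts) < 2 or len(cent_parts) < 2:
        return None
    before_decimal = dollar_parts[1].split('<span class="visuallyhidden">')[0]
    after_decimal = cent_parts[1].split('</span>')[0]
    try:
        return float(before_decimal + '.' + after_decimal)
    except ValueError:
        return None

In the loop you would then skip any result where extract_price(result) returns None, so one malformed tile no longer aborts the whole run.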

Related

Django request.query_params issue

Below is the URL I am calling. I am able to retrieve the specific data with patientName and patientNRIC hard-coded into the URL, but how do I turn them into variables, for example by using params? I would like to pass the two values into the function and get the same information back instead of hard-coding them. Thank you for your help.
test.py
import requests

def get_patient():
    # patientName = {'patientName': patientName}
    # patientNRIC = {'patientNRIC': patientNRIC}
    p = {'patientName': 'John', 'PatientNRIC': 'S1111111A'}
    django_url = "https://4cfe2cdde986.ngrok.io/test/?patientName=John&patientNRIC=S1111111A"
    r = requests.get(django_url)
    r = r.json()
    print(r)

get_patient()
Like so:
import requests

def get_patient(p):
    query_string = ''
    for query, value in p.items():
        query_string += f'{query}={value}&'
    django_url = f'https://4cfe2cdde986.ngrok.io/test/?{query_string}'
    r = requests.get(django_url)
    r = r.json()
    print(r)

p = {'patientName': 'John', 'patientNRIC': 'S1111111A'}
get_patient(p)
As the docs say, pass the dictionary to requests via params:
import requests

def get_patient():
    # patientName = {'patientName': patientName}
    # patientNRIC = {'patientNRIC': patientNRIC}
    p = {'patientName': 'John', 'patientNRIC': 'S1111111A'}
    django_url = "https://4cfe2cdde986.ngrok.io/test/"
    r = requests.get(django_url, params=p)
    r = r.json()
    print(r)

get_patient()
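If the goal is to pass the two values in rather than hard-code them, a small variation on the answer above (a sketch against the same test endpoint) takes them as arguments and still lets requests build and encode the query string:

import requests

def get_patient(patient_name, patient_nric):
    # requests appends the dict as ?patientName=...&patientNRIC=...
    # and URL-encodes the values for you.
    p = {'patientName': patient_name, 'patientNRIC': patient_nric}
    r = requests.get("https://4cfe2cdde986.ngrok.io/test/", params=p)
    return r.json()

print(get_patient('John', 'S1111111A'))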

I want to put the elements of the lists into a data frame, as below. I am a beginner.

I attempted 3 different ways:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from selenium import webdriver

driver = webdriver.Chrome(executable_path='C:/Users/lemonade/Documents/work/chromedriver')
my_url = "https://www.carehome.co.uk/"

def make_soup(url):
    driver.get(url)
    m_soup = soup(driver.page_source, features='html.parser')
    return m_soup

main_page = make_soup(my_url)
boroughs = [borough.text.strip() for borough in main_page.select('.seo_links.seo_links_country [href]')]
indexs = list(range(16, 19))
london_list = [boroughs[i] for i in indexs]
boroughs1 = [bo.replace("Borough", "") for bo in london_list]
boroughs2 = [b1.replace("&", "and") for b1 in boroughs1]
boroughs3 = ['-'.join(b2.split()) for b2 in boroughs2]
borough_links = ["https://www.carehome.co.uk/care_search_results.cfm/searchunitary/" + b3 for b3 in boroughs3]
borough_soup = [make_soup(b_link) for b_link in borough_links]

# These 'return' statements sit outside any function, so this version
# cannot run as written.
for soups in borough_soup:
    titles = [title.text.strip() for title in soups.select('.home-name [href]')]
    return(titles)

for soups in borough_soup:
    addresses = [address.text.strip() for address in soups.select('.home-name>p.grey')]
    return(addresses)

df = pd.DataFrame(zip(titles, addresses), columns=['title', 'address'])
print(df)
I tried the code below instead; this gave AttributeError: 'list' object has no attribute 'text'.
title_html = [soups.select('.home-name [href]') for soups in borough_soup]
titles = [title.text.strip() for title in title_html ]
addresses_html =[soups.select('.home-name>p.grey') for soups in borough_soup]
addresses = [address.text.strip() for address in addresses_html]
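The AttributeError comes from the fact that soups.select(...) itself returns a list, so title_html and addresses_html are lists of lists and each title in the outer comprehension is a list, not a tag. A sketch that flattens the nesting before calling .text, assuming the same borough_soup and selectors:

titles = [tag.text.strip()
          for soups in borough_soup
          for tag in soups.select('.home-name [href]')]
addresses = [tag.text.strip()
             for soups in borough_soup
             for tag in soups.select('.home-name>p.grey')]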
I also tried to create a list, append each element to it, and return that list (see below). This only output a single element, because titles1 is re-created on every pass of the inner loop and the function returns after the first borough.
def func(borough_soup):
    for soups in borough_soup:
        titles = [title_html.text.strip() for title_html in soups.select('.home-name [href]')]
        for title in titles:
            titles1 = []
            titles1.append(title)
        return(titles1)
Any help would be much appreciated!
This was the fix: create the list once, outside the loop, append each borough's elements to it, and return it. After that, concat the per-borough DataFrames.
def title(x):
    titles1 = []
    for soups in borough_soup:
        titles = [title.text.strip() for title in soups.select('.home-name [href]')]
        titles1.append(titles)
    return(titles1)

titles = title(borough_soup)

def address(x):
    address1 = []
    for soups in borough_soup:
        addresses = [address.text.strip() for address in soups.select('.home-name>p.grey')]
        address1.append(addresses)
    return(address1)

addresses = address(borough_soup)

indexs2 = list(range(0, 2))
df_list = [pd.DataFrame(zip(titles[i], addresses[i])) for i in indexs2]
df = pd.concat(df_list)
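An equivalent, slightly more compact sketch (assuming the same borough_soup list and selectors) builds one DataFrame per borough in a single pass and concatenates them at the end, which avoids keeping parallel title/address lists in sync by index:

frames = []
for soups in borough_soup:
    titles = [t.text.strip() for t in soups.select('.home-name [href]')]
    addresses = [a.text.strip() for a in soups.select('.home-name>p.grey')]
    frames.append(pd.DataFrame(zip(titles, addresses), columns=['title', 'address']))
df = pd.concat(frames, ignore_index=True)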

Scrapy: detect if an XPath does not exist

I've been trying to make my first crawler and I've accomplished what I needed (getting the 1st and 2nd shops' shipping info and prices), but with 2 crawlers instead of 1, because I have a big blocker here.
When there is more than 1 shop, the output is:
In [1]: response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()').extract()
Out[1]:
[u'ENV\xcdO 3,95\u20ac ',
 u'ENV\xcdO GRATIS',
 u'ENV\xcdO GRATIS',
 u'ENV\xcdO 4,95\u20ac ']
To get only the second result I'm using:
In [2]: response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')[1].extract()
Out[2]: u'ENV\xcdO GRATIS'
But when there is no second result (only 1 shop), I'm getting:
IndexError: list index out of range
And the crawler skips the whole page, even if the other items have data...
After trying several times, I settled on a quick workaround: 2 crawlers, one for the first shops and another for the second ones. But now I want to do it cleanly in only 1 crawler.
Any help, tip or advice would be appreciated; this is my first try at a recursive crawler with Scrapy, and I kind of like it.
Here's the code:
# -*- coding: utf-8 -*-
import scrapy
from Guapalia.items import GuapaliaItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class GuapaliaSpider(CrawlSpider):
    name = "guapalia"
    allowed_domains = ["guapalia.com"]
    start_urls = (
        'https://www.guapalia.com/perfumes?page=1',
        'https://www.guapalia.com/maquillaje?page=1',
        'https://www.guapalia.com/cosmetica?page=1',
        'https://www.guapalia.com/linea-de-bano?page=1',
        'https://www.guapalia.com/parafarmacia?page=1',
        'https://www.guapalia.com/solares?page=1',
        'https://www.guapalia.com/regalos?page=1',
    )

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@class='js-pager']/a[contains(text(),'Siguientes')]"), follow=True),
        Rule(LinkExtractor(restrict_xpaths="//div[@class='list-display__item list-display__item--product']/div/a[@class='col-xs-10 col-sm-10 col-md-12 clickOnProduct']"), callback='parse_articles', follow=True),
    )

    def parse_articles(self, response):
        item = GuapaliaItem()
        articles_urls = response.url
        articles_first_shop = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="retailer-logo autoimage-container"]/img/@title').extract()
        articles_first_shipping = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="shipping"]/p//text()').extract()
        articles_second_shop = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div/img/@title')[1].extract()
        articles_second_shipping = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')[1].extract()
        articles_name = response.xpath('//div[@id="ProductDetail"]/@data-description').extract()
        item['articles_urls'] = articles_urls
        item['articles_first_shop'] = articles_first_shop
        item['articles_first_shipping'] = articles_first_shipping
        item['articles_second_shop'] = articles_second_shop if articles_second_shop else 'N/A'
        item['articles_second_shipping'] = articles_second_shipping
        item['articles_name'] = articles_name
        yield item
Basic output of the crawler, in the right format, when there is more than 1 shop:
2017-09-21 09:53:11 [scrapy] DEBUG: Crawled (200) <GET https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355> (referer: https://www.guapalia.com/perfumes?page=1)
2017-09-21 09:53:11 [scrapy] DEBUG: Scraped from <200 https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355>
{'articles_first_shipping': [u'ENV\xcdO GRATIS'],
'articles_first_shop': [u'DOUGLAS'],
'articles_name': [u'ZEN edp vaporizador 100 ml'],
'articles_second_shipping': u'ENV\xcdO 3,99\u20ac ',
'articles_second_shop': u'BUYSVIP',
'articles_urls': 'https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355'}
The problem arises when a second shop does not exist, because my code for the second-shop field raises:
IndexError: list index out of range
SOLUTION Thanks to @Tarun Lalwani
def parse_articles(self, response):
    item = GuapaliaItem()
    articles_urls = response.url
    articles_first_shop = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="retailer-logo autoimage-container"]/img/@title').extract()
    articles_first_shipping = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="shipping"]/p//text()').extract()
    articles_second_shop = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div/img/@title')
    articles_second_shipping = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')
    articles_name = response.xpath('//div[@id="ProductDetail"]/@data-description').extract()

    if len(articles_second_shop) > 1:
        item['articles_second_shop'] = articles_second_shop[1].extract()
    else:
        item['articles_second_shop'] = 'Not Found'

    if len(articles_second_shipping) > 1:
        item['articles_second_shipping'] = articles_second_shipping[1].extract()
    else:
        item['articles_second_shipping'] = 'Not Found'

    item['articles_urls'] = articles_urls
    item['articles_first_shop'] = articles_first_shop
    item['articles_first_shipping'] = articles_first_shipping
    item['articles_name'] = articles_name
    yield item
You need to get the result into a variable first. Then you can make a decision based on its length:
texts = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')
if len(texts) > 1:
    data = texts[1].extract()
elif len(texts) == 1:
    data = texts[0].extract()
else:
    data = "Not found"
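If you prefer to keep parse_articles short, the same length check can be wrapped in a small helper (second_or_default is a hypothetical name, not a Scrapy API):

def second_or_default(selector_list, default='Not Found'):
    # Return the extracted second match when it exists, otherwise a default.
    return selector_list[1].extract() if len(selector_list) > 1 else default

Then the item assignments become one-liners, e.g. item['articles_second_shop'] = second_or_default(articles_second_shop).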

How to use BeautifulSoup to save the HTML of a link to a file and do the same with all the links in that HTML file

I'm trying to write a parser which will take a URL and download its HTML into a .html file. Then it will go through that HTML to find all links and save them as well. I want to repeat this multiple times. Can someone please help a little?
This is the code I have written:
import requests
import urllib2
from bs4 import BeautifulSoup

link_set = set()
count = 1
give_url = raw_input("Enter url:\t")

def magic(give_url):
    page = urllib2.urlopen(give_url)
    page_content = page.read()
    with open('page_content.html', 'w') as fid:
        fid.write(page_content)
    response = requests.get(give_url)
    html_data = response.text
    soup = BeautifulSoup(html_data)
    list_items = soup.find_all('a')
    for each_item in list_items:
        html_link = each_item.get('href')
        link_set.add(give_url + str(html_link))

magic(give_url)

for each_item in link_set:
    print each_item
    print "\n"
Although this works fine, when I try to call the magic function in a for loop over the set, I get RuntimeError: Set changed size during iteration.
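The RuntimeError appears because magic() keeps adding URLs to link_set while the for loop is still iterating over that same set. A minimal sketch of one workaround is to iterate over a snapshot of the set, so new links go into the live set without disturbing the iteration:

# Iterate over a copy; magic() can then safely keep adding to link_set.
for each_item in list(link_set):
    magic(each_item)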
I got it working.
The code for recursive URL parsing using beautiful soup:
import requests
import urllib2
from bs4 import BeautifulSoup

link_set = set()
give_url = raw_input("Enter url:\t")

def magic(give_url, link_set, count):
    # print "______________________________________________________"
    #
    # print "Count is: " + str(count)
    # count += 1
    # print "THE URL IT IS SCRAPPING IS:" + give_url
    page = urllib2.urlopen(give_url)
    page_content = page.read()
    with open('page_content.html', 'w') as fid:
        fid.write(page_content)
    response = requests.get(give_url)
    html_data = response.text
    soup = BeautifulSoup(html_data)
    list_items = soup.find_all('a')
    for each_item in list_items:
        html_link = each_item.get('href')
        if(html_link is None):
            pass
        else:
            if(not (html_link.startswith('http') or html_link.startswith('https'))):
                link_set.add(give_url + html_link)
            else:
                link_set.add(html_link)
    # print "Total links in the given url are: " + str(len(link_set))

magic(give_url, link_set, 0)

link_set2 = set()
link_set3 = set()
for element in link_set:
    link_set2.add(element)

count = 1
for element in link_set:
    magic(element, link_set3, count)
    count += 1
    for each_item in link_set3:
        link_set2.add(each_item)
    link_set3.clear()

count = 1
print "Total links scraped are: " + str(len(link_set2))
for element in link_set2:
    count += 1
    print "Element number " + str(count) + " processing"
    print element
    print "\n"
There are probably many mistakes, so I would appreciate it if you could tell me where I can improve the code.

Not getting the min price and link for the keyword entered

from BeautifulSoup import BeautifulSoup
import mechanize
import re

def price_walmart_match(keyword):
    url = "http://www.walmart.com/search/?query=" + keyword
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(url)
    html = br.response().read()
    result_soup = BeautifulSoup(html)
    found_results = result_soup.findAll('div', attrs={'class': 'js-tile tile-landscape'})
    all_results = []
    for result in found_results:
        title = result.find('a', attrs={'class': 'js-product-title'})
        links = result.find('a', href=True)
        before_decimal = str(result.find('span', attrs={'class': 'price price-display'})).split('<span class="sup">$</span>')[1].split('<span class="visuallyhidden">')[0]
        after_decimal = str(result.find('span', attrs={'class': 'price price-display'})).split('</span><span class="sup">')[1].split('</span>')[0]
        prices = before_decimal + '.' + after_decimal
        inArray = [float(prices), "http://www.walmart.com" + links['href']]
        all_results.append(inArray)
    # Find the cheapest result.
    result = []
    minval = all_results[0][0]
    for values in all_results:
        if minval >= values[0]:
            result = values
            minval = values[0]
    return(result)
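For the minimum-price part, the manual loop can also be replaced with min() and a key. A sketch, assuming all_results holds [price, url] pairs as built above:

def cheapest(all_results):
    # Return the [price, url] pair with the lowest price, or [] when there
    # are no results (which also avoids the IndexError on all_results[0]).
    if not all_results:
        return []
    return min(all_results, key=lambda pair: pair[0])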