getting indexerror : list index out of range - python-2.7

from BeautifulSoup import BeautifulSoup
import mechanize
import re
def price_walmart_match(keyword):
url = ""+keyword
br = mechanize.Browser()
html = br.response().read()
result_soup = BeautifulSoup(html)
found_results = result_soup.findAll('div' , attrs={'class':'js-tile tile-landscape'})
for result in found_results:
title = result.find('a' , attrs={'class':'js-product-title'})
links = result.find('a' , href=True)
before_decimal= str(result.find('span', attrs={'class':'price price-display'})).split('<span class="sup">$</span>')[1].split('<span class="visuallyhidden">')[0]
after_decimal= str(result.find('span', attrs={'class':'price price-display'})).split('</span><span class="sup">')[1].split('</span>')[0]
prices = before_decimal+'.'+after_decimal
inArray = [float(prices), ""+links['href']]
Thats Because the mentioned class is not there in the page.
Django Request.query_parmas issue

Below is the url that i am going to get, i am able to retrieve the specific data with the patientName and patientNRIC. However, how should I make to be variables, using params? like I would like to pass in to values to the function but achieved the same information instead of hard coded. Thank you for your help.
import requests
def get_patient():
# patientName= {'patientName': patientName}
# patientNRIC = {'patientNRIC': patientNRIC}
p = {'patientName':'John','PatientNRIC':'S1111111A'}
django_url = ""
r = requests.get(django_url)
r = r.json()
import requests
def get_patient(p):
query_string = ''
for query, value in p.items():
django_url = f'{query_string}'
r = requests.get(django_url)
r = r.json()
p = {'patientName':'John','patientNRIC':'S1111111A'}
import requests
def get_patient():
# patientName= {'patientName': patientName}
# patientNRIC = {'patientNRIC': patientNRIC}
p = {'patientName':'John','PatientNRIC':'S1111111A'}
django_url = ""
r = requests.get(django_url, params=p)
I want to return the elements of list into a data frame as below. I am a beginner

I attempted 3 different ways:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from selenium import webdriver
driver = webdriver.Chrome(executable_path='C:/Users/lemonade/Documents/work/chromedriver')
my_url = ""
def make_soup(url):
m_soup = soup(driver.page_source, features='html.parser')
return m_soup
main_page = make_soup(my_url)
boroughs = [borough.text.strip() for borough in'.seo_links.seo_links_country [href]')]
indexs = list(range(16,19))
london_list = [boroughs[i] for i in indexs]
boroughs1 = [bo.replace("Borough","") for bo in london_list]
boroughs2 = [b1.replace("&","and") for b1 in boroughs1]
boroughs3 = ['-'.join(b2.split()) for b2 in boroughs2]
borough_links = ["" + b3 for b3 in boroughs3]
borough_soup = [make_soup(b_link) for b_link in borough_links]
for soups in borough_soup:
titles = [title.text.strip() for title in'.home-name [href]')]
for soups in borough_soup:
addresses = [address.text.strip() for address in'.home-name>p.grey')]
df = pd.DataFrame(zip(titles, addresses), columns = ['title','address'])
I tried the code below instead: This gave |AttributeError: 'list' object has no attribute 'text'|
title_html = ['.home-name [href]') for soups in borough_soup]
titles = [title.text.strip() for title in title_html ]
addresses_html =['.home-name>p.grey') for soups in borough_soup]
addresses = [address.text.strip() for address in addresses_html]
I tried to create and append a list and return that list. [See Below] This just outputted a single element from the list.
def func(borough_soup):
for soups in borough_soup:
titles = [title_html.text.strip() for title_html in'.home-name [href]')]
for title in titles:
titles1 = []
This was the fix. Creating function with an empty list and then appending each element to the list. After that concating each DF
def title(x):
titles1 = []
for soups in borough_soup:
titles = [title.text.strip() for title in'.home-name [href]')]
titles = title(borough_soup)
def address(x):
address1 = []
for soups in borough_soup:
addresses = [address.text.strip() for address in'.home-name>p.grey')]
addresses = address(borough_soup)
indexs2 = list(range(0,2))
df_list = [pd.DataFrame(zip(titles[i], addresses[i])) for i in indexs2]
Scrapy detect if Xpath not exists

I've been trying to make my first crawler and i've acomplished what i needed ( get the 1º shop and 2º shop shipping infos and prices ) but with 2 crawlers instead of 1 because i've a big stopper here.
When there'are more than 1 shop the output result is:
In [1]: response.xpath('//li[#class="container list-display-box__list__container"]/div/div/div/div/div[#class="shipping"]/p//text()').extract()
[u'ENV\xcdO 3,95\u20ac ',
u'ENV\xcdO 4,95\u20ac ']
To get only the second result i'm using:
In [2]: response.xpath('//li[#class="container list-display-box__list__container"]/div/div/div/div/div[#class="shipping"]/p//text()')[1].extract()
Out[2]: u'ENV\xcdO GRATIS'
But when there's no second results ( only 1 shop ) i'm getting:
IndexError: list index out of range
And the crawler skip the full page even if the other items have data ...
After trying several times i've decided to do a fast solution to get the result, 2 crawlers 1 for first shops and the other for the second one but right now i want to do it clean in only 1 crawler.
Some help, tip or advice will be appreciated, that's my first try making a recursive crawler with scrapy, kinda like it.
There's the code:
# -*- coding: utf-8 -*-
import scrapy
from Guapalia.items import GuapaliaItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class GuapaliaSpider(CrawlSpider):
name = "guapalia"
allowed_domains = [""]
start_urls = (
rules = (
Rule(LinkExtractor(restrict_xpaths="//div[#class='list-display__item list-display__item--product']/div/a[#class='col-xs-10 col-sm-10 col-md-12 clickOnProduct']"),callback='parse_articles',follow=True),
def parse_articles(self, response):
item = GuapaliaItem()
articles_urls = response.url
articles_first_shop = response.xpath('//div[#class="container-fluid list-display-box--best-deal"]/div/div/div/div[#class="retailer-logo autoimage-container"]/img/#title').extract()
articles_first_shipping = response.xpath('//div[#class="container-fluid list-display-box--best-deal"]/div/div/div/div[#class="shipping"]/p//text()').extract()
articles_second_shop = response.xpath('//li[#class="container list-display-box__list__container"]/div/div/div/div/div/img/#title')[1].extract()
articles_second_shipping = response.xpath('//li[#class="container list-display-box__list__container"]/div/div/div/div/div[#class="shipping"]/p//text()')[1].extract()
articles_name = response.xpath('//div[#id="ProductDetail"]/#data-description').extract()
item['articles_urls'] = articles_urls
item['articles_first_shop'] = articles_first_shop
item['articles_first_shipping'] = articles_first_shipping
item['articles_second_shop'] = articles_second_shop if articles_second_shop else 'N/A'
item['articles_second_shipping'] = articles_second_shipping
item['articles_name'] = articles_name
yield item
Basic output of crawler with the right format when there're more than 1 shop:
2017-09-21 09:53:11 [scrapy] DEBUG: Crawled (200) <GET> (referer:
2017-09-21 09:53:11 [scrapy] DEBUG: Scraped from <200>
{'articles_first_shipping': [u'ENV\xcdO GRATIS'],
'articles_first_shop': [u'DOUGLAS'],
'articles_name': [u'ZEN edp vaporizador 100 ml'],
'articles_second_shipping': u'ENV\xcdO 3,99\u20ac ',
'articles_second_shop': u'BUYSVIP',
'articles_urls': ''}
The problem is when doesn't exists a second shop because my code on the field second shop
IndexError: list index out of range
SOLUTION Thanks to #Tarun Lalwani
def parse_articles(self, response):
item = GuapaliaItem()
articles_urls = response.url
articles_first_shop = response.xpath('//div[#class="container-fluid list-display-box--best-deal"]/div/div/div/div[#class="retailer-logo autoimage-container"]/img/#title').extract()
articles_first_shipping = response.xpath('//div[#class="container-fluid list-display-box--best-deal"]/div/div/div/div[#class="shipping"]/p//text()').extract()
articles_second_shop = response.xpath('//li[#class="container list-display-box__list__container"]/div/div/div/div/div/img/#title')
articles_second_shipping = response.xpath('//li[#class="container list-display-box__list__container"]/div/div/div/div/div[#class="shipping"]/p//text()')
articles_name = response.xpath('//div[#id="ProductDetail"]/#data-description').extract()
if len(articles_second_shop) > 1:
item['articles_second_shop'] = articles_second_shop[1].extract()
item['articles_second_shop'] = 'Not Found'
if len(articles_second_shipping) > 1:
item['articles_second_shipping'] = articles_second_shipping[1].extract()
item['articles_second_shipping'] = 'Not Found'
item['articles_urls'] = articles_urls
item['articles_first_shop'] = articles_first_shop
item['articles_first_shipping'] = articles_first_shipping
item['articles_name'] = articles_name
yield item
You need to get the result in a variable first. Then you can take decision based on its length
texts = response.xpath('//li[#class="container list-display-box__list__container"]/div/div/div/div/div[#class="shipping"]/p//text()')
if len(texts) > 1:
data = texts[1].extract()
elif len(text) == 1:
data = texts[0].extract()
data ="Not found"

How to use beautifulsoup to save html of a link in a file and do the same with all the links in the html file

I'm trying to write a parser which will take a url and download it's html in a .html file. Then it'll go through the html file to find all links and save them as well. I want to repeat it multiple time. Can some one please help a little?
This is the code I have written:
import requests
import urllib2
from bs4 import BeautifulSoup
link_set = set()
count = 1
give_url = raw_input("Enter url:\t")
def magic(give_url):
page = urllib2.urlopen(give_url)
page_content =
with open('page_content.html', 'w') as fid:
response = requests.get(give_url)
html_data = response.text
soup = BeautifulSoup(html_data)
list_items = soup.find_all('a')
for each_item in list_items:
html_link = each_item.get('href')
link_set.add(give_url + str(html_link))
for each_item in link_set:
print each_item
print "\n"
Although it's working fine but When I try to call the magic function in for loop, i get RuntimeError: Set changed size during iteration.
I got it working.
The code for recursive URL parsing using beautiful soup:
import requests
import urllib2
from bs4 import BeautifulSoup
link_set = set()
give_url = raw_input("Enter url:\t")
def magic(give_url, link_set, count):
# print "______________________________________________________"
# print "Count is: " + str(count)
# count += 1
# print "THE URL IT IS SCRAPPING IS:" + give_url
page = urllib2.urlopen(give_url)
page_content =
with open('page_content.html', 'w') as fid:
response = requests.get(give_url)
html_data = response.text
soup = BeautifulSoup(html_data)
list_items = soup.find_all('a')
for each_item in list_items:
html_link = each_item.get('href')
if(html_link is None):
if(not (html_link.startswith('http') or html_link.startswith('https'))):
link_set.add(give_url + html_link)
# print "Total links in the given url are: " + str(len(link_set))
link_set2 = set()
link_set3 = set()
for element in link_set:
count = 1
for element in link_set:
count += 1
for each_item in link_set3:
count = 1
print "Total links scraped are: " + str(len(link_set2))
for element in link_set2:
count +=1
print "Element number " + str(count) + "processing"
print element
print "\n"
There are many mistakes so I request you all to please tell me where I can improve the code.

Not geting min price and link of any keyword entered

from BeautifulSoup import BeautifulSoup
import mechanize
import re
def price_walmart_match(keyword):
url = ""+keyword
br = mechanize.Browser()
html = br.response().read()
result_soup = BeautifulSoup(html)
found_results = result_soup.findAll('div' , attrs={'class':'js-tile tile-landscape'})
for result in found_results:
title = result.find('a' , attrs={'class':'js-product-title'})
links = result.find('a' , href=True)
before_decimal= str(result.find('span', attrs={'class':'price price-display'})).split('<span class="sup">$</span>')[1].split('<span class="visuallyhidden">')[0]
after_decimal= str(result.find('span', attrs={'class':'price price-display'})).split('</span><span class="sup">')[1].split('</span>')[0]
prices = before_decimal+'.'+after_decimal
inArray = [float(prices), ""+links['href']]
result = []
minval = all_results[0][0]
for values in all_results:
if minval >= values[0]:
result = values
minval = values[0]