Python web crawler using BeautifulSoup, trouble getting URLs - python-2.7

So I am trying to build a dynamic web crawler to get all URL links within links.
So far I am able to get all the links for the chapters, but when I try to get the section links from each chapter, my output does not print out anything.
The code I used:
#########################Chapters#######################
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re

base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"

for title in range (1,4):
    url = base_url.format(title=title)
    r = requests.get(url)
    for link in BeautifulSoup((r.content),"html.parser",parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            if 'chapt' in link['href']:
                href = "http://law.justia.com" + link['href']
                leveltwo(href)

#########################Sections#######################
def leveltwo(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup((r.content),"html.parser")
    section = soup.find('div', {'class': 'primary-content' })
    for sublinks in section.find_all('a'):
        sectionlinks = sublinks.get('href')
        print (sectionlinks)

With some minor modifications to your code, I was able to get it to run and output the sections. Mainly, you needed to fix your indentation, and define a function before you call it.
#########################Chapters#######################
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re

def leveltwo(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup((r.content),"html.parser")
    section = soup.find('div', {'class': 'primary-content' })
    for sublinks in section.find_all('a'):
        sectionlinks = sublinks.get('href')
        print (sectionlinks)

base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"

for title in range (1,4):
    url = base_url.format(title=title)
    r = requests.get(url)
    for link in BeautifulSoup((r.content),"html.parser",parse_only=SoupStrainer('a')):
        try:
            if 'chapt' in link['href']:
                href = "http://law.justia.com" + link['href']
                leveltwo(href)
            else:
                continue
        except KeyError:
            continue

#########################Sections#######################
output:
/codes/alabama/2015/title-3/chapter-1/section-3-1-1/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-2/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-3/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-4/index.html etc.

You don't need any try/except blocks; you can use href=True with find or find_all to select only the anchor tags that have an href, or a CSS select a[href] as below. The chapter links are in the first ul inside the article tag with the id #maincontent, so you don't need to filter at all:
base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
import requests
from bs4 import BeautifulSoup

def leveltwo(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup(r.content, "html.parser")
    section_links = [a["href"] for a in soup.select('div .primary-content a[href]')]
    print (section_links)

for title in range(1, 4):
    url = base_url.format(title=title)
    r = requests.get(url)
    for link in BeautifulSoup(r.content, "html.parser").select("#maincontent ul:nth-of-type(1) a[href]"):
        href = "http://law.justia.com" + link['href']
        leveltwo(href)
If you were to use find_all, you simply need to pass find_all(.., href=True) to filter your anchor tags so that only the ones with an href are selected.
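For example, a minimal sketch of that find_all(.., href=True) variant for the section links (it assumes the same primary-content div used in the code above):

import requests
from bs4 import BeautifulSoup

def leveltwo(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup(r.content, "html.parser")
    section = soup.find('div', {'class': 'primary-content'})
    # href=True keeps only the <a> tags that actually carry an href,
    # so no has_attr check or KeyError handling is needed
    for a in section.find_all('a', href=True):
        print(a['href'])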

Related

Webscraping is buggy through AWS Lambda, but works fine in VS Code and on EC2 instance

My dependencies are fine, Lambda doesn't raise any errors, and the code runs smoothly. I also checked the memory (512 MB) and timeout (5 mins). But instead of a list of HTML divs I'm getting a list of empty lists. Interestingly, there are quite a few nested lists, so the count might even match the number of divs I'm trying to scrape; they're just completely empty.
import requests
from bs4 import BeautifulSoup

def lambda_handler(event, context):
    url3='https://www.szybko.pl/l/na-sprzedaz/lokal-mieszkalny/Wroc%C5%82aw?assetType=lokal-mieszkalny&localization_search_text=Wroc%C5%82aw&market=aftermarket&price_min_sell=200000&price_max_sell=400000&meters_min=30&rooms_min=2'

    def get_last_page3(url):
        result = requests.get(url)
        source = result.content
        soup = BeautifulSoup(source, 'html.parser')
        last_page = soup.find_all("li",{'class': 'blank'})[1].text
        return int(last_page)

    def get_list_of_soups3(url):
        list_of_soups=[]
        for page in range(1,get_last_page3(url)+1):
            try:
                result = requests.get(url+'&strona='+str(page))
                source = result.content
                soup = BeautifulSoup(source, 'html.parser')
                ads = soup.find_all("div",{'class': "gt-listing-item-asset listing-item"})
                list_of_soups.append(ads)
            except Exception as e:
                print(e)
                break
        return list_of_soups

    all_ads3 = []
    try:
        for soup in get_list_of_soups3(url3):
            for s in soup:
                name = s.find("a")['aria-label'].replace('Szczegóły ogłoszenia - ','')
                district = s.find("a",{'class': 'mapClassClick list-elem-address popup-gmaps'}).text.replace('\n','').replace(' ','').replace(', dolnośląskie','')
                price = s.find("span",{'class': 'listing-price'}).text.strip().replace(' zł','').replace(' ','')[:6]
                rooms = s.find("li",{'class': 'asset-feature rooms'}).text.replace(' ','')
                sq = s.find("li",{'class': 'asset-feature area'}).text.replace('m²','').replace(',','.')
                price_sq = s.find("span",{'class': 'listing-price'}).find('i').text.replace('zł/m²','').replace(' ','').strip()
                link = s.find('a')['href'].strip()
                ad=[name,district,int(price),int(rooms),round(float(sq)),int(price_sq),link]
                all_ads3.append(ad)
    except Exception as e:
        print('error: website changed or unresponsive',e)

    return get_list_of_soups3(url3)
Also, similar code scraping a similar website works perfectly fine from both the IDE and Lambda. Both Lambdas are configured in the same way.
I'm using Python with the requests and Beautiful Soup libraries.
I was able to solve this by changing the HTML class of the divs scraped in the second function. I achieved this with print-statement debugging.
I'm not sure what the reason is; my guess would be that Lambda couldn't handle a photo thumbnail that was included in the original div? Maybe it's something to do with the way ads are generated on this particular website?
The code also includes my print statements in comments and has the outer try/except removed. The crucial change is in line 29: ads = soup.find_all("div",{'class': "listing-content"})
import requests
from bs4 import BeautifulSoup

def lambda_handler(event, context):
    # # Scraping url3: szybko.pl
    url3='https://www.szybko.pl/l/na-sprzedaz/lokal-mieszkalny/Wroc%C5%82aw?assetType=lokal-mieszkalny&localization_search_text=Wroc%C5%82aw&market=aftermarket&price_min_sell=200000&price_max_sell=400000&meters_min=30&rooms_min=2'

    def get_last_page3(url):
        result = requests.get(url)
        source = result.content
        #print('SOURCE:',source)
        soup = BeautifulSoup(source, 'html.parser')
        last_page = soup.find_all("li",{'class': 'blank'})[1].text
        print('PAGE:',last_page)
        return int(last_page)

    def get_list_of_soups3(url):
        list_of_soups=[]
        for page in range(1,get_last_page3(url)+1):
            try:
                result = requests.get(url+'&strona='+str(page))
                #print('RESULT:',result)
                source = result.content
                soup = BeautifulSoup(source, 'html.parser')
                #print('SOUP:',soup) #it's fine
                ads = soup.find_all("div",{'class': "listing-content"})
                #print('ADS:',ads)
                list_of_soups.append(ads)
            except Exception as e:
                print(e)
                break
        return list_of_soups

    all_ads3 = []
    for soup in get_list_of_soups3(url3):
        for s in soup:
            name = s.find("a",{'class': 'listing-title-heading hide-overflow-text'}).find("div",{'class': "tooltip"}).text#.replace('Szczegóły ogłoszenia - ','')
            district = s.find("a",{'class': 'mapClassClick list-elem-address popup-gmaps'}).text.replace('\n','').replace(' ','').replace(', dolnośląskie','').strip()
            price = s.find("div",{'class': 'listing-title'}).find_all("span")[2]['content']#.text.strip().replace(' zł','').replace(' ','')[:6]
            rooms = s.find("li",{'class': 'asset-feature rooms'}).text.replace(' ','')
            sq = s.find("li",{'class': 'asset-feature area'}).text.replace('m²','').replace(',','.')
            price_sq = int(price)/round(float(sq))
            link = s.find('a')['href'].strip()
            ad=[name,district,int(price),int(rooms),round(float(sq)),int(price_sq),link]
            all_ads3.append(ad)

    return len(all_ads3)

BeautifulSoup unable to get Inner tags

I am currently trying to scrape product data off lazada.sg using bs4 in the code below.
from bs4 import BeautifulSoup
import requests

url = "https://www.lazada.sg/shop-mobiles/"
page = requests.get(url)
content = page.text #read html
soup = BeautifulSoup(content, 'html.parser')
products = soup.find_all("div", {"class" : "c16H9d"}) #find div tags containing product details

with open("test.txt", 'w') as f:
    f.write(str(products))
However the output in test.txt is just [].
I found that the above class is in <div id="root">, which I extracted and got this result.
How will I be able to access the 'inner div tags'?
Here is a snippet of the page source.
The data is dynamically loaded from a script tag. You can regex it out and use the json library to view it. You will presumably need to adjust the print line for 2.7.
import requests, re, json, pprint

r = requests.get('https://www.lazada.sg/shop-mobiles/')
p = re.compile(r'window.pageData=(.*)<')
data = json.loads(p.findall(r.text)[0])

for item in data['mods']['listItems']:
    pprint.pprint(item)
    break # delete me later

Reading an image url with beautifulsoup

I'm trying to read a picture from a website. This is my code so far:
from bs4 import BeautifulSoup
import requests
url = 'https://www.basketball-reference.com/players/h/hardeja01.html'
page_request = requests.get(url)
soup = BeautifulSoup(page_request.text,"lxml")
img_src = soup.find("div", {"class": "media-item"})
print img_src
# <div class="media-item"><img alt="Photo of James Harden" itemscope="image" src="https://d2cwpp38twqe55.cloudfront.net/req/201804182/images/players/hardeja01.jpg"/>\n</div>
I'm interested in the url of the jpg image. I can write some regular expression to get the jpg but there must be some easier way to do that.
What is the best way to extract the url of the jpg?
You can do that in several ways. This is one such approach:
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.basketball-reference.com/players/h/hardeja01.html")
soup = BeautifulSoup(page.text, 'html.parser')
image = soup.find(itemscope="image")['src']
print(image)
Output:
https://d2cwpp38twqe55.cloudfront.net/req/201804182/images/players/hardeja01.jpg
You can use a select method that works with CSS selectors:
img_src = soup.select_one('.media-item > img')['src']
You can also try out Requests-HTML:
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://www.basketball-reference.com/players/h/hardeja01.html')
>>> r.html.find('.media-item > img', first=True).attrs['src']
'https://d2cwpp38twqe55.cloudfront.net/req/201804182/images/players/hardeja01.jpg'
There is a very simple solution:
img_src = soup.find("div", class_="media-item").find('img')['src']

Beautiful Soup - Unable to scrape links from paginated pages

I'm unable to scrape the links of the articles present in the paginated webpages. Additionally, I get a blank screen at times as my output. I am unable to find the problem in my loop. Also, the CSV file doesn't get created.
from pprint import pprint
import requests
from bs4 import BeautifulSoup
import lxml
import csv
import urllib2

def get_url_for_search_key(search_key):
    for i in range(1,100):
        base_url = 'http://www.thedrum.com/'
        response = requests.get(base_url + 'search?page=%s&query=' + search_key +'&sorted=')%i
        soup = BeautifulSoup(response.content, "lxml")
        results = soup.findAll('a')
        return [url['href'] for url in soup.findAll('a')]

pprint(get_url_for_search_key('artificial intelligence'))

with open('StoreUrl.csv', 'w+') as f:
    f.seek(0)
    f.write('\n'.join(get_url_for_search_key('artificial intelligence')))
Are you sure that you need only the first 100 pages? Maybe there are more of them...
My take on your task is below; this will collect links from all pages and also precisely catch the next-page button links:
import requests
from bs4 import BeautifulSoup

base_url = 'http://www.thedrum.com/search?sort=date&query=artificial%20intelligence'
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "lxml")

res = []

while 1:
    results = soup.findAll('a')
    res.append([url['href'] for url in soup.findAll('a')])

    next_button = soup.find('a', text='Next page')
    if not next_button:
        break
    response = requests.get(next_button['href'])
    soup = BeautifulSoup(response.content, "lxml")
EDIT: alternative approach for collecting only article links:
import requests
from bs4 import BeautifulSoup

base_url = 'http://www.thedrum.com/search?sort=date&query=artificial%20intelligence'
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "lxml")

res = []

while 1:
    search_results = soup.find('div', class_='search-results') #localizing search window with article links
    article_link_tags = search_results.findAll('a') #ordinary scheme goes further
    res.append([url['href'] for url in article_link_tags])

    next_button = soup.find('a', text='Next page')
    if not next_button:
        break
    response = requests.get(next_button['href'])
    soup = BeautifulSoup(response.content, "lxml")
To print the links, use:
for i in res:
    for j in i:
        print(j)
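And since the original post also mentions that the CSV file doesn't get created, here is a minimal sketch of writing the collected links out with the csv module (the StoreUrl.csv filename is taken from the question; res is the nested list of link lists built in the loop above):

import csv

with open('StoreUrl.csv', 'w') as f:
    writer = csv.writer(f)
    for page_links in res:
        for href in page_links:
            # one link per row
            writer.writerow([href])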

How to get attribute values from variable in Python

So I'm doing a relatively simple project so I can teach myself Python. I've come to a point where I'm stuck. I have a variable named element that shows up in the PyCharm debugger.
This variable is of type Tag, which is correct to me. In element I want to see if class="schedule_dgrd_time/result", which is not the case in the image above.
I see that within element there is an attrs attribute.
How can I access that value? If I do element.string I get the text value, which in this case would be Sat. (...I could make that work), but I was wondering if I can check the class attribute value first.
I've been searching for this for a couple days now and just can't get it. I've googled myself to death at this point. Any help or pointers would be greatly appreciated. Thanks for reading.
Update
Here is my code
import urllib2
import datetime
import re
from bs4 import BeautifulSoup

# today's date
date = datetime.datetime.today().strftime('%-m/%d/%Y')
validDay = "Mon\.|Tue\.|Wed\.|Thu(r)?(s)?\.|Fri\."
website = "http://www.texassports.com/schedule.aspx?path=baseball"

opener = urllib2.build_opener()
##add headers that make it look like I'm a browser
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
page = opener.open(website)

# turn page into html object
soup = BeautifulSoup(page, 'html.parser')
#print soup.prettify()

#get all home games
all_rows = soup.find_all('tr', class_='schedule_home_tr')

# see if any game is today
# entryForToday = [t for t in all_rows if t.findAll('nobr',text=re.compile('.*({}).*'.format(date)))]
# hard coding for testing weekend
entryForToday = [t for t in all_rows if t.findAll('nobr',text=re.compile('3/11/2017'))]

time = "schedule_dgrd_time/result"

for elements in entryForToday:
    for element in elements:
        #this is where I'm stuck.
        # if element.attrs:
        #     print element.attrs['class'][0]
I know a double nested for loop is not ideal so if you have a better way I'm glad to hear it. Thanks
So I was able to figure it out. I have some NavigableString objects which don't have attrs, so that was throwing an error. element.attrs['class'][0] does work now. I had to check isinstance against Tag; if it wasn't a Tag, it would skip it. Anywho, my code is below for anyone who is interested.
import urllib2
import datetime
import re
from bs4 import BeautifulSoup
from bs4 import Tag

# today's date
date = datetime.datetime.today().strftime('%-m/%d/%Y')
validDay = "Mon\.|Tue\.|Wed\.|Thu(r)?(s)?\.|Fri\."
website = "http://www.texassports.com/schedule.aspx?path=baseball"

opener = urllib2.build_opener()
##add headers that make it look like I'm a browser
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
page = opener.open(website)

# turn page into html object
soup = BeautifulSoup(page, 'html.parser')
#print soup.prettify()

#get all home games
all_rows = soup.find_all('tr', class_='schedule_home_tr')

# see if any game is today
# entryForToday = [t for t in all_rows if t.findAll('nobr',text=re.compile('.*({}).*'.format(date)))]
# hard coding for testing weekend
entryForToday = [t for t in all_rows if t.findAll('nobr',text=re.compile('3/14/2017'))]

classForTime = "schedule_dgrd_time/result"
timeOfGame = "none"

if entryForToday:
    entryForToday = [t for t in entryForToday if t.findAll('td',
                                                           class_='schedule_dgrd_game_day_of_week',
                                                           text=re.compile('.*({}).*'.format(validDay)))]
    if entryForToday:
        for elements in entryForToday:
            for element in elements:
                if isinstance(element, Tag):
                    if element.attrs['class'][0] == classForTime:
                        timeOfGame = element.text
                        # print element.text
                        break

print timeOfGame
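Since the update asks for an alternative to the doubly nested loop, here is a minimal sketch of one option: let BeautifulSoup locate the time cell directly by its class instead of iterating over every child of the row (it assumes the time cell is a td carrying the schedule_dgrd_time/result class, as in the code above):

classForTime = "schedule_dgrd_time/result"
timeOfGame = "none"

for row in entryForToday:
    # find() returns None when the row has no matching cell
    time_cell = row.find('td', class_=classForTime)
    if time_cell:
        timeOfGame = time_cell.text
        break

print timeOfGame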