Beautiful Soup - Unable to scrape links from paginated pages - python-2.7

I'm unable to scrape the links of the articles present in the paginated webpages. Additionally I get a blank screen at times as my output. I am unable to find the problem in my loop. Also the csv file doesn't get created.
from pprint import pprint
import requests
from bs4 import BeautifulSoup
import lxml
import csv
import urllib2
def get_url_for_search_key(search_key):
for i in range(1,100):
base_url = 'http://www.thedrum.com/'
response = requests.get(base_url + 'search?page=%s&query=' + search_key +'&sorted=')%i
soup = BeautifulSoup(response.content, "lxml")
results = soup.findAll('a')
return [url['href'] for url in soup.findAll('a')]
pprint(get_url_for_search_key('artificial intelligence'))
with open('StoreUrl.csv', 'w+') as f:
f.seek(0)
f.write('\n'.join(get_url_for_search_key('artificial intelligence')))

Are you sure, that you need only first 100 pages? Maybe there's more of them...
My vision of your task below, this will collect links from all pages and also precisely catches next page button links:
import requests
from bs4 import BeautifulSoup
base_url = 'http://www.thedrum.com/search?sort=date&query=artificial%20intelligence'
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "lxml")
res = []
while 1:
results = soup.findAll('a')
res.append([url['href'] for url in soup.findAll('a')])
next_button = soup.find('a', text='Next page')
if not next_button:
break
response = requests.get(next_button['href'])
soup = BeautifulSoup(response.content, "lxml")
EDIT: alternative approach for collecting only article links:
import requests
from bs4 import BeautifulSoup
base_url = 'http://www.thedrum.com/search?sort=date&query=artificial%20intelligence'
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "lxml")
res = []
while 1:
search_results = soup.find('div', class_='search-results') #localizing search window with article links
article_link_tags = search_results.findAll('a') #ordinary scheme goes further
res.append([url['href'] for url in article_link_tags])
next_button = soup.find('a', text='Next page')
if not next_button:
break
response = requests.get(next_button['href'])
soup = BeautifulSoup(response.content, "lxml")
to print links use:
for i in res:
for j in i:
print(j)

Related

Python bs4 find /n instead wanted text

I tried to scrape a website using requests and bs4, T tried to get the 'title' of msgProdStockOut, I used soup.find to find id = prodMainImg but it returns /n and nothing in msgProdStockOut.
Is there a way to get the title in msgProdStockOut directly?
g)
import requests
from bs4 import BeautifulSoup as soup
my_url = 'https://www.gu-global.com/tw/store/goods/325571'
r = requests.get(my_url).content
soup = soup(r, 'html.parser')
soup.find(id = 'msgProdStockOut')
#None
soup.find(id = 'prodMainImg ').text
#u'/n'

BeautifulSoup web table scraping

from urllib2 import urlopen, Request
from bs4 import BeautifulSoup
site = 'https://racing.hkjc.com/racing/information/English/racing/LocalResults.aspx/'
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(site, headers=hdr)
res = urlopen(req)
rawpage = res.read()
page = rawpage.replace("<!-->", "")
soup = BeautifulSoup(page, "html.parser")
table = soup.find("table", {"class":"f_tac table_bd draggable"})
print (table)
this work perfectly got a table output, untill i change the url to next page there is nothing to output (None)
'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2020/03/14&Racecourse=ST&RaceNo=2'
please help what's wrong of the url or the code?
you must add query string to end of url:
example:
to fetch table from page 2:
site ='https://racing.hkjc.com/racing/information/English/racing/LocalResults.aspx/?RaceDate=2020/03/14&Racecourse=ST&RaceNo=2'

Reading an image url with beautifulsoup

I'm trying to read a picture from a website. This is my code so far:
from bs4 import BeautifulSoup
import requests
url = 'https://www.basketball-reference.com/players/h/hardeja01.html'
page_request = requests.get(url)
soup = BeautifulSoup(page_request.text,"lxml")
img_src = soup.find("div", {"class": "media-item"})
print img_src
# <div class="media-item"><img alt="Photo of James Harden" itemscope="image" src="https://d2cwpp38twqe55.cloudfront.net/req/201804182/images/players/hardeja01.jpg"/>\n</div>
I'm interested in the url of the jpg image. I can write some regular expression to get the jpg but there must be some easier way to do that.
What is the best way to extract the url of the jpg?
You can do that in several ways. This as one of such approach:
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.basketball-reference.com/players/h/hardeja01.html")
soup = BeautifulSoup(page.text, 'html.parser')
image = soup.find(itemscope="image")['src']
print(image)
Output:
https://d2cwpp38twqe55.cloudfront.net/req/201804182/images/players/hardeja01.jpg
You can use a select method that works with CSS selectors:
img_src = soup.select_one('.media-item > img')['src']
You can also try out Requests-HTML:
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://www.basketball-reference.com/players/h/hardeja01.html')
>>> r.html.find('.media-item > img', first=True).attrs['src']
'https://d2cwpp38twqe55.cloudfront.net/req/201804182/images/players/hardeja01.jpg'
There is a very simple solution:
img_src = soup2.find("div", class_="media-item").find('img')['src']

how to get particular tag data from url in python from urllib2

I'm very new to python 2.7 and I have a task to read a table in the URL.
I'm getting the data from URL with table. and now the issue is, I need only data but I am getting with tags also.
Please help me. Thank you in advance.
from bs4 import BeautifulSoup
import urllib2
response = urllib2.urlopen('https://www.somewebsite.com/')
html = response.read()
soup = BeautifulSoup(html)
tabulka = soup.find("table", {"class" : "defaultTableStyle tableFontMD tableNoBorder"})
records = []
for row in tabulka.findAll('tr'):
col = row.findAll('td')
print col
you have to use .text attribute
from bs4 import BeautifulSoup
import urllib2
response = urllib2.urlopen('https://www.somewebsite.com/')
html = response.read()
soup = BeautifulSoup(html)
tabulka = soup.find("table", {"class" : "defaultTableStyle tableFontMD tableNoBorder"})
records = []
for row in tabulka.findAll('tr'):
col = row.findAll('td')
print [coli.text for coli in col]

Python web crawler using BeautifulSoup, trouble getting URLs

so I am trying to build a dynamic web crawler to get all url links within links.
so far i am able to get all the links for Chapters, but when I trying to do section links from each chapter, my output does not print out anything.
the code i used :
#########################Chapters#######################
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
for title in range (1,4):
url = base_url.format(title=title)
r = requests.get(url)
for link in BeautifulSoup((r.content),"html.parser",parse_only=SoupStrainer('a')):
if link.has_attr('href'):
if 'chapt' in link['href']:
href = "http://law.justia.com" + link['href']
leveltwo(href)
#########################Sections#######################
def leveltwo(item_url):
r = requests.get(item_url)
soup = BeautifulSoup((r.content),"html.parser")
section = soup.find('div', {'class': 'primary-content' })
for sublinks in section.find_all('a'):
sectionlinks = sublinks.get('href')
print (sectionlinks)
With some minor modifications to your code, I was able to get it to run and output the sections. Mainly, you needed to fix your indentation, and define a function before you call it.
#########################Chapters#######################
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
def leveltwo(item_url):
r = requests.get(item_url)
soup = BeautifulSoup((r.content),"html.parser")
section = soup.find('div', {'class': 'primary-content' })
for sublinks in section.find_all('a'):
sectionlinks = sublinks.get('href')
print (sectionlinks)
base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
for title in range (1,4):
url = base_url.format(title=title)
r = requests.get(url)
for link in BeautifulSoup((r.content),"html.parser",parse_only=SoupStrainer('a')):
try:
if 'chapt' in link['href']:
href = "http://law.justia.com" + link['href']
leveltwo(href)
else:
continue
except KeyError:
continue
#########################Sections#######################
output:
/codes/alabama/2015/title-3/chapter-1/section-3-1-1/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-2/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-3/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-4/index.html etc.
You don't need any try/except blocks, you can use href=True with find or find_all to only select the anchor tags with href's or a css select a[href] as below, the chapter links are in the first ul with inside the article tag with the id #maincontent so you don't need to filter at all:
base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
import requests
from bs4 import BeautifulSoup
def leveltwo(item_url):
r = requests.get(item_url)
soup = BeautifulSoup(r.content, "html.parser")
section_links = [a["href"] for a in soup.select('div .primary-content a[href]')]
print (section_links)
for title in range(1, 4):
url = base_url.format(title=title)
r = requests.get(url)
for link in BeautifulSoup(r.content, "html.parser").select("#maincontent ul:nth-of-type(1) a[href]"):
href = "http://law.justia.com" + link['href']
leveltwo(href)
If you were to use find_all you simply need to pass find_all(.., href=True) to filter your anchor tags to only select ones that have hrefs.