Reading an image URL with BeautifulSoup - python-2.7

I'm trying to read a picture from a website. This is my code so far:
from bs4 import BeautifulSoup
import requests
url = 'https://www.basketball-reference.com/players/h/hardeja01.html'
page_request = requests.get(url)
soup = BeautifulSoup(page_request.text,"lxml")
img_src = soup.find("div", {"class": "media-item"})
print img_src
# <div class="media-item"><img alt="Photo of James Harden" itemscope="image" src="https://d2cwpp38twqe55.cloudfront.net/req/201804182/images/players/hardeja01.jpg"/>\n</div>
I'm interested in the URL of the jpg image. I could write a regular expression to get the jpg, but there must be an easier way to do that.
What is the best way to extract the URL of the jpg?

You can do that in several ways. Here is one such approach:
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.basketball-reference.com/players/h/hardeja01.html")
soup = BeautifulSoup(page.text, 'html.parser')
image = soup.find(itemscope="image")['src']
print(image)
Output:
https://d2cwpp38twqe55.cloudfront.net/req/201804182/images/players/hardeja01.jpg

You can use a select method that works with CSS selectors:
img_src = soup.select_one('.media-item > img')['src']
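If there is a chance the div is missing on some pages, a small defensive variant (just a sketch, reusing the soup object from the question) is to check the result of select_one before indexing it:
img_tag = soup.select_one('.media-item > img')
# select_one returns None when nothing matches, so guard before indexing
img_src = img_tag['src'] if img_tag is not None else None
print(img_src)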
You can also try out Requests-HTML:
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://www.basketball-reference.com/players/h/hardeja01.html')
img = r.html.find('.media-item > img', first=True).attrs['src']
print(img)
# https://d2cwpp38twqe55.cloudfront.net/req/201804182/images/players/hardeja01.jpg

There is a very simple solution:
img_src = soup.find("div", class_="media-item").find('img')['src']
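Once you have the URL, fetching the picture itself is just another requests call. A minimal sketch, assuming you simply want to save the jpg to disk (the local filename is arbitrary):
import requests
img_url = 'https://d2cwpp38twqe55.cloudfront.net/req/201804182/images/players/hardeja01.jpg'
response = requests.get(img_url)
# write the raw bytes of the jpg to a local file
with open('hardeja01.jpg', 'wb') as f:
    f.write(response.content)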

Related

Python bs4 finds \n instead of the wanted text

I tried to scrape a website using requests and bs4. I tried to get the 'title' of msgProdStockOut; I used soup.find to find id = prodMainImg, but it returns \n, and it finds nothing for msgProdStockOut.
Is there a way to get the title in msgProdStockOut directly?
import requests
from bs4 import BeautifulSoup as soup
my_url = 'https://www.gu-global.com/tw/store/goods/325571'
r = requests.get(my_url).content
soup = soup(r, 'html.parser')
soup.find(id = 'msgProdStockOut')
#None
soup.find(id = 'prodMainImg').text
# u'\n'
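One hedged way to diagnose this kind of problem (a sketch, not a definitive fix): check whether the text you want appears in the raw HTML at all. If it does not, it is probably injected by JavaScript, and requests plus BeautifulSoup alone will not see it.
import requests
r = requests.get('https://www.gu-global.com/tw/store/goods/325571')
html = r.text
# if these substrings are absent from the raw HTML, the content is likely
# rendered client-side and BeautifulSoup has nothing to find
print('msgProdStockOut' in html)
print('prodMainImg' in html)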

BeautifulSoup unable to get Inner tags

I am currently trying to scrape product data off lazada.sg using bs4 with the code below.
from bs4 import BeautifulSoup
import requests
url = "https://www.lazada.sg/shop-mobiles/"
page = requests.get(url)
content = page.text #read html
soup = BeautifulSoup(content, 'html.parser')
products = soup.find_all("div", {"class" : "c16H9d"}) #find div tags containing product details
with open("test.txt", 'w') as f:
    f.write(str(products))
However, the output in test.txt is just [].
I found that the above class is inside <div id="root">, which I can extract, but not the tags within it.
How will I be able to access the 'inner div tags'?
The data is dynamically loaded from a script tag. You can regex it out and use the json library to view it. You will presumably need to adjust the print line for 2.7.
import requests, re, json, pprint
r = requests.get('https://www.lazada.sg/shop-mobiles/')
p = re.compile(r'window.pageData=(.*)<')
data = json.loads(p.findall(r.text)[0])
for item in data['mods']['listItems']:
    pprint.pprint(item)
    break  # delete me later

I'm scraping two different websites with beautifulSoup - how can I run it in one code?

I'm using BeautifulSoup to scrape several company sites for job positions (I have permission). They have slightly different HTML structures, so I have created a separate scraper for each website. The output from the scrapers is the same: the URLs of the job postings.
Issue
The scrapers work fine individually; however, for efficiency I would like to run them at the same time instead of running each one separately. What's the easiest way to do that?
Scraper 1
import requests
from bs4 import BeautifulSoup
base = "http://implementconsultinggroup.com"
url = "http://implementconsultinggroup.com/career/#/1143"
req = requests.get(url).text
soup = BeautifulSoup(req,'html.parser')
links = soup.select("a")
for link in links:
    if "career" in link.get("href") and 'COPENHAGEN' in link.text:
        res = requests.get(base + link.get("href")).text
        soup = BeautifulSoup(res, 'html.parser')
        title = soup.select_one("h1.page-intro__title").get_text() if soup.select_one("h1.page-intro__title") else ""
        overview = soup.select_one("p.page-intro__longDescription").get_text()
        details = soup.select_one("div.rte").get_text()
        print(title, link, details)
Scraper 2
import requests
from bs4 import BeautifulSoup
url = "http://deloittedk.easycruit.com/_sp=136ecff9b65625bf.1504382903200&icid=top_"
r = requests.get(url)
soup = BeautifulSoup(r.content)
links = soup.find_all("a")
for link in links:
print "<a href='%s'>%s</a>" %(link.get("href"), link.text)

Beautiful Soup - Unable to scrape links from paginated pages

I'm unable to scrape the links of the articles on the paginated webpages. Additionally, I sometimes get a blank screen as my output, and I am unable to find the problem in my loop. Also, the CSV file doesn't get created.
from pprint import pprint
import requests
from bs4 import BeautifulSoup
import lxml
import csv
import urllib2
def get_url_for_search_key(search_key):
    for i in range(1,100):
        base_url = 'http://www.thedrum.com/'
        response = requests.get(base_url + 'search?page=%s&query=' + search_key +'&sorted=')%i
        soup = BeautifulSoup(response.content, "lxml")
        results = soup.findAll('a')
        return [url['href'] for url in soup.findAll('a')]

pprint(get_url_for_search_key('artificial intelligence'))
with open('StoreUrl.csv', 'w+') as f:
    f.seek(0)
    f.write('\n'.join(get_url_for_search_key('artificial intelligence')))
Are you sure you only need the first 100 pages? Maybe there are more of them...
My take on your task is below; it collects links from all the pages and also precisely catches the next-page button links:
import requests
from bs4 import BeautifulSoup
base_url = 'http://www.thedrum.com/search?sort=date&query=artificial%20intelligence'
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "lxml")
res = []
while 1:
    results = soup.findAll('a')
    res.append([url['href'] for url in soup.findAll('a')])
    next_button = soup.find('a', text='Next page')
    if not next_button:
        break
    response = requests.get(next_button['href'])
    soup = BeautifulSoup(response.content, "lxml")
EDIT: alternative approach for collecting only article links:
import requests
from bs4 import BeautifulSoup
base_url = 'http://www.thedrum.com/search?sort=date&query=artificial%20intelligence'
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "lxml")
res = []
while 1:
    search_results = soup.find('div', class_='search-results')  # localizing the search window with article links
    article_link_tags = search_results.findAll('a')  # ordinary scheme goes further
    res.append([url['href'] for url in article_link_tags])
    next_button = soup.find('a', text='Next page')
    if not next_button:
        break
    response = requests.get(next_button['href'])
    soup = BeautifulSoup(response.content, "lxml")
To print the links, use:
for i in res:
    for j in i:
        print(j)
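Since the CSV file from the original snippet never got created, here is a hedged sketch for writing the collected links out instead of printing them (assuming res is the list of per-page link lists built above):
import csv

with open('StoreUrl.csv', 'w') as f:
    writer = csv.writer(f)
    for page_links in res:
        for href in page_links:
            writer.writerow([href])  # one link per row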

Python web crawler using BeautifulSoup, trouble getting URLs

I am trying to build a dynamic web crawler to get all URL links within links.
So far I am able to get all the links for the chapters, but when I try to get the section links from each chapter, my output does not print anything.
The code I used:
#########################Chapters#######################
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
for title in range(1, 4):
    url = base_url.format(title=title)
    r = requests.get(url)
    for link in BeautifulSoup((r.content), "html.parser", parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            if 'chapt' in link['href']:
                href = "http://law.justia.com" + link['href']
                leveltwo(href)
#########################Sections#######################
def leveltwo(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup((r.content), "html.parser")
    section = soup.find('div', {'class': 'primary-content'})
    for sublinks in section.find_all('a'):
        sectionlinks = sublinks.get('href')
        print(sectionlinks)
With some minor modifications to your code, I was able to get it to run and output the sections. Mainly, you needed to fix your indentation, and define a function before you call it.
#########################Chapters#######################
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
def leveltwo(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup((r.content), "html.parser")
    section = soup.find('div', {'class': 'primary-content'})
    for sublinks in section.find_all('a'):
        sectionlinks = sublinks.get('href')
        print(sectionlinks)

base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
for title in range(1, 4):
    url = base_url.format(title=title)
    r = requests.get(url)
    for link in BeautifulSoup((r.content), "html.parser", parse_only=SoupStrainer('a')):
        try:
            if 'chapt' in link['href']:
                href = "http://law.justia.com" + link['href']
                leveltwo(href)
            else:
                continue
        except KeyError:
            continue
Output:
/codes/alabama/2015/title-3/chapter-1/section-3-1-1/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-2/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-3/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-4/index.html etc.
You don't need any try/except blocks; you can use href=True with find or find_all to select only the anchor tags that have an href, or use the CSS select a[href] as below. The chapter links are in the first ul inside the article tag with the id #maincontent, so you don't need to filter at all:
base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
import requests
from bs4 import BeautifulSoup
def leveltwo(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup(r.content, "html.parser")
    section_links = [a["href"] for a in soup.select('div .primary-content a[href]')]
    print(section_links)

for title in range(1, 4):
    url = base_url.format(title=title)
    r = requests.get(url)
    for link in BeautifulSoup(r.content, "html.parser").select("#maincontent ul:nth-of-type(1) a[href]"):
        href = "http://law.justia.com" + link['href']
        leveltwo(href)
If you were to use find_all, you simply need to pass href=True, i.e. find_all(.., href=True), to filter your anchor tags and select only the ones that actually have an href.
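For example, a short sketch reusing the soup object built inside leveltwo above:
anchors_with_href = soup.find_all("a", href=True)  # only <a> tags that carry an href
hrefs = [a["href"] for a in anchors_with_href]
print(hrefs)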