BeautifulSoup output is not transferred to CSV file

BeautifulSoup output is not transferred to CSV file - python-2.7

I'm trying to export the output from a webscraper to a CSV file. The code works and I get the right output when I run it in the terminal, but it does not transfer to the CSV file.
Question
When I remove the first for loop it works fine, but I can't figure out exactly what the error it in this part?
Code
import csv ; import requests
from bs4 import BeautifulSoup
outfile = open('ImplementTest8.csv','w')
writer = csv.writer(outfile)
writer.writerow(["job_link", "job_desc"])
res = requests.get("http://implementconsultinggroup.com/career/#/6257").text
soup = BeautifulSoup(res,"lxml")
links = soup.find_all("a")
for li in soup.find('ul', class_='list-articles list').find_all('li'):
level = li.find_all('dd', {'class': 'author'})[1].get_text()
if "Graduate" in level:
links = li.find_all("href")
for link in links:
if "career" in link.get("href") and 'COPENHAGEN' in link.text:
item_link = link.get("href").strip()
item_text = link.text.replace("View Position","").encode('utf-8').strip()
writer.writerow([item_link, item_text])
print(item_link, item_text)
Edited Code
import csv ; import requests
from bs4 import BeautifulSoup
outfile = open('ImplementTest8.csv','w')
writer = csv.writer(outfile)
writer.writerow(["job_link", "job_desc"])
res = requests.get("http://implementconsultinggroup.com/career/#/6257").text
soup = BeautifulSoup(res,"lxml")
links = soup.find_all("a")
for li in soup.find('ul', class_='list-articles list').find_all('li'):
level = li.find_all('dd', {'class': 'author'})[1].get_text()
if "Graduate" in level:
links = li.find_all(href=True)
for link in links:
if "career" in link.get("href") and 'COPENHAGEN' in link.text:
item_link = link.get("href").strip()
item_text = link.text.replace("View Position","").encode('utf-8').strip()
writer.writerow([item_link, item_text])
print(item_link, item_text)

Href is a tag attribute not a tag name. if you want to ensure that all your links have a href attribute you can use it as a keyward argument, else use the tag name.
links = li.find_all(href=True)

Related

Beautiful Soup multiple findAll classes

Im working on a project to get a database on how many dogs get withdrawn prior to a race.
I need to scrape the data to then write into csv.
My problem is that the data that i'm scraping has a image instead of text(Between PLC and Greayhound on the webpage).
That means im running 2 different loops to get the info i need and then there hard to concatenate back in its correct place.
Here is the code.
import requests
import csv
URL = "https://www.thedogs.com.au/Racing/MeetResults.aspx?meetId=255268"
page = requests.get(URL)
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.text, 'html.parser')
#soup.findAll('td', class_='ResultsCenteredCellContents'):
odds=[]
dog = soup.findAll('img' )
for a in dog:
odds.append(a['src'].strip())
odds1=[]
for b in soup.findAll('td'):
odds1.append(b.text.strip())
So if i could run all the code i need in the one loop that can be written in CSV that would be great.

Yes they are displayed in images but if you notice the images are named according to the number it represents src="../Images/BoxNumber4.gif that image represents 4
import requests , csv
from bs4 import BeautifulSoup
def SaveAsCsv(list_of_rows,file_name):
try:
print('\nSaving CSV Result')
with open(file_name, 'a', newline='', encoding='utf-8') as outfile:
writer = csv.writer(outfile)
writer.writerow(list_of_rows)
print("rsults saved successully")
except PermissionError:
print(f"Please make sure {file_name} is closed \n")
URL = "https://www.thedogs.com.au/Racing/MeetResults.aspx?meetId=255268"
page = requests.get(URL, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(page.text, 'html.parser')
tables = soup.findAll('table',{'id':"gvRaceResults"}) # getting all 11 tabs
for table_index , table in enumerate(tables,1):
print(f'Getting tab {table_index} out of {len(tables)} ')
rows = table.findAll('tr')
header_row = [row.text.strip() for row in rows[0].findAll('th')]
SaveAsCsv(header_row , 'thedogs.csv')
for index , row in enumerate(rows[1:],1):
print(f'Getting row {index} out of {len(rows[1:])} ')
row_list = []
colms = row.findAll('td')
for col in colms :
if 'ResultsContentsNumber' in col.attrs['class']:
image_src = col.find('img').get('src') #src="../Images/BoxNumber4.gif">
image_num = image_src.split('BoxNumber')[1].split('.')[0] # 4
row_list.append(image_num)
continue
row_list.append(col.text.strip())
SaveAsCsv(row_list , 'thedogs.csv')
Output:

I have the following list of strings yet I want to apply filter so that I may certain item from the lists.How to do that?

I am trying to obtain the image data from the following website.
However, I am getting a list of data that contains the links that are not needed. I want to apply the filter so that I can only get the data that starts with /PIAimages. How to apply the filter to do that?
import requests
from bs4 import BeautifulSoup
import csv
result = []
response = requests.get("https://www.ikea.com/sa/en/catalog/products/00361049/")
assert response.ok
page = BeautifulSoup(response.text, "html.parser")
for des in page.find_all('img'):
image= des.get('src')
print(image)
Expected output:
/PIAimages/0531313_PE647261_S1.JPG
/PIAimages/0513228_PE638849_S1.JPG
/PIAimages/0618875_PE688687_S1.JPG
/PIAimages/0325432_PE517964_S1.JPG
/PIAimages/0690287_PE723209_S1.JPG
/PIAimages/0513996_PE639275_S1.JPG
/PIAimages/0325450_PE517970_S1.JPG
Actual output:
/ms/img/header/ikea-logo.svg
/ms/en_SA/img/header/ikea-store.png
/ms/img/header/main_menu_shadow.gif
/sa/en/images/products/strandmon-wing-chair-beige__0513996_PE639275_S4.JPG
/PIAimages/0531313_PE647261_S1.JPG
/PIAimages/0513228_PE638849_S1.JPG
/PIAimages/0618875_PE688687_S1.JPG
/PIAimages/0325432_PE517964_S1.JPG
/PIAimages/0690287_PE723209_S1.JPG
/PIAimages/0513996_PE639275_S1.JPG
/PIAimages/0325450_PE517970_S1.JPG
/ms/img/static/loading.gif
/ms/img/static/stock_check_green.gif
/ms/img/ads/services/ways_to_shop/20172_otav20a_assembly_20x20.jpg
/ms/en_SA/img/icons/picking-with-delivery.jpg
/ms/img/ads/services/ways_to_shop/20172_otav24a_pickingdelivery_20x20.jpg
/sa/en/images/products/strandmon-wing-chair-beige__0739100_PH147003_S4.JPG
https://smetrics.ikea.com/b/ss/ikeaallnojavascriptprod/5/?c8=sa&pageName=nojavascript

Use If clause then append data into list.
import requests
from bs4 import BeautifulSoup
result = []
response = requests.get("https://www.ikea.com/sa/en/catalog/products/00361049/")
assert response.ok
page = BeautifulSoup(response.text, "html.parser")
for des in page.find_all('img'):
image= des.get('src')
if 'PIAimages' in image:
result.append(image)
print(result)
OR use regular expression.This is much faster.
import requests
import re
from bs4 import BeautifulSoup
result = []
response = requests.get("https://www.ikea.com/sa/en/catalog/products/00361049/")
assert response.ok
page = BeautifulSoup(response.text, "html.parser")
for des in page.find_all('img', src=re.compile("PIAimages")):
image= des.get('src')
result.append(image)
print(result)

I think it faster and more concise to use css attribute = value selector with starts with operator. You specify the start substring for the src in the selector so only qualifying elements are returned.
import requests
from bs4 import BeautifulSoup
response = requests.get("https://www.ikea.com/sa/en/catalog/products/00361049/")
page = BeautifulSoup(response.text, "lxml")
images = [item['src'] for item in page.select('img[src^=\/PIAimages]')]
print(images)

Beautifulsoup: exclude unwanted parts

I realise it's probably a very specific question but I'm struggling to get rid of some parts of text I get using the code below. I need a plain article text which I locate by finding "p" tags under 'class':'mol-para-with-font'. Somehow I get lots of other stuff like author's byline, date stamp, and most importantly text from adverts on the page. Examining html I cannot see them containing the same 'class':'mol-para-with-font' so I'm puzzled (or maybe I've been staring at it for too long...). I know there are lots of html gurus here so I'll be grateful for your help.
My code:
import requests
import translitcodec
import codecs
def get_text(url):
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
# delete unwanted tags:
for s in soup(['figure', 'script', 'style', 'table']):
s.decompose()
article_soup = [s.get_text(separator="\n", strip=True) for s in soup.find_all( ['p', {'class':'mol-para-with-font'}])]
article = '\n'.join(article_soup)
text = codecs.encode(article, 'translit/one').encode('ascii', 'replace') #replace traslit with ascii
text = u"{}".format(text) #encode to unicode
print text
url = 'http://www.dailymail.co.uk/femail/article-4703718/How-Alexander-McQueen-Kate-s-royal-tours.html'
get_text(url)

Only 'p'-s with class="mol-para-with-font" ?
This will give it to you:
import requests
from bs4 import BeautifulSoup as BS
url = 'http://www.dailymail.co.uk/femail/article-4703718/How-Alexander-McQueen-Kate-s-royal-tours.html'
r = requests.get(url)
soup = BS(r.content, "lxml")
for i in soup.find_all('p', class_='mol-para-with-font'):
print(i.text)

Regular expression to find precise pdf links in a webpage

Given url='http://normanpd.normanok.gov/content/daily-activity', the website has three types of arrests, incidents, and case summaries. I was asked to use regular expressions to discover the URL strings of all the Incidents pdf documents in Python.
The pdfs are to be downloaded in a defined location.
I have gone through the link and found that Incident pdf files URLs are in the form of:
normanpd.normanok.gov/filebrowser_download/657/2017-02-19%20Daily%20Incident%20Summary.pdf
I have written code :
import urllib.request
url="http://normanpd.normanok.gov/content/daily-activity"
response = urllib.request.urlopen(url)
data = response.read() # a `bytes` object
text = data.decode('utf-8')
urls=re.findall(r'(\w|/|-/%)+\sIncident\s(%|\w)+\.pdf$',text)
But in the URLs list, the values are empty.
I am a beginner in python3 and regex commands. Can anyone help me?

This is not an advisable method. Instead, use an HTML parsing library like bs4 (BeautifulSoup) to find the links and then only regex to filter the results.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
url="http://normanpd.normanok.gov/content/daily-activity"
response = urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(Incident%20Summary\.pdf)'))
for el in links:
print("http://normanpd.normanok.gov" + el['href'])
Output :
http://normanpd.normanok.gov/filebrowser_download/657/2017-02-23%20Daily%20Incident%20Summary.pdf
http://normanpd.normanok.gov/filebrowser_download/657/2017-02-22%20Daily%20Incident%20Summary.pdf
http://normanpd.normanok.gov/filebrowser_download/657/2017-02-21%20Daily%20Incident%20Summary.pdf
http://normanpd.normanok.gov/filebrowser_download/657/2017-02-20%20Daily%20Incident%20Summary.pdf
http://normanpd.normanok.gov/filebrowser_download/657/2017-02-19%20Daily%20Incident%20Summary.pdf
http://normanpd.normanok.gov/filebrowser_download/657/2017-02-18%20Daily%20Incident%20Summary.pdf
http://normanpd.normanok.gov/filebrowser_download/657/2017-02-17%20Daily%20Incident%20Summary.pdf
But if you were asked to use only regexes, then try something simpler:
import urllib.request
import re
url="http://normanpd.normanok.gov/content/daily-activity"
response = urllib.request.urlopen(url)
data = response.read() # a `bytes` object
text = data.decode('utf-8')
urls=re.findall(r'(filebrowser_download.+?Daily%20Incident.+?\.pdf)',text)
print(urls)
for link in urls:
print("http://normanpd.normanok.gov/" + link)

Using BeautifulSoup this is an easy way:
soup = BeautifulSoup(open_page, 'html.parser')
links = []
for link in soup.find_all('a'):
current = link.get('href')
if current.endswith('pdf') and "Incident" in current:
links.append('{0}{1}'.format(url,current))

BeautifulSoup doesn't return any value

I am new to Beautifulsoup and seems to have encountered a problem. The code I wrote is correct to my knowledge but the output is empty. It doesn't show any value.
import requests
from bs4 import BeautifulSoup
url = requests.get("https://www.nseindia.com/")
soup = BeautifulSoup(url.content, "html.parser")
nifty = soup.find_all("span", {"id": "lastPriceNIFTY"})
for x in nifty:
print x.text

The page seems to be rendered by javascript. requests will fail to get the content which is loaded by JavaScript, it will get the partial page before the JavaScript rendering. You can use the dryscrape library for this like so:
import dryscrape
from bs4 import BeautifulSoup
sess = dryscrape.Session()
sess.visit("https://www.nseindia.com/")
soup = BeautifulSoup(sess.body(), "lxml")
nifty = soup.select("span[id^=lastPriceNIFTY]")
print nifty[0:2] #printing sample i.e first two entries.
Output:
[<span class="number" id="lastPriceNIFTY 50"><span class="change green">8,792.80 </span></span>, <span class="value" id="lastPriceNIFTY 50 Pre Open" style="color:#000000"><span class="change green">8,812.35 </span></span>]

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

BeautifulSoup output is not transferred to CSV file - python-2.7

Href is a tag attribute not a tag name. if you want to ensure that all your links have a href attribute you can use it as a keyward argument, else use the tag name. links = li.find_all(href=True)

Related

Beautiful Soup multiple findAll classes

I have the following list of strings yet I want to apply filter so that I may certain item from the lists.How to do that?

Beautifulsoup: exclude unwanted parts

Regular expression to find precise pdf links in a webpage

BeautifulSoup doesn't return any value

Categories

Resources