Don't know why the below script would not crawl glassdoor.com - python-2.7

I don't know why the Python script below would not crawl the glassdoor.com website.
from bs4 import BeautifulSoup # documentation available at: www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4 import NavigableString, Tag
import requests # To send http requests and access the page: docs.python-requests.org/en/latest/
import csv # To create the output csv file
import unicodedata # To work with the string encoding of the data

entries = []
entry = []
urlnumber = 1 # Give the page number to start with
while urlnumber < 100: # Give the page number to end with
    #print type(urlnumber), urlnumber
    url = 'http://www.glassdoor.com/p%d' % (urlnumber,) # Give the url of the forum, excluding the page number in the hyperlink
    #print url
    try:
        r = requests.get(url, timeout = 10) # Sending a request to access the page
    except Exception,e:
        print e.message
        break
    if r.status_code == 200:
        data = r.text
    else:
        print str(r.status_code) + " " + url

    soup = BeautifulSoup(data) # Getting the page source into the soup
    for div in soup.find_all('div'):
        entry = []
        if(div.get('class') != None and div.get('class')[0] == 'Comment'): # A single post is referred to as a comment. Each comment is a block denoted in a div tag which has a class called comment.
            ps = div.find_all('p') # gets all the tags called p to a variable ps
            aas = div.find_all('a') # gets all the tags called a to a variable aas
            spans = div.find_all('span')
            times = div.find_all('time') # used to extract the time tag which gives the date of the post
            concat_str = ''
            for str in aas[1].contents: # iterates over the contents between the tag start and end
                if str != "<br>" or str != "<br/>": # This denotes breaks in the post which we need to work around.
                    concat_str = (concat_str + ' ' + str.encode('iso-8859-1')).strip() # The encoding is because the format extracted is unicode. We need a uniform structure to work with the strings.
            entry.append(concat_str)
            concat_str = ''
            for str in times[0].contents:
                if str != "<br>" or str != "<br/>":
                    concat_str = (concat_str + ' ' + str.encode('iso-8859-1')).strip()
            entry.append(concat_str)
            #print "-------------------------"
            for div in div.find_all('div'):
                if (div.get('class') != None and div.get('class')[0] == 'Message'): # Extracting the div tag with the class attribute as message.
                    blockqoutes = []
                    x = div.get_text()
                    for bl in div.find_all('blockquote'):
                        blockqoutes.append(bl.get_text()) # Blockquote is used to get the quote made by a person. get_text helps to eliminate the hyperlinks and pulls out only the data.
                        bl.decompose()
                    entry.append(div.get_text().replace("\n"," ").replace("<br/>","").encode('ascii','replace').encode('iso-8859-1'))
                    for bl in blockqoutes:
                        #print bl
                        entry.append(bl.replace("\n"," ").replace("<br/>","").encode('ascii','replace').encode('iso-8859-1'))
        #print entry
        entries.append(entry)
    urlnumber = urlnumber + 1 # increment so that we can extract the next page

with open('gd1.csv', 'w') as output:
    writer = csv.writer(output, delimiter= ',', lineterminator = '\n')
    writer.writerows(entries)
print "Wrote to gd1.csv"
I fixed some errors in your script, but I suspect it doesn't print anything because you only get 405 (!) responses from the server.
Also, your previous try/except block didn't print the error message. Was that on purpose?
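A 405 (Method Not Allowed) response usually means the server refuses the request itself, and on sites like Glassdoor that is commonly triggered by the default python-requests User-Agent. Below is a minimal, unverified sketch of the usual workaround: send a browser-like User-Agent header and only parse pages that actually come back with a 200 (whether Glassdoor accepts this, or permits scraping at all, is an assumption on my part, and the URL pattern is taken from the question as-is):
import requests
from bs4 import BeautifulSoup

# Hypothetical browser-like header; not verified against Glassdoor.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0'}

url = 'http://www.glassdoor.com/p%d' % (1,)   # same URL pattern as in the question
r = requests.get(url, headers=headers, timeout=10)

if r.status_code == 200:
    soup = BeautifulSoup(r.text)
    # ... continue with the div/'Comment' parsing from the question ...
else:
    # Anything other than 200 (e.g. 405) means there is no page to parse.
    print str(r.status_code) + " " + url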


Beautiful Soup multiple findAll classes

I'm working on a project to build a database of how many dogs get withdrawn prior to a race.
I need to scrape the data and then write it into a CSV.
My problem is that the data I'm scraping has an image instead of text (between PLC and Greyhound on the webpage).
That means I'm running two different loops to get the info I need, and the results are then hard to concatenate back into their correct place.
Here is the code.
import requests
import csv

URL = "https://www.thedogs.com.au/Racing/MeetResults.aspx?meetId=255268"
page = requests.get(URL)

from bs4 import BeautifulSoup
soup = BeautifulSoup(page.text, 'html.parser')

#soup.findAll('td', class_='ResultsCenteredCellContents'):
odds = []
dog = soup.findAll('img')
for a in dog:
    odds.append(a['src'].strip())

odds1 = []
for b in soup.findAll('td'):
    odds1.append(b.text.strip())
So if I could run all the code I need in one loop whose output can be written to CSV, that would be great.
Yes, they are displayed as images, but notice that each image is named after the number it represents: src="../Images/BoxNumber4.gif" represents 4.
import requests, csv
from bs4 import BeautifulSoup

def SaveAsCsv(list_of_rows, file_name):
    try:
        print('\nSaving CSV Result')
        with open(file_name, 'a', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(list_of_rows)
        print("results saved successfully")
    except PermissionError:
        print(f"Please make sure {file_name} is closed \n")

URL = "https://www.thedogs.com.au/Racing/MeetResults.aspx?meetId=255268"
page = requests.get(URL, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(page.text, 'html.parser')
tables = soup.findAll('table', {'id': "gvRaceResults"})  # getting all 11 tabs

for table_index, table in enumerate(tables, 1):
    print(f'Getting tab {table_index} out of {len(tables)} ')
    rows = table.findAll('tr')
    header_row = [row.text.strip() for row in rows[0].findAll('th')]
    SaveAsCsv(header_row, 'thedogs.csv')
    for index, row in enumerate(rows[1:], 1):
        print(f'Getting row {index} out of {len(rows[1:])} ')
        row_list = []
        colms = row.findAll('td')
        for col in colms:
            if 'ResultsContentsNumber' in col.attrs['class']:
                image_src = col.find('img').get('src')  # src="../Images/BoxNumber4.gif"
                image_num = image_src.split('BoxNumber')[1].split('.')[0]  # 4
                row_list.append(image_num)
                continue
            row_list.append(col.text.strip())
        SaveAsCsv(row_list, 'thedogs.csv')

Extraction of text using Beautiful Soup and regular expressions in 10-K Edgar filings

I want to automatically extract section "1A. Risk Factors" from around 10,000 files and write it into txt files.
A sample URL with a file can be found here
The desired section is between "Item 1a Risk Factors" and "Item 1b". The thing is that 'item', '1a' and '1b' might look different across these files and may be present in multiple places, not only in the longest, proper one that interests me. Thus, some regular expressions should be used, so that:
The longest part between "1a" and "1b" is extracted (otherwise the table of contents and other useless elements will appear)
Different variants of the expressions are taken into consideration
I tried to implement these two goals in the script, but as it's my first project in Python, I just randomly sorted the expressions that I think might work, and apparently they are in the wrong order (I'm sure I should iterate over the "< a >" elements, add each extracted "section" to a list, then choose the longest one and write it to a file, though I don't know how to implement this idea).
EDIT: Currently my method returns very little data between 1a and 1b (I think it's a page number) from the table of contents, and then it stops... (?)
My code:
import requests
import re
import csv
from bs4 import BeautifulSoup as bs

with open('indexes.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for line in reader:
        fn1 = line[0]
        fn2 = re.sub(r'[/\\]', '', line[1])
        fn3 = re.sub(r'[/\\]', '', line[2])
        fn4 = line[3]
        saveas = '-'.join([fn1, fn2, fn3, fn4])
        f = open(saveas + ".txt", "w+", encoding="utf-8")
        url = 'https://www.sec.gov/Archives/' + line[4].strip()
        print(url)
        response = requests.get(url)
        soup = bs(response.content, 'html.parser')
        risks = soup.find_all('a')
        regexTxt = 'item[^a-zA-Z\n]*1a.*item[^a-zA-Z\n]*1b'
        for risk in risks:
            for i in risk.findAllNext():
                i.get_text()
                sections = re.findall(regexTxt, str(i), re.IGNORECASE | re.DOTALL)
                for section in sections:
                    clean = re.compile('<.*?>')
                    # section = re.sub(r'table of contents', '', section, flags=re.IGNORECASE)
                    # section = section.strip()
                    # section = re.sub('\s+', '', section).strip()
                    print(re.sub(clean, '', section))
The goal is to find the longest part between "1a" and "1b" (regardless of how exactly they look) in the current URL and write it to a file.
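As an illustrative sketch of the "choose the longest one" idea described above (not the approach that was used in the end, and the pattern is only an assumption modelled on the regexTxt from the script): run the regex over the plain text of the whole filing and keep the longest hit, so that the short table-of-contents match loses to the real section.
import re
import requests
from bs4 import BeautifulSoup

def longest_risk_section(url):
    # Strip the tags first so the regex only sees plain text.
    text = BeautifulSoup(requests.get(url).content, 'html.parser').get_text()
    # Non-greedy, so every "Item 1A ... Item 1B" span is matched separately;
    # the table-of-contents hit is short and loses to the real section.
    pattern = re.compile(r'item[^a-zA-Z\n]*1a.*?item[^a-zA-Z\n]*1b',
                         re.IGNORECASE | re.DOTALL)
    matches = pattern.findall(text)
    return max(matches, key=len) if matches else None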
In the end I used a CSV file that contains a column HTMURL, which is the link to the htm-format 10-K. I got it from Kai Chen, who created this website. I wrote a simple script that writes plain txt into files. Processing it will be a simple task now.
import requests
import csv
from pathlib import Path
from bs4 import BeautifulSoup

with open('index.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for line in reader:
        print(line[9])
        url = line[9]
        html_doc = requests.get(url).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        print(soup.get_text())
        name = line[1]
        name = name.replace('/', '')
        name = name.replace("/PA/", "")
        name = name.replace("/DE/", "")
        dir = Path(name + line[4] + ".txt")
        f = open(dir, "w+", encoding="utf-8")
        if dir.is_dir():
            break
        else:
            f.write(soup.get_text())

How to use a concatenated string for the get method of requests?

I'm trying to write a small crawler to crawl multiple Wikipedia pages.
I want to make the crawl somewhat dynamic by concatenating the hyperlink to the exact wiki page from a file which contains a list of names.
For example, the first line of "deutsche_Schauspieler.txt" says "Alfred Abel", and the concatenated string would be "https://de.wikipedia.org/wiki/Alfred Abel". Using the txt file results in heading being None, yet when I complete the link with a string inside the script, it works.
This is for Python 2.x.
I already tried to switch from " to ',
tried + instead of %s,
tried to put the whole string into the txt file (so that the first line reads "http://..." instead of "Alfred Abel"),
tried to switch from "Alfred Abel" to "Alfred_Abel".
from bs4 import BeautifulSoup
import requests

file = open("test.txt","w")
f = open("deutsche_Schauspieler.txt","r")
content = f.readlines()

for line in content:
    link = "https://de.wikipedia.org/wiki/%s" % (str(line))
    response = requests.get(link)
    html = response.content
    soup = BeautifulSoup(html)
    heading = soup.find(id='Vorlage_Personendaten')
    uls = heading.find_all('td')
    for item in uls:
        file.write(item.text.encode('utf-8') + "\n")

f.close()
file.close()
I expect to get the content of the table "Vorlage_Personendaten", which actually works if I change the line that builds the link to
link = "https://de.wikipedia.org/wiki/Alfred Abel"
# link = "https://de.wikipedia.org/wiki/Alfred_Abel" also works
But I want it to work using the text file.
It looks like the problem is in your text file where you have used "Alfred Abel"; that is why you are getting the following exception:
uls = heading.find_all('td')
AttributeError: 'NoneType' object has no attribute 'find_all'
Please remove the string quotes around "Alfred Abel" and use Alfred Abel inside the text file deutsche_Schauspieler.txt. It will then work as expected.
I found the solution myself.
Although there are no additional lines in the file, the content array displays as ['Alfred Abel\n'], while printing the first index of the array results in 'Alfred Abel'. The trailing newline is still part of the string taken from the array, thus forming a false link.
So you want to remove the last (!) character from the current line.
A solution could look like this:
from bs4 import BeautifulSoup
import requests

file = open("test.txt","w")
f = open("deutsche_Schauspieler.txt","r")
content = f.readlines()
print (content)

for line in content:
    line = line[:-1]  # Note how this removes the trailing '\n'
    link = "https://de.wikipedia.org/wiki/%s" % str(line)
    response = requests.get(link)
    html = response.content
    soup = BeautifulSoup(html, "html.parser")
    try:
        heading = soup.find(id='Vorlage_Personendaten')
        uls = heading.find_all('td')
        for item in uls:
            file.write(item.text.encode('utf-8') + "\n")
    except:
        print ("That did not work")
        pass

f.close()
file.close()
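As a small usage note (an addition, not part of the original answer): line.strip() or line.rstrip('\n') is slightly more robust than line[:-1], because it also copes with a last line that has no trailing newline and with stray whitespace:
with open("deutsche_Schauspieler.txt", "r") as f:
    for line in f:
        name = line.strip()   # removes '\n', '\r' and surrounding whitespace
        link = "https://de.wikipedia.org/wiki/%s" % name
        # ... same requests/BeautifulSoup handling as in the answer above ...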

How can I use values from a CSV file to loop my Python script?

I have a problem. I use Python 2.7.13 to gather some data from a web page.
I am trying to gather the data from multiple articles.
I use the following script to gather the data that I want to have.
import urllib2

i = 0
line = open("artikelen.csv", "r").readlines()[i]

url = 'http://shop.niemann-frey.de/cshop/product.html?SID='
SID = '1cfbf44f2a062d40b1d8dd3fd9c434ff'
curl = '&art_nr='
artnr = line

response = urllib2.urlopen(url+SID+curl+artnr)
webContent = response.read()

for item in webContent.split("</title></head>"):
    if "<html><head><title>" in item:
        artikel = item[ item.find("<html><head><title>")+len("<html><head><title>") : ]
print "artikelnummer is: " + artikel

import csv
with open(artikel + '.csv', 'w') as fw:
    writer = csv.writer(fw, delimiter=',')

f = open(artikel+'.csv','wb')
for item in webContent.split("&mode=thumb&dbname=bilder"):
    if "https://shop.niemann-frey.de/cshop/lib/progs/getmedia.php?id=" in item:
        nummer = item[ item.find("https://shop.niemann-frey.de/cshop/lib/progs/getmedia.php?id=")+len("https://shop.niemann-frey.de/cshop/lib/progs/getmedia.php?id=") : ]
        print nummer
        f.write(nummer+";")
f.close
i += 1
The list I use looks more or less like this:
Artikelnummer:
1B000928
1B001279
1B001114
1B001271
etc.
I want to use these values as "artnr" and I want to perform my script for all the values in the list. How can I manage this? Thank you all in advance!
Simple as:
with open("artikelen.csv", "r") as csv:
for artnr in csv:
# your code here
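To make that concrete with the question's own variables (a sketch only; the SID and URL come from the question unchanged, and each artnr is stripped of its trailing newline before being appended to the URL):
import urllib2

url = 'http://shop.niemann-frey.de/cshop/product.html?SID='
SID = '1cfbf44f2a062d40b1d8dd3fd9c434ff'
curl = '&art_nr='

with open("artikelen.csv", "r") as csvfile:
    for line in csvfile:
        artnr = line.strip()          # drop the trailing newline from each row
        if not artnr or artnr.startswith("Artikelnummer"):
            continue                  # skip the header line and empty rows
        response = urllib2.urlopen(url + SID + curl + artnr)
        webContent = response.read()
        # ... the rest of the original per-article processing goes here ...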

Writing search results only from the last row in CSV

Here is my code:
import urllib
import json
import csv

apiKey = "MY_KEY" # Google API credentials

## perform a text search based on input, place results in text-search-results.json
print "Starting"

myfile = open("results.csv","wb")
headers = []
headers.append(['Search','Name','Address','Phone','Website','Type','Google ID','Rating','Permanently Closed'])
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
wr.writerows(headers)

with open('input_file.csv', 'rb') as csvfile:
    filereader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in filereader:
        search = ', '.join(row)
        search.replace(' ', '+')
        url1 = "https://maps.googleapis.com/maps/api/place/textsearch/json?query=%s&key=%s" % (search,apiKey)
        urllib.urlretrieve(url1,"text-search-results.json")
        print "SEARCH", search
        print "Google Place URL", url1

        ## load text-search-results.json and get the list of place IDs
        textSearchResults = json.load(open("text-search-results.json"))
        listOfPlaceIds = []
        for item in textSearchResults["results"]:
            listOfPlaceIds.append(str(item["place_id"]))

        ## open a nested list for the results
        output = []

        ## iterate through and download a JSON for each place ID
        for ids in listOfPlaceIds:
            url = "https://maps.googleapis.com/maps/api/place/details/json?placeid=%s&key=%s" % (ids,apiKey)
            fn = ids + "-details.json"
            urllib.urlretrieve(url,fn)
            data = json.load(open(fn))
            lineToAppend = []
            lineToAppend.append(search)
            try:
                lineToAppend.append(str(data["result"]["name"]))
            except KeyError:
                lineToAppend.append('')
            try:
                lineToAppend.append(str(data["result"]["formatted_address"]))
            except KeyError:
                lineToAppend.append('')
            try:
                lineToAppend.append(str(data["result"]["formatted_phone_number"]))
            except KeyError:
                lineToAppend.append('')
            try:
                lineToAppend.append(str(data["result"]["website"]))
            except KeyError:
                lineToAppend.append('')
            try:
                lineToAppend.append(str(data["result"]["types"]))
            except KeyError:
                lineToAppend.append('')
            try:
                lineToAppend.append(str(data["result"]["place_id"]))
            except KeyError:
                lineToAppend.append('')
            try:
                lineToAppend.append(str(data["result"]["rating"]))
            except KeyError:
                lineToAppend.append('')
            try:
                lineToAppend.append(str(data["result"]["permanently_closed"]))
            except KeyError:
                lineToAppend.append('')

output.append(lineToAppend)
wr.writerows(output)
myfile.close()
What this does is take the search terms from one column in the input_file and run each search through the Google Places API. However, when I have multiple search terms, only the last search's results end up in the results.csv file. I am not quite sure why this is happening, since it reads all of the search terms and runs them through, but it only returns the last result. Any suggestions?
Currently you are only writing out the last line because you are resetting the variable lineToAppend within your for loop, but you are not adding it to your output within that loop. Therefore it gets to the end of the for loop and only the last line is written out.
So currently it looks like this (shortened for brevity):
for ids in listOfPlaceIds:
    url = "https://maps.googleapis.com/maps/api/place/details/json?placeid=%s&key=%s" % (ids,apiKey)
    fn = ids + "-details.json"
    urllib.urlretrieve(url,fn)
    data = json.load(open(fn))
    lineToAppend = []
    lineToAppend.append(search)
    ...
output.append(lineToAppend)
wr.writerows(output)
Whereas it should be:
for ids in listOfPlaceIds:
    url = "https://maps.googleapis.com/maps/api/place/details/json?placeid=%s&key=%s" % (ids,apiKey)
    fn = ids + "-details.json"
    urllib.urlretrieve(url,fn)
    data = json.load(open(fn))
    lineToAppend = []
    lineToAppend.append(search)
    ...
    output.append(lineToAppend)
wr.writerows(output)
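An alternative worth noting (an addition, not part of the original answer): if each row is written as soon as it is assembled, there is no output list whose append can end up in the wrong place:
for ids in listOfPlaceIds:
    lineToAppend = build_row(ids)   # hypothetical helper that assembles one row as shown above
    wr.writerow(lineToAppend)       # csv.writer.writerow writes this place's row immediately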