simple web crawler - python-2.7

I wrote the program below in Python as a very simple web crawler, but when I run it, it returns
'NoneType' object is not callable. Could you please help me?
import BeautifulSoup
import urllib2

def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)

def crawler(SeedUrl):
    tocrawl = [SeedUrl]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        pagesource = urllib2.urlopen(page)
        s = pagesource.read()
        soup = BeautifulSoup.BeautifulSoup(s)
        links = soup('a')
        if page not in crawled:
            union(tocrawl, links)
            crawled.append(page)
    return crawled

crawler('http://www.princeton.edu/main/')

[UPDATE] Here is the complete project code
https://bitbucket.org/deshan/simple-web-crawler
[ANSWER]
soup('a') returns the complete HTML tag, for example:
<a href="...">Buy Music Now</a>
so urlopen gives the error 'NoneType' object is not callable. You need to extract only the URL/href:
links = soup.findAll('a', href=True)
for l in links:
    print(l['href'])
You need to validate the URL too; refer to the following answers:
How do you validate a URL with a regular expression in Python?
Python - How to validate a url in python ? (Malformed or not)
Again, I would suggest using Python sets instead of lists; they make it easy to add URLs and omit duplicates.
http://docs.python.org/2/library/sets.html
Try the following code:
import re
import httplib
import urllib2
from urlparse import urlparse
import BeautifulSoup

regex = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def isValidUrl(url):
    if regex.match(url) is not None:
        return True
    return False

def crawler(SeedUrl):
    tocrawl = [SeedUrl]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        print 'Crawled:' + page
        pagesource = urllib2.urlopen(page)
        s = pagesource.read()
        soup = BeautifulSoup.BeautifulSoup(s)
        links = soup.findAll('a', href=True)
        if page not in crawled:
            for l in links:
                if isValidUrl(l['href']):
                    tocrawl.append(l['href'])
            crawled.append(page)
    return crawled

crawler('http://www.princeton.edu/main/')
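As an illustration of the sets suggestion above, here is a minimal sketch of the same crawler that deduplicates with sets instead of lists. It reuses the isValidUrl helper defined above and has not been tested against the original project:

import urllib2
import BeautifulSoup

def set_crawler(seed_url):
    tocrawl = set([seed_url])   # URLs still to visit (no duplicates)
    crawled = set()             # URLs already visited
    while tocrawl:
        page = tocrawl.pop()
        if page in crawled:
            continue
        soup = BeautifulSoup.BeautifulSoup(urllib2.urlopen(page).read())
        for link in soup.findAll('a', href=True):
            url = link['href']
            # isValidUrl is the regex-based helper from the answer above
            if isValidUrl(url) and url not in crawled:
                tocrawl.add(url)
        crawled.add(page)
    return crawled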

Related

How can I add my web scrape process using bs4 to selenium automation in Python to make it one single process which just asks for a zipcode?

I am using Selenium to go to a website, click the search field, and type a zip code that I enter beforehand. For that zip code, I want to take the link the webpage returns and feed it to my web scraper built with Beautiful Soup, so that once the link comes up I can scrape the required data and produce my CSV.
What I want:
I am having trouble getting that link into the Beautiful Soup URL. I basically want to automate it so that I just have to enter a zip code and it gives me my CSV.
What I am able to get:
I am able to enter the zip code and search using Selenium, and then add that URL to my scraper to produce the CSV.
Code I am using for Selenium:
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome('/Users/akashgupta/Desktop/Courses and Learning/Automating Python and scraping/chromedriver')
driver.get('https://www.weather.gov/')
messageField = driver.find_element_by_xpath('//*[@id="inputstring"]')
messageField.click()
messageField.send_keys('75252')
time.sleep(3)
showMessageButton = driver.find_element_by_xpath('//*[@id="btnSearch"]')
showMessageButton.click()

# web scraping part:
url = "https://forecast.weather.gov/MapClick.php?lat=32.99802500000004&lon=-96.79775499999994#.Xo5LnFNKgWo"
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
tag = soup.find_all('div', id='seven-day-forecast-body')
weekly = soup.find_all(class_='tombstone-container')
main = soup.find_all(class_='period-name')
description = soup.find_all(class_='short-desc')
temp = soup.find_all(class_='temp')
Period_Name = []
Desc = []
Temp = []
for a in range(0, len(main)):
    Period_Name.append(main[a].get_text())
    Desc.append(description[a].get_text())
    Temp.append(temp[a].get_text())
df = pd.DataFrame(list(zip(Period_Name, Desc, Temp)), columns=['Period_Name', 'Short_Desc', 'Temperature'])
from selenium import webdriver
import time
import requests
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome('chromedriver.exe')
driver.get('https://www.weather.gov/')
messageField = driver.find_element_by_xpath('//*[@id="inputstring"]')
messageField.click()
messageField.send_keys('75252')
time.sleep(3)
showMessageButton = driver.find_element_by_xpath('//*[@id="btnSearch"]')
showMessageButton.click()
WebDriverWait(driver, 10).until(EC.url_contains("https://forecast.weather.gov/MapClick.php"))  # wait here until the URL matches the forecast page pattern
currentURL = driver.current_url
print(currentURL)
time.sleep(3)
driver.quit()

# web scraping part:
res = requests.get(currentURL)
....
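To get the single zip-code-driven process the question asks for, the Selenium step and the scraping step can be wrapped into one function. Here is a rough sketch reusing the selectors and page structure from the code above; the function name, output path, and ChromeDriver location are placeholders, and the whole thing assumes weather.gov's markup has not changed:

import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def forecast_csv_for_zip(zipcode, out_path='forecast.csv'):
    # Step 1: use Selenium to turn the zip code into a forecast URL
    driver = webdriver.Chrome('chromedriver.exe')
    try:
        driver.get('https://www.weather.gov/')
        search = driver.find_element_by_xpath('//*[@id="inputstring"]')
        search.click()
        search.send_keys(zipcode)
        time.sleep(3)
        driver.find_element_by_xpath('//*[@id="btnSearch"]').click()
        WebDriverWait(driver, 10).until(
            EC.url_contains('https://forecast.weather.gov/MapClick.php'))
        url = driver.current_url
    finally:
        driver.quit()

    # Step 2: scrape the forecast page with requests + BeautifulSoup
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    names = [t.get_text() for t in soup.find_all(class_='period-name')]
    descs = [t.get_text() for t in soup.find_all(class_='short-desc')]
    temps = [t.get_text() for t in soup.find_all(class_='temp')]
    df = pd.DataFrame(list(zip(names, descs, temps)),
                      columns=['Period_Name', 'Short_Desc', 'Temperature'])
    df.to_csv(out_path, index=False)
    return df

# Example usage:
# forecast_csv_for_zip('75252')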

LPTHW 41 - url failure

The script includes:
import random
from urllib import urlopen
import sys

WORD_URL = "http://learncodethehardway.org/words.txt"
WORDS = []
...
# load up the words from the website
for word in urlopen(WORD_URL).readlines():
    WORDS.append(word.strip())
...
My problem occurs when I try to run the script. I get a traceback, which was posted as a screenshot (xterm_003.jpg) rather than as text.

Python Web scraper using Beautifulsoup 4

I wanted to create a database of commonly used words. Right now, when I run this script, it works fine, but my biggest issue is that I need all of the words to be in one column. I feel like what I did was more of a hack than a real fix. Using BeautifulSoup, can you print everything in one column without adding extra blank lines?
import requests
import re
from bs4 import BeautifulSoup

# Website you want to scrape info from
res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
# Getting just the content using bs4
soup = BeautifulSoup(res.content, "lxml")
# Creating the CSV file
commonFile = open('common_words.csv', 'wb')
# Grabbing the lines you want
for node in soup.findAll("tr"):
    # Getting just the text and removing the html
    words = ''.join(node.findAll(text=True))
    # Removing the extra lines
    ID = re.sub(r'[\t\r\n]', '', words)
    # Needed to add a break in the line to make the rows
    update = ''.join(ID) + '\n'
    # Now we add this to the file
    commonFile.write(update)
commonFile.close()
How about this?
import requests
import csv
from bs4 import BeautifulSoup

f = csv.writer(open("common_words.csv", "w"))
f.writerow(["common_words"])

# Website you want to scrape info from
res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
# Getting just the content using bs4
soup = BeautifulSoup(res.content, "lxml")

words = soup.select('div[class=file] tr')
for i in range(len(words)):
    word = words[i].text
    f.writerow([word.replace('\n', '')])
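If empty table rows still produce blank lines in the CSV, a small variation of the loop above that skips rows with no text should handle it; this is just a sketch of the same approach:

for row in soup.select('div[class=file] tr'):
    word = row.text.strip()
    if word:                     # skip rows that contain no visible text
        f.writerow([word])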

Unable to define regular expression for re.compile and pass it to Beautifulsoup

Currently I am practicing the basic concepts of accessing the web with Python. I am following a tutorial on YouTube and was guided up to the following code.
from urllib2 import urlopen, HTTPError
from BeautifulSoup import BeautifulSoup
import re

url = "http://getbusinessreviews.org/"
try:
    webpage = urlopen(url).read
except HTTPError, e:
    if e.code == 404:
        e.msg = 'data not found on remote: %s' % e.msg
    raise
pathFinderTitle = re.compile('<h2 class="entry-title"><a href.* rel="bookmark">(.*)</a></h2>')
if webpage:
    if pathFinderTitle:
        findPathTitle = re.findall(pathFinderTitle, webpage)
    else:
        print "unable to get path finder title"
else:
    print "unable to url open "
listIterator = []
listIterator[:] = range(2, 10)
for i in listIterator:
    print findPathTitle[i]
I want to extract "Nutracoster" from the following HTML (the anchor's href is omitted here):
<h2 class="entry-title"><a href="..." rel="bookmark">Nutracoster</a></h2>
I've got two questions:
I am getting no results at the moment; can anyone tell me what I am doing wrong? (I guess my regular expression is not well defined.)
How can I pass this regular expression to BeautifulSoup?
Thanks in advance, and sorry for any silly mistakes, since I am at the learning stage :D
You don't need a regex to select an element with Beautiful Soup: it can extract all the <h2> tags with specific attributes by itself.
Further, it's better not to use a regex to parse HTML (see this popular question).
Try this little snippet of code:
from bs4 import BeautifulSoup as BS
from urllib2 import urlopen, HTTPError, URLError

url = "http://getbusinessreviews.org/"
try:
    webpage = urlopen(url)
except HTTPError, e:
    if e.code == 404:
        e.msg = 'data not found on remote: %s' % e.msg
    raise
except URLError, e:
    print e.args

soup = BS(webpage, 'lxml')

## Relevant lines ##
for h2 in soup.find_all("h2", attrs={"class": "entry-title"}):
    print h2.text
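On the second question (passing a regular expression to BeautifulSoup): find_all() accepts compiled regex objects wherever it accepts a string, e.g. for tag names, attribute values, or text, so you can filter matched tags without regexing the raw HTML. A small self-contained sketch; the HTML snippet and the href pattern are made up for illustration:

import re
from bs4 import BeautifulSoup

# Hypothetical markup standing in for the real page
html = '<h2 class="entry-title"><a href="http://example.com/post" rel="bookmark">Nutracoster</a></h2>'
soup = BeautifulSoup(html, 'lxml')

# A compiled pattern filters attribute values just like a plain string would
for link in soup.find_all('a', href=re.compile(r'^https?://')):
    print link.text   # -> Nutracoster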

How do you convert the multi-line content scraped into a list?

I was trying to convert the scraped content into a list for data manipulation, but got the following error: TypeError: 'NoneType' object is not callable
#! /usr/bin/python
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import os
import re
# Copy all of the content from the provided web page
webpage = urlopen("http://www.optionstrategist.com/calculators/free-volatility-data").read()
# Grab everything that lies between the title tags using a REGEX
preBegin = webpage.find('<pre>') # Locate the pre provided
preEnd = webpage.find('</pre>') # Locate the /pre provided
# Copy the content between the pre tags
voltable = webpage[preBegin:preEnd]
# Pass the content to the Beautiful Soup Module
raw_data = BeautifulSoup(voltable).splitline()
The code is very simple. This is the code for BeautifulSoup4:
from bs4 import BeautifulSoup

# Parse the downloaded page, then find all <pre> tags in the HTML
soup = BeautifulSoup(webpage, 'html.parser')
preTags = soup.find_all('pre')
for tag in preTags:
    # Get the text inside the tag
    print(tag.get_text())
Reference:
find_all()
Kinds of filters to put into name field of find()/findall()
get_text()
To get the text from the first pre element:
#!/usr/bin/env python
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
url = "http://www.optionstrategist.com/calculators/free-volatility-data"
soup = BeautifulSoup(urlopen(url))
print soup.pre.string
To extract lines with data:
from itertools import dropwhile
lines = soup.pre.string.splitlines()
# drop lines before the data table header
lines = dropwhile(lambda line: not line.startswith("Symbol"), lines)
# extract lines with data
lines = (line for line in lines if '%ile' in line)
Now each line contains data in a fixed-column format. You could use slicing and/or regex to parse/validate individual fields in each row.
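For example, here is a rough sketch of pulling fields out of each remaining row; the real table on that page is fixed-width, so the whitespace split and field names below are only assumptions:

for line in lines:
    fields = line.split()      # assumes columns are separated by whitespace
    symbol = fields[0]         # e.g. the ticker symbol in the first column
    rest = fields[1:]          # remaining columns, still as strings
    print symbol, rest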