Data crawling from a website: result changes between runs - python-2.7

In this code I am trying to fetch some attributes of mobile phones, but I am unable to get the information, even though those attributes are present in the URLs.
import requests,re
from bs4 import BeautifulSoup
from time import sleep
import urllib2

def demo(url):
    header = {'user-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64)', }
    array = []
    camera_review = processor_review = battery_review = display_review = verdict = pro = con = ""
    request=""
    while True:
        try:
            #request=urllib2.urlopen(url)
            request = requests.get(url, "headers=header").text
            break
        except requests.exceptions.ConnectionError:
            print " connection error"
            sleep(15)
    ############################################################################################################
    soup = BeautifulSoup(request, "html.parser")
    try:
        pros = soup.find("ul", attrs={"class": "for_list_overview"})
        for i in pros.find_all('li'):
            temp = i.find('span').contents[0]
            pro += temp + "\n"
    except AttributeError:
        print "pro not found"
    #################################################################################################################
    try:
        cons = soup.find("ul", attrs={"class": "against_list_overview"})
        for j in cons.find_all('li'):
            temp = j.find('span').contents[0]
            con += temp + "\n"
        print con
    except AttributeError:
        print "con not available"
    ################################################################################################################
    try:
        dac = soup.find("div", attrs={"class": "overview_specs_green_box display_none"})
        for k in dac.find_all(text=re.compile('camera')):
            camera_review += k
        print camera_review
    except AttributeError:
        print "camera review not available"
        #print k
    #################################################################################
    try:
        for l in dac.find_all(text=re.compile('processor')):
            processor_review += l
        print processor_review
    except AttributeError:
        print "processor review not available"
        #print l
    #################################################################################
    try:
        for m in dac.find_all(text=re.compile('battery')):
            battery_review += m
        print battery_review
    except AttributeError:
        print "battery review not available"
        #print m
    #################################################################################
    try:
        for n in dac.find_all(text=re.compile('display')):
            display_review += n
        print display_review
    except AttributeError:
        print "display review not available"
    ##############################################################################################################
    try:
        vid = soup.find("div", attrs={"style": "font-weight:400 !important;color: #3c3c3c;"})
        for o in vid.find_all(text=re.compile('a')):
            verdict += o
        if len(verdict) == 0:
            temp = 'na'
            verdict += temp
        print verdict
    except AttributeError:
        print "verdict Attribute Error"

url=["http://www.91mobiles.com/xiaomi-redmi-note-3-32gb-price-in-india",
     "http://www.91mobiles.com/blackberry-priv-price-in-india",
     "http://www.91mobiles.com/oneplus-3-price-in-india",
     "http://www.91mobiles.com/coolpad-note-5-price-in-india",
     "http://www.91mobiles.com/vivo-v3-max-price-in-india",
     "http://www.91mobiles.com/oppo-f1s-price-in-india"]
i=0
while i<len(url):
    demo(url[i])
    print "###################################################################################################"
    print "###################################################################################################"
    i+=1
Each time I run the code, the output changes.
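One detail worth noting: requests.get(url, "headers=header") passes the literal string "headers=header" as the second positional argument (params), so the custom User-Agent dict is never actually sent, and the site is free to serve different markup on different runs. A minimal sketch of the request as it was probably intended (the helper name fetch_page is illustrative; the retry loop mirrors the question):

import requests
from time import sleep

header = {'user-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64)'}

def fetch_page(url):
    # Retry on connection errors, as in the question, but pass the header
    # dict through the `headers` keyword so it is actually sent.
    while True:
        try:
            return requests.get(url, headers=header).text
        except requests.exceptions.ConnectionError:
            print "connection error, retrying"
            sleep(15)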

Related

"Unboundlocalerror: Local Variable "Val" Referenced before Assignment" Error

I have been trying to get my script to loop so that it loads the outputs into output file 1, and when it has finished loading everything it moves the values to output file 2, erases the values in output file 1, starts reloading them, and when those are done it moves the values into output file 2 again (overwriting the old ones), and so on.
I have been pretty successful so far, but I keep getting an "UnboundLocalError: local variable 'val' referenced before assignment" error seemingly at random midway through the loading process. With a very small input file, the script performs how I want.
Does anyone know how I can change my script to fix that error? I have tried to understand why it is happening but cannot.
I have researched it thoroughly, but none of the suggestions I found have worked (or I implemented them incorrectly). I have attached my script. Thanks!
import urllib2,re,urllib,urlparse,csv,sys,time,threading,codecs,shutil
from bs4 import BeautifulSoup

def extract(url):
    try:
        sys.stdout.write('0')
        # global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div',{'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        # ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
    return val

while True:
    ifile = open('output.csv', "w", 0)
    inputs = csv.reader(open('input.csv'))
    # inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))
    ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
    for i in inputs:
        ifile.write(extract(i[0]))
    ifile.close()
Update:
Thanks for the help guys! This is my new script:
import urllib2,re,urllib,urlparse,csv,sys,time,threading,codecs,shutil
from bs4 import BeautifulSoup

def extract(url):
    try:
        sys.stdout.write('0')
        # global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div',{'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        # ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
    else:
        return val

while True:
    ifile = open('output.csv', "w", 0)
    inputs = csv.reader(open('input.csv'))
    # inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))
    ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
    for i in inputs:
        val_to_write = extract(i[0])
        if val_to_write:
            ifile.write(val_to_write)
            ifile.close()
    shutil.copy('output.csv', 'output2.csv')
    print("finished")
With the above script I am now getting the error: "ValueError: I/O operation on closed file". Thanks
Use try-except-else as you would only want to return val if no exception was raised (if an exception was raised then val wouldn't be assigned to when you try to return it). Another suggestion is not to use a "catch-em-all" except block.
def extract(url):
    try:
        sys.stdout.write('0')
        # global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div',{'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        # ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
    else:
        return val
But be warned: if an exception does occur then extract will return None, and the calling code will have to account for that, for example:
for i in inputs:
    val_to_write = extract(i[0])
    if val_to_write:
        ifile.write(val_to_write)
ifile.close()
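As for the follow-up "ValueError: I/O operation on closed file" from the updated script: judging by the reported error, the ifile.close() call ended up inside the for loop, so the file is closed after the first successful write and every later write fails. A minimal sketch of that loop with the file handled by a with block instead (file names are the ones from the question):

with open('output.csv', 'w') as ifile:
    ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
    for i in inputs:
        val_to_write = extract(i[0])
        if val_to_write:
            ifile.write(val_to_write)
# the file is closed automatically when the with block ends
shutil.copy('output.csv', 'output2.csv')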

How to use BeautifulSoup to save the HTML of a link to a file and do the same with all the links in that HTML file

I'm trying to write a parser which will take a URL and download its HTML to a .html file. Then it will go through the HTML file, find all links, and save them as well. I want to repeat this multiple times. Can someone please help a little?
This is the code I have written:
import requests
import urllib2
from bs4 import BeautifulSoup

link_set = set()
count = 1
give_url = raw_input("Enter url:\t")

def magic(give_url):
    page = urllib2.urlopen(give_url)
    page_content = page.read()
    with open('page_content.html', 'w') as fid:
        fid.write(page_content)
    response = requests.get(give_url)
    html_data = response.text
    soup = BeautifulSoup(html_data)
    list_items = soup.find_all('a')
    for each_item in list_items:
        html_link = each_item.get('href')
        link_set.add(give_url + str(html_link))

magic(give_url)
for each_item in link_set:
    print each_item
print "\n"
Although it's working fine, when I try to call the magic function inside a for loop I get RuntimeError: Set changed size during iteration.
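The RuntimeError happens because calling magic() inside for each_item in link_set: adds new entries to link_set while it is still being iterated, which Python forbids. One minimal way around it, keeping the question's global link_set, is to iterate over a snapshot copy so magic() can safely keep adding to the real set:

# iterate over a snapshot copy; magic() may safely add to link_set meanwhile
for each_item in list(link_set):
    magic(each_item)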
I got it working.
The code for recursive URL parsing using beautiful soup:
import requests
import urllib2
from bs4 import BeautifulSoup

link_set = set()
give_url = raw_input("Enter url:\t")

def magic(give_url, link_set, count):
    # print "______________________________________________________"
    #
    # print "Count is: " + str(count)
    # count += 1
    # print "THE URL IT IS SCRAPPING IS:" + give_url
    page = urllib2.urlopen(give_url)
    page_content = page.read()
    with open('page_content.html', 'w') as fid:
        fid.write(page_content)
    response = requests.get(give_url)
    html_data = response.text
    soup = BeautifulSoup(html_data)
    list_items = soup.find_all('a')
    for each_item in list_items:
        html_link = each_item.get('href')
        if(html_link is None):
            pass
        else:
            if(not (html_link.startswith('http') or html_link.startswith('https'))):
                link_set.add(give_url + html_link)
            else:
                link_set.add(html_link)
    # print "Total links in the given url are: " + str(len(link_set))

magic(give_url,link_set,0)
link_set2 = set()
link_set3 = set()
for element in link_set:
    link_set2.add(element)
count = 1
for element in link_set:
    magic(element,link_set3,count)
    count += 1
for each_item in link_set3:
    link_set2.add(each_item)
link_set3.clear()
count = 1
print "Total links scraped are: " + str(len(link_set2))
for element in link_set2:
    count +=1
    print "Element number " + str(count) + "processing"
    print element
    print "\n"
There are many mistakes, so please tell me where I can improve the code.
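One suggestion (a sketch only, not part of the original answer): relative links are currently glued onto the page URL with plain string concatenation, which breaks for hrefs like ../page or for pages whose URL does not end in a slash. The standard library's urlparse.urljoin resolves them properly:

import urlparse  # Python 2 standard library

# resolves a relative href against the page URL instead of concatenating strings
absolute_link = urlparse.urljoin(give_url, html_link)
link_set.add(absolute_link)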

pyodbc insert failing silently after hours of working correctly

I have a scraper here. It works for hours, then all of a sudden the inserts stop making it into the table. The program keeps going, but the table remains unchanged... The only errors I see are primary key errors, because some of the rows are duplicates and I do not want to insert them anyway.
from bs4 import BeautifulSoup
from datetime import datetime
import mechanize,cookielib,pyodbc,socket,sys
import httplib

url = 'www'
base= 'www'
proxies = {'http': 'proxy'}
username='u'
pw = 'p'
cnxnstring = 'DRIVER={SQL Server};SERVER=s;DATABASE=DB;UID=u;PWD=p'
insert="""INSERT INTO TxProductionPreStaging(LeaseName,LeaseNo,DistrictNo,WellNo,ProdMonth,ProdYear,ProdDate,OilBBL,CasingHeadMCF,GWGasMCF,CondensateBBL,LastScraped)
VALUES(?,?,?,?,?,?,?,?,?,?,?,?)"""

def initReq():
    br = mechanize.Browser()
    br.set_proxies(proxies)
    br.add_proxy_password(username, pw)
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_redirect(True)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    while True:
        try:
            soup = BeautifulSoup(br.open(url,timeout=20).read())
            if soup is not None:
                if soup.head.title.text=='Texas RRC - Railroad Commission of Texas Site Policies':
                    print 'REDIRECT PAGE'
                else:
                    break
        except (mechanize.URLError,mechanize.HTTPError,httplib.IncompleteRead) as exc:
            if isinstance(exc.reason, socket.timeout):
                print exc
        except Exception as error:
            print error
    return br

def initForm( br, prodMonth ):
    br.select_form('SearchCriteriaForm')
    br.form.set_all_readonly(False)
    br.form.find_control(name='viewType').value = ['Lease']
    br["startMonth"]=[prodMonth]
    br["startYear"]=[prodYear]
    br["endMonth"]=[prodMonth]
    br["endYear"]=[prodYear]
    br["district"]=['Statewide']
    r=br.submit(nr=2)
    return r

def bs( r ):
    soup = BeautifulSoup(r.read())
    return soup

def getTags( soup ):
    bigL=[]
    mini=[]
    for node in soup.findAll(attrs={'class': 'DataGrid'}):
        for i in node.findAll('tr'):
            if i.find('td'):
                for j in i.findAll('td'):
                    s = str(j.text);s= s.replace('\r\n',''); s=s.replace(' ','').strip('-').strip('\n')
                    mini.append(s)
                bigL.append(mini[:])
                del mini[:]
    return bigL

def insertTable( bigL, cnxn, cursor ,prodMonth, prodDate):
    print 'INSERT TABLE'
    global c
    for i,item in enumerate(bigL):
        leaseName=bigL[i][0]
        leaseNo=bigL[i][1]
        districtNo=bigL[i][2]
        wellNo=bigL[i][3]
        oil=int(bigL[i][4].replace(',',''))
        casingHead=int(bigL[i][5].replace(',',''))
        gas=int(bigL[i][6].replace(',',''))
        condensate=int(bigL[i][7].replace(',',''))
        dt = datetime.now()
        try:
            cursor.execute(insert,leaseName,leaseNo,districtNo,wellNo,prodMonth,prodYear,prodDate,oil,casingHead,gas,condensate,dt)
            cnxn.commit()
        except pyodbc.Error as e:
            print e
            cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=u;DATABASE=DB;UID=us;PWD=p');
            cursor = cnxn.cursor()
    return

def baseUrl( prodYear ):
    months=['01','02','03','04','05','06','07','08','09','10','11','12']
    for item in months:
        prodMonth=str(item)
        prodDate=str(prodMonth)+'/01/'+str(prodYear)
        prodDate=datetime.strptime(prodDate, '%m/%d/%Y')
        br = initReq()
        r = initForm( br, prodMonth )
        soup = bs( r )
        L = getTags( soup )
        cnxn, cursor = getcnxn()
        insertTable( L, cnxn, cursor, prodMonth, prodDate )
        count = 20;
        while True:
            cs= str(count)
            count = count +20
            print count,cs
            while True:
                try:
                    soup = BeautifulSoup( br.open(base+cs, timeout=20).read())
                    if soup is not None:
                        if soup.head.title.text=='Texas RRC - Railroad Commission of Texas Site Policies':
                            print 'REDIRECT PAGE'
                        else:
                            break
                except (mechanize.URLError,mechanize.HTTPError, httplib.IncompleteRead) as exc:
                    print exc
                except Exception as e:
                    print e
            var=soup.prettify(formatter=None)
            if 'No Matches Found' in var:
                break
            else:
                L = getTags( soup )
                insertTable( L, cnxn, cursor, prodMonth, prodDate )
    return

def getcnxn():
    while True:
        try:
            cnxn = pyodbc.connect(cnxnstring);
            cursor = cnxn.cursor()
            break
        except:
            print sys.exc_info()[1]
    return cnxn, cursor

if __name__ == '__main__':
    prodYear=str(sys.argv[1]);
    baseUrl( prodYear )
    cnxn.close()
The one thing that helped with this was to try to get a cursor periodically; this tests the connection. I am web scraping, so with every new page I do:
try:
    cursor = cnxn.cursor()
except pyodbc.Error:
    cnxn = reinit()        # reinit() stands for whatever re-establishes the connection
    cursor = cnxn.cursor()
Edit: Also, pyodbc was not catching the error properly... this is why I thought it was failing silently. It turns out I just had to catch all errors to see where it was failing.
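For context, a minimal sketch of that per-page connection check, wired up with the getcnxn() helper from the question (the name ensure_connection is illustrative):

def ensure_connection(cnxn):
    # Cheap liveness test: try to grab a cursor; reconnect if it fails.
    try:
        cursor = cnxn.cursor()
    except pyodbc.Error as e:
        print e
        cnxn, cursor = getcnxn()   # getcnxn() retries until it reconnects
    return cnxn, cursor

# before scraping each new page:
# cnxn, cursor = ensure_connection(cnxn)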

Cannot Pool.map() function because of UnpickleableError?

So I am trying to multiprocess the function f(), which is triggered by a button press in Tkinter.
def f(x):
    global doom,results,info
    doom = doom + 1
    if check(x) == True:
        results.add(x)
        info.append(get_column_number(x))
    j.step(1)
    texx = "1/"+doom
    s.configure(text=texx)
    root.update()
The function is called within a function like so:
def dojob():
    index = ['URLS'...]
    pool = Pool(processes=4)
    s.configure(text="Shifting Workload to cores..")
    root.update()
    pool.map(f, index)
The button is inside the root window.
I get the following error:
Exception in thread Thread-2:
Traceback (most recent call last):
  File "C:\Python27\lib\threading.py", line 808, in __bootstrap_inner
    self.run()
  File "C:\Python27\lib\threading.py", line 761, in run
    self.__target(*self.__args, **self.__kwargs)
  File "C:\Python27\lib\multiprocessing\pool.py", line 342, in _handle_tasks
    put(task)
UnpickleableError: Cannot pickle <type 'tkapp'> objects
I do not even know what pickling is. Help?
Here is the complete code:
from Tkinter import *
from ttk import *
from tkMessageBox import showinfo
from multiprocessing import Pool
import random
emails = set()
import urllib2
import urllib2 as urllib

########
CONSTANT_PAGECOUNT = 20
######

def f(x):
    global doom,emails,info
    doom = doom + 1
    if check(x) == True:
        print "",
        emails.add(x)
        info.append(get_column_number(x))
    j.step(1)
    texx = "Sk1nn1n "+str(doom)+'/'+str(CONSTANT_PAGECOUNT)+""
    s.configure(text=texx)
    root.update()
    return 0

def f(x):
    print ""

def showFile(site,info):
    top = Toplevel()
    top.title('Sites')
    x = Text(top)
    x.pack()
    i=0
    for site_url in site:
        x.insert(END,site_url)
        i=i+1

def get_column_number(url):
    return True

def check(url):
    return True

def getgoogleurl(search,siteurl=False,startr=0):
    if siteurl==False:
        return 'http://www.google.com/search?q='+urllib2.quote(search)+'&start='+str(startr)+'&oq='+urllib2.quote(search)
    else:
        return 'http://www.google.com/search?q=site:'+urllib2.quote(siteurl)+'%20'+urllib2.quote(search)+'&oq=site:'+urllib2.quote(siteurl)+'%20'+urllib2.quote(search)

def getgooglelinks(search,siteurl=False,startr=0):
    #google returns 403 without user agent
    headers = {'User-agent':'Mozilla/11.0'}
    req = urllib2.Request(getgoogleurl(search,siteurl,startr),None,headers)
    site = urllib2.urlopen(req)
    data = site.read()
    site.close()
    #no beatifulsoup because google html is generated with javascript
    start = data.find('<div id="res">')
    end = data.find('<div id="foot">')
    if data[start:end]=='':
        #error, no links to find
        return False
    else:
        links =[]
        data = data[start:end]
        start = 0
        end = 0
        while start>-1 and end>-1:
            #get only results of the provided site
            if siteurl==False:
                start = data.find('<a href="/url?q=')
            else:
                start = data.find('<a href="/url?q='+str(siteurl))
            data = data[start+len('<a href="/url?q='):]
            end = data.find('&sa=U&ei=')
            if start>-1 and end>-1:
                link = urllib2.unquote(data[0:end])
                data = data[end:len(data)]
                if link.find('http')==0:
                    links.append(link)
        return links

def rip(results=15,accuracy=16):
    global e
    keyword = ''+str(e.get())
    if keyword.strip()=="":
        s.configure(text="Please enter a keyword")
        root.update()
        return 0
    linklist = []
    counter = 0
    doom = 0
    while counter < results:
        links = getgooglelinks(keyword,startr=counter)
        for link in links:
            if len(linklist) > CONSTANT_PAGECOUNT:
                s.configure(text="Proccessing..")
                root.update()
                return linklist
            else:
                doom = doom + 1
                linklist.append(link)
                texx = str(doom)+"/"+str(CONSTANT_PAGECOUNT)
                s.configure(text=texx)
                root.update()
        root.update()
        counter = counter+accuracy
    return linklist

def flip():
    global e
    emails = set()
    info = []
    keyword = ''+str(e.get())
    if keyword.strip()=="":
        s.configure(text="Please enter a keyword")
        root.update()
        return 0
    s.configure(text="Generating index..")
    root.update()
    doom = -1
    index = rip(CONSTANT_PAGECOUNT,10)
    if 1:
        try:
            pool = Pool(processes=4)
            #s.configure(text="Shifting Workload to cores..")
            #root.update()
            pool.map(f, index)
            pool.close()
        except:
            print "The errors there.."
    j.config(value=CONSTANT_PAGECOUNT)
    if len(emails) > 0:
        filepath='relavant_list_'+str(random.randint(1,9999))+'.emList.txt'
        #print len(emails),
        #print "emails found."
        ggg = open(filepath,'a+')
        for x in emails:
            ggg.write(x+"\n")
        showinfo(
            str(len(emails))+" key word related sites found!",
            " sites are saved in "+str(filepath)
        )
        showFile(emails,info)
        s.configure(text=filepath)
    else:
        s.configure(text='No related sites found : (')

if __name__ == '__main__':
    ### CONSTANTS
    version = '1.0'
    ### END CONSTANTS
    root = Tk()
    root.title('Program v'+version)
    s = Style()
    s.theme_use('default')
    #print s.theme_names()
    s.configure("black.Horizontal.TProgressbar", foreground='blue', background='blue')
    j = Progressbar(root, style="black.Horizontal.TProgressbar", orient="vertical", length=200, mode="determinate", maximum=CONSTANT_PAGECOUNT, value=0)
    j.pack(side='right',fill='y')
    f = Frame(root)
    x = Frame(f)
    e = Entry(x,width=51)
    s = Label(x,width=50,anchor='center',text='Waiting for task..')
    Button(f,text='Generate List!',width=50,command=flip).pack(fill='both',expand=True)
    s.pack(side='bottom',fill='y',expand=True)
    e.pack(side='top',fill='both',expand=True)
    x.pack(side='top',fill='y',expand=True)
    f.pack(side='left',expand=True,fill="both")
    root.mainloop()
You are leaking a tkinter object, most likely because you are trying to update the interface from another process in the last line of f().
Update based on code
You have a name collision between your function f() and the variable f in your __main__ (f = Frame(root)); that Tkinter widget is what triggers the tkapp pickle error when Pool tries to send it to the workers. Rename the function to def myfunc() or something. You also need to call pool.join() after pool.close().
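A minimal sketch of that fix (the name worker is illustrative; everything else follows the question's code):

def worker(x):
    # Renamed from f() so it no longer collides with f = Frame(root) in __main__.
    # It must not touch Tkinter widgets, since it runs in a separate process.
    return check(x)

pool = Pool(processes=4)
results = pool.map(worker, index)
pool.close()
pool.join()   # wait for the worker processes to finish before using the results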

Why is my Python code returning an error when I try to fetch YouTube videos for a given keyword?

Whenever I try to run my code, I receive the following error: "Comment_content Error! 'NoneType' object has no attribute 'href'". I am new to Python and did not write this code myself; it was given to me to use. My understanding is that it was functioning properly before. Could this have to do with changes in the YouTube Data API since it was written?
import pdb
import gdata.youtube
import gdata.youtube.service
import codecs
import time

client = gdata.youtube.service.YouTubeService()
query = gdata.youtube.service.YouTubeVideoQuery()
### the input words are here
query.vq = "4b hair"
#######
# the out put file are here
viewFile = codecs.open('views4b_hair.csv', 'w')
commentFile=codecs.open('comments4b_hair.csv', 'w')
##########
query.max_results = 50
query.start_index = 0
query.safesearch = "moderate"
#query.format = 5
query.orderby = "relevance"
#query.author = "hawaiinani"
#pdb.set_trace()
for i in range(19):
    #pdb.set_trace()
    query.start_index=str(int(query.start_index)+50)
    feed = client.YouTubeQuery(query)
    print len(feed.entry)
    youtubeid=[]
    youtubetitle=[]
    for entry in feed.entry:
        #youtubetitle.append(entry.title.text)
        youtubeid.append(entry.id.text[38:])
        print entry.id.text[38:],i
        try:
            entry_comment = client.GetYouTubeVideoEntry(video_id=entry.id.text[38:])
            comment_feed = client.GetYouTubeVideoCommentFeed(video_id=entry.id.text[38:])
            viewFile.write(','.join([entry.id.text[38:],entry_comment.published.text,
                str(entry_comment.media.duration.seconds), str(entry_comment.statistics.view_count),comment_feed.total_results.text,entry_comment.media.title.text.decode('ascii', errors='ignore').encode('ascii', 'ignore')]) + '\n')
            #videop.append("%s, %s,%s, %s, %s, %s" % (search_result["id"]["videoId"],entry.published.text,
            #    entry.media.duration.seconds, entry.statistics.view_count,comment_feed.total_results.text,entry.media.title.text))
            #
            #time.sleep(3)
        except Exception, ex:
            print 'View_content Error', ex
            time.sleep(10)
        try:
            comment_content = client.GetYouTubeVideoCommentFeed(video_id=entry.id.text[38:])
            indexh=0
            #while comment_content:
            while indexh<10:
                indexh=indexh+1
                for comment_entry in comment_content.entry:
                    pubText = comment_entry.published.text
                    #print pubText
                    titleText = comment_entry.content.text.decode('ascii', errors='ignore').encode('ascii', 'ignore')
                    #print titleText
                    #print 'Got title'
                    #pubText, titleText = comment_entry.published.text, comment_entry.title.text
                    commentFile.write(','.join([entry.id.text[38:],pubText,titleText]) + '\n'+'\n')
                    #commentFile.write(u',')
                    #commentFile.write(pubText + u',')
                    #print 'About to write title'
                    #print titleText
                    #print 'Wrote title'
                    #commentlist.append("%s, %s,%s" % (search_result["id"]["videoId"],pubText, titleText))
                comment_content=client.Query(comment_content.GetNextLink().href)
                #time.sleep(3)
            #time.sleep(3)
        except Exception, ex:
            print 'Comment_content Error!', ex
            time.sleep(5)
#pdb.set_trace()
viewFile.close()
commentFile.close()
The error occurs when comment_content.GetNextLink() becomes None. In order to fix it, replace:
while indexh < 10:
with:
while indexh < 10 and comment_content:
also replace:
comment_content=client.Query(comment_content.GetNextLink().href)
with:
next_link = comment_content.GetNextLink()
if next_link:
    comment_content = client.Query(next_link.href)
else:
    comment_content = None
Hope that helps.