"Unboundlocalerror: Local Variable "Val" Referenced before Assignment" Error - python-2.7

I have been trying to get my script to loop in such a way that it will load the outputs into output file 1, and when it has finished loading everything, move the values to output file 2, erase the values in output file 1, and start reloading them; then, when those are done, move the values into output file 2 again (overwriting the old ones), and repeat.
I have been pretty successful so far but don't know what else to add to my script, and I'm hoping someone here knows why I keep getting the "UnboundLocalError: local variable 'val' referenced before assignment" error randomly midway through the loading process; when I use a very small input file, the script performs how I want.
Does anyone know how I can change my script to fix that error? I have tried to understand why it is happening but cannot.
I have tried to research it thoroughly, but none of the suggestions I have found have worked (or I implemented them incorrectly). I have attached my script. Thanks!
import urllib2, re, urllib, urlparse, csv, sys, time, threading, codecs, shutil
from bs4 import BeautifulSoup

def extract(url):
    try:
        sys.stdout.write('0')
        # global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div', {'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        # ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
    return val
while True:
    ifile = open('output.csv', "w", 0)
    inputs = csv.reader(open('input.csv'))
    # inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))
    ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
    for i in inputs:
        ifile.write(extract(i[0]))
    ifile.close()
Update:
Thanks for the help guys! This is my new script:
import urllib2, re, urllib, urlparse, csv, sys, time, threading, codecs, shutil
from bs4 import BeautifulSoup

def extract(url):
    try:
        sys.stdout.write('0')
        # global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div', {'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        # ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
    else:
        return val
while True:
    ifile = open('output.csv', "w", 0)
    inputs = csv.reader(open('input.csv'))
    # inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))
    ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
    for i in inputs:
        val_to_write = extract(i[0])
        if val_to_write:
            ifile.write(val_to_write)
        ifile.close()
    shutil.copy('output.csv', 'output2.csv')
    print("finished")
With the above script I am now getting the error: "ValueError: I/O operation on closed file". Thanks
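The "ValueError: I/O operation on closed file" means a write is still reaching ifile after ifile.close() has already run, which typically happens when the close() call is indented inside the for loop. One way to make that mistake impossible is to let a with block own the file for the whole pass; the following is only a sketch of the outer loop, assuming the same extract function and file names as in the script:

import csv
import shutil
import time

while True:
    # the with block closes output.csv automatically once every row has been written
    with open('output.csv', 'w') as ifile:
        ifile.write('URL,Price,Stock,Time\n')
        for row in csv.reader(open('input.csv')):
            val_to_write = extract(row[0])
            if val_to_write:              # skip rows where extract() hit an exception
                ifile.write(val_to_write)
    # the snapshot is copied only after the file has been closed
    shutil.copy('output.csv', 'output2.csv')
    print("finished")
    time.sleep(60)  # hypothetical pause between passes; adjust or remove as needed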

Use try-except-else, as you would only want to return val if no exception was raised (if an exception was raised, then val wouldn't have been assigned when you try to return it). Another suggestion is not to use a "catch-em-all" except block.
def extract(url):
    try:
        sys.stdout.write('0')
        # global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div', {'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        # ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
    else:
        return val
But be warned: if an exception does occur then extract will return None, and the calling code will have to account for that, for example:
for i in inputs:
    val_to_write = extract(i[0])
    if val_to_write:
        ifile.write(val_to_write)
ifile.close()
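The other suggestion above, avoiding a "catch-em-all" except block, is not shown in code. A minimal sketch of what it could look like, assuming the only failures worth handling here are network errors from urllib2 and missing page elements (which surface as AttributeError when findNext is called on None):

import time
import urllib2
from bs4 import BeautifulSoup

def extract(url):
    try:
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div', {'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        oos = 'In Stock' if oos is None else oos.getText()
    except urllib2.URLError as e:
        print 'network error for %s: %s' % (url, e)            # the request never completed
    except AttributeError as e:
        print 'unexpected page layout for %s: %s' % (url, e)   # a find() returned None
    else:
        return url + "," + price + "," + oos + "," + time.ctime() + '\n'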

Related

data crawling from website. Result changes multiple times

In this code I am trying to fetch some attributes of mobile phones, but I am unable to get the information, even though those attributes are present in the URLs.
import requests, re
from bs4 import BeautifulSoup
from time import sleep
import urllib2

def demo(url):
    header = {'user-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64)', }
    array = []
    camera_review = processor_review = battery_review = display_review = verdict = pro = con = ""
    request = ""
    while True:
        try:
            #request=urllib2.urlopen(url)
            request = requests.get(url, "headers=header").text
            break
        except requests.exceptions.ConnectionError:
            print " connection error"
            sleep(15)
    ############################################################
    soup = BeautifulSoup(request, "html.parser")
    try:
        pros = soup.find("ul", attrs={"class": "for_list_overview"})
        for i in pros.find_all('li'):
            temp = i.find('span').contents[0]
            pro += temp + "\n"
    except AttributeError:
        print "pro not found"
    ############################################################
    try:
        cons = soup.find("ul", attrs={"class": "against_list_overview"})
        for j in cons.find_all('li'):
            temp = j.find('span').contents[0]
            con += temp + "\n"
        print con
    except AttributeError:
        print "con not available"
    ############################################################
    try:
        dac = soup.find("div", attrs={"class": "overview_specs_green_box display_none"})
        for k in dac.find_all(text=re.compile('camera')):
            camera_review += k
        print camera_review
    except AttributeError:
        print "camera review not available"
        #print k
    ############################################################
    try:
        for l in dac.find_all(text=re.compile('processor')):
            processor_review += l
        print processor_review
    except AttributeError:
        print "processor review not available"
        #print l
    ############################################################
    try:
        for m in dac.find_all(text=re.compile('battery')):
            battery_review += m
        print battery_review
    except AttributeError:
        print "battery review not available"
        #print m
    ############################################################
    try:
        for n in dac.find_all(text=re.compile('display')):
            display_review += n
        print display_review
    except AttributeError:
        print "display review not available"
    ############################################################
    try:
        vid = soup.find("div", attrs={"style": "font-weight:400 !important;color: #3c3c3c;"})
        for o in vid.find_all(text=re.compile('a')):
            verdict += o
        if len(verdict) == 0:
            temp = 'na'
            verdict += temp
        print verdict
    except AttributeError:
        print "verdict Attribute Error"

url = ["http://www.91mobiles.com/xiaomi-redmi-note-3-32gb-price-in-india",
       "http://www.91mobiles.com/blackberry-priv-price-in-india",
       "http://www.91mobiles.com/oneplus-3-price-in-india",
       "http://www.91mobiles.com/coolpad-note-5-price-in-india",
       "http://www.91mobiles.com/vivo-v3-max-price-in-india",
       "http://www.91mobiles.com/oppo-f1s-price-in-india"]
i = 0
while i < len(url):
    demo(url[i])
    print "###################################################################################################"
    print "###################################################################################################"
    i += 1
Each time I run the code, the output changes.
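One detail worth checking (an observation, not a confirmed cause of the changing output): requests.get(url, "headers=header") passes the literal string "headers=header" as the params argument, so the User-Agent header is never actually sent and the site may serve different markup on different requests. A sketch of the retry loop with the headers passed as a keyword argument (fetch is a hypothetical helper name):

import requests
from time import sleep

def fetch(url):
    # mirrors the retry loop inside demo(), but sends the header dict correctly
    header = {'user-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64)'}
    while True:
        try:
            return requests.get(url, headers=header).text
        except requests.exceptions.ConnectionError:
            print "connection error"
            sleep(15)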

How to use beautifulsoup to save html of a link in a file and do the same with all the links in the html file

I'm trying to write a parser that will take a URL and download its HTML into a .html file. Then it will go through the HTML file to find all links and save them as well. I want to repeat this multiple times. Can someone please help a little?
This is the code I have written:
import requests
import urllib2
from bs4 import BeautifulSoup

link_set = set()
count = 1
give_url = raw_input("Enter url:\t")

def magic(give_url):
    page = urllib2.urlopen(give_url)
    page_content = page.read()
    with open('page_content.html', 'w') as fid:
        fid.write(page_content)
    response = requests.get(give_url)
    html_data = response.text
    soup = BeautifulSoup(html_data)
    list_items = soup.find_all('a')
    for each_item in list_items:
        html_link = each_item.get('href')
        link_set.add(give_url + str(html_link))

magic(give_url)
for each_item in link_set:
    print each_item
    print "\n"
Although it's working fine, when I try to call the magic function in a for loop, I get RuntimeError: Set changed size during iteration.
I got it working.
The code for recursive URL parsing using beautiful soup:
import requests
import urllib2
from bs4 import BeautifulSoup

link_set = set()
give_url = raw_input("Enter url:\t")

def magic(give_url, link_set, count):
    # print "______________________________________________________"
    #
    # print "Count is: " + str(count)
    # count += 1
    # print "THE URL IT IS SCRAPPING IS:" + give_url
    page = urllib2.urlopen(give_url)
    page_content = page.read()
    with open('page_content.html', 'w') as fid:
        fid.write(page_content)
    response = requests.get(give_url)
    html_data = response.text
    soup = BeautifulSoup(html_data)
    list_items = soup.find_all('a')
    for each_item in list_items:
        html_link = each_item.get('href')
        if html_link is None:
            pass
        else:
            if not (html_link.startswith('http') or html_link.startswith('https')):
                link_set.add(give_url + html_link)
            else:
                link_set.add(html_link)
    # print "Total links in the given url are: " + str(len(link_set))

magic(give_url, link_set, 0)

link_set2 = set()
link_set3 = set()
for element in link_set:
    link_set2.add(element)

count = 1
for element in link_set:
    magic(element, link_set3, count)
    count += 1
    for each_item in link_set3:
        link_set2.add(each_item)
    link_set3.clear()

count = 1
print "Total links scraped are: " + str(len(link_set2))
for element in link_set2:
    count += 1
    print "Element number " + str(count) + "processing"
    print element
    print "\n"
There are many mistakes, so I ask you all to please tell me where I can improve the code.
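On the RuntimeError: Set changed size during iteration mentioned earlier in this question: a set cannot be added to while a for loop is walking it, which is exactly what happens if magic() inserts new links into the same set the loop is iterating. A small sketch of the usual workaround, iterating over a snapshot taken with list() and merging afterwards (fake_magic is a stand-in for the real scraping function):

link_set = set(['http://example.com/a', 'http://example.com/b'])   # placeholder seed links

def fake_magic(url, found):
    # stand-in for magic(): pretend each page links to one more page
    found.add(url + '/next')

new_links = set()
for url in list(link_set):      # list() snapshots the set, so mutating link_set later is safe
    fake_magic(url, new_links)
link_set |= new_links           # merge the newly discovered links only after iteration finishes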

print if list index out of range

Hi all, I'm trying to create a handler for "list index out of range" but don't seem to be having any luck.
import json, urllib, re
from urllib import urlencode
import googlemaps
import tempfile
import win32api
import win32print

start = "Adelaide, South Australia"
finish = " ghkjffzh, south Australia "
url = 'http://maps.googleapis.com/maps/api/directions/json?%s' % urlencode((
    ('origin', start),
    ('destination', finish)
))
ur = urllib.urlopen(url)
result = json.load(ur)
filename = "output.txt"
with open(filename, 'w') as output:
    for i in range(0, len(result['routes'][0]['legs'][0]['steps'])):
        try:
            s = (result['routes'][0]['legs'][0]['steps'][i]['html_instructions'])
            d = (result['routes'][0]['legs'][0]['steps'][i]['distance']['text'])
            l = (result['routes'][0]['legs'][0]['steps'][i]['duration']['text'])
            s = re.sub('<[A-Za-z\/][^>]*>', '', s)
            output.writelines(s + " " + d + " " + l + '\n')
        except Exception:
            print "Directions could not be printed"
            output.write("Directions could not be given due to the format of page or the address type")
But nothing is written to the .txt file and I still get the error.
I've tried replacing Exception with IndexError and ValueError, but there is no change.
Solved: by exploring the returned JSON result I found a status field, so I check that first.
with open(filename, 'w') as output:
    if result['status'] == "NOT_FOUND":
        output.write("no directions available")
    else:
        for i in range(0, len(result['routes'][0]['legs'][0]['steps'])):
            s = (result['routes'][0]['legs'][0]['steps'][i]['html_instructions'])
            d = (result['routes'][0]['legs'][0]['steps'][i]['distance']['text'])
            l = (result['routes'][0]['legs'][0]['steps'][i]['duration']['text'])
            s = re.sub('<[A-Za-z\/][^>]*>', '', s)
            output.writelines(s + " " + d + " " + l + '\n')

(Python) Shelve + try: how to search for a key and set it if it doesn't exists without repeating code?

To store the user's work folders permanently, I'm using shelve. And to know whether the user has the folders configured, I use a similar block of code three times:
pastaUsuario = os.getenv('HOMEDRIVE') + os.getenv('HOMEPATH')
pastaPrincipal = pastaUsuario + '\\rev'
pastaConfig = pastaPrincipal + '\\config'

config = shelve.open(pastaConfig + '\\config.db')
try:
    pastaIsometricosSpooler = config['pastaIsometricosSpooler']
except Exception:
    config['pastaIsometricoSpooler'] = raw_input('Digite o caminho da pasta de extração do Spooler: ')
    pastaIsometricosSpooler = config['pastaIsometricosSpooler']
finally:
    config.close()

config = shelve.open(pastaConfig + '\\config.db')
try:
    ultimoIso = config['ultimoIso']
except Exception:
    config['ultimoIso'] = raw_input('Digite o tag do isométrico a ser revisado: ')
    ultimoIso = config['ultimoIso']
finally:
    config.close()

config = shelve.open(pastaConfig + '\\config.db')
try:
    ultimaRev = config['ultimaRev']
except Exception:
    config['ultimaRev'] = raw_input('Digite a nova revisão: ')
    ultimaRev = config['ultimaRev']
finally:
    config.close()
How can I avoid repeating this almost identical code?
I tried to use the "for" statement with a list:
config = shelve.open(pastaConfig + '\\config.db')
for x in ['pastaIsometricosSpooler', 'ultimoIso', 'ultimaRev']:
    try:
        x = config[x]
    except Exception:
        config[x] = raw_input()
        x = config[x]
    finally:
        config.close()
But setting the variables doesn't work because of the quotes (e.g. 'ultimaRev' = config['ultimaRev']).
Sorry for my bad English!
This is probably best done by using a function rather than trying to make the same code work in a loop. Adapting what you have:
def getconfig(x, prompt):
    try:
        theconf = config[x]
    except Exception:
        config[x] = raw_input(prompt)
        theconf = config[x]
    return theconf
Then you can use it three times:
config = shelve.open(pastaConfig + '\\config.db')
ultimaRev = getconfig('ultimaRev', 'Digite a nova revisão: ')
ultimoIso = getconfig('ultimoIso', 'Digite o tag do')
pastaIsometricosSpooler = getconfig('pastaIsometricosSpooler', 'Digite o caminho da')
config.close()
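A variant that keeps the loop the question attempted is to collect the values in a dictionary keyed by setting name, instead of trying to rebind a variable through a string. This is only a sketch, assuming the same pastaConfig path, shelve file, keys, and prompts as above:

import shelve

prompts = {
    'pastaIsometricosSpooler': 'Digite o caminho da pasta de extração do Spooler: ',
    'ultimoIso': 'Digite o tag do isométrico a ser revisado: ',
    'ultimaRev': 'Digite a nova revisão: ',
}

settings = {}
config = shelve.open(pastaConfig + '\\config.db')
try:
    for key, prompt in prompts.items():
        if key not in config:            # prompt only for values that are not stored yet
            config[key] = raw_input(prompt)
        settings[key] = config[key]
finally:
    config.close()

ultimaRev = settings['ultimaRev']        # individual names can still be unpacked if needed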

Why is my Python code returning an error when I try to fetch YouTube videos for a given keyword?

Whenever I try to run my code, I receive the following error: "Comment_content Error! 'NoneType' object has no attribute 'href'". I am new to Python and did not write this code myself; it was given to me to use. My understanding is that it was functioning properly before. Could this have to do with changes in the YouTube Data API since it was written?
import pdb
import gdata.youtube
import gdata.youtube.service
import codecs
import time

client = gdata.youtube.service.YouTubeService()
query = gdata.youtube.service.YouTubeVideoQuery()
### the input words are here
query.vq = "4b hair"
#######
# the out put file are here
viewFile = codecs.open('views4b_hair.csv', 'w')
commentFile = codecs.open('comments4b_hair.csv', 'w')
##########
query.max_results = 50
query.start_index = 0
query.safesearch = "moderate"
#query.format = 5
query.orderby = "relevance"
#query.author = "hawaiinani"
#pdb.set_trace()
for i in range(19):
    #pdb.set_trace()
    query.start_index = str(int(query.start_index) + 50)
    feed = client.YouTubeQuery(query)
    print len(feed.entry)
    youtubeid = []
    youtubetitle = []
    for entry in feed.entry:
        #youtubetitle.append(entry.title.text)
        youtubeid.append(entry.id.text[38:])
        print entry.id.text[38:], i
        try:
            entry_comment = client.GetYouTubeVideoEntry(video_id=entry.id.text[38:])
            comment_feed = client.GetYouTubeVideoCommentFeed(video_id=entry.id.text[38:])
            viewFile.write(','.join([entry.id.text[38:], entry_comment.published.text,
                str(entry_comment.media.duration.seconds), str(entry_comment.statistics.view_count),
                comment_feed.total_results.text,
                entry_comment.media.title.text.decode('ascii', errors='ignore').encode('ascii', 'ignore')]) + '\n')
            #videop.append("%s, %s,%s, %s, %s, %s" % (search_result["id"]["videoId"], entry.published.text,
            #    entry.media.duration.seconds, entry.statistics.view_count, comment_feed.total_results.text, entry.media.title.text))
            #time.sleep(3)
        except Exception, ex:
            print 'View_content Error', ex
            time.sleep(10)
        try:
            comment_content = client.GetYouTubeVideoCommentFeed(video_id=entry.id.text[38:])
            indexh = 0
            #while comment_content:
            while indexh < 10:
                indexh = indexh + 1
                for comment_entry in comment_content.entry:
                    pubText = comment_entry.published.text
                    #print pubText
                    titleText = comment_entry.content.text.decode('ascii', errors='ignore').encode('ascii', 'ignore')
                    #print titleText
                    #print 'Got title'
                    #pubText, titleText = comment_entry.published.text, comment_entry.title.text
                    commentFile.write(','.join([entry.id.text[38:], pubText, titleText]) + '\n' + '\n')
                    #commentFile.write(u',')
                    #commentFile.write(pubText + u',')
                    #print 'About to write title'
                    #print titleText
                    #print 'Wrote title'
                    #commentlist.append("%s, %s,%s" % (search_result["id"]["videoId"], pubText, titleText))
                comment_content = client.Query(comment_content.GetNextLink().href)
                #time.sleep(3)
            #time.sleep(3)
        except Exception, ex:
            print 'Comment_content Error!', ex
            time.sleep(5)
#pdb.set_trace()
viewFile.close()
commentFile.close()
The error occurs when comment_content.GetNextLink() becomes None. In order to fix it, replace:
while indexh < 10:
with:
while indexh < 10 and comment_content:
also replace:
comment_content=client.Query(comment_content.GetNextLink().href)
with:
next_link = comment_content.GetNextLink()
if next_link:
    comment_content = client.Query(next_link.href)
else:
    comment_content = None
Hope that helps.
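Putting the two replacements together, the inner paging loop would look roughly like this (a sketch only, assuming the same gdata client, entry, and commentFile objects used in the question):

comment_content = client.GetYouTubeVideoCommentFeed(video_id=entry.id.text[38:])
indexh = 0
while indexh < 10 and comment_content:          # stop once there are no more pages
    indexh = indexh + 1
    for comment_entry in comment_content.entry:
        pubText = comment_entry.published.text
        titleText = comment_entry.content.text.decode('ascii', errors='ignore').encode('ascii', 'ignore')
        commentFile.write(','.join([entry.id.text[38:], pubText, titleText]) + '\n\n')
    next_link = comment_content.GetNextLink()   # None once the last page has been fetched
    comment_content = client.Query(next_link.href) if next_link else None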