List index out of range when running Selenium

I am new to coding and I have a question. I tried to run Selenium to scrape data spread over 13 pages in total. Unfortunately, after page 13 the loop keeps running, and I don't know how to stop it. This is the error: list index out of range when running Selenium. Please, I need your help. Thank you!
This is the code that I wrote:
txt = driver.find_element(By.XPATH, '//*[@id="searchResultsCount"]').text
print(txt)
print(txt.split(' '))
def result_status(driver):
    txt = driver.find_element(By.XPATH, '//*[@id="searchResultsCount"]').text
    current = txt.split(' ')[1]
    end = txt.split(' ')[3]
    return current, end, driver

def next_page(driver):
    pg_elems = driver.find_elements(By.CLASS_NAME, 'page-link')  # <a href="#page-274"
    nxt_elem = [x for x in pg_elems if x.text == 'Next'][0]
    nxt_elem.click()
    time.sleep(2)
    return driver
driver = next_page(driver)
results_df = pd.DataFrame()

# Put it all together (From Matt)
# Get current results
current, end, driver = result_status(driver)

# Loop through results
i = 0
while current != end:
    i += 1
    if i % 2 == 0:
        results = driver.find_element(By.ID, 'searchResultsArea')
        results_html = results.get_attribute('innerHTML')
        temp = pd.read_html(results_html)[0]
        results_df = pd.concat([results_df, temp], ignore_index=True)
        results_df.to_csv("results.csv", index=False)
    # Check status
    current, end, driver = result_status(driver)
    print(current, '|', end='')
    # Go to next page
    driver = next_page(driver)
    if i == 660:
        break

results = driver.find_element(By.ID, 'overSearchResults')
results_html = results.get_attribute('innerHTML')
df = pd.read_html(results_html)
And this is the error
IndexError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_37444/2741504647.py in <module>
21
22 #Go to next page
---> 23 driver = next_page(driver)
24 if i == 690:
25 break
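The IndexError comes from next_page: on the last page there is no element whose text is 'Next', so [x for x in pg_elems if x.text == 'Next'][0] indexes into an empty list. One way to stop cleanly is to have next_page report whether a 'Next' link was actually found. This is only a sketch built on the code above, not tested against the site:

def next_page(driver):
    pg_elems = driver.find_elements(By.CLASS_NAME, 'page-link')
    next_links = [x for x in pg_elems if x.text == 'Next']
    if not next_links:           # last page: no 'Next' link, tell the caller to stop
        return driver, False
    next_links[0].click()
    time.sleep(2)
    return driver, True

# in the while loop, replace driver = next_page(driver) with:
# driver, has_next = next_page(driver)
# if not has_next:
#     break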

Related

Scrapy: detect if an XPath does not exist

I've been trying to make my first crawler and I've accomplished what I needed (get the shipping info and prices of the 1st and 2nd shops), but with 2 crawlers instead of 1, because I've hit a big stopper here.
When there is more than one shop, the output is:
In [1]: response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()').extract()
Out[1]:
[u'ENV\xcdO 3,95\u20ac ',
u'ENV\xcdO GRATIS',
u'ENV\xcdO GRATIS',
u'ENV\xcdO 4,95\u20ac ']
To get only the second result I'm using:
In [2]: response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')[1].extract()
Out[2]: u'ENV\xcdO GRATIS'
But when there is no second result (only 1 shop) I'm getting:
IndexError: list index out of range
And the crawler skips the full page, even if the other items have data...
After trying several times I decided on a quick fix to get the results: 2 crawlers, one for the first shops and the other for the second ones. But now I want to do it cleanly with only 1 crawler.
Any help, tip or advice would be appreciated; this is my first try at making a recursive crawler with scrapy, and I kind of like it.
Here's the code:
# -*- coding: utf-8 -*-
import scrapy
from Guapalia.items import GuapaliaItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class GuapaliaSpider(CrawlSpider):
name = "guapalia"
allowed_domains = ["guapalia.com"]
start_urls = (
'https://www.guapalia.com/perfumes?page=1',
'https://www.guapalia.com/maquillaje?page=1',
'https://www.guapalia.com/cosmetica?page=1',
'https://www.guapalia.com/linea-de-bano?page=1',
'https://www.guapalia.com/parafarmacia?page=1',
'https://www.guapalia.com/solares?page=1',
'https://www.guapalia.com/regalos?page=1',
)
rules = (
Rule(LinkExtractor(restrict_xpaths="//div[#class='js-pager']/a[contains(text(),'Siguientes')]"),follow=True),
Rule(LinkExtractor(restrict_xpaths="//div[#class='list-display__item list-display__item--product']/div/a[#class='col-xs-10 col-sm-10 col-md-12 clickOnProduct']"),callback='parse_articles',follow=True),
)
def parse_articles(self, response):
item = GuapaliaItem()
articles_urls = response.url
articles_first_shop = response.xpath('//div[#class="container-fluid list-display-box--best-deal"]/div/div/div/div[#class="retailer-logo autoimage-container"]/img/#title').extract()
articles_first_shipping = response.xpath('//div[#class="container-fluid list-display-box--best-deal"]/div/div/div/div[#class="shipping"]/p//text()').extract()
articles_second_shop = response.xpath('//li[#class="container list-display-box__list__container"]/div/div/div/div/div/img/#title')[1].extract()
articles_second_shipping = response.xpath('//li[#class="container list-display-box__list__container"]/div/div/div/div/div[#class="shipping"]/p//text()')[1].extract()
articles_name = response.xpath('//div[#id="ProductDetail"]/#data-description').extract()
item['articles_urls'] = articles_urls
item['articles_first_shop'] = articles_first_shop
item['articles_first_shipping'] = articles_first_shipping
item['articles_second_shop'] = articles_second_shop if articles_second_shop else 'N/A'
item['articles_second_shipping'] = articles_second_shipping
item['articles_name'] = articles_name
yield item
Basic output of the crawler, with the right format, when there is more than one shop:
2017-09-21 09:53:11 [scrapy] DEBUG: Crawled (200) <GET https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355> (referer: https://www.guapalia.com/perfumes?page=1)
2017-09-21 09:53:11 [scrapy] DEBUG: Scraped from <200 https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355>
{'articles_first_shipping': [u'ENV\xcdO GRATIS'],
'articles_first_shop': [u'DOUGLAS'],
'articles_name': [u'ZEN edp vaporizador 100 ml'],
'articles_second_shipping': u'ENV\xcdO 3,99\u20ac ',
'articles_second_shop': u'BUYSVIP',
'articles_urls': 'https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355'}
The problem appears when a second shop doesn't exist, because my code indexes the second-shop field directly and raises:
IndexError: list index out of range
SOLUTION Thanks to @Tarun Lalwani
def parse_articles(self, response):
    item = GuapaliaItem()
    articles_urls = response.url
    articles_first_shop = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="retailer-logo autoimage-container"]/img/@title').extract()
    articles_first_shipping = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="shipping"]/p//text()').extract()
    articles_second_shop = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div/img/@title')
    articles_second_shipping = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')
    articles_name = response.xpath('//div[@id="ProductDetail"]/@data-description').extract()

    if len(articles_second_shop) > 1:
        item['articles_second_shop'] = articles_second_shop[1].extract()
    else:
        item['articles_second_shop'] = 'Not Found'

    if len(articles_second_shipping) > 1:
        item['articles_second_shipping'] = articles_second_shipping[1].extract()
    else:
        item['articles_second_shipping'] = 'Not Found'

    item['articles_urls'] = articles_urls
    item['articles_first_shop'] = articles_first_shop
    item['articles_first_shipping'] = articles_first_shipping
    item['articles_name'] = articles_name
    yield item
You need to get the result into a variable first. Then you can make a decision based on its length:
texts = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')
if len(texts) > 1:
    data = texts[1].extract()
elif len(texts) == 1:
    data = texts[0].extract()
else:
    data = "Not found"

Memory Error when exporting data to csv file

Hello, I was hoping someone could help me with my college coursework; I have an issue with my code. I keep running into a memory error when exporting the data.
Is there any way I can reduce the memory being used, or is there a different approach I can take?
For the coursework I am given a CSV file of 300 records about customer orders, and I have to export the Friday records to a new CSV file. I am also required to print the most popular ordering method and the total money raised from the orders, but I have an easy plan for that.
This is my first time working with CSV, so I'm not sure how to do it. When I run the program it tends to crash instantly or stop responding. Once it showed 'MEMORY ERROR', but that is all it showed. I'm using a college-provided computer, so I am not sure of the exact specs, but I know it has 4GB of memory.
Defining the count occurrences function:
def countOccurences(target,array):
    counter = 0
    for element in array:
        if element == target:
            counter = counter + 1
    print counter
    return counter
Creating user-defined functions for the program.
The dataInput function is used for collecting data from the provided file:
def dataInput():
    import csv
    recordArray = []
    customerArray = []
    f = open('E:\Portable Python 2.7.6.1\Choral Shield Data File(CSV).csv')
    csv_f = csv.reader(f)
    for row in csv_f:
        customerArray.append(row[0])
        ticketID = row[1]
        day, area = datasplit(ticketID)
        customerArray.append(day)
        customerArray.append(area)
        customerArray.append(row[2])
        customerArray.append(row[3])
        recordArray.append(customerArray)
    f.close
    return recordArray

def datasplit(variable):
    day = variable[0]
    area = variable[1]
    return day, area
def dataProcessing(recordArray):
    methodArray = []
    wed_thursCost = 5
    friCost = 10
    record = 0
    while record < 300:
        method = recordArray[record][4]
        methodArray.append(method)
        record = record + 1
    school = countOccurences('S', methodArray)
    website = countOccurences('W', methodArray)
    if school > website:
        school = True
    elif school < website:
        website = True
    dayArray = []
    record = 0
    while record < 300:
        day = recordArray[record][1]
        dayArray.append(day)
        record = record + 1
    fridays = countOccurences('F', dayArray)
    wednesdays = countOccurences('W', dayArray)
    thursdays = countOccurences('T', dayArray)
    totalFriCost = fridays * friCost
    totalWedCost = wednesdays * wed_thursCost
    totalThurCost = thursdays * wed_thursCost
    totalCost = totalFriCost + totalWedCost + totalThurCost
    return totalCost, school, website
My first attempt at writing to a CSV file:
def dataExport(recordArray):
    import csv
    fridayRecords = []
    record = 0
    customerIDArray = []
    ticketIDArray = []
    numberArray = []
    methodArray = []
    record = 0
    while record < 300:
        if recordArray[record][1] == 'F':
            fridayRecords.append(recordArray[record])
        record = record + 1
    with open('\Courswork output.csv', "wb") as f:
        writer = csv.writer(f)
        for record in fridayRecords:
            writer.writerows(fridayRecords)
        f.close
My second attempt at writing to the CSV file
def write_file(recordArray):  # write selected records to a new csv file
    CustomerID = []
    TicketID = []
    Number = []
    Method = []
    counter = 0
    while counter < 300:
        if recordArray[counter][2] == 'F':
            CustomerID.append(recordArray[counter][0])
            TicketID.append(recordArray[counter][1]+recordArray[counter][2])
            Number.append(recordArray[counter][3])
            Method.append(recordArray[counter][4])
    fridayRecords = []  # a list to contain the lists before writing to file
    for x in range(len(CustomerID)):
        one_record = CustomerID[x], TicketID[x], Number[x], Method[x]
        fridayRecords.append(one_record)
    # open file for writing
    with open("sample_output.csv", "wb") as f:
        # create the csv writer object
        writer = csv.writer(f)
        # write one row (item) of data at a time
        writer.writerows(recordArray)
        f.close
    counter = counter + 1
#Main Program
recordArray = dataInput()
totalCost,school,website = dataProcessing(recordArray)
write_file(recordArray)
In the function write_file(recordArray) in your second attempt, the counter variable in the first while loop is never updated, so the loop continues forever until you run out of memory.
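For reference, a sketch of that second attempt with the counter advanced inside the loop and the file written once after it. The day column index follows dataInput above, and this is untested against the coursework file:

import csv

def write_file(recordArray):
    fridayRecords = []
    counter = 0
    while counter < len(recordArray):
        if recordArray[counter][1] == 'F':   # dataInput puts the day code in column 1
            fridayRecords.append(recordArray[counter])
        counter = counter + 1                # advance the counter so the loop ends
    with open("sample_output.csv", "wb") as f:   # "wb" because this is Python 2 csv
        writer = csv.writer(f)
        writer.writerows(fridayRecords)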

I don't understand why I'm getting an index error when trying to extract EXIF data

The code and error with sample data from an image:
image = Image.open(newest)
exif = image._getexif()
gps = {}
datebool = False
gpsbool = False
date = 'None'
time = 'None'
gpstext = 'None'
dmslat = 'None'
dmslon = 'None'
if exif is not None:
    for tag, entry in exif.items():  # Import date and time from Exif
        datebool = True
        if TAGS.get(tag, tag) == 'DateTimeOriginal':
            date = entry[0:10]
            time = entry[11:19]
    for tag, entry in exif.items():  # Check if the GPSInfo field exists
        if TAGS.get(tag, tag) == 'GPSInfo':
            gpsbool = True
            for e in entry:
                decoded = GPSTAGS.get(e, e)
                print (decoded)
                print(type(entry))
                gps[decoded] = entry[e]
The results
4984
<type 'tuple'>
Traceback (most recent call last):
  File "C:\Users\~~~~~\Desktop\project_7-8-2015\8_bands\Program_camera.py", line 109, in <module>
    gps[decoded] = entry[e]
IndexError: tuple index out of range
Since e is pulled from entry, how can indexing entry with that particular e generate an indexing error? Am I actually pulling the correct data for the GPS?
for e in entry doesn't index the values in entry, it iterates over them. For example:
entry = (3, 5, 7)
for e in entry:
    print(e)
will output:
3
5
7
So the line should probably look like:
gps[decoded] = e
though I'm not sure what the GPSTAGS line would become. If you really need the items in entry enumerated, then you should look into (to your great surprise, I'm sure) the enumerate() function.
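For the GPS branch specifically: the printed type above is a tuple, but current Pillow builds normally return the GPSInfo value as a dict keyed by numeric GPS tag ids. When it is a dict, a sketch of the decoding loop (assuming the usual from PIL.ExifTags import TAGS, GPSTAGS) could be:

for tag, entry in exif.items():
    if TAGS.get(tag, tag) == 'GPSInfo':
        gpsbool = True
        for key, value in entry.items():   # pair each GPS tag id with its own value
            gps[GPSTAGS.get(key, key)] = value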

Parsing a txt file using Python: list index out of range

Hello, I have written a Python program to parse specific data from a txt file.
My code is:
f = open('C:/Users/aikaterini/Desktop/Ericsson_PARSER/BSC_alarms_vf_OSS.txt','r')
from datetime import datetime
import MySQLdb

def firstl():
    with f as lines:
        lines = lines.readlines()
        print len(lines)
        for i,line in enumerate(lines):
            if line.startswith("*** Disconnected from"):
                conline = line.split()
                bsc = conline[-2]
                print "\n"*5
                print bsc
                print "*"*70
                break
        for i,line in enumerate(lines):
            if line.startswith("*** Connected to"):
                conline = line.split()
                bsc = conline[-2]
                print "\n"*5
                print bsc
                print "*"*70
            elif line[:3] == "A1/" or line[:3] == "A2/":
                if lines[i+1].startswith("RADIO"):
                    fal = line.split()
                    first_alarm_line = [fal[0][:2],fal[-2],fal[-1]]
                    year = first_alarm_line[1][:2]
                    month = first_alarm_line[1][2:4]
                    day = first_alarm_line[1][4:]
                    hours = first_alarm_line[2][:2]
                    minutes = first_alarm_line[2][2:]
                    date = datetime.strptime( day + " " + month + " " + year + " " + \
                                              hours+":"+minutes,"%d %m %y %H:%M")
                    print first_alarm_line
                    print date, "\n"
                    print lines[i+1]
                    print lines[i+4]
                    print lines[i+5]
                    desc_line = lines[i+4]
                    desc_values_line = lines[i+5]
                    desc = desc_line.split(None,2)
                    print desc
                    desc_values = desc_values_line.split(None,2)
                    rsite = ""
                    #for x in desc_values[1]:
                    #    if not (x.isalpha() or x == "0"):
                    #        rsite += x
                    rsite = desc_values[1].lstrip('RBS0')
                    print "\t"*2 + "rsite:" + rsite
                    if desc[-1] == "ALARM SLOGAN\n":
                        alarm_slogan = desc_values[-1]
                        print alarm_slogan
                    x = i
                    print x  # to check the line
                    print len(lines)  # check length of lines
                    while not lines[x].startswith("EXTERNAL"):
                        x+=1
                    if lines[x].startswith("EXTERNAL"):
                        while not lines[x] == "\n":
                            print lines[x]
                            x+=1
                    print "\n"*5
                elif lines[i+1].startswith("CELL LOGICAL"):
                    fal = line.split()
                    first_alarm_line = [fal[0][:2],fal[-2],fal[-1]]
                    #print i
                    print first_alarm_line
                    type = lines[i+1]
                    print type
                    cell_line = lines[i+3]
                    cell = cell_line.split()[0]
                    print cell
                    print "\n"*5

##########Database query###########
#db = MySQLdb.connect(host,user,password,database)

firstl()
When I run the program the results are correct,
but it only prints up to line 50672 while there are 51027,
and I get the last printed result with the following error:
['A2', '130919', '0309']
2013-09-19 03:09:00
RADIO X-CEIVER ADMINISTRATION
MO RSITE ALARM SLOGAN
RXOCF-18 RBS03668 OML FAULT
['MO', 'RSITE', 'ALARM SLOGAN\n']
rsite:3668
OML FAULT
50672
51027
Traceback (most recent call last):
File "C:\Python27\parser_v3.py", line 106, in <module>
firstl()
File "C:\Python27\parser_v3.py", line 72, in firstl
while not lines[x].startswith("EXTERNAL"):
IndexError: list index out of range
If I comment out the while-not line I get:
Traceback (most recent call last):
File "C:\Python27\parser_v3.py", line 106, in <module>
firstl()
File "C:\Python27\parser_v3.py", line 60, in firstl
rsite = desc_values[1].lstrip('RBS0')
IndexError: list index out of range
The txt content is like:
A1/EXT "FS G11B/25/13/3" 382 150308 1431
RADIO X-CEIVER ADMINISTRATION
BTS EXTERNAL FAULT
MO RSITE CLASS
RXOCF-16 RBS02190 1
EXTERNAL ALARM
ALARM SYSTEM ON/OFF G2190 DRAMA CNR
A1/EXT "FS G11B/25/13/3" 755 150312 1434
RADIO X-CEIVER ADMINISTRATION
BTS EXTERNAL FAULT
MO RSITE CLASS
RXOCF-113 RBS00674 1
EXTERNAL ALARM
IS.BOAR FAIL G0674 FALAKRO
I don't understand, since I do a split with a max of 2 and get 3 elements, as you can see, and I am picking the 2nd; and if I comment that out, I get another error when I pick an element from another list, even though the program was returning the correct results. Please help me.
Sorry for the long post, thank you in advance.
I haven't dug deep into your code, but have you tried validating that x does not exceed the number of elements in lines before trying to access that index? Also, for readability I'd suggest using lines[x] != rather than not lines[x] ==:
while x < len(lines) and lines[x] != "\n":
I solved it; although I don't know if it is the correct way, it works.
I think the problem was that x was exceeding the length of the list lines containing the file, and there had to be a check after the split that the list had a length larger than or equal to the number of expected elements, so:
if len(desc_values) > 2 and len(desc) > 2:
    rsite = desc_values[1].lstrip('RBS0')
    print "\t"*2 + "rsite:" + rsite
    if desc[-1] == "ALARM SLOGAN\n":
        alarm_slogan = desc_values[-1]
        print alarm_slogan
x = i
print x  # to check the line
print len(lines)  # check length of lines
while x < len(lines):  # check so that x doesn't exceed the length of the file list "lines"
    while not lines[x].startswith("EXTERNAL"):
        x+=1
    if lines[x].startswith("EXTERNAL"):
        while lines[x] != "\n":
            print lines[x]
            x+=1
Thank you man, you really helped me, although I am still trying to find a way to stop the iteration of x to gain some computation time; I tried break, but it throws you completely out of the loop.
Thanks anyway.
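One way to bound that scan without the extra nesting, based on the earlier answer's length check (a sketch only, not tested against the full alarm file):

x = i
# advance to the next "EXTERNAL" line, but never past the end of the file
while x < len(lines) and not lines[x].startswith("EXTERNAL"):
    x += 1
# print the EXTERNAL block up to the next blank line
while x < len(lines) and lines[x] != "\n":
    print lines[x]
    x += 1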

Cannot Pool.map() function because of UnpickleableError?

So I am trying to multiprocess function f, which is triggered by a button press in tkinter.
def f(x):
    global doom,results,info
    doom = doom + 1
    if check(x) == True:
        results.add(x)
        info.append(get_column_number(x))
    j.step(1)
    texx = "1/"+doom
    s.configure(text=texx)
    root.update()
The function is called within a function like so:
def dojob():
    index = ['URLS'...]
    pool = Pool(processes=4)
    s.configure(text="Shifting Workload to cores..")
    root.update()
    pool.map(f, index)
The button is inside the root window.
I get the following error:
Exception in thread Thread-2:
Traceback (most recent call last):
File "C:\Python27\lib\threading.py", line 808, in __bootstrap_inner
self.run()
File "C:\Python27\lib\threading.py", line 761, in run
self.__target(*self.__args, **self.__kwargs)
File "C:\Python27\lib\multiprocessing\pool.py", line 342, in _handle_tasks
put(task)
UnpickleableError: Cannot pickle <type 'tkapp'> objects
I do not even know what pickling does. Help?
Here is the complete code:
from Tkinter import *
from ttk import *
from tkMessageBox import showinfo
from multiprocessing import Pool
import random

emails = set()

import urllib2
import urllib2 as urllib

########
CONSTANT_PAGECOUNT = 20
######

def f(x):
    global doom,emails,info
    doom = doom + 1
    if check(x) == True:
        print "",
        emails.add(x)
        info.append(get_column_number(x))
    j.step(1)
    texx = "Sk1nn1n "+str(doom)+'/'+str(CONSTANT_PAGECOUNT)+""
    s.configure(text=texx)
    root.update()
    return 0

def f(x):
    print ""
def showFile(site,info):
    top = Toplevel()
    top.title('Sites')
    x = Text(top)
    x.pack()
    i=0
    for site_url in site:
        x.insert(END,site_url)
        i=i+1

def get_column_number(url):
    return True

def check(url):
    return True

def getgoogleurl(search,siteurl=False,startr=0):
    if siteurl==False:
        return 'http://www.google.com/search?q='+urllib2.quote(search)+'&start='+str(startr)+'&oq='+urllib2.quote(search)
    else:
        return 'http://www.google.com/search?q=site:'+urllib2.quote(siteurl)+'%20'+urllib2.quote(search)+'&oq=site:'+urllib2.quote(siteurl)+'%20'+urllib2.quote(search)
def getgooglelinks(search,siteurl=False,startr=0):
    #google returns 403 without user agent
    headers = {'User-agent':'Mozilla/11.0'}
    req = urllib2.Request(getgoogleurl(search,siteurl,startr),None,headers)
    site = urllib2.urlopen(req)
    data = site.read()
    site.close()
    #no beautifulsoup because google html is generated with javascript
    start = data.find('<div id="res">')
    end = data.find('<div id="foot">')
    if data[start:end]=='':
        #error, no links to find
        return False
    else:
        links =[]
        data = data[start:end]
        start = 0
        end = 0
        while start>-1 and end>-1:
            #get only results of the provided site
            if siteurl==False:
                start = data.find('<a href="/url?q=')
            else:
                start = data.find('<a href="/url?q='+str(siteurl))
            data = data[start+len('<a href="/url?q='):]
            end = data.find('&sa=U&ei=')
            if start>-1 and end>-1:
                link = urllib2.unquote(data[0:end])
                data = data[end:len(data)]
                if link.find('http')==0:
                    links.append(link)
        return links
def rip(results=15,accuracy=16):
    global e
    keyword = ''+str(e.get())
    if keyword.strip()=="":
        s.configure(text="Please enter a keyword")
        root.update()
        return 0
    linklist = []
    counter = 0
    doom = 0
    while counter < results:
        links = getgooglelinks(keyword,startr=counter)
        for link in links:
            if len(linklist) > CONSTANT_PAGECOUNT:
                s.configure(text="Processing..")
                root.update()
                return linklist
            else:
                doom = doom + 1
                linklist.append(link)
                texx = str(doom)+"/"+str(CONSTANT_PAGECOUNT)
                s.configure(text=texx)
                root.update()
        root.update()
        counter = counter+accuracy
    return linklist
def flip():
    global e
    emails = set()
    info = []
    keyword = ''+str(e.get())
    if keyword.strip()=="":
        s.configure(text="Please enter a keyword")
        root.update()
        return 0
    s.configure(text="Generating index..")
    root.update()
    doom = -1
    index = rip(CONSTANT_PAGECOUNT,10)
    if 1:
        try:
            pool = Pool(processes=4)
            #s.configure(text="Shifting Workload to cores..")
            #root.update()
            pool.map(f, index)
            pool.close()
        except:
            print "The errors there.."
    j.config(value=CONSTANT_PAGECOUNT)
    if len(emails) > 0:
        filepath='relavant_list_'+str(random.randint(1,9999))+'.emList.txt'
        #print len(emails),
        #print "emails found."
        ggg = open(filepath,'a+')
        for x in emails:
            ggg.write(x+"\n")
        showinfo(
            str(len(emails))+" key word related sites found!",
            " sites are saved in "+str(filepath)
        )
        showFile(emails,info)
        s.configure(text=filepath)
    else:
        s.configure(text='No related sites found : (')
if __name__ == '__main__':
    ### CONSTANTS
    version = '1.0'
    ### END CONSTANTS
    root = Tk()
    root.title('Program v'+version)
    s = Style()
    s.theme_use('default')
    #print s.theme_names()
    s.configure("black.Horizontal.TProgressbar", foreground='blue', background='blue')
    j = Progressbar(root, style="black.Horizontal.TProgressbar", orient="vertical", length=200, mode="determinate", maximum=CONSTANT_PAGECOUNT, value=0)
    j.pack(side='right',fill='y')
    f = Frame(root)
    x = Frame(f)
    e = Entry(x,width=51)
    s = Label(x,width=50,anchor='center',text='Waiting for task..')
    Button(f,text='Generate List!',width=50,command=flip).pack(fill='both',expand=True)
    s.pack(side='bottom',fill='y',expand=True)
    e.pack(side='top',fill='both',expand=True)
    x.pack(side='top',fill='y',expand=True)
    f.pack(side='left',expand=True,fill="both")
    root.mainloop()
You are leaking a tkinter object, most likely because you are trying to update the interface from another process with the last lines of f().
Update based on code:
You have a name collision between your function f() and the variable f in your __main__, which gets assigned a tkinter Frame and causes the tkapp pickle error. Rename the function to def myfunc() or something. You also need to call pool.join() after pool.close().
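A minimal sketch of what that fix could look like (myfunc is just a placeholder name; the worker must not touch tkinter widgets, so any progress display has to stay in the main process):

def myfunc(x):                 # renamed so it no longer collides with f = Frame(root)
    if check(x):
        return x, get_column_number(x)
    return None

pool = Pool(processes=4)
for result in pool.map(myfunc, index):   # collect results back in the main process
    if result is not None:
        emails.add(result[0])
        info.append(result[1])
pool.close()
pool.join()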