Scrapy: detect if an XPath does not exist - python-2.7

I've been trying to make my first crawler and I've accomplished what I needed (getting the first and second shops' shipping info and prices), but with 2 crawlers instead of 1, because I've hit a big blocker here.
When there is more than one shop, the output is:
In [1]: response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()').extract()
Out[1]:
[u'ENV\xcdO 3,95\u20ac ',
u'ENV\xcdO GRATIS',
u'ENV\xcdO GRATIS',
u'ENV\xcdO 4,95\u20ac ']
To get only the second result I'm using:
In [2]: response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')[1].extract()
Out[2]: u'ENV\xcdO GRATIS'
But when there's no second result (only one shop) I'm getting:
IndexError: list index out of range
And the crawler skips the whole page, even if the other items have data...
After trying several times I decided on a quick fix to get the result: 2 crawlers, one for the first shop and another for the second one. But now I want to do it cleanly with only 1 crawler.
Any help, tip or advice would be appreciated; this is my first try at making a recursive crawler with Scrapy, and I kinda like it.
Here's the code:
# -*- coding: utf-8 -*-
import scrapy
from Guapalia.items import GuapaliaItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class GuapaliaSpider(CrawlSpider):
    name = "guapalia"
    allowed_domains = ["guapalia.com"]
    start_urls = (
        'https://www.guapalia.com/perfumes?page=1',
        'https://www.guapalia.com/maquillaje?page=1',
        'https://www.guapalia.com/cosmetica?page=1',
        'https://www.guapalia.com/linea-de-bano?page=1',
        'https://www.guapalia.com/parafarmacia?page=1',
        'https://www.guapalia.com/solares?page=1',
        'https://www.guapalia.com/regalos?page=1',
    )

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@class='js-pager']/a[contains(text(),'Siguientes')]"), follow=True),
        Rule(LinkExtractor(restrict_xpaths="//div[@class='list-display__item list-display__item--product']/div/a[@class='col-xs-10 col-sm-10 col-md-12 clickOnProduct']"), callback='parse_articles', follow=True),
    )

    def parse_articles(self, response):
        item = GuapaliaItem()
        articles_urls = response.url
        articles_first_shop = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="retailer-logo autoimage-container"]/img/@title').extract()
        articles_first_shipping = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="shipping"]/p//text()').extract()
        articles_second_shop = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div/img/@title')[1].extract()
        articles_second_shipping = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')[1].extract()
        articles_name = response.xpath('//div[@id="ProductDetail"]/@data-description').extract()
        item['articles_urls'] = articles_urls
        item['articles_first_shop'] = articles_first_shop
        item['articles_first_shipping'] = articles_first_shipping
        item['articles_second_shop'] = articles_second_shop if articles_second_shop else 'N/A'
        item['articles_second_shipping'] = articles_second_shipping
        item['articles_name'] = articles_name
        yield item
Basic output of the crawler, with the right format, when there is more than one shop:
2017-09-21 09:53:11 [scrapy] DEBUG: Crawled (200) <GET https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355> (referer: https://www.guapalia.com/perfumes?page=1)
2017-09-21 09:53:11 [scrapy] DEBUG: Scraped from <200 https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355>
{'articles_first_shipping': [u'ENV\xcdO GRATIS'],
'articles_first_shop': [u'DOUGLAS'],
'articles_name': [u'ZEN edp vaporizador 100 ml'],
'articles_second_shipping': u'ENV\xcdO 3,99\u20ac ',
'articles_second_shop': u'BUYSVIP',
'articles_urls': 'https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355'}
The problem is when a second shop doesn't exist, because the code for the second-shop field raises:
IndexError: list index out of range
SOLUTION Thanks to @Tarun Lalwani
def parse_articles(self, response):
    item = GuapaliaItem()
    articles_urls = response.url
    articles_first_shop = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="retailer-logo autoimage-container"]/img/@title').extract()
    articles_first_shipping = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="shipping"]/p//text()').extract()
    articles_second_shop = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div/img/@title')
    articles_second_shipping = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')
    articles_name = response.xpath('//div[@id="ProductDetail"]/@data-description').extract()

    if len(articles_second_shop) > 1:
        item['articles_second_shop'] = articles_second_shop[1].extract()
    else:
        item['articles_second_shop'] = 'Not Found'

    if len(articles_second_shipping) > 1:
        item['articles_second_shipping'] = articles_second_shipping[1].extract()
    else:
        item['articles_second_shipping'] = 'Not Found'

    item['articles_urls'] = articles_urls
    item['articles_first_shop'] = articles_first_shop
    item['articles_first_shipping'] = articles_first_shipping
    item['articles_name'] = articles_name
    yield item

You need to get the result into a variable first. Then you can make a decision based on its length:
texts = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')
if len(texts) > 1:
    data = texts[1].extract()
elif len(texts) == 1:
    data = texts[0].extract()
else:
    data = "Not found"


Problem when updating a table using celery task: OperationalError

EDIT 2022-10-04 18:40
I've tried using bulk_update and bulk_create, as these methods only query the database once, but I still have the same issue.
I would appreciate any help/explanation on this issue.
'''
Task to edit data correction forms (DCF) online
'''
@shared_task(bind=True)
def DCF_edition(self):
    print(timezone.now())
    DCF_BEFORE_UPDATE = pd.DataFrame.from_records(DataCorrectionForm.objects.all().values())
    if not DCF_BEFORE_UPDATE.empty:
        DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.rename(columns={"patient": "pat"})
        DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.astype({'record_date': str, 'created_date': str})
        DCF_BEFORE_UPDATE['dcf_status'] = DCF_BEFORE_UPDATE.apply(lambda status: 0, axis=1)
    # list of dataframes to concat
    data = []
    # load queries definition
    queries = queries_definitions()
    # print(queries)
    if not queries.empty:
        for index, row in queries.iterrows():
            print('Query ide', row['ide'])
            # print(row['ide'], row['query_type'], row['crf_name'].lower(), row['crf identification date'], row['variable_name'], row['variable_label'], row['query_condition'], row['fields_to_display'])
            try:
                missing_or_inconsistent = missing_or_inconsistent_data(row['ide'], row['query_type'], row['crf_name'].lower(), row['crf identification date'], row['variable_name'], row['variable_label'], row['query_condition'], row['fields_to_display'])  # .iloc[:10] to limit rows
                missing_or_inconsistent.columns.values[2] = 'record_date'  # rename the date column (that has the database name)
                missing_or_inconsistent['dcf_ide'] = str(row['ide']) + '_' + row['variable_name'] + '_' + missing_or_inconsistent[row['crf primary key']].astype(str)
                missing_or_inconsistent['category'] = row['query_type']
                missing_or_inconsistent['crf'] = row['crf_name']
                missing_or_inconsistent['crf_ide'] = missing_or_inconsistent[row['crf primary key']]
                missing_or_inconsistent['field_name'] = row['variable_name']
                missing_or_inconsistent['field_label'] = row['variable_label']
                missing_or_inconsistent['field_value'] = missing_or_inconsistent[row['variable_name']]
                missing_or_inconsistent['message'] = row['query_message']
                missing_or_inconsistent['query_id'] = 'Query ide ' + str(row['ide'])
                missing_or_inconsistent['dcf_status'] = 1
                missing_or_inconsistent['created_date'] = timezone.now()
                missing_or_inconsistent['deactivated'] = False
                missing_or_inconsistent['comments'] = None
                data.append(missing_or_inconsistent[['ide', 'dcf_ide', 'category', 'crf', 'crf_ide', 'pat', 'record_date', 'field_name', 'field_label', 'message', 'field_value', 'dcf_status', 'created_date', 'query_id', 'deactivated', 'comments']])
                dcf = pd.concat(data)
            except Exception as e:
                Log.objects.create(dcf_edition_status=0, dcf_edition_exception=str(e)[:200])
                continue
        DCF_AFTER_UPDATE = pd.concat([DCF_BEFORE_UPDATE, dcf])
        DCF_AFTER_UPDATE['duplicate'] = DCF_AFTER_UPDATE.duplicated(subset=['dcf_ide'], keep='last')
        DCF_AFTER_UPDATE['dcf_status'] = DCF_AFTER_UPDATE.apply(lambda row: 2 if row['duplicate'] else row['dcf_status'], axis=1)
        DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.drop_duplicates(subset=['dcf_ide'], keep='first').drop(columns=['duplicate'])
        DCF_AFTER_UPDATE.rename(columns={'pat': 'patient'}, inplace=True)
        # Cast dates into string format to be able to dump the data
        DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.astype({'record_date': str}) if not DCF_AFTER_UPDATE.empty else DCF_AFTER_UPDATE
        records_to_update = [
            DataCorrectionForm(
                ide=record['ide'],
                dcf_ide=record['dcf_ide'],
                category=record['category'],
                crf=record['crf'],
                crf_ide=record['crf_ide'],
                patient=record['patient'],
                record_date=record['record_date'],
                field_name=record['field_name'],
                field_label=record['field_label'],
                message=record['message'],
                field_value=record['field_value'],
                dcf_status=record['dcf_status'],
                created_date=record['created_date'],
                query_id=record['query_id'],
                deactivated=record['deactivated'],
                comments=record['comments']
            ) for i, record in DCF_AFTER_UPDATE[(DCF_AFTER_UPDATE['dcf_status'] != 1)].iterrows()
        ]
        if records_to_update:
            DataCorrectionForm.objects.bulk_update(records_to_update, ['dcf_status'])
        records_to_create = [
            DataCorrectionForm(
                dcf_ide=record['dcf_ide'],
                category=record['category'],
                crf=record['crf'],
                crf_ide=record['crf_ide'],
                patient=record['patient'],
                record_date=record['record_date'],
                field_name=record['field_name'],
                field_label=record['field_label'],
                message=record['message'],
                field_value=record['field_value'],
                dcf_status=record['dcf_status'],
                created_date=record['created_date'],
                query_id=record['query_id'],
                deactivated=record['deactivated'],
                comments=record['comments']
            ) for i, record in DCF_AFTER_UPDATE[(DCF_AFTER_UPDATE['dcf_status'] == 1)].iterrows()
        ]
        if records_to_create:
            DataCorrectionForm.objects.bulk_create(records_to_create)
EDIT 2022-10-04 13:40
I've tried to "optimize" the code using the update_or_create() method, but it doesn't change anything.
I still get an OperationalError on the line DataCorrectionForm.objects.update_or_create(...).
How can I update my database?
'''
Task to edit data correction forms (DCF) online
'''
@shared_task(bind=True)
def DCF_edition(self):
    DCF_BEFORE_UPDATE = pd.DataFrame.from_records(DataCorrectionForm.objects.all().values())
    if not DCF_BEFORE_UPDATE.empty:
        DCF_BEFORE_UPDATE.drop(columns=['ide'])
        DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.rename(columns={"patient": "pat"})
        DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.astype({'record_date': str, 'created_date': str})
        DCF_BEFORE_UPDATE['dcf_status'] = DCF_BEFORE_UPDATE.apply(lambda status: 0, axis=1)
    # list of dataframes to concat
    data = []
    # load queries definition
    queries = queries_definitions()
    if not queries.empty:
        for index, row in queries.iterrows():
            try:
                missing_or_inconsistent = missing_or_inconsistent_data(row['ide'], row['query_type'], row['crf_name'].lower(), row['crf identification date'], row['variable_name'], row['variable_label'], row['query_condition'], row['fields_to_display'])  # .iloc[:10] to limit rows
                missing_or_inconsistent.columns.values[2] = 'record_date'  # rename the date column (that has the database name)
                missing_or_inconsistent['dcf_ide'] = str(row['ide']) + '_' + row['variable_name'] + '_' + missing_or_inconsistent[row['crf primary key']].astype(str)
                missing_or_inconsistent['category'] = row['query_type']
                missing_or_inconsistent['crf'] = row['crf_name']
                missing_or_inconsistent['crf_ide'] = missing_or_inconsistent[row['crf primary key']]
                missing_or_inconsistent['field_name'] = row['variable_name']
                missing_or_inconsistent['field_label'] = row['variable_label']
                missing_or_inconsistent['field_value'] = missing_or_inconsistent[row['variable_name']]
                missing_or_inconsistent['message'] = row['query_message']
                missing_or_inconsistent['DEF'] = 'Query ide ' + str(row['ide'])
                missing_or_inconsistent['dcf_status'] = 1
                missing_or_inconsistent['created_date'] = timezone.now()
                missing_or_inconsistent['deactivated'] = False
                missing_or_inconsistent['comments'] = None
                data.append(missing_or_inconsistent[['dcf_ide', 'category', 'crf', 'crf_ide', 'pat', 'record_date', 'field_name', 'field_label', 'message', 'field_value', 'dcf_status', 'created_date', 'DEF', 'deactivated', 'comments']])
                dcf = pd.concat(data)
            except Exception as e:
                Log.objects.create(dcf_edition_status=0, dcf_edition_exception=str(e)[:200])
                continue
        DCF_AFTER_UPDATE = pd.concat([DCF_BEFORE_UPDATE, dcf])
        DCF_AFTER_UPDATE['duplicate'] = DCF_AFTER_UPDATE.duplicated(subset=['dcf_ide'], keep='last')
        DCF_AFTER_UPDATE['dcf_status'] = DCF_AFTER_UPDATE.apply(lambda row: 2 if row['duplicate'] else row['dcf_status'], axis=1)
        DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.drop_duplicates(subset=['dcf_ide'], keep='first').drop(columns=['duplicate'])
        DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.drop(['DEF'], axis=1)
        DCF_AFTER_UPDATE.rename(columns={'pat': 'patient'}, inplace=True)
        # Cast dates into string format to be able to dump the data
        DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.astype({'record_date': str}) if not DCF_AFTER_UPDATE.empty else DCF_AFTER_UPDATE
        records = DCF_AFTER_UPDATE.to_dict(orient='records')
        for record in records:
            DataCorrectionForm.objects.update_or_create(
                dcf_ide=record['dcf_ide'],  # filter to search for existing objects => should not be passed in defaults (otherwise IntegrityError)
                defaults={
                    'category': record['category'],
                    'crf': record['crf'],
                    'crf_ide': record['crf_ide'],
                    'patient': record['patient'],
                    'record_date': record['record_date'],
                    'field_name': record['field_name'],
                    'field_label': record['field_label'],
                    'message': record['message'],
                    'field_value': record['field_value'],
                    'dcf_status': record['dcf_status'],
                    'created_date': record['created_date'],
                    # 'DEF': record['DEF'],
                    'deactivated': record['deactivated'],
                    'comments': record['comments']
                }
            )
    Log.objects.create(dcf_edition_status=1)
    return True
EDIT 2022-10-03 17:00
In fact, reading the CAVEATS section:
The development server creates a new thread for each request it
handles, negating the effect of persistent connections. Don’t enable
them during development.
EDIT 2022-10-03 16:00
Django 2.2.5
I have tried setting the DATABASES parameter CONN_MAX_AGE as per the Django documentation, but it doesn't change anything:
Default: 0
The lifetime of a database connection, as an integer of seconds. Use 0
to close database connections at the end of each request — Django’s
historical behavior — and None for unlimited persistent connections.
I use a Celery task and get an error I do not understand.
I loop over a table (that contains query definitions) to edit missing/inconsistent data in a database (using an API) and register discrepancies in another table.
If I run queries one at a time it works, but when I try to loop over the queries I get an error:
OperationalError('server closed the connection unexpectedly\n\tThis probably means the server terminated abnormally\n\tbefore or while processing the request.\n')
def DCF_edition(self):
    DCF_BEFORE_UPDATE = pd.DataFrame.from_records(DataCorrectionForm.objects.all().values())
    DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.astype({'record_date': str, 'created_date': str}) if not DCF_BEFORE_UPDATE.empty else DCF_BEFORE_UPDATE
    data = []
    # load queries definition
    queries = queries_definitions()
    if not queries.empty:
        for index, row in queries.iterrows():
            try:
                missing_or_inconsistent = missing_or_inconsistent_data(row['ide'],row['query_type'],row['crf_name'].lower(),row['crf identification
                data.append(missing_or_inconsistent[['dcf_ide', 'category', 'crf', 'crf_ide', 'pat', 'record_date', 'field_name', 'field_label', 'message', 'field_value', 'dcf_status', 'DEF', 'deactivated']])
                DCF_AFTER_UPDATE = pd.concat(data)
                DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.drop_duplicates(keep='last')
                DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.drop(['DEF'], axis=1)
                DCF_AFTER_UPDATE.rename(columns={'pat': 'patient'}, inplace=True)
            except Exception as e:
                Log.objects.create(dcf_edition_status=0, dcf_edition_exception=str(e)[:200])
                continue
    # Cast dates into string format to be able to dump the data
    DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.astype({'record_date': str}) if not DCF_AFTER_UPDATE.empty else DCF_AFTER_UPDATE
    records = json.loads(json.dumps(list(DCF_AFTER_UPDATE.T.to_dict().values())))
    for record in records:
        if not DCF_BEFORE_UPDATE.empty and record['dcf_ide'] in DCF_BEFORE_UPDATE.values:
            DataCorrectionForm.objects.filter(dcf_ide=record['dcf_ide']).update(dcf_status=2)
        else:
            DataCorrectionForm.objects.get_or_create(**record)
    # resolved dcf => status=0
    if not DCF_BEFORE_UPDATE.empty:
        records = json.loads(json.dumps(list(DCF_BEFORE_UPDATE.T.to_dict().values())))
        for record in records:
            if record['dcf_ide'] not in DCF_AFTER_UPDATE.values:
                DataCorrectionForm.objects.filter(dcf_ide=record['dcf_ide']).update(dcf_status=0)
    Log.objects.create(dcf_edition_status=1)
    return True
The lifetime of a database connection, as an integer of seconds. Use 0 to close database connections at the end of each request — Django’s historical behavior — and None for unlimited persistent connections.
It seems that your task is a long-running task and needs to hold the DB connection for a long period. Did you try setting it to None?
DATABASES = {
    'default': env.db(),
}
# https://docs.djangoproject.com/en/3.1/ref/settings/#conn-max-age
DATABASES['default']['CONN_MAX_AGE'] = None
How long does your task need to finish? It could also be a problem with the database server settings, e.g. tcp_keepalives_idle.
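If the server itself drops the connection while the task is busy in pandas, another thing worth trying (a sketch, not the asker's exact code) is to discard stale connections inside the loop so Django opens a fresh one before the next ORM call:

from celery import shared_task
from django.db import close_old_connections

@shared_task(bind=True)
def DCF_edition(self):
    queries = queries_definitions()
    for index, row in queries.iterrows():
        # Discard connections the server may already have terminated;
        # the next ORM query will transparently open a fresh one.
        close_old_connections()
        ...  # build the dataframes and write the DataCorrectionForm rows as before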

How to use beautifulsoup to save html of a link in a file and do the same with all the links in the html file

I'm trying to write a parser which will take a URL and download its HTML into a .html file. Then it'll go through the HTML file to find all links and save them as well. I want to repeat this multiple times. Can someone please help a little?
This is the code I have written:
import requests
import urllib2
from bs4 import BeautifulSoup

link_set = set()
count = 1
give_url = raw_input("Enter url:\t")

def magic(give_url):
    page = urllib2.urlopen(give_url)
    page_content = page.read()
    with open('page_content.html', 'w') as fid:
        fid.write(page_content)
    response = requests.get(give_url)
    html_data = response.text
    soup = BeautifulSoup(html_data)
    list_items = soup.find_all('a')
    for each_item in list_items:
        html_link = each_item.get('href')
        link_set.add(give_url + str(html_link))

magic(give_url)

for each_item in link_set:
    print each_item
    print "\n"
Although it works fine, when I try to call the magic function in a for loop I get RuntimeError: Set changed size during iteration.
I got it working.
The code for recursive URL parsing using Beautiful Soup:
import requests
import urllib2
from bs4 import BeautifulSoup

link_set = set()
give_url = raw_input("Enter url:\t")

def magic(give_url, link_set, count):
    # print "______________________________________________________"
    # print "Count is: " + str(count)
    # count += 1
    # print "THE URL IT IS SCRAPPING IS:" + give_url
    page = urllib2.urlopen(give_url)
    page_content = page.read()
    with open('page_content.html', 'w') as fid:
        fid.write(page_content)
    response = requests.get(give_url)
    html_data = response.text
    soup = BeautifulSoup(html_data)
    list_items = soup.find_all('a')
    for each_item in list_items:
        html_link = each_item.get('href')
        if html_link is None:
            pass
        else:
            if not (html_link.startswith('http') or html_link.startswith('https')):
                link_set.add(give_url + html_link)
            else:
                link_set.add(html_link)
    # print "Total links in the given url are: " + str(len(link_set))

magic(give_url, link_set, 0)

link_set2 = set()
link_set3 = set()
for element in link_set:
    link_set2.add(element)

count = 1
for element in link_set:
    magic(element, link_set3, count)
    count += 1
    for each_item in link_set3:
        link_set2.add(each_item)
    link_set3.clear()

count = 1
print "Total links scraped are: " + str(len(link_set2))
for element in link_set2:
    count += 1
    print "Element number " + str(count) + " processing"
    print element
    print "\n"
There are many mistakes, so please tell me where I can improve the code.
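One concrete improvement, aimed at the original RuntimeError: Set changed size during iteration, is to keep a queue of pending URLs separate from the set being iterated. This is a rough sketch and assumes magic() is changed to fill the set it is given rather than a global:

pending = set([give_url])   # URLs still to download
visited = set()             # URLs already processed

while pending:
    url = pending.pop()
    if url in visited:
        continue
    visited.add(url)
    new_links = set()
    magic(url, new_links, 0)          # magic() adds links only to new_links
    pending |= (new_links - visited)  # never mutate a set while iterating over it

print "Total links scraped are: " + str(len(visited))

Without a depth limit this will keep crawling until no new links are found, so you may want to stop after a fixed number of pages.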

Getting IndexError: list index out of range

from BeautifulSoup import BeautifulSoup
import mechanize
import re

def price_walmart_match(keyword):
    url = "http://www.walmart.com/search/?query=" + keyword
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(url)
    html = br.response().read()
    result_soup = BeautifulSoup(html)
    found_results = result_soup.findAll('div', attrs={'class': 'js-tile tile-landscape'})
    all_results = []
    for result in found_results:
        title = result.find('a', attrs={'class': 'js-product-title'})
        links = result.find('a', href=True)
        before_decimal = str(result.find('span', attrs={'class': 'price price-display'})).split('<span class="sup">$</span>')[1].split('<span class="visuallyhidden">')[0]
        after_decimal = str(result.find('span', attrs={'class': 'price price-display'})).split('</span><span class="sup">')[1].split('</span>')[0]
        prices = before_decimal + '.' + after_decimal
        inArray = [float(prices), "http://www.walmart.com" + links['href']]
        all_results.append(inArray)
    print(all_results)
Sorry, this is the full code where I get the error.
That's because the mentioned class is not present on the page. Check that the element exists before splitting and indexing, since the page markup is not constant for every query term.
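A more defensive version of the price extraction (a sketch that keeps the question's string-splitting approach) checks every lookup before indexing, so tiles that don't match the expected markup are skipped instead of raising IndexError:

def safe_price(result):
    # Return the price as a float, or None if the tile doesn't carry the expected markup.
    price_span = result.find('span', attrs={'class': 'price price-display'})
    if price_span is None:
        return None
    raw = str(price_span)
    dollars = raw.split('<span class="sup">$</span>')
    cents = raw.split('</span><span class="sup">')
    if len(dollars) < 2 or len(cents) < 2:
        return None  # markup doesn't match; avoid IndexError
    try:
        return float(dollars[1].split('<span class="visuallyhidden">')[0] + '.' + cents[1].split('</span>')[0])
    except ValueError:
        return None

# Inside the loop:
# price = safe_price(result)
# if price is None or links is None:
#     continue
# all_results.append([price, "http://www.walmart.com" + links['href']])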

Python AttributeError: instance has no attribute, when I add a new method

I am trying to call another method within my class, but for some reason I am getting AttributeError: portfinder instance has no attribute 'generatePortNumber'. See my code below:
Whenever I try to call generatePortNumber I get the same error. I have never come across this issue before.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sqlite3 as lite
import sys
import random

class portfinder:
    """docstring for ClassName"""

    def __init__(self):
        self.portsToCheck = ['agentport', 'BatchProcessingAgentPort', 'databaseport', 'indexserviceport', 'monitorport', 'servicefacadeport', 'webdriverport']
        self.dataBasePort = (u'60025',)
        self.portInUse = False
        self.x = 0

        def generatePortNumber(self):
            self.newPortNumber = random.randrange(8000, 9000)
            print self.newPortNumber
            return self.newPortNumber

    def findUsedPortsinDB(self):
        con = lite.connect('D:\play\Opes\db.sqlite3')
        with con:
            cur = con.cursor()
            sqlStatement = "Select " + self.portsToCheck[2] + ' From Jobs_jobs'
            print sqlStatement
            cur.execute(sqlStatement)
            rows = cur.fetchall()
            for row in rows:
                print row
                if row == self.dataBasePort:
                    self.portInUse = "true"
                    self.generatePortNumber()

if __name__ == "__main__":
    m = portfinder()
    m.findUsedPortsinDB()
Found what was wrong: I had an extra indentation on my method, so generatePortNumber was defined inside __init__ instead of on the class.
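For reference, once the method is de-indented to class level it is found as an attribute of the instance; a minimal sketch:

import random

class portfinder:
    def __init__(self):
        self.portInUse = False

    # De-indented: defined on the class, not inside __init__,
    # so self.generatePortNumber() resolves correctly.
    def generatePortNumber(self):
        self.newPortNumber = random.randrange(8000, 9000)
        return self.newPortNumber

m = portfinder()
print m.generatePortNumber()  # no AttributeError now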

Why is my Python code returning an error when I try to fetch YouTube videos for a given keyword?

Whenever I try to run my code, I receive the following error: "comment_content error! 'NoneType' object has no attribute 'href'". I am new to Python and did not write this code myself; it was given to me to use. My understanding is that it was functioning properly before. Could this have to do with changes in the YouTube Data API since it was written?
import pdb
import gdata.youtube
import gdata.youtube.service
import codecs
import time

client = gdata.youtube.service.YouTubeService()
query = gdata.youtube.service.YouTubeVideoQuery()

### the input words are here
query.vq = "4b hair"
#######

# the output files are here
viewFile = codecs.open('views4b_hair.csv', 'w')
commentFile = codecs.open('comments4b_hair.csv', 'w')
##########

query.max_results = 50
query.start_index = 0
query.safesearch = "moderate"
#query.format = 5
query.orderby = "relevance"
#query.author = "hawaiinani"
#pdb.set_trace()

for i in range(19):
    #pdb.set_trace()
    query.start_index = str(int(query.start_index) + 50)
    feed = client.YouTubeQuery(query)
    print len(feed.entry)
    youtubeid = []
    youtubetitle = []
    for entry in feed.entry:
        #youtubetitle.append(entry.title.text)
        youtubeid.append(entry.id.text[38:])
        print entry.id.text[38:], i
        try:
            entry_comment = client.GetYouTubeVideoEntry(video_id=entry.id.text[38:])
            comment_feed = client.GetYouTubeVideoCommentFeed(video_id=entry.id.text[38:])
            viewFile.write(','.join([entry.id.text[38:], entry_comment.published.text,
                str(entry_comment.media.duration.seconds), str(entry_comment.statistics.view_count),
                comment_feed.total_results.text,
                entry_comment.media.title.text.decode('ascii', errors='ignore').encode('ascii', 'ignore')]) + '\n')
            #videop.append("%s, %s,%s, %s, %s, %s" % (search_result["id"]["videoId"],entry.published.text,
            #    entry.media.duration.seconds, entry.statistics.view_count,comment_feed.total_results.text,entry.media.title.text))
            #time.sleep(3)
        except Exception, ex:
            print 'View_content Error', ex
            time.sleep(10)
        try:
            comment_content = client.GetYouTubeVideoCommentFeed(video_id=entry.id.text[38:])
            indexh = 0
            #while comment_content:
            while indexh < 10:
                indexh = indexh + 1
                for comment_entry in comment_content.entry:
                    pubText = comment_entry.published.text
                    #print pubText
                    titleText = comment_entry.content.text.decode('ascii', errors='ignore').encode('ascii', 'ignore')
                    #print titleText
                    #print 'Got title'
                    #pubText, titleText = comment_entry.published.text, comment_entry.title.text
                    commentFile.write(','.join([entry.id.text[38:], pubText, titleText]) + '\n' + '\n')
                    #commentFile.write(u',')
                    #commentFile.write(pubText + u',')
                    #print 'About to write title'
                    #print titleText
                    #print 'Wrote title'
                    #commentlist.append("%s, %s,%s" % (search_result["id"]["videoId"],pubText, titleText))
                comment_content = client.Query(comment_content.GetNextLink().href)
                #time.sleep(3)
            #time.sleep(3)
        except Exception, ex:
            print 'Comment_content Error!', ex
            time.sleep(5)
#pdb.set_trace()
viewFile.close()
commentFile.close()
The error occurs when comment_content.GetNextLink() becomes None. In order to fix it, replace:
while indexh < 10:
with:
while indexh < 10 and comment_content:
also replace:
comment_content=client.Query(comment_content.GetNextLink().href)
with:
next_link = comment_content.GetNextLink()
if next_link:
    comment_content = client.Query(next_link.href)
else:
    comment_content = None
Hope that helps.
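Put together, the paginated comment loop would look roughly like this (a sketch against the old gdata client used in the question; write_row is a hypothetical stand-in for the CSV-writing code):

comment_content = client.GetYouTubeVideoCommentFeed(video_id=video_id)
indexh = 0
# Stop when either the page limit is reached or there are no more pages.
while indexh < 10 and comment_content:
    indexh += 1
    for comment_entry in comment_content.entry:
        write_row(comment_entry)  # hypothetical helper that writes one CSV row
    next_link = comment_content.GetNextLink()
    comment_content = client.Query(next_link.href) if next_link else None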