EDIT 2022-10-04 18:40
I've tried using bulk_update and bulk_create, as these methods only query the database once, but I still have the same issue.
I would appreciate any help or explanation of this issue.
@shared_task(bind=True)
def DCF_edition(self):
    '''
    Task to edit data correction forms (DCF) online
    '''
    print(timezone.now())
    DCF_BEFORE_UPDATE = pd.DataFrame.from_records(DataCorrectionForm.objects.all().values())
    if not DCF_BEFORE_UPDATE.empty:
        DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.rename(columns={"patient": "pat"})
        DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.astype({'record_date': str, 'created_date': str})
        DCF_BEFORE_UPDATE['dcf_status'] = 0  # reset every existing DCF to "resolved" by default
    # list of dataframes to concat
    data = []
    # load queries definition
    queries = queries_definitions()
    if not queries.empty:
        for index, row in queries.iterrows():
            print('Query ide', row['ide'])
            try:
                missing_or_inconsistent = missing_or_inconsistent_data(row['ide'], row['query_type'], row['crf_name'].lower(), row['crf identification date'], row['variable_name'], row['variable_label'], row['query_condition'], row['fields_to_display'])  # .iloc[:10] to limit rows
                missing_or_inconsistent.columns.values[2] = 'record_date'  # rename the date column (that has the database name)
                missing_or_inconsistent['dcf_ide'] = str(row['ide']) + '_' + row['variable_name'] + '_' + missing_or_inconsistent[row['crf primary key']].astype(str)
                missing_or_inconsistent['category'] = row['query_type']
                missing_or_inconsistent['crf'] = row['crf_name']
                missing_or_inconsistent['crf_ide'] = missing_or_inconsistent[row['crf primary key']]
                missing_or_inconsistent['field_name'] = row['variable_name']
                missing_or_inconsistent['field_label'] = row['variable_label']
                missing_or_inconsistent['field_value'] = missing_or_inconsistent[row['variable_name']]
                missing_or_inconsistent['message'] = row['query_message']
                missing_or_inconsistent['query_id'] = 'Query ide ' + str(row['ide'])
                missing_or_inconsistent['dcf_status'] = 1
                missing_or_inconsistent['created_date'] = timezone.now()
                missing_or_inconsistent['deactivated'] = False
                missing_or_inconsistent['comments'] = None
                data.append(missing_or_inconsistent[['ide', 'dcf_ide', 'category', 'crf', 'crf_ide', 'pat', 'record_date', 'field_name', 'field_label', 'message', 'field_value', 'dcf_status', 'created_date', 'query_id', 'deactivated', 'comments']])
                dcf = pd.concat(data)
            except Exception as e:
                Log.objects.create(dcf_edition_status=0, dcf_edition_exception=str(e)[:200])
                continue
    DCF_AFTER_UPDATE = pd.concat([DCF_BEFORE_UPDATE, dcf])
    DCF_AFTER_UPDATE['duplicate'] = DCF_AFTER_UPDATE.duplicated(subset=['dcf_ide'], keep='last')
    DCF_AFTER_UPDATE['dcf_status'] = DCF_AFTER_UPDATE.apply(lambda row: 2 if row['duplicate'] else row['dcf_status'], axis=1)
    DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.drop_duplicates(subset=['dcf_ide'], keep='first').drop(columns=['duplicate'])
    DCF_AFTER_UPDATE.rename(columns={'pat': 'patient'}, inplace=True)
    # Cast date into string format to be able to dump data
    DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.astype({'record_date': str}) if not DCF_AFTER_UPDATE.empty else DCF_AFTER_UPDATE
    records_to_update = [
        DataCorrectionForm(
            ide=record['ide'],
            dcf_ide=record['dcf_ide'],
            category=record['category'],
            crf=record['crf'],
            crf_ide=record['crf_ide'],
            patient=record['patient'],
            record_date=record['record_date'],
            field_name=record['field_name'],
            field_label=record['field_label'],
            message=record['message'],
            field_value=record['field_value'],
            dcf_status=record['dcf_status'],
            created_date=record['created_date'],
            query_id=record['query_id'],
            deactivated=record['deactivated'],
            comments=record['comments']
        ) for i, record in DCF_AFTER_UPDATE[DCF_AFTER_UPDATE['dcf_status'] != 1].iterrows()
    ]
    if records_to_update:
        DataCorrectionForm.objects.bulk_update(records_to_update, ['dcf_status'])
    records_to_create = [
        DataCorrectionForm(
            dcf_ide=record['dcf_ide'],
            category=record['category'],
            crf=record['crf'],
            crf_ide=record['crf_ide'],
            patient=record['patient'],
            record_date=record['record_date'],
            field_name=record['field_name'],
            field_label=record['field_label'],
            message=record['message'],
            field_value=record['field_value'],
            dcf_status=record['dcf_status'],
            created_date=record['created_date'],
            query_id=record['query_id'],
            deactivated=record['deactivated'],
            comments=record['comments']
        ) for i, record in DCF_AFTER_UPDATE[DCF_AFTER_UPDATE['dcf_status'] == 1].iterrows()
    ]
    if records_to_create:
        DataCorrectionForm.objects.bulk_create(records_to_create)
EDIT 2022-10-04 13:40
I've tried to optimize the code using the update_or_create() method, but it doesn't change anything.
I still get an OperationalError on the line DataCorrectionForm.objects.update_or_create(...).
How can I update my database?
@shared_task(bind=True)
def DCF_edition(self):
    '''
    Task to edit data correction forms (DCF) online
    '''
    DCF_BEFORE_UPDATE = pd.DataFrame.from_records(DataCorrectionForm.objects.all().values())
    if not DCF_BEFORE_UPDATE.empty:
        DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.drop(columns=['ide'])  # drop() returns a copy, so assign it back
        DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.rename(columns={"patient": "pat"})
        DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.astype({'record_date': str, 'created_date': str})
        DCF_BEFORE_UPDATE['dcf_status'] = 0
    # list of dataframes to concat
    data = []
    # load queries definition
    queries = queries_definitions()
    if not queries.empty:
        for index, row in queries.iterrows():
            try:
                missing_or_inconsistent = missing_or_inconsistent_data(row['ide'], row['query_type'], row['crf_name'].lower(), row['crf identification date'], row['variable_name'], row['variable_label'], row['query_condition'], row['fields_to_display'])  # .iloc[:10] to limit rows
                missing_or_inconsistent.columns.values[2] = 'record_date'  # rename the date column (that has the database name)
                missing_or_inconsistent['dcf_ide'] = str(row['ide']) + '_' + row['variable_name'] + '_' + missing_or_inconsistent[row['crf primary key']].astype(str)
                missing_or_inconsistent['category'] = row['query_type']
                missing_or_inconsistent['crf'] = row['crf_name']
                missing_or_inconsistent['crf_ide'] = missing_or_inconsistent[row['crf primary key']]
                missing_or_inconsistent['field_name'] = row['variable_name']
                missing_or_inconsistent['field_label'] = row['variable_label']
                missing_or_inconsistent['field_value'] = missing_or_inconsistent[row['variable_name']]
                missing_or_inconsistent['message'] = row['query_message']
                missing_or_inconsistent['DEF'] = 'Query ide ' + str(row['ide'])
                missing_or_inconsistent['dcf_status'] = 1
                missing_or_inconsistent['created_date'] = timezone.now()
                missing_or_inconsistent['deactivated'] = False
                missing_or_inconsistent['comments'] = None
                data.append(missing_or_inconsistent[['dcf_ide', 'category', 'crf', 'crf_ide', 'pat', 'record_date', 'field_name', 'field_label', 'message', 'field_value', 'dcf_status', 'created_date', 'DEF', 'deactivated', 'comments']])
                dcf = pd.concat(data)
            except Exception as e:
                Log.objects.create(dcf_edition_status=0, dcf_edition_exception=str(e)[:200])
                continue
    DCF_AFTER_UPDATE = pd.concat([DCF_BEFORE_UPDATE, dcf])
    DCF_AFTER_UPDATE['duplicate'] = DCF_AFTER_UPDATE.duplicated(subset=['dcf_ide'], keep='last')
    DCF_AFTER_UPDATE['dcf_status'] = DCF_AFTER_UPDATE.apply(lambda row: 2 if row['duplicate'] else row['dcf_status'], axis=1)
    DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.drop_duplicates(subset=['dcf_ide'], keep='first').drop(columns=['duplicate'])
    DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.drop(['DEF'], axis=1)
    DCF_AFTER_UPDATE.rename(columns={'pat': 'patient'}, inplace=True)
    # Cast date into string format to be able to dump data
    DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.astype({'record_date': str}) if not DCF_AFTER_UPDATE.empty else DCF_AFTER_UPDATE
    records = DCF_AFTER_UPDATE.to_dict(orient='records')
    for record in records:
        DataCorrectionForm.objects.update_or_create(
            dcf_ide=record['dcf_ide'],  # lookup used to search for existing objects => must not be passed in defaults (IntegrityError otherwise)
            defaults={
                'category': record['category'],
                'crf': record['crf'],
                'crf_ide': record['crf_ide'],
                'patient': record['patient'],
                'record_date': record['record_date'],
                'field_name': record['field_name'],
                'field_label': record['field_label'],
                'message': record['message'],
                'field_value': record['field_value'],
                'dcf_status': record['dcf_status'],
                'created_date': record['created_date'],
                # 'DEF': record['DEF'],
                'deactivated': record['deactivated'],
                'comments': record['comments']
            }
        )
    Log.objects.create(dcf_edition_status=1)
    return True
EDIT 2022-10-03 17:00
In fact, reading the CAVEATS section of the documentation:
The development server creates a new thread for each request it
handles, negating the effect of persistent connections. Don’t enable
them during development.
EDIT 2022-10-03 16:00
Django 2.2.5
I have tried to set the DATABASES parameter CONN_MAX_AGE as per the Django documentation, but it doesn't change anything:
Default: 0
The lifetime of a database connection, as an integer of seconds. Use 0
to close database connections at the end of each request — Django’s
historical behavior — and None for unlimited persistent connections.
I use a Celery task and get an error I do not understand.
I loop over a table (that contains query definitions) to edit missing/inconsistent data in a database (using an API) and record discrepancies in another table.
If I run queries one at a time it works, but when I try to loop over them I get an error:
OperationalError('server closed the connection unexpectedly\n\tThis probably means the server terminated abnormally\n\tbefore or while processing the request.\n')
def DCF_edition(self):
    DCF_BEFORE_UPDATE = pd.DataFrame.from_records(DataCorrectionForm.objects.all().values())
    DCF_BEFORE_UPDATE = DCF_BEFORE_UPDATE.astype({'record_date': str, 'created_date': str}) if not DCF_BEFORE_UPDATE.empty else DCF_BEFORE_UPDATE
    data = []
    # load queries definition
    queries = queries_definitions()
    if not queries.empty:
        for index, row in queries.iterrows():
            try:
                missing_or_inconsistent = missing_or_inconsistent_data(row['ide'], row['query_type'], row['crf_name'].lower(), row['crf identification date'], row['variable_name'], row['variable_label'], row['query_condition'], row['fields_to_display'])
                data.append(missing_or_inconsistent[['dcf_ide', 'category', 'crf', 'crf_ide', 'pat', 'record_date', 'field_name', 'field_label', 'message', 'field_value', 'dcf_status', 'DEF', 'deactivated']])
                DCF_AFTER_UPDATE = pd.concat(data)
                DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.drop_duplicates(keep='last')
                DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.drop(['DEF'], axis=1)
                DCF_AFTER_UPDATE.rename(columns={'pat': 'patient'}, inplace=True)
            except Exception as e:
                Log.objects.create(dcf_edition_status=0, dcf_edition_exception=str(e)[:200])
                continue
    # Cast date into string format to be able to dump data
    DCF_AFTER_UPDATE = DCF_AFTER_UPDATE.astype({'record_date': str}) if not DCF_AFTER_UPDATE.empty else DCF_AFTER_UPDATE
    records = json.loads(json.dumps(list(DCF_AFTER_UPDATE.T.to_dict().values())))
    for record in records:
        if not DCF_BEFORE_UPDATE.empty and record['dcf_ide'] in DCF_BEFORE_UPDATE.values:
            DataCorrectionForm.objects.filter(dcf_ide=record['dcf_ide']).update(dcf_status=2)
        else:
            DataCorrectionForm.objects.get_or_create(**record)
    # resolved dcf => status=0
    if not DCF_BEFORE_UPDATE.empty:
        records = json.loads(json.dumps(list(DCF_BEFORE_UPDATE.T.to_dict().values())))
        for record in records:
            if record['dcf_ide'] not in DCF_AFTER_UPDATE.values:
                DataCorrectionForm.objects.filter(dcf_ide=record['dcf_ide']).update(dcf_status=0)
    Log.objects.create(dcf_edition_status=1)
    return True
The lifetime of a database connection, as an integer of seconds. Use 0 to close database connections at the end of each request — Django’s historical behavior — and None for unlimited persistent connections.
It seems that your task is a long-running task that needs to hold the DB connection for a long period. Did you try setting it to None?
DATABASES = {
    'default': env.db(),
}
# https://docs.djangoproject.com/en/3.1/ref/settings/#conn-max-age
DATABASES['default']['CONN_MAX_AGE'] = None
How long does your task take to finish? It could also be a problem with a database server setting, e.g. tcp_keepalives_idle.
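If raising CONN_MAX_AGE is not enough, a pattern sometimes used in long-running Celery tasks is to discard stale connections before touching the ORM, so that Django transparently reconnects on the next query. A minimal sketch, assuming the task from the question:

from celery import shared_task
from django.db import close_old_connections

@shared_task(bind=True)
def DCF_edition(self):
    # Close connections the server may have dropped while the worker sat idle;
    # Django will open a fresh connection on the next ORM query.
    close_old_connections()
    ...  # rest of the task as in the question

Calling close_old_connections() again just before the bulk_update/bulk_create at the end would also cover the case where the pandas processing itself outlasts the server's idle timeout.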
I'm trying to write a parser which will take a URL and download its HTML into a .html file. Then it will go through the HTML file to find all links and save them as well. I want to repeat this multiple times. Can someone please help a little?
This is the code I have written:
import requests
import urllib2
from bs4 import BeautifulSoup

link_set = set()
count = 1
give_url = raw_input("Enter url:\t")

def magic(give_url):
    page = urllib2.urlopen(give_url)
    page_content = page.read()
    with open('page_content.html', 'w') as fid:
        fid.write(page_content)
    response = requests.get(give_url)
    html_data = response.text
    soup = BeautifulSoup(html_data)
    list_items = soup.find_all('a')
    for each_item in list_items:
        html_link = each_item.get('href')
        link_set.add(give_url + str(html_link))

magic(give_url)

for each_item in link_set:
    print each_item
    print "\n"
Although it works fine on its own, when I try to call the magic function in a for loop, I get RuntimeError: Set changed size during iteration.
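For reference, that RuntimeError is raised whenever a set is mutated while it is being iterated, which is what happens when magic() adds to link_set inside a loop over link_set. A minimal sketch, independent of the scraper:

links = {'http://a', 'http://b'}

# for link in links:
#     links.add(link + '/next')   # RuntimeError: Set changed size during iteration

for link in list(links):          # iterate over a snapshot instead
    links.add(link + '/next')     # safe: additions go to the original set

Passing a separate output set to magic(), as in the working version below, avoids the problem the same way.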
I got it working.
The code for recursive URL parsing using Beautiful Soup:
import requests
import urllib2
from bs4 import BeautifulSoup

link_set = set()
give_url = raw_input("Enter url:\t")

def magic(give_url, link_set, count):
    # print "______________________________________________________"
    # print "Count is: " + str(count)
    # count += 1
    # print "THE URL IT IS SCRAPING IS: " + give_url
    page = urllib2.urlopen(give_url)
    page_content = page.read()
    with open('page_content.html', 'w') as fid:
        fid.write(page_content)
    response = requests.get(give_url)
    html_data = response.text
    soup = BeautifulSoup(html_data)
    list_items = soup.find_all('a')
    for each_item in list_items:
        html_link = each_item.get('href')
        if html_link is None:
            pass
        else:
            if not (html_link.startswith('http') or html_link.startswith('https')):
                link_set.add(give_url + html_link)
            else:
                link_set.add(html_link)
    # print "Total links in the given url are: " + str(len(link_set))

magic(give_url, link_set, 0)

link_set2 = set()
link_set3 = set()
for element in link_set:
    link_set2.add(element)

count = 1
for element in link_set:
    magic(element, link_set3, count)
    count += 1

for each_item in link_set3:
    link_set2.add(each_item)
link_set3.clear()

count = 1
print "Total links scraped are: " + str(len(link_set2))
for element in link_set2:
    count += 1
    print "Element number " + str(count) + " processing"
    print element
    print "\n"
There are surely many mistakes, so please tell me where I can improve the code.
from BeautifulSoup import BeautifulSoup
import mechanize
import re

def price_walmart_match(keyword):
    url = "http://www.walmart.com/search/?query=" + keyword
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(url)
    html = br.response().read()
    result_soup = BeautifulSoup(html)
    found_results = result_soup.findAll('div', attrs={'class': 'js-tile tile-landscape'})
    all_results = []
    for result in found_results:
        title = result.find('a', attrs={'class': 'js-product-title'})
        links = result.find('a', href=True)
        before_decimal = str(result.find('span', attrs={'class': 'price price-display'})).split('<span class="sup">$</span>')[1].split('<span class="visuallyhidden">')[0]
        after_decimal = str(result.find('span', attrs={'class': 'price price-display'})).split('</span><span class="sup">')[1].split('</span>')[0]
        prices = before_decimal + '.' + after_decimal
        inArray = [float(prices), "http://www.walmart.com" + links['href']]
        all_results.append(inArray)
    print(all_results)  # was misspelled all_result, which raises a NameError
Sorry, this is the full code where I get the error.
That's because the mentioned class is not there in the page that is actually downloaded.
Inspect the HTML you get back and target a query term or attribute that stays constant.
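A defensive variant of the loop can also skip tiles whose markup does not match, instead of failing on a missing class. A sketch under that assumption, reusing the question's identifiers:

for result in found_results:
    title = result.find('a', attrs={'class': 'js-product-title'})
    links = result.find('a', href=True)
    price = result.find('span', attrs={'class': 'price price-display'})
    if title is None or links is None or price is None:
        continue  # this tile doesn't carry the expected markup; skip it
    # ... parse the before/after decimal parts as before ...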
I am trying to call another method within my class, but for some reason I am getting AttributeError: portfinder instance has no attribute 'generatePortNumber'. See my code below:
Whenever I try to call generatePortNumber I get the same error. I have never come across this issue.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sqlite3 as lite
import sys
import random

class portfinder:
    """docstring for ClassName"""

    def __init__(self):
        self.portsToCheck = ['agentport', 'BatchProcessingAgentPort', 'databaseport', 'indexserviceport', 'monitorport', 'servicefacadeport', 'webdriverport']
        self.dataBasePort = (u'60025',)
        self.portInUse = False
        self.x = 0

    def generatePortNumber(self):
        self.newPortNumber = random.randrange(8000, 9000)
        print self.newPortNumber
        return self.newPortNumber

    def findUsedPortsinDB(self):
        con = lite.connect('D:\play\Opes\db.sqlite3')
        with con:
            cur = con.cursor()
            sqlStatement = "Select " + self.portsToCheck[2] + ' From Jobs_jobs'
            print sqlStatement
            cur.execute(sqlStatement)
            rows = cur.fetchall()
            for row in rows:
                print row
                if row == self.dataBasePort:
                    self.portInUse = True  # keep this a boolean; the string "true" would be truthy but inconsistent
                    self.generatePortNumber()

if __name__ == "__main__":
    m = portfinder()
    m.findUsedPortsinDB()
Found what was wrong: I had extra indentation in my method.
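For anyone hitting the same AttributeError: one extra level of indentation turns a method into a function local to the previous method, so it never becomes an attribute of the class. A minimal sketch with hypothetical names:

class Broken:
    def first(self):
        pass
        def second(self):  # over-indented: local to first(), not a method
            pass

class Fixed:
    def first(self):
        pass

    def second(self):      # defined at class level
        pass

Fixed().second()   # works
Broken().second()  # AttributeError: Broken instance has no attribute 'second'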
Whenever I try to run my code, I receive the following error: "comment_content error! 'nonetype' object has no attribute 'href'". I am new to Python and did not write this code myself; it was given to me to use. My understanding is that it was functioning properly before. Could this have to do with changes in the YouTube Data API since it was written?
import pdb
import gdata.youtube
import gdata.youtube.service
import codecs
import time

client = gdata.youtube.service.YouTubeService()
query = gdata.youtube.service.YouTubeVideoQuery()

### the input words are here
query.vq = "4b hair"
#######

# the output files are here
viewFile = codecs.open('views4b_hair.csv', 'w')
commentFile = codecs.open('comments4b_hair.csv', 'w')
##########

query.max_results = 50
query.start_index = 0
query.safesearch = "moderate"
# query.format = 5
query.orderby = "relevance"
# query.author = "hawaiinani"
# pdb.set_trace()

for i in range(19):
    # pdb.set_trace()
    query.start_index = str(int(query.start_index) + 50)
    feed = client.YouTubeQuery(query)
    print len(feed.entry)
    youtubeid = []
    youtubetitle = []
    for entry in feed.entry:
        # youtubetitle.append(entry.title.text)
        youtubeid.append(entry.id.text[38:])
        print entry.id.text[38:], i
        try:
            entry_comment = client.GetYouTubeVideoEntry(video_id=entry.id.text[38:])
            comment_feed = client.GetYouTubeVideoCommentFeed(video_id=entry.id.text[38:])
            viewFile.write(','.join([entry.id.text[38:], entry_comment.published.text,
                str(entry_comment.media.duration.seconds), str(entry_comment.statistics.view_count),
                comment_feed.total_results.text,
                entry_comment.media.title.text.decode('ascii', errors='ignore').encode('ascii', 'ignore')]) + '\n')
            # videop.append("%s, %s,%s, %s, %s, %s" % (search_result["id"]["videoId"], entry.published.text,
            #     entry.media.duration.seconds, entry.statistics.view_count, comment_feed.total_results.text, entry.media.title.text))
            # time.sleep(3)
        except Exception, ex:
            print 'View_content Error', ex
            time.sleep(10)
        try:
            comment_content = client.GetYouTubeVideoCommentFeed(video_id=entry.id.text[38:])
            indexh = 0
            # while comment_content:
            while indexh < 10:
                indexh = indexh + 1
                for comment_entry in comment_content.entry:
                    pubText = comment_entry.published.text
                    # print pubText
                    titleText = comment_entry.content.text.decode('ascii', errors='ignore').encode('ascii', 'ignore')
                    # print titleText
                    commentFile.write(','.join([entry.id.text[38:], pubText, titleText]) + '\n' + '\n')
                comment_content = client.Query(comment_content.GetNextLink().href)
                # time.sleep(3)
        except Exception, ex:
            print 'Comment_content Error!', ex
            time.sleep(5)
            # pdb.set_trace()

viewFile.close()
commentFile.close()
The error occurs when comment_content.GetNextLink() becomes None. To fix it, replace:
while indexh < 10:
with:
while indexh < 10 and comment_content:
and also replace:
comment_content = client.Query(comment_content.GetNextLink().href)
with:
next_link = comment_content.GetNextLink()
if next_link:
    comment_content = client.Query(next_link.href)
else:
    comment_content = None
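Put together, the paging loop from the question would look roughly like this (a sketch using the same identifiers):

indexh = 0
while indexh < 10 and comment_content:
    indexh = indexh + 1
    for comment_entry in comment_content.entry:
        pubText = comment_entry.published.text
        titleText = comment_entry.content.text.decode('ascii', errors='ignore').encode('ascii', 'ignore')
        commentFile.write(','.join([entry.id.text[38:], pubText, titleText]) + '\n' + '\n')
    next_link = comment_content.GetNextLink()
    # Stop cleanly when the comment feed has no next page
    comment_content = client.Query(next_link.href) if next_link else None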
Hope that helps.