I have written this code to scrape all courses from a URL. I am trying to get the count of courses using XPath, but it does not give me anything. Where am I going wrong?
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class FinalProject:
    def __init__(self, url="https://www.class-central.com/subject/data-science"):
        self.url = url
        base_url = 'https://www.class-central.com'
        self.error_flag = False
        self.driver = webdriver.Chrome(<path to chromedriver>)
        self.driver.get(self.url)
        sleep(2)
        self.count_course_and_scroll()

    def count_course_and_scroll(self):
        wait = WebDriverWait(self.driver, 30)
        ele = wait.until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Not right now, thanks.')))
        ele.click()
        print "----------------------POP UP CLOSED---------------------"
        total_courses = self.driver.find_element_by_xpath("//span[@id='number-of-courses']")
        print total_courses
        print total_courses.text
        self.driver.close()

fp = FinalProject()
If .text doesn't work, you can try get_attribute:
print total_courses.get_attribute('text')
#or
print total_courses.get_attribute('innerHTML')
ele = wait.until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Not right now, thanks.')))
ele.click()
print "----------------------POP UP CLOSED---------------------"
total_courses = self.driver.find_element_by_xpath("//span[@id='number-of-courses']")
In that piece of code, I am suspicious of two things:
1. Does the popup always appear?
2. Does the text of number-of-courses show up in time?
If you are not sure about 1., I would recommend wrapping it in a try/except.
And for 2., wait until some text appears in that element:
from selenium.common.exceptions import TimeoutException

try:
    ele = wait.until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Not right now, thanks.')))
    ele.click()
except TimeoutException:
    pass  # the popup never showed up
total_courses = wait.until(EC.presence_of_element_located((By.XPATH, "//span[@id='number-of-courses' and text() != '']")))
print total_courses.text
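WebDriverWait also accepts any callable, so if you prefer to avoid the XPath text predicate, the same wait can be written directly against the element's text (a minimal sketch using the wait object from above):

wait.until(lambda d: d.find_element_by_id('number-of-courses').text.strip() != '')
total_courses = self.driver.find_element_by_id('number-of-courses')
print total_courses.text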
After writing the following code, I expected the output to be an updated database with random names, websites, etc., but I get no error message and no updated database.
import os
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'First_project.settings')

import django
django.setup()

import random
from first_app.models import AccessRecord, Webpage, Topic
from faker import Faker

fakegen = Faker()
topics = ['Search', 'Social', 'Marketplace', 'News', 'Games']

def add_topic():
    # get_or_create returns an (object, created) tuple
    t = Topic.objects.get_or_create(top_name=random.choice(topics))[0]
    t.save()
    return t

def populate(N=5):
    for entry in range(N):
        top = add_topic()
        fake_url = fakegen.url()
        fake_date = fakegen.date()
        fake_name = fakegen.company()
        webpg = Webpage.objects.get_or_create(topic=top, url=fake_url, name=fake_name)[0]
        acc_rec = AccessRecord.objects.get_or_create(name=webpg, date=fake_date)[0]

if __name__ == '__main__':
    print("populate")
    populate(20)
    print("populating complete!")
Please, what do I do?
I ran the whole code in my terminal and it populated the database.
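If you want to double-check that the rows actually landed, a quick count from python manage.py shell works (a small sketch using the models from the question):

from first_app.models import Topic, Webpage, AccessRecord
print(Topic.objects.count())
print(Webpage.objects.count())
print(AccessRecord.objects.count())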
So I'm doing a relatively simple project so I can teach myself Python. I've come to a point where I'm stuck: I have a variable named element, which the PyCharm debugger shows as type Tag. That looks correct to me. In element I want to check whether class="schedule_dgrd_time/result", which is not the case here.
I see that within element there is an attrs.
How can I access that value? If I do element.string I get the text value, which in this case would be Sat. (...I could make that work), but I was wondering if I can check the class attribute value first.
I've been searching for this for a couple of days now and just can't get it. I've googled myself to death at this point. Any help or pointers would be greatly appreciated. Thanks for reading.
Update
Here is my code:
import urllib2
import datetime
import re
from bs4 import BeautifulSoup
# today's date
date = datetime.datetime.today().strftime('%-m/%d/%Y')
validDay = "Mon\.|Tue\.|Wed\.|Thu(r)?(s)?\.|Fri\."
website = "http://www.texassports.com/schedule.aspx?path=baseball"
opener = urllib2.build_opener()
##add headers that make it look like I'm a browser
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
page = opener.open(website)
# turn page into html object
soup = BeautifulSoup(page, 'html.parser')
#print soup.prettify()
#get all home games
all_rows = soup.find_all('tr', class_='schedule_home_tr')
# see if any game is today
# entryForToday = [t for t in all_rows if t.findAll('nobr',text=re.compile('.*({}).*'.format(date)))]
# hard coding for testing weekend
entryForToday = [t for t in all_rows if t.findAll('nobr',text=re.compile('3/11/2017'))]
time = "schedule_dgrd_time/result"
for elements in entryForToday:
    for element in elements:
        # this is where I'm stuck.
        # if element.attrs:
        #     print element.attrs['class'][0]
I know a double-nested for loop is not ideal, so if you have a better way I'm glad to hear it. Thanks.
So I was able to figure it out. I had some NavigableString objects, which don't have attrs, so that was throwing an error. element.attrs['class'][0] does work now; I had to check isinstance against Tag and skip anything that isn't one. Anyway, my code is below for anyone who is interested.
import urllib2
import datetime
import re
from bs4 import BeautifulSoup
from bs4 import Tag
# today's date
date = datetime.datetime.today().strftime('%-m/%d/%Y')
validDay = "Mon\.|Tue\.|Wed\.|Thu(r)?(s)?\.|Fri\."
website = "http://www.texassports.com/schedule.aspx?path=baseball"
opener = urllib2.build_opener()
##add headers that make it look like I'm a browser
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
page = opener.open(website)
# turn page into html object
soup = BeautifulSoup(page, 'html.parser')
#print soup.prettify()
#get all home games
all_rows = soup.find_all('tr', class_='schedule_home_tr')
# see if any game is today
# entryForToday = [t for t in all_rows if t.findAll('nobr',text=re.compile('.*({}).*'.format(date)))]
# hard coding for testing weekend
entryForToday = [t for t in all_rows if t.findAll('nobr',text=re.compile('3/14/2017'))]
classForTime = "schedule_dgrd_time/result"
timeOfGame = "none"

if entryForToday:
    entryForToday = [t for t in entryForToday if t.findAll('td',
                                                           class_='schedule_dgrd_game_day_of_week',
                                                           text=re.compile('.*({}).*'.format(validDay)))]
if entryForToday:
    for elements in entryForToday:
        for element in elements:
            if isinstance(element, Tag):
                if element.attrs['class'][0] == classForTime:
                    timeOfGame = element.text
                    # print element.text
                    break
print timeOfGame
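As a side note on the nested loops: since the class name is already known, you can search each row directly with find, which skips the NavigableStrings entirely (a sketch under the same assumptions as the code above, i.e. that the time lives in a td with that class):

for row in entryForToday:
    cell = row.find('td', class_=classForTime)
    if cell:
        timeOfGame = cell.text
        break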
I am new to Python/Selenium and coded the following in Python on Windows to scrape the 5,484 physician demos on the MA Board of Registration website.
My issue: the website is .aspx, so I initially chose Selenium. However, I would really appreciate any insights/recommendations on coding the next steps (see below). More specifically: is it more efficient to continue with Selenium or to incorporate Scrapy? Any insights are greatly appreciated!
1. Select each physician's hyperlink (1-10 per page) by clicking each hyperlinked "PhysicianProfile.aspx?PhysicianID=XXXX" on the "ChooseAPhysician" page.
2. Follow each and extract the demographic info: "phy_name", "lic_issue_date", prim_worksetting, etc.
3. Return to the "ChooseAPhysician" page and click "Next".
4. Repeat for the additional 5,474 physicians.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('http://profiles.ehs.state.ma.us/Profiles/Pages/ChooseAPhysician.aspx?Page=1')

# Locate the elements
zip = driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_txtZip\"]")
select = Select(driver.find_element_by_xpath("//select[@id=\"ctl00_ContentPlaceHolder1_cmbDistance\"]"))
print select.options
print [o.text for o in select.options]
select.select_by_visible_text("15")
prim_care_chekbox = driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_SpecialtyGroupsCheckbox_6\"]")
find_phy_button = driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_btnSearch\"]")

# Input zip code, check the "primary care" box, and click the "find phy" button
zip.send_keys("02109")
prim_care_chekbox.click()
find_phy_button.click()

# wait for the "ChooseAPhysician" page to open
wait = WebDriverWait(driver, 10)
open_phy_bio = driver.find_element_by_xpath("//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr[2]/td[1]/a")
element = wait.until(EC.element_to_be_selected(open_phy_bio))
open_phy_bio.click()

links = driver.find_elements_by_xpath("//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr[2]/td[1]/a")
for link in links:
    link = link.get_attribute("href")
    driver.get(link)

def parse(self, response):
    item = SummaryItem()
    sel = self.selenium
    sel.open(response.url)
    time.sleep(4)
    item["phy_name"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/p[1]").text
    item["lic_status"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[2]/td[2]/a[1]").text
    item["lic_issue_date"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[3]/td[2]").text
    item["prim_worksetting"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[5]/td[2]").text
    item["npi"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[2]/table/tbody/tr[6]/td[2]").text
    item["Med_sch_grad_date"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[3]/tbody/tr[3]/td/table/tbody/tr[2]/td[2]").text
    item["Area_of_speciality"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[4]/tbody/tr[3]/td/table/tbody/tr/td[2]").text
    item["link"] = driver.find_element_by_xpath("//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr[2]/td[1]/a").get_attribute("href")
    return item
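For steps 3-4 (return to the results page, click "Next", repeat), one way to structure the outer loop in plain Selenium is sketched below. The locator for the "Next" link is an assumption and needs to be checked against the site's actual pager markup:

from selenium.common.exceptions import NoSuchElementException

while True:
    anchors = driver.find_elements_by_xpath("//*[@id=\"PhysicianSearchResultGrid\"]//td[1]/a")
    hrefs = [a.get_attribute("href") for a in anchors]
    for href in hrefs:
        driver.get(href)   # open PhysicianProfile.aspx?PhysicianID=XXXX
        # ... extract phy_name, lic_issue_date, prim_worksetting, etc. here ...
        driver.back()      # return to the ChooseAPhysician page
    try:
        driver.find_element_by_link_text("Next").click()
    except NoSuchElementException:
        break              # no "Next" link on the last page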
I have this website, and there are four input boxes: Symbol, Expiry Date, From, To. I have written this code to scrape data from Symbol and Expiry Date:
import requests
import json
from bs4 import BeautifulSoup

r = requests.get("http://www.mcxindia.com/sitepages/BhavCopyCommodityWise.aspx")
soup = BeautifulSoup(r.content)
pop = []
pop_dates = []
count = 0
print soup.prettify()
option_list = soup.findAll("option")
#print option_list
for value in option_list:
    #print value
    if value.find(text=True):
        text = ''.join(value.find(text=True))
        text1 = text.encode('ascii')
        if count < 32:
            pop.append(text1)
        else:  # count >= 32
            pop_dates.append(text1)
        count = count + 1
print pop
print pop_dates
What I want to do is supply the From and To dates from my code, have them filled in on the website's form, and get the output as the website would normally give it. How can I do this? I heard mechanize can do this sort of thing, but how could I use mechanize in this case?
You can try out something like this:
from mechanize import Browser
from bs4 import BeautifulSoup
br = Browser()
br.set_handle_robots( False )
br.addheaders = [('User-agent', 'Firefox')]
br.open("http://www.mcxindia.com/sitepages/BhavCopyCommodityWise.aspx")
br.select_form("form1")
#now enter the dates according to your choice
br.form["mTbFromDate"] = "date-From"
br.form["mTbFromDate"] = "date-To"
response = br.submit()
#now read the response with BeautifulSoup and do whatever you want
soup = BeautifulSoup(response.read())
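If you are not sure of the exact form and control names (form1, mTbFromDate, mTbToDate), mechanize can list them for you after br.open(...):

for form in br.forms():
    print form  # dumps each form along with all of its controls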
I am working on a Django/PostgreSQL project and I need to see every query that Django runs on the database (so I can fine-tune the queries). Is there a way to get those queries?
Update: my development environment is Ubuntu Linux.
Well, you could just set the PostgreSQL server to log every query, or just the slow ones. Look in the postgresql.conf file; it's pretty close to self-documenting.
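For reference, the two relevant postgresql.conf settings look like this (the 200 ms threshold is just an example value); reload the server after changing them:

log_statement = 'all'              # log every statement
log_min_duration_statement = 200   # or: log only statements slower than 200 ms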
Check out this Question (and the two top most answers):
django orm, how to view (or log) the executed query?
You can also have a look at the Django documentation:
https://docs.djangoproject.com/en/dev/faq/models/#how-can-i-see-the-raw-sql-queries-django-is-running
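That FAQ entry boils down to the connection.queries list, which you can inspect from a shell as long as DEBUG = True:

from django.db import connection
print connection.queries   # a list of {'sql': ..., 'time': ...} dicts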
Hope this helps,
Anton
You can decorate a request handler or other function with this and it will print the SQL nicely formatted, with totals at the end.
from functools import wraps
from django.utils import termcolors

format_ok = termcolors.make_style(opts=('bold',), fg='green')
format_warning = termcolors.make_style(opts=('bold',), fg='yellow')
format_error = termcolors.make_style(opts=('bold',), fg='red')

try:
    from pygments import highlight
    from pygments.lexers import SqlLexer
    from pygments.formatters import TerminalFormatter
    pygments_sql_lexer = SqlLexer()
    pygments_terminal_formatter = TerminalFormatter()
    highlight_sql = lambda s: highlight(s, pygments_sql_lexer,
                                        pygments_terminal_formatter)
except ImportError:
    highlight_sql = lambda s: s

def debug_sql(f):
    """
    Turn SQL statement debugging on for a test run.
    """
    @wraps(f)
    def wrapper(*a, **kw):
        from django.conf import settings
        from django.db import connection
        try:
            debug = settings.DEBUG
            settings.DEBUG = True
            connection.queries = []
            return f(*a, **kw)
        finally:
            total_time = 0
            for q in connection.queries:
                fmt = format_ok
                t = float(q['time'])
                total_time += t
                if t > 1:
                    fmt = format_error
                elif t > 0.3:
                    fmt = format_warning
                print '[%s] %s' % (fmt(q['time']), highlight_sql(q['sql']))
            print "total time =", total_time
            print "num queries =", len(connection.queries)
            settings.DEBUG = debug
    return wrapper
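Usage is plain decoration, e.g. on a view (the view name here is made up):

@debug_sql
def my_view(request):
    ...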
Try the Django Debug Toolbar. It'll show you all the SQL executed over the course of the request. When something is executing way too many queries, it becomes really slow, though. For that, I've been meaning to try out this profiler. However, I've rolled this middleware on a couple of projects:
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from django.conf import settings
from django.db import connection

class DatabaseProfilerMiddleware(object):
    def can(self, request):
        return settings.DEBUG and 'dbprof' in request.GET

    def process_response(self, request, response):
        if self.can(request):
            out = StringIO()
            out.write('time sql\n')
            total_time = 0
            for query in reversed(sorted(connection.queries, key=lambda x: x['time'])):
                total_time += float(query['time'])*1000
                out.write('%s %s\n' % (query['time'], query['sql']))
            response.content = '<pre style="white-space:pre-wrap">%d queries executed in %.3f seconds\n%s</pre>' \
                % (len(connection.queries), total_time/1000, out.getvalue())
        return response
Just go to the relevant URL for the request you are interested in and add a dbprof GET parameter; you'll see the profiling output instead of the normal response.
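For completeness, the middleware has to be registered in settings.py first; the dotted path below is hypothetical, so adjust it to wherever you saved the class:

MIDDLEWARE_CLASSES += (
    'myapp.middleware.DatabaseProfilerMiddleware',
)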