webscraping an .ASPX site with Selenium and/or Scrapy - python-2.7

I am new to Python/Selenium and wrote the following in Python on Windows to scrape the 5,484 physician demographic profiles on the MA Board of Registration website.
My issue: the website is .aspx, so I initially chose Selenium. However, I would really appreciate any insights/recommendations on coding the next steps (see below), and more specifically on whether it is more efficient to continue with Selenium or to incorporate Scrapy. Any insights are greatly appreciated! The steps are:
Select each physician's hyperlink (1-10 per page) by clicking each hyperlinked "PhysicianProfile.aspx?PhysicianID=XXXX" on the "ChooseAPhysician" page.
Follow each link and extract the demographic info.
Demographic info: "phy_name", "lic_issue_date", "prim_worksetting", etc.
Return to the "ChooseAPhysician" page and click "Next".
Repeat for the remaining 5,474 physicians.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.get('http://profiles.ehs.state.ma.us/Profiles/Pages/ChooseAPhysician.aspx?Page=1')
#Locate the elements
zip = driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_txtZip\"]")
select = Select(driver.find_element_by_xpath("//select[@id=\"ctl00_ContentPlaceHolder1_cmbDistance\"]"))
print select.options
print [o.text for o in select.options]
select.select_by_visible_text("15")
prim_care_chekbox = driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_SpecialtyGroupsCheckbox_6\"]")
find_phy_button = driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_btnSearch\"]")
#Input zipcode, check "primary care box", and click "find phy" button
zip.send_keys("02109")
prim_care_chekbox.click()
find_phy_button.click()
#wait for "ChooseAPhysician" page to open
wait = WebDriverWait(driver, 10)
open_phy_bio = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr[2]/td[1]/a")))
open_phy_bio.click()
# Collect the profile hrefs first; navigating away invalidates the located elements
links = driver.find_elements_by_xpath("//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr/td[1]/a")
hrefs = [link.get_attribute("href") for link in links]
for href in hrefs:
    driver.get(href)
def parse(self, response):
    # (requires `import time` and the SummaryItem definition from the items module)
    item = SummaryItem()
    sel = self.selenium
    sel.open(response.url)
    time.sleep(4)
    # Note: .extract() is a Scrapy selector method; with Selenium WebElements use .text / .get_attribute()
    item["phy_name"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/p[1]").text
    item["lic_status"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[2]/td[2]/a[1]").text
    item["lic_issue_date"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[3]/td[2]").text
    item["prim_worksetting"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[5]/td[2]").text
    item["npi"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[2]/table/tbody/tr[6]/td[2]").text
    item["Med_sch_grad_date"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[3]/tbody/tr[3]/td/table/tbody/tr[2]/td[2]").text
    item["Area_of_speciality"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/table[4]/tbody/tr[3]/td/table/tbody/tr/td[2]").text
    item["link"] = driver.find_element_by_xpath("//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr[2]/td[1]/a").get_attribute("href")
    return item
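For what it's worth, here is a minimal Selenium-only sketch of the loop described in the steps above (collect the profile links on each results page, visit each profile, come back, then click "Next"). The profile and demographic XPaths are copied from the question; the "Next" pager locator and the use of driver.back() are assumptions, not verified against the site:
from selenium.common.exceptions import NoSuchElementException

results_xpath = "//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr/td[1]/a"
while True:
    # Re-read the result links on every pass; navigating away invalidates previously located elements
    wait.until(EC.presence_of_all_elements_located((By.XPATH, results_xpath)))
    hrefs = [a.get_attribute("href") for a in driver.find_elements_by_xpath(results_xpath)]
    for href in hrefs:
        driver.get(href)
        demo = {}
        demo["phy_name"] = driver.find_element_by_xpath("//*[@id=\"content\"]/center/p[1]").text
        # ...extract the remaining demographic fields the same way, then store `demo` somewhere...
        driver.back()
    try:
        # Assumption: the results grid pager exposes a link literally labelled "Next"
        driver.find_element_by_link_text("Next").click()
    except NoSuchElementException:
        break  # no further result pages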

Related

Able to input log-in details, but Submit button does not work while automated

Whilst the script below seems to work, it doesn't go beyond entering the username and password and clicking submit. Any idea why?
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
url = 'https://www.waitrose.com/'
email = 'email@gmail.com'
password = 'password!'
s = Service("C:/Users/mn/Downloads/chromedriver/chromedriver.exe")
driver = webdriver.Chrome(service=s)
wait = WebDriverWait(driver, 20)
driver.maximize_window() ## Maximize the window and let code stall
time.sleep(0) # for 0s to properly maximise the window.
driver.get(url)
Accept_Cookies = wait.until(EC.visibility_of_element_located((By.XPATH, "//button[@data-test='accept-all']")))
Accept_Cookies.click()
Sign_in_button = driver.find_element(By.XPATH, value="//div/a[@data-test='loginAnchor']")
Sign_in_button.click()
input_email = wait.until(EC.visibility_of_element_located((By.XPATH, "//div/input[@type='email']")))
input_password = wait.until(EC.visibility_of_element_located((By.XPATH, "//div/input[@type='password']")))
click_submit = wait.until(EC.visibility_of_element_located((By.XPATH, "//button")))
input_email.send_keys(email)
input_password.send_keys(password)
click_submit.click()
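One thing worth double-checking here (a suggestion, not part of the original post): the locator "//button" simply matches the first button on the page, which is not necessarily the sign-in submit button, and visibility_of_element_located does not guarantee the element is ready to receive a click. A minimal sketch of the alternative, assuming the hypothetical XPath below actually points at the submit button:
# Assumption: "//form//button[@type='submit']" is illustrative only and has not been
# verified against the live Waitrose sign-in form.
click_submit = wait.until(EC.element_to_be_clickable((By.XPATH, "//form//button[@type='submit']")))
input_email.send_keys(email)
input_password.send_keys(password)
click_submit.click()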

Find element text using xpath in selenium-python not working

The HTML looks like this. I have written this code to scrape all courses from a URL. For this I am trying to get the count of courses using XPath, but it does not give me anything. Where am I going wrong?
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class FinalProject:
    def __init__(self, url="https://www.class-central.com/subject/data-science"):
        self.url = url
        base_url = 'https://www.class-central.com'
        self.error_flag = False
        self.driver = webdriver.Chrome(<path to chromedriver>)
        self.driver.get(self.url)
        sleep(2)
        self.count_course_and_scroll()

    def count_course_and_scroll(self):
        wait = WebDriverWait(self.driver, 30)
        ele = wait.until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Not right now, thanks.')))
        ele.click()
        print "----------------------POP UP CLOSED---------------------"
        total_courses = self.driver.find_element_by_xpath("//span[@id='number-of-courses']")
        print total_courses
        print total_courses.text
        self.driver.close()

fp = FinalProject()
If text doesn't work, you can try get_attribute:
print total_courses.get_attribute('text')
#or
print total_courses.get_attribute('innerHTML')
ele = wait.until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Not right now, thanks.')));
ele.click()
print "----------------------POP UP CLOSED---------------------"
total_courses = self.driver.find_element_by_xpath("//span[@id='number-of-courses']")
In that piece of code, I am suspicious of 2 things:
Does the popup always appear?
Does the text of number-of-courses show up in time?
If you are not sure about 1., I would recommend putting it in a try/catch.
And about 2., wait until some text appears on that element:
try:
    ele = wait.until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Not right now, thanks.')))
    ele.click()
finally:
    total_courses = wait.until(EC.presence_of_element_located((By.XPATH, "//span[@id='number-of-courses' and text() != '']")))
    print total_courses.text
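Note that finally on its own will not swallow the TimeoutException if the popup never shows up, so the wait would still abort the run. If the popup is genuinely optional, a try/except variant (a sketch using the same locators as above) may be closer to the intent:
from selenium.common.exceptions import TimeoutException

try:
    ele = wait.until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Not right now, thanks.')))
    ele.click()
except TimeoutException:
    pass  # the popup never appeared; carry on with the scrape
total_courses = wait.until(EC.presence_of_element_located((By.XPATH, "//span[@id='number-of-courses' and text() != '']")))
print total_courses.text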

Spider won't run after updating Scrapy

As seems to frequently happen here, I am quite new to Python 2.7 and Scrapy. Our project has us scraping website data, following some links, scraping some more, and so on. This was all working fine. Then I updated Scrapy.
Now when I launch my spider, I get the following message:
This wasn't coming up anywhere previously (none of my prior error messages looked anything like this). I am now running Scrapy 1.1.0 on Python 2.7, and none of the spiders that had previously worked on this project are working.
I can provide some example code if need be, but my (admittedly limited) knowledge of Python suggests to me that it's not even getting to my script before bombing out.
EDIT:
OK, so this code is supposed to start at the first authors page for Deakin University academics on The Conversation, and go through and scrape how many articles they have written and comments they have made.
import scrapy
from ltuconver.items import ConversationItem
from ltuconver.items import WebsitesItem
from ltuconver.items import PersonItem
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import bs4
class ConversationSpider(scrapy.Spider):
    name = "urls"
    allowed_domains = ["theconversation.com"]
    start_urls = [
        'http://theconversation.com/institutions/deakin-university/authors']

    # URL grabber
    def parse(self, response):
        requests = []
        people = Selector(response).xpath('//*[@id="experts"]/ul[*]/li[*]')
        for person in people:
            item = WebsitesItem()
            item['url'] = 'http://theconversation.com/' + str(person.xpath('a/@href').extract())[4:-2]
            self.logger.info('parseURL = %s', item['url'])
            requests.append(Request(url=item['url'], callback=self.parseMainPage))
        soup = bs4.BeautifulSoup(response.body, 'html.parser')
        try:
            nexturl = 'https://theconversation.com' + soup.find('span', class_='next').find('a')['href']
            requests.append(Request(url=nexturl))
        except:
            pass
        return requests

    # go to the URLs and grab the info
    def parseMainPage(self, response):
        person = Selector(response)
        item = PersonItem()
        item['name'] = str(person.xpath('//*[@id="outer"]/header/div/div[2]/h1/text()').extract())[3:-2]
        item['occupation'] = str(person.xpath('//*[@id="outer"]/div/div[1]/div[1]/text()').extract())[11:-15]
        item['art_count'] = int(str(person.xpath('//*[@id="outer"]/header/div/div[3]/a[1]/h2/text()').extract())[3:-3])
        item['com_count'] = int(str(person.xpath('//*[@id="outer"]/header/div/div[3]/a[2]/h2/text()').extract())[3:-3])
And in my Settings, I have:
BOT_NAME = 'ltuconver'
SPIDER_MODULES = ['ltuconver.spiders']
NEWSPIDER_MODULE = 'ltuconver.spiders'
DEPTH_LIMIT=1
Apparently my six.py file was corrupt (or something like that). After swapping it out with the same file from a colleague, it started working again 8-\
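For anyone hitting something similar, a quick hedged check (not part of the original answer) is to print which six module Python is actually picking up, since a stale or broken copy on the path is a plausible cause of post-upgrade crashes:
import six
# Shows the version and the file that was imported; if the path points at an old or
# edited six.py, replacing or reinstalling that file is the likely fix.
print six.__version__
print six.__file__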

Scrapy can not crawl link - comment of vnexpress website

I'm a newbie to Scrapy & Python. I try to get the comments from the following URL, but the result is always empty: http://vnexpress.net/tin-tuc/oto-xe-may/toyota-camry-2016-dinh-loi-tui-khi-khong-bung-3386676.html
Here is my code:
from scrapy.spiders import Spider
from scrapy.selector import Selector
from tutorial.items import TutorialItem
import logging
class TutorialSpider(Spider):
    name = "vnexpress"
    allowed_domains = ["vnexpress.net"]
    start_urls = [
        "http://vnexpress.net/tin-tuc/oto-xe-may/toyota-camry-2016-dinh-loi-tui-khi-khong-bung-3386676.html"
    ]

    def parse(self, response):
        sel = Selector(response)
        commentList = sel.xpath('//div[@class="comment_item"]')
        items = []
        id = 0
        logging.log(logging.INFO, "TOTAL COMMENT : " + str(len(commentList)))
        for comment in commentList:
            item = TutorialItem()
            id = id + 1
            item['id'] = id
            item['mainId'] = 0
            item['user'] = comment.xpath('//span[@class="left txt_666 txt_11"]/b').extract()
            item['time'] = 'N/A'
            item['content'] = comment.xpath('//p[@class="full_content"]').extract()
            item['like'] = comment.xpath('//span[@class="txt_666 txt_11 right block_like_web"]/a[@class="txt_666 txt_11 total_like"]').extract()
            items.append(item)
        return items
Thanks for reading
Looks like the comments are loaded into the page with some JavaScript code.
Scrapy does not execute JavaScript on a page, it only downloads HTML pages. Try opening the page with JavaScript disabled in your browser, and you should see the page as Scrapy sees it.
You have a handful of options:
reverse-engineer how the comments are loaded into the page, using your browser's developer tools panel, in the "Network" tab (it could be some XHR call loading HTML or JSON data);
use a (headless) browser to render the page (Selenium, CasperJS, Splash...);
e.g. you may want to try this page with Splash (one of the JavaScript rendering options for web scraping). This is the HTML you get back from Splash (it contains the comments): http://pastebin.com/njgCsM9w
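As an illustration of the Splash route, a minimal sketch with the scrapy-splash plugin is below. It assumes a Splash instance is running and scrapy-splash is enabled in the project settings, the spider name is made up, and it reuses the comment selectors from the question, which may of course have changed on the site:
import scrapy
from scrapy_splash import SplashRequest

class CommentSpider(scrapy.Spider):
    name = "vnexpress_comments"

    def start_requests(self):
        url = "http://vnexpress.net/tin-tuc/oto-xe-may/toyota-camry-2016-dinh-loi-tui-khi-khong-bung-3386676.html"
        # Render the page in Splash and give the comment JavaScript a couple of seconds to run
        yield SplashRequest(url, self.parse, args={'wait': 2})

    def parse(self, response):
        # With the rendered HTML the comment blocks are present in the response
        for comment in response.xpath('//div[@class="comment_item"]'):
            yield {
                'user': comment.xpath('.//span[@class="left txt_666 txt_11"]/b/text()').extract_first(),
                'content': comment.xpath('.//p[@class="full_content"]//text()').extract(),
            }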

Bypassing body onload="window.print()" while scraping the page

I'm trying to scrape the page which loads after the print popup is gone (cancelled).
Testing the XPath to the product name and ID (as shown in the screenshot) with every possible combination has so far returned empty results, and I suspect that the print popup JS is the reason.
Any tips about how to bypass the print popup would be appreciated.
Thanks :)
Here is the screenshot from the DOM:
Here's an example spider for getting the text you've highlighted on the screenshot:
from scrapy.item import Item, Field
from scrapy.selector import Selector
from scrapy.spider import BaseSpider
class MarketItem(Item):
    name = Field()

class MarketSpider(BaseSpider):
    name = "market"
    allowed_domains = ["mymarket.ge"]
    start_urls = ["http://www.mymarket.ge/classified_details_print.php?product_id=5827165"]

    def parse(self, response):
        contacts = Selector(response)
        item = MarketItem()
        item['name'] = contacts.xpath('//td[@class="product_info_details_text"]/b/text()').extract()[0].strip()
        return item
this gets an item:
{'name': u'Nokia asha 515 dual sim'}
Hope that helps.