Scraping aspx with Python mechanize - getting search results - python-2.7

I've been trying to scrape Congressional financial disclosure reports using mechanize; the form submits successfully, but I can't locate any of the search results. My script is below:
from mechanize import Browser

br = Browser()
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open('http://clerk.house.gov/public_disc/financial-search.aspx')
br.select_form(name='aspnetForm')
br.set_all_readonly(False)
br['filing_year'] = ['2008']
response = br.submit(name='search_btn')
html = response.read()
I'm new to scraping, and would appreciate any corrections/advice on this. Thanks!
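One thing worth checking first: ASP.NET pages keep hidden state fields (__VIEWSTATE, __EVENTVALIDATION) and often fire their search buttons through a JavaScript __doPostBack call, which mechanize will not execute, so the POST that reaches the server can be missing the postback fields even though the submit "succeeds". A hedged mechanize sketch of that workaround -- whether this particular page uses __doPostBack, and what the real control names are, has to be confirmed by printing the form:
from mechanize import Browser

br = Browser()
br.open('http://clerk.house.gov/public_disc/financial-search.aspx')
br.select_form(name='aspnetForm')
br.set_all_readonly(False)   # allow writing the hidden ASP.NET state fields
print br.form                # lists the real control names and hidden fields
br['filing_year'] = ['2008']
# If the search button normally fires __doPostBack via JavaScript, the postback
# target has to be filled in by hand (the target id here is a guess):
br['__EVENTTARGET'] = 'search_btn'
br['__EVENTARGUMENT'] = ''
response = br.submit()
html = response.read()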

Here is an alternative solution that drives a real browser with the help of the Selenium tool.
from selenium import webdriver
from selenium.webdriver.support.select import Select
# initialize webdriver instance and visit url
url = "http://clerk.house.gov/public_disc/financial-search.aspx"
browser = webdriver.Firefox()
browser.get(url)
# find select tag and select 2008
select = Select(browser.find_element_by_id('ctl00_cphMain_txbFiling_year'))
select.select_by_value('2008')
# find "search" button and click it
button = browser.find_element_by_id('ctl00_cphMain_btnSearch')
button.click()
# display results
table = browser.find_element_by_id('search_results')
for row in table.find_elements_by_tag_name('tr')[1:-1]:
    print [cell.text for cell in row.find_elements_by_tag_name('td')]
# close the browser
browser.close()
Prints:
[u'ABERCROMBIE, HON.NEIL', u'HI01', u'2008', u'FD Amendment']
[u'ABERCROMBIE, HON.NEIL', u'HI01', u'2008', u'FD Original']
[u'ACKERMAN, HON.GARY L.', u'NY05', u'2008', u'FD Amendment']
[u'ACKERMAN, HON.GARY L.', u'NY05', u'2008', u'FD Amendment']
...
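The click posts the form back, so on a slow response the results table may not exist the instant it is read. An explicit wait is a safer way to grab it than finding it immediately after the click; a small sketch, reusing the same search_results id:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait up to 10 seconds for the results table to appear before reading it
table = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.ID, 'search_results'))
)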

Related

Python panel (bokeh) server connection display empty html page without error message

I have a Django project where one view function starts a Bokeh server from a Python script:
from subprocess import Popen
Popen(["panel", "serve", "/opt/bitnami/projects/blog/EnviAI/scripts/visz_pn_ssm_1.py", "--show"])
With another view, I try to connect to the server and display the dashboard from visz_pn_ssm_1.py.
from bokeh.client import pull_session
from bokeh.embed import server_session
from django.shortcuts import render

def redirect_bokeh_server(request):
    session = pull_session(url="http://localhost:5006/visz_pn_ssm_1")
    script = server_session(model=None, session_id=session.id, url="http://localhost:5006/visz_pn_ssm_1")
    return render(request, 'dashboard_ssm.html', {'script': script})
In my dashboard_ssm.html:
<body>
{{script | safe}}
</body>
From the console I get:
Starting Bokeh server version 2.4.2 (running on Tornado 6.1)
2022-04-03 08:26:03,800 User authentication hooks NOT provided (default user enabled)
2022-04-03 08:26:03,804 Bokeh app running at: http://localhost:5006/visz_pn_ssm_1
2022-04-03 08:26:03,804 Starting Bokeh server with process id: 26929
2022-04-03 08:26:06,550 WebSocket connection opened
test
2022-04-03 08:26:07,762 ServerConnection created
But the page is empty?
The content of my panel script visz_pn_ssm_1.py:
import pandas as pd
import geopandas as gpd
import panel as pn
import hvplot.pandas
import pickle
pn.extension()
pn.config.js_files = {'deck': 'https://unpkg.com/deck.gl@~5.2.0/deckgl.min.js'}
pn.config.css_files = ['https://api.tiles.mapbox.com/mapbox-gl-js/v0.44.1/mapbox-gl.css']
with open('/opt/bitnami/projects/data/filepath_ssm_user.pickl', 'rb') as temp:
    res = pickle.load(temp)
# ried soil samples 30m 17-19
gdf = pd.read_csv(f'/opt/bitnami/projects/data/tables/{res[0]}')[['date', 'ssm']].dropna().reset_index(drop=True)
gdf['date'] = gdf['date'].astype('datetime64[ns]')
#Options for Widgets
years = gdf.date.dt.year.unique()
# Widgets
year_slider = pn.widgets.IntSlider(name = 'Year', start=int(years.min()), end=int(years.max()), value=int(years[0]))
@pn.depends(year_slider)
def plot_data(year_slider):
    data_select = gdf[gdf['date'].dt.year == year_slider]
    # Scatter plot
    scatter = data_select.hvplot.scatter(
        x='date',
        y='ssm',
        title=f'Oberflächennahe Bodenfeuchte'
    )
    return scatter
# Non Parameter Attributes
title = 'Oberflächennahe Bodenfeuchte berechnet mithilfe von Convolutional Neuronal Networks aus Sentinel 1 & 2 & ERA 5 Satelliten Daten'
header_box = pn.WidgetBox(title, year_slider, align="center")
# Plot Box
dashboard = pn.Row(header_box, plot_data)
# To start with panel serve script
dashboard.servable()
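One common cause of a blank embed with this setup: the script returned by server_session is executed by the visitor's browser, so the url passed to it (and the websocket behind it) must be reachable from that browser; http://localhost:5006 only works when the browser runs on the same machine as the Bokeh server, and the embedding page's host must be allowed as a websocket origin. A hedged sketch of the usual remote arrangement -- your.server.example and your.django.host below are placeholders, not values from this project:
# placeholder -- the hostname the visitor's browser can actually reach
APP_URL = "http://your.server.example:5006/visz_pn_ssm_1"

# allow the page that embeds the app (the Django site) as a websocket origin
Popen(["panel", "serve", "/opt/bitnami/projects/blog/EnviAI/scripts/visz_pn_ssm_1.py",
       "--allow-websocket-origin=your.django.host"])

def redirect_bokeh_server(request):
    # pull_session runs inside Django; server_session's url ends up in the browser
    session = pull_session(url=APP_URL)
    script = server_session(model=None, session_id=session.id, url=APP_URL)
    return render(request, 'dashboard_ssm.html', {'script': script})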

Python 2.7 Selenium unable to extract data

I am trying to extract data, but it returns an error:
NoSuchElementException: Message: u'Unable to locate element: {"method":"xpath","selector":"//*[@id=\'searchpopbox\']"}' ; Stacktrace:
    at FirefoxDriver.findElementInternal_ (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/driver_component.js:8444)
    at FirefoxDriver.findElement (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/driver_component.js:8453)
    at DelayedCommand.executeInternal_/h (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10456)
    at DelayedCommand.executeInternal_ (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10461)
    at DelayedCommand.execute/< (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10401)
My code is as below and I am trying to get the list from the link
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
browser = webdriver.Firefox(profile)
url = 'https://www.bursamarketplace.com/index.php?tpl=th001_search_ajax'
browser.get(url)
time.sleep(15)
a = browser.find_element_by_xpath("//*[@id='searchpopbox']")
print a
I am seeking your help to get the right xpath for the url.
This gets all the listings from that table.
import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.bursamarketplace.com/index.php?tpl=th001_search_ajax")
time.sleep(15)
a = driver.find_element_by_xpath("//*[@id='searchpopbox']")
print(a.text)
Or, without webdriver_manager, pass the driver path directly (the same applies to Firefox with its own driver):
driver = webdriver.Chrome(executable_path='absolutepathofchromedriver.exe')
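If you need the entries one by one rather than a single text blob, you can iterate the elements inside the container. A hedged sketch -- it assumes the results are rendered as anchor tags under searchpopbox, which is worth confirming in the page source:
# hypothetical refinement: pull individual entries out of the container
links = driver.find_elements_by_xpath("//*[@id='searchpopbox']//a")
for link in links:
    print(link.text, link.get_attribute("href"))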

selenium with chromedriver on centOS7 for spidering

I am trying to make a crawler for my server.
I found chilkat's CKSpider library, but it does not support JS rendering.
So I am trying to use the Selenium webdriver with Chrome.
I am running CentOS 7 with Python 2.7.
I want to spider all pages under one base domain.
Example
BaseDomain = example.com
then find all page something like
example.com/event/.../../...
example.com/games/.../...
example.com/../.../..
...
My crawler code:
from selenium import webdriver
import time
options = webdriver.ChromeOptions()
options.binary_location = "/usr/bin/google-chrome"
chrome_driver_binary = "/root/chromedriver"
options.add_argument("--headless")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
options.add_argument("lang=ko-KR,ko,en-US,en")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_driver_binary, chrome_options=options)
host = 'https://example.com'
def Crawler(Url):
    driver.get(Url)
    driver.implicitly_wait(3)
    # Do something with the page
    time.sleep(3)
    # Crawl next
Crawler(host)
driver.quit()
How can I crawl the next pages? Is there another way to do this in Selenium, or do I need another library for it?
Thanks for any tips or advice.
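One common way to "crawl next" is a breadth-first walk over a queue of URLs, keeping only links whose host matches the base domain. A minimal sketch along those lines, reusing the driver set up above -- the link extraction assumes ordinary <a href> tags, and example.com is still a placeholder:
from urlparse import urlparse  # Python 2.7; on Python 3 use urllib.parse
from collections import deque

def crawl_domain(start_url, max_pages=100):
    base_host = urlparse(start_url).netloc
    seen = set([start_url])
    queue = deque([start_url])
    visited = 0
    while queue and visited < max_pages:
        url = queue.popleft()
        visited += 1
        driver.get(url)
        driver.implicitly_wait(3)
        # Do something with the rendered page here
        for a in driver.find_elements_by_tag_name('a'):
            href = a.get_attribute('href')
            if href and urlparse(href).netloc == base_host and href not in seen:
                seen.add(href)
                queue.append(href)

crawl_domain('https://example.com')
driver.quit()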

python 2.7 BeautifulSoup find the table containing a particular string

After searching a BeautifulSoup document for a string, how do I get the table which contains that string? I have a solution which works on one table that I am familiar with:
My code is as follows:
import mechanize
from bs4 import BeautifulSoup
sitemap_url = "https://www.rbi.org.in/scripts/sitemap.aspx"
br = mechanize.Browser()
br.addheaders = [('User-agent',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'),
('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]
response = br.open(sitemap_url)
text = response.read()
br.close()
soup = BeautifulSoup(text, 'lxml')
# Find the table containing the financial intermediaries.
# First I find "Financial Intermediaries" in soup.
fin_str = soup.find(text="Financial Intermediaries")
# Next I step out through the parents
# until it turns out that I have found the table.
fin_tbl = fin_str.parent.parent.parent.parent
The problem with this is that I have to check the results each time I step out of the document. How can I add .parent until I see the table?
Append the following code onto the program:
from bs4 import element

# The first tag around the string is the parent.
fn_in = fin_str.parent
# Step out through the parents.
def step_out(i):
    if isinstance(i, element.NavigableString):
        pass
    return i.parent
# Continue until 'table' is in the name of the tag.
while 'table' not in fn_in.name:
    fn_in = step_out(fn_in)
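BeautifulSoup also ships a helper that does this climb for you: find_parent walks up the ancestors until it reaches a tag with the given name, so the loop above can often be replaced by a single call:
# returns the nearest enclosing <table>, or None if the string is not inside one
fin_tbl = fin_str.find_parent('table')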

Mechanize - Python

I am using mechanize in Python to log into an HTTPS page. The login is successful, but the output is just a SAML response. I am unable to get the actual page source that I see when opening the page in my browser.
import mechanize
import getpass
import cookielib
br=mechanize.Browser()
br.set_handle_robots(False)
b=[]
cj = cookielib.CookieJar()
br.set_cookiejar(cj)
pw=getpass.getpass("Enter Your Password Here: ")
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
('Accept-Encoding', 'gzip,deflate,sdch'),
('Accept-Language', 'en-US,en;q=0.8'),
('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3')]
br.open("https:***single sign on login url***")
br.select_form(name='login-form')
br.form['userid']='id'
br.form['password']=pw
response=br.submit()
print response.read()
a=br.open("https:****url****")
for i in range(1000):
    b.append(a.readline())
print b
I get the SAML output, which is encrypted, but I don't know how to reply with that SAML POST to get to the actual page.
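What usually has to happen next, as a hedged sketch rather than a confirmed fix: the page carrying the SAML response is normally just a hidden form that a real browser auto-submits with JavaScript to the service provider's assertion-consumer URL. mechanize does not run that JavaScript, so the form has to be selected and submitted manually before the protected page becomes reachable -- this assumes the response page contains a single form with the SAMLResponse field already filled in:
# after the login submit above, the returned page should hold the hidden SAML form
br.select_form(nr=0)        # typically the only form on the page
acs_response = br.submit()  # POST the SAMLResponse back to the service provider
print acs_response.read()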