Scraping aspx with Python mechanize - getting search results - python-2.7

I've been trying to scrape Congressional financial disclosure reports using mechanize; the form submits successfully, but I can't locate any of the search results. My script is below:
from mechanize import Browser

br = Browser()
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open('http://clerk.house.gov/public_disc/financial-search.aspx')
br.select_form(name='aspnetForm')
br.set_all_readonly(False)
br['filing_year'] = ['2008']
response = br.submit(name='search_btn')
html = response.read()
I'm new to scraping, and would appreciate any corrections/advice on this. Thanks!
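One thing worth checking first: ASP.NET pages keep hidden state fields (__VIEWSTATE, __EVENTVALIDATION) and often fire their search buttons through a JavaScript __doPostBack call, which mechanize will not execute, so the POST that reaches the server can be missing the postback fields even though the submit "succeeds". A hedged mechanize sketch of that workaround -- whether this particular page uses __doPostBack, and what the real control names are, has to be confirmed by printing the form:
from mechanize import Browser

br = Browser()
br.open('http://clerk.house.gov/public_disc/financial-search.aspx')
br.select_form(name='aspnetForm')
br.set_all_readonly(False)   # allow writing the hidden ASP.NET state fields
print br.form                # lists the real control names and hidden fields
br['filing_year'] = ['2008']
# If the search button normally fires __doPostBack via JavaScript, the postback
# target has to be filled in by hand (the target id here is a guess):
br['__EVENTTARGET'] = 'search_btn'
br['__EVENTARGUMENT'] = ''
response = br.submit()
html = response.read()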

Here is an alternative solution that drives a real browser with the help of the Selenium tool.
from selenium import webdriver
from selenium.webdriver.support.select import Select
# initialize webdriver instance and visit url
url = "http://clerk.house.gov/public_disc/financial-search.aspx"
browser = webdriver.Firefox()
browser.get(url)
# find select tag and select 2008
select = Select(browser.find_element_by_id('ctl00_cphMain_txbFiling_year'))
select.select_by_value('2008')
# find "search" button and click it
button = browser.find_element_by_id('ctl00_cphMain_btnSearch')
button.click()
# display results
table = browser.find_element_by_id('search_results')
for row in table.find_elements_by_tag_name('tr')[1:-1]:
    print [cell.text for cell in row.find_elements_by_tag_name('td')]
# close the browser
browser.close()
Prints:
[u'ABERCROMBIE, HON.NEIL', u'HI01', u'2008', u'FD Amendment']
[u'ABERCROMBIE, HON.NEIL', u'HI01', u'2008', u'FD Original']
[u'ACKERMAN, HON.GARY L.', u'NY05', u'2008', u'FD Amendment']
[u'ACKERMAN, HON.GARY L.', u'NY05', u'2008', u'FD Amendment']
...
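The click posts the form back, so on a slow response the results table may not exist the instant it is read. An explicit wait is a safer way to grab it than finding it immediately after the click; a small sketch, reusing the same search_results id:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait up to 10 seconds for the results table to appear before reading it
table = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.ID, 'search_results'))
)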

Related

Python panel (bokeh) server connection display empty html page without error message

I have a Django project where one view function starts a Bokeh server from a Python script:
from subprocess import Popen
Popen(["panel", "serve", "/opt/bitnami/projects/blog/EnviAI/scripts/visz_pn_ssm_1.py", "--show"])
With another view, I try to connect to the server and display the dashboard from visz_pn_ssm_1.py.
from bokeh.client import pull_session
from bokeh.embed import server_session
from django.shortcuts import render

def redirect_bokeh_server(request):
    session = pull_session(url="http://localhost:5006/visz_pn_ssm_1")
    script = server_session(model=None, session_id=session.id, url="http://localhost:5006/visz_pn_ssm_1")
    return render(request, 'dashboard_ssm.html', {'script': script})
In my dashboard_ssm.html:
<body>
{{script | safe}}
</body>
From the console I get:
Starting Bokeh server version 2.4.2 (running on Tornado 6.1)
2022-04-03 08:26:03,800 User authentication hooks NOT provided (default user enabled)
2022-04-03 08:26:03,804 Bokeh app running at: http://localhost:5006/visz_pn_ssm_1
2022-04-03 08:26:03,804 Starting Bokeh server with process id: 26929
2022-04-03 08:26:06,550 WebSocket connection opened
test
2022-04-03 08:26:07,762 ServerConnection created
But the page is empty?
The content of my panel script visz_pn_ssm_1.py:
import pandas as pd
import geopandas as gpd
import panel as pn
import hvplot.pandas
import pickle
pn.extension()
pn.config.js_files = {'deck': 'https://unpkg.com/deck.gl@~5.2.0/deckgl.min.js'}
pn.config.css_files = ['https://api.tiles.mapbox.com/mapbox-gl-js/v0.44.1/mapbox-gl.css']
with open('/opt/bitnami/projects/data/filepath_ssm_user.pickl', 'rb') as temp:
    res = pickle.load(temp)
# ried soil samples 30m 17-19
gdf = pd.read_csv(f'/opt/bitnami/projects/data/tables/{res[0]}')[['date', 'ssm']].dropna().reset_index(drop=True)
gdf['date'] = gdf['date'].astype('datetime64[ns]')
#Options for Widgets
years = gdf.date.dt.year.unique()
# Widgets
year_slider = pn.widgets.IntSlider(name = 'Year', start=int(years.min()), end=int(years.max()), value=int(years[0]))
@pn.depends(year_slider)
def plot_data(year_slider):
    data_select = gdf[gdf['date'].dt.year == year_slider]
    # Scatter plot
    scatter = data_select.hvplot.scatter(
        x='date',
        y='ssm',
        title=f'Oberflächennahe Bodenfeuchte'
    )
    return scatter
# Non Parameter Attributes
title = 'Oberflächennahe Bodenfeuchte berechnet mithilfe von Convolutional Neuronal Networks aus Sentinel 1 & 2 & ERA 5 Satelliten Daten'
header_box = pn.WidgetBox(title, year_slider, align="center")
# Plot Box
dashboard = pn.Row(header_box, plot_data)
# To start with panel serve script
dashboard.servable()
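One common cause of a blank embed with this setup: the script returned by server_session is executed by the visitor's browser, so the url passed to it (and the websocket behind it) must be reachable from that browser; http://localhost:5006 only works when the browser runs on the same machine as the Bokeh server, and the embedding page's host must be allowed as a websocket origin. A hedged sketch of the usual remote arrangement -- your.server.example and your.django.host below are placeholders, not values from this project:
# placeholder -- the hostname the visitor's browser can actually reach
APP_URL = "http://your.server.example:5006/visz_pn_ssm_1"

# allow the page that embeds the app (the Django site) as a websocket origin
Popen(["panel", "serve", "/opt/bitnami/projects/blog/EnviAI/scripts/visz_pn_ssm_1.py",
       "--allow-websocket-origin=your.django.host"])

def redirect_bokeh_server(request):
    # pull_session runs inside Django; server_session's url ends up in the browser
    session = pull_session(url=APP_URL)
    script = server_session(model=None, session_id=session.id, url=APP_URL)
    return render(request, 'dashboard_ssm.html', {'script': script})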

Python 2.7 Selenium unable to extract data

I am trying to extract data, but it returns an error:
NoSuchElementException: Message: u'Unable to locate element: {"method":"xpath","selector":"//*[@id=\'searchpopbox\']"}' ; Stacktrace:
    at FirefoxDriver.findElementInternal_ (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/driver_component.js:8444)
    at FirefoxDriver.findElement (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/driver_component.js:8453)
    at DelayedCommand.executeInternal_/h (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10456)
    at DelayedCommand.executeInternal_ (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10461)
    at DelayedCommand.execute/< (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10401)
My code is as below and I am trying to get the list from the link
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
browser = webdriver.Firefox(profile)
url = 'https://www.bursamarketplace.com/index.php?tpl=th001_search_ajax'
browser.get(url)
time.sleep(15)
a = browser.find_element_by_xpath("//*[@id='searchpopbox']")
print a
I am seeking your help to get the right xpath for the url.
This gets all the listings from that table.
import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.bursamarketplace.com/index.php?tpl=th001_search_ajax")
time.sleep(15)
a = driver.find_element_by_xpath("//*[@id='searchpopbox']")
print(a.text)
Or, without webdriver_manager, pass the driver path directly (the same applies to Firefox with its own driver):
driver = webdriver.Chrome(executable_path='absolutepathofchromedriver.exe')
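If you need the entries one by one rather than a single text blob, you can iterate the elements inside the container. A hedged sketch -- it assumes the results are rendered as anchor tags under searchpopbox, which is worth confirming in the page source:
# hypothetical refinement: pull individual entries out of the container
links = driver.find_elements_by_xpath("//*[@id='searchpopbox']//a")
for link in links:
    print(link.text, link.get_attribute("href"))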

selenium with chromedriver on centOS7 for spidering

I am trying to make a crawler for my server.
I found chilkat's CKSpider library, but it does not support JS rendering.
So I am trying to use the Selenium webdriver with Chrome.
I am running CentOS 7 with Python 2.7.
I want to spider all pages under one base domain.
Example
BaseDomain = example.com
then find all page something like
example.com/event/.../../...
example.com/games/.../...
example.com/../.../..
...
My crawler code:
from selenium import webdriver
import time
options = webdriver.ChromeOptions()
options.binary_location = "/usr/bin/google-chrome"
chrome_driver_binary = "/root/chromedriver"
options.add_argument("--headless")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
options.add_argument("lang=ko-KR,ko,en-US,en")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_driver_binary, chrome_options=options)
host = 'https://example.com'
def Crawler(Url):
    driver.get(Url)
    driver.implicitly_wait(3)
    # Do something with the page
    time.sleep(3)
    # Crawl next
Crawler(host)
driver.quit()
How can I crawl the next pages? Is there another way to do this in Selenium, or do I need another library for it?
Thanks for any tips or advice.
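One common way to "crawl next" is a breadth-first walk over a queue of URLs, keeping only links whose host matches the base domain. A minimal sketch along those lines, reusing the driver set up above -- the link extraction assumes ordinary <a href> tags, and example.com is still a placeholder:
from urlparse import urlparse  # Python 2.7; on Python 3 use urllib.parse
from collections import deque

def crawl_domain(start_url, max_pages=100):
    base_host = urlparse(start_url).netloc
    seen = set([start_url])
    queue = deque([start_url])
    visited = 0
    while queue and visited < max_pages:
        url = queue.popleft()
        visited += 1
        driver.get(url)
        driver.implicitly_wait(3)
        # Do something with the rendered page here
        for a in driver.find_elements_by_tag_name('a'):
            href = a.get_attribute('href')
            if href and urlparse(href).netloc == base_host and href not in seen:
                seen.add(href)
                queue.append(href)

crawl_domain('https://example.com')
driver.quit()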

python 2.7 BeautifulSoup find the table containing a particular string

After searching a BeautifulSoup document for a string, how do I get the table which contains that string? I have a solution which works on one table that I am familiar with:
My code is as follows:
import mechanize
from bs4 import BeautifulSoup
sitemap_url = "https://www.rbi.org.in/scripts/sitemap.aspx"
br = mechanize.Browser()
br.addheaders = [('User-agent',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'),
('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]
response = br.open(sitemap_url)
text = response.read()
br.close()
soup = BeautifulSoup(text, 'lxml')
# Find the table containing the financial intermediaries.
# First I find "Financial Intermediaries" in soup.
fin_str = soup.find(text="Financial Intermediaries")
# Next I step out through the parents
# until it turns out that I have found the table.
fin_tbl = fin_str.parent.parent.parent.parent
The problem with this is that I have to check the results each time I step out of the document. How can I add .parent until I see the table?
Append the following code onto the program:
from bs4 import element

# The first tag around the string is the parent.
fn_in = fin_str.parent
# Step out through the parents.
def step_out(i):
    if isinstance(i, element.NavigableString):
        pass
    return i.parent
# Continue until 'table' is in the name of the tag.
while 'table' not in fn_in.name:
    fn_in = step_out(fn_in)
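BeautifulSoup also ships a helper that does this climb for you: find_parent walks up the ancestors until it reaches a tag with the given name, so the loop above can often be replaced by a single call:
# returns the nearest enclosing <table>, or None if the string is not inside one
fin_tbl = fin_str.find_parent('table')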

Mechanize - Python

I am using mechanize in Python to log into an HTTPS page. The login is successful, but the output is just a SAML response. I am unable to get the actual page source that I see when opening the page in my browser.
import mechanize
import getpass
import cookielib
br=mechanize.Browser()
br.set_handle_robots(False)
b=[]
cj = cookielib.CookieJar()
br.set_cookiejar(cj)
pw=getpass.getpass("Enter Your Password Here: ")
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
('Accept-Encoding', 'gzip,deflate,sdch'),
('Accept-Language', 'en-US,en;q=0.8'),
('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3')]
br.open("https:***single sign on login url***")
br.select_form(name='login-form')
br.form['userid']='id'
br.form['password']=pw
response=br.submit()
print response.read()
a=br.open("https:****url****")
for i in range(1000):
    b.append(a.readline())
print b
I get the SAML output, which is encrypted, but I don't know how to reply with that SAML POST to get to the actual page.
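What usually has to happen next, as a hedged sketch rather than a confirmed fix: the page carrying the SAML response is normally just a hidden form that a real browser auto-submits with JavaScript to the service provider's assertion-consumer URL. mechanize does not run that JavaScript, so the form has to be selected and submitted manually before the protected page becomes reachable -- this assumes the response page contains a single form with the SAMLResponse field already filled in:
# after the login submit above, the returned page should hold the hidden SAML form
br.select_form(nr=0)        # typically the only form on the page
acs_response = br.submit()  # POST the SAMLResponse back to the service provider
print acs_response.read()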