How can I add my web scrape process using bs4 to selenium automation in Python to make it one single process which just asks for a zipcode? - python-2.7

I am using Selenium to open a website, type a zip code (which I enter beforehand) into the search box, and click the search button. For that zip code, I want to take the link of the resulting page and feed it to my web scraper, which is written with Beautiful Soup; once I have the link, I can scrape the required data and produce my CSV.
What I want:
I am having trouble getting that link to the beautiful soup URL. I basically want to automate it so that I just have to enter a zip code and it gives me my CSV.
What I am able to get:
I am able to enter the zip code and search using selenium and then add that url to my scraper to give csv.
Code I am using for selenium :
# Selenium part: open weather.gov, type the zip code and press the search button.
driver = webdriver.Chrome('/Users/akashgupta/Desktop/Courses and Learning/Automating Python and scraping/chromedriver')
driver.get('https://www.weather.gov/')
# XPath attribute tests are written with "@" (//*[@id="..."]).
messageField = driver.find_element_by_xpath('//*[@id="inputstring"]')
messageField.click()
messageField.send_keys('75252')
time.sleep(3)
showMessageButton = driver.find_element_by_xpath('//*[@id="btnSearch"]')
showMessageButton.click()

# Web scraping part.
# NOTE: this URL is hard-coded for one location; it is not taken from the
# Selenium session above.
url = "https://forecast.weather.gov/MapClick.php?lat=32.99802500000004&lon=-96.79775499999994#.Xo5LnFNKgWo"
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
tag = soup.find_all('div', id='seven-day-forecast-body')
weekly = soup.find_all(class_='tombstone-container')
main = soup.find_all(class_='period-name')
description = soup.find_all(class_='short-desc')
temp = soup.find_all(class_='temp')

# Collect the text of each forecast period, its short description and its
# temperature, index by index.
Period_Name = []
Desc = []
Temp = []
for a in range(0, len(main)):
    Period_Name.append(main[a].get_text())
    Desc.append(description[a].get_text())
    Temp.append(temp[a].get_text())

# One row per forecast period.
df = pd.DataFrame(list(zip(Period_Name, Desc, Temp)), columns=['Period_Name', 'Short_Desc', 'Temperature'])

from selenium import webdriver
import time
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
driver = webdriver.Chrome('chromedriver.exe')
try:
    driver.get('https://www.weather.gov/')
    # XPath attribute tests are written with "@" (//*[@id="..."]).
    messageField = driver.find_element_by_xpath('//*[@id="inputstring"]')
    messageField.click()
    messageField.send_keys('75252')
    time.sleep(3)
    showMessageButton = driver.find_element_by_xpath('//*[@id="btnSearch"]')
    showMessageButton.click()
    # Wait (up to 10 seconds) until the browser URL contains the forecast
    # page pattern, then read the URL of the page the search led to.
    WebDriverWait(driver, 10).until(EC.url_contains("https://forecast.weather.gov/MapClick.php"))
    currentURL = driver.current_url
    print(currentURL)
finally:
    # Always end the browser session, even if the wait above times out.
    driver.quit()
# Web scraping part: fetch the page whose URL was captured from the browser.
res = requests.get(currentURL)
....

Related

Python 2.7 Selenium unable to extract data

I am trying to extract data, but it returns this error:
NoSuchElementException: Message: u'Unable to locate element: {"method":"xpath","selector":"//*[@id=\'searchpopbox\']"}' ; Stacktrace:
at FirefoxDriver.findElementInternal_ (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/driver_component.js:8444)
at FirefoxDriver.findElement (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/driver_component.js:8453)
at DelayedCommand.executeInternal_/h (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10456)
at DelayedCommand.executeInternal_ (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10461)
at DelayedCommand.execute/< (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10401)
My code is as below and I am trying to get the list from the link
import time  # needed for the time.sleep() call below

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Firefox profile with two download-related preferences set.
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
browser = webdriver.Firefox(profile)
url = 'https://www.bursamarketplace.com/index.php?tpl=th001_search_ajax'
browser.get(url)
# Fixed pause before looking the element up.
time.sleep(15)
# XPath attribute tests are written with "@" (//*[@id='...']).
a = browser.find_element_by_xpath("//*[@id='searchpopbox']")
print(a)
I am seeking your help to get the right xpath for the url.
This gets all the listing for that table.
import time  # needed for the time.sleep() call below

from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.bursamarketplace.com/index.php?tpl=th001_search_ajax")
# Fixed pause before looking the element up.
time.sleep(15)
# XPath attribute tests are written with "@" (//*[@id='...']).
a = driver.find_element_by_xpath("//*[@id='searchpopbox']")
# Print the element's visible text rather than the WebElement object itself.
print(a.text)
Or without chromedrivermanager same thing applies to firefox
.Chrome(executable_path='absolutepathofchromedriver.exe')

Format string to XML file

I want to reformat a string to the XML structure, but my string is not on an XML format (using Python 2.7).
I believe the correct way is to first create an XML format of the input in one line and then use XML Pretty Print for making it an XML file with multi rows and indentation (
Pretty printing XML in Python).
Below there is an example of an input after a History Server REST API's call to Hadoop server 1.
Input:
'{"jobAttempts":{"jobAttempt":[{"nodeHttpAddress":"slave2:8042","nodeId":"slave2:39637","id":1,"startTime":1544691730439,"containerId":"container_1544631848492_0013_01_000001","logsLink":"http://23.22.43.90:19888/jobhistory/logs/slave2:39637/container_1544631848492_0013_01_000001/job_1544631848492_0013/hadoop2"}]}}'
Output:
'<jobAttempts><jobAttempt><nodeHttpAddress>slave2:8042</nodeHttpAddress><nodeId>slave2:39637</nodeId><id>1</id><startTime>1544691730439</startTime><containerId>container_1544631848492_0013_01_000001</containerId><logsLink>http://23.22.43.90:19888/jobhistory/logs/slave2:39637/container_1544631848492_0013_01_000001/job_1544631848492_0013/hadoop2</logsLink></jobAttempt></jobAttempts>'
Final Output
<jobAttempts>
<jobAttempt>
<nodeHttpAddress>slave2:8042</nodeHttpAddress>
<nodeId>slave2:39637</nodeId>
<id>1</id>
<startTime>1544691730439</startTime>
<containerId>container_1544631848492_0013_01_000001</containerId>
<logsLink>http://23.22.43.90:19888/jobhistory/logs/slave2:39637/container_1544631848492_0013_01_000001/job_1544631848492_0013/hadoop2</logsLink>
</jobAttempt>
</jobAttempts>
*This string is actually an XML file which does not appear to have any style information associated with it.
I have found out that the source view of the History Server REST API's is indeed an XML file in one line. Thus, I had to read the source view and not the old problematic view with python.
Before I used
import urllib2
# Read the raw response body of the History Server REST endpoint (single
# "http://" scheme and single slashes in the path).
contents = urllib2.urlopen("http://23.22.43.90:19888/ws/v1/history/mapreduce/jobs/job_1544631848492_0013/jobattempts").read()
Now, I am downloading the source view of the html page with selenium and BeautifulSoup and I save it locally.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import xml.dom.minidom
# Load the REST endpoint in a real browser and keep the page source.
driver = webdriver.Firefox()
try:
    driver.get("http://23.22.43.90:19888/ws/v1/history/mapreduce/jobs/job_1544631848492_0013/jobattempts")
    page_source = driver.page_source
finally:
    # End the browser session even if loading the page fails.
    driver.quit()

soup = BeautifulSoup(page_source, "html.parser")
print(soup)

# Parse the one-line XML and re-serialise it with line breaks and indentation.
# The result is bound to "dom" so that the imported "xml" module is not rebound.
dom = xml.dom.minidom.parseString(str(soup))
pretty_xml_as_string = dom.toprettyxml()

# The "with" block closes the output file even if the write fails.
with open("./content_new_2.xml", 'w') as out_file:
    out_file.write(pretty_xml_as_string)

Python request download a file and save to a specific directory

Hello sorry if this question has been asked before.
But I have tried a lot of methods that provided.
Basically, I want to download a file from a website; my code is shown below. The code works perfectly, but the problem is that the file is saved automatically to the default download folder.
My concern is to download the file and save it to a specific folder.
I'm aware we can change our browser setting, but this is a server that is accessed remotely by different users, so the file is automatically downloaded to each user's own temporary folder, e.g. /users/adam_01/download/.
I want it to be saved on the server disk, in C://ExcelFile/
Below are my script and some of the data have been changing because it is confidential.
import pandas as pd
import html5lib
import time
from bs4 import BeautifulSoup
import requests
import csv
from datetime import datetime
import urllib.request
import os

# Log in with a session (so cookies persist across requests) and download
# the CSV export.
with requests.Session() as c:
    proxies = {"http": "http://:911"}
    url = 'https://......./login.jsp'
    # NOTE: credentials are hard-coded in this example.
    USERNAME = 'mwirzonw'
    PASSWORD = 'Fiqr123'
    c.get(url, verify=False)
    csrftoken = ''
    # NOTE(review): dict(proxies, ...) copies the proxy entry into the login
    # form fields; confirm that this is intended.
    login_data = dict(proxies, atl_token=csrftoken, os_username=USERNAME, os_password=PASSWORD, next='/')
    c.post(url, data=login_data, headers={"referer": "https://.....com"})
    page = c.get('https://........s...../SearchRequest-96010.csv')
    location = 'C:/Users/..../Downloads/'
    # The file name is relative, so the CSV is written to the current
    # working directory; "location" is not used here.
    with open('asdsad906010.csv', 'wb') as output:
        output.write(page.content)
    print("Done!")
Thank you, be pleased to ask if any confusing information was given.
Regards,
Fiqri
It seems that from your script you are writing the file to asdsad906010.csv. You should be able to change the output directory as follows.
# Set the output directory to your desired location
output_directory = 'C:/ExcelFile/'
# Create the directory first if it does not exist yet, so that open() below
# does not fail on a missing folder
if not os.path.isdir(output_directory):
    os.makedirs(output_directory)
# Create a file path by joining the directory name with the desired file name
file_path = os.path.join(output_directory, 'asdsad906010.csv')
# Write the file
with open(file_path, 'wb') as output:
    output.write(page.content)

Python Web scraper using Beautifulsoup 4

I wanted to create a database with commonly used words. Right now when I run this script it works fine but my biggest issue is I need all of the words to be in one column. I feel like what I did was more of a hack than a real fix. Using Beautifulsoup, can you print everything in one column without having extra blank lines?
import requests
import re
from bs4 import BeautifulSoup
#Website you want to scrap info from
res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
# Getting just the content using bs4
soup = BeautifulSoup(res.content, "lxml")
# Creating the CSV file (the "with" block closes it even if a write fails)
with open('common_words.csv', 'wb') as commonFile:
    # Grabbing the lines you want
    for node in soup.findAll("tr"):
        # Getting just the text and removing the html
        words = ''.join(node.findAll(text=True))
        # Removing the extra lines
        ID = re.sub(r'[\t\r\n]', '', words)
        # Needed to add a break in the line to make the rows
        update = ''.join(ID)+'\n'
        # Now we add this to the file
        commonFile.write(update)
How about this?
import requests
import csv
from bs4 import BeautifulSoup
#Website you want to scrap info from
res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
# Getting just the content using bs4
soup = BeautifulSoup(res.content, "lxml")
# Every table row inside the file view holds one word.
words = soup.select('div[class=file] tr')
# The "with" block closes (and flushes) the CSV file when writing is done.
with open("common_words.csv", "w") as csv_file:
    f = csv.writer(csv_file)
    f.writerow(["common_words"])
    for row in words:
        # Drop the newlines in the row text so each word is a single cell.
        f.writerow([row.text.replace('\n', '')])

simple web crawler

I wrote the program below in Python as a very simple web crawler, but when I run it, it returns
"'NoneType' object is not callable". Could you please help me?
import BeautifulSoup
import urllib2
def union(p,q):
    """Append to ``p`` every element of ``q`` that ``p`` does not already contain.

    ``p`` is modified in place and nothing is returned. Elements are compared
    with ``in``, so duplicates inside ``q`` are added only once.
    """
    for e in q:
        if e not in p:
            p.append(e)
def crawler(SeedUrl):
    """Crawl starting from ``SeedUrl`` and return the list of crawled pages.

    Pages waiting to be fetched are kept in ``tocrawl`` (used as a stack);
    pages already processed are kept in ``crawled``.
    """
    tocrawl=[SeedUrl]
    crawled=[]
    while tocrawl:
        page=tocrawl.pop()
        pagesource=urllib2.urlopen(page)
        s=pagesource.read()
        soup=BeautifulSoup.BeautifulSoup(s)
        # NOTE: soup('a') yields the anchor Tag objects, not URL strings.
        # union() below puts those Tag objects into tocrawl, so a later
        # iteration passes a Tag to urllib2.urlopen().
        links=soup('a')
        if page not in crawled:
            union(tocrawl,links)
            crawled.append(page)
    return crawled
crawler('http://www.princeton.edu/main/')
[UPDATE] Here is the complete project code
https://bitbucket.org/deshan/simple-web-crawler
[ANSWER]
soup('a') returns the complete HTML tag, for example:
<a href="...">Buy Music Now</a>
so urlopen gives the error
"'NoneType' object is not callable". You need to extract only the URL (the href attribute).
# Keep only the anchors that actually carry an href attribute ...
links=soup.findAll('a',href=True)
# ... and read the URL string out of each one.
for link in links:
    print(link['href'])
You need to validate the URL too. Refer to the following answers:
How do you validate a URL with a regular expression in Python?
Python - How to validate a url in python ? (Malformed or not)
Again, I would suggest using Python sets instead of lists: you can easily add URLs and omit duplicates.
http://docs.python.org/2/library/sets.html
Try the following code:
import re
import httplib
import urllib2
from urlparse import urlparse
import BeautifulSoup
# Pattern for an absolute URL: scheme, then a domain name, "localhost" or an
# IPv4-style address, an optional port, and an optional path/query.
# Matching is case-insensitive.
regex = re.compile(
    r'^(?:http|ftp)s?://' # http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
    r'localhost|' #localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
    r'(?::\d+)?' # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def isValidUrl(url):
    """Return True if ``url`` matches ``regex``, otherwise False.

    Relative links (e.g. ``/main/about.html``) and other schemes
    (e.g. ``mailto:``) do not match and therefore yield False.
    """
    return regex.match(url) is not None
def crawler(SeedUrl):
    """Crawl starting from ``SeedUrl`` and return the list of crawled pages.

    Each page is fetched at most once. Every anchor with an ``href`` that
    ``isValidUrl`` accepts is queued for crawling. Pages that cannot be
    fetched are skipped and are not included in the returned list.
    """
    tocrawl = [SeedUrl]
    crawled = []
    seen = set()  # pages already taken from tocrawl; O(1) membership test
    while tocrawl:
        page = tocrawl.pop()
        # Check before fetching so that a page queued several times is not
        # downloaded again.
        if page in seen:
            continue
        seen.add(page)
        print('Crawled:' + page)
        try:
            pagesource = urllib2.urlopen(page)
            s = pagesource.read()
        except urllib2.URLError:
            # A single unreachable page must not abort the whole crawl.
            continue
        soup = BeautifulSoup.BeautifulSoup(s)
        for l in soup.findAll('a', href=True):
            href = l['href']
            if isValidUrl(href) and href not in seen:
                tocrawl.append(href)
        crawled.append(page)
    return crawled
crawler('http://www.princeton.edu/main/')