I am a newbie when it comes to programming and Python, so I've got a question.
My fellow students and I have created a few Python scripts, but now we are stuck and out of ideas. We need to merge a few Python scripts into one working script. Could anyone help us with that, please?
Scripts:
# Script: webpage_get.py
# Desc: Fetches data from a webpage.
# Author: Wojciech Kociszewski
# Created: Nov, 2013
#
import sys, urllib

def wget(url):
    ''' Try to retrieve a webpage via its url, and return its contents '''
    print '[*] wget()'
    # open a file-like url object from the web, based on the url
    url_file = urllib.urlopen(url)
    # get webpage contents
    page = url_file.read()
    return page

def main():
    # temp testing url argument
    sys.argv.append('http://www.soc.napier.ac.uk/~cs342/CSN08115/cw_webpage/index.html')
    # check args
    if len(sys.argv) != 2:
        print '[-] Usage: webpage_get URL'
        return
    # Get and analyse the web page
    print wget(sys.argv[1])

if __name__ == '__main__':
    main()
# Script: webpage_getlinks.py
# Desc: Basic web site info gathering and analysis script. From a URL gets
# page content, parsing links out.
# Author: Wojciech Kociszewski
# Created: Nov, 2013
#
import sys, re
import webpage_get
def print_links(page):
    ''' find all hyperlinks on a webpage passed in as input and print them '''
    print '[*] print_links()'
    # regex to match hyperlinks; findall returns 2-tuples, link[1] being the URL itself
    links = re.findall(r'(\<a.*href\=.*)(http\:.+)(?:[^\'" >]+)', page)
    # sort and print the links
    links.sort()
    print '[+]', str(len(links)), 'HyperLinks Found:'
    for link in links:
        print link[1]

def main():
    # temp testing url argument
    sys.argv.append('http://www.soc.napier.ac.uk/~cs342/CSN08115/cw_webpage/index.html')
    # Check args
    if len(sys.argv) != 2:
        print '[-] Usage: webpage_getlinks URL'
        return
    # Get the web page
    page = webpage_get.wget(sys.argv[1])
    # Get the links
    print_links(page)

if __name__ == '__main__':
    main()
# Script: webpage_getemails.py
# Desc: Basic web site info gathering and analysis script. From a URL gets
# page content, parsing emails out.
# Author: Wojciech Kociszewski
# Created: Nov, 2013
#
import sys, re
import webpage_get
def print_emails(page):
    ''' find all emails on a webpage passed in as input and print them '''
    print '[*] print_emails()'
    # regex to match emails (the dash is escaped so it is a literal, not a range)
    emails = re.findall(r'([\d\w\.\-_]+@[\w\d\.\-_]+\.\w+)', page)
    # sort and print the emails
    emails.sort()
    print '[+]', str(len(emails)), 'Emails Found:'
    for email in emails:
        print email

def main():
    # temp testing url argument
    sys.argv.append('http://www.soc.napier.ac.uk/~cs342/CSN08115/cw_webpage/index.html')
    # Check args
    if len(sys.argv) != 2:
        print '[-] Usage: webpage_getemails URL'
        return
    # Get the web page
    page = webpage_get.wget(sys.argv[1])
    # Get the emails
    print_emails(page)

if __name__ == '__main__':
    main()
Analyse your scripts and find the common code.
Convert the common code into a module.
Rewrite the individual programs around that module.
If you then wish to make the individual programs into one big program, it will be much easier; a sketch of the end result follows.
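For example, here is a minimal sketch of the merged script (the file name webpage_info.py is my own choice, and the code stays in the Python 2 style of the scripts above): wget(), print_links() and print_emails() live in one module, and a single main() drives all of them.
# Script: webpage_info.py (hypothetical name for the merged script)
import sys, re, urllib

def wget(url):
    ''' retrieve a webpage via its url and return its contents '''
    print '[*] wget()'
    return urllib.urlopen(url).read()

def print_links(page):
    ''' find and print all hyperlinks on a page '''
    links = re.findall(r'(\<a.*href\=.*)(http\:.+)(?:[^\'" >]+)', page)
    links.sort()
    print '[+]', str(len(links)), 'HyperLinks Found:'
    for link in links:
        print link[1]

def print_emails(page):
    ''' find and print all email addresses on a page '''
    emails = re.findall(r'([\d\w\.\-_]+@[\w\d\.\-_]+\.\w+)', page)
    emails.sort()
    print '[+]', str(len(emails)), 'Emails Found:'
    for email in emails:
        print email

def main():
    if len(sys.argv) != 2:
        print '[-] Usage: webpage_info URL'
        return
    page = wget(sys.argv[1])
    print_links(page)
    print_emails(page)

if __name__ == '__main__':
    main()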
I am using Selenium to go to a website, click its search button, and type in a zip code that I enter beforehand. For that zip code, I want the link the webpage produces so I can feed it to my web scraper, built using Beautiful Soup, and once the link comes up I can scrape the required data to get my CSV.
What I want:
I am having trouble getting that link into the Beautiful Soup URL. I basically want to automate it so that I just have to enter a zip code and it gives me my CSV.
What I am able to get:
I am able to enter the zip code and search using Selenium, and then manually add that URL to my scraper to produce the CSV.
Code I am using for Selenium:
# imports needed by the snippets below
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome('/Users/akashgupta/Desktop/Courses and Learning/Automating Python and scraping/chromedriver')
driver.get('https://www.weather.gov/')
messageField = driver.find_element_by_xpath('//*[@id="inputstring"]')
messageField.click()
messageField.send_keys('75252')
time.sleep(3)
showMessageButton = driver.find_element_by_xpath('//*[@id="btnSearch"]')
showMessageButton.click()

# web scraping part:
url = "https://forecast.weather.gov/MapClick.php?lat=32.99802500000004&lon=-96.79775499999994#.Xo5LnFNKgWo"
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
tag = soup.find_all('div', id='seven-day-forecast-body')
weekly = soup.find_all(class_='tombstone-container')
main = soup.find_all(class_='period-name')
description = soup.find_all(class_='short-desc')
temp = soup.find_all(class_='temp')
Period_Name = []
Desc = []
Temp = []
for a in range(0, len(main)):
    Period_Name.append(main[a].get_text())
    Desc.append(description[a].get_text())
    Temp.append(temp[a].get_text())
df = pd.DataFrame(list(zip(Period_Name, Desc, Temp)), columns=['Period_Name', 'Short_Desc', 'Temperature'])
from selenium import webdriver
import time
import requests
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome('chromedriver.exe')
driver.get('https://www.weather.gov/')
messageField = driver.find_element_by_xpath('//*[@id="inputstring"]')
messageField.click()
messageField.send_keys('75252')
time.sleep(3)
showMessageButton = driver.find_element_by_xpath('//*[@id="btnSearch"]')
showMessageButton.click()
WebDriverWait(driver, 10).until(EC.url_contains("https://forecast.weather.gov/MapClick.php"))  # wait here until the url matches your output pattern
currentURL = driver.current_url
print(currentURL)
time.sleep(3)
driver.quit()

# web scraping part:
res = requests.get(currentURL)
....
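To get all the way to "enter a zip code, get a CSV", here is a hedged end-to-end sketch under the same assumptions as the answer above (Selenium 3-era bindings, chromedriver.exe on the path, and the field names from the question's scraper); zip_to_csv is my own name for the wrapper:
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def zip_to_csv(zipcode, out_csv='forecast.csv'):
    # Selenium half: resolve the zip code to a forecast URL
    driver = webdriver.Chrome('chromedriver.exe')
    driver.get('https://www.weather.gov/')
    field = driver.find_element_by_xpath('//*[@id="inputstring"]')
    field.click()
    field.send_keys(zipcode)
    time.sleep(3)
    driver.find_element_by_xpath('//*[@id="btnSearch"]').click()
    WebDriverWait(driver, 10).until(EC.url_contains("MapClick.php"))
    url = driver.current_url
    driver.quit()
    # Beautiful Soup half: scrape the forecast page into a CSV
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    names = [t.get_text() for t in soup.find_all(class_='period-name')]
    descs = [t.get_text() for t in soup.find_all(class_='short-desc')]
    temps = [t.get_text() for t in soup.find_all(class_='temp')]
    df = pd.DataFrame(list(zip(names, descs, temps)),
                      columns=['Period_Name', 'Short_Desc', 'Temperature'])
    df.to_csv(out_csv, index=False)
    return df

zip_to_csv('75252')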
I appreciate the help in advance. I am trying to write a Python script that posts an IP address to the site referenced below, gets the results printed out in the terminal or a file, and then reads the file immediately after.
Here is my script:
#!/usr/bin/env python
import requests

IP = raw_input("Enter IP address here: ")
Alert_URL = 'http://www.blacklistalert.org'

def submit_form():
    """Submit a form"""
    payload = IP
    # make a GET request
    resp = requests.get(Alert_URL)
    print "Response to GET request: %s" % resp.content
    # send POST request
    resp = requests.post(Alert_URL, payload)
    print "Headers from a POST request response: %s" % resp.headers
    # print "HTML Response: %s" % resp.read()

if __name__ == '__main__':
    submit_form()
The site has a section on the web page to input IP addresses, and inspecting the site I found the form markup below:
<form method=POST onsubmit="document.forms[0].submit.disabled='true';">
IP or Domain <input onclick="this.value='';" name=q value=11.11.154.23>
I would like to post an IP address that I want to check through the input field above somehow, for instance using raw_input to fill in the 'value=' attribute, and then get the result.
Thanks for the help.
You need to parse the PHPSESSID from the page and post it along with the query:
import requests
from bs4 import BeautifulSoup

ip = raw_input("Enter IP address here: ")
data = {"q": ip}  # ip goes here
url = "http://www.blacklistalert.org/"

with requests.Session() as s:
    # get the page first to parse
    soup = BeautifulSoup(s.get(url).content, "html.parser")
    # extract and add the PHPSESSID
    PHPSESSID = soup.select_one("input[name=PHPSESSID]")["value"]
    data["PHPSESSID"] = PHPSESSID
    # finally post
    res = s.post(url, data=data)
    print(res)
    print(res.content)
I just wrote a simple web-scraping script to give me all the episode links on a particular site's page. The script was working fine, but now it's broken, and I didn't change anything.
Try this URL (for scraping): http://www.crunchyroll.com/tabi-machi-late-show
Now the script works midway and gives me an error stating 'Element not found in the cache - perhaps the page has changed since it was looked up'.
I looked it up on the internet, and people suggested using the 'implicit wait' command in certain places. I did that, still no luck.
UPDATE: I tried this script on a remote desktop and it works there without any problems.
Here's my script:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
from subprocess import Popen
#------------------------------------------------
try:
    Link = raw_input("Please enter your Link : ")
    if not Link:
        raise ValueError('Please Enter A Link To The Anime Page. This Application Will now Exit in 5 Seconds.')
except ValueError as e:
    print(e)
    time.sleep(5)
    exit()

print 'Analyzing the Page. Hold on a minute.'
driver = webdriver.Firefox()
driver.get(Link)
assert "Crunchyroll" in driver.title
driver.implicitly_wait(5)  # <-- I tried removing these lines as well. No luck.
elem = driver.find_elements_by_xpath("//*[@href]")
driver.implicitly_wait(10)  # <-- I tried removing these lines as well. No luck.
text_file = open("BatchLink.txt", "w")
print 'Fetching The Links, please wait.'
for elem in elem:
    x = elem.get_attribute("href")
    #print x
    text_file.write(x + '\n')
print 'Links have been fetched. Just doing the final cleaning now.'
text_file.close()

CleanFile = open("queue.txt", "w")
with open('BatchLink.txt') as f:
    mylist = f.read().splitlines()
    #print mylist
with open('BatchLink.txt', 'r') as inF:
    for line in inF:
        if 'episode' in line:
            CleanFile.write(line)
print 'Please Check the file named queue.txt'
CleanFile.close()
os.remove('BatchLink.txt')
driver.close()
Here's a screenshot of the error (might be of some help):
http://i.imgur.com/SaANlsg.png
OK, I haven't worked with Python, but I know the problem.
You have a variable that you initialise: elem = driver.find_elements_by_xpath("//*[@href]")
After that you do some things with it in a loop.
Before finishing the loop, try to initialise this variable again:
elem = driver.find_elements_by_xpath("//*[@href]")
The thing is that the DOM changes, and you lose the element collection: the references go stale.
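In Python that amounts to something like the sketch below (hedged: it keeps the question's Selenium 3-era calls and file names). Copying the href strings out immediately sidesteps the problem entirely, since plain strings cannot go stale when the DOM changes:
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("http://www.crunchyroll.com/tabi-machi-late-show")
driver.implicitly_wait(5)

# Read every href attribute into a plain list right away; later DOM changes
# cannot invalidate strings the way they invalidate element handles.
hrefs = [e.get_attribute("href") for e in driver.find_elements_by_xpath("//*[@href]")]
driver.quit()

# Keep only the episode links, as in the original script.
with open("queue.txt", "w") as out:
    for href in hrefs:
        if href and "episode" in href:
            out.write(href + "\n")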
I wrote a script to upload a video to YouTube using the YouTube Data API v3 in Python, with the help of the example given in Example code.
And I wrote another script to add an uploaded video to a playlist using the same YouTube Data API v3, which can be seen here.
After that I wrote a single script to upload a video and add that video to a playlist. In it I took care of authentication and scopes, but I am still getting a permission error. Here is my new script:
#!/usr/bin/python
import httplib
import httplib2
import os
import random
import sys
import time
from apiclient.discovery import build
from apiclient.errors import HttpError
from apiclient.http import MediaFileUpload
from oauth2client.file import Storage
from oauth2client.client import flow_from_clientsecrets
from oauth2client.tools import run
# Explicitly tell the underlying HTTP transport library not to retry, since
# we are handling retry logic ourselves.
httplib2.RETRIES = 1
# Maximum number of times to retry before giving up.
MAX_RETRIES = 10
# Always retry when these exceptions are raised.
RETRIABLE_EXCEPTIONS = (httplib2.HttpLib2Error, IOError, httplib.NotConnected,
                        httplib.IncompleteRead, httplib.ImproperConnectionState,
                        httplib.CannotSendRequest, httplib.CannotSendHeader,
                        httplib.ResponseNotReady, httplib.BadStatusLine)
# Always retry when an apiclient.errors.HttpError with one of these status
# codes is raised.
RETRIABLE_STATUS_CODES = [500, 502, 503, 504]
CLIENT_SECRETS_FILE = "client_secrets.json"
# A limited OAuth 2 access scope that allows for uploading files, but not other
# types of account access.
YOUTUBE_UPLOAD_SCOPE = "https://www.googleapis.com/auth/youtube.upload"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
# Helpful message to display if the CLIENT_SECRETS_FILE is missing.
MISSING_CLIENT_SECRETS_MESSAGE = """
WARNING: Please configure OAuth 2.0
To make this sample run you will need to populate the client_secrets.json file
found at:
%s
with information from the APIs Console
https://code.google.com/apis/console#access
For more information about the client_secrets.json file format, please visit:
https://developers.google.com/api-client-library/python/guide/aaa_client_secrets
""" % os.path.abspath(os.path.join(os.path.dirname(__file__),
CLIENT_SECRETS_FILE))
def get_authenticated_service():
    flow = flow_from_clientsecrets(CLIENT_SECRETS_FILE, scope=YOUTUBE_UPLOAD_SCOPE,
                                   message=MISSING_CLIENT_SECRETS_MESSAGE)
    storage = Storage("%s-oauth2.json" % sys.argv[0])
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = run(flow, storage)
    return build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                 http=credentials.authorize(httplib2.Http()))
def initialize_upload(title, description, keywords, privacyStatus, file):
    youtube = get_authenticated_service()
    tags = None
    if keywords:
        tags = keywords.split(",")
    insert_request = youtube.videos().insert(
        part="snippet,status",
        body=dict(
            snippet=dict(
                title=title,
                description=description,
                tags=tags,
                categoryId='26'
            ),
            status=dict(
                privacyStatus=privacyStatus
            )
        ),
        # chunksize=-1 means that the entire file will be uploaded in a single
        # HTTP request. (If the upload fails, it will still be retried where it
        # left off.) This is usually a best practice, but if you're using Python
        # older than 2.6 or if you're running on App Engine, you should set the
        # chunksize to something like 1024 * 1024 (1 megabyte).
        media_body=MediaFileUpload(file, chunksize=-1, resumable=True)
    )
    vid = resumable_upload(insert_request)
    # Here I added lines to add the video to a playlist
    #add_video_to_playlist(youtube, vid, "PL2JW1S4IMwYubm06iDKfDsmWVB-J8funQ")
    #youtube = get_authenticated_service()
    add_video_request = youtube.playlistItems().insert(
        part="snippet",
        body={
            'snippet': {
                'playlistId': "PL2JW1S4IMwYubm06iDKfDsmWVB-J8funQ",
                'resourceId': {
                    'kind': 'youtube#video',
                    'videoId': vid
                }
                #'position': 0
            }
        }
    ).execute()
    return vid  # so the caller can print the video id
def resumable_upload(insert_request):
    response = None
    error = None
    retry = 0
    vid = None
    while response is None:
        try:
            print "Uploading file..."
            status, response = insert_request.next_chunk()
            if 'id' in response:
                print "Video id '%s' was successfully uploaded." % response['id']
                vid = response['id']
            else:
                exit("The upload failed with an unexpected response: %s" % response)
        except HttpError, e:
            if e.resp.status in RETRIABLE_STATUS_CODES:
                error = "A retriable HTTP error %d occurred:\n%s" % (e.resp.status,
                                                                     e.content)
            else:
                raise
        except RETRIABLE_EXCEPTIONS, e:
            error = "A retriable error occurred: %s" % e
        if error is not None:
            print error
            retry += 1
            if retry > MAX_RETRIES:
                exit("No longer attempting to retry.")
            max_sleep = 2 ** retry
            sleep_seconds = random.random() * max_sleep
            print "Sleeping %f seconds and then retrying..." % sleep_seconds
            time.sleep(sleep_seconds)
    return vid
if __name__ == '__main__':
    title = "sample title"
    description = "sample description"
    keywords = "keyword1,keyword2,keyword3"
    privacyStatus = "public"
    file = "myfile.mp4"
    vid = initialize_upload(title, description, keywords, privacyStatus, file)
    print 'video ID is :', vid
I am not able to figure out what is wrong. I am getting a permission error, yet both scripts work fine independently.
Could anyone help me figure out where I went wrong, or how to achieve uploading a video and adding it to a playlist?
I got the answer: the two independent scripts actually use different scopes.
The scope for uploading is "https://www.googleapis.com/auth/youtube.upload".
The scope for adding to a playlist is "https://www.googleapis.com/auth/youtube".
As the scopes are different, I had to handle authentication for each separately.
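Alternatively (a hedged sketch based on the oauth2client API already used above), one OAuth flow can request both scopes, so a single credential covers both the upload and the playlist insert; note the old *-oauth2.json token file has to be deleted first so consent is asked again with the wider scopes:
# Request both scopes in one flow; oauth2client accepts a space-separated
# string (or an iterable) of scopes.
YOUTUBE_SCOPES = ["https://www.googleapis.com/auth/youtube.upload",
                  "https://www.googleapis.com/auth/youtube"]

def get_authenticated_service():
    flow = flow_from_clientsecrets(CLIENT_SECRETS_FILE,
                                   scope=" ".join(YOUTUBE_SCOPES),
                                   message=MISSING_CLIENT_SECRETS_MESSAGE)
    storage = Storage("%s-oauth2.json" % sys.argv[0])
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = run(flow, storage)
    return build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                 http=credentials.authorize(httplib2.Http()))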
I wrote the program below in Python for a very simple web crawler, but when I run it, it returns
'NoneType' object is not callable. Could you please help me?
import BeautifulSoup
import urllib2

def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)

def crawler(SeedUrl):
    tocrawl = [SeedUrl]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        pagesource = urllib2.urlopen(page)
        s = pagesource.read()
        soup = BeautifulSoup.BeautifulSoup(s)
        links = soup('a')
        if page not in crawled:
            union(tocrawl, links)
            crawled.append(page)
    return crawled

crawler('http://www.princeton.edu/main/')
[UPDATE] Here is the complete project code
https://bitbucket.org/deshan/simple-web-crawler
[ANSWER]
soup('a') returns complete HTML tags, e.g. <a href="...">Buy Music Now</a>,
so urlopen gives the error 'NoneType' object is not callable. You need to extract only the URL/href:
links = soup.findAll('a', href=True)
for l in links:
    print(l['href'])
You need to validate the URL too; refer to the following answers:
How do you validate a URL with a regular expression in Python?
Python - How to validate a url in python ? (Malformed or not)
Again, I would suggest using Python sets instead of arrays; you can easily add URLs and omit duplicates.
http://docs.python.org/2/library/sets.html
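For instance, a small sketch of the set-based bookkeeping (get_links is a hypothetical helper standing in for the fetch-and-parse step above):
def crawl(seed_url, get_links):
    crawled = set()            # visited pages; a set makes duplicates impossible
    tocrawl = set([seed_url])  # frontier of pages still to fetch
    while tocrawl:
        page = tocrawl.pop()
        if page in crawled:
            continue
        crawled.add(page)
        tocrawl.update(get_links(page))  # set.update() silently drops repeats
    return crawled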
Try the following code:
import re
import httplib
import urllib2
from urlparse import urlparse
import BeautifulSoup

regex = re.compile(
    r'^(?:http|ftp)s?://' # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
    r'localhost|' # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
    r'(?::\d+)?' # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def isValidUrl(url):
    return regex.match(url) is not None

def crawler(SeedUrl):
    tocrawl = [SeedUrl]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        print 'Crawled:' + page
        pagesource = urllib2.urlopen(page)
        s = pagesource.read()
        soup = BeautifulSoup.BeautifulSoup(s)
        links = soup.findAll('a', href=True)
        if page not in crawled:
            for l in links:
                if isValidUrl(l['href']):
                    tocrawl.append(l['href'])
            crawled.append(page)
    return crawled

crawler('http://www.princeton.edu/main/')