LPTHW 41 - url failure - python-2.7

The script includes:
import random
from urllib import urlopen
import sys
WORD_URL = "http://learncodethehardway.org/words.txt"
WORDS = []
...
# load up the words from the website
for word in urlopen(WORD_URL).readlines():
    WORDS.append(word.strip())
...
My problem occurs when I try to run the script. I get the following tracebacks:
[screenshot of the traceback: xterm_003.jpg]
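Since the traceback only exists as a screenshot here, a minimal sketch (my own addition, assuming Python 2.7 and that the failure happens inside urlopen) that surfaces the underlying error instead of a bare crash:

import sys
from urllib import urlopen

WORD_URL = "http://learncodethehardway.org/words.txt"
WORDS = []

try:
    # readlines() pulls the whole response body as a list of lines
    for word in urlopen(WORD_URL).readlines():
        WORDS.append(word.strip())
except IOError as e:
    # on Python 2.7, urllib reports network problems (DNS failures,
    # refused connections, etc.) as IOError, so printing it shows the real cause
    print >> sys.stderr, "Could not fetch %s: %s" % (WORD_URL, e)
    sys.exit(1)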

Related

Django-grpc-framework generates strange gRPC code

When I generate gRPC code in my django-grpc-framework project using command:
python -m grpc_tools.protoc --proto_path=./ --python_out=./temp --grpc_python_out=./temp ./config.proto
Something is generated, but config_pb2 looks rather empty:
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: config.proto
"""Generated protocol buffer code."""
from google.protobuf.internal import builder as _builder
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x63onfig.proto\x12\x0c\x63onfig_proto\x1a\x1bgoogle/protobuf/empty.proto\"u\n\rConfigMessage\x12\x0f\n\x07service\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t\x12\x0f\n\x07is_used\x18\x03 \x01(\x08\x1a\x31\n\x03Key\x12\x13\n\x0bservice_key\x18\x01 \x01(\t\x12\x15\n\rservice_value\x18\x02 \x01(\t\"\x1a\n\x18\x43onfigMessageListRequest\"*\n\x1c\x43onfigMessageRetrieveRequest\x12\n\n\x02id\x18\x01 \x01(\x05\x32\x88\x03\n\x10\x43onfigController\x12O\n\x04List\x12&.config_proto.ConfigMessageListRequest\x1a\x1b.config_proto.ConfigMessage\"\x00\x30\x01\x12\x44\n\x06\x43reate\x12\x1b.config_proto.ConfigMessage\x1a\x1b.config_proto.ConfigMessage\"\x00\x12U\n\x08Retrieve\x12*.config_proto.ConfigMessageRetrieveRequest\x1a\x1b.config_proto.ConfigMessage\"\x00\x12\x44\n\x06Update\x12\x1b.config_proto.ConfigMessage\x1a\x1b.config_proto.ConfigMessage\"\x00\x12#\n\x07\x44\x65stroy\x12\x1b.config_proto.ConfigMessage\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3')
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'config_pb2', globals())
if _descriptor._USE_C_DESCRIPTORS == False:
  DESCRIPTOR._options = None
  _CONFIGMESSAGE._serialized_start=59
  _CONFIGMESSAGE._serialized_end=176
  _CONFIGMESSAGE_KEY._serialized_start=127
  _CONFIGMESSAGE_KEY._serialized_end=176
  _CONFIGMESSAGELISTREQUEST._serialized_start=178
  _CONFIGMESSAGELISTREQUEST._serialized_end=204
  _CONFIGMESSAGERETRIEVEREQUEST._serialized_start=206
  _CONFIGMESSAGERETRIEVEREQUEST._serialized_end=248
  _CONFIGCONTROLLER._serialized_start=251
  _CONFIGCONTROLLER._serialized_end=643
# @@protoc_insertion_point(module_scope)
So when I look at config_pb2_grpc, I see imports like:
config__pb2.ConfigMessage
But there is no ConfigMessage class defined in config_pb2. On top of that, config_pb2 has one underscore while the import uses two underscores, which seems rather strange to me.
Is this all right?
For example, when I generate code in my ordinary django-rest-framework project using:
protoc -I=$SRC_DIR --python_out=$DST_DIR $SRC_DIR/data.proto
I get in data_pb2.py:
...
ConfigMessage = _reflection.GeneratedProtocolMessageType('ConfigMessage', (_message.Message,), {
...
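For what it's worth, recent protobuf releases generate message classes at import time through the _builder calls rather than writing a GeneratedProtocolMessageType block into the file, and the double underscore in config__pb2 is just how protoc escapes the module name when it builds import aliases. A quick sanity check (a sketch, assuming the generated module is importable from the current directory) would be:

import config_pb2

# if BuildTopDescriptorsAndMessages ran at import time, the class exists
# even though it is not spelled out in the generated file
msg = config_pb2.ConfigMessage(service="demo", version="1.0", is_used=True)
print(msg)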

How can I add my web scrape process using bs4 to selenium automation in Python to make it one single process which just asks for a zipcode?

I am using Selenium to go to a website, click the search box, and type in a zip code that I enter beforehand. For that zip code, I want the link the page lands on to feed my web scraper built with Beautiful Soup, so that once the link comes up I can scrape the required data into my CSV.
What I want:
I am having trouble passing that link on as the Beautiful Soup URL. I basically want to automate it so that I only have to enter a zip code and it gives me my CSV.
What I am able to get:
I am able to enter the zip code and search using Selenium, and then manually add that URL to my scraper to produce the CSV.
Code I am using for Selenium:
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

driver = webdriver.Chrome('/Users/akashgupta/Desktop/Courses and Learning/Automating Python and scraping/chromedriver')
driver.get('https://www.weather.gov/')
messageField = driver.find_element_by_xpath('//*[@id="inputstring"]')
messageField.click()
messageField.send_keys('75252')
time.sleep(3)
showMessageButton = driver.find_element_by_xpath('//*[@id="btnSearch"]')
showMessageButton.click()
#web scraping Part:
url="https://forecast.weather.gov/MapClick.php?lat=32.99802500000004&lon=-96.79775499999994#.Xo5LnFNKgWo"
res= requests.get(url)
soup=BeautifulSoup(res.content,'html.parser')
tag=soup.find_all('div',id='seven-day-forecast-body')
weekly=soup.find_all(class_='tombstone-container')
main=soup.find_all(class_='period-name')
description=soup.find_all(class_='short-desc')
temp=soup.find_all(class_='temp')
Period_Name=[]
Desc=[]
Temp=[]
for a in range(0,len(main)):
    Period_Name.append(main[a].get_text())
    Desc.append(description[a].get_text())
    Temp.append(temp[a].get_text())
df = pd.DataFrame(list(zip(Period_Name, Desc,Temp)),columns =['Period_Name', 'Short_Desc','Temperature'])
from selenium import webdriver
import time
import requests
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
driver = webdriver.Chrome('chromedriver.exe')
driver.get('https://www.weather.gov/')
messageField = driver.find_element_by_xpath('//*[@id="inputstring"]')
messageField.click()
messageField.send_keys('75252')
time.sleep(3)
showMessageButton = driver.find_element_by_xpath('//*[@id="btnSearch"]')
showMessageButton.click()
WebDriverWait(driver, 10).until(EC.url_contains("https://forecast.weather.gov/MapClick.php"))  # wait until the URL matches the forecast page pattern
currentURL = driver.current_url
print(currentURL)
time.sleep(3)
driver.quit()
#web scraping Part:
res= requests.get(currentURL)
....
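Putting the two halves together, here is a rough sketch of a single function that takes the zip code and writes the CSV (get_forecast_csv is a name I made up; it assumes the same weather.gov element ids and forecast page classes as the snippets above):

import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def get_forecast_csv(zipcode, out_path='forecast.csv'):
    # drive the search with Selenium and grab the forecast URL it lands on
    driver = webdriver.Chrome('chromedriver.exe')
    driver.get('https://www.weather.gov/')
    driver.find_element_by_xpath('//*[@id="inputstring"]').send_keys(zipcode)
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="btnSearch"]').click()
    WebDriverWait(driver, 10).until(EC.url_contains("forecast.weather.gov/MapClick.php"))
    url = driver.current_url
    driver.quit()

    # scrape the forecast page with Beautiful Soup, as in the question code
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    names = [t.get_text() for t in soup.find_all(class_='period-name')]
    descs = [t.get_text() for t in soup.find_all(class_='short-desc')]
    temps = [t.get_text() for t in soup.find_all(class_='temp')]
    df = pd.DataFrame(list(zip(names, descs, temps)),
                      columns=['Period_Name', 'Short_Desc', 'Temperature'])
    df.to_csv(out_path, index=False)
    return df

get_forecast_csv(input("Enter zip code: "))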

Abaqus python macro function with for loop - 'unexpected indent'

I'm trying to write a simple macro with a for loop but keep getting this error about indentation. I've seen several threads about this and feel certain my indentation is fine, so I'm not sure what's wrong. Here it is:
# -*- coding: mbcs -*-
# Do not delete the following import lines
from abaqus import *
from abaqusConstants import *
import __main__
def ex():
    import section
    #...more imports
    import connectorBehavior
    a = mdb.models['Model'].rootAssembly
    r1 = a.referencePoints
    for i in range(0,9):
        print(r1(i))
And the error it returns is
IndentationError: ('unexpected indent', ('C:\\Users\\user\\abaqusMacros.py', 14, 1, '\tfor i in range(0,9):\n'))
Do I need to change the indentation? Thank you.
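For what it's worth, the traceback itself points at the likely cause: the offending line is reported as '\tfor i in range(0,9):', i.e. it starts with a literal tab while the surrounding lines presumably use spaces. A sketch of the same macro with spaces-only indentation (keeping your statements as they are; the abaqus modules are only available inside Abaqus):

def ex():
    import section
    #...more imports
    import connectorBehavior
    a = mdb.models['Model'].rootAssembly
    r1 = a.referencePoints
    # indent the loop with four spaces, not a tab, so it matches the lines above
    for i in range(0, 9):
        print(r1(i))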

Python Web scraper using Beautifulsoup 4

I wanted to create a database of commonly used words. Right now when I run this script it works fine, but my biggest issue is that I need all of the words to be in one column. I feel like what I did was more of a hack than a real fix. Using Beautiful Soup, can you print everything in one column without having extra blank lines?
import requests
import re
from bs4 import BeautifulSoup
# Website you want to scrape info from
res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
# Getting just the content using bs4
soup = BeautifulSoup(res.content, "lxml")
# Creating the CSV file
commonFile = open('common_words.csv', 'wb')
# Grabbing the lines you want
for node in soup.findAll("tr"):
    # Getting just the text and removing the html
    words = ''.join(node.findAll(text=True))
    # Removing the extra lines
    ID = re.sub(r'[\t\r\n]', '', words)
    # Needed to add a break in the line to make the rows
    update = ''.join(ID)+'\n'
    # Now we add this to the file
    commonFile.write(update)
commonFile.close()
How about this?
import requests
import csv
from bs4 import BeautifulSoup
f = csv.writer(open("common_words.csv", "w"))
f.writerow(["common_words"])
# Website you want to scrape info from
res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
# Getting just the content using bs4
soup = BeautifulSoup(res.content, "lxml")
words = soup.select('div[class=file] tr')
for i in range(len(words)):
    word = words[i].text
    f.writerow([word.replace('\n', '')])
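One extra detail (an assumption on my part, since the platform isn't mentioned): on Windows with Python 3, csv.writer inserts blank rows unless the file is opened with newline='', so the open call in the snippet above could be written as:

import csv

# newline='' lets the csv module manage line endings itself,
# which avoids the blank line after every row on Windows
f = csv.writer(open("common_words.csv", "w", newline=''))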

simple web crawler

I wrote the program below in Python as a very simple web crawler, but when I run it I get
'NoneType' object is not callable. Could you please help me?
import BeautifulSoup
import urllib2
def union(p,q):
    for e in q:
        if e not in p:
            p.append(e)

def crawler(SeedUrl):
    tocrawl=[SeedUrl]
    crawled=[]
    while tocrawl:
        page=tocrawl.pop()
        pagesource=urllib2.urlopen(page)
        s=pagesource.read()
        soup=BeautifulSoup.BeautifulSoup(s)
        links=soup('a')
        if page not in crawled:
            union(tocrawl,links)
            crawled.append(page)
    return crawled

crawler('http://www.princeton.edu/main/')
[UPDATE] Here is the complete project code
https://bitbucket.org/deshan/simple-web-crawler
[ANSWER]
soup('a') returns the complete HTML tag, e.g.
<a href="...">Buy Music Now</a>
so passing it to urlopen gives the error
'NoneType' object is not callable. You need to extract only the URL/href:
links=soup.findAll('a',href=True)
for l in links:
    print(l['href'])
You need to validate the URL too. Refer to the following answers:
How do you validate a URL with a regular expression in Python?
Python - How to validate a url in python ? (Malformed or not)
Again, I would suggest using Python sets instead of arrays; you can easily add URLs and omit duplicates.
http://docs.python.org/2/library/sets.html
Try the following code:
import re
import httplib
import urllib2
from urlparse import urlparse
import BeautifulSoup
regex = re.compile(
    r'^(?:http|ftp)s?://' # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
    r'localhost|' # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
    r'(?::\d+)?' # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def isValidUrl(url):
    if regex.match(url) is not None:
        return True
    return False

def crawler(SeedUrl):
    tocrawl=[SeedUrl]
    crawled=[]
    while tocrawl:
        page=tocrawl.pop()
        print 'Crawled:'+page
        pagesource=urllib2.urlopen(page)
        s=pagesource.read()
        soup=BeautifulSoup.BeautifulSoup(s)
        links=soup.findAll('a',href=True)
        if page not in crawled:
            for l in links:
                if isValidUrl(l['href']):
                    tocrawl.append(l['href'])
            crawled.append(page)
    return crawled

crawler('http://www.princeton.edu/main/')
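Following up on the sets suggestion above, a small variant (a sketch reusing isValidUrl and the imports from the previous snippet) where crawled is a set, so duplicate pages are skipped without scanning a list:

def crawler(seed_url):
    tocrawl = [seed_url]
    crawled = set()  # set membership tests are O(1) and drop duplicates for free
    while tocrawl:
        page = tocrawl.pop()
        if page in crawled:
            continue
        soup = BeautifulSoup.BeautifulSoup(urllib2.urlopen(page).read())
        for l in soup.findAll('a', href=True):
            if isValidUrl(l['href']) and l['href'] not in crawled:
                tocrawl.append(l['href'])
        crawled.add(page)
    return crawled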