Replace White-space with hyphen then create URL - python-2.7

I'm trying to speed up a process of webscraping by sending raw data to python in lieu of correctly formatted data.
Current data is received as an excel file with data formatted as:
26 EXAMPLE RD EXAMPLEVILLE SA 5000
Data is formatted in excel via macros to:
Replace all spaces with hyphen
Change all text to lower-case
Paste text onto end of http://example.com/property/
Formatted data is http://www.example.com/property/26-example-rd-exampleville-sa-5000
What i'm trying to accomplish:
Get python to go into excel sheet and follow formatting rules listed above, then pass the records to the scraper.
Here is the code I have been trying to compile - please go easy i am VERY new.
Any advice or reading sources related to python formatting would be appreciated.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import csv
from lxml import html
import xlrd
# URL_BUILDER
# Source File for UNFORMATTED DATA
file_location = "C:\Python27\Projects\REA_SCRAPER\NewScraper\ScrapeFile.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_name('((PythonScraperDNC))')
# REA_SCRAPER
# Pass Data from URL_BUILDER to URL_LIST []
URL_LIST = []
# Search Phrase to capture suitable URL's for Scraping
text2search = \
'''<p class="property-value__title">
RECENTLY SOLD
</p>'''
# Write Sales .CSV file
with open('Results.csv', 'wb') as csv_file:
writer = csv.writer(csv_file)
for (index, url) in enumerate(URL_LIST):
page = requests.get(url)
print '<Scanning Url For Sale>'
if text2search in page.text:
tree = html.fromstring(page.content)
(title, ) = (x.text_content() for x in tree.xpath('//title'))
(price, ) = (x.text_content() for x in tree.xpath('//div[#class="property-value__price"]'))
(sold, ) = (x.text_content().strip() for x intree.xpath('//p[#class="property-value__agent"]'))
writer.writerow([title, price, sold])
else:
writer.writerow(['No Sale'])

If you're just trying to figure out how to do the formatting in Python:
text = '26 EXAMPLE RD EXAMPLEVILLE SA 5000'
url = 'http://example.com/property/' + text.replace(' ', '-').lower()
print(url)
# Output:
# http://example.com/property/26-example-rd-exampleville-sa-5000

Related

How do I write all of these rows into a CSV file for a given range?

The purpose of the below code is the webscrape the oxford english dictionary for words that were "invented" in each year within a range of years. This all works as intended.
import csv
import os
import re
import requests
import urllib2
year_start= 1550
year_end = 1552
subject_search = ['Law']
for year in range(year_start, year_end +1):
path = '/Applications/Python 3.5/Economic'
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
urllib2.install_opener(opener)
user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
header = {'User-Agent':user_agent}
resultPath = os.path.join(path, 'OED_table.csv')
htmlPath = os.path.join(path, 'OED.html')
request = urllib2.Request('http://www.oed.com/search?browseType=sortAlpha&case-insensitive=true&dateFilter='+ str(year)+ '&nearDistance=1&ordered=false&page=1&pageSize=100&scope=ENTRY&sort=entry&subjectClass='+ str(subject_search)+ '&type=dictionarysearch', None, header)
page = opener.open(request)
with open(resultPath, 'wb') as outputw, open(htmlPath, 'w') as outputh:
urlpage = page.read()
outputh.write(urlpage)
new_words = re.findall(r'<span class=\"hwSect\"><span class=\"hw\">(.*?)</span>', urlpage)
print new_words
csv_writer = csv.writer(outputw)
if csv_writer.writerow([year] + new_words):
csv_writer.writerow([year, word])
However, when I actually run the code, the only portion that gets written to the csv file is the very last year that I call. So, my csv file ends up looking like a one row like this:
1552, word1, word2, word3, etc....
I basically want to have a separate row for each year in the range of years. How do I go about this?
You keep overwriting in the loop and every time you run the code, open it once outside the loops and append to the file opening with a instead of w so each run of the code will add to the existing data not overwrite.:
with open("/Applications/Python 3.5/Economic/OED_table.csv", 'a') as outputw, open("/Applications/Python 3.5/Economic/OED.html", 'a') as outputh:
for year in range(year_start, year_end +1):
.....................

Python Web scraper using Beautifulsoup 4

I wanted to create a database with commonly used words. Right now when I run this script it works fine but my biggest issue is I need all of the words to be in one column. I feel like what I did was more of a hack than a real fix. Using Beautifulsoup, can you print everything in one column without having extra blank lines?
import requests
import re
from bs4 import BeautifulSoup
#Website you want to scrap info from
res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
# Getting just the content using bs4
soup = BeautifulSoup(res.content, "lxml")
# Creating the CSV file
commonFile = open('common_words.csv', 'wb')
# Grabbing the lines you want
for node in soup.findAll("tr"):
# Getting just the text and removing the html
words = ''.join(node.findAll(text=True))
# Removing the extra lines
ID = re.sub(r'[\t\r\n]', '', words)
# Needed to add a break in the line to make the rows
update = ''.join(ID)+'\n'
# Now we add this to the file
commonFile.write(update)
commonFile.close()
How about this?
import requests
import csv
from bs4 import BeautifulSoup
f = csv.writer(open("common_words.csv", "w"))
f.writerow(["common_words"])
#Website you want to scrap info from
res = requests.get("https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt")
# Getting just the content using bs4
soup = BeautifulSoup(res.content, "lxml")
words = soup.select('div[class=file] tr')
for i in range(len(words)):
word = words[i].text
f.writerow([word.replace('\n', '')])

how to lookup the numbers next to character using python

this is just part of the long python script. there is a file called aqfile and it has many parameters. I would like to extract what is next to "OWNER" and "NS".
Note:
OWNER = text
NS = numbers
i could extract what is next to OWNER, because they were just text and i could extract.
for line in aqfile.readlines():
if string.find(line,"OWNER")>0:
print line
m=re.search('<(.*)>',line)
owner=incorp(m.group(1))
break
but when i try to modify the script to extract the numbers
for line in aqfile.readlines():
if string.find(line,"NS")>0:
print line
m=re.search('<(.*)>',line)
ns=incorp(m.group(1))
break
it doesnt work any more.
Can anyone help me?
this is the whole script
#Make a CSV file of datasetnames. pulseprog and, if avaible, (part of) the title
#Note: the whole file tree is read into memory!!! Do not start too high in the tree!!!
import os
import os.path
import fnmatch
import re
import string
max=20000
outfiledesc=0
def incorp(c):
#Vervang " door """ ,CRLF door blankos
c=c.replace('"','"""')
c=c.replace("\r"," ")
c=c.replace("\n"," ")
return "\"%s\"" % (c)
def process(arg,root,files):
global max
global outfiledesc
#Get name,expno,procno from the root
if "proc" in files:
procno = incorp(os.path.basename(root))
oneup = os.path.dirname(root)
oneup = os.path.dirname(oneup)
aqdir=oneup
expno = incorp(os.path.basename(oneup))
oneup = os.path.dirname(oneup)
dsname = incorp(os.path.basename(oneup))
#Read the titlefile, if any
if (os.path.isfile(root + "/title")):
f=open(root+"/title","r")
title=incorp(f.read(max))
f.close()
else:
title=""
#Grab the pulse program name from the acqus parameter
aqfile=open(aqdir+"/acqus")
for line in aqfile.readlines():
if string.find(line,"PULPROG")>0:
print line
m=re.search('<(.*)>',line)
pulprog=incorp(m.group(1))
break
towrite= "%s;%s;%s;%s;%s\n" % (dsname,expno,procno,pulprog,title)
outfiledesc.write(towrite)
#Main program
dialogline1="Starting point of the search"
dialogline2="Maximum length of the title"
dialogline3="output CSV file"
def1="/opt/topspin3.2/data/nmrafd/nmr"
def2="20000"
def3="/home/nmrafd/filelist.csv"
result = INPUT_DIALOG("CSV file creator","Create a CSV list",[dialogline1,dialogline2,dialogline3],[def1,def2,def3])
start=result[0]
tlength=int(result[1])
outfile=result[2]
#Search for procs files. They should be in any dataset.
outfiledesc = open(outfile,"w")
print start
os.path.walk(start,process,"")
outfiledesc.close()

removing double quotes and brackets from csv in python

I am trying to remove quotes and brackets from csv in python,I tryed for the folloing code but it can't give proper csv the code is:
import json
import urllib2
import re
import os
from BeautifulSoup import BeautifulSoup
import csv
u = urllib2.urlopen("http://timesofindia.indiatimes.com/")
content = u.read()
u.close()
soup2 = BeautifulSoup(content)
blog_posts = []
for e in soup2.findAll("a", attrs={'pg': re.compile('^Head')}):
for b in soup2.findAll("div", attrs={'style': re.compile('^color:#ffffff;font-size:12px;font-family:arial;padding-top:3px;text-align:center;')}):
blog_posts.append(("The Times Of India",e.text,b.text))
print blog_posts
out_file = os.path.join('resources', 'ch05-webpages','newspapers','time1.csv')
f = open(out_file, 'wb')
wr = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
#f.write(json.dumps(blog_posts, indent=1))
wr.writerow(blog_posts)
f.close()
print 'Wrote output file to %s' % (f.name, )
the csv looks like:
"('The Times Of India', u'Missing jet: Air search expands to remote south Indian Ocean', u'Fri, Mar 21, 2014 | Updated 11.53AM IST')",
but i want csv like this:
The Times Of India,u'Missing jet: Air search expands to remote south Indian Ocean, u'Fri, Mar 21, 2014 | Updated 11.53AM IST
So what can i do for getting this type of csv?
Writer.writerow() expects a sequence containing strings or numbers. You are passing a sequence of tuples. Use Writer.writerows() instead.

How do you convert the multi-line content scraped into a list?

I was trying to convert the content scraped into a list for data manipulation, but got the following error: TypeError: 'NoneType' object is not callable
#! /usr/bin/python
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import os
import re
# Copy all of the content from the provided web page
webpage = urlopen("http://www.optionstrategist.com/calculators/free-volatility- data").read()
# Grab everything that lies between the title tags using a REGEX
preBegin = webpage.find('<pre>') # Locate the pre provided
preEnd = webpage.find('</pre>') # Locate the /pre provided
# Copy the content between the pre tags
voltable = webpage[preBegin:preEnd]
# Pass the content to the Beautiful Soup Module
raw_data = BeautifulSoup(voltable).splitline()
The code is very simple. This is the code for BeautifulSoup4:
# Find all <pre> tag in the HTML page
preTags = webpage.find_all('pre')
for tag in preTags:
# Get the text inside the tag
print(tag.get_text())
Reference:
find_all()
Kinds of filters to put into name field of find()/findall()
get_text()
To get the text from the first pre element:
#!/usr/bin/env python
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
url = "http://www.optionstrategist.com/calculators/free-volatility-data"
soup = BeautifulSoup(urlopen(url))
print soup.pre.string
To extract lines with data:
from itertools import dropwhile
lines = soup.pre.string.splitlines()
# drop lines before the data table header
lines = dropwhile(lambda line: not line.startswith("Symbol"), lines)
# extract lines with data
lines = (line for line in lines if '%ile' in line)
Now each line contains data in a fixed-column format. You could use slicing and/or regex to parse/validate individual fields in each row.