How do I delete the common words from two documents extracted from two websites? I have already extracted the news from two sites; now I want to delete the words that appear in both documents. I used the following code to extract the news:
from __future__ import unicode_literals
import feedparser
import re

d = feedparser.parse('http://feeds.bbci.co.uk./news/rss.xml')
i = 0
for post in d.entries:
    titl = post.title
    desc = post.description
    titl2 = titl.replace('\\', " ")
    desc1 = desc.replace('/', " ")
    print(str(i) + ' ' + titl2)
    i = i + 1
print("Indian Express")
g = feedparser.parse('http://www.rssmicro.com/rss.web?q=Android')
i = 0
for post in g.entries:
    tit = post.title
    #desc = post.description
    tit4 = tit.replace('\\', " ")
    print(str(i) + ' ' + tit4)
    i = i + 1
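One way to remove the shared words, once the titles from each feed have been collected into two text blobs, is a set intersection over the tokenized words. A minimal sketch (doc1 and doc2 are hypothetical stand-ins for the text accumulated from the two feeds above):

import re

# Hypothetical documents built by concatenating the titles from each feed.
doc1 = "Android update released for all phones"
doc2 = "new Android phones announced today"

words1 = re.findall(r'\w+', doc1.lower())
words2 = re.findall(r'\w+', doc2.lower())

# Words that occur in both documents.
common = set(words1) & set(words2)

# Keep only the words unique to each document.
filtered1 = [w for w in words1 if w not in common]
filtered2 = [w for w in words2 if w not in common]

print(filtered1)
print(filtered2)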
I've been using python-docx to produce large documents full of tables and figures conforming to a standard template. I have discovered how to make them cross-referenceable using https://github.com/python-openxml/python-docx/issues/359. However, this labels my figures/tables starting at 1 within each section, restarting from 1 at each new section.
I would like the figure numbers to depend on the section number (i.e. the 1st figure in the 2nd section would be Figure 2.1, etc.). Does anyone know if this is possible?
Currently the numbering is produced by the function:
def Table(paragraph):
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn
    run = paragraph.add_run()
    r = run._r
    fldChar = OxmlElement('w:fldChar')
    fldChar.set(qn('w:fldCharType'), 'begin')
    r.append(fldChar)
    instrText = OxmlElement('w:instrText')
    # Raw string so the field switches \* and \s reach Word intact
    instrText.text = r' SEQ TableMain \* ARABIC \s 1 '
    r.append(instrText)
    fldChar = OxmlElement('w:fldChar')
    fldChar.set(qn('w:fldCharType'), 'end')
    r.append(fldChar)
Called by the following code, which also populates the table, table title, and footer:
table3 = document.add_table(rows=1, cols=1)
table3.cell(0, 0).text = "Table "
for paragraph in table3.cell(0, 0).paragraphs:
    paragraph.style = document.styles['Caption']
    Table(paragraph)
    paragraph.add_run(text="this is the full table name")
row_cells = table3.add_row().cells
call_func_that_makes_actual_table(row_cells[0], ...)
row_cells = table3.add_row().cells
row_cells[0].text = "Source: ..."
for paragraph in row_cells[0].paragraphs:
    paragraph.style = document.styles['Source']
This produces captions numbered continuously ("Table 1", "Table 2", ...), whereas I would like section-based numbering ("Table 2.1", etc.).
Managed to work this out myself. The solution is adding a further function:
def section(paragraph):
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn
    run = paragraph.add_run()
    r = run._r
    fldChar = OxmlElement('w:fldChar')
    fldChar.set(qn('w:fldCharType'), 'begin')
    r.append(fldChar)
    instrText = OxmlElement('w:instrText')
    # STYLEREF 1 \s inserts the number of the current Heading 1 section
    instrText.text = r' STYLEREF 1 \s '
    r.append(instrText)
    fldChar = OxmlElement('w:fldChar')
    fldChar.set(qn('w:fldCharType'), 'end')
    r.append(fldChar)
and changing the call to:
for paragraph in table.cell(1, 0).paragraphs:
    paragraph.style = document.styles['Caption']
    section(paragraph)
    paragraph.add_run(text=".")
    Figure(paragraph)
    paragraph.add_run(text=": this is the full table name")
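For completeness, Figure here would presumably be the same SEQ-field helper as Table above, just with its own sequence name. A sketch under that assumption:

def Figure(paragraph):
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn
    run = paragraph.add_run()
    r = run._r
    fldChar = OxmlElement('w:fldChar')
    fldChar.set(qn('w:fldCharType'), 'begin')
    r.append(fldChar)
    instrText = OxmlElement('w:instrText')
    # A separate 'Figure' sequence, assumed by analogy with TableMain above
    instrText.text = r' SEQ Figure \* ARABIC \s 1 '
    r.append(instrText)
    fldChar = OxmlElement('w:fldChar')
    fldChar.set(qn('w:fldCharType'), 'end')
    r.append(fldChar)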
I am currently trying to open a file and use shlex.split to segment its lines. Here is an example of 2 lines from the text file:
set group address "Untrust" "This is a test group"
set group address "Untrust" "This is a test group" add "Test-address"
When I run my code it says "IndexError: list index out of range". I realize that this is because the first line has no linetoken[5]. Since both lines begin almost identically, how do I get the code to skip past the first line and only act on the second? My current code is below. The user input and count are for entering zones and then looping through them; I erased most of that code in an attempt to fix this issue first.
import shlex
import sys

def main():
    zone = raw_input('enter zones: ')
    zone = shlex.split(zone)
    count = 0
    configfile = open('convert.txt', 'r')
    for configline in configfile:
        with open('converted.txt', 'a') as converted:
            linetoken = shlex.split(configline)
            if linetoken[0] == 'set' and linetoken[1] == 'group' and linetoken[5] == 'add':
                converted.write(linetoken[0] + ' ' + linetoken[1] + ' ' + linetoken[2] + ' ' + linetoken[3] + ' ' + linetoken[4] + ' ' + linetoken[5])
                break

main()
I figured out the answer to my problem, so I figured I would post the solution. To get past the first line I had it check that the word 'add' wasn't in linetoken; if that is true the statement passes and control continues to the elif. Below is my new code.
import shlex
import sys

def main():
    zone = raw_input('enter zones: ')
    zone = shlex.split(zone)
    count = 0
    configfile = open('convert.txt', 'r')
    for configline in configfile:
        with open('converted.txt', 'a') as converted:
            linetoken = shlex.split(configline)
            if linetoken[0] == 'set' and linetoken[1] == 'group' and 'add' not in linetoken:
                pass
            elif linetoken[0] == 'set' and linetoken[1] == 'group' and linetoken[5] == 'add':
                converted.write(linetoken[0] + ' ' + linetoken[1] + ' ' + linetoken[2] + ' ' + linetoken[3] + ' ' + linetoken[4] + ' ' + linetoken[5])
                break

main()
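A simpler guard, for what it's worth, is to check the token count before indexing, which avoids the IndexError without the extra pass branch. A sketch, not the poster's code:

import shlex

with open('convert.txt', 'r') as configfile, open('converted.txt', 'a') as converted:
    for configline in configfile:
        linetoken = shlex.split(configline)
        # Only lines with at least six tokens can have 'add' at index 5
        if len(linetoken) > 5 and linetoken[0] == 'set' and linetoken[1] == 'group' and linetoken[5] == 'add':
            converted.write(' '.join(linetoken[:6]) + '\n')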
The following is my Python 3 program to display the 12 subcategories of a Wikipedia category. It prints all 12 subcategories. Now I want to print only the first 3 subcategories. How? (Later on, while developing my program, I am going to write all 12 subcategories to a file.)
from bs4 import BeautifulSoup
import requests

url = 'https://en.wikipedia.org/wiki/Category:proprietary software'
content = requests.get(url).content
soup = BeautifulSoup(content, 'lxml')
noOFsubcategories = soup.find('p')
print('------------------------------------------------------------------')
print(noOFsubcategories.text + '------------------------------------------------------------------')
tag = soup.find('div', {'class': 'mw-category'})
links = tag.findAll('a')
counter = 1
for link in links:
    print(str(counter) + " " + link.text)
    counter = counter + 1
You can simply do for link in links[:3]: to display only the first three elements of the list.
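Applied to the loop above, and with the full list written to a file as the question plans (the filename subcategories.txt is just an assumption):

# Print only the first three subcategories
for counter, link in enumerate(links[:3], start=1):
    print(str(counter) + " " + link.text)

# Write all of them to a file (hypothetical filename)
with open('subcategories.txt', 'w') as f:
    for link in links:
        f.write(link.text + '\n')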
I am trying to scrape a list of all the restaurants in Hong Kong and their corresponding URLs. Currently, in my code below, I am able to scrape the 1st and 2nd pages. But I want the for loop towards the bottom to be more dynamic and keep scraping until it hits the number of entries I specified in range().
I am still a novice at this so any help would be awesome.
#import libraries
import requests
from bs4 import BeautifulSoup
import csv

#scrape the first page, because this URL differs from the URLs of the later pages
url0 = 'https://www.tripadvisor.com/Restaurants-g294217-Hong_Kong.html#EATERY_LIST_CONTENTS'
r = requests.get(url0)
data = r.text
soup = BeautifulSoup(r.text, "html.parser")
for link in soup.findAll('a', {'class': 'property_title'}):
    print 'https://www.tripadvisor.com/Restaurant_Review-g294217-' + link.get('href')
    print link.string
#loop to move on to the next pages. entries are in increments of 30 per page
for i in range(0, 120, 30):
    entries = str(30)  # note: fixed at 30, so only the second page is ever fetched
    #url format offsets the restaurants in increments of 30 after the 'oa'; hence entries as a variable
    url1 = 'https://www.tripadvisor.com/Restaurants-g294217-oa' + entries + '-Hong_Kong.html#EATERY_LIST_CONTENTS'
    r1 = requests.get(url1)
    data1 = r1.text
    soup1 = BeautifulSoup(data1, "html.parser")
    for link in soup1.findAll('a', {'class': 'property_title'}):
        print 'https://www.tripadvisor.com/Restaurant_Review-g294217-' + link.get('href')
        print link.string
    break
Ended up adding a while loop that got it iterating the way I wanted. Hope this helps people in the future.
i = 30
while i < 120:
    entries = str(i)
    #url format offsets the restaurants in increments of 30 after the 'oa'
    url1 = 'https://www.tripadvisor.com/Restaurants-g294217-oa' + entries + '-Hong_Kong.html#EATERY_LIST_CONTENTS'
    r1 = requests.get(url1)
    data1 = r1.text
    soup1 = BeautifulSoup(data1, "html.parser")
    for link in soup1.findAll('a', {'class': 'property_title'}):
        print 'https://www.tripadvisor.com/Restaurant_Review-g294217-' + link.get('href')
        print link.string
    i += 30
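For what it's worth, the original for loop works just as well once its loop variable is used to build the URL; the fixed entries = str(30) was what kept it stuck on the second page:

for i in range(30, 120, 30):
    url1 = 'https://www.tripadvisor.com/Restaurants-g294217-oa' + str(i) + '-Hong_Kong.html#EATERY_LIST_CONTENTS'
    #...fetch and parse as above...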
I'm trying to create a data-scraping file for a class, and the data I have to scrape requires that I use while loops to get the right data into separate arrays, i.e. for states, SAT averages, etc.
However, once I set up the while loops, the regex that cleared the majority of the HTML tags from the data broke, and I am getting an error that reads:
AttributeError: 'NoneType' object has no attribute 'groups'
My code is:
import re, util
from BeautifulSoup import BeautifulStoneSoup

# create a comma-delineated file
delim = ", "

#base url for sat data
base = "http://www.usatoday.com/news/education/2007-08-28-sat-table_N.htm"

#get webpage object for site
soup = util.mysoupopen(base)

#get column headings
colCols = soup.findAll("td", {"class": "vaTextBold"})

#get data
dataCols = soup.findAll("td", {"class": "vaText"})

#append data to cols
for i in range(len(dataCols)):
    colCols.append(dataCols[i])

#open a csv file to write the data to
fob = open("sat.csv", 'a')

#initiate the 5 arrays
states = []
participate = []
math = []
read = []
write = []

#split into 5 lists for each row
for i in range(len(colCols)):
    if i % 5 == 0:
        states.append(colCols[i])

i = 1
while i <= 250:
    participate.append(colCols[i])
    i = i + 5

i = 2
while i <= 250:
    math.append(colCols[i])
    i = i + 5

i = 3
while i <= 250:
    read.append(colCols[i])
    i = i + 5

i = 4
while i <= 250:
    write.append(colCols[i])
    i = i + 5

#write data to the file
for i in range(len(states)):
    states = str(states[i])
    participate = str(participate[i])
    math = str(math[i])
    read = str(read[i])
    write = str(write[i])
    #regex to remove html from data scraped
    #remove <td> tags
    line = re.search(">(.*)<", states).groups()[0] + delim + re.search(">(.*)<", participate).groups()[0] + delim + re.search(">(.*)<", math).groups()[0] + delim + re.search(">(.*)<", read).groups()[0] + delim + re.search(">(.*)<", write).groups()[0]
    #append data point to the file
    fob.write(line)
Any ideas why this error suddenly appeared? The regex was working fine until I tried to split the data into different lists. I have already tried printing the various strings inside the final for loop to see if any of them were None for the first i value (0), but they were all the strings they were supposed to be.
Any help would be greatly appreciated!
It looks like the regex search is failing on (one of) the strings, so it returns None instead of a MatchObject.
Try the following instead of the very long #remove <td> tags line:
import sys

out_list = []
for item in (states, participate, math, read, write):
    try:
        out_list.append(re.search(">(.*)<", item).groups()[0])
    except AttributeError:
        print "Regex match failed on", item
        sys.exit()
line = delim.join(out_list)
That way, you can find out where your regex is failing.
Also, I suggest you use .group(1) instead of .groups()[0]. The former is more explicit.
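For example, on a string that does match, both forms return the same text (a toy illustration):

m = re.search(">(.*)<", "<td>Alabama</td>")
print m.group(1)      # 'Alabama'
print m.groups()[0]   # same result, but less direct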