I'm a newbie to Python and started playing with NLTK this evening.
I've read in a file with pandas and now trying to call one column, the description. I'm hitting a problem with the code because it was originally written to call a bit of text in script.
I've tried to amend it but think I've broken it. Code and errors below
import nltk
import pandas as pd
data = pd.read_csv("data.csv", sep=",")
#data.head()
datatext = data['Description']
#datatext.head()
import nltk
from nameparser import HumanName
#nltk.download()
def get_human_names(datatext):
tokens = nltk.tokenize.word_tokenize(datatext.read())
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
person_list = []
person = []
name = ""
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
for leaf in subtree.leaves():
person.append(leaf[0])
if len(person) > 1: #avoid grabbing lone surnames
for part in person:
name += part + ' '
if name[:-1] not in person_list:
person_list.append(name[:-1])
name = ''
person = []
return (person_list)
text = datatext
names = get_human_names(datatext)
print "LAST, FIRST"
for name in names:
last_first = HumanName(name).last + ', ' + HumanName(name).first
print last_first
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-18-2f6ec1c0b04c> in <module>()
21 text = datatext
22
---> 23 names = get_human_names(datatext)
24 print "LAST, FIRST"
25 for name in names:
<ipython-input-18-2f6ec1c0b04c> in get_human_names(datatext)
1 def get_human_names(datatext):
----> 2 tokens = nltk.tokenize.word_tokenize(datatext.read())
3 pos = nltk.pos_tag(tokens)
4 sentt = nltk.ne_chunk(pos, binary = False)
5 person_list = []
C:\Anaconda2\lib\site-packages\pandas\core\generic.pyc in __getattr__(self, name)
2358 return self[name]
2359 raise AttributeError("'%s' object has no attribute '%s'" %
-> 2360 (type(self).__name__, name))
2361
2362 def __setattr__(self, name, value):
AttributeError: 'Series' object has no attribute 'read'
I'm trying to write a parser which will take a url and download it's html in a .html file. Then it'll go through the html file to find all links and save them as well. I want to repeat it multiple time. Can some one please help a little?
This is the code I have written:
import requests
import urllib2
from bs4 import BeautifulSoup
link_set = set()
count = 1
give_url = raw_input("Enter url:\t")
def magic(give_url):
page = urllib2.urlopen(give_url)
page_content = page.read()
with open('page_content.html', 'w') as fid:
fid.write(page_content)
response = requests.get(give_url)
html_data = response.text
soup = BeautifulSoup(html_data)
list_items = soup.find_all('a')
for each_item in list_items:
html_link = each_item.get('href')
link_set.add(give_url + str(html_link))
magic(give_url)
for each_item in link_set:
print each_item
print "\n"
Although it's working fine but When I try to call the magic function in for loop, i get RuntimeError: Set changed size during iteration.
I got it working.
The code for recursive URL parsing using beautiful soup:
import requests
import urllib2
from bs4 import BeautifulSoup
link_set = set()
give_url = raw_input("Enter url:\t")
def magic(give_url, link_set, count):
# print "______________________________________________________"
#
# print "Count is: " + str(count)
# count += 1
# print "THE URL IT IS SCRAPPING IS:" + give_url
page = urllib2.urlopen(give_url)
page_content = page.read()
with open('page_content.html', 'w') as fid:
fid.write(page_content)
response = requests.get(give_url)
html_data = response.text
soup = BeautifulSoup(html_data)
list_items = soup.find_all('a')
for each_item in list_items:
html_link = each_item.get('href')
if(html_link is None):
pass
else:
if(not (html_link.startswith('http') or html_link.startswith('https'))):
link_set.add(give_url + html_link)
else:
link_set.add(html_link)
# print "Total links in the given url are: " + str(len(link_set))
magic(give_url,link_set,0)
link_set2 = set()
link_set3 = set()
for element in link_set:
link_set2.add(element)
count = 1
for element in link_set:
magic(element,link_set3,count)
count += 1
for each_item in link_set3:
link_set2.add(each_item)
link_set3.clear()
count = 1
print "Total links scraped are: " + str(len(link_set2))
for element in link_set2:
count +=1
print "Element number " + str(count) + "processing"
print element
print "\n"
There are many mistakes so I request you all to please tell me where I can improve the code.
I'm trying to run this code
from math import sqrt
import numpy as np
import warnings
from collections import Counter
import pandas as pd
import random
def k_nearest_neighbors(data,predict, k =3):
if len(data) >= k:
warnings.warn('K is set to a value less than total voting groups')
distances = []
for group in data:
for features in data[group]:
eucliden_distance = np.linalg.norm(np.array(features)-np.array(predict))
distances.append([eucliden_distance,group])
votes = [i[1] for i in sorted(distances)[:k]]
print(Counter(votes).most_common(1))
vote_result = Counter(votes).most_common(1)[0][0]
return vote_result
df = pd.read_csv('bc2.txt')
df.replace('?',-99999,inplace=True)
df.drop(['id'],1,inplace = True)
full_data = df.astype(float).values.tolist()
random.shuffle(full_data)
test_size = 0.2
train_set = {2:[],4:[]}
test_set = {2:[],4:[]}
train_data = full_data[:-int(test_size*len(full_data))]
test_data = full_data[-int(test_size*len(full_data)):]
for i in train_data:
train_set[i[-1]].append(i[:-1])
for i in train_data:
test_set[i[-1]].append(i[:-1])
correct = 0
total = 0
for group in test_set:
for data in test_set[group]:
vote = k_nearest_neighbors(train_set,data, k=5)
if group == vote:
correct += 1
total += 1
print ('Accuracy:',correct/total)
it comes out with this error msg
File "ml8.py", line 38, in <module>
train_set[i[-1]].append(i[:-1])
KeyError: 1.0
file m18.py is this above code file
below is the sample of txt file
id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
1000025,2,5,1,1,1,2,1,3,1,1
1002945,2,5,4,4,5,7,10,3,2,1
1015425,2,3,1,1,1,2,2,3,1,1
1016277,2,6,8,8,1,3,4,3,7,1
1017023,2,4,1,1,3,2,1,3,1,1
1017122,4,8,10,10,8,7,10,9,7,1
1018099,2,1,1,1,1,2,10,3,1,1
1018561,2,2,1,2,1,2,1,3,1,1
1033078,2,2,1,1,1,2,1,1,1,5
1033078,2,4,2,1,1,2,1,2,1,1
1035283,2,1,1,1,1,1,1,3,1,1
1036172,2,2,1,1,1,2,1,2,1,1
1041801,4,5,3,3,3,2,3,4,4,1
I'm using 2.7.11 version
Your train_set only contains keys 2 and 4, whereas your classes in that sample are 1 and 5.
Instead of using
train_set = {2:[],4:[]}
you might have better luck with defaultdict:
from collections import defaultdict
train_set = defaultdict(list)
This way a non-existent key will be initialized to a new empty list on first access.
I created a list comprehension to provide me the following:
listoflists = [[] for i in range(252*5)]
I then simplified the list in variable newlists to contain only the number of lists in range(weeks) which is a dynamic variable.
I want to append each individual list in the following loop for a specified range with the append process moving through each list after its reached a specified length. The values are generated from an input function. For instance, if the first list in newlists exceeds a length of 5 I want the values following the 5th loop to then append to the next list and so on. The code I currently have is:
p = 0
singlist = []
listoflists = [[] for i in range(252*5)]
newlists= [listoflists[i] for i in range(weeks)]
while p<(int(people)*weeks): #fix appending process
for i in range(int(people)*weeks):
weekly =input("Put your hours: ")
singlist.append(int(weekly))
p += 1
if weekly.isalpha() == True:
print("Not a valid amount of time")
for i in range(0,weeks):
while len(newlists[i])<int(people):
newlists[i].append(singlist[i])
This code however appends the same values to all lists in range weeks. What is the most efficient way to fix this? Thank you!
if singlist = [10,15,20,25]
desire output for newlists is: [[10,15],[20,25]]
How I've structured the program:
import sys
import numpy as np
import pandas as pd
from datetime import tzinfo,timedelta,datetime
import matplotlib.pyplot as plt
import itertools as it
from itertools import count,islice
team = []
y = 0
while y == 0:
try:
people = input("How many people are on your engagement? ")
if people.isdigit() == True:
y += 1
except:
print("Not a number try again")
z= 0
while z<int(people):
for i in range(int(people)):
names = input("Name: ")
if names.isalpha() == False:
team.append(names)
z+=1
elif names.isdigit() == True:
print("Not a name try again")
ties = [] # fix looping for more than one person
e = 0
while e<int(people):
for i in range(int(people)):
title = input("What is their title: ")
if title.isdigit() == True:
print("Not a title try again")
else:
ties.append(title)
e+=1
values = [] #fix looping for more than one person
t= 0
while t <int(people):
for i in range(int(people)):
charge = input("How much are you charging for them: ")
if charge.isalpha() == True:
print("Not a valid rate")
else:
values.append(int(charge))
t +=1
weeks = int(input("How many weeks are you including: "))
days = []
x = 0
while x<weeks: #include a parameter for dates of a 7 day difference to only be permitted
try:
for i in range(int(weeks)):
dates = input("Input the dates (mm/dd/yy): ")
dt_start = datetime.strptime(dates,'%m/%d/%y')
days.append(dates)
x+=1
except:
print("Incorrect format")
p = 0
singlist = []
listoflists = [[] for i in range(252*5)]
newlists= [listoflists[i] for i in range(weeks)]
while p<(int(people)*weeks): #fix appending process
for i in range(int(people)*weeks):
weekly =input("Put your hours: ")
singlist.append(int(weekly))
p += 1
if weekly.isalpha() == True:
print("Not a valid amount of time")
def func(items,n):
items = iter(items)
for i in it.count():
out = it.islice(items,weeks*i,weeks*i+n)
if not out:
break
output = list(func(singlist,weeks))
# items = [1,2,3,...n]
# output = [[1,2],[3,4],..], n = 2 elements each
items_ = iter(items)
outiter = iter(lambda: [next(items_) for i in range(n)],[])
outlist = list(outiter)
You can do the same thing using while loop in place of count() and [a:b] slice operation on list instead of islice(). But using iterators is very efficient.
Hello I have written a python program to parse data specific data from txt file
my code is:
f = open('C:/Users/aikaterini/Desktop/Ericsson_PARSER/BSC_alarms_vf_OSS.txt','r')
from datetime import datetime
import MySQLdb
def firstl():
with f as lines:
lines = lines.readlines()
print len(lines)
for i,line in enumerate(lines):
if line.startswith("*** Disconnected from"):
conline = line.split()
bsc = conline[-2]
print "\n"*5
print bsc
print "*"*70
break
for i,line in enumerate(lines):
if line.startswith("*** Connected to"):
conline = line.split()
bsc = conline[-2]
print "\n"*5
print bsc
print "*"*70
elif line[:3] == "A1/" or line[:3] == "A2/":
if lines[i+1].startswith("RADIO"):
fal = line.split()
first_alarm_line = [fal[0][:2],fal[-2],fal[-1]]
year = first_alarm_line[1][:2]
month = first_alarm_line[1][2:4]
day = first_alarm_line[1][4:]
hours = first_alarm_line[2][:2]
minutes = first_alarm_line[2][2:]
date = datetime.strptime( day + " " + month + " " + year + " " + \
hours+":"+minutes,"%d %m %y %H:%M")
print first_alarm_line
print date, "\n"
print lines[i+1]
print lines[i+4]
print lines[i+5]
desc_line = lines[i+4]
desc_values_line = lines[i+5]
desc = desc_line.split(None,2)
print desc
desc_values = desc_values_line.split(None,2)
rsite = ""
#for x in desc_values[1]:
# if not (x.isalpha() or x == "0"):
# rsite += x
rsite = desc_values[1].lstrip('RBS0')
print "\t"*2 + "rsite:" + rsite
if desc[-1] == "ALARM SLOGAN\n":
alarm_slogan = desc_values[-1]
print alarm_slogan
x = i
print x # to check the line
print len(line) #check length of lines
while not lines[x].startswith("EXTERNAL"):
x+=1
if lines[x].startswith("EXTERNAL"):
while not lines[x] == "\n":
print lines[x]
x+=1
print "\n"*5
elif lines[i+1].startswith("CELL LOGICAL"):
fal = line.split()
first_alarm_line = [fal[0][:2],fal[-2],fal[-1]]
#print i
print first_alarm_line
type = lines[i+1]
print type
cell_line = lines[i+3]
cell = cell_line.split()[0]
print cell
print "\n"*5
##########Database query###########
#db = MySQLdb.connect(host,user,password,database)
firstl()
when i run the program the results are correct
but it prints until line 50672 while there are 51027
and i get the last printed result with the following error:
['A2', '130919', '0309']
2013-09-19 03:09:00
RADIO X-CEIVER ADMINISTRATION
MO RSITE ALARM SLOGAN
RXOCF-18 RBS03668 OML FAULT
['MO', 'RSITE', 'ALARM SLOGAN\n']
rsite:3668
OML FAULT
50672
51027
Traceback (most recent call last):
File "C:\Python27\parser_v3.py", line 106, in <module>
firstl()
File "C:\Python27\parser_v3.py", line 72, in firstl
while not lines[x].startswith("EXTERNAL"):
IndexError: list index out of range
if i comment the while not line i get :
Traceback (most recent call last):
File "C:\Python27\parser_v3.py", line 106, in <module>
firstl()
File "C:\Python27\parser_v3.py", line 60, in firstl
rsite = desc_values[1].lstrip('RBS0')
IndexError: list index out of range
The txt content is like :
A1/EXT "FS G11B/25/13/3" 382 150308 1431
RADIO X-CEIVER ADMINISTRATION
BTS EXTERNAL FAULT
MO RSITE CLASS
RXOCF-16 RBS02190 1
EXTERNAL ALARM
ALARM SYSTEM ON/OFF G2190 DRAMA CNR
A1/EXT "FS G11B/25/13/3" 755 150312 1434
RADIO X-CEIVER ADMINISTRATION
BTS EXTERNAL FAULT
MO RSITE CLASS
RXOCF-113 RBS00674 1
EXTERNAL ALARM
IS.BOAR FAIL G0674 FALAKRO
I don't understand since i do a split with maxnumber 2 and i get 3 elements as u can see and i am picking the 2nd and if i comment that i get another error when i pick an element from a list and the thing is that returning the correct result.Please help me.
Sorry for the long post thank you in advance.
I'm haven't dug deep into your code, but have you tried validating that x does not exceed the number of elements in lines before trying to access that index? Also, for readability I'd suggest using lines[x] != rather than not lines[x] ==
while x < len(lines) and lines[x] != "\n":
I solved it although i don't know if it is correct way but it works.
I think the problem was that the x was exceeding the length of the list lines containing the file and there had to be a check after the split that the list had length larger or equal to number of elements so :
if len(desc_values) > 2 and len(desc) > 2:
rsite = desc_values[1].lstrip('RBS0')
print "\t"*2 + "rsite:" + rsite
if desc[-1] == "ALARM SLOGAN\n":
alarm_slogan = desc_values[-1]
print alarm_slogan
x = i
print x #to check the line
print len(lines) # check length of lines
while [x] < len(lines): #check so that x doesnt exceed the length of file list "line"
while not lines[x].startswith("EXTERNAL"):
x+=1
if lines[x].startswith("EXTERNAL"):
while lines[x] != "\n":
print lines[x]
x+=1
Thank you man you really helped me although i am trying to find a way to stop the iteration of x to gain some computation time i tried break but it throws you completely of the loop.
Thanks anyway