How to get NLTK sentence tokenizer read a column in a file

How to get NLTK sentence tokenizer read a column in a file - python-2.7

I'm a newbie to Python and started playing with NLTK this evening.
I've read in a file with pandas and now trying to call one column, the description. I'm hitting a problem with the code because it was originally written to call a bit of text in script.
I've tried to amend it but think I've broken it. Code and errors below
import nltk
import pandas as pd
data = pd.read_csv("data.csv", sep=",")
#data.head()
datatext = data['Description']
#datatext.head()
import nltk
from nameparser import HumanName
#nltk.download()
def get_human_names(datatext):
tokens = nltk.tokenize.word_tokenize(datatext.read())
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
person_list = []
person = []
name = ""
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
for leaf in subtree.leaves():
person.append(leaf[0])
if len(person) > 1: #avoid grabbing lone surnames
for part in person:
name += part + ' '
if name[:-1] not in person_list:
person_list.append(name[:-1])
name = ''
person = []
return (person_list)
text = datatext
names = get_human_names(datatext)
print "LAST, FIRST"
for name in names:
last_first = HumanName(name).last + ', ' + HumanName(name).first
print last_first
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-18-2f6ec1c0b04c> in <module>()
21 text = datatext
22
---> 23 names = get_human_names(datatext)
24 print "LAST, FIRST"
25 for name in names:
<ipython-input-18-2f6ec1c0b04c> in get_human_names(datatext)
1 def get_human_names(datatext):
----> 2 tokens = nltk.tokenize.word_tokenize(datatext.read())
3 pos = nltk.pos_tag(tokens)
4 sentt = nltk.ne_chunk(pos, binary = False)
5 person_list = []
C:\Anaconda2\lib\site-packages\pandas\core\generic.pyc in __getattr__(self, name)
2358 return self[name]
2359 raise AttributeError("'%s' object has no attribute '%s'" %
-> 2360 (type(self).__name__, name))
2361
2362 def __setattr__(self, name, value):
AttributeError: 'Series' object has no attribute 'read'

Related

django+celery+rabbitmq encode error and sig-kill

I'm now doing a little project which uses celery to turn csv and xlsx files into postgresql table.
The code below works fine without celery(except large files),but after using celery it produce some errors and bugs.
I've looked for similar questions in StackOverFlow but don't have any idea how to do and why.
Hope you guys can help me with it,thanks.
First error is as follows:
csv-1
csv-2
I think it has something to do with my encoding part, but I tried to open it with utf-8-sig and big-5, not working.(It works fine without celery)
`
# -*- coding: utf-8 -*-
from django.shortcuts import render_to_response
from django.template import RequestContext
from django.http import HttpResponseRedirect
from django.core.urlresolvers import reverse
from django.contrib import messages
from django.conf import settings
from django.db import connection
from django.views.decorators.csrf import csrf_exempt
from celery import Celery
from celery import task
import json
import csv
import sys
import random
import psycopg2
import xlrd
import openpyxl as pyxl
from .models import Document
from .forms import DocumentForm
app = Celery(
'tasks',
broker='amqp://guest:guest#localhost:5672//',
backend='rpc://'
)
CELERY_RESULT_BACKEND = 'rpc://'
CELERY_RESULT_PERSISTENT = False
#app.task()
def csvwritein(doc):# Transform csv to table
doc = doc
conn = psycopg2.connect("dbname='apidb' user='api' host='localhost'
password='eric40502' port='5432'")
readcur = conn.cursor()
readcur.execute("select exists(select * from
information_schema.tables where table_name='%s')" % doc.tablename) # check if
same file is already in database
check = readcur.fetchone()[0]
try:
fr = open(doc.path,encoding = 'utf-8-sig')
dr.delay(fr,doc,check)
fr.close()
except Exception as e:
fr = open(doc.path,encoding = 'big5')
dr.delay(fr,doc,check)
fr.close()
conn.commit()
readcur.close()
#app.task()
def dr(fr,doc,check): # make datareader as function to keep code 'dry'
csvt = 0 #count csv reader loop time
row_id = 1 # used for following id field
conn = psycopg2.connect("dbname='apidb' user='api' host='localhost'
password='eric40502' port='5432'")
maincur = conn.cursor()
writecur = conn.cursor()
datareader = csv.reader(fr, delimiter=',')
for row in datareader:
if csvt == 0: # first time in loop(create field) and check no
same file exists
if check == True:
app =
''.join([random.SystemRandom().choice('abcdefghijklmnopqrstuvwxyz0123456789')
for i in range(6)])
tname = '%s-%s' % (doc.tablename,app
tablename = '"%s-%s"' % (doc.tablename,app)
doc.tablename = tname
doc.save()
else:
tablename = '"%s"' % doc.tablename
maincur.execute("CREATE TABLE %s (id SERIAL PRIMARY
KEY);" % tablename)
row_count = sum(1 for line in datareader)
col_count = len(row)
frow = row
for i in range(0,col_count,1):
row[i] = '"%s"' % row[i] # change number to
string
maincur.execute("ALTER TABLE %s ADD %s
CITEXT;" % (tablename,row[i]))
csvt = csvt+1
fr.seek(0)
next(datareader)
elif csvt > 0: # not first time(insert data) and check no
same file exists
for j in range(0,col_count,1):
if j == 0:
writecur.execute("INSERT INTO %s (%s)
VALUES ('%s');" % (tablename,frow[j],row[j]))
else:
writecur.execute("UPDATE %s SET %s =
'%s' WHERE id = '%d';" %(tablename,frow[j],row[j],row_id))
csvt = csvt+1
row_id = row_id+1
else:
break
conn.commit()
maincur.close()
writecur.close()
conn.close()
csvt = 0
doc = Document.objects.all()
`
Second error is about turning a xlsx file(about 130,000 rows) into postgresql table, and the worker got sig-kill after 2-3 minutes.
Debug Message:
[2016-10-27 06:17:05,227: ERROR/MainProcess] Process 'Worker-1' pid:13829 exited with 'signal 9 (SIGKILL)' [2016-10-27 06:17:05,328:ERROR/MainProcess] Task data.tasks.xlsxwritein[5aec4679-c48b-4d07-a0a9-5e4e37fcd24b] raised unexpected: WorkerLostError('Worker exited prematurely: signal 9 (SIGKILL).',) Traceback (most recent call last): File "/usr/local/lib/python3.4/dist-packages/billiard/pool.py", line 1175, in mark_as_worker_lost human_status(exitcode)), billiard.exceptions.WorkerLostError: Worker exited prematurely: signal 9 (SIGKILL).
#The code continues from the above task.py file
#app.task()
def xlsxwritein(doc): # write into database for file type xlsx
xlsxt = 0
conn = psycopg2.connect("dbname='apidb' user='api' host='localhost'
password='eric40502' port='5432'")
maincur = conn.cursor()
readcur = conn.cursor()
writecur = conn.cursor()
readcur.execute("select exists(select * from
information_schema.tables where table_name='%s')" % doc.tablename) # check if
same file is already in database
check = readcur.fetchone()[0]
row_id = 1 # used for following id field
wb = pyxl.load_workbook(doc.path)
sheetnames = wb.get_sheet_names()
ws = wb.get_sheet_by_name(sheetnames[0])
for rown in range(ws.get_highest_row()):
if xlsxt == 0:
if check == True:
app =
''.join([random.SystemRandom().choice('abcdefghijklmnopqrstuvwxyz0123456789')
for i in range(6)])
tname = '%s-%s' % (doc.tablename,app)
tablename = '"%s-%s"' % (doc.tablename,app)
doc.tablename = tname
doc.save()
else:
tablename = '"%s"' % doc.tablename
field = [ws.cell(row=1,column=col_index).value for
col_index in range(1,ws.get_highest_column()+1)]
maincur.execute("CREATE TABLE %s (id SERIAL PRIMARY
KEY);" % tablename)
for coln in range(ws.get_highest_column()):
field[coln] = '"%s"' % field[coln] # change
number to string
if field[coln] == 'ID':
field[coln] = 'original_id'
maincur.execute("ALTER TABLE %s ADD %s
CITEXT;" % (tablename,field[coln]))
xlsxt = xlsxt+1
elif xlsxt > 0 and check == False: # not first time(insert
data) and check no same file exists
for coln in range(ws.get_highest_column()):
if coln == 0:
writecur.execute("INSERT INTO %s (%s)
VALUES ('%s');"
%(tablename,field[coln],str(ws.cell(row=rown,column=coln+1).value)))
else:
writecur.execute("UPDATE %s SET %s =
'%s' WHERE id = '%d';"
%(tablename,field[coln],str(ws.cell(row=rown+1,column=coln+1).value),row_id))
xlsxt = xlsxt+1
row_id = row_id+1
else:
break
conn.commit()
maincur.close()
readcur.close()
writecur.close()
conn.close()
xlsxt = 0

Probably something is going wrong during arguments deserialization. Instead of passing doc object try instead passing filename and then read file inside of task.

XLRDError: No sheet named <'Sheet1'> in python

I am trying to convert the xls into csv file using pandas in python. But I am getting the following error like 'XLRDError: No sheet named <'Sheet1'>'. I have verified the sheet name and it is same as specified above, but I don't how to correct this error. Please find my code below.
CODE:
def xls_2_csv():
import pandas as pd
data_xls = pd.read_excel(r'c:\delivery\file1.xls','Sheet1', index_col=None)
data_xls.to_csv(r'C:\test\file1.csv', encoding='utf-8',index=None)
xls_2_csv()
Please help me in solving this error. Thanks in advance.

I found the same problem in python 3.6 and pandas version is 0.25.1.
The following should work:
import pandas as pd
file = 'your excel file path'
# the file is endswith '.xls' and there is multiple sheets
# error method
df_sheet1 = pd.read_excel(file, sheet_name='Sheet1')
df_sheet2 = pd.read_excel(file, sheet_name='Sheet2')
# when read Sheet1 had no error, but when read Sheet2, had an error:
# xlrd.biffh.XLRDError: No sheet named <'Sheet2'>
# right method
with pd.ExcelFile(file) as xls:
for sheet_name in xls.sheet_names:
df = pd.read_excel(xls, sheet_name=sheet_name)
print(df.head())

Hi I tried the following code it worked for me.
CODE:
import logging
import time
import traceback
import xlrd
import csv
import sys
import re
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
xls = input file path
target = output file path
logging.info("Start converting: From '" + xls + "' to '" + target + "'. ")
try:
start_time = time.time()
wb = xlrd.open_workbook(xls)
sh = wb.sheet_by_index(0)
csvFile = open(target, 'wb')
wr = csv.writer(csvFile, quoting=csv.QUOTE_ALL)
for row in xrange(sh.nrows):
rowValues = sh.row_values(row)
newValues = []
for s in rowValues:
if isinstance(s, unicode):
strValue = (str(s.encode("utf-8")))
else:
strValue = (str(s))
isInt = bool(re.match("^([0-9]+)\.0$", strValue))
if isInt:
strValue = int(float(strValue))
else:
isFloat = bool(re.match("^([0-9]+)\.([0-9]+)$", strValue))
isLong = bool(re.match("^([0-9]+)\.([0-9]+)e\+([0-9]+)$", strValue))
if isFloat:
strValue = float(strValue)
if isLong:
strValue = int(float(strValue))
newValues.append(strValue)
wr.writerow(newValues)
csvFile.close()
logging.info("Finished in %s seconds", time.time() - start_time)
except Exception as e:
print (str(e) + " " + traceback.format_exc())

Why did the if statement in the loop not work?

I was scraping Yahoo finance and somehow the if condition did not work. What the if was supposed to do was to print the stock price if the stock price was greater than 50, but it printed all the stocks which were above 50 and below 50.
Here is the code:
import urllib2
from bs4 import BeautifulSoup as bs4
list = ["aapl","goog","yhoo"]
i = 0
while i < len(list):
url = urllib2.urlopen("http://finance.yahoo.com/q?s="+ list[i] +"&q1=1")
soup = bs4(url,"html.parser")
for price in soup.find(attrs={'id':"yfs_l84_" + list[i]}):
if price > 50:
print price
i += 1
else:
print "failed"
1 += 1
Why did it print the stock "yahoo", cause "yahoo" is less than 50, no?

We can rewrite code in following:
1 += 1 will not work because LHS should be variable name. This is i += 1 . this might be typing mistake. :)
No need of i variable, we can iterate list by for loop. This will remove our i += 1 statements from the code.
Do not use in-built variable names as our variable names. e.g. list is list type variable used to create new list. If we use such variable name then this will create problem in code.
e.g.
>>> list
<type 'list'>
>>> a = list()
>>> a
[]
>>> list = [1,2,3,4]
>>> a = list()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: 'list' object is not callable
>>>
Use exception handling during Type Casting means when we convert one data type to other data type.
Demo:
import urllib2
from bs4 import BeautifulSoup as bs4
item_list = ["aapl","goog","yhoo"]
target_url = "http://finance.yahoo.com/q?s=%s&q1=1"
target_id = "yfs_l84_%s"
for i in item_list:
url = urllib2.urlopen(target_url%i)
soup = bs4(url, "html.parser")
for price in soup.find(attrs={'id':target_id%i}):
try:
price = float(price)
except:
print "Exception during type conversion. Value is %s. Type is %s."%(price, type(price))
continue
if price > 50:
print "Price:", price
else:
print "failed"

Seems to me you've mixed types:
if price > 50:
TypeError: unorderable types: NavigableString() > int()
Use this:
if float(price) > 50:
print price

price is string, you have to convert it to number:
if int(price) > 50:
print price
i += 1
else:
print "failed"
i += 1
or (with fractional part):
if float(price) > 50:
print price
i += 1
else:
print "failed"
i += 1

Can not constructing a class from another file

Here are my two files:
Character.py
def Check_File(fn):
try:
fp = open(fn);
except:
return None;
return fp;
class Character:
## variables ##
## atk ##
## def ##
## HP ##
## empty inv ##
'''
init (self, filename),
RETURN -1 == if the file is not exist
RETURN 0 == all good
all files will be save in format of
"skill: xxx, xxx; xxx, xxx; xxx, xxx;"
'''
def __init__(self, fn):
fp = Check_File(fn);
if(fp == None):
print "Error: no such file"
return None;
self.stats = {};
for line in fp:
nline = line.strip().split(": ");
if(type(nline) != list):
continue;
else:
self.stats[nline[0]] = nline[1];
##print self.stats[nline[0]]
fp.close();
'''
display character
'''
def Display_Character(self):
print "The Character had:";
## Parse into the character class ##
for item in self.stats:
print item + ": " + self.stats[item];
print "Finished Stats Displaying";
print Character("Sample.dat").stats
Another one is:
Interface.py
##from Interface_helper import *;
from Character import *;
wind = Character("Sample.dat");
wind.Display_Character();
When I run the code in Character.py, it gives
%run "C:/Users/Desktop/Helper Functions/New Folder/Character.py"
{'item': 'weird stuff', 'hp': '100', 'name': 'wind', 'def': '10', 'atk': '10'}
But when I run Interface.py:
I had
%run "C:/Users/Desktop/Helper Functions/New Folder/Interface.py"
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
E:\canopy\Canopy\App\appdata\canopy-1.4.0.1938.win-x86_64\lib\site-packages\IPython\utils\py3compat.pyc in execfile(fname, glob, loc)
195 else:
196 filename = fname
--> 197 exec compile(scripttext, filename, 'exec') in glob, loc
198 else:
199 def execfile(fname, *where):
C:\Users\Desktop\Helper Functions\New Folder\Interface.py in <module>()
13 from Character import *;
14
---> 15 wind = Character("Sample.dat");
16
17
C:\Users\Desktop\Helper Functions\New Folder\Character.py in __init__(self, fn)
48 for line in fp:
49 nline = line.strip().split(": ");
---> 50 if(type(nline) != list):
51 continue;
52 else:
AttributeError: Character instance has no attribute 'stats'
I was wondering what is going on for this piece of code, did I import the wrong way?

No there is no issue with your import. Are you sure that you are in the same location for both runs? Since your code just specifies the filename with no path, your python session needs to run in the directory where the Sample.dat file is located. The reason why I am asking this, is because you define a stats attribute in the middle of your __init__ and the only thing that can happen for this not to exist is for the return None above it to be invoked. Which happens only when the file doesn't exist (meaning doesn't exist where it looks, which is where you are running from).
P.S. in python:
Semicolons are not needed at the end of lines
Parentheses are not needed around the condition in an if statement
Classes should derive from object: class Character(object):
Docstrings (the strings you put in triple quotes above the method names) should be right below the method name. That will allow ipython and other tools to pick them up and display them when users put a question mark in front of them.

Cannot Pool.map() function because of UnpickleableError?

So I am trying to multi process function F. Which is accessed by a button press with tkinter.
def f(x):
global doom,results,info
doom = doom + 1
if check(x) == True:
results.add(x)
info.append(get_column_number(x))
j.step(1)
texx = "1/"+doom
s.configure(text=texx)
root.update()
The function is called within a function like so:
def dojob():
index = ['URLS'...]
pool = Pool(processes=4)
s.configure(text="Shifting Workload to cores..")
root.update()
pool.map(f, index)
The button is inside root window.
I get the following error:
Exception in thread Thread-2:
Traceback (most recent call last):
File "C:\Python27\lib\threading.py", line 808, in __bootstrap_inner
self.run()
File "C:\Python27\lib\threading.py", line 761, in run
self.__target(*self.__args, **self.__kwargs)
File "C:\Python27\lib\multiprocessing\pool.py", line 342, in _handle_tasks
put(task)
UnpickleableError: Cannot pickle <type 'tkapp'> objects
I do not even know what a pickle does? Help?
Here is the complete code:
from Tkinter import *
from ttk import *
from tkMessageBox import showinfo
from multiprocessing import Pool
import random
emails = set()
import urllib2
import urllib2 as urllib
########
CONSTANT_PAGECOUNT = 20
######
def f(x):
global doom,emails,info
doom = doom + 1
if check(x) == True:
print "",
emails.add(x)
info.append(get_column_number(x))
j.step(1)
texx = "Sk1nn1n "+str(doom)+'/'+str(CONSTANT_PAGECOUNT)+""
s.configure(text=texx)
root.update()
return 0
def f(x):
print ""
def showFile(site,info):
top = Toplevel()
top.title('Sites')
x = Text(top)
x.pack()
i=0
for site_url in site:
x.insert(END,site_url)
i=i+1
def get_column_number(url):
return True
def check(url):
return True
def getgoogleurl(search,siteurl=False,startr=0):
if siteurl==False:
return 'http://www.google.com/search?q='+urllib2.quote(search)+'&start='+str(startr)+'&oq='+urllib2.quote(search)
else:
return 'http://www.google.com/search?q=site:'+urllib2.quote(siteurl)+'%20'+urllib2.quote(search)+'&oq=site:'+urllib2.quote(siteurl)+'%20'+urllib2.quote(search)
def getgooglelinks(search,siteurl=False,startr=0):
#google returns 403 without user agent
headers = {'User-agent':'Mozilla/11.0'}
req = urllib2.Request(getgoogleurl(search,siteurl,startr),None,headers)
site = urllib2.urlopen(req)
data = site.read()
site.close()
#no beatifulsoup because google html is generated with javascript
start = data.find('<div id="res">')
end = data.find('<div id="foot">')
if data[start:end]=='':
#error, no links to find
return False
else:
links =[]
data = data[start:end]
start = 0
end = 0
while start>-1 and end>-1:
#get only results of the provided site
if siteurl==False:
start = data.find('<a href="/url?q=')
else:
start = data.find('<a href="/url?q='+str(siteurl))
data = data[start+len('<a href="/url?q='):]
end = data.find('&sa=U&ei=')
if start>-1 and end>-1:
link = urllib2.unquote(data[0:end])
data = data[end:len(data)]
if link.find('http')==0:
links.append(link)
return links
def rip(results=15,accuracy=16):
global e
keyword = ''+str(e.get())
if keyword.strip()=="":
s.configure(text="Please enter a keyword")
root.update()
return 0
linklist = []
counter = 0
doom = 0
while counter < results:
links = getgooglelinks(keyword,startr=counter)
for link in links:
if len(linklist) > CONSTANT_PAGECOUNT:
s.configure(text="Proccessing..")
root.update()
return linklist
else:
doom = doom + 1
linklist.append(link)
texx = str(doom)+"/"+str(CONSTANT_PAGECOUNT)
s.configure(text=texx)
root.update()
root.update()
counter = counter+accuracy
return linklist
def flip():
global e
emails = set()
info = []
keyword = ''+str(e.get())
if keyword.strip()=="":
s.configure(text="Please enter a keyword")
root.update()
return 0
s.configure(text="Generating index..")
root.update()
doom = -1
index = rip(CONSTANT_PAGECOUNT,10)
if 1:
try:
pool = Pool(processes=4)
#s.configure(text="Shifting Workload to cores..")
#root.update()
pool.map(f, index)
pool.close()
except:
print "The errors there.."
j.config(value=CONSTANT_PAGECOUNT)
if len(emails) > 0:
filepath='relavant_list_'+str(random.randint(1,9999))+'.emList.txt'
#print len(emails),
#print "emails found."
ggg = open(filepath,'a+')
for x in emails:
ggg.write(x+"\n")
showinfo(
str(len(emails))+" key word related sites found!",
" sites are saved in "+str(filepath)
)
showFile(emails,info)
s.configure(text=filepath)
else:
s.configure(text='No related sites found : (')
if __name__ == '__main__':
### CONSTANTS
version = '1.0'
### END CONSTANTS
root = Tk()
root.title('Program v'+version)
s = Style()
s.theme_use('default')
#print s.theme_names()
s.configure("black.Horizontal.TProgressbar", foreground='blue', background='blue')
j = Progressbar(root, style="black.Horizontal.TProgressbar", orient="vertical", length=200, mode="determinate", maximum=CONSTANT_PAGECOUNT, value=0)
j.pack(side='right',fill='y')
f = Frame(root)
x = Frame(f)
e = Entry(x,width=51)
s = Label(x,width=50,anchor='center',text='Waiting for task..')
Button(f,text='Generate List!',width=50,command=flip).pack(fill='both',expand=True)
s.pack(side='bottom',fill='y',expand=True)
e.pack(side='top',fill='both',expand=True)
x.pack(side='top',fill='y',expand=True)
f.pack(side='left',expand=True,fill="both")
root.mainloop()

You are leaking a tkinter object. Most likely because you are trying to update the interface from another process with the last line of f()
Update based on code
You have a name collision between your function f() and a variable f in your __main__ which gets assigned to your main window and causes the tkapp pickle error. Rename the function to def myfunc() or something. Also need to call pool.join() after pool.close()

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

How to get NLTK sentence tokenizer read a column in a file - python-2.7

Related

django+celery+rabbitmq encode error and sig-kill

XLRDError: No sheet named <'Sheet1'> in python

Why did the if statement in the loop not work?

Can not constructing a class from another file

Cannot Pool.map() function because of UnpickleableError?

Categories

Resources