Ascii codec can't decode byte 0xc2 python nltk - python-2.7

I have a code that I'm using for Spam Classification and it works great but everytime I try to stem/lemmatize the word I get this error:
File "/Users/Ramit/Desktop/Bayes1/src/", line 16, in trim_word
word = ps.stem(word)
File "/Library/Python/2.7/site-packages/nltk/stem/", line 664, in stem
stem = self._step1a(stem)
File "/Library/Python/2.7/site-packages/nltk/stem/", line 289, in _step1a
if word.endswith('ies') and len(word) == 4:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 0: ordinal not in range(128)
Here is my code:
from word import Word
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()
class Filter():
def __init__(self):
self.words = dict()
def trim_word(self, word):
# Helper method to trim away some of the non-alphabetic characters
# I deliberately do not remove all non-alphabetic characters.
word = word.strip(' .:,-!()"?+<>*')
word = word.lower()
word = ps.stem(word)
return word
def train(self, train_file):
lineNumber = 1
ham_words = 0
spam_words = 0
stop = set(stopwords.words('english'))
# Loop through all the lines
for line in train_file:
if lineNumber % 2 != 0:
line = line.split('\t')
category = line[0]
input_words = line[1].strip().split(' ')
#Loop through all the words in the line, remove some characters
for input_word in input_words:
input_word = self.trim_word(input_word)
if (input_word != "") and (input_word not in stop):
# Check if word is in dicionary, else add
if input_word in self.words:
word = self.words[input_word]
word = Word(input_word)
self.words[input_word] = word
# Check wether the word is in ham or spam sentence, increment counters
if category == "ham":
ham_words += 1
elif category == "spam":
spam_words += 1
# Probably bad training file input...
print "Not valid training file format"
# Compute the probability for each word in the training set
for word in self.words:
self.words[word].compute_probability(ham_words, spam_words)
def get_interesting_words(self, sms):
interesting_words = []
stop = set(stopwords.words('english'))
# Go through all words in the SMS and append to list.
# If we have not seen the word in training, assign probability of 0.4
for input_word in sms.split(' '):
input_word = self.trim_word(input_word)
if (input_word != "") and (input_word not in stop):
if input_word in self.words:
word = self.words[input_word]
word = Word(input_word)
# Sort the list of interesting words, return top 15 elements if list is longer than 15
interesting_words.sort(key=lambda word: word.interesting(), reverse=True)
return interesting_words[0:15]
def filter(self, input_file, result_file):
# Loop through all SMSes and compute total spam probability of the sms-message
lineNumber = 0
for sms in input_file:
spam_product = 1.0
ham_product = 1.0
if lineNumber % 2 != 0:
for word in self.get_interesting_words(sms):
spam_product *= word.get_probability()
ham_product *= (1.0 - word.get_probability())
sms_spam_probability = spam_product / (spam_product + ham_product)
if sms_spam_probability > 0.8:
result_file.write("SPAM: "+sms)
result_file.write("HAM: "+sms)
I'm just looking for a solution that would allow me to lemmatize/stem the words. I tried looking around the net I did find similar problems, but they haven't been working for me.

Use sys.
import sys


When using pandas is it possible to replace the re package with the regex package? [duplicate]

I am trying to check for fuzzy match between a string column and a reference list. The string series contains over 1 m rows and the reference list contains over 10 k entries.
For eg:
ref_df['REF_NAMES'] = pd.Series(['XANDER','PARIS']) #10 k rows
###Output should look like
df['MATCH'] = pd.Series([Nan, 'XANDER', 'MANDER', 'PARIS', 'HARIS', Nan, 'PARIS', Nan])
It should generate match if the word appears separately in the string (and within that, upto 1 char substitution allowed)
For eg - 'PARIS' can match against 'PARIS HILTON', 'THE HARIS DOWNTOWN', but not against 'APARISIAN'.
Similarly, 'XANDER' matches against 'NOVA XANDER' and 'SALA MANDER' (MANDER being 1 char diff from XANDER) , but does not generate match against 'ALEXANDERS'.
As of now, we have written the logic for the same (shown below), although the match takes over 4 hrs to run.. Need to get this to under 30 mins.
Current code -
tags_regex = ref_df['REF_NAMES'].tolist()
tags_ptn_regex = '|'.join([f'\s+{tag}\s+|^{tag}\s+|\s+{tag}$' for tag in tags_regex])
def search_it(partyname):
m ="("+tags_ptn_regex+ ")"+"{s<=1:[A-Z]}",partyname):
if m is not None:
return None
df['MATCH'] = df['NAMES'].str.apply(search_it)
Also, will multiprocessing help with regex ? Many thanks in advance!
Your pattern is rather inefficient, as you repeat tag pattern thrice in the regex. You just need to create a pattern with the so-called whitespace boundaries, (?<!\S) and (?!\S), and you will only need one tag pattern.
Next, if you have several thousands alternative, even the single tag pattern regex will be extremely slow because there can appear such alternatives that match at the same location in the string, and thus, there will be too much backtracking.
To reduce this backtracking, you will need a regex trie.
Here is a working snippet:
import regex
import pandas as pd
## Class to build a regex trie, see
class Trie():
"""Regex::Trie in Python. Creates a Trie out of a list of words. The trie can be exported to a Regex pattern.
The corresponding Regex should match much faster than a simple Regex union."""
def __init__(self): = {}
def add(self, word):
ref =
for char in word:
ref[char] = char in ref and ref[char] or {}
ref = ref[char]
ref[''] = 1
def dump(self):
def quote(self, char):
return regex.escape(char)
def _pattern(self, pData):
data = pData
if "" in data and len(data.keys()) == 1:
return None
alt = []
cc = []
q = 0
for char in sorted(data.keys()):
if isinstance(data[char], dict):
recurse = self._pattern(data[char])
alt.append(self.quote(char) + recurse)
q = 1
cconly = not len(alt) > 0
if len(cc) > 0:
if len(cc) == 1:
alt.append('[' + ''.join(cc) + ']')
if len(alt) == 1:
result = alt[0]
result = "(?:" + "|".join(alt) + ")"
if q:
if cconly:
result += "?"
result = "(?:%s)?" % result
return result
def pattern(self):
return self._pattern(self.dump())
## Start of main code
df = pd.DataFrame()
ref_df = pd.DataFrame()
ref_df['REF_NAMES'] = pd.Series(['XANDER','PARIS']) #10 k row
trie = Trie()
for word in ref_df['REF_NAMES'].tolist():
tags_ptn_regex = regex.compile(r"(?:(?<!\S)(?:{})(?!\S)){{s<=1:[A-Z]}}".format(trie.pattern()), regex.IGNORECASE)
def search_it(partyname):
m =
if m is not None:
return None
df['MATCH'] = df['NAMES'].apply(search_it)

Reading mailing addresses of varying length from a text file using regular expressions

I am trying to read a text file and collect addresses from it. Here's an example of one of the entries in the text file:
Electrical Vendor Contact: John Smith Phone #: 123-456-7890
Address: 1234 ADDRESS ROAD Ship To:
Suite 123 ,
Nowhere, CA United States 12345
Phone: 234-567-8901 E-Mail:
Fax: 345-678-9012 Web Address:
Acct. No: 123456 Monthly Due Date: Days Until Due
Tax ID: Fed 1099 Exempt Discount On Assets Only
G/L Liab. Override:
G/L Default Exp:
I cannot wrap my head around how to search for and store the address for each of these entries when the amount of lines in the address varies. Currently, I have a generator that reads each line of the file. Then the get_addrs() method attempts to capture markers such as the Address: and Ship keywords in the file to signify when an address needs to be stored. Then I use a regular expression to search for zip codes in the line following a line with the Address: keyword. I think I've figured out how successfully save the second line for all addresses using that method. However, in a few addresses,es there is a suite number or other piece of information that causes the address to become three lines instead of two. I'm not sure how to account for this and I tried expanding my save_previous() method to three lines, but I can't get it quite right. Here's the code that I was able to successfully save all of the two line addresses with:
import re
class GetAddress():
def __init__(self):
self.line1 = []
self.line2 = []
self.s_line1 = []
self.addr_index = 0
self.ship_index = 0
self.no_ship = False
self.addr_here = False
self.prev_line = []
self.us_zip = ''
# Check if there is a shipping address.
def set_no_ship(self, line):
self.no_ship = line.index(',') == len(line) - 1
except ValueError:
# Save two lines at a time to see whether or not the previous
# line contains 'Address:' and 'Ship'.
def save_previous(self, line):
self.prev_line += [line]
if len(self.prev_line) > 2:
del self.prev_line[0]
def get_addrs(self, line):
self.addr_here = 'Address:' in line and 'Ship' in line
self.po_box = False
self.no_ship = False
self.addr_index = 0
self.ship_index = 0
self.zip1_index = 0
# Check if 'Address:' and 'Ship' are in the previous line.
self.prev_addr = (
'Address:' in self.prev_line[0]
and 'Ship' in self.prev_line[0])
if self.addr_here:
self.po_box = 'Box' in line or 'BOX' in line
self.addr_index = line.index('Address:') + 1
self.ship_index = line.index('Ship')
# Get the contents of the line between 'Address:' and
# 'Ship' if both words are present in this line.
if self.addr_index is not self.ship_index:
self.line1 += [' '.join(line[self.addr_index:self.ship_index])]
elif self.addr_index is self.ship_index:
self.line1 += ['']
if len(self.prev_line) > 1 and self.prev_addr:
self.po_box = 'Box' in line or 'BOX' in line
self.us_zip ='(\d{5}(\-\d{4})?)', ' '.join(line))
if self.us_zip and not self.po_box:
self.zip1_index = line.index(
if self.no_ship:
self.line2 += [' '.join(line[:line.index(',')])]
elif self.zip1_index and not self.no_ship:
self.line2 += [' '.join(line[:self.zip1_index + 1])]
elif len(self.line1) > 0 and not self.line1[-1]:
self.line2 += ['']
# Create a generator to read each line of the file.
def read_gen(infile):
with open(infile, 'r') as file:
for line in file:
yield line.split()
infile = 'Vendor List.txt'
info = GetAddress()
for i, line in enumerate(read_gen(infile)):
I am still a beginner in Python so I'm sure a lot of my code may be redundant or unnecessary. I'd love some feedback as to how I might make this simpler and shorter while capturing both two and three line addresses.
I also posted this question to Reddit and u/Binary101010 pointed out that the text file is a fixed width, and it may be possible to slice each line in a way that only selects the necessary address information. Using this intuition I added some functionality to the generator expression, and I was able to produce the desired effect with the following code:
infile = 'Vendor List.txt'
# Create a generator with differing modes to read the specified lines of the file.
def read_gen(infile, mode=0, start=0, end=0, rows=[]):
lines = list()
with open(infile, 'r') as file:
for i, line in enumerate(file):
# Set end to correct value if no argument is given.
if end == 0:
end = len(line)
# Mode 0 gives all lines of the file
if mode == 0:
yield line[start:end]
# Mode 1 gives specific lines from the file using the rows keyword
# argument. Make sure rows is formatted as [start_row, end_row].
# rows list should only ever be length 2.
elif mode == 1:
if rows:
# Create a list for indices between specified rows.
for element in range(rows[0], rows[1]):
lines += [element]
# Return the current line if the index falls between the
# specified rows.
if i in lines:
yield line[start:end]
class GetAddress:
def __init__(self):
# Allow access to infile for use in set_addresses().
global infile
self.address_indices = list()
self.phone_indices = list()
self.addresses = list()
self.count = 0
def get(self, i, line):
# Search for appropriate substrings and set indices accordingly.
if 'Address:' in line[18:26]:
self.address_indices += [i]
if 'Phone:' in line[18:24]:
self.phone_indices += [i]
# Add address to list if both necessary indices have been collected.
if i in self.phone_indices:
def set_addresses(self):
self.address = list()
start = self.address_indices[self.count]
end = self.phone_indices[self.count]
# Create a generator that only yields substrings for rows between given
# indices.
self.generator = read_gen(
rows=[start, end])
# Collect each line of the address from the generator and remove
# unnecessary spaces.
for element in range(start, end):
self.address += [next(self.generator).strip()]
# This document has a header on each page and a portion of that is
# collected in the address substring. Search for the header substring
# and remove the corresponding elements from self.address.
if len(self.address) > 3 and not self.address[-1]:
self.address = self.address[:self.address.index('header text')]
self.addresses += [self.address]
self.count += 1
info = GetAddress()
for i, line in enumerate(read_gen(infile)):
info.get(i, line)

What is the best way for sum numbers at a big text file?

What is the best way for sum numbers at a big text file?
The text file will contain numbers separated by a comma (',').
The number can be from any type.
No line or row limits.
for example:
1 ,-2, -3.45-7.8j ,99.6,......
Input: path to the text file
Output: the sum of the numbers
I am tried to wrote one solution at myself and want to know for better solutions:
This is my try:
I am working with chunks of data and not read line by line, and because the end of the chunk can contain some of the number (just -2 and not -2+3j) i am looking just on the "safe piece" the last comma (',') and the other part save for the next chunk
import re
def calculate_sum(file_path):
_sum = 0
with open(file_path, 'r') as _f:
chunk =
while chunk:
chunk = chunk.replace(' ', '')
safe_piece = chunk.rfind(',')
next_chunk = chunk[safe_piece:] if safe_piece != 0 else ''
if safe_piece != 0:
chunk = chunk[:safe_piece]
_sum += sum(map(complex, re.findall(r"[+-]\d*\.?\d*[+-]?\d*\.?\d*j|[+-]?\d+(?:\.\d+)?", chunk)))
chunk = next_chunk +
return _sum
This will add up any amount of numbers in a text file. Example:
import csv
with open('input.txt','rb') as f:
r = csv.reader(f)
total = 0
for line in r:
total += sum(complex(col) for col in line)
print total
If you have really long lines and insufficient memory to read it in one go, then you could use a buffering class to chunk the reads and split numbers out of the buffer using a generator function:
import re
class Buffer:
def __init__(self,filename,chunksize=4096):
self.filename = filename
self.chunksize = chunksize
self.buf = ''
def __iter__(self):
with open(self.filename) as f:
while True:
if ',' in self.buf or '\n' in self.buf:
data,self.buf = re.split(r',|\n',self.buf,1) # split off the text up to the first separator
yield complex(data)
chunk =
if not chunk: # if no more data to read, return the remaining buffer and exit function
if self.buf:
yield complex(self.buf)
self.buf += chunk
total = 0
for num in Buffer('input.txt'):
total += num
print total

Count how many times a word appears in a text file

def paraula(file,wordtofind):
f = open(file,"r")
text =
count = 0
for i in text:
s = i.index(wordtofind)
count = count + s
return count
paraula (file,wordtofind)
Why reinvent the wheel?
def word_count(filename, word):
with open(filename, 'r') as f:
def paraula(file,wordtofind):
f = open(file,"r")
text =
count = 0
index = text.find(wordtofind) # Returns the index of the first instance of the word
while index != -1:
count += 1
text = text[index+len(wordtofind):] # Cut text starting from after that word
index = text.find(wordtofind) # Search again
return count
paraula (file,wordtofind)

Python 2 to 3 port - Still has one more error

I am porting a program called markovgenerator I found on the web from Python2 to Python3. It all seems to work just fine.
Here is the code: (Python3 version)
import random
class Markov:
def __init__(self, file, size):
self.size = size
self.starts = []
self.cache = {}
def file_to_words(self, file):
data =
self.words = data.split("\n")
def tuples(self, word):
if len(word) < self.size - 1:
word = word + "\n"
for i in range(len(word) - self.size):
yield (word[i:i+self.size], word[i+self.size])
def parse_words(self):
for word in self.words:
for key, next in self.tuples(word):
if key in self.cache:
self.cache[key] = [next]
def generate_word(self):
key = random.choice(self.starts)
word = key
next = random.choice(self.cache[key])
while next != "\n":
word = word + next
key = key[1:] + next
next = random.choice(self.cache[key])
return word
from optparse import OptionParser
def main():
parser = OptionParser()
parser.add_option('-p', type='int', dest='prev_num', default=3,
help='number of previous letters to base chain on')
parser.add_option('-n', type='int', dest='num', default=5,
help='number of generated words')
parser.add_option('-s', '--source-text', type='string',
default='wordlist-en.txt', dest='source',
help='file to use as basis for generating the words')
(options, args) = parser.parse_args()
file = open('alice.txt')
markov = Markov(file, options.prev_num)
for i in range(options.num):
if __name__ == '__main__':
Except I get this error:
next = random.choice(self.cache[key])
KeyError: ''
The error appears in the "generate_word()" function.
It must be from the translation to 3. Any ideas? I dont see why I am getting a key error, as I pass key to other places no problem.
Thanks for the help!!!
This fixes that error, and ignores any blank lines outputted:
import random
class Markov:
def __init__(self, file, size):
self.size = size
self.starts = []
self.cache = {}
self.cache[''] = '\n'
def file_to_words(self, file):
data =
self.words = data.split("\n")
def tuples(self, word):
if len(word) < self.size - 1:
word = word + "\n"
for i in range(len(word) - self.size):
yield (word[i:i+self.size], word[i+self.size])
def parse_words(self):
for word in self.words:
for key, next in self.tuples(word):
if key in self.cache:
self.cache[key] = [next]
def generate_word(self):
key = random.choice(self.starts)
word = key
next = random.choice(self.cache[key])
while not next == "\n":
word = word + next
key = key[1:] + next
next = random.choice(self.cache[key])
return word
from optparse import OptionParser
def main():
parser = OptionParser()
parser.add_option('-p', type='int', dest='prev_num', default=3,
help='number of previous letters to base chain on')
parser.add_option('-n', type='int', dest='num', default=5,
help='number of generated words')
parser.add_option('-s', '--source-text', type='string',
default='wordlist-en.txt', dest='source',
help='file to use as basis for generating the words')
(options, args) = parser.parse_args()
file = open('alice.txt')
markov = Markov(file, options.prev_num)
iters = 0
while iters < options.num:
word = markov.generate_word()
if word != '\n' and word != '':
iters += 1
if __name__ == '__main__':
For some reason, the string '', which raises KeyError when you try to use it in the dictionary cache, was registering as a word. Everything I tried to remove it caused the program to break, so I added a line to __init__ which sets the next word of '' to \n, giving the intended result by quitting when we see a newline.
If there's anything wrong with this code, let me know and I will be happy to fix it.