Count how many times a word appears in a text file - python-2.7

def paraula(file,wordtofind):
f = open(file,"r")
text = f.read()
f.close()
count = 0
for i in text:
s = i.index(wordtofind)
count = count + s
return count
paraula (file,wordtofind)

Why reinvent the wheel?
def word_count(filename, word):
with open(filename, 'r') as f:
return f.read().count(word)

def paraula(file,wordtofind):
f = open(file,"r")
text = f.read()
f.close()
count = 0
index = text.find(wordtofind) # Returns the index of the first instance of the word
while index != -1:
count += 1
text = text[index+len(wordtofind):] # Cut text starting from after that word
index = text.find(wordtofind) # Search again
return count
paraula (file,wordtofind)

Related

When using pandas is it possible to replace the re package with the regex package? [duplicate]

I am trying to check for fuzzy match between a string column and a reference list. The string series contains over 1 m rows and the reference list contains over 10 k entries.
For eg:
df['NAMES'] = pd.Series(['ALEXANDERS', 'NOVA XANDER', 'SALA MANDER', 'PARIS HILTON', 'THE HARIS DOWNTOWN', 'APARISIAN', 'PARIS', 'MARIN XO']) # 1mil rows
ref_df['REF_NAMES'] = pd.Series(['XANDER','PARIS']) #10 k rows
###Output should look like
df['MATCH'] = pd.Series([Nan, 'XANDER', 'MANDER', 'PARIS', 'HARIS', Nan, 'PARIS', Nan])
It should generate match if the word appears separately in the string (and within that, upto 1 char substitution allowed)
For eg - 'PARIS' can match against 'PARIS HILTON', 'THE HARIS DOWNTOWN', but not against 'APARISIAN'.
Similarly, 'XANDER' matches against 'NOVA XANDER' and 'SALA MANDER' (MANDER being 1 char diff from XANDER) , but does not generate match against 'ALEXANDERS'.
As of now, we have written the logic for the same (shown below), although the match takes over 4 hrs to run.. Need to get this to under 30 mins.
Current code -
tags_regex = ref_df['REF_NAMES'].tolist()
tags_ptn_regex = '|'.join([f'\s+{tag}\s+|^{tag}\s+|\s+{tag}$' for tag in tags_regex])
def search_it(partyname):
m = regex.search("("+tags_ptn_regex+ ")"+"{s<=1:[A-Z]}",partyname):
if m is not None:
return m.group()
else:
return None
df['MATCH'] = df['NAMES'].str.apply(search_it)
Also, will multiprocessing help with regex ? Many thanks in advance!
Your pattern is rather inefficient, as you repeat tag pattern thrice in the regex. You just need to create a pattern with the so-called whitespace boundaries, (?<!\S) and (?!\S), and you will only need one tag pattern.
Next, if you have several thousands alternative, even the single tag pattern regex will be extremely slow because there can appear such alternatives that match at the same location in the string, and thus, there will be too much backtracking.
To reduce this backtracking, you will need a regex trie.
Here is a working snippet:
import regex
import pandas as pd
## Class to build a regex trie, see https://stackoverflow.com/a/42789508/3832970
class Trie():
"""Regex::Trie in Python. Creates a Trie out of a list of words. The trie can be exported to a Regex pattern.
The corresponding Regex should match much faster than a simple Regex union."""
def __init__(self):
self.data = {}
def add(self, word):
ref = self.data
for char in word:
ref[char] = char in ref and ref[char] or {}
ref = ref[char]
ref[''] = 1
def dump(self):
return self.data
def quote(self, char):
return regex.escape(char)
def _pattern(self, pData):
data = pData
if "" in data and len(data.keys()) == 1:
return None
alt = []
cc = []
q = 0
for char in sorted(data.keys()):
if isinstance(data[char], dict):
try:
recurse = self._pattern(data[char])
alt.append(self.quote(char) + recurse)
except:
cc.append(self.quote(char))
else:
q = 1
cconly = not len(alt) > 0
if len(cc) > 0:
if len(cc) == 1:
alt.append(cc[0])
else:
alt.append('[' + ''.join(cc) + ']')
if len(alt) == 1:
result = alt[0]
else:
result = "(?:" + "|".join(alt) + ")"
if q:
if cconly:
result += "?"
else:
result = "(?:%s)?" % result
return result
def pattern(self):
return self._pattern(self.dump())
## Start of main code
df = pd.DataFrame()
df['NAMES'] = pd.Series(['ALEXANDERS', 'NOVA XANDER', 'SALA MANDER', 'PARIS HILTON', 'THE HARIS DOWNTOWN', 'APARISIAN', 'PARIS', 'MARIN XO']) # 1mil rows
ref_df = pd.DataFrame()
ref_df['REF_NAMES'] = pd.Series(['XANDER','PARIS']) #10 k row
trie = Trie()
for word in ref_df['REF_NAMES'].tolist():
trie.add(word)
tags_ptn_regex = regex.compile(r"(?:(?<!\S)(?:{})(?!\S)){{s<=1:[A-Z]}}".format(trie.pattern()), regex.IGNORECASE)
def search_it(partyname):
m = tags_ptn_regex.search(partyname)
if m is not None:
return m.group()
else:
return None
df['MATCH'] = df['NAMES'].apply(search_it)

Text file value replace in python

I am trying to replace text value as below. I have 2 text file
1 - input.txt
abc = 123
xyz = 456
pqr = 789
2 - content.txt
AAA = abc
XXX = xyz
PPP = pqr
now I need to read the input.txt file and replace value on content.txt file with input.txt values and get the below output file.
3 - new.txt
AAA = 123
XXX = 456
PPP = 789
How can I do this ?
First read the contents of the file into 2 arrays in the following way
file1handle = open('filename1', 'r')
file1 = file1handle.readlines()
file2handle = open('filename2', 'r')
file2 = file2handle.readlines()
file2handle.close()
file2handle.close()
Then iterate over the contents and try finding the match with variable names and assignments and put the values into third array in following way
for item in file1:
name, value = item.split(' = ')
for item2 in file2:
name2, assignment = item2.split(' = ')
#Here we are trying to check which name is to be assigned which value
if assignment == name:
val = name2+'='+value
file3.append(val)
Then write the contents into file in following way
filehandle3 = open('filename3', 'w')
for line in file3:
filehandle3.write(line)
filehandle3.close()
This may help you,
_input = {}
with open('input.txt', 'r') as f:
s = f.read()
_input = dict((a.split(' = ')[0], int(a.split(' = ')[1])) for a in s.split('\n'))
_content = {}
with open('content.txt', 'r') as f:
s = f.read()
_content = dict((a.split(' = ')[0], a.split(' = ')[1]) for a in s.split('\n'))
for key in _content:
_content[key] = _input[_content[key]]
Result:
In [18]: _content
Out[19]: {'AAA': 123, 'PPP': 789, 'XXX': 456}
How about using pandas: It's shorter, easier to read and faster when using large files.
import pandas as pd
import numpy as np
input=pd.read_csv("input.txt",sep="=",header=None,usecols=[1])
content=pd.read_csv("content.txt",sep="=",header=None,usecols=[0])
foo=np.hstack(([content.values,input.values]))
new=pd.DataFrame(foo)
new.to_csv("new.txt",index=False,sep="=",header=None)
import re
class Defs:
def __init__(self, defs_file):
self._defs = {}
with open(defs_file) as df:
line_num = 0
for l in df:
line_num += 1
m = re.match(r'\s*(\w+)\s*=\s*(\S+)\s*', l)
assert m, \
"invalid assignment syntax with \"{}\" at line {}".format(
l.rstrip(), line_num)
self._defs[m.group(1)] = m.group(2)
def __getitem__(self, var):
return self._defs[var]
#property
def dict(self):
return self._defs
class Replacer:
def __init__(self, defs):
self._defs = defs
def replace_with_defs(self, context_file, output_file):
with open(context_file) as context, open(output_file, 'w') as output:
for line in context:
string_repl = re.sub(r'\b(\w+)\b',
lambda m: self._defs.dict.get(m.group(1)) or m.group(1), line)
output.write(string_repl)
def main():
defs = Defs('input.txt')
repl = Replacer(defs)
repl.replace_with_defs('context.txt', 'output.txt')
if __name__ == '__main__':
main()
To describe what's going on above, the Defs class takes a defs_file which is the input.txt assignments and stores them in a dict binding each variable name to the associated value. The Replacer class handles takes a Defs object and uses those to iterate over each line in the context_file i.e. context.txt and replaces any token (assuming the token is a variable name) with the value associated with it, specified within the Defs object, and writes this out to a file output_file i.e. output.txt. If the token doesn't exist in the Defs object as a valid variable name, it defaults to the write the token as is.

What is the best way for sum numbers at a big text file?

What is the best way for sum numbers at a big text file?
The text file will contain numbers separated by a comma (',').
The number can be from any type.
No line or row limits.
for example:
1 ,-2, -3.45-7.8j ,99.6,......
...
...
Input: path to the text file
Output: the sum of the numbers
I am tried to wrote one solution at myself and want to know for better solutions:
This is my try:
I am working with chunks of data and not read line by line, and because the end of the chunk can contain some of the number (just -2 and not -2+3j) i am looking just on the "safe piece" the last comma (',') and the other part save for the next chunk
import re
CHUNK_SIZE = 1017
def calculate_sum(file_path):
_sum = 0
with open(file_path, 'r') as _f:
chunk = _f.read(CHUNK_SIZE)
while chunk:
chunk = chunk.replace(' ', '')
safe_piece = chunk.rfind(',')
next_chunk = chunk[safe_piece:] if safe_piece != 0 else ''
if safe_piece != 0:
chunk = chunk[:safe_piece]
_sum += sum(map(complex, re.findall(r"[+-]\d*\.?\d*[+-]?\d*\.?\d*j|[+-]?\d+(?:\.\d+)?", chunk)))
chunk = next_chunk + _f.read(CHUNK_SIZE)
return _sum
Thanks!
This will add up any amount of numbers in a text file. Example:
input.csv
1,-2,-3.45-7.8j,99.6
-1,1-2j
1.5,2.5,1+1j
example.py
import csv
with open('input.txt','rb') as f:
r = csv.reader(f)
total = 0
for line in r:
total += sum(complex(col) for col in line)
print total
Output
(100.15-8.8j)
If you have really long lines and insufficient memory to read it in one go, then you could use a buffering class to chunk the reads and split numbers out of the buffer using a generator function:
import re
class Buffer:
def __init__(self,filename,chunksize=4096):
self.filename = filename
self.chunksize = chunksize
self.buf = ''
def __iter__(self):
with open(self.filename) as f:
while True:
if ',' in self.buf or '\n' in self.buf:
data,self.buf = re.split(r',|\n',self.buf,1) # split off the text up to the first separator
yield complex(data)
else:
chunk = f.read(self.chunksize)
if not chunk: # if no more data to read, return the remaining buffer and exit function
if self.buf:
yield complex(self.buf)
return
self.buf += chunk
total = 0
for num in Buffer('input.txt'):
total += num
print total
Output:
(100.15-8.8j)

Ascii codec can't decode byte 0xc2 python nltk

I have a code that I'm using for Spam Classification and it works great but everytime I try to stem/lemmatize the word I get this error:
File "/Users/Ramit/Desktop/Bayes1/src/filter.py", line 16, in trim_word
word = ps.stem(word)
File "/Library/Python/2.7/site-packages/nltk/stem/porter.py", line 664, in stem
stem = self._step1a(stem)
File "/Library/Python/2.7/site-packages/nltk/stem/porter.py", line 289, in _step1a
if word.endswith('ies') and len(word) == 4:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 0: ordinal not in range(128)
Here is my code:
from word import Word
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()
class Filter():
def __init__(self):
self.words = dict()
def trim_word(self, word):
# Helper method to trim away some of the non-alphabetic characters
# I deliberately do not remove all non-alphabetic characters.
word = word.strip(' .:,-!()"?+<>*')
word = word.lower()
word = ps.stem(word)
return word
def train(self, train_file):
lineNumber = 1
ham_words = 0
spam_words = 0
stop = set(stopwords.words('english'))
# Loop through all the lines
for line in train_file:
if lineNumber % 2 != 0:
line = line.split('\t')
category = line[0]
input_words = line[1].strip().split(' ')
#Loop through all the words in the line, remove some characters
for input_word in input_words:
input_word = self.trim_word(input_word)
if (input_word != "") and (input_word not in stop):
# Check if word is in dicionary, else add
if input_word in self.words:
word = self.words[input_word]
else:
word = Word(input_word)
self.words[input_word] = word
# Check wether the word is in ham or spam sentence, increment counters
if category == "ham":
word.increment_ham()
ham_words += 1
elif category == "spam":
word.increment_spam()
spam_words += 1
# Probably bad training file input...
else:
print "Not valid training file format"
lineNumber+=1
# Compute the probability for each word in the training set
for word in self.words:
self.words[word].compute_probability(ham_words, spam_words)
def get_interesting_words(self, sms):
interesting_words = []
stop = set(stopwords.words('english'))
# Go through all words in the SMS and append to list.
# If we have not seen the word in training, assign probability of 0.4
for input_word in sms.split(' '):
input_word = self.trim_word(input_word)
if (input_word != "") and (input_word not in stop):
if input_word in self.words:
word = self.words[input_word]
else:
word = Word(input_word)
word.set_probability(0.40)
interesting_words.append(word)
# Sort the list of interesting words, return top 15 elements if list is longer than 15
interesting_words.sort(key=lambda word: word.interesting(), reverse=True)
return interesting_words[0:15]
def filter(self, input_file, result_file):
# Loop through all SMSes and compute total spam probability of the sms-message
lineNumber = 0
for sms in input_file:
lineNumber+=1
spam_product = 1.0
ham_product = 1.0
if lineNumber % 2 != 0:
try:
for word in self.get_interesting_words(sms):
spam_product *= word.get_probability()
ham_product *= (1.0 - word.get_probability())
sms_spam_probability = spam_product / (spam_product + ham_product)
except:
result_file.write("error")
if sms_spam_probability > 0.8:
result_file.write("SPAM: "+sms)
else:
result_file.write("HAM: "+sms)
result_file.write("\n")
I'm just looking for a solution that would allow me to lemmatize/stem the words. I tried looking around the net I did find similar problems, but they haven't been working for me.
Use sys.
import sys
sys.setdefaultencoding('utf-8')
reload(sys)

Python 2 to 3 port - Still has one more error

I am porting a program called markovgenerator I found on the web from Python2 to Python3. It all seems to work just fine.
Here is the code: (Python3 version)
import random
class Markov:
def __init__(self, file, size):
self.size = size
self.starts = []
self.cache = {}
self.file_to_words(file)
self.parse_words()
def file_to_words(self, file):
file.seek(0)
data = file.read()
self.words = data.split("\n")
def tuples(self, word):
if len(word) < self.size - 1:
return
word = word + "\n"
for i in range(len(word) - self.size):
yield (word[i:i+self.size], word[i+self.size])
def parse_words(self):
for word in self.words:
self.starts.append(word[:self.size])
for key, next in self.tuples(word):
if key in self.cache:
self.cache[key].append(next)
else:
self.cache[key] = [next]
def generate_word(self):
key = random.choice(self.starts)
word = key
next = random.choice(self.cache[key])
while next != "\n":
word = word + next
key = key[1:] + next
next = random.choice(self.cache[key])
return word
from optparse import OptionParser
def main():
parser = OptionParser()
parser.add_option('-p', type='int', dest='prev_num', default=3,
help='number of previous letters to base chain on')
parser.add_option('-n', type='int', dest='num', default=5,
help='number of generated words')
parser.add_option('-s', '--source-text', type='string',
default='wordlist-en.txt', dest='source',
help='file to use as basis for generating the words')
(options, args) = parser.parse_args()
file = open('alice.txt')
markov = Markov(file, options.prev_num)
file.close()
for i in range(options.num):
print(markov.generate_word())
if __name__ == '__main__':
main()
Except I get this error:
next = random.choice(self.cache[key])
KeyError: ''
The error appears in the "generate_word()" function.
It must be from the translation to 3. Any ideas? I dont see why I am getting a key error, as I pass key to other places no problem.
Thanks for the help!!!
This fixes that error, and ignores any blank lines outputted:
import random
class Markov:
def __init__(self, file, size):
self.size = size
self.starts = []
self.cache = {}
self.file_to_words(file)
self.parse_words()
self.cache[''] = '\n'
def file_to_words(self, file):
file.seek(0)
data = file.read()
self.words = data.split("\n")
def tuples(self, word):
if len(word) < self.size - 1:
return
word = word + "\n"
for i in range(len(word) - self.size):
yield (word[i:i+self.size], word[i+self.size])
def parse_words(self):
for word in self.words:
self.starts.append(word[:self.size])
for key, next in self.tuples(word):
if key in self.cache:
self.cache[key].append(next)
else:
self.cache[key] = [next]
def generate_word(self):
key = random.choice(self.starts)
word = key
next = random.choice(self.cache[key])
while not next == "\n":
word = word + next
key = key[1:] + next
next = random.choice(self.cache[key])
return word
from optparse import OptionParser
def main():
parser = OptionParser()
parser.add_option('-p', type='int', dest='prev_num', default=3,
help='number of previous letters to base chain on')
parser.add_option('-n', type='int', dest='num', default=5,
help='number of generated words')
parser.add_option('-s', '--source-text', type='string',
default='wordlist-en.txt', dest='source',
help='file to use as basis for generating the words')
(options, args) = parser.parse_args()
file = open('alice.txt')
markov = Markov(file, options.prev_num)
file.close()
iters = 0
while iters < options.num:
word = markov.generate_word()
if word != '\n' and word != '':
print(word)
iters += 1
if __name__ == '__main__':
main()
For some reason, the string '', which raises KeyError when you try to use it in the dictionary cache, was registering as a word. Everything I tried to remove it caused the program to break, so I added a line to __init__ which sets the next word of '' to \n, giving the intended result by quitting when we see a newline.
If there's anything wrong with this code, let me know and I will be happy to fix it.