How to split tokens, count number of tokens, and write in a file in python? - python-2.7

I have file which has data in lines as follows:
['Marilyn Manson', 'Web', 'Skydera Inc.', 'Stone Sour', 'The Smashing Pumpkins', 'Warner Bros. Entertainment','This is a good Beer']
['Voices Inside', 'Expressivista', 'The Kentucky Fried Movie', 'The Bridges of Madison County']
and so on. I want to rewrite the data into a new file whose lines keep only the tokens that contain fewer than 3 words (or some other threshold), e.g.:
['Marilyn Manson', 'Web', 'Skydera Inc.', 'Stone Sour']
['Voices Inside', 'Expressivista']
this is what I have tried so far:
# Asker's Python 2 attempt, reproduced as posted (the paste lost its indentation).
# It reads `file` line by line, splits each line on commas and tries to print
# the tokens that consist of fewer than three words.
for line in open(file):
line = line.strip()
line = line.rstrip()
# Pattern: 32 consecutive characters from [a-z0-9]; re.match below anchors it
# at the start of the first comma-separated field.
prog = re.compile("([a-z0-9]){32}")
if line:
line = line.replace('"', '')
line = line.split(",")
if re.match(prog, line[0]) and len(line)>2:
# `wo` is never used in the visible code.
wo=[]
for words in line:
word=words.split()
if len(word)<3:
# NOTE(review): list.append() returns None, so this statement prints "None"
# (and appends the list `word` to itself) instead of showing the token.
print word.append(word)
But the output says None. Any thoughts where I am making a mistake?

A better way to do what you're doing is to use ast.literal_eval, which automagically converts string representations of Python objects (e.g. lists) into actual Python objects.
import ast

# Raw data: one Python list literal per line.
data = """
['Marilyn Manson', 'Web', 'Skydera Inc.', 'Stone Sour', 'The Smashing Pumpkins', 'Warner Bros. Entertainment','This is a good Beer']
['Voices Inside', 'Expressivista', 'The Kentucky Fried Movie', 'The Bridges of Madison County']
"""

# A token is kept only when it has fewer than `threshold` whitespace-separated words.
threshold = 3

# Split into lines.
lines = data.split('\n')

# Parse the non-blank lines into Python lists. Each line is stripped first so
# that whitespace-only lines are skipped instead of crashing literal_eval.
lists = [ast.literal_eval(line.strip()) for line in lines if line.strip()]

# For each list, keep only the tokens with fewer than `threshold` words.
result = [[item for item in lst if len(item.split()) < threshold]
          for lst in lists]

# Show the result, one filtered list per line.
for line in result:
    print(line)
Result:
['Marilyn Manson', 'Web', 'Skydera Inc.', 'Stone Sour']
['Voices Inside', 'Expressivista']
I think the reason your code isn't working is that you're trying to match line[0] against your regex prog - but the problem is that line[0] isn't 32 characters long for either of your lines, so your regex won't match.

Related

matching an entire list with each and every line of file

I had written a piece of code that basically performs find and replace from a list on a text file.
So, it maps the entire list into a dictionary. Then each line of the text file is processed and matched against every entry in the dictionary; if a match is found anywhere in the line, it is replaced with the corresponding value from the list (dictionary).
Here is the code:
import sys
import re
# Asker's original script, reproduced as posted (the paste lost its indentation).
# Usage: argv[1] is the text file to rewrite, argv[2] is the tab-separated
# "word<TAB>replacement" list. Every non-empty input line is tested against
# every key, which is the slowdown the question is about.
#open file using open file mode
fp1 = open(sys.argv[1]) # Open file on read mode
lines = fp1.read().split("\n") # Create a list containing all lines
fp1.close() # Close file
fp2 = open(sys.argv[2]) # Open file on read mode
words = fp2.read().split("\n") # Create a list containing all lines
fp2.close() # Close file
# Map each word (first tab-separated field) to its replacement (second field).
word_hash = {}
for word in words:
#print(word)
if(word != ""):
tsl = word.split("\t")
word_hash[tsl[0]] = tsl[1]
#print(word_hash)
keys = word_hash.keys()
#skeys = sorted(keys, key=lambda x:x.split(" "),reverse=True)
#print(keys)
#print (skeys)
for line in lines:
if(line != ""):
for key in keys:
#my_regex = key + r"\b"
# Key surrounded by one delimiter character on each side; both delimiters are
# captured so they can be written back via \1 and \2.
# NOTE(review): `key` is inserted into the pattern unescaped, so regex
# metacharacters in a word are interpreted as regex syntax.
my_regex = r"([\"\( ])" + key + r"([ ,\.!\"।)])"
#print(my_regex)
if((re.search(my_regex, line, re.IGNORECASE|re.UNICODE))):
line = re.sub(my_regex, r"\1" + word_hash[key]+r"\2",line,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
#print("iam :1",line)
# Key at the very end of the line (no trailing delimiter to capture).
if((re.search(key + r"$", line, re.IGNORECASE|re.UNICODE))):
line = re.sub(key+r"$", word_hash[key],line,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
#print("iam :2",line)
# Key at the very start of the line (no leading delimiter to capture).
if((re.search(r"^" + key, line, re.IGNORECASE|re.UNICODE))):
#print(line)
line = re.sub(r"^" + key, word_hash[key],line,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
#print("iam :",line)
print(line)
# Presumably pairs with the `line != ""` test, so empty lines are echoed
# unchanged -- the lost indentation makes this ambiguous.
else:
print(line)
The problem here is that execution slows down as the list grows, because every line of the text file is matched against each and every key in the list. So where can I improve the execution of this code?
List file:
word1===>replaceword1
word2===>replaceword2
.....
The list is tab-separated. Here I used ===> for easy understanding.
Input file:
hello word1 I am here.
word2. how are you word1?
Expected Output:
hello replaceword1 I am here.
replaceword2. how are you replaceword1?
If your word list is small enough, the best speedup you can achieve with the match-and-replace process is to use a single big regexp and a functional re.sub (that is, pass a function as the replacement argument).
This way you have a single call to the optimised function.
EDIT: In order to preserve order of replacements (this can lead to chain replacing, don't know if intended behavior) we can perform replacement by batches rather than in a single run, where batches order respects file order and each batch is made of disjoint possible string matches.
The code would be as follows:
import sys
import re
# Ordered list of replacement batches; each batch is a dict {word: replacement}.
word_hashes = []


def insert_word(word, replacement, hashes):
    """Add ``word -> replacement`` to the first batch it does not overlap with.

    Two words overlap when one is a substring of the other. Overlapping words
    are kept in different batches so that every batch can be applied with a
    single alternation regex, and batches preserve the order of the word file.

    Args:
        word: the text to search for.
        replacement: the text that replaces ``word``.
        hashes: list of batches (dicts mapping word to replacement).

    Returns:
        A list of batches containing the new entry. The batch dictionaries
        are shared with ``hashes``; a new batch is appended when ``word``
        overlaps with an entry of every existing batch.
    """
    # Iterative rather than recursive: the recursion depth used to grow with
    # the number of batches and could exceed the interpreter's limit.
    batches = list(hashes)
    for batch in batches:
        overlaps = any(word in known or known in word for known in batch)
        if not overlaps:
            batch[word] = replacement
            return batches
    batches.append({word: replacement})
    return batches
# Build the replacement batches from the tab-separated word list (argv[2]).
with open(sys.argv[2]) as fp2:
    for raw_entry in fp2:
        entry = raw_entry.strip()
        if not entry:
            continue  # skip blank lines
        tsl = entry.split("\t")
        word_hashes = insert_word(tsl[0], tsl[1], word_hashes)

# Read the whole text to rewrite (argv[1]) in one go.
with open(sys.argv[1]) as fp1:
    content = fp1.read()

# Apply the batches in file order, one re.sub pass per batch.
for word_hash in word_hashes:
    # Matching is case-insensitive, so the lookup must be too; otherwise a
    # match such as "Word1" for the key "word1" raises KeyError.
    lookup = {key.lower(): value for key, value in word_hash.items()}
    # Escape the keys so that characters like "." or "(" are matched literally.
    alternatives = "|".join(re.escape(key) for key in word_hash)
    # The delimiters are checked with lookarounds instead of being consumed,
    # so two adjacent words sharing one delimiter are both replaced.
    my_regex = r"(?<=[\"\( ])(" + alternatives + r")(?=[ ,\.!\"।)])"
    content = re.sub(
        my_regex,
        # Fall back to the matched text if case folding produced no key.
        lambda m: lookup.get(m.group(1).lower(), m.group(1)),
        content,
        flags=re.IGNORECASE | re.UNICODE | re.MULTILINE,
    )
print(content)
We obtain chained replacement for the example data. For example, with the following words to replace
roses are red==>flowers are blue
are==>is
Text to parse
roses are red and beautiful
flowers are yellow
Output
roses is red and beautiful
flowers is yellow
Why don't you read the content of the entire file in a string, and just do string.replace. For example.
def find_replace(path='file.txt', replacements=None):
    """Replace words in a text file in place.

    Args:
        path: file to rewrite; defaults to ``'file.txt'``.
        replacements: dict mapping each text to find to its replacement.
            Defaults to the two sample words used in the example.

    The replacements are plain substring replacements applied one after the
    other, so the output of one replacement can be changed by a later one.
    """
    if replacements is None:
        replacements = {"word1": "replaceword1", "word2": "replaceword2"}
    # Read text from the file as a string
    with open(path, 'r') as fp:
        txt = fp.read()
    # Find and replace characters
    for old, new in replacements.items():
        txt = txt.replace(old, new)
    # Write back the modified string
    with open(path, 'w') as fp:
        fp.write(txt)
If the input file is:
hello word1 I am here.
word2. how are you word1?
The output will be:
hello replaceword1 I am here.
replaceword2. how are you replaceword1?

Missing words in word2vec vocabulary

I am training word2vec on my own text-corpus using mikolov's implementation from here. Not all unique words from the corpus get a vector even though I have set the min-count to 1. Are there any parameters I may have missed, that might be the reason not all unique words get a vector? What else might be the reason?
To test word2vecs behavior I have written the following script providing a text file with 20058 sentences and 278896 words (all words and punctuation are space separated and there is one sentence per line).
import subprocess
def get_w2v_vocab(path_embs):
    """Return the set of words that have a vector in an embeddings text file.

    The first line of the file is skipped as a header; every following line
    contributes its first space-separated field. The ``</s>`` token is
    removed from the result.

    Args:
        path_embs: path of the UTF-8 embeddings file.

    Returns:
        set of str. An empty file yields an empty set.
    """
    with open(path_embs, 'r', encoding='utf8') as f:
        # Default of None: an empty file must not raise StopIteration.
        next(f, None)
        vocab = {line.split(' ')[0] for line in f}
    vocab.discard('</s>')
    return vocab
def train(path_corpus, path_embs):
    """Run the ``./word2vec`` binary on a corpus and write the embeddings.

    Args:
        path_corpus: training text passed via ``-train``.
        path_embs: output file passed via ``-output``.

    The exit status of the process is not inspected.
    """
    command = ["./word2vec"]
    command += ["-threads", "6"]
    command += ["-train", path_corpus]
    command += ["-output", path_embs]
    command += ["-min-count", "1"]
    subprocess.call(command)
def get_unique_words_in_corpus(path_corpus):
    """Return the set of distinct space-separated tokens in a corpus file.

    Args:
        path_corpus: path of the UTF-8 corpus, one sentence per line.

    Returns:
        set of str. Empty tokens, produced by blank lines or by consecutive
        spaces, are not counted as words.
    """
    vocab = set()
    with open(path_corpus, 'r', encoding='utf8') as f:
        for line in f:
            # split(' ') yields '' for blank lines and doubled spaces; drop
            # those so the expected vocabulary is not inflated.
            vocab.update(
                token for token in line.strip('\n').split(' ') if token
            )
    return vocab
def check_equality(expected, actual):
    """Compare two vocabularies and raise when they differ.

    Args:
        expected: set of words found in the corpus.
        actual: set of words found in the embeddings file.

    Raises:
        Exception: when the sets differ. The message reports both sizes and
            the number of expected words missing from ``actual``.
    """
    if expected == actual:
        print('Expected vocab and actual vocab are equal.')
        return
    missing = len(expected - actual)
    message = 'Not equal! Vocab expected: {}, Vocab actual: {}, Diff: {}'.format(len(expected), len(actual), missing)
    raise Exception(message)
def main():
    """Train word2vec on the test corpus and compare vocabularies.

    Collects the corpus vocabulary, trains the embeddings, reads the
    vocabulary back from the embeddings file and checks that both agree.
    """
    corpus_file, embeddings_file = 'test_corpus2.txt', 'embeddings.vec'
    expected_vocab = get_unique_words_in_corpus(corpus_file)
    train(corpus_file, embeddings_file)
    check_equality(expected_vocab, get_w2v_vocab(embeddings_file))


if __name__ == '__main__':
    main()
This script gives me the following output:
Starting training using file test_corpus2.txt
Vocab size: 33651
Words in train file: 298954
Alpha: 0.000048 Progress: 99.97% Words/thread/sec: 388.16k Traceback (most recent call last):
File "test_w2v_behaviour.py", line 44, in <module>
main()
File "test_w2v_behaviour.py", line 40, in main
check_equality(vocab_expected, vocab_actual)
File "test_w2v_behaviour.py", line 29, in check_equality
raise Exception('Not equal! Vocab expected: {}, Vocab actual: {}, Diff: {}'.format(len(expected), len(actual), diff))
Exception: Not equal! Vocab expected: 42116, Vocab actual: 33650, Diff: 17316
As long as you're using Python, you might want to use the Word2Vec implementation in the gensim package. It does everything the original Mikolov/Google word2vec.c does, and more, and is usually performance-competitive.
In particular, it won't have any issues with UTF-8 encoding – while I'm not sure the Mikolov/Google word2vec.c handles UTF-8 correctly. And, that may be a source of your discrepancy.
If you need to get to the bottom of your discrepancy, I would suggest:
have your get_unique_words_in_corpus() also tally/report the total number of non-unique words its tokenization creates. If that's not the same as the 298954 reported by word2vec.c, then the two processes are clearly not working from the same baseline understanding of what 'words' are in the source file.
find some words, or at least one representative word, that your token-count expects to be in the final model, and isn't. Review those for any common characteristic – including in context in the file. That will probably reveal why the two tallies differ.
Again, I suspect something UTF-8 related, or perhaps related to other implementation limits in word2vec.c (such as a maximum word length) that are not mirrored in your Python-based word tallies.
You could use FastText instead of Word2Vec. FastText is able to embed out-of-vocabulary words by looking at subword information (character ngrams). Gensim also has a FastText implementation, which is very easy to use:
from gensim.models import FastText as ft

# `training_data` is not defined in this snippet; it is assumed to be an
# iterable of tokenised sentences (lists of words).
model = ft(sentences=training_data)
word = 'blablabla'  # can be out of vocabulary
# Vectors are looked up on the model's KeyedVectors (`model.wv`); indexing
# the model object directly is no longer supported by current gensim.
embedded_word = model.wv[word]  # fetches the word embedding
See https://stackoverflow.com/a/54709303/3275464

In python insert one space after every 5th Character in each line of a text file

I am reading a text file in python(500 rows) and it seems like:
File Input:
0082335401
0094446049
01008544409
01037792084
01040763890
I wanted to ask whether it is possible to insert one space after the 5th character in each line:
Desired Output:
00823 35401
00944 46049
01008 544409
01037 792084
01040 763890
I have tried below code
# Asker's attempt: inserts a space after every 5 characters of `st`.
# NOTE(review): if `st` holds the whole file, newline characters count toward
# the 5-character offsets, which would explain the shifted output below.
st = " ".join(st[i:i + 5] for i in range(0, len(st), 5))
but the below output was returned on executing it:
00823 35401
0094 44604 9
010 08544 409
0 10377 92084
0104 07638 90
I am a novice in Python. Any help would make a difference.
There seems to be two issues here - By running your provided code, you seem to be reading the file into one single string. It would be much preferable (in your case) to read the file in as a list of strings, like the following (assuming your input file is input_data.txt):
# Read the file into a list holding one string per line. Only the line
# terminator is removed, so any other whitespace in a line is kept.
with open("input_data.txt") as f:
    data = [line.rstrip("\r\n") for line in f]
Then, to operate on the data you obtained in a list, you could use a list comprehension similar to the one you have tried to use.
# Expects `data` to be a list of strings, one per input line.
# Entries longer than five characters get a single space after the fifth
# character; shorter entries are left untouched.
data = [
    " ".join((entry[:5], entry[5:])) if len(entry) > 5 else entry
    for entry in data
]
Hope this helped!
If your only requirement is to insert a space after the fifth character, then you could use the following simple version:
#!/usr/bin/env python
# Print every line of "input_data" with a space after its fifth character;
# lines of five characters or fewer are printed unchanged.
with open("input_data") as data:
    for raw in data:
        text = raw.rstrip()
        print(text[:5] + " " + text[5:] if len(text) > 5 else text)
If you don't mind if lines with less than five characters get a space at the end, you could even omit the if-else-statement and go with the print-function from the if-clause:
#!/usr/bin/env python
# Print every line of "input_data" with a space after its fifth character.
# Lines of five characters or fewer end up with a trailing space.
with open("input_data") as data:
    for raw in data:
        text = raw.rstrip()
        print(" ".join((text[:5], text[5:])))

Finding Alliterative Word Sequences with Python

I am working in Python 3.6 with NLTK 3.2.
I am trying to write a program which takes raw text as input and outputs any (maximum) series of consecutive words beginning with the same letter (i.e. alliterative sequences).
When searching for sequences, I want to ignore certain words and punctuation (for instance, 'it', 'that', 'into', ''s', ',', and '.'), but to include them in the output.
For example, inputting
"The door was ajar. So it seems that Sam snuck into Sally's subaru."
should yield
["so", "it", "seems", "that", "sam", "snuck", "into", "sally's", "subaru"]
I am new to programming and the best I could come up with is:
import nltk
from nltk import word_tokenize
# Asker's attempt, reproduced as posted (the paste lost its indentation).
# For every token it collects the following tokens that either start with the
# same letter or belong to a fixed ignore set, and prints every run of two or
# more tokens -- including runs that are part of a longer, earlier run.
raw = "The door was ajar. So it seems that Sam snuck into Sally's subaru."
tokened_text = word_tokenize(raw) #word tokenize the raw text with NLTK's word_tokenize() function
tokened_text = [w.lower() for w in tokened_text] #make it lowercase
for w in tokened_text: #for each word of the text
letter = w[0] #consider its first letter
allit_str = []
allit_str.append(w) #add that word to a list
# NOTE(review): list.index() returns the FIRST position of `w`, so a token that
# occurs more than once is always examined from its first occurrence.
pos = tokened_text.index(w) #let "pos" be the position of the word being considered
for i in range(1,len(tokened_text)-pos): #consider the next word
if tokened_text[pos+i] in {"the","a","an","that","in","on","into","it",".",",","'s"}: #if it's one of these
allit_str.append(tokened_text[pos+i]) #add it to the list
# NOTE(review): `i=+1` assigns +1 rather than incrementing, and the for loop
# rebinds `i` on the next iteration anyway, so this line has no effect.
i=+1 #and move on to the next word
elif tokened_text[pos+i][0] == letter: #or else, if the first letter is the same
allit_str.append(tokened_text[pos+i]) #add the word to the list
i=+1 #and move on to the next word
else: #or else, if the letter is different
break #break the for loop
if len(allit_str)>=2: #if the list has two or more members
print(allit_str) #print it
which outputs
['ajar', '.']
['so', 'it', 'seems', 'that', 'sam', 'snuck', 'into', 'sally', "'s", 'subaru', '.']
['seems', 'that', 'sam', 'snuck', 'into', 'sally', "'s", 'subaru', '.']
['sam', 'snuck', 'into', 'sally', "'s", 'subaru', '.']
['snuck', 'into', 'sally', "'s", 'subaru', '.']
['sally', "'s", 'subaru', '.']
['subaru', '.']
This is close to what I want, except that I don't know how to restrict the program to only print the maximum sequences.
So my questions are:
How can I modify this code to only print the maximum sequence
['so', 'it', 'seems', 'that', 'sam', 'snuck', 'into', 'sally', "'s", 'subaru', '.']?
Is there an easier way to do this in Python, maybe with regular expression or more elegant code?
Here are similar questions asked elsewhere, but which have not helped me modify my code:
How do you effectively use regular expressions to find alliterative expressions?
A reddit challenge asking for a similar program
4chan question regarding counting instances of alliteration
Blog about finding most common alliterative strings in a corpus
(I also think it would be nice to have this question answered on this site.)
Interesting task. Personally, I'd loop through without the use of indices, keeping track of the previous word to compare it with the current word.
Additionally, it's not enough to compare letters; you have to take into account that 's' and 'sh' etc don't alliterate. Here's my attempt:
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
import string
from collections import defaultdict, OrderedDict
import operator
# Answer script, reproduced as posted (the paste lost its indentation).
# It groups the sentences of `raw` by the number of words collected as
# alliterating, then prints the groups and the sentence(s) with the most.
raw = "The door was ajar. So it seems that Sam snuck into Sally's subaru. She seems shy sometimes. Someone save Simon."
# Get the English alphabet as a list of letters
letters = [letter for letter in string.ascii_lowercase]
# Here we add some extra phonemes that are distinguishable in text.
# ('sailboat' and 'shark' don't alliterate, for instance)
# Digraphs go first as we need to try matching these before the individual letters,
# and break out if found.
sounds = ["ch", "ph", "sh", "th"] + letters
# Use NLTK's built in stopwords and add "'s" to them
# NOTE: this rebinds the imported name `stopwords` from the NLTK corpus reader
# to a plain collection of words.
stopwords = stopwords.words('english') + ["'s"] # add extra stopwords here
stopwords = set(stopwords) # sets are MUCH faster to process
sents = sent_tokenize(raw)
# Maps "number of collected alliterating words" -> list of sentences.
alliterating_sents = defaultdict(list)
for sent in sents:
tokenized_sent = word_tokenize(sent)
# Create list of alliterating word sequences
alliterating_words = []
previous_initial_sound = ""
for word in tokenized_sent:
for sound in sounds:
if word.lower().startswith(sound): # only lowercasing when comparing retains original case
initial_sound = sound
if initial_sound == previous_initial_sound:
if len(alliterating_words) > 0:
if previous_word == alliterating_words[-1]: # prevents duplication in chains of more than 2 alliterations, but assumes repetition is not alliteration)
alliterating_words.append(word)
else:
alliterating_words.append(previous_word)
alliterating_words.append(word)
else:
alliterating_words.append(previous_word)
alliterating_words.append(word)
break # Allows us to treat sh/s distinctly
# This needs to be at the end of the loop
# It sets us up for the next iteration
# NOTE(review): `initial_sound` is only assigned when a sound matched above, so
# a token that starts with none of `sounds` (e.g. punctuation) appears to reuse
# the previous token's value here -- confirm against the original indentation.
if word not in stopwords: # ignores stopwords for the purpose of determining alliteration
previous_initial_sound = initial_sound
previous_word = word
alliterating_sents[len(alliterating_words)].append(sent)
# Highest count first.
sorted_alliterating_sents = OrderedDict(sorted(alliterating_sents.items(), key=operator.itemgetter(0), reverse=True))
# OUTPUT
print ("A sorted ordered dict of sentences by number of alliterations:")
print (sorted_alliterating_sents)
print ("-" * 15)
max_key = max([k for k in sorted_alliterating_sents]) # to get sent with max alliteration
print ("Sentence(s) with most alliteration:", sorted_alliterating_sents[max_key])
This produces a sorted ordered dictionary of sentences with their alliteration counts as its keys. The max_key variable contains the count for the highest alliterating sentence or sentences, and can be used to access the sentences themselves.
The accepted answer is very comprehensive, but I would suggest using Carnegie Mellon's pronouncing dictionary. This is partly because it makes life easier, and partly because identical sounding syllables that are not necessarily identical letter-to-letter are also considered alliterations. An example I found online (https://examples.yourdictionary.com/alliteration-examples.html) is "Finn fell for Phoebe".
# Run nltk.download('cmudict') and nltk.download('stopwords') once beforehand
# to fetch the corpora used below.

# CMUdict maps a word to its pronunciations, written in ARPABET: vowels,
# consonants and a representative stress level (wiki/ARPABET).
phoneme_dictionary = nltk.corpus.cmudict.dict()

# Stress and auxiliary symbols; not used by the functions shown here.
stress_symbols = ['0', '1', '2', '3...', '-', '!', '+', '/',
                  '#', ':', ':1', '.', ':2', '?', ':3']

# Stopwords (the, a, of, ...) that are discarded in the comparison.
stopwords = nltk.corpus.stopwords.words("english")


def no_punct(x):
    """Return ``x`` with every punctuation mark (. , ! * etc.) removed."""
    return re.sub(r'[^\w\s]', '', x)
def get_phonemes(word):
    """Return the phoneme list of the first pronunciation of ``word``.

    The lookup is case-insensitive because the pronouncing dictionary stores
    its entries in lowercase; without this, capitalised words such as names
    or sentence-initial words would never be found.

    Args:
        word: the word to look up.

    Returns:
        list of str: the phonemes of the first dictionary entry (the first
        entry is used by convention), or ``["NONE"]`` when the word has no
        entry.
    """
    entries = phoneme_dictionary.get(word.lower())
    if entries:
        return entries[0]
    return ["NONE"]  # no entries found for input word
def get_alliteration_level(text, proximity=2):
    """Score how alliterative ``text`` is, based on sound rather than spelling.

    A word counts as alliterating when its first phoneme equals the first
    phoneme of one of the previous ``proximity`` non-stopword words on the
    same line.

    Args:
        text: the text to analyse; lines are separated by ``"\\n"``.
        proximity: how many preceding first phonemes are remembered for the
            comparison. Defaults to 2.

    Returns:
        float: alliterating words divided by the total number of words, or
        0.0 when the text contains no words.
    """
    count, total_words = 0, 0
    for line in text.split("\n"):
        # Sliding window of the most recent first phonemes on this line.
        current_phonemes = [None] * proximity
        i = 0  # slot of the window that is overwritten next
        # split() without arguments never yields empty tokens, so doubled
        # spaces no longer inflate the word count.
        for word in line.split():
            # Remove punctuation and normalise case for the lookups.
            word = no_punct(word).lower()
            if not word:
                continue  # the token was punctuation only
            total_words += 1
            if word in stopwords:
                continue
            first_phoneme = get_phonemes(word)[0]
            if first_phoneme == "NONE":
                # Unknown words all share the placeholder "NONE"; comparing
                # them would count any two unknown words as alliteration.
                continue
            if first_phoneme in current_phonemes:  # alliteration occurred
                count += 1
            current_phonemes[i] = first_phoneme
            i = (i + 1) % proximity  # advance the ring-buffer slot
    if total_words == 0:
        return 0.0  # avoid dividing by zero on empty input
    return count / total_words
Above is the proposed script. The variable proximity is introduced so that we also consider alliterating syllables that are separated by other words. The stress_symbols variable lists the stress levels indicated in the CMU dictionary, and it could easily be incorporated into the function.

Multiple output files created but empty

I am trying to split one file with two articles in it into two separate files with one article in each, for subsequent analysis of the articles. Each article in the initial file has an ID that I want to use to separate the files with, using RE.
Below is the initial input file, with ID number:
166068619 #### "Epilepsy: let's end our ignorance of this neglected condition
Helen Stephens is a young woman with epilepsy [...]."
106899978 #### "Great British Payoff shows that BBC governance is broken
If it was a television series, they'd probably call it [...]."
However, when I run my code, I do get two separate files as an output but they are empty.
This is my code:
# Asker's original function, reproduced as posted (the paste lost its
# indentation). It first counts the ID lines, opens that many output files,
# then scans the input again and writes to the output files.
def file_split(path_to_file):
"""Function splits bigger file into N smaller ones, based on a certain RE
match, that is used to break the bigger file into smaller ones"""
# Nested helper: counts the lines that start with "<digits>\t####\t".
def pattern_extract(path_to_file):
"""Function identifies the number of RE occurences in a file,
No. can be used in further analysis as range No."""
import re
x = []
with open(path_to_file) as f:
for line in f:
match = re.search(r'^\d+?\t####\t', line)
if match:
a = match.group()
x.append(a)
return len(x)
y = pattern_extract(path_to_file)
m = y + 1
# One output file per counted ID: filename1.txt .. filename<y>.txt
files = [open('filename%i.txt' %i, 'w') for i in range(1,m)]
with open(path_to_file) as f:
for line in f:
match = re.search(r'^\d+?\t####\t', line)
if match:
a = match.group()
#files = [open('filename%i.txt' %i, 'w') for i in range(1, m)]
# NOTE(review): `i` is not set by this loop; it is the value left over from
# the list comprehension above, so every write targets the same file.
# Only the matched ID prefix `a` is written, not the article text.
files[i-1].write(a)
for f in files:
f.close()
# The returned file objects have already been closed at this point.
return files
Output result is as follows:
file_split(path)
Out[19]:
[<open file 'filename1.txt', mode 'w' at 0x7fe121b130c0>,
<open file 'filename2.txt', mode 'w' at 0x7fe121b131e0>]
I am new to Python and I am not quite sure where the problem lies. I checked some other answers that addressed the multiple file outputs but cannot figure out the solution. Help would be very much appreciated.
There are two problems with your code:
you write only the line matching the ID (actually, just the match itself), not the rest
you are always writing to the last file, as you use i, the loop variable "left over" from the list comprehension
To fix it, you could change the lower portion of your code to this:
# Replacement for the lower portion of file_split(): one output file per ID
# line, each receiving the ID line and every line up to the next ID.
y = pattern_extract(path_to_file)
files = [open('filename%i.txt' % i, 'w') for i in range(y)]
n = -1  # index of the article currently being written; -1 = none seen yet
try:
    with open(path_to_file) as f:
        for line in f:
            # NOTE: this pattern accepts any whitespace around "####", which
            # is looser than the tab-only pattern in pattern_extract(). If
            # the two ever disagree, `n` can run past the end of `files`.
            if re.search(r'^\d+\s+####\s+', line):
                n += 1
            # Lines before the first ID belong to no article. Without this
            # guard they would be written to files[-1], i.e. the LAST file.
            if n >= 0:
                files[n].write(line)
finally:
    # Close the outputs even if reading or writing fails.
    for out in files:
        out.close()
But you do not have to read the file two times at all, just to count the matches: Just open another file when the line matches an ID line and directly write to that last file in the list, then close all the files.
# Single pass: open a new output file whenever an ID line is found and write
# every line to the most recently opened file.
open_files = []
try:
    with open(path_to_file) as f:
        for line in f:
            if re.search(r'^\d+\s+####\s+', line):
                open_files.append(open('filename%d.txt' % len(open_files), 'w'))
            # Lines before the first ID belong to no article; skipping them
            # avoids an IndexError on the still-empty list.
            if open_files:
                open_files[-1].write(line)
finally:
    # Close the outputs even if reading or writing fails.
    for out in open_files:
        out.close()