Changing all occurences of similar word in csv python - python-2.7

I want to replace one specific word, 'my' with 'your'. But seems my code can only change one appearance.
import csv
path1 = "/home/bankdata/levelout.csv"
path2 = "/home/bankdata/leveloutmodify.csv"
in_file = open(path1,"rb")
reader = csv.reader(in_file)
out_file = open(path2,"wb")
writer = csv.writer(out_file)
with open(path1, 'r') as csv_file:
csvreader = csv.reader(csv_file)
col_count = 0
for row in csvreader:
while row[col_count] == 'my':
print 'my is used'
row[col_count] = 'your'
#writer.writerow(row[col_count])
writer.writerow(row)
col_count +=1
let's say the sentences is
'my book is gone and my bag is missing'
the output is
your book is gone and my bag is missing
the second thing is I want to make it appear without comma separated:
print row
the output is
your,book,is,gone,and,my,bag,is,missing,

for the second problem, im still trying to find the correct one as it keeps giving me the same output with comma separated.
with open(path1) as infile, open(path2, "w") as outfile:
for row in infile:
outfile.write(row.replace(",", ""))
print row
it gives me the result:
your,book,is,gone,and,my,bag,is,missing
I send out this sentence to my Nao robot and the robot seems pronouncing awkwardly as there are commas in between each word.
I solved it by:
with open(path1) as infile, open(path2, "w") as outfile:
for row in infile:
outfile.write(row.replace(",", ""))
with open(path2) as out:
for row in out:
print row
It gives me what I want:
your book is gone and your bag is missing too
However, any better way to do it?

Related

matching an entire list with each and every line of file

I had written a piece of code that basically performs find and replace from a list on a text file.
So, it maps the entire list into a dictionary. Then from text file each and every line is processed and is matched with entire list in the dictionary if a match anywhere in the line is found it replaces with corresponding value from the list(dictionary).
Here is the code:
import sys
import re
#open file using open file mode
fp1 = open(sys.argv[1]) # Open file on read mode
lines = fp1.read().split("\n") # Create a list containing all lines
fp1.close() # Close file
fp2 = open(sys.argv[2]) # Open file on read mode
words = fp2.read().split("\n") # Create a list containing all lines
fp2.close() # Close file
word_hash = {}
for word in words:
#print(word)
if(word != ""):
tsl = word.split("\t")
word_hash[tsl[0]] = tsl[1]
#print(word_hash)
keys = word_hash.keys()
#skeys = sorted(keys, key=lambda x:x.split(" "),reverse=True)
#print(keys)
#print (skeys)
for line in lines:
if(line != ""):
for key in keys:
#my_regex = key + r"\b"
my_regex = r"([\"\( ])" + key + r"([ ,\.!\"।)])"
#print(my_regex)
if((re.search(my_regex, line, re.IGNORECASE|re.UNICODE))):
line = re.sub(my_regex, r"\1" + word_hash[key]+r"\2",line,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
#print("iam :1",line)
if((re.search(key + r"$", line, re.IGNORECASE|re.UNICODE))):
line = re.sub(key+r"$", word_hash[key],line,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
#print("iam :2",line)
if((re.search(r"^" + key, line, re.IGNORECASE|re.UNICODE))):
#print(line)
line = re.sub(r"^" + key, word_hash[key],line,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
#print("iam :",line)
print(line)
else:
print(line)
Problem here is when the list size grows execution slows up as all the lines of text file are matched with each and every key in list. So where can I improve the execution of this code.
List file:
word1===>replaceword1
word2===>replaceword2
.....
List is tab seperated. Here I used ===> for easy understanding.
Input file:
hello word1 I am here.
word2. how are you word1?
Expected Output:
hello replaceword1 I am here.
replaceword2. how are you replaceword1?
If your word list is small enough, the best speedup you can achieve with the match-and-replace process is to use a single big regexp and use a functionnal re.sub
This way you have a single call to the optimised function.
EDIT: In order to preserve order of replacements (this can lead to chain replacing, don't know if intended behavior) we can perform replacement by batches rather than in a single run, where batches order respects file order and each batch is made of disjoint possible string matches.
The code would be as follow
import sys
import re
word_hashes = []
def insert_word(word, replacement, hashes):
if not hashes:
return [{word: replacement}]
for prev_word in hashes[0]:
if word in prev_word or prev_word in word:
return [hashes[0]] + insert_word(word, replacement, hashes[1:])
hashes[0][word] = replacement
return hashes
with open(sys.argv[2]) as fp2: # Open file on read mode
words = fp2.readlines()
for word in [w.strip() for w in words if w.strip()]:
tsl = word.split("\t")
word_hashes = insert_word(tsl[0],tsl[1], word_hashes)
#open file using open file mode
lines = []
with open(sys.argv[1]) as fp1:
content = fp1.read()
for word_hash in word_hashes:
my_regex = r"([\"\( ])(" + '|'.join(word_hash.keys()) + r")([ ,\.!\"।)])"
content = re.sub(my_regex, lambda x: x.group(1) + word_hash[x.group(2)] + x.group(3) ,content,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
print(content)
We obtain chained replacement for the example data. For example, with the following words to replace
roses are red==>flowers are blue
are==>is
Text to parse
roses are red and beautiful
flowers are yellow
Output
roses is red and beautiful
flowers is yellow
Why don't you read the content of the entire file in a string, and just do string.replace. For example.
def find_replace():
txt = ''
#Read text from the file as a string
with open('file.txt', 'r') as fp:
txt = fp.read()
dct = {"word1":"replaceword1","word2":"replaceword2"}
#Find and replace characters
for k,v in dct.items():
txt = txt.replace(k,v)
#Write back the modified string
with open('file.txt', 'w') as fp:
fp.write(txt)
If the input file is:
hello word1 I am here.
word2. how are you word1?
The output will be:
hello replaceword1 I am here.
replaceword2. how are you replaceword1?

Saving line after line from a txt file

So I wrote this code:
import csv
data = []
filename = "S:\Doc\Python\Data\Dekomp\Hth.txt"
with open(filename) as f:
lines = f.readlines()
for line in lines:
if line.startswith('%'):
data.append(line.split('+')[0].strip())
if line.endswith('%'):
break
with open('S:\Doc\Python\Data\Dekomp\Test.csv', 'w') as f:
writer = csv.writer(f, delimiter=' ')
for line in data:
writer.writerow(line.split())
And my data looks like this:
Headline starts with "%th=number", while number changes from 2 to 180 (each segment plus 2, so it goes (2,4,6... up to180).
Between those segments I have three columns of data, which I would like to append to a csv file. While using my code I save only headliners so (%th=2, %th=4... %th=180). Do you have any idea how to change my code so it will start reading headline, then append data below to a .txt or .csv file, and then starts loop again when it "sees" another headline and continue the process with saving next segment to another file, and that up to "%th=180"?
UPDATE:
Input:
Expected output:
That the program will append to another file all the data below "%th=number", and then when the following segment appears it will save to another file, and the process will continue till the end of this file.
In other words each segment starts with even number so (2, 4, 6, 8 ... 180) so I should get 90 files, each for every segment.
UPDATE 2:
So I have change my code:
with open("S:\Doc\Python\Data\Dekomp\Hth.txt", 'r') as f:
with open("S:\Doc\Python\Data\Dekomp\Hth2.txt", 'w') as g:
for line in f:
if line.startswith("%"):
g.write(line)
if line.endswith("%"):
break
But right now the problem is that if I put this startswith and endswith python will save only headliner, if I delete them, the obivous thing happens, it saves everything from input file.
data = []
filename = "S:\Doc\Python\Data\Dekomp\Hth.txt"
with open(filename) as f:
lines = f.readlines() # Reading file
def _get_all_starting_index(data): # Calculating index of all lines starting with %
return [data.index(line) for line in data if line.startswith("%")]
indices= _get_all_starting_index(lines)
data_info_to_write_in_file = {} # for storing data to write in each individual file
for i in range(len(indices)): # looping over number of indices
key = lines[indices[i]] # key value for starting of a segment.
end_point = indices[i+1] if len(indices) > i+1 else len(indices) # finding end point.
lines_to_get = lines[indices[i]+1 : end_point] # getting lines in between and storing it in dictionary
data_info_to_write_in_file[key] = lines_to_get
for key in data_info_to_write_in_file.keys(): # writing info in each individual file
filename = "S:\Doc\Python\Data\Dekomp\{}.txt".format(key.strip().split("=")[-1])
with open(filename, 'w') as f:
for line in data_info_to_write_in_file[key]:
f.write(line)
Hope it will help.
Feel free to get any info.

Compare value in a column in one row of csv file to value in next row using python

I have been reading up on the csv.reader next but did not see a way to compare the values in a column from one row to the next. For instance, if my data looked like this in Maps.csv file:
County1 C:/maps/map1.pdf
County1 C:/maps/map2.pdf
County2 C:/maps/map1.pdf
County2 C:/maps/map3.pdf
County3 C:/maps/map3.pdf
County4 C:/maps/map2.pdf
County4 C:/maps/map4.pdf
If line two's county equals line one's county do something
The following code compares rows, I want to compare the county values between current and previous rows.
import csv.
f = open("Maps.csv", "r+")
ff = csv.reader(f)
pre_line = ff.next()
while(True):
try:
cur_line = ff.next()
if pre_line == cur_line:
print "Matches"
pre_line = cur_line
except:
break
I know I can grab the current value (see below) but do not know how to grab previous value. Is this possible? If so, could someone please tell me how. On day three of trying to solve writing my script to append pdf files from a csv file and am about to toss my coffee cup at my monitor. I am breaking these down into smaller parts and using simpler data as pilot. My file is much larger. I was advised to focus on just one issue at a time when posting to this forum. This is my latest issue. It seems no matter what tack I take, I can't seem to read the data the way I want. Arrrggghhhhh.
CurColor = row[color]
Using python 2.7
You already know how to look up the previous row. Why not get the column you need from that row?
import csv.
f = open("Maps.csv", "r+")
ff = csv.reader(f)
pre_line = ff.next()
while(True):
try:
cur_line = ff.next()
if pre_line[0] == cur_line[0]: # <-- compare first column
print "Matches"
pre_line = cur_line
except:
break
or more simply:
pre_line = ff.next()
for cur_line in ff:
if pre_line[0] == cur_line[0]: # <-- compare first column
print "Matches"
pre_line = cur_line
import csv
f = open("Maps.csv", "r+")
# Use delimiters to split each line into different elements
# In my example i used a comma. Your csv may have a different delimiter
# make sure the delimiter is a single character string though
# so no multiple spaces between "County1 C:/maps/map1.pdf"
# it should be something like "County1,C:/maps/map1.pdf"
ff = csv.reader(f, delimiter=',')
COUNTY_INDEX = 0
# each time ff.next() is called, it makes an array variable ['County1', 'C:/maps/map1.pdf ']
# since you want to compare the value in the first index, then you need to reference it like so
# the line below will set pre_line = 'County1'
pre_line = ff.next()[COUNTY_INDEX]
while(True):
try:
# the current line will be 'County1' or 'County2' etc...Depending on which line is read
cur_line = ff.next()[COUNTY_INDEX]
if pre_line == cur_line:
print "Matches"
pre_line = cur_line
except:
break

Mutliple output files created but empty

I am trying to split one file with two articles in it into two separate files with one article in each, for subsequent analysis of the articles. Each article in the initial file has an ID that I want to use to separate the files with, using RE.
Below is the initial input file, with ID number:
166068619 #### "Epilepsy: let's end our ignorance of this neglected condition
Helen Stephens is a young woman with epilepsy [...]."
106899978 #### "Great British Payoff shows that BBC governance is broken
If it was a television series, they'd probably call it [...]."
However, when I run my code, I do get two separate files as an output but they are empty.
This is my code:
def file_split(path_to_file):
"""Function splits bigger file into N smaller ones, based on a certain RE
match, that is used to break the bigger file into smaller ones"""
def pattern_extract(path_to_file):
"""Function identifies the number of RE occurences in a file,
No. can be used in further analysis as range No."""
import re
x = []
with open(path_to_file) as f:
for line in f:
match = re.search(r'^\d+?\t####\t', line)
if match:
a = match.group()
x.append(a)
return len(x)
y = pattern_extract(path_to_file)
m = y + 1
files = [open('filename%i.txt' %i, 'w') for i in range(1,m)]
with open(path_to_file) as f:
for line in f:
match = re.search(r'^\d+?\t####\t', line)
if match:
a = match.group()
#files = [open('filename%i.txt' %i, 'w') for i in range(1, m)]
files[i-1].write(a)
for f in files:
f.close()
return files
Output result is as follows:
file_split(path)
Out[19]:
[<open file 'filename1.txt', mode 'w' at 0x7fe121b130c0>,
<open file 'filename2.txt', mode 'w' at 0x7fe121b131e0>]
I am new to Python and I am not quite sure where the problem lies. I checked some other answers that addressed the multiple file outputs but cannot figure out the solution. Help would be very much appreciated.
There are two problems with your code:
you write only the line matching the ID (actually, just the match itself), not the rest
you are always writing to the last file, as you use i, the loop variable "left over" from the list comprehension
To fix it, you could change the lower portion of your code to this:
y = pattern_extract(path_to_file)
files = [open('filename%i.txt' %i, 'w') for i in range(y)]
n = -1
with open(path_to_file) as f:
for line in f:
if re.search(r'^\d+\s+####\s+', line):
n += 1
files[n].write(line)
But you do not have to read the file two times at all, just to count the matches: Just open another file when the line matches an ID line and directly write to that last file in the list, then close all the files.
open_files = []
with open(path_to_file) as f:
for line in f:
if re.search(r'^\d+\s+####\s+', line):
open_files.append(open('filename%d.txt' % len(open_files), 'w'))
open_files[-1].write(line)
for f in open_files:
f.close()

Converting a text file into a dictionary in python

So, I'm a bit new to Python and I've come across the following problem in one of my codes:
I have a txt file with the following text:
Jolly 77777
Fargo 88888
Hunt 68548
I want to convert it into a dictionary with BOTH the name and number as keys... Here's what I have so far but I keep getting a traceback error and am not sure as to what error I am making. It's driving me nuts; Help?
This is what I have so far:
filename = open("ident.txt","r")
dictionary={}
with open('ident.txt','r') as f:
for line in f.readlines():
a,b = line.split()
dictionary[a] = int(b)
You're close:
dictionary = {}
with open('ident.txt','r') as f:
for line in f:
a,b = line.split()
dictionary[a] = int(b)
That yields a dictionary value of:
{'Fargo': 88888, 'Hunt': 68548, 'Jolly': 77777}
FWIW, the line filename = open("ident.txt","r") isn't going to do you any favors, since filename will end up being an open file, not a filename. And you don't need f.readlines(). Files iterate fine on their own, line by line.