Notepad++: Replace find query with words from list - regex

I would like to replace all "var_"
var_
Hello
var_
Whats
var_
Up?
...
with words from this list
alpha
beta
gamma
...
so the end result is
alpha
Hello
beta
Whats
gamma
Up?
...
Would appreciate help on achieving this!

This is sort of impossible / overly complicated with a regex. However, if you combine it with a programming language, you can get it done quickly. E.g. in python it would look like this:
import sys
import re
import fileinput
if len(sys.argv) < 3:
exit("Usage: " + sys.argv[0] + " <filename> <replacements>")
input_file = sys.argv[1]
replacements = sys.argv[2:]
num_of_replacements = len(replacements)
replacement_index = 0
searcher = re.compile("^var_\\b")
for line in fileinput.input(input_file, inplace=True, backup='.bak'):
match = searcher.match(line)
if match is None:
print(line.rstrip())
else:
print(re.sub("^var_\\b", line.rstrip(), replacements[replacement_index]))
replacement_index = replacement_index + 1
Usage: replacer.py ExampleInput.txt alpha beta gamma
Update
It's possible to modify the program to accept the string you search for as the 1st param:
replacer.py "var_" ExampleInput.txt alpha beta gamma
The modified python script looks like this:
import sys
import re
import fileinput
if len(sys.argv) < 4:
exit("Usage: " + sys.argv[0] + " <pattern> <filename> <replacements>")
search = "\\b" + sys.argv[1] + "\\b"
input_file = sys.argv[2]
replacements = sys.argv[3:]
num_of_replacements = len(replacements)
replacement_index = 0
searcher = re.compile(search)
for line in fileinput.input(input_file, inplace=True, backup='.bak'):
match = searcher.match(line)
if match is None:
print(line.rstrip())
else:
print(re.sub(search, line.rstrip(), replacements[replacement_index]))
replacement_index = replacement_index + 1
Note: this script still has a few limitations:
it expects that the string you search for occurs only once each line.
it replaces the searched string only if it's a distinct word
you can accidentally incorporate any python regex syntax into the search param

Related

Text processing to get if else type condition from a string

First of all, I am sorry about the weird question heading. Couldn't express it in one line.
So, the problem statement is,
If I am given the following string --
"('James Gosling'/jamesgosling/james gosling) , ('SUN Microsystem'/sunmicrosystem), keyword"
I have to parse it as
list1 = ["'James Gosling'", 'jamesgosling', 'jame gosling']
list2 = ["'SUN Microsystem'", 'sunmicrosystem']
list3 = [ list1, list2, keyword]
So that, if I enter James Gosling Sun Microsystem keyword it should tell me that what I have entered is 100% correct
And if I enter J Gosling Sun Microsystem keyword it should say i am only 66.66% correct.
This is what I have tried so far.
import re
def main():
print("starting")
sentence = "('James Gosling'/jamesgosling/jame gosling) , ('SUN Microsystem'/sunmicrosystem), keyword"
splited = sentence.split(",")
number_of_primary_keywords = len(splited)
#print(number_of_primary_keywords, "primary keywords length")
number_of_brackets = 0
inside_quotes = ''
inside_quotes_1 = ''
inside_brackets = ''
for n in range(len(splited)):
#print(len(re.findall('\w+', splited[n])), "length of splitted")
inside_brackets = splited[n][splited[n].find("(") + 1: splited[n].find(")")]
synonyms = inside_brackets.split("/")
for x in range(len(synonyms)):
try:
inside_quotes_1 = synonyms[x][synonyms[x].find("\"") + 1: synonyms[n].find("\"")]
print(inside_quotes_1)
except:
pass
try:
inside_quotes = synonyms[x][synonyms[x].find("'") + 1: synonyms[n].find("'")]
print(inside_quotes)
except:
pass
#print(synonyms[x])
number_of_brackets += 1
print(number_of_brackets)
if __name__ == '__main__':
main()
Output is as follows
'James Gosling
jamesgoslin
jame goslin
'SUN Microsystem
SUN Microsystem
sunmicrosyste
sunmicrosyste
3
As you can see, the last letters of some words are missing.
So, if you read this far, I hope you can help me in getting the expected output
Unfortunately, your code has a logic issue that I could not figure it out, however there might be in these lines:
inside_quotes_1 = synonyms[x][synonyms[x].find("\"") + 1: synonyms[n].find("\"")]
inside_quotes = synonyms[x][synonyms[x].find("'") + 1: synonyms[n].find("'")]
which by the way you can simply use:
inside_quotes_1 = synonyms[x][synonyms[x].find("\x22") + 1: synonyms[n].find("\x22")]
inside_quotes = synonyms[x][synonyms[x].find("\x27") + 1: synonyms[n].find("\x27")]
Other than that, you seem to want to extract the words with their indices, which you can extract them using a basic expression:
(\w+)
Then, you might want to find a simple way to locate the indices, where the words are. Then, associate each word to the desired indices.
Example Test
# -*- coding: UTF-8 -*-
import re
string = "('James Gosling'/jamesgosling/james gosling) , ('SUN Microsystem'/sunmicrosystem), keyword"
expression = r'(\w+)'
match = re.search(expression, string)
if match:
print("YAAAY! \"" + match.group(1) + "\" is a match 💚💚💚 ")
else:
print('🙀 Sorry! No matches! Something is not right! Call 911 👮')

Rearranging elements in Python

i am new to Python and i cant get this.I have a List and i want to take the input from there and write those in files .
p = ['Eth1/1', 'Eth1/5','Eth2/1', 'Eth2/4','Eth101/1/1', 'Eth101/1/2', 'Eth101/1/3','Eth102/1/1', 'Eth102/1/2', 'Eth102/1/3','Eth103/1/1', 'Eth103/1/2', 'Eth103/1/3','Eth103/1/4','Eth104/1/1', 'Eth104/1/2', 'Eth104/1/3','Eth104/1/4']
What i am trying :
with open("abc1.txt", "w+") as fw1, open("abc2.txt", "w+") as fw2:
for i in p:
if len(i.partition("/")[0]) == 4:
fw1.write('int ' + i + '\n mode\n')
else:
i = 0
while i < len(p):
start = p[i].split('/')
if (start[0] == 'Eth101'):
i += 3
key = start[0]
i += 1
while i < len(p) and p[i].split('/')[0] == key:
i += 1
end = p[i-1].split('/')
fw2.write('confi ' + start[0] + '/' + start[1] + '-' + end[1] + '\n mode\n')
What i am looking for :
abc1.txt should have
int Eth1/1
mode
int Eth1/5
mode
int Eth2/1
mode
int Eth 2/4
mode
abc2.txt should have :
int Eth101/1/1-3
mode
int Eth102/1/1-3
mode
int Eth103/1/1-4
mode
int Eth104/1/1-4
mode
So any Eth having 1 digit before " / " ( e:g Eth1/1 or Eth2/2
)should be in one file that is abc1.txt .
Any Eth having 3 digit before " / " ( e:g Eth101/1/1 or Eth 102/1/1
) should be in another file that is abc2.txt and .As these are in
ranges , need to write it like Eth101/1/1-3, Eth102/1/1-3 etc
Any Idea ?
I don't think you need a regex here, at all. All your items begin with 'Eth' followed by one or more digits. So you can check the length of the items before first / occurs and then write it to a file.
p = ['Eth1/1', 'Eth1/5','Eth2/1', 'Eth2/4','Eth101/1/1', 'Eth101/1/2', 'Eth101/1/3','Eth102/1/1', 'Eth102/1/2', 'Eth102/1/3','Eth103/1/1', 'Eth103/1/2', 'Eth103/1/3','Eth103/1/4','Eth104/1/1', 'Eth104/1/2', 'Eth104/1/3','Eth104/1/4']
with open("abc1.txt", "w+") as fw1, open("abc2.txt", "w+") as fw2:
for i in p:
if len(i.partition("/")[0]) == 4:
fw1.write('int ' + i + '\n mode\n')
else:
fw2.write('int ' + i + '\n mode\n')
I refactored your code a little to bring with-statement into play. This will handle correctly closing the file at the end. Also it is not necessary to iterate twice over the sequence, so it's all done in one iteration.
If the data is not as clean as provided, then you maybe want to use regexes. Independent of the regex itself, by writing if re.match(r'((Eth\d{1}\/\d{1,2})', "p" ) you proof if a match object can be created for given regex on the string "p", not the value of the variable p. This is because you used " around p.
So this should work for your example. If you really need a regex, this will turn your problem in finding a good regex to match your needs without any other issues.
As these are in ranges , need to write it like Eth101/1/1-3, Eth102/1/1-3 etc
This is something you can achieve by first computing the string and then write it in the file. But this is more like a separate question.
UPDATE
It's not that trivial to compute the right network ranges. Here I can present you one approach which doesn't change my code but adds some functionality. The trick here is to get groups of connected networks which aren't interrupted by their numbers. For that I've copied consecutive_groups. You can also do a pip install more-itertools of course to get that functionality. And also I transformed the list to a dict to prepare the magic and then retransformed dict to list again. There are definitely better ways of doing it, but this worked for your input data, at least.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from itertools import groupby
from operator import itemgetter
p = ['Eth1/1', 'Eth1/5', 'Eth2/1', 'Eth2/4', 'Eth101/1/1', 'Eth101/1/2',
'Eth101/1/3', 'Eth102/1/1', 'Eth102/1/2', 'Eth102/1/3', 'Eth103/1/1',
'Eth103/1/2', 'Eth103/1/3', 'Eth103/1/4', 'Eth104/1/1', 'Eth104/1/2',
'Eth104/1/3', 'Eth104/1/4']
def get_network_ranges(networks):
network_ranges = {}
result = []
for network in networks:
parts = network.rpartition("/")
network_ranges.setdefault(parts[0], []).append(int(parts[2]))
for network, ranges in network_ranges.items():
ranges.sort()
for group in consecutive_groups(ranges):
group = list(group)
if len(group) == 1:
result.append(network + "/" + str(group[0]))
else:
result.append(network + "/" + str(group[0]) + "-" +
str(group[-1]))
result.sort() # to get ordered results
return result
def consecutive_groups(iterable, ordering=lambda x: x):
"""taken from more-itertools (latest)"""
for k, g in groupby(
enumerate(iterable), key=lambda x: x[0] - ordering(x[1])
):
yield map(itemgetter(1), g)
# only one line added to do the magic
with open("abc1.txt", "w+") as fw1, open("abc2.txt", "w+") as fw2:
p = get_network_ranges(p)
for i in p:
if len(i.partition("/")[0]) == 4:
fw1.write('int ' + i + '\n mode\n')
else:
fw2.write('int ' + i + '\n mode\n')

Python regular expression with unicode

I use python 2.7 (I cannot use 3.4),
text = """
saú$_ß$¤×÷asd县阴őasdCharacters: \"县阴 asdsadsasd县阴
"""
text = unicode(text, "utf-8")
print("Method 1\n")
reg = "Characters: \"[\u4e00-\u9fff]+.*?"
reg = unicode(reg, "utf-8")
pattern = re.compile(reg, re.UNICODE | re.MULTILINE)
for m in re.findall(pattern, text): # Number of occurrences in the 'k' line.
print("Results: %s" % m.encode(sys.stdout.encoding, errors='replace'))
print("Method 2\n")
reg = u"Characters: \"[\u4e00-\u9fff]+.*?"
pattern = re.compile(reg, re.UNICODE | re.MULTILINE)
for m in re.findall(pattern, text): # Number of occurrences in the 'k' line.
print("Results: %s" % m.encode(sys.stdout.encoding, errors='replace'))
The output is:
Method 1
Method 2
Results: Characters: "??
The question is how can I make the method 2 result with variables. I didn't find any solution yet and I don't understand why the method 1 doesn't work.
Thanks for any suggestion.
Method 1 does not work because \u#### doesn't mean anything in the case of an encoded sequence. Instead, you need the correct sequence in bytes. If you do this, then method 1 will produce the same results as method 2. I modified your code as follows:
# -*- coding: utf-8 -*-
import sys
import re
text = """
saú$_ß$¤×÷asd县阴őasdCharacters: \"县阴 asdsadsasd县阴
"""
text = unicode(text, "utf-8")
print("\nMethod 1\n")
reg = "Characters: \"[\xe4\xb8\x80-\xe9\xbf\xbf]+.*?"
reg = unicode(reg, "utf-8")
pattern = re.compile(reg, re.UNICODE | re.MULTILINE)
for m in re.findall(pattern, text): # Number of occurrences in the 'k' line.
print("Results: %s" % m.encode(sys.stdout.encoding, errors='replace'))
print("\nMethod 2\n")
reg = u"Characters: \"[\u4e00-\u9fff]+.*?"
pattern = re.compile(reg, re.UNICODE | re.MULTILINE)
for m in re.findall(pattern, text): # Number of occurrences in the 'k' line.
print("Results: %s" % m.encode(sys.stdout.encoding, errors='replace'))
It produces the following results on my machine:
Method 1
Results: Characters: "县阴
Method 2
Results: Characters: "县阴

Error in mapper and reducer with python

There is a problem in the mapper.py file when I run it in the cluster. The error is " unexpected syntax before line" in "strl = line.strip()".
There is no error when I test it locally. I want to get the words of text file stored and change their format and count them and send to the output in s3 bucket.
Guidance most welcome. Thanks
mapper:
import sys, re
for line in sys.stdin:
strl = line.strip()
words = strl.split()
for word in words:
word = word.lower()
result = ""
charref = re.compile("[a-f]")
match = charref.search(word[0])
if match:
result+= "TR2234J"
else:
result+= ""
print result, "\t"
reducer:
import sys
for line in sys.stdin:
line = line.strip()
new_word =""
words = line.split("\t")
final_count = len(words)
my_num = final_count / 6
for i in range (my_num):
new_word = "".join(words[i*6:10+(i*6)])
print new_word, "\t"

regex for detecting subtitle errors

I'm having some issues with subtitles, I need a way to detect specific errors. I think regular expressions would help but need help figuring this one out. In this example of SRT formatted subtitle, line #13 ends at 00:01:10,130 and line #14 begins at 00:01:10:129.
13
00:01:05,549 --> 00:01:10,130
some text here.
14
00:01:10,129 --> 00:01:14,109
some other text here.
Problem is that next line can't begin before current one is over - embedding algorithm doesn't work when that happens. I need to check my SRT files and correct this manually, but looking for this manually in about 20 videos each an hour long just isn't an option. Specially since I need it 'yesterday' (:
Format for SRT subtitles is very specific:
XX
START --> END
TEXT
EMPTY LINE
[line number (digits)][new line character]
[start and end times in 00:00:00,000 format, separated by _space__minusSign__minusSign__greaterThenSign__space_][new line character]
[text - can be any character - letter, digit, punctuation sign.. pretty much anything][new line character]
[new line character]
I need to check if END time is greater then START time of the following subtitle. Help would be appreciated.
PS. I can work with Notepad++, Eclipse (Aptana), python or javascript...
Regular expressions can be used to achieve what you want, that being said, they can't do it on their own. Regular expressions are used for matching patterns and not numerical ranges.
If I where you, what I would do would be as following:
Parse the file and place the start-end time in one data structure (call it DS_A) and the text in another (call it DS_B).
Sort DS_A in ascending order. This should guarantee that you will not have overlapping ranges. (This previous SO post should point you in the right direction).
Iterate over and write the following in your file:j DS_A[i] --> DS_A[i + 1] <newline> DS_B[j] where i is a loop counter for DS_A and j is a loop counter for DS_B.
I ended up writing short script to fix this. here it is:
# -*- coding: utf-8 -*-
from datetime import datetime
import getopt, re, sys
count = 0
def fix_srt(inputfile):
global count
parsed_file, errors_file = '', ''
try:
with open( inputfile , 'r') as f:
srt_file = f.read()
parsed_file, errors_file = parse_srt(srt_file)
except:
pass
finally:
outputfile1 = ''.join( inputfile.split('.')[:-1] ) + '_fixed.srt'
outputfile2 = ''.join( inputfile.split('.')[:-1] ) + '_error.srt'
with open( outputfile1 , 'w') as f:
f.write(parsed_file)
with open( outputfile2 , 'w') as f:
f.write(errors_file)
print 'Detected %s errors in "%s". Fixed file saved as "%s"
(Errors only as "%s").' % ( count, inputfile, outputfile1, outputfile2 )
previous_end_time = datetime.strptime("00:00:00,000", "%H:%M:%S,%f")
def parse_times(times):
global previous_end_time
global count
_error = False
_times = []
for time_code in times:
t = datetime.strptime(time_code, "%H:%M:%S,%f")
_times.append(t)
if _times[0] < previous_end_time:
_times[0] = previous_end_time
count += 1
_error = True
previous_end_time = _times[1]
_times[0] = _times[0].strftime("%H:%M:%S,%f")[:12]
_times[1] = _times[1].strftime("%H:%M:%S,%f")[:12]
return _times, _error
def parse_srt(srt_file):
parsed_srt = []
parsed_err = []
for srt_group in re.sub('\r\n', '\n', srt_file).split('\n\n'):
lines = srt_group.split('\n')
if len(lines) >= 3:
times = lines[1].split(' --> ')
correct_times, error = parse_times(times)
if error:
clean_text = map( lambda x: x.strip(' '), lines[2:] )
srt_group = lines[0].strip(' ') + '\n' + ' --> '.join( correct_times ) + '\n' + '\n'.join( clean_text )
parsed_err.append( srt_group )
parsed_srt.append( srt_group )
return '\r\n'.join( parsed_srt ), '\r\n'.join( parsed_err )
def main(argv):
inputfile = None
try:
options, arguments = getopt.getopt(argv, "hi:", ["input="])
except:
print 'Usage: test.py -i <input file>'
for o, a in options:
if o == '-h':
print 'Usage: test.py -i <input file>'
sys.exit()
elif o in ['-i', '--input']:
inputfile = a
fix_srt(inputfile)
if __name__ == '__main__':
main( sys.argv[1:] )
If someone needs it save the code as srtfix.py, for example, and use it from command line:
python srtfix.py -i "my srt subtitle.srt"
I was lazy and used datetime module to process timecodes, so not sure script will work for subtitles longer then 24h (: I'm also not sure when miliseconds were added to Python's datetime module, I'm using version 2.7.5; it's possible script won't work on earlier versions because of this...