How to replace multiple value in python with the re module - regex

I need to replace some text inside a file with the python re module.
Here is the input value :
<li><span class="PCap CharOverride-4">Contrôles</span> <span class="PCap CharOverride-4">Testes</span></li>
and the expected output is this :
<li><span class="PCap CharOverride-4">C<span style="font-size:83%">ONTRôLES</span></span>
<span class="PCap CharOverride-4">T<span style="font-size:83%">ESTES</span></span></li>
but instead, I get this as the result :
<li><span class="PCap CharOverride-4">C<span style="font-size:83%">ONTRôLES</span></span> <span class="PCap CharOverride-4">C<span style="font-size:83%">ONTRôLES</span></span></li>
Is there something that I missed ?
Here is what I've done so far :
# Rewrite each matching <balise_name class="...class_value..."> element so that its first
# capital letter is kept and the remaining letters are upper-cased inside a smaller span.
# NOTE(review): indentation was lost in this paste; the nesting is presumably
# for -> if -> for, with everything down to re.sub inside the inner loop - confirm.
for line in file_data.readlines():
#print(line)
# Pattern: opening tag carrying class_value (b1), one capital letter (maj), the lowercase
# letters that follow (min), then the closing tag (b2).
# NOTE(review): the pattern does not depend on `line`, yet it is rebuilt for every line.
reg = re.compile(r'(?P<b1>(<'+balise_name+' class="(([a-zA-Z0-9_\-]*?) |)'+class_value+')(| ([a-zA-Z0-9_\-]*?))">)(?P<maj>([A-ZÀÁÂÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝ]))(?P<min>([a-zàáâãäåæçèéëìíîïðòóôõöøùúûüýÿµœš]*?))(?P<b2>(<\/'+balise_name+'>))')
#print(reg)
# findall returns one tuple per match, holding every capture group in pattern order.
search = reg.findall(line)
print(search)
# NOTE(review): findall returns a list (possibly empty), never None, so this test is always true.
if (search != None):
for matchObj in search:
print(matchObj)
#print(matchObj[8])
print(line)
# Tuple index k is capture group k + 1: 0 -> b1, 6 -> maj, 8 -> min, 10 -> b2.
balise1 = matchObj[0] #search.group('b1')
print(balise1)
balise2 = matchObj[10] #matchObj.group('b2')
print(balise2)
maj = matchObj[6] #matchObj.group('maj')
print(maj)
min = matchObj[8] #matchObj.group('min')
print(min)
sub_str = balise1+""+maj+"<span style=\"font-size:83%\">"+min.upper()+"</span>"+balise2
# NOTE(review): re.sub replaces EVERY match of reg in line with this single sub_str, so the
# text built from the first match also overwrites the later matches on the same line.
line = re.sub(reg, sub_str, line)
# open the file to append the line
filename = file_name.split(".")
#file_result = open(filename[0]+"-OK."+filename[1], "a")
#file_result.writelines(line)
#file_data.writelines(line)
#file_result.close()
print(line)
NB : I don't know how to use Python's BeautifulSoup module, which is why I am doing this manually.
Pardon me for my poor english.
Thanks for your answer !!

So, I totally forgot about this question but here is the solution I came up with after fixing the code I wrote long time ago :
# Build the pattern once: it depends only on balise_name and class_value, not on the line.
# Every piece is a raw string so no backslash is interpreted by the Python parser.
#   b1  - opening tag carrying class_value, optionally with one class before and/or after it
#   maj - the leading capital letter
#   min - the lowercase letters that follow
#   b2  - the closing tag
reg = re.compile(
    r'(?P<b1><' + balise_name + r' class="(?:[a-zA-Z0-9_\-]*? |)' + class_value
    + r'(?:| [a-zA-Z0-9_\-]*?)">)'
    + r'(?P<maj>[A-ZÀÁÂÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝ])'
    + r'(?P<min>[a-zàáâãäåæçèéëìíîïðòóôõöøùúûüýÿµœš]*?)'
    + r'(?P<b2></' + balise_name + r'>)'
)

for line in file_data.readlines():
    print(line)
    # Handle the matches one by one, each with its own replacement text.
    search = reg.search(line)
    while search:
        print(search)
        balise1 = search.group('b1')
        print('b1 : ' + str(balise1))
        balise2 = search.group('b2')
        print('b2 : ' + str(balise2))
        maj = search.group('maj')
        print('maj : ' + str(maj))
        min_letters = search.group('min')
        print('min : ' + str(min_letters))
        sub_str = (balise1 + maj + "<span style=\"font-size:83%\">"
                   + min_letters.upper() + "</span>" + balise2)
        print(sub_str)
        # Splice by position instead of calling re.sub with the matched text as a
        # pattern: the matched text is not a valid/escaped regex in general, and a
        # failed substitution would leave the line unchanged and loop forever.
        line = line[:search.start()] + sub_str + line[search.end():]
        print(line)
        # Resume scanning after the text just inserted so the loop always advances.
        search = reg.search(line, search.start() + len(sub_str))
Here is what I changed with the code :
Fix some unescaped char inside the pattern
Iterate the result one by one
Fix group number for the sub function
Hope it will help someone who faced the same problem as me.

Related

Text processing to get if else type condition from a string

First of all, I am sorry about the weird question heading. Couldn't express it in one line.
So, the problem statement is,
If I am given the following string --
"('James Gosling'/jamesgosling/james gosling) , ('SUN Microsystem'/sunmicrosystem), keyword"
I have to parse it as
list1 = ["'James Gosling'", 'jamesgosling', 'jame gosling']
list2 = ["'SUN Microsystem'", 'sunmicrosystem']
list3 = [ list1, list2, keyword]
So that, if I enter James Gosling Sun Microsystem keyword it should tell me that what I have entered is 100% correct
And if I enter J Gosling Sun Microsystem keyword it should say i am only 66.66% correct.
This is what I have tried so far.
import re
# Split the sentence on commas, take the text between "(" and ")" of each part, split that
# on "/" into synonyms, and print each synonym with its quotes removed.
# NOTE(review): indentation was lost in this paste; nesting is presumably
# for n -> for x -> two try/except blocks - confirm.
def main():
print("starting")
sentence = "('James Gosling'/jamesgosling/jame gosling) , ('SUN Microsystem'/sunmicrosystem), keyword"
splited = sentence.split(",")
number_of_primary_keywords = len(splited)
#print(number_of_primary_keywords, "primary keywords length")
number_of_brackets = 0
inside_quotes = ''
inside_quotes_1 = ''
inside_brackets = ''
for n in range(len(splited)):
#print(len(re.findall('\w+', splited[n])), "length of splitted")
# str.find returns -1 when the character is absent, so a part without brackets is sliced
# as [0:-1] and loses its last character.
inside_brackets = splited[n][splited[n].find("(") + 1: splited[n].find(")")]
synonyms = inside_brackets.split("/")
for x in range(len(synonyms)):
try:
# NOTE(review): the slice start is computed on synonyms[x] but the slice end on synonyms[n]
# (outer loop index). When the quote is not found the end is -1, which drops the last
# character; when n >= len(synonyms) the IndexError is silently swallowed below.
inside_quotes_1 = synonyms[x][synonyms[x].find("\"") + 1: synonyms[n].find("\"")]
print(inside_quotes_1)
except:
pass
try:
# NOTE(review): same x / n index mix-up as above, and find() locates the FIRST quote for
# both ends rather than the opening and the closing one.
inside_quotes = synonyms[x][synonyms[x].find("'") + 1: synonyms[n].find("'")]
print(inside_quotes)
except:
pass
#print(synonyms[x])
number_of_brackets += 1
print(number_of_brackets)
if __name__ == '__main__':
main()
Output is as follows
'James Gosling
jamesgoslin
jame goslin
'SUN Microsystem
SUN Microsystem
sunmicrosyste
sunmicrosyste
3
As you can see, the last letters of some words are missing.
So, if you read this far, I hope you can help me in getting the expected output
Unfortunately, your code has a logic issue that I could not fully figure out; however, it is probably in these lines:
inside_quotes_1 = synonyms[x][synonyms[x].find("\"") + 1: synonyms[n].find("\"")]
inside_quotes = synonyms[x][synonyms[x].find("'") + 1: synonyms[n].find("'")]
which by the way you can simply use:
inside_quotes_1 = synonyms[x][synonyms[x].find("\x22") + 1: synonyms[n].find("\x22")]
inside_quotes = synonyms[x][synonyms[x].find("\x27") + 1: synonyms[n].find("\x27")]
Other than that, you seem to want to extract the words with their indices, which you can extract them using a basic expression:
(\w+)
Then, you might want to find a simple way to locate the indices, where the words are. Then, associate each word to the desired indices.
Example Test
# -*- coding: UTF-8 -*-
import re
string = "('James Gosling'/jamesgosling/james gosling) , ('SUN Microsystem'/sunmicrosystem), keyword"
expression = r'(\w+)'
# re.search stops at the first run of word characters; use re.findall or
# re.finditer with the same expression to collect every word.
match = re.search(expression, string)
if match:
    print("YAAAY! \"" + match.group(1) + "\" is a match 💚💚💚 ")
else:
    print('🙀 Sorry! No matches! Something is not right! Call 911 👮')

Rearranging elements in Python

i am new to Python and i cant get this.I have a List and i want to take the input from there and write those in files .
p = ['Eth1/1', 'Eth1/5','Eth2/1', 'Eth2/4','Eth101/1/1', 'Eth101/1/2', 'Eth101/1/3','Eth102/1/1', 'Eth102/1/2', 'Eth102/1/3','Eth103/1/1', 'Eth103/1/2', 'Eth103/1/3','Eth103/1/4','Eth104/1/1', 'Eth104/1/2', 'Eth104/1/3','Eth104/1/4']
What i am trying :
# Write short interface names (4-character prefix before the first "/") to abc1.txt and
# collapsed ranges for the other names to abc2.txt.
# NOTE(review): indentation was lost in this paste, so the nesting of the while loops under
# the else branch is an assumption - confirm against the original post.
with open("abc1.txt", "w+") as fw1, open("abc2.txt", "w+") as fw2:
for i in p:
# "Eth" plus one digit gives a 4-character prefix.
if len(i.partition("/")[0]) == 4:
fw1.write('int ' + i + '\n mode\n')
else:
# NOTE(review): this rebinds the for-loop variable `i` to an integer index and rescans p
# from the start.
i = 0
while i < len(p):
start = p[i].split('/')
# NOTE(review): hard-coded skip for the 'Eth101' prefix; purpose is not evident from here.
if (start[0] == 'Eth101'):
i += 3
key = start[0]
i += 1
# Advance past every following entry that shares the same prefix before the first "/".
while i < len(p) and p[i].split('/')[0] == key:
i += 1
end = p[i-1].split('/')
# Emits "<prefix>/<second field of first entry>-<second field of last entry>".
fw2.write('confi ' + start[0] + '/' + start[1] + '-' + end[1] + '\n mode\n')
What i am looking for :
abc1.txt should have
int Eth1/1
mode
int Eth1/5
mode
int Eth2/1
mode
int Eth2/4
mode
abc2.txt should have :
int Eth101/1/1-3
mode
int Eth102/1/1-3
mode
int Eth103/1/1-4
mode
int Eth104/1/1-4
mode
So any Eth having 1 digit before " / " ( e:g Eth1/1 or Eth2/2
)should be in one file that is abc1.txt .
Any Eth having 3 digit before " / " ( e:g Eth101/1/1 or Eth 102/1/1
) should be in another file that is abc2.txt and .As these are in
ranges , need to write it like Eth101/1/1-3, Eth102/1/1-3 etc
Any Idea ?
I don't think you need a regex here, at all. All your items begin with 'Eth' followed by one or more digits. So you can check the length of the items before first / occurs and then write it to a file.
p = ['Eth1/1', 'Eth1/5','Eth2/1', 'Eth2/4','Eth101/1/1', 'Eth101/1/2', 'Eth101/1/3','Eth102/1/1', 'Eth102/1/2', 'Eth102/1/3','Eth103/1/1', 'Eth103/1/2', 'Eth103/1/3','Eth103/1/4','Eth104/1/1', 'Eth104/1/2', 'Eth104/1/3','Eth104/1/4']

# Route every interface to one of two files in a single pass. The files are only
# written, never read back, so plain "w" mode is sufficient.
with open("abc1.txt", "w") as fw1, open("abc2.txt", "w") as fw2:
    for i in p:
        # "Eth" plus a single digit gives a 4-character prefix before the first "/".
        if len(i.partition("/")[0]) == 4:
            fw1.write('int ' + i + '\n mode\n')
        else:
            fw2.write('int ' + i + '\n mode\n')
I refactored your code a little to bring with-statement into play. This will handle correctly closing the file at the end. Also it is not necessary to iterate twice over the sequence, so it's all done in one iteration.
If the data is not as clean as provided, then you may want to use regexes. Independent of the regex itself, by writing if re.match(r'((Eth\d{1}\/\d{1,2})', "p" ) you test whether a match object can be created for the given regex on the literal string "p", not on the value of the variable p. This is because you put quotes (") around p.
So this should work for your example. If you really need a regex, this will turn your problem in finding a good regex to match your needs without any other issues.
As these are in ranges , need to write it like Eth101/1/1-3, Eth102/1/1-3 etc
This is something you can achieve by first computing the string and then write it in the file. But this is more like a separate question.
UPDATE
It's not that trivial to compute the right network ranges. Here I can present you one approach which doesn't change my code but adds some functionality. The trick here is to get groups of connected networks which aren't interrupted by their numbers. For that I've copied consecutive_groups. You can also do a pip install more-itertools of course to get that functionality. And also I transformed the list to a dict to prepare the magic and then retransformed dict to list again. There are definitely better ways of doing it, but this worked for your input data, at least.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from itertools import groupby
from operator import itemgetter
p = ['Eth1/1', 'Eth1/5', 'Eth2/1', 'Eth2/4', 'Eth101/1/1', 'Eth101/1/2',
'Eth101/1/3', 'Eth102/1/1', 'Eth102/1/2', 'Eth102/1/3', 'Eth103/1/1',
'Eth103/1/2', 'Eth103/1/3', 'Eth103/1/4', 'Eth104/1/1', 'Eth104/1/2',
'Eth104/1/3', 'Eth104/1/4']
def get_network_ranges(networks):
    """Collapse interface names into ``prefix/first-last`` ranges.

    Each name is split at its last "/" into a prefix and a trailing port
    number. Ports sharing a prefix are grouped into runs of consecutive
    numbers; a run of one is emitted as ``prefix/n`` and a longer run as
    ``prefix/first-last``.

    Args:
        networks: iterable of names such as ``'Eth101/1/2'``.

    Returns:
        A list of strings ordered naturally, i.e. embedded numbers compare
        numerically so ``Eth2/1`` comes before ``Eth101/1/1``.

    Raises:
        ValueError: if the text after the last "/" is not an integer.
    """
    ports_by_prefix = {}
    for network in networks:
        prefix, _, port = network.rpartition("/")
        # A set drops duplicates, which would otherwise break a consecutive
        # run into two overlapping groups.
        ports_by_prefix.setdefault(prefix, set()).add(int(port))

    result = []
    for prefix, ports in ports_by_prefix.items():
        for group in consecutive_groups(sorted(ports)):
            group = list(group)
            if len(group) == 1:
                result.append(prefix + "/" + str(group[0]))
            else:
                result.append(prefix + "/" + str(group[0]) + "-" +
                              str(group[-1]))
    # A plain string sort would place 'Eth101...' before 'Eth2...'.
    result.sort(key=_natural_key)
    return result


def _natural_key(text):
    """Return a sort key in which digit runs inside *text* compare as integers."""
    import re  # local import: only this helper needs it

    parts = re.split(r"(\d+)", text)
    # re.split with one capturing group alternates text / digits, so the odd
    # positions are always the digit runs.
    return [int(part) if index % 2 else part
            for index, part in enumerate(parts)]


def consecutive_groups(iterable, ordering=lambda x: x):
    """Yield iterators over runs of consecutive items (taken from more-itertools).

    Within a run of consecutive values, ``position - ordering(item)`` is
    constant, so grouping on that difference separates the runs.
    """
    for k, g in groupby(
        enumerate(iterable), key=lambda x: x[0] - ordering(x[1])
    ):
        yield map(itemgetter(1), g)
# only one line added to do the magic
# It runs before the files are opened, so a failure while computing the ranges
# cannot leave two freshly truncated, empty output files behind.
p = get_network_ranges(p)
with open("abc1.txt", "w") as fw1, open("abc2.txt", "w") as fw2:
    for i in p:
        # "Eth" plus a single digit gives a 4-character prefix before the first "/".
        if len(i.partition("/")[0]) == 4:
            fw1.write('int ' + i + '\n mode\n')
        else:
            fw2.write('int ' + i + '\n mode\n')

Matching word pattern with character pattern

So I have a very interesting question where I have a long string s such as:
eatsleepeatwalksleepwalk
and a smaller string p such as:
esetst
so on a quick look you can deduce that:
eat = e
sleep = s
walk = t
The problem statement is to tell whether the pattern of characters in smaller string p matches the words in the bigger string s
Size of s = 0 to 1000
Size of p = 0 to 1000
I'm aware of simple pattern matching using KMP, however this problem seems quite tricky and I'm unable to get to a starting point of solving this problem.
Any hints?
Edit 1: Look at @Neverever's answer below. Seems quite interesting, awaiting examination of space/time complexity.
Tried to solve it using JavaScript RegExp
// On click, translate the pattern string `p` into a regular expression and test
// the long string `s` against it: the first occurrence of each pattern character
// becomes a capturing group "(.+)", and every later occurrence becomes a
// back-reference to that group.
$("button").click(function () {
  const pattern = $("#p").val();
  const text = $("#s").val();
  const seen = []; // distinct pattern characters, in order of first appearance
  let source = "";

  for (const ch of pattern) {
    const pos = seen.indexOf(ch);
    if (pos !== -1) {
      // Already mapped: refer back to its group (groups are numbered from 1).
      source += "\\" + (pos + 1);
      continue;
    }
    seen.push(ch);
    source += "(.+)";
  }

  const matcher = new RegExp("^" + source + "$");
  console.log("RegExp used: " + source)
  console.log("Result: " + matcher.test(text));
});
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<label>String `s`: <input type="text" id="s" value="eatsleepeatwalksleepwalk" /></label><br>
<label>String `p`: <input type="text" id="p" value="esetst" /></label><br>
<button type="button">Run</button>

regex for detecting subtitle errors

I'm having some issues with subtitles, I need a way to detect specific errors. I think regular expressions would help but need help figuring this one out. In this example of SRT formatted subtitle, line #13 ends at 00:01:10,130 and line #14 begins at 00:01:10,129.
13
00:01:05,549 --> 00:01:10,130
some text here.
14
00:01:10,129 --> 00:01:14,109
some other text here.
Problem is that next line can't begin before current one is over - embedding algorithm doesn't work when that happens. I need to check my SRT files and correct this manually, but looking for this manually in about 20 videos each an hour long just isn't an option. Specially since I need it 'yesterday' (:
Format for SRT subtitles is very specific:
XX
START --> END
TEXT
EMPTY LINE
[line number (digits)][new line character]
[start and end times in 00:00:00,000 format, separated by _space__minusSign__minusSign__greaterThenSign__space_][new line character]
[text - can be any character - letter, digit, punctuation sign.. pretty much anything][new line character]
[new line character]
I need to check if the END time is greater than the START time of the following subtitle. Help would be appreciated.
PS. I can work with Notepad++, Eclipse (Aptana), python or javascript...
Regular expressions can be used to achieve what you want, that being said, they can't do it on their own. Regular expressions are used for matching patterns and not numerical ranges.
If I were you, what I would do is the following:
Parse the file and place the start-end time in one data structure (call it DS_A) and the text in another (call it DS_B).
Sort DS_A in ascending order. This should guarantee that you will not have overlapping ranges. (This previous SO post should point you in the right direction).
Iterate over and write the following in your file: DS_A[i] --> DS_A[i + 1] <newline> DS_B[j] where i is a loop counter for DS_A and j is a loop counter for DS_B.
I ended up writing short script to fix this. here it is:
# -*- coding: utf-8 -*-
from datetime import datetime
import getopt, re, sys
count = 0
def fix_srt(inputfile):
    """Fix overlapping timings in one SRT file and save the results beside it.

    The file is read, passed through parse_srt, and two files are written:
    ``<name>_fixed.srt`` (the whole corrected subtitle) and
    ``<name>_error.srt`` (only the groups that had to be corrected). A
    summary using the module-level ``count`` is printed afterwards.

    If the input cannot be read or parsed, a message is printed and no output
    file is written, rather than silently writing two empty files.

    Args:
        inputfile: path of the SRT file to process.
    """
    import os  # local import: needed only to derive the output names

    try:
        with open(inputfile, 'r') as f:
            srt_file = f.read()
        parsed_file, errors_file = parse_srt(srt_file)
    except (IOError, OSError, ValueError, IndexError) as err:
        print('Could not process "%s": %s' % (inputfile, err))
        return

    # splitext removes only the final extension; splitting on every "." would
    # also mangle names such as "my.movie.srt" or "./subs/file.srt".
    base = os.path.splitext(inputfile)[0]
    outputfile1 = base + '_fixed.srt'
    outputfile2 = base + '_error.srt'
    with open(outputfile1, 'w') as f:
        f.write(parsed_file)
    with open(outputfile2, 'w') as f:
        f.write(errors_file)
    print('Detected %s errors in "%s". Fixed file saved as "%s" '
          '(Errors only as "%s").' % (count, inputfile, outputfile1, outputfile2))
# End time of the most recently parsed subtitle; parse_times compares each new
# start time against it and then updates it.
previous_end_time = datetime.strptime("00:00:00,000", "%H:%M:%S,%f")


def parse_times(times):
    """Parse one ``[start, end]`` pair of SRT time codes and fix an overlap.

    If the start lies before the end of the previously parsed subtitle, it is
    moved to that end time, the module-level ``count`` is incremented and the
    pair is flagged as an error. The module-level ``previous_end_time`` is
    then set to this pair's end time.

    Args:
        times: sequence of two ``HH:MM:SS,mmm`` strings; surrounding
            whitespace (including a stray ``\\r``) is ignored.

    Returns:
        ``([start, end], error)`` with both times re-formatted as
        ``HH:MM:SS,mmm`` strings and ``error`` True if the start was moved.

    Raises:
        ValueError: if *times* does not hold exactly two time codes or one of
            them does not match the expected format.
    """
    global previous_end_time
    global count
    if len(times) != 2:
        raise ValueError('expected "START --> END", got %r' % (times,))
    start, end = [datetime.strptime(time_code.strip(), "%H:%M:%S,%f")
                  for time_code in times]
    error = start < previous_end_time
    if error:
        start = previous_end_time
        count += 1
    previous_end_time = end
    # %f renders six digits of microseconds; the first 12 characters are
    # exactly HH:MM:SS,mmm.
    formatted = [start.strftime("%H:%M:%S,%f")[:12],
                 end.strftime("%H:%M:%S,%f")[:12]]
    return formatted, error
def parse_srt(srt_file):
    """Check every subtitle group of an SRT document and fix overlapping times.

    The text is split into groups at blank lines. For each group with at
    least three lines (index, timing, text) the timing line is passed to
    parse_times; a group whose start had to be moved is rebuilt with the
    corrected timing and also collected separately. Other groups are kept
    as they were.

    Args:
        srt_file: complete contents of the SRT file as one string.

    Returns:
        ``(fixed, errors)``: the whole subtitle with corrected timings, and
        only the corrected groups. In both strings the groups are separated
        by a blank line, as the SRT format requires.
    """
    parsed_srt = []
    parsed_err = []
    normalized = srt_file.replace('\r\n', '\n').strip('\n')
    # Two or more newlines in a row separate groups; tolerating extra blank
    # lines keeps the index and timing lines at positions 0 and 1.
    for srt_group in re.split(r'\n{2,}', normalized):
        lines = srt_group.split('\n')
        if len(lines) >= 3:
            times = lines[1].split(' --> ')
            correct_times, error = parse_times(times)
            if error:
                clean_text = [text.strip(' ') for text in lines[2:]]
                srt_group = (lines[0].strip(' ') + '\n'
                             + ' --> '.join(correct_times) + '\n'
                             + '\n'.join(clean_text))
                parsed_err.append(srt_group)
        parsed_srt.append(srt_group)
    # Joining with a single line break would glue consecutive subtitles
    # together; the blank separator line has to be put back.
    return '\n\n'.join(parsed_srt), '\n\n'.join(parsed_err)
def main(argv):
    """Parse the command-line options and run fix_srt on the requested file.

    Recognised options: ``-h`` prints the usage line and exits; ``-i FILE`` /
    ``--input FILE`` names the SRT file to fix.

    Args:
        argv: argument list without the program name.

    Raises:
        SystemExit: after printing the usage line for ``-h`` (no exit code),
            or with code 2 for an unknown option or a missing input file.
    """
    usage = 'Usage: test.py -i <input file>'
    try:
        options, arguments = getopt.getopt(argv, "hi:", ["input="])
    except getopt.GetoptError:
        # Without stopping here `options` would be undefined below.
        print(usage)
        sys.exit(2)

    inputfile = None
    for o, a in options:
        if o == '-h':
            print(usage)
            sys.exit()
        elif o in ['-i', '--input']:
            inputfile = a

    if inputfile is None:
        # Nothing to process; do not hand None to fix_srt.
        print(usage)
        sys.exit(2)
    fix_srt(inputfile)


if __name__ == '__main__':
    main(sys.argv[1:])
If someone needs it save the code as srtfix.py, for example, and use it from command line:
python srtfix.py -i "my srt subtitle.srt"
I was lazy and used the datetime module to process timecodes, so I'm not sure the script will work for subtitles longer than 24h (: I'm also not sure when milliseconds were added to Python's datetime module, I'm using version 2.7.5; it's possible the script won't work on earlier versions because of this...

Shortcut to change a line of words to a vertical list in Sublime Text 2

Is it possible to turn the title on line 1 into a list of items, one for each word or symbol separated by a space, with a keyboard shortcut? That way I could select the title, hit the shortcut, and it would turn the title into a list of items like below:
Tried saving the Key Binding file.
Nothing built in, but you can do it with a plugin.
import sublime
import sublime_plugin
import re
class SplitLineCommand(sublime_plugin.TextCommand):
    """Text command that breaks a line, or the selected text, into several lines."""

    def run(self, edit, split_pattern=" "):
        """Replace every match of *split_pattern* with a newline.

        With an empty cursor the whole current line is processed. With a
        selection only the selected text is processed, and a newline is
        added before and/or after it when the selection does not start or
        end at a line boundary. Afterwards the new text is selected and
        ``split_selection_into_lines`` is run on it.

        Only a single cursor is handled; with several cursors nothing happens.

        Args:
            edit: edit object passed in by Sublime Text.
            split_pattern: regular expression marking the split points.
        """
        view = self.view
        cursors = view.sel()
        if len(cursors) != 1:
            return

        cursor = cursors[0]
        # Offsets keep the added leading/trailing newline out of the final selection.
        begin_offset = 0
        end_offset = 0
        if cursor.empty():
            region = view.line(cursor)
            new_content = re.sub(split_pattern, "\n", view.substr(region))
        else:
            region = cursor
            content = view.substr(region)
            new_content = ""
            if view.line(region).begin() != region.begin():
                # Selection starts mid-line: push it onto a line of its own.
                new_content = "\n"
                begin_offset = 1
            new_content += re.sub(split_pattern, "\n", content)
            if view.line(region).end() != region.end():
                # Selection ends mid-line: push the remainder onto the next line.
                new_content += "\n"
                end_offset = -1
        view.replace(edit, region, new_content)

        cursors.clear()
        cursors.add(sublime.Region(region.begin() + begin_offset,
                                   region.begin() + len(new_content) + end_offset))
        view.run_command("split_selection_into_lines")
You can then add the following in your key binding file.
[
{ "keys": ["f8"], "command": "split_line", "args": {"split_pattern": " "}}
]
Of course changing the key to something that you want. You don't actually need the args argument if you are just using a space. It defaults to that. I just included it for completeness.
Edit:
I've updated the plugin so it now handles selections, though it does not handle multiple cursors at this point.
Edit 2
If it is not working, try opening the console and entering view.run_command("split_line"). This will run the command in whatever view you were in prior to switching to the console. This way you know if the command actually works. If it doesn't then there is a problem with the plugin. If it does, then there is a problem with the key binding.
I adapted the above code for my own use, so that it now respects whitespace. But I hard-coded tabs instead of spaces, so if you use spaces you might have to change it further. It also now assumes you have no text selected and instead have the cursor in the middle of the line to be changed to vertical spacing. I left intro/outro as arguments so you can also use it for [] or (), although maybe some more escaping is needed in that case for the regex.
Before:
fields = { 'Team1', 'Team2', 'Player1', 'Player2', 'Tab=Round', 'DateTime_UTC=DateTime', 'HasTime=TimeEntered', 'OverviewPage=Tournament', 'ShownName', 'Winner', 'Stream' },
After:
fields = {
'Team1',
'Team2',
'Player1',
'Player2',
'Tab=Round',
'DateTime_UTC=DateTime',
'HasTime=TimeEntered',
'OverviewPage=Tournament',
'ShownName',
'Winner',
'Stream',
},
import sublime
import sublime_plugin
import re
class SplitLineCommand(sublime_plugin.TextCommand):
    """Text command that explodes a one-line ``{ a, b, c }`` list onto separate lines."""

    def run(self, edit, sep=",", repl= "\n", intro="{", outro="}"):
        """Rewrite the line under the cursor with one list item per line.

        The line's leading whitespace characters are counted and re-emitted
        as that many tabs; items get one extra tab. A separator is placed
        after the last item, and the closing delimiter goes on its own line
        at the original depth.

        Only a single, empty cursor is handled; otherwise nothing happens.

        Args:
            edit: edit object passed in by Sublime Text.
            sep: item separator.
            repl: text inserted at each break (a newline by default).
            intro: opening delimiter of the list.
            outro: closing delimiter of the list.
        """
        view = self.view
        cursors = view.sel()
        if len(cursors) != 1:
            return
        cursor = cursors[0]
        if not cursor.empty():
            return

        region = view.line(cursor)
        content = view.substr(region)
        tabs = len(content) - len(content.lstrip())
        item_indent = '\t' * (tabs + 1)

        # Delimiters and separator are escaped so "[", "(" and the like are
        # matched literally instead of being parsed as regex syntax.
        find = re.escape(sep + ' ') + '*(?! *$| *\n)'
        intro_pattern = re.escape(intro) + ' *'
        outro_pattern = '(?:' + re.escape(sep) + ')? *' + re.escape(outro)

        intro_repl = intro + repl + item_indent
        item_repl = sep + repl + item_indent
        outro_repl = sep + repl + ('\t' * tabs) + outro

        # Order matters: the closing delimiter is handled first so the separator
        # added in front of it is followed by a newline and is skipped by `find`.
        # Callables keep the replacement text literal (no backslash processing).
        content = re.sub(outro_pattern, lambda match: outro_repl, content)
        content = re.sub(find, lambda match: item_repl, content)
        content = re.sub(intro_pattern, lambda match: intro_repl, content)
        view.replace(edit, region, content)

        cursors.clear()
        cursors.add(sublime.Region(region.begin(), region.begin() + len(content)))
        view.run_command("split_selection_into_lines")