Regex to match despite some of the characters not matching the pattern?

I'm working with some bioinformatics data, and I've got this sed expression:
sed -n 'N;/.*:\(.*\)\n.*\1/{p;n;p;n;p};D' file.txt
It currently operates on a file structured like this:
#E00378:1485 1:N:0:ABC
ABCDEF ##should match, all characters present
+
#
#E00378:1485 1:N:1:ABC
XYZABX ##should match, with permutation
+
#
#E00378:1485 1:N:1:ABCDE
ZABCDXFGH ##should match, with permutation
+
#
#E00378:1485 1:N:1:CBA
ABC ##should not match, order not preserved
+
#
It then returns 4 lines whenever the sequence after the last : is found in the second line, so in this case I would get:
#E00378:1485 1:N:0:ABC
ABCDEF
+
#
However, I am looking to expand my search a little, by allowing exactly one character of the pattern to differ while maintaining the order, such that ABX, ZBC, AHC, ABO would all match the search criteria ABC.
Is a search like this possible to construct as a one-liner? Or should I write a script?
I was thinking it should be possible to programmatically change one of the letters to a wildcard (.) in the pattern space.
I am trying to build something along the lines of an AWK pattern with the match defined as:
p = "";
p = p "."a[2]a[3]a[4]a[5]a[6]a[7]a[8]"|";
p = p a[1]"."a[3]a[4]a[5]a[6]a[7]a[8]"|";
p = p a[1]a[2]"."a[4]a[5]a[6]a[7]a[8]"|";
p = p a[1]a[2]a[3]"."a[5]a[6]a[7]a[8]"|";
p = p a[1]a[2]a[3]a[4]"."a[6]a[7]a[8]"|";
p = p a[1]a[2]a[3]a[4]a[5]"."a[7]a[8]"|";
p = p a[1]a[2]a[3]a[4]a[5]a[6]"."a[8]"|";
p = p a[1]a[2]a[3]a[4]a[5]a[6]a[7]".";
m = p;
But I can't seem to figure out how to build the pattern programmatically for an arbitrary number of letters n.
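(For illustration, the construction above generalizes to any length; here is a minimal Python sketch of the same idea. It is not the sed/awk one-liner I'm after, and the helper name one_mismatch_pattern is made up.)
def one_mismatch_pattern(seq):
    # 'ABC' -> '.BC|A.C|AB.' : one alternative per position, with that
    # position replaced by the '.' wildcard.
    return "|".join(seq[:i] + "." + seq[i + 1:] for i in range(len(seq)))

print(one_mismatch_pattern("ABC"))    # .BC|A.C|AB.
print(one_mismatch_pattern("ABCDE"))  # .BCDE|A.CDE|AB.DE|ABC.E|ABCD.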

Okay, check this out, where fuzzy is a file containing your input above:
£ perl -0043 -MText::Fuzzy -ne 'if (/.*:(.*?)\n(.*?)\n/) {my ($offset, $edits, $distance) = Text::Fuzzy::fuzzy_index ($1, $2); print "$offset $edits $distance\n";}' fuzzy
3 kkk 0
5 kkd 1
5 kkkkd 1
Since you haven't been 100% clear on your "fuzziness" criteria (and can't be until you have a measurement tool), I'll explain this first. Reference here:
http://search.cpan.org/~bkb/Text-Fuzzy-0.27/lib/Text/Fuzzy.pod
Basically, for each record (which I've assumed are split on #, which is what the -0043 bit does), the output is an offset, a description of how the 1st string can become the 2nd string, and lastly the "distance" (Levenshtein, I would assume) between the two strings.
So..
£ perl -0043 -MText::Fuzzy -ne 'if (/.*:(.*?)\n(.*?)\n/) {my ($offset, $edits, $distance) = Text::Fuzzy::fuzzy_index ($1, $2); print "$_\n" if $distance < 2;}' fuzzy
#E00378:1485 1:N:0:ABC
ABCDEF
+
#
#E00378:1485 1:N:1:ABC
XYZABX
+
#
#E00378:1485 1:N:1:ABCDE
ZABCDXFGH
+
#
See here for installing perl modules like Text::Fuzzy
https://www.thegeekstuff.com/2008/09/how-to-install-perl-modules-manually-and-using-cpan-command/
Example input/output for a record that wouldn't be printed (distance is 3):
#E00378:1485 1:N:1:ABCDE
ZDEFDXFGH
+
#
gives us this (or simply doesn't print with the second perl command)
3 dddkk 3
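If it helps to see what that distance roughly measures, here is a small Python sketch of an approximate-substring edit distance (assuming Levenshtein costs; this is only an illustration of the idea, not Text::Fuzzy's actual implementation):
def substring_edit_distance(needle, haystack):
    # Smallest edit distance between needle and any substring of haystack
    # (insertions, deletions and substitutions each cost 1).
    n, m = len(needle), len(haystack)
    prev = [0] * (m + 1)            # row 0 is all zeros: the match may start anywhere
    for i in range(1, n + 1):
        curr = [i] + [0] * m
        for j in range(1, m + 1):
            cost = 0 if needle[i - 1] == haystack[j - 1] else 1
            curr[j] = min(prev[j - 1] + cost,   # match or substitute
                          prev[j] + 1,          # drop a needle character
                          curr[j - 1] + 1)      # skip an extra haystack character
        prev = curr
    return min(prev)                # best match may end anywhere in haystack

print(substring_edit_distance("ABC", "ABCDEF"))       # 0
print(substring_edit_distance("ABC", "XYZABX"))       # 1
print(substring_edit_distance("ABCDE", "ZABCDXFGH"))  # 1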

Awk doesn't have sed's back-references, but it has enough expressiveness to make up the difference. The following script composes the matching pattern from the final field of the lead line, then applies that pattern to the subsequent line.
#! /usr/bin/awk -f
BEGIN {
FS = ":"
}
# Lead Line has 5 fields
NF == 5 {
line0 = $0
seq = $NF
getline
if (seq != "") {
n = length(seq)
if (n == 1) {
pat = seq
} else {
# ABC -> /.BC|A.C|AB./
pat = "." substr(seq, 2, n - 1)
for (i = 2; i < n; ++i)
pat = pat "|" substr(seq, 1, i - 1) "." substr(seq, i + 1, n - i)
pat = pat "|" substr(seq, 1, n - 1) "."
}
if ($0 ~ pat) {
print line0
print
getline; print
getline; print
next
}
}
getline
getline
}
If the above needs adjusting to form a different matching pattern, the changes are mostly confined to the pattern-composition lines. By the way... I noticed that sequences repeat -- to make this faster we can cache the composed patterns:
#! /usr/bin/awk -f
BEGIN {
FS = ":"
# Noticed that sequences repeat
# -- implement caching of patterns
split("", cache)
}
# Lead Line has 5 fields
NF == 5 {
line0 = $0
seq = $NF
getline
if (seq != "") {
if (seq in cache) {
pat = cache[seq]
} else {
n = length(seq)
if (n == 1) {
pat = seq
} else {
# ABC -> /.BC|A.C|AB./
pat = "." substr(seq, 2, n - 1)
for (i = 2; i < n; ++i)
pat = pat "|" substr(seq, 1, i - 1) "." substr(seq, i + 1, n - i)
pat = pat "|" substr(seq, 1, n - 1) "."
}
cache[seq] = pat
}
if ($0 ~ pat) {
print line0
print
getline; print
getline; print
next
}
}
getline
getline
}

Related

Trouble sorting a list after using regex

The code below is parsing data from this text sample:
rf-Parameters-v1020
supportedBandCombination-r10: 128 items
Item 0
BandCombinationParameters-r10: 1 item
Item 0
BandParameters-r10
bandEUTRA-r10: 2
bandParametersUL-r10: 1 item
Item 0
CA-MIMO-ParametersUL-r10
ca-BandwidthClassUL-r10: a (0)
bandParametersDL-r10: 1 item
Item 0
CA-MIMO-ParametersDL-r10
ca-BandwidthClassDL-r10: a (0)
supportedMIMO-CapabilityDL-r10: fourLayers (1)
I am having trouble replacing the first 'a' from the "ca-BandwidthClassUL-r10" line with 'u' and placing it before 'm' in the final output: [2 a(0) u m]
import re
regex = r"bandEUTRA-r10: *(\d+)(?:\r?\n(?!ca-BandwidthClassUL-r10:).*)*\r?\nca-BandwidthClassUL-r10*: *(\w.*)(" \
r"?:\r?\n(?!ca-BandwidthClassDL-r10:).*)*\r?\nca-BandwidthClassDL-r10*: *(" \
r"\w.*)\nsupportedMIMO-CapabilityDL-r10: *(.*) "
regex2 = r"^.*bandEUTRA-r10: *(\d+)(?:\r?\n(?!ca-BandwidthClassUL-r10:).*)*\r?\nca-BandwidthClassUL-r10*: *(\w.*)(?:\r?\n(?!ca-BandwidthClassDL-r10:).*)*\r?\nca-BandwidthClassDL-r10*: *(\w.*)\nsupportedMIMO-CapabilityDL-r10: *(.*)(?:\r?\n(?!bandEUTRA-r10:).*)*\r?\nbandEUTRA-r10: *(\d+)(?:\r?\n(?!ca-BandwidthClassDL-r10:).*)*\r?\nca-BandwidthClassDL-r10*: *(\w.*)\nsupportedMIMO-CapabilityDL-r10: *(.*)"
my_file = open("files.txt", "r")
content = my_file.read().replace("fourLayers", 'm').replace("twoLayers", " ")
#print(content)
#if 'BandCombinationParameters-r10: 1 item' in content:
result = ["".join(m) for m in re.findall(regex, content, re.MULTILINE)]
print(result)
You might use an optional part where you capture group 2.
Then you can print group 3 concatenated with u if there is group 2, else only print group 3.
As you are already matching the text in the regex, you don't have to do the separate replacement calls. You can use the text in the replacement itself.
bandEUTRA-r10: *(\d+)(?:\r?\n(?!ca-BandwidthClassUL-r10:).*)*(?:\r?\n(ca-BandwidthClassUL-r10)?: *(\w.*))(?:\r?\n(?!ca-BandwidthClassDL-r10:).*)*\r?\nca-BandwidthClassDL-r10*: *\w.*\nsupportedMIMO-CapabilityDL-r10:
For example
import re
regex = r"bandEUTRA-r10: *(\d+)(?:\r?\n(?!ca-BandwidthClassUL-r10:).*)*(?:\r?\n(ca-BandwidthClassUL-r10)?: *(\w.*))(?:\r?\n(?!ca-BandwidthClassDL-r10:).*)*\r?\nca-BandwidthClassDL-r10*: *\w.*\nsupportedMIMO-CapabilityDL-r10:"
s = "here the example data with and without ca-BandwidthClassUL-r10"
matches = re.finditer(regex, s, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
    result = "{0}{1} m".format(
        match.group(1),
        match.group(3) + " u" if match.group(2) else match.group(3)
    )
    print(result)
Output
2a (0) u m
2a (0) m
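To run this against the question's actual input rather than the placeholder string s, the content could be read from the same file the question opens (files.txt, per the question's own snippet), e.g.:
with open("files.txt") as f:
    s = f.read()
# ...then run the re.finditer loop above unchanged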

How to remove a whitespace character and insert a string in its place

I'm trying to take a string such as Hello World this is Bob and have it formatted to
:h: :e: :l: :l: :o: | :w: :o: :r: :l: :d: | :t: :h: :i: :s: | :i: :s: | :b: :o: :b:
This is where I'm having the issue:
text = text.scan(/\s/).join(' | ')
def to_clipboard(text)
string = text.to_s
IO.popen('pbcopy', 'w') {|t| t << text}
string
end
#########################################
print "Enter message: "
text = gets.downcase.chomp!
# text = text.scan(/\s/).join(' | ')
formatted = text.scan(/\S[A-Za-z]{0,0}/).join(': :')
formatted.insert(0,':')
formatted[formatted.size, 0] = ':'
# verify expected format
puts formatted
to_clipboard(formatted)
def bannerize(str)
str.downcase.gsub(/./) do |c|
if c == ' '
"| "
else
":#{c}: "
end
end.rstrip
end
bannerize("Hello this is Mugsy")
#=> ":h: :e: :l: :l: :o: | :t: :h: :i: :s: | :i: :s: | :m: :u: :g: :s: :y:"
Alternatively,
def bannerize(str)
h = Hash.new { |_,k| ":#{k}: " }.tap { |h| h[' '] = "| " }
str.downcase.gsub(/./,h).rstrip
end
This uses the form of Hash::new that creates an empty hash h with a default proc, after which the key-value pair ' '=>"| " is added to the hash so that it becomes { ' '=>"| " }. The default proc causes h[k] to return ":#{k}: " if it does not have a key k; that is, if k is anything other than a space.
The form of String#gsub that employs a hash for making substitutions is then used with this hash h.
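For comparison only (not Ruby, and not part of the answer above), the same lookup-with-fallback idea can be sketched in Python, with a dict plus a callback standing in for the hash and its default proc:
import re

def bannerize(s):
    # The dict entry plays the role of the Ruby hash's ' ' => "| " pair;
    # every other character falls back to the ":c: " form, like the default proc.
    special = {" ": "| "}
    return re.sub(r".", lambda m: special.get(m.group(), f":{m.group()}: "),
                  s.lower()).rstrip()

print(bannerize("Hello this is Mugsy"))
# => :h: :e: :l: :l: :o: | :t: :h: :i: :s: | :i: :s: | :m: :u: :g: :s: :y: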
Try to split by space, make the replacement in each word and then combine the results:
words = text.split(/\s+/)
formatted = words.map { |s| s.gsub(/(.)/, ':\1: ') }.join('| ')
I suggest
text = 'Hello World this is Bob'
p text.strip.gsub(/\s+|(\S)/) { |m| m == $~[1] ? ":#{$~[1].downcase}: " : '| ' }.rstrip
## => ":h: :e: :l: :l: :o: | :w: :o: :r: :l: :d: | :t: :h: :i: :s: | :i: :s: | :b: :o: :b:"
See the Ruby demo online.
NOTES:
The string is first stripped of leading/trailing whitespace so that no pipe is added at the start.
gsub(/\s+|(\S)/) { |m| m == $~[1] ? ":#{$~[1].downcase}: " : '| ' } does the main job within one regex replace pass:
/\s+|(\S)/ - matches 1+ whitespaces, or matches and captures into Group 1 any single non-whitespace character
The replacement is a block in which m represents the match value. If the match value equals the Group 1 value (m == $~[1] ?), the replacement is :, the Group 1 value in lower case (#{$~[1].downcase}), and then : followed by a space; otherwise, the replacement is | .
Since there may be a trailing space after gsub, the string is rstripped.

Regex Find any combination of \d and [ -] as long as there is at least K \d together and no more than 10

Hey all, given the examples
1234567890
12 3456789 0
123 456-7890
12345 678 90
123-4-5-6789 0
The total number of digits is fixed, any grouping larger than some arbitrary k (min_group_length) is allowed, and a maximum number of groups can be set (optional but preferred).
I need to identify these with a regex; my current solution is disgusting.
I first find the partitions of 10, then the permutations of those, then convert all of that to a regex, resulting in hundreds of groupings:
printAllUniqueParts(10);
int min_groups = 1;
int min_group_len = 2;
res.RemoveAll(s => s.Split(' ').ToList().Intersect(Enumerable.Range(0, min_group_len).Select(n => n.ToString()).ToList()).Count() >= 1 || s.Split(' ').Length < min_groups || s.Split(' ').Length == 0);
string reg = string.Empty;
for (int i = 1; i < res.Count; i++)
{
res[i] = res[i].Trim();
var r = res[i].Split(' ');
pair[] lp = r.Where(x => x.Length > 0).Select(y => new pair(y)).ToList().ToArray();
var qw = new List<string[]>();
perm(lp, 0, ref qw); // standard permutations List<string>
for (int k = 0; k < qw.Count; k++)
{
string s = "";
var v = string.Join(" ", qw[k]).Split(' ');
for (int j = 0; j < v.Length; j++)
{
s += #"\d{" + v[j] + "}" + (j == v.Length - 1 ? "" : "[ -]");
}
// res[i] = s;
reg += '(' + s + ")" + (k == qw.Count - 1 ? "" : "|");
}
}
This works, but there has to be a computationally cheaper way than the regex below.
Any help appreciated.
(\d{7}[ -]\d{3})|(\d{3}[ -]\d{7})|(\d{6}[ -]\d{4})|(\d{4}[ -]\d{6})|(\d{5}[ -]\d{5})|(\d{5}[ -]\d{5})|(\d{4}[ -]\d{3}[ -]\d{3})(\d{4}[ -]\d{3}[ -]\d{3})(\d{3}[ -]\d{4}[ -]\d{3})(\d{3}[ -]\d{3}[ -]\d{4})(\d{3}[ -]\d{4}[ -]\d{3})(\d{3}[ -]\d{3}[ -]\d{4})
I guess you could try this
^(?=\d(?:\D*\d){9}$)\d{1,7}(?:[ -]\d{1,7})*$
https://regex101.com/r/t9Lnw1/1
Explained
^ # BOS
(?= # Validate the 10 digits first
\d
(?: \D* \d ){9}
$
)
# Then match the string based on grouping of
\d{1,7} # no more than let's say 7 for example
(?: [ -] \d{1,7} )*
$ # EOS
It sounds like you want sequences of at least K and at most 10 digits, but you also want to ignore any single - or space that might appear between two digits. So something like (\d[ -]?){K,10} should do the trick. Obviously the K needs to be replaced by an actual number, and this will incidentally pick up a trailing space or - after the sequence (which you likely just want to ignore anyways.)
If you really must avoid the trailing space or -, you could use \d([ -]?\d){K-1,9}
If you want some more complex structure than this, your best bet may be to use a simple regex that matches a superset of your requirements, and then post-process the matches to eliminate those that don't meet the details.
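A rough Python sketch of that superset-then-filter approach (the parameter values and helper name below are invented for the example, not taken from the question):
import re

min_group_len = 3   # assumed value of K
max_groups = 4      # assumed optional cap on the number of groups
superset = re.compile(r"\d+(?:[ -]\d+)*")   # digit groups separated by single ' ' or '-'

def accept(s):
    # First match the permissive superset, then enforce the detailed rules.
    if not superset.fullmatch(s):
        return False
    groups = re.split(r"[ -]", s)
    return (sum(len(g) for g in groups) == 10                 # fixed total of 10 digits
            and len(groups) <= max_groups
            and all(len(g) >= min_group_len for g in groups))

for s in ["1234567890", "12 3456789 0", "123 456-7890",
          "12345 678 90", "123-4-5-6789 0"]:
    print(s, accept(s))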

How do I capture all occurrences in a string in Vim?

I want to capture certain occurrences in a string in Vimscript.
example:
let my_calculation = '200/3 + 23 + 100.5/3 -2 + 4*(200/2)'
How can I capture all numbers (including dots, if present) before and after the '/' into 2 different variables:
- output before_slash: 200100.5200
- output after_slash: 332
How can I replace them if a condition occurs?
e.g. if the number after a '/' contains no '.', add '.0' after that number.
I tried to use matchstr() and regexes, but after much trying I couldn't work it out.
A useful feature that can be taken advantage of in this case is substitution
with an expression (see :help sub-replace-\=).
" substitute() is called purely for its side effects: the \= expression
" appends each number found before/after a '/' to the lists a and b.
let [a, b] = [[], []]
call substitute(my_calculation, '\(\d*\.\?\d\+\)/\(\d*\.\?\d\+\)\zs',
\ '\=add(a,submatch(1))[1:0]+add(b,submatch(2))[1:0]', 'g')
To answer the second part of the question:
let my_calculation = '200/3 + 23 + 100.5/3 -2 + 4*(200/2)'
echo substitute(my_calculation, '\(\/[0-9]\+\)\([^0-9.]\|$\)', '\1.0\2', 'g')
The above outputs:
200/3.0 + 23 + 100.5/3.0 -2 + 4*(200/2.0)
Give this a try:
function! GetNumbers(string)
let pairs = filter(split(a:string, '[^0-9/.]\+'), 'v:val =~ "/"')
let den = join(map(copy(pairs), 'matchstr(v:val, ''/\zs\d\+\(\.\d\+\)\?'')'), '')
let num = join(map(pairs, 'matchstr(v:val, ''\d\+\(\.\d\+\)\?\ze/'')'), '')
return [num, den]
endfunction
let my_calculation = '200/3 + 23 + 100.5/3 -2 + 4*(200/2)'
let [a,b] = GetNumbers(my_calculation)
echo a
echo b

Regex expression to parse an interesting CSV?

I need to parse a CSV file using AWK. A line in the CSV could look like this:
"hello, world?",1 thousand,"oneword",,,"last one"
Some important observations:
- a field inside a quoted string can contain commas and multiple words
- an unquoted field can contain multiple words
- a field can be empty, indicated by just two commas in a row
Any clues on writing a regex to split this line up properly?
Thanks!
As many have observed, CSV is a harder format than it first appears. There are many edge cases and ambiguities. As an example of the ambiguity: in your sample line, is ',,,' a field containing a comma, or two blank fields?
Perl, Python, Java, etc. are better equipped to deal with CSV because they have well-tested libraries for it. A regex will be more fragile.
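For instance, with Python's standard csv module (shown only to illustrate the library route; it is not the AWK answer the question asks for):
import csv

line = '"hello, world?",1 thousand,"oneword",,,"last one"'
print(next(csv.reader([line])))
# ['hello, world?', '1 thousand', 'oneword', '', '', 'last one']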
With AWK, I have had some success with the AWK function below (from http://lorance.freeshell.org/csv/). It works under awk, gawk and nawk.
#!/usr/bin/awk -f
#**************************************************************************
#
# This file is in the public domain.
#
# For more information email LoranceStinson+csv#gmail.com.
# Or see http://lorance.freeshell.org/csv/
#
# Parse a CSV string into an array.
# The number of fields found is returned.
# In the event of an error a negative value is returned and csverr is set to
# the error. See below for the error values.
#
# Parameters:
# string = The string to parse.
# csv = The array to parse the fields into.
# sep = The field separator character. Normally ,
# quote = The string quote character. Normally "
# escape = The quote escape character. Normally "
# newline = Handle embedded newlines. Provide either a newline or the
# string to use in place of a newline. If left empty embedded
# newlines cause an error.
# trim = When true spaces around the separator are removed.
# This affects parsing. Without this a space between the
# separator and quote result in the quote being ignored.
#
# These variables are private:
# fields = The number of fields found thus far.
# pos = Where to pull a field from the string.
# strtrim = True when a string is found so we know to remove the quotes.
#
# Error conditions:
# -1 = Unable to read the next line.
# -2 = Missing end quote.
# -3 = Missing separator.
#
# Notes:
# The code assumes that every field is preceded by a separator, even the
# first field. This makes the logic much simpler, but also requires a
# separator be prepended to the string before parsing.
#**************************************************************************
function parse_csv(string,csv,sep,quote,escape,newline,trim, fields,pos,strtrim) {
# Make sure there is something to parse.
if (length(string) == 0) return 0;
string = sep string; # The code below assumes ,FIELD.
fields = 0; # The number of fields found thus far.
while (length(string) > 0) {
# Remove spaces after the separator if requested.
if (trim && substr(string, 2, 1) == " ") {
if (length(string) == 1) return fields;
string = substr(string, 2);
continue;
}
strtrim = 0; # Used to trim quotes off strings.
# Handle a quoted field.
if (substr(string, 2, 1) == quote) {
pos = 2;
do {
pos++
if (pos != length(string) &&
substr(string, pos, 1) == escape &&
(substr(string, pos + 1, 1) == quote ||
substr(string, pos + 1, 1) == escape)) {
# Remove escaped quote characters.
string = substr(string, 1, pos - 1) substr(string, pos + 1);
} else if (substr(string, pos, 1) == quote) {
# Found the end of the string.
strtrim = 1;
} else if (newline && pos >= length(string)) {
# Handle embedded newlines if requested.
if (getline == -1) {
csverr = "Unable to read the next line.";
return -1;
}
string = string newline $0;
}
} while (pos < length(string) && strtrim == 0)
if (strtrim == 0) {
csverr = "Missing end quote.";
return -2;
}
} else {
# Handle an empty field.
if (length(string) == 1 || substr(string, 2, 1) == sep) {
csv[fields] = "";
fields++;
if (length(string) == 1)
return fields;
string = substr(string, 2);
continue;
}
# Search for a separator.
pos = index(substr(string, 2), sep);
# If there is no separator the rest of the string is a field.
if (pos == 0) {
csv[fields] = substr(string, 2);
fields++;
return fields;
}
}
# Remove spaces after the separator if requested.
if (trim && pos != length(string) && substr(string, pos + strtrim, 1) == " ") {
trim = strtrim
# Count the number of spaces found.
while (pos < length(string) && substr(string, pos + trim, 1) == " ") {
trim++
}
# Remove them from the string.
string = substr(string, 1, pos + strtrim - 1) substr(string, pos + trim);
# Adjust pos with the trimmed spaces if a quotes string was not found.
if (!strtrim) {
pos -= trim;
}
}
# Make sure we are at the end of the string or there is a separator.
if ((pos != length(string) && substr(string, pos + 1, 1) != sep)) {
csverr = "Missing separator.";
return -3;
}
# Gather the field.
csv[fields] = substr(string, 2 + strtrim, pos - (1 + strtrim * 2));
fields++;
# Remove the field from the string for the next pass.
string = substr(string, pos + 1);
}
return fields;
}
{
num_fields = parse_csv($0, csv, ",", "\"", "\"", "\\n", 1);
if (num_fields < 0) {
printf "ERROR: %s (%d) -> %s\n", csverr, num_fields, $0;
} else {
printf "%s -> \n", $0;
printf "%s fields\n", num_fields;
for (i = 0;i < num_fields;i++) {
printf "%s\n", csv[i];
}
printf "|\n";
}
}
Running it on your example data produces:
"hello, world?",1 thousand,"oneword",,,"last one" ->
6 fields
hello, world?
1 thousand
oneword


last one
|
An example Perl solution:
$ echo '"hello, world?",1 thousand,"oneword",,,"last one"' |
perl -lnE 'for(/(?:^|,)("(?:[^"]+|"")*"|[^,]*)/g) { s/"$//; s/""/"/g if (s/^"//);
say}'
Try this:
^(("(?:[^"]|"")*"|[^,]*)(,("(?:[^"]|"")*"|[^,]*))*)$
I haven't tested it with AWK though.
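Since that last pattern is untested, here is a quick sanity check of it in Python against the sample line (it only verifies that the whole line matches; it does not split out the fields):
import re

pattern = r'^(("(?:[^"]|"")*"|[^,]*)(,("(?:[^"]|"")*"|[^,]*))*)$'
line = '"hello, world?",1 thousand,"oneword",,,"last one"'
print(bool(re.match(pattern, line)))   # True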