perl: string matching to find longest substring - regex

$string1 = "peachbananaapplepear";
$string2 = "juicenaapplewatermelonpear";
I want to know what's the longest common substring containing the word "apple".
$string2 =~ m/.+apple.+/;
print $string2;
So I use the match operator, and .+ for matching any character before and after the keyword "apple". When I print $string2 it doesn't return naapple but returns the original $string2 instead.

Here is one approach. First get the locations where 'apple' appears in the strings. And for each of those locations in string1, look at all locations in string2.
Look to the left and right to see how far the commonality extends from the initial location.
$string1 = "peachbananaapplepear12345applegrapeapplebcdefghijk";
$string2 = "juicenaapplewatermelonpearkiwi12345applebcdefghijkberryapple";
my $SearchFor="apple";
my $SearchStrLen = length($SearchFor);
# Get locations in first string where the search term appears
my #FirstPositions = getPostions($string1);
# Get locations in second string where the search term appears
my #SecondPositions = getPostions($string2);
CheckForMaxMatch();
sub getPostions
{
my $GivenString = shift;
my #Positions;
my $j=0;
for (my $i=0; $i < length($GivenString); $i += ($SearchStrLen+1) )
{
$j = index($GivenString, $SearchFor, $i);
if ($j == -1) {
last;
}
push (#Positions, $j);
$i = $j;
}
return #Positions;
}
sub CheckForMaxMatch
{
my $MaxLeft=0;
# From the location of 'apple', look to the left and right
# to see how far the characters are same
for my $i (#FirstPositions) {
for my $j (#SecondPositions) {
my $LeftMatchPos = getMaxMatch($i, $j, -1);
my $RightMatchPos = getMaxMatch($i, $j, 1);
if ( ($RightMatchPos - $LeftMatchPos) > ($MaxRight - $MaxLeft) ) {
$MaxLeft = $LeftMatchPos;
$MaxRight = $RightMatchPos;
}
}
}
my $LongestSubString = substr($string1, $MaxLeft, $MaxRight-$MaxLeft);
print "Longest common substring is: $LongestSubString\n";
print "It begins at $MaxLeft and ends at $MaxRight in string1\n";
}
sub getMaxMatch
{
my $i= shift;
my $j= shift;
my $direction= shift;
my $k = ($direction >= 1 ? $SearchStrLen : 0);
my $FirstChar = substr($string1, $i+($k * $direction), 1);
my $SecondChar = substr($string2, $j+($k * $direction), 1);
for ( ; $FirstChar && $SecondChar; $k++ )
{
$FirstChar = substr($string1, $i+($k * $direction), 1);
$SecondChar = substr($string2, $j+($k * $direction), 1);
if ( $FirstChar ne $SecondChar ) {
$direction < 1 ? $k-- : "";
my $pos = ($k ? ($i + $k * $direction) : $i);
return $pos;
}
}
return $i;
}

The =~ operator is not going to reassign the value of $string2. Try this:
$string2 =~ m/(.+apple.+)/;
my $match = $1;
print $match

Based on the general algorithm, but tracks not only the length of the current run (#l), but whether it includes the keyword (#k). Only runs that include the keyword are considered for longest run.
use strict;
use warnings;
use feature qw( say );
sub find_substrs {
our $s; local *s = \shift;
our $key; local *key = \shift;
my #positions;
my $position = -1;
while (1) {
$position = index($s, $key, $position+1);
last if $position < 0;
push #positions, $position;
}
return #positions;
}
sub lcsubstr_which_include {
our $s1; local *s1 = \shift;
our $s2; local *s2 = \shift;
our $key; local *key = \shift;
my #key_starts1 = find_substrs($s1, $key)
or return;
my #key_starts2 = find_substrs($s2, $key)
or return;
my #is_key_start1; $is_key_start1[$_] = 1 for #key_starts1;
my #is_key_start2; $is_key_start2[$_] = 1 for #key_starts2;
my #s1 = split(//, $s1);
my #s2 = split(//, $s2);
my $length = 0;
my #rv;
my #l = ( 0 ) x ( #s1 + 1 ); # Last ele is read when $i1==0.
my #k = ( 0 ) x ( #s1 + 1 ); # Same.
for my $i2 (0..$#s2) {
for my $i1 (reverse 0..$#s1) {
if ($s1[$i1] eq $s2[$i2]) {
$l[$i1] = $l[$i1-1] + 1;
$k[$i1] = $k[$i1-1] || ( $is_key_start1[$i1] && $is_key_start2[$i2] );
if ($k[$i1]) {
if ($l[$i1] > $length) {
$length = $l[$i1];
#rv = [ $i1, $i2, $length ];
}
elsif ($l[$i1] == $length) {
push #rv, [ $i1, $i2, $length ];
}
}
} else {
$l[$i1] = 0;
$k[$i1] = 0;
}
}
}
for (#rv) {
$_->[0] -= $length;
$_->[1] -= $length;
}
return #rv;
}
{
my $s1 = "peachbananaapplepear";
my $s2 = "juicenaapplewatermelonpear";
my $key = "apple";
for (lcsubstr_which_include($s1, $s2, $key)) {
my ($s1_pos, $s2_pos, $length) = #$_;
say substr($s1, $s1_pos, $length);
}
}
This solution in O(NM), meaning it scales amazingly well (for what it does).

Related

Print out preceding lines after regex match

I am writing a sort of quiz program. I am using a .txt file as a test bank, but cant figure out how to (using regex's) match each question and print out the possible answers on different lines.I originally was just going to do true false so I didnt need to match anything else and just matching "1" worked fine. Basically I just need the question on one line and the answers on others. Here is an example of a question
1.) some text
a.) solution
b.) solution
c.) solution
code i had before:
while (<$test>) {
foreach my $line (split /\n/) {
my $match1 = "1";
if ($line =~ /$match1/) {
$question1 = $line;
print "$question1\n";
print "Answer: ";
$answer1 = <>;
chomp ($answer1);
if ( $answer1 =~ /(^a$)/i) {
$score1 = 20;
push #score, $score1;
}
}
I really couldn't get what you were getting at, so I wrote this sample program.
use 5.016;
use strict;
use warnings;
my ( #lines, #questions, $current_question );
sub prompt {
my ( $prompt ) = #_;
print $prompt, ' ';
STDOUT->flush;
my $val = <>;
return $val;
}
QUESTION:
while ( <DATA> ) {
if ( my ( $ans ) = m/^=(\w+)/ ) {
INPUT: {
say #lines;
last unless defined( my $answer = prompt( 'Your answer:' ));
say '';
my ( $response ) = $answer =~ /([a-z])\s*$/;
if ( not $response ) {
$answer =~ s/\s*$//; #/
say "Invalid response. '$answer' is not an answer!\n";
redo INPUT;
}
if ( $response eq $ans ) {
say 'You are right!';
}
elsif ( my $ansln = $current_question->{$response} ) {
if ( $response eq 'q' ) {
say 'Quitting...';
last QUESTION;
}
say <<"END_SAY";
You chose:\n$current_question->{$response}
The correct answer was:\n$current_question->{$ans}
END_SAY
}
else {
say "Invalid response. '$response' is not an answer!\n";
redo INPUT;
}
};
#lines = ();
prompt( 'Press enter to continue.' );
say '';
}
else {
if ( my ( $qn, $q ) = m/^\s*(\d+)\.\)\s+(.*\S)\s*$/ ) {
push #questions, $current_question = { question => $q };
}
else {
my ( $l, $a ) = m/^\s+([a-z])/;
$current_question->{$l} = ( m/(.*)/ )[0];
}
push #lines, $_;
}
}
__DATA__
1.) Perl is
a.) essential
b.) fun
c.) useful
=c
2.) This question is
a.) Number two
b.) A test to see how this format is parsed.
c.) Unneeded
=b
This is probably over simplified.
It just reads in the test data and creates a structure.
You could use it to grade test takers answers.
use strict;
use warnings;
use Data::Dumper;
$/ = undef;
my $testdata = <DATA>;
my %HashTest = ();
my $hchoices;
my $hqeustion;
my $is_question = 0;
while ( $testdata =~ /(^.*)\n/mg )
{
my $line = $1;
$line =~ s/^\s+|\s+$//g;
next if ( length( $line ) == 0);
if ( $line =~ /^(\d+)\s*\.\s*\)\s*(.*)/ )
{
$is_question = 1;
$HashTest{ $1 }{'question'} = $2;
$HashTest{ $1 }{'choices'} = {};
$HashTest{ $1 }{'answer'} = 'unknown';
$hqeustion = $HashTest{ $1 };
$hchoices = $HashTest{ $1 }{'choices'};
}
elsif ( $is_question && $line =~ /^\s*(answer)\s*:\s*([a-z])/ )
{
$hqeustion->{'answer'} = $2;
}
elsif ( $is_question && $line =~ /^\s*([a-z])\s*\.\s*\)\s*(.*)/ )
{
$hchoices->{ $1 } = $2;
}
}
print "\nQ & A summary\n-------------------------\n";
for my $qnum ( keys %HashTest )
{
print "Question $qnum: $HashTest{$qnum}{'question'}'\n";
my $ans_code = $HashTest{$qnum}{'answer'};
print "Answer: ($ans_code) $HashTest{$qnum}{'choices'}{$ans_code}\n\n";
}
print "---------------------------\n";
print Dumper(\%HashTest);
__DATA__
1.) What is the diameter of the earth?
a.) Half the distance to the sun
b.) Same as the moon
c.) 6,000 miles
answer: c
2.) Who is buried in Grants Tomb?
a.) Thomas Edison
b.) Grant, who else
c.) Jimi Hendrix
answer: b
Output:
Q & A summary
-------------------------
Question 1: What is the diameter of the earth?'
Answer: (c) 6,000 miles
Question 2: Who is buried in Grants Tomb?'
Answer: (b) Grant, who else
---------------------------
$VAR1 = {
'1' => {
'question' => 'What is the diameter of the earth?',
'answer' => 'c',
'choices' => {
'c' => '6,000 miles',
'a' => 'Half the distance to the sun',
'b' => 'Same as the moon'
}
},
'2' => {
'question' => 'Who is buried in Grants Tomb?',
'answer' => 'b',
'choices' => {
'c' => 'Jimi Hendrix',
'a' => 'Thomas Edison',
'b' => 'Grant, who else'
}
}
};

Can one use regex to wildcard a set of whole words in Perl?

This is nuts, I mean pseudocode, but something like this:
/[January, February, March] \d*/
Should match things like January 13 or February 26, and so on...
WHAT I'M DOING:
my $url0 = 'http://www.registrar.ucla.edu/calendar/acadcal13.htm';
my $url1 = 'http://www.registrar.ucla.edu/calendar/acadcal14.htm';
my $url2 = 'http://www.registrar.ucla.edu/calendar/acadcal15.htm';
my $url3 = 'http://www.registrar.ucla.edu/calendar/acadcal16.htm';
my $url4 = 'http://www.registrar.ucla.edu/calendar/acadcal17.htm';
my $url5 = 'http://www.registrar.ucla.edu/calendar/sumcal.htm';
my $document0 = get($url0);
my $document1 = get($url1);
my $document2 = get($url2);
my $document3 = get($url3);
my $document4 = get($url4);
my $document5 = get($url5);
my #dates0 = ($document0 =~ /(January|February|March|April|May|June|July|August|September|October|November|December) \d+/g);
my #dates1 = ($document1 =~ /(January|February|March|April|May|June|July|August|September|October|November|December) \d+/g);
my #dates2 = ($document2 =~ /(January|February|March|April|May|June|July|August|September|October|November|December) \d+/g);
my #dates3 = ($document3 =~ /(January|February|March|April|May|June|July|August|September|October|November|December) \d+/g);
my #dates4 = ($document4 =~ /(January|February|March|April|May|June|July|August|September|October|November|December) \d+/g);
my #dates5 = ($document5 =~ /(January|February|March|April|May|June|July|August|September|October|November|December) \d+/g);
foreach(#dates0)
{
print "$_\r\n";
}
foreach(#dates1)
{
print "$_\r\n";
}
foreach(#dates2)
{
print "$_\r\n";
}
foreach(#dates3)
{
print "$_\r\n";
}
foreach(#dates4)
{
print "$_\r\n";
}
foreach(#dates5)
{
print "$_\r\n";
}
These printing gadgets give the following result: http://pastebin.com/7z13gBqt
This is not good:
http://tinypic.com/r/nqpapx/8
Yes. You can use an alternation.
/(January|February|March|April|May|June|July|August|September|October|November|December) \d*/
Would do that.
If you already have them in an array, then you can change the variable $LIST_SEPARATOR to string them into an alternation. And then parenthesize the whole
use English qw<$LIST_SEPARATOR>; # In line-noise: $"
my $date_regex
= do { local $LIST_SEPARATOR = '|';
qr/(?:#months) \d*/ # ?: if you don't want the capture
};
This gives you a compiled expression, which you can reuse like so:
my #dates;
while ( my $url = <DATA> ) {
my $document = get( $url );
push #dates, [ $document =~ /($date_regex)/g ];
push #dates, $date;
}
__DATA__
http://www.registrar.ucla.edu/calendar/acadcal13.htm
http://www.registrar.ucla.edu/calendar/acadcal14.htm
http://www.registrar.ucla.edu/calendar/acadcal15.htm
http://www.registrar.ucla.edu/calendar/acadcal16.htm
http://www.registrar.ucla.edu/calendar/acadcal17.htm
http://www.registrar.ucla.edu/calendar/sumcal.htm

Character match count between strings in Perl

I have a string (say string 1) that needs to be matched to another string (string2). Both the strings will have the same length and are case in-sensitive.
I want to print the number of character matches between both the strings.
E.g.: String 1: stranger
String 2: strangem
Match count = 7
I tried this:
$string1 = "stranger";
$string2 = "strangem";
my $count = $string1 =~ m/string2/ig;
print "$count\n";
How can I fix this?
Exclusive or, then count the null characters (where the strings were the same):
my $string1 = "stranger";
my $string2 = "strangem";
my $count = ( lc $string1 ^ lc $string2 ) =~ tr/\0//;
print "$count\n";
I missed the "case in-sensitive" bit.
You can use substr for that:
#!/usr/bin/perl
use warnings;
use strict;
my $string1=lc('stranger');
my $string2=lc('strangem');
my $count=0;
for (0..length($string1)-1) {
$count++ if substr($string1,$_,1) eq substr($string2,$_,1);
}
print $count; #prints 7
Or you can use split to get all characters as an array, and loop:
#!/usr/bin/perl
use warnings;
use strict;
my $string1=lc('stranger');
my $string2=lc('strangem');
my $count=0;
my #chars1=split//,$string1;
my #chars2=split//,$string2;
for (0..$#chars1) {
$count++ if $chars1[$_] eq $chars2[$_];
}
print $count; #prints 7
(fc gives more accurate results than lc, but I went for backwards compatibility.)
Not tested
sub cm
{
my #a = shift;
my #b = shift;
# First match prefix of string:
my $n = 0;
while ($n < $#a && $n < $#b && $a[$n] eq $b[$n]) {
++$n;
}
# Then skip one char on either side, and recurse.
if ($n < $#a && $n < $#b) {
# Match rest by skipping one place:
my $n2best = 0;
my $n2a = cm(splice(#a, $n), splice(#b, $n + 1));
$n2best = $n2a;
my $n2b = cm(splice(#a, $n + 1), splice(#b, $n));
$n2best = $n2b if $n2b > $n2best;
my $n2c = cm(splice(#a, $n + 1), splice(#b, $n + 1));
$n2best = $n2c if $n2c > $n2best;
$n += $n2best;
}
return $n;
}
sub count_matches
{
my $a = shift;
my $b = shift;
my #a_chars = split //, $a;
my #b_chars = split //, $b;
return cm(#a_chars, #b_chars);
}
print count_matches('stranger', 'strangem')

Use of uninitialized value in perl

So I have this code:
use warnings;
use strict;
my #arr = ("stuff (06:13)", "more stuff (02:59)", "extra stuff (00:00)");
my #new_arr = map { /\((\d+:\d+)\)/ ; $1 } #arr;
my ( $sum, $hrs, $mins );
$sum = 0;
for my $t (#new_arr) {
my ( $h, $m ) = split m/:/, $t;
my $hm = $h * 3600;
my $tm = $m * 60;
$sum = $sum + $hm + $tm;
}
$mins = sprintf( "%02d", ( $sum % 3600 ) / 60 );
$hrs = int( $sum / 3600 );
print "$hrs:$mins\n";
and I got uninitialized value error
Use of uninitialized value $t in split at DR/Hello World/test.pl line 14.
Use of uninitialized value $h in multiplication (*) at DR/test.pl line 16.
Use of uninitialized value $m in multiplication (*) at DR/test.pl line 17.
so how can I fix that?
stuff (3+06:13) doesn't match /\((\d+:\d+)\)/, so $1 is left untouched, so $1 contains undef, so undef ends up in #arr.
It's unwise to use $1 without making sure the pattern matches. Either adjust the pattern to make sure it always matches,
/\(([\d+]+:\d+)\)/
Or filter out the results that don't match.
my #new_arr = map { /\((\d+:\d+)\)/ ? $1 : () } #arr;
-or-
my #new_arr = map { /\((\d+:\d+)\)/ } #arr;
You have a similar problem with map { /\((\d+\++)/ ; $1 }.
You're missing a capture in two of your regexes.
Your first one:
my #new_arr = map { /\((\d+:\d+)\)/ ; $1 } #arr;
Misses a capture in the first instance:
$VAR2 = [
undef,
'02:59',
'00:00'
];
Which can be corrected (see below).
Your second capture also fails to capture anything:
my #x = map {/\((\d+\++)/ ; $1 } #arr;
See:
$VAR3 = [
'3+',
undef,
undef
];
This is because your asking it to find a digit \d followed by a literal + one or more times, which only occurs in $arr[0]. Below i've adjusted to capture 0 if no capture is found:
#!/usr/bin/perl
use warnings;
use strict;
use Data::Dumper;
my #arr = ("stuff (3+06:13)", "more stuff (02:59)", "extra stuff (00:00)");
my #new_arr = map { /\(.*?(\d+:\d+)\)/ ; $1 } #arr;
my #x = map {/\((\d+\+)|(0)/ ; $1 // $2 } #arr;
my ( $sum, $hrs, $mins );
$sum = 0;
for my $t (#new_arr) {
my ( $h, $m ) = split m/:/, $t;
my $hm = $h * 3600;
my $tm = $m * 60;
$sum = $sum + $hm + $tm;
}
$mins = sprintf( "%02d", ( $sum % 3600 ) / 60 );
$hrs = int( $sum / 3600 );
print "$hrs:$mins\n";
print Dumper (\#arr, \#new_arr, \#x);
$VAR1 = [
'stuff (3+06:13)',
'more stuff (02:59)',
'extra stuff (00:00)'
];
$VAR2 = [
'06:13',
'02:59',
'00:00'
];
$VAR3 = [
'3+',
'0',
'0'
];
Output:
9:12

How to match sequence group?

say, the given string is abcwhateverdefwhatever34567whatever012 How to match those group which are in sequence like match abc, def, 34567,012?
the regex i have now is (.)\1{2,} but it matches the same characters but not in sequence
If you're still looking for PHP code.
function getSequence($str) {
$prev = 0; $next = 0; $length = strlen($str);
$temp = "";
for($i = 0; $i < $length; $i++) {
$next = ord($str[$i]);
if ($next == $prev + 1) {
$temp .= $str[$i];
} else {
if (strlen($temp) > 1) $result[] = $temp;
$temp = $str[$i];
}
$prev = $next;
}
if (strlen($temp) > 1) $result[] = $temp;
return $result;
}
$str = "abcwhateverdefwhatever34567whatever012";
print_r(getSequence($str));
Here's a solution that solves the problem with regex. It's not very efficient though and I wouldn't recommend it.
from re import findall, X
text = "abcwhateverdefwhatever34567whatever012"
reg = r"""
(?:
(?:0(?=1))|
(?:(?<=0)1)|(?:1(?=2))|
(?:(?<=1)2)|(?:2(?=3))|
(?:(?<=2)3)|(?:3(?=4))|
(?:(?<=3)4)|(?:4(?=5))|
(?:(?<=4)5)|(?:5(?=6))|
(?:(?<=5)6)|(?:6(?=7))|
(?:(?<=6)7)|(?:7(?=8))|
(?:(?<=7)8)|(?:8(?=9))|
(?:(?<=8)9)|
(?:a(?=b))|
(?:(?<=a)b)|(?:b(?=c))|
(?:(?<=b)c)|(?:c(?=d))|
(?:(?<=c)d)|(?:d(?=e))|
(?:(?<=d)e)|(?:e(?=f))|
(?:(?<=e)f)
){1,}
"""
print findall(reg, text, X)
The result is:
['abc', 'def', '34567', '012']
As you can see I only added the numbers and the first 6 letters in the alphabet. It's should be fairly obvious how to continue.