perl: refer to groups in foreach loop - regex

Can I refer to groups in a foreach loop? my code below works:
#SEP_line=grep(/,\s(SEP[A-F0-9]*),\s(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}),\s(\+\d+),\s.*,\s(\w+)\n$/, #lines);
foreach (#SEP_line)
{
/,\s(SEP[A-F0-9]*),\s(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}),\s(\+\d+),\s.*,\s(\w+)\n$/;
print $1.",".$2.",".$3.",".$4."\n";
}
Since I already specified the match regex in #SEP_line definition, I'd do something like this:
#SEP_line=grep(/,\s(SEP[A-F0-9]*),\s(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}),\s(\+\d+),\s.*,\s(\w+)\n$/, #lines);
foreach (#SEP_line)
{
print #SEP_line[$1].",".#SEP_line[$2].",".#SEP_line[$3]."\n";
}
This doesnt work. Thanks in advance

Why loop twice (grep, foreach)? You can do it all in one loop:
foreach my $line (#lines)
{
if ($line =~ /,\s(SEP[A-F0-9]*),\s(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}),\s(\+\d+),\s.*,\s(\w+)\n$/)
{
print "$1,$2,$3,$4\n";
}
}

Don't hand-parse and hand-generate CSV! It's simpler and cleaner to use a CSV parser.
use Text::CSV_XS qw( );
my $fh_in = ...;
my $fh_out = \*STDOUT;
my $csv_in = Text::CSV_XS->new({ binary => 1, auto_diag => 2, sep => ', ' });
my $csv_out = Text::CSV_XS->new({ binary => 1, auto_diag => 2 });
while ( my $row = $csv_in->getline($fh_in) ) {
$csv_out->say($fh_out, [ #$row[-4,-3,-2,-1] ]
if $row->[-4] =~ /^SEP[A-F0-9]*\z/
&& $row->[-3] =~ /^\d{1,3}(?:\.\d{1,3}){3}\z/
&& $row->[-2] =~ /^\+\d+\z/
&& $row->[-1] =~ /^\w+\z/;
}

Related

To replace a number with strings in an XML file using a Perl script

I have an XML file. I need to replace the digits in comment="18" with comment="my string" where my string is from my #array ($array[18] = my string).
<rule ccType="inst" comment="18" domain="icc" entityName="thens" entityType="toggle" excTime="1605163966" name="exclude" reviewer="hpanjali" user="1" vscope="default"></rule>
This is what I have tried.
while (my $line = <FH>) {
chomp $line;
$line =~ s/comment="(\d+)"/comment="$values[$1]"/ig;
#print "$line \n";
print FH1 $line, "\n";
}
Here is an example using XML::LibXML:
use strict;
use warnings;
use XML::LibXML;
my $fn = 'test.xml';
my #array = map { "string$_" } 0..20;
my $doc = XML::LibXML->load_xml(location => $fn);
for my $node ($doc->findnodes('//rule')) {
my $idx = $node->getAttribute('comment');
$node->setAttribute('comment', $array[$idx]);
}
print $doc->toString();
Here's an XML::Twig example. It's basically the same idea as the XML::LibXML example done in a different way with a different tool:
use XML::Twig;
my $xml =
qq(<rule ccType="inst" comment="18"></rule>);
my #array;
$array[18] = 'my string';
my $twig = XML::Twig->new(
twig_handlers => {
rule => \&update_comment,
},
);
$twig->parse( $xml );
$twig->print;
sub update_comment {
my( $t, $e ) = #_;
my $n = $e->{att}{comment};
$e->set_att( comment => $array[$n] );
}

Get all occurances of matches pattern perl

if we are in the following case:
my $str = <<EO_STR;
Name=Value1 Adress=Value4
Name=Value2 Adress=Value5
Name=Value3 Adress=Value6
EO_STR
I have a table "T1" in the database with columns: ("Name", "Address") and I want to put on the column "Name" values "value1,Value2,Value3" and on the column "Adress" values "Value4,Value5,Value6"
in this case we have :
my #matches = $str =~ /Name=(.*?)\nAdress=(.*?)\n/g;
how can we use $1 and $2 with #matches in order to get separately all occurence of Name and Adresse in order to insert them on the Table T1?
All captures of all matches are returned, so you'd have to group them up.
use List::Util 1.29 qw( pairs );
for ( pairs( $str =~ /Name=(.*) Address=(.*)/g ) ) {
my #matches = #$_;
...
}
That said, it's far more common to grab the matches iteratively.
while ($str =~ /Name=(.*) Address=(.*)/g) {
my #matches = ( $1, $2 );
...
}
Regex is not always the right tool for the job. Your data looks a lot like it's just key/value pairs. Use split to break it up. No need for a pattern match here.
Your code and data doesn't match, so I've gone with what the code said.
use strict;
use warnings;
my $str = <<EO_STR;
Name=Value1
Adress=Value4
Name=Value2
Adress=Value5
Name=Value3
Adress=Value6
EO_STR
my $fields;
foreach my $pair (split /\n/, $str) {
my ($key, $value) = split /=/, $pair;
$key =~ s/^\s+//;
push #{ $fields->{$key} }, $value;
}
use Data::Dumper;
print Dumper $fields;
The code will create this data structure:
$VAR1 = {
'Name' => [
'Value1',
'Value2',
'Value3'
],
'Adress' => [
'Value4',
'Value5',
'Value6'
]
};
You can now access these two array references and use them to insert data into your table.
I have done the following:
#!/usr/bin/env perl
use v5.28;
my $str = <<EO_STR;
Name=Value1 Adress=Value4
Name=Value2 Adress=Value5
Name=Value3 Adress=Value6
EO_STR
my #array;
for my $a (split(/\n/, $str)) {
my %res = $a =~ m/(\w+)=(\w+)/g;
push #array, \%res;
}
for my $a (#array) {
for my $b (sort keys %{$a}) {
"\n", <INPUT_FILE> ); say $b.'->'.$a->{$b};
}
}
It creates this structure:
#array = [
{
Name->Value1,
Adress->Value4
},
...
];

perl: capturing the replaced-with string

I have code in a loop similar to
for( my $i=0; $a =~ s/<tag>(.*?)<\/tag>/sprintf("&CITE%03d;",$i)/e ; $i++ ){
%cite{ $i } = $1;
}
but instead of just the integer index, I want to make the keys of the hash the actual replaced-with text (placeholder "&CITE001;", etc.) without having to redo the sprintf().
I was almost sure there was a way to do it (variable similar to $& and such, but maybe I was thinking of vim's substitutions and not perl. :)
Thanks!
my $i = 0;
s{<tag>(.*?)</tag>}{
my $entity = sprintf("&CITE%03d;", $i++);
$cite{$entity} = $1;
$entity
}eg;
I did a something of a hacque, but really wanted something a bit more elegant. What I ended up doing (for now) is
my $t;
for( my $i=0; $t = sprintf("&CITE%04d;",$i), $all =~ s/($oct.*?$cct)/$t/s; $i++ ){
$cites{$t} = $1;
}
but I really wanted something even more "self-contained".
Just being able to grab the replacement string would've made things much simpler, though. This is a simple read-modify-write op.
True, adding the 'g' modifier should help shave some microseconds off it. :D
I think any method other than re-starting the search from the start of the target
is always the better choice.
In that vein and, as an alternative, you can move the logic inside the regex
via a Code Construct (?{ code }) and leverage the fact that $^N contains
the last capture content.
Perl
use strict;
use warnings;
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;
my $target = "<tag>zero</tag>\n<tag>one</tag>\n<tag>two</tag>\n<tag>three</tag>";
my %cite;
my ($cnt,$key) = (0,'');
$target =~ s/
<tag> (.*?) <\/tag>
(?{
$key = sprintf("&CITE%03d;", $cnt++);
$cite{$key} = $^N;
})
/$key/xg;
print $target, "\n";
print Dumper(\%cite);
Output
&CITE000;
&CITE001;
&CITE002;
&CITE003;
$VAR1 = {
'&CITE000;' => 'zero',
'&CITE001;' => 'one',
'&CITE002;' => 'two',
'&CITE003;' => 'three'
};
Edited/code by #Ikegami
use strict;
use warnings;
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;
sub f {
my $target = "<tag>zero</tag>\n<tag>one</tag>\n<tag>two</tag>\n<tag>three</tag>";
my %cite;
my ($cnt,$key) = (0,'');
$target =~ s/
<tag> (.*?) <\/tag>
(?{
$key = sprintf("&CITE%03d;", $cnt++);
$cite{$key} = $^N;
})
/$key/xg;
print $target, "\n";
print Dumper(\%cite);
}
f() for 1..2;
Output
Variable "$key" will not stay shared at (re_eval 1) line 2.
Variable "$cnt" will not stay shared at (re_eval 1) line 2.
Variable "%cite" will not stay shared at (re_eval 1) line 3.
&CITE000;
&CITE001;
&CITE002;
&CITE003;
$VAR1 = {
'&CITE000;' => 'zero',
'&CITE001;' => 'one',
'&CITE002;' => 'two',
'&CITE003;' => 'three'
};
$VAR1 = {};
This issue has been addressed in 5.18.
Perl by #sln
See, now I don't get that issue in version 5.20.
And, I don't believe I got it in 5.12 either.
use strict;
use warnings;
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;
sub wrapper {
my ($targ, $href) = #_;
my ($cnt, $key) = (0,'');
$$targ =~ s/<tag>(.*?)<\/tag>(?{ $key = sprintf("&CITE%03d;", $cnt++); $href->{$key} = $^N; })/$key/g;
}
my ($target,%cite) = ("<tag>zero</tag>\n<tag>one</tag>\n<tag>two</tag>\n<tag>three</tag>", ());
wrapper( \$target, \%cite );
print $target, "\n";
print Dumper(\%cite);
($target,%cite) = ("<tag>zero</tag>\n<tag>one</tag>\n<tag>two</tag>\n<tag>three</tag>", ());
wrapper( \$target, \%cite );
print $target, "\n";
print Dumper(\%cite);
Output
&CITE000;
&CITE001;
&CITE002;
&CITE003;
$VAR1 = {
'&CITE000;' => 'zero',
'&CITE001;' => 'one',
'&CITE002;' => 'two',
'&CITE003;' => 'three'
};
&CITE000;
&CITE001;
&CITE002;
&CITE003;
$VAR1 = {
'&CITE000;' => 'zero',
'&CITE001;' => 'one',
'&CITE002;' => 'two',
'&CITE003;' => 'three'
};

Perl: Using split but ignore quotes

I'm trying to create a Perl hash from an input string, but I'm having problems with the original 'split', as values may contain quotes. Below is an example input string, and my (desired) resulting hash:
my $command = 'CREATE:USER:TEL,12345678:MOB,444001122:Type,Whatever:ATTRIBUTES,"ID,0,MOB,123,KEY,VALUE":TIME,"08:01:59":FIN,0';
my %hash =
(
CREATE => '',
USER => '',
TEL => '12345678',
MOB => '444001122',
Type => 'Whatever',
ATTRIBUTES => 'ID,0,MOB,123,KEY,VALUE',
TIME => '08:01:59',
FIN => '0',
);
The input string is of arbitrary length, and the number of keys is not set.
Thanks!
-hq
Use Text::CSV. It handles comma separated value files correctly.
Update
It seems the format of your input is not parsable by the standard module, even with sep_char and allow_loose_quotes. So, you have to do the heavy lifting yourself, but you can still use Text::CSV to parse each key-value pair:
#!/usr/bin/perl
use warnings;
use strict;
use feature qw(say);
use Data::Dumper;
use Text::CSV;
my $command = 'CREATE:USER:TEL,12345678:MOB,444001122:Type,Whatever:ATTRIBUTES,"ID,0,KEY,VALUE":TIME,"08:01:59":FIN,0';
my #fields = split /:/, $command;
my %hash;
my $csv = Text::CSV->new();
my $i = 0;
while ($i <= $#fields) {
if (1 == $fields[$i] =~ y/"//) {
my $j = $i;
$fields[$i] .= ':' . $fields[$j] until 1 == $fields[++$j] =~ y/"//;
$fields[$i] .= ':' . $fields[$j];
splice #fields, $i + 1, $j - $i, ();
}
$csv->parse($fields[$i]);
my ($key, $value) = $csv->fields;
$hash{$key} = "$value"; # quotes turn undef to q()
$i++;
}
print Dumper \%hash;
As far as I can see the most obvious candidate - Text::CSV - won't handle this format properly, so a home-grown regular expression solution is the only one.
use strict;
use warnings;
my $command = 'CREATE:USER:TEL,12345678:MOB,444001122:Type,Whatever:ATTRIBUTES,"ID,0,KEY,VALUE":TIME,"08:01:59":FIN,0';
my %config;
for my $field ($command =~ /(?:"[^"]*"|[^:])+/g) {
my ($key, $val) = split /,/, $field, 2;
($config{$key} = $val // '') =~ s/"([^"]*)"/$1/;
}
use Data::Dumper;
print Data::Dumper->Dump([\%config], ['*config']);
output
%config = (
'TIME' => '08:01:59',
'MOB' => '444001122',
'Type' => 'Whatever',
'CREATE' => '',
'TEL' => '12345678',
'ATTRIBUTES' => 'ID,0,KEY,VALUE',
'USER' => '',
'FIN' => '0'
);
If you have Perl v5.10 or later then you have the convenient (?| ... ) regular expression group, which allows you to write this
use 5.010;
use warnings;
my $command = 'CREATE:USER:TEL,12345678:MOB,444001122:Type,Whatever:ATTRIBUTES,"ID,0,KEY,VALUE":TIME,"08:01:59":FIN,0';
my %config = $command =~ /(\w+) (?| , " ([^"]*) " | , ([^:"]*) | () )/gx;
use Data::Dumper;
print Data::Dumper->Dump([\%config], ['*config']);
which produces identical results to the code above.
This looks like something Text::ParseWords could handle. The quotewords subroutine will split the input on the delimiter :, ignoring delimiters inside quotes. This will give us the basic list of items, seen first in the output as $VAR1. After that, it is a simple matter of parsing the comma separated items with a regex which will handle optional second capture to accommodate empty tags such as those for CREATE and USER.
use strict;
use warnings;
use Data::Dumper;
use Text::ParseWords;
while (<DATA>) {
chomp;
my #list = quotewords(':', 0, $_);
my %hash = map { my ($k, $v) = /([^,]+),?(.*)/; $k => $v; } #list;
print Dumper \#list, \%hash;
}
__DATA__
CREATE:USER:TEL,12345678:MOB,444001122:Type,Whatever:ATTRIBUTES,"ID,0,KEY,VALUE":TIME,"08:01:59":FIN,0
Output:
$VAR1 = [
'CREATE',
'USER',
'TEL,12345678',
'MOB,444001122',
'Type,Whatever',
'ATTRIBUTES,ID,0,KEY,VALUE',
'TIME,08:01:59',
'FIN,0'
];
$VAR2 = {
'TIME' => '08:01:59',
'MOB' => '444001122',
'Type' => 'Whatever',
'CREATE' => '',
'TEL' => '12345678',
'ATTRIBUTES' => 'ID,0,KEY,VALUE',
'USER' => '',
'FIN' => '0'
};
my %hash = $command =~ /([^:,]+)(?:,((?:[^:"]|"[^"]*")*))?/g;
s/"([^"]*)"/$1/g
for grep defined, values %hash;

How to automagically create pattern based on real data?

I have many vendors in database, they all differ in some aspect of their data. I'd like to make data validation rule which is based on previous data.
Example:
A: XZ-4, XZ-23, XZ-217
B: 1276, 1899, 22711
C: 12-4, 12-75, 12
Goal: if user inputs string 'XZ-217' for vendor B, algorithm should compare previous data and say: this string is not similar to vendor B previous data.
Is there some good way/tools to achieve such comparison? Answer could be some generic algoritm or Perl module.
Edit:
The "similarity" is hard to define, i agree. But i'd like to catch to algorithm, which could analyze previous ca 100 samples and then compare the outcome of analyze with new data. Similarity may based on length, on use of characters/numbers, string creation patterns, similar beginning/end/middle, having some separators in.
I feel it is not easy task, but on other hand, i think it has very wide use. So i hoped, there is already some hints.
You may want to peruse:
http://en.wikipedia.org/wiki/String_metric and http://search.cpan.org/dist/Text-Levenshtein/Levenshtein.pm (for instance)
Joel and I came up with similar ideas. The code below differentiates 3 types of zones.
one or more non-word characters
alphanumeric cluster
a cluster of digits
It creates a profile of the string and a regex to match input. In addition, it also contains logic to expand existing profiles. At the end, in the task sub, it contains some pseudo logic which indicates how this might be integrated into a larger application.
use strict;
use warnings;
use List::Util qw<max min>;
sub compile_search_expr {
shift;
#_ = #{ shift() } if #_ == 1;
my $str
= join( '|'
, map { join( ''
, grep { defined; }
map {
$_ eq 'P' ? quotemeta;
: $_ eq 'W' ? "\\w{$_->[1],$_->[2]}"
: $_ eq 'D' ? "\\d{$_->[1],$_->[2]}"
: undef
;
} #$_
)
} #_ == 1 ? #{ shift } : #_
);
return qr/^(?:$str)$/;
}
sub merge_profiles {
shift;
my ( $profile_list, $new_profile ) = #_;
my $found = 0;
PROFILE:
for my $profile ( #$profile_list ) {
my $profile_length = #$profile;
# it's not the same profile.
next PROFILE unless $profile_length == #$new_profile;
my #merged;
for ( my $i = 0; $i < $profile_length; $i++ ) {
my $old = $profile->[$i];
my $new = $new_profile->[$i];
next PROFILE unless $old->[0] eq $new->[0];
push( #merged
, [ $old->[0]
, min( $old->[1], $new->[1] )
, max( $old->[2], $new->[2] )
]);
}
#$profile = #merged;
$found = 1;
last PROFILE;
}
push #$profile_list, $new_profile unless $found;
return;
}
sub compute_info_profile {
shift;
my #profile_chunks
= map {
/\W/ ? [ P => $_ ]
: /\D/ ? [ W => length, length ]
: [ D => length, length ]
}
grep { length; } split /(\W+)/, shift
;
}
# Psuedo-Perl
sub process_input_task {
my ( $application, $input ) = #_;
my $patterns = $application->get_patterns_for_current_customer;
my $regex = $application->compile_search_expr( $patterns );
if ( $input =~ /$regex/ ) {}
elsif ( $application->approve_divergeance( $input )) {
$application->merge_profiles( $patterns, compute_info_profile( $input ));
}
else {
$application->escalate(
Incident->new( issue => INVALID_FORMAT
, input => $input
, customer => $customer
));
}
return $application->process_approved_input( $input );
}
Here is my implementation and a loop over your test cases. Basically you give a list of good values to the function and it tries to build a regex for it.
output:
A: (?^:\w{2,2}(?:\-){1}\d{1,3})
B: (?^:\d{4,5})
C: (?^:\d{2,2}(?:\-)?\d{0,2})
code:
#!/usr/bin/env perl
use strict;
use warnings;
use List::MoreUtils qw'uniq each_arrayref';
my %examples = (
A => [qw/ XZ-4 XZ-23 XZ-217 /],
B => [qw/ 1276 1899 22711 /],
C => [qw/ 12-4 12-75 12 /],
);
foreach my $example (sort keys %examples) {
print "$example: ", gen_regex(#{ $examples{$example} }) || "Generate failed!", "\n";
}
sub gen_regex {
my #cases = #_;
my %exploded;
# ex. $case may be XZ-217
foreach my $case (#cases) {
my #parts =
grep { defined and length }
split( /(\d+|\w+)/, $case );
# #parts are ( XZ, -, 217 )
foreach (#parts) {
if (/\d/) {
# 217 becomes ['\d' => 3]
push #{ $exploded{$case} }, ['\d' => length];
} elsif (/\w/) {
#XZ becomes ['\w' => 2]
push #{ $exploded{$case} }, ['\w' => length];
} else {
# - becomes ['lit' => '-']
push #{ $exploded{$case} }, ['lit' => $_ ];
}
}
}
my $pattern = '';
# iterate over nth element (part) of each case
my $ea = each_arrayref(values %exploded);
while (my #parts = $ea->()) {
# remove undefined (i.e. optional) parts
my #def_parts = grep { defined } #parts;
# check that all (defined) parts are the same type
my #part_types = uniq map {$_->[0]} #def_parts;
if (#part_types > 1) {
warn "Parts not aligned\n";
return;
}
my $type = $part_types[0]; #same so make scalar
# were there optional parts?
my $required = (#parts == #def_parts);
# keep the values of each part
# these are either a repitition or lit strings
my #values = sort uniq map { $_->[1] } #def_parts;
# these are for non-literal quantifiers
my $min = $required ? $values[0] : 0;
my $max = $values[-1];
# write the specific pattern for each type
if ($type eq '\d') {
$pattern .= '\d' . "{$min,$max}";
} elsif ($type eq '\w') {
$pattern .= '\w' . "{$min,$max}";
} elsif ($type eq 'lit') {
# quote special characters, - becomes \-
my #uniq = map { quotemeta } uniq #values;
# join with alternations, surround by non-capture grouup, add quantifier
$pattern .= '(?:' . join('|', #uniq) . ')' . ($required ? '{1}' : '?');
}
}
# build the qr regex from pattern
my $regex = qr/$pattern/;
# test that all original patterns match (#fail should be empty)
my #fail = grep { $_ !~ $regex } #cases;
if (#fail) {
warn "Some cases fail for generated pattern $regex: (#fail)\n";
return '';
} else {
return $regex;
}
}
To simplify the work of finding the pattern, optional parts may come at the end, but no required parts may come after optional ones. This could probably be overcome but it might be hard.
If there was a Tie::StringApproxHash module, it would fit the bill here.
I think you're looking for something that combines the fuzzy-logic functionality of String::Approx and the hash interface of Tie::RegexpHash.
The former is more important; the latter would make light work of coding.