Perl: Using split but ignore quotes - regex

I'm trying to create a Perl hash from an input string, but I'm having problems with the original 'split', as values may contain quotes. Below is an example input string, and my (desired) resulting hash:
my $command = 'CREATE:USER:TEL,12345678:MOB,444001122:Type,Whatever:ATTRIBUTES,"ID,0,MOB,123,KEY,VALUE":TIME,"08:01:59":FIN,0';
my %hash =
(
CREATE => '',
USER => '',
TEL => '12345678',
MOB => '444001122',
Type => 'Whatever',
ATTRIBUTES => 'ID,0,MOB,123,KEY,VALUE',
TIME => '08:01:59',
FIN => '0',
);
The input string is of arbitrary length, and the number of keys is not set.
Thanks!
-hq

Use Text::CSV. It handles comma separated value files correctly.
Update
It seems the format of your input is not parsable by the standard module, even with sep_char and allow_loose_quotes. So, you have to do the heavy lifting yourself, but you can still use Text::CSV to parse each key-value pair:
#!/usr/bin/perl
use warnings;
use strict;
use feature qw(say);
use Data::Dumper;
use Text::CSV;
my $command = 'CREATE:USER:TEL,12345678:MOB,444001122:Type,Whatever:ATTRIBUTES,"ID,0,KEY,VALUE":TIME,"08:01:59":FIN,0';
my #fields = split /:/, $command;
my %hash;
my $csv = Text::CSV->new();
my $i = 0;
while ($i <= $#fields) {
if (1 == $fields[$i] =~ y/"//) {
my $j = $i;
$fields[$i] .= ':' . $fields[$j] until 1 == $fields[++$j] =~ y/"//;
$fields[$i] .= ':' . $fields[$j];
splice #fields, $i + 1, $j - $i, ();
}
$csv->parse($fields[$i]);
my ($key, $value) = $csv->fields;
$hash{$key} = "$value"; # quotes turn undef to q()
$i++;
}
print Dumper \%hash;

As far as I can see the most obvious candidate - Text::CSV - won't handle this format properly, so a home-grown regular expression solution is the only one.
use strict;
use warnings;
my $command = 'CREATE:USER:TEL,12345678:MOB,444001122:Type,Whatever:ATTRIBUTES,"ID,0,KEY,VALUE":TIME,"08:01:59":FIN,0';
my %config;
for my $field ($command =~ /(?:"[^"]*"|[^:])+/g) {
my ($key, $val) = split /,/, $field, 2;
($config{$key} = $val // '') =~ s/"([^"]*)"/$1/;
}
use Data::Dumper;
print Data::Dumper->Dump([\%config], ['*config']);
output
%config = (
'TIME' => '08:01:59',
'MOB' => '444001122',
'Type' => 'Whatever',
'CREATE' => '',
'TEL' => '12345678',
'ATTRIBUTES' => 'ID,0,KEY,VALUE',
'USER' => '',
'FIN' => '0'
);
If you have Perl v5.10 or later then you have the convenient (?| ... ) regular expression group, which allows you to write this
use 5.010;
use warnings;
my $command = 'CREATE:USER:TEL,12345678:MOB,444001122:Type,Whatever:ATTRIBUTES,"ID,0,KEY,VALUE":TIME,"08:01:59":FIN,0';
my %config = $command =~ /(\w+) (?| , " ([^"]*) " | , ([^:"]*) | () )/gx;
use Data::Dumper;
print Data::Dumper->Dump([\%config], ['*config']);
which produces identical results to the code above.

This looks like something Text::ParseWords could handle. The quotewords subroutine will split the input on the delimiter :, ignoring delimiters inside quotes. This will give us the basic list of items, seen first in the output as $VAR1. After that, it is a simple matter of parsing the comma separated items with a regex which will handle optional second capture to accommodate empty tags such as those for CREATE and USER.
use strict;
use warnings;
use Data::Dumper;
use Text::ParseWords;
while (<DATA>) {
chomp;
my #list = quotewords(':', 0, $_);
my %hash = map { my ($k, $v) = /([^,]+),?(.*)/; $k => $v; } #list;
print Dumper \#list, \%hash;
}
__DATA__
CREATE:USER:TEL,12345678:MOB,444001122:Type,Whatever:ATTRIBUTES,"ID,0,KEY,VALUE":TIME,"08:01:59":FIN,0
Output:
$VAR1 = [
'CREATE',
'USER',
'TEL,12345678',
'MOB,444001122',
'Type,Whatever',
'ATTRIBUTES,ID,0,KEY,VALUE',
'TIME,08:01:59',
'FIN,0'
];
$VAR2 = {
'TIME' => '08:01:59',
'MOB' => '444001122',
'Type' => 'Whatever',
'CREATE' => '',
'TEL' => '12345678',
'ATTRIBUTES' => 'ID,0,KEY,VALUE',
'USER' => '',
'FIN' => '0'
};

my %hash = $command =~ /([^:,]+)(?:,((?:[^:"]|"[^"]*")*))?/g;
s/"([^"]*)"/$1/g
for grep defined, values %hash;

Related

To replace a number with strings in an XML file using a Perl script

I have an XML file. I need to replace the digits in comment="18" with comment="my string" where my string is from my #array ($array[18] = my string).
<rule ccType="inst" comment="18" domain="icc" entityName="thens" entityType="toggle" excTime="1605163966" name="exclude" reviewer="hpanjali" user="1" vscope="default"></rule>
This is what I have tried.
while (my $line = <FH>) {
chomp $line;
$line =~ s/comment="(\d+)"/comment="$values[$1]"/ig;
#print "$line \n";
print FH1 $line, "\n";
}
Here is an example using XML::LibXML:
use strict;
use warnings;
use XML::LibXML;
my $fn = 'test.xml';
my #array = map { "string$_" } 0..20;
my $doc = XML::LibXML->load_xml(location => $fn);
for my $node ($doc->findnodes('//rule')) {
my $idx = $node->getAttribute('comment');
$node->setAttribute('comment', $array[$idx]);
}
print $doc->toString();
Here's an XML::Twig example. It's basically the same idea as the XML::LibXML example done in a different way with a different tool:
use XML::Twig;
my $xml =
qq(<rule ccType="inst" comment="18"></rule>);
my #array;
$array[18] = 'my string';
my $twig = XML::Twig->new(
twig_handlers => {
rule => \&update_comment,
},
);
$twig->parse( $xml );
$twig->print;
sub update_comment {
my( $t, $e ) = #_;
my $n = $e->{att}{comment};
$e->set_att( comment => $array[$n] );
}

Get all occurances of matches pattern perl

if we are in the following case:
my $str = <<EO_STR;
Name=Value1 Adress=Value4
Name=Value2 Adress=Value5
Name=Value3 Adress=Value6
EO_STR
I have a table "T1" in the database with columns: ("Name", "Address") and I want to put on the column "Name" values "value1,Value2,Value3" and on the column "Adress" values "Value4,Value5,Value6"
in this case we have :
my #matches = $str =~ /Name=(.*?)\nAdress=(.*?)\n/g;
how can we use $1 and $2 with #matches in order to get separately all occurence of Name and Adresse in order to insert them on the Table T1?
All captures of all matches are returned, so you'd have to group them up.
use List::Util 1.29 qw( pairs );
for ( pairs( $str =~ /Name=(.*) Address=(.*)/g ) ) {
my #matches = #$_;
...
}
That said, it's far more common to grab the matches iteratively.
while ($str =~ /Name=(.*) Address=(.*)/g) {
my #matches = ( $1, $2 );
...
}
Regex is not always the right tool for the job. Your data looks a lot like it's just key/value pairs. Use split to break it up. No need for a pattern match here.
Your code and data doesn't match, so I've gone with what the code said.
use strict;
use warnings;
my $str = <<EO_STR;
Name=Value1
Adress=Value4
Name=Value2
Adress=Value5
Name=Value3
Adress=Value6
EO_STR
my $fields;
foreach my $pair (split /\n/, $str) {
my ($key, $value) = split /=/, $pair;
$key =~ s/^\s+//;
push #{ $fields->{$key} }, $value;
}
use Data::Dumper;
print Dumper $fields;
The code will create this data structure:
$VAR1 = {
'Name' => [
'Value1',
'Value2',
'Value3'
],
'Adress' => [
'Value4',
'Value5',
'Value6'
]
};
You can now access these two array references and use them to insert data into your table.
I have done the following:
#!/usr/bin/env perl
use v5.28;
my $str = <<EO_STR;
Name=Value1 Adress=Value4
Name=Value2 Adress=Value5
Name=Value3 Adress=Value6
EO_STR
my #array;
for my $a (split(/\n/, $str)) {
my %res = $a =~ m/(\w+)=(\w+)/g;
push #array, \%res;
}
for my $a (#array) {
for my $b (sort keys %{$a}) {
"\n", <INPUT_FILE> ); say $b.'->'.$a->{$b};
}
}
It creates this structure:
#array = [
{
Name->Value1,
Adress->Value4
},
...
];

perl: capturing the replaced-with string

I have code in a loop similar to
for( my $i=0; $a =~ s/<tag>(.*?)<\/tag>/sprintf("&CITE%03d;",$i)/e ; $i++ ){
%cite{ $i } = $1;
}
but instead of just the integer index, I want to make the keys of the hash the actual replaced-with text (placeholder "&CITE001;", etc.) without having to redo the sprintf().
I was almost sure there was a way to do it (variable similar to $& and such, but maybe I was thinking of vim's substitutions and not perl. :)
Thanks!
my $i = 0;
s{<tag>(.*?)</tag>}{
my $entity = sprintf("&CITE%03d;", $i++);
$cite{$entity} = $1;
$entity
}eg;
I did a something of a hacque, but really wanted something a bit more elegant. What I ended up doing (for now) is
my $t;
for( my $i=0; $t = sprintf("&CITE%04d;",$i), $all =~ s/($oct.*?$cct)/$t/s; $i++ ){
$cites{$t} = $1;
}
but I really wanted something even more "self-contained".
Just being able to grab the replacement string would've made things much simpler, though. This is a simple read-modify-write op.
True, adding the 'g' modifier should help shave some microseconds off it. :D
I think any method other than re-starting the search from the start of the target
is always the better choice.
In that vein and, as an alternative, you can move the logic inside the regex
via a Code Construct (?{ code }) and leverage the fact that $^N contains
the last capture content.
Perl
use strict;
use warnings;
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;
my $target = "<tag>zero</tag>\n<tag>one</tag>\n<tag>two</tag>\n<tag>three</tag>";
my %cite;
my ($cnt,$key) = (0,'');
$target =~ s/
<tag> (.*?) <\/tag>
(?{
$key = sprintf("&CITE%03d;", $cnt++);
$cite{$key} = $^N;
})
/$key/xg;
print $target, "\n";
print Dumper(\%cite);
Output
&CITE000;
&CITE001;
&CITE002;
&CITE003;
$VAR1 = {
'&CITE000;' => 'zero',
'&CITE001;' => 'one',
'&CITE002;' => 'two',
'&CITE003;' => 'three'
};
Edited/code by #Ikegami
use strict;
use warnings;
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;
sub f {
my $target = "<tag>zero</tag>\n<tag>one</tag>\n<tag>two</tag>\n<tag>three</tag>";
my %cite;
my ($cnt,$key) = (0,'');
$target =~ s/
<tag> (.*?) <\/tag>
(?{
$key = sprintf("&CITE%03d;", $cnt++);
$cite{$key} = $^N;
})
/$key/xg;
print $target, "\n";
print Dumper(\%cite);
}
f() for 1..2;
Output
Variable "$key" will not stay shared at (re_eval 1) line 2.
Variable "$cnt" will not stay shared at (re_eval 1) line 2.
Variable "%cite" will not stay shared at (re_eval 1) line 3.
&CITE000;
&CITE001;
&CITE002;
&CITE003;
$VAR1 = {
'&CITE000;' => 'zero',
'&CITE001;' => 'one',
'&CITE002;' => 'two',
'&CITE003;' => 'three'
};
$VAR1 = {};
This issue has been addressed in 5.18.
Perl by #sln
See, now I don't get that issue in version 5.20.
And, I don't believe I got it in 5.12 either.
use strict;
use warnings;
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;
sub wrapper {
my ($targ, $href) = #_;
my ($cnt, $key) = (0,'');
$$targ =~ s/<tag>(.*?)<\/tag>(?{ $key = sprintf("&CITE%03d;", $cnt++); $href->{$key} = $^N; })/$key/g;
}
my ($target,%cite) = ("<tag>zero</tag>\n<tag>one</tag>\n<tag>two</tag>\n<tag>three</tag>", ());
wrapper( \$target, \%cite );
print $target, "\n";
print Dumper(\%cite);
($target,%cite) = ("<tag>zero</tag>\n<tag>one</tag>\n<tag>two</tag>\n<tag>three</tag>", ());
wrapper( \$target, \%cite );
print $target, "\n";
print Dumper(\%cite);
Output
&CITE000;
&CITE001;
&CITE002;
&CITE003;
$VAR1 = {
'&CITE000;' => 'zero',
'&CITE001;' => 'one',
'&CITE002;' => 'two',
'&CITE003;' => 'three'
};
&CITE000;
&CITE001;
&CITE002;
&CITE003;
$VAR1 = {
'&CITE000;' => 'zero',
'&CITE001;' => 'one',
'&CITE002;' => 'two',
'&CITE003;' => 'three'
};

Perl, global match and append something after the matched string

I have a problem in the case of performing a global match. How can I substitute the matched string for a new string, which is made up from the original string plus a new string. The string is like:
$string = "t123:apple;t456:pear;t789:banana";
Then I have a hash like this:
my %hash = (
t123 => 'fruit1',
t456 => 'fruit2',
t789 => 'fruit3',
);
How can I then obtain a new string such as:
$newstring = "t123 fruit1:apple;t456 fruit2:pear;t789 fruit3:banana";
Now, my perl code is:
while($string =~ /t\d{3}/g){
if (exists $hash{"$&"}) {
my $match = $&;
$string =~ s/$&/$match.$hash{"$&"}/;
}
}
It doesn't work though, because the match always starts from the first character. I think I should use pos(string) or something to make it have an offset, but I don't know how to do this.
The easy way is rather easy:
#!/usr/bin/perl
use strict;
use warnings;
use feature 'say';
my $string = "t123:apple;t456:pear;t789:banana";
my %hash = (
t123 => 'fruit1',
t456 => 'fruit2',
t789 => 'fruit3',
);
$string =~ s/(t\d+)/$1 $hash{$1}/g;
say $string;
But this doesn't ensure that everything that matches t\d{3} is a valid key in your hash. So let's explicitly search for those keys.
#!/usr/bin/perl
use strict;
use warnings;
use feature 'say';
my $string = "t123:apple;t456:pear;t789:banana";
my %hash = (
t123 => 'fruit1',
t456 => 'fruit2',
t789 => 'fruit3',
);
my $match = join '|', map quotemeta, keys %hash;
$string =~ s/($match)/$1 $hash{$1}/g;
say $string;

Parsing a string to a hash

I have a string:
<https://gitlab.me.com/api/v3/projects/all?page=2&per_page=5>;
rel="next",
<https://gitlab.me.com/api/v3/projects/all?page=1&per_page=5>;
rel="first",
<https://gitlab.me.com/api/v3/projects/all?page=8&per_page=5>;
rel="last"
So the format is
(<val>; rel="key")*
And I want to parse that to a hash with the following format:
next => https://gitlab.me.com/api/v3/projects/all?page=2&per_page=5
first => https://gitlab.me.com/api/v3/projects/all?page=1&per_page=5
last => https://gitlab.me.com/api/v3/projects/all?page=8&per_page=5
In Java I would use a regex pattern to extract each key => value pair and put them into a map. The pattern would be something like:
<([^>]++)>;\s*rel="([^"]++)"
Which would give me the key in the second match group and the value in the first. Would the same approach be the best way to achieve this is Perl, or is there something snazzier I could do?
P.S. the reason I'm using Perl rather than Java is that the server doesn't have Java.
My first inclination was to split the string on commas and work with the three substrings, but it is probably better to use a global match ina while loop.
This should do what you want. (Perl is by far the better tool for text processing like this!)
Update I've just realised that your choice of markdown discarded the angle brackets and newlines. Is this more appropriate? I assume it's a multi-line string?
use strict;
use warnings;
my $str = <<'END';
<https://gitlab.me.com/api/v3/projects/all?page=2&per_page=5>;
rel="next",
<https://gitlab.me.com/api/v3/projects/all?page=1&per_page=5>;
rel="first",
<https://gitlab.me.com/api/v3/projects/all?page=8&per_page=5>;
rel="last"
END
my %data;
while ($str =~ / < ([^<>]+) >; \s* rel="([^"]+)" (?:,\s*)? /xg) {
$data{$2} = $1;
}
use Data::Dump;
dd \%data;
output
{
first => "https://gitlab.me.com/api/v3/projects/all?page=1&per_page=5",
last => "https://gitlab.me.com/api/v3/projects/all?page=8&per_page=5",
next => "https://gitlab.me.com/api/v3/projects/all?page=2&per_page=5",
}
You can split the string on a "," and then use a map to create the hash:
#!/usr/bin/env perl
use strict;
use warnings;
my $str = 'https://gitlab.me.com/api/v3/projects/all?page=2&per_page=5; rel="next", https://gitlab.me.com/api/v3/projects/all?page=1&per_page=5; rel="first", https://gitlab.me.com/api/v3/projects/all?page=8&per_page=5; rel="last"';
my %hash = map {
my ($v, $k) = $_ =~ /\s*([^;]+);\s*rel="([^"]+)".*/;
$k => $v;
} split ',', $str;
foreach my $key (keys %hash) {
print "$key => $hash{$key}\n"
}
output:
first => https://gitlab.me.com/api/v3/projects/all?page=1&per_page=5
next => https://gitlab.me.com/api/v3/projects/all?page=2&per_page=5
last => https://gitlab.me.com/api/v3/projects/all?page=8&per_page=5
update
With the new string you could do:
$str = q(<https://gitlab.me.com/api/v3/projects/all?page=2&per_page=5>; rel="next", <https://gitlab.me.com/api/v3/projects/all?page=1&per_page=5>; rel="first", <https://gitlab.me.com/api/v3/projects/all?page=8&per_page=5>; rel="last");
my %hash = map {
my ($v, $k) = $_ =~ /<([^>]+)>;\s*rel="([^"]+)".*/;
$k => $v;
} split ',', $str;
to get the same result.
use strict;
use warnings;
my $string='https://gitlab.me.com/api/v3/projects/all?page=2&per_page=5; rel="next", https://gitlab.me.com/api/v3/projects/all?page=1&per_page=5; rel="first", https://gitlab.me.com/api/v3/projects/all?page=8&per_page=5; rel="last"';
my #array=split /,/, $string;
my %hash;
foreach(#array)
{
if($_=~/(.*?);\s*rel\=\s*"([^"]+)"/)
{
$hash{$2}=$1;
}
}
print "$_ => $hash{$_}\n" foreach(keys%hash);