perl regex for multiple matches - regex

I have a file with lines similar to following:
abcd1::101:xyz1,user,user1,abcd1,pqrs1,userblah,abcd1
I want to retain strings up to last ":" and remove all occurrences of abcd1
In the end, I need to have below:
abcd1::101:xyz1,xyz2,xyz3,pqrs1,xyz4
I tried code as below, but for some reason, it is not working. So please help
the account name is "abcd1"
# Edits /tmp/group in place (the original is kept with a ".bak" suffix).
# Takes the account name as its only argument; each line read is run through
# the two substitutions below and printed back into the file.
sub UpdateEtcGroup {
    my $account = shift;
    my $file = "/tmp/group";
    # Point the <> operator at the file; with $^I set, the loop below becomes
    # an in-place edit instead of reading from standard input.
    @ARGV = ($file);
    $^I = ".bak";
    while (<>){
        # NOTE(review): this substitution is the part the question reports as
        # not working, so it is kept exactly as posted. Presumably the
        # "$account::" prefix is read as a package-qualified variable name
        # and "{1,$}" is not a valid quantifier - confirm before reuse.
        s#^($account::\d{1,$}:)$account,?#$1#g;
        s/,$//; # to remove the last "," if there
        print;
    }
}

split is the tool for the job, not a regex.
Because split lets you reliably separate out the field you do want to operate on, from the ones that you don't. Like this:
#!/usr/bin/env perl
use strict;
use warnings;

# Account to remove from the member list (the last colon-separated field).
my $username = 'abcd1';

while ( my $record = <DATA> ) {
    # Drop the newline first so it cannot stick to the last member name
    # and so the output does not end up with a doubled line ending.
    chomp $record;

    # Limit -1 keeps trailing empty fields, so a group with no members
    # ("name:x:gid:") still has its (empty) member list as the final field.
    my @fields      = split /:/, $record, -1;
    my $member_list = pop(@fields) // '';
    my @users       = split /,/, $member_list;

    # Exact string comparison: only the account itself is removed, and the
    # name is never interpreted as a regular expression.
    my @kept = grep { $_ ne $username } @users;

    print join( ":", @fields, join( ",", @kept ) ), "\n";
}
__DATA__
abcd1::101:xyz1,user,user1,abcd1,pqrs1,userblah,abcd1

Don't use a regular expression for this.
use strict;
use warnings;

# For every group line, remove the group's own account name (first field)
# from the comma-separated member list (last field) and print the result.
while ( my $record = <DATA> ) {
    chomp $record;

    # Limit -1 keeps trailing empty fields so $parts[-1] is always the
    # member list, even when the group has no members.
    my @parts = split /:/, $record, -1;

    # Lines without at least two fields (including blank lines) have no
    # separate member list; they are printed back unchanged.
    if ( @parts > 1 ) {
        my $account = $parts[0];
        # Compare whole names so accounts that merely share a prefix with
        # the one being removed are left alone.
        $parts[-1] = join( ',', grep { $_ ne $account } split( /,/, $parts[-1] ) );
    }

    print join( ':', @parts ) . "\n";
}
__DATA__
abcd1::101:xyz1,user,user1,abcd1,pqrs1,userblah,abcd1
abcd2::102:user1,xyz2,otheruser,abcd2,pqrs1,xyz4,abcd2
Output:
abcd1::101:xyz1,user,user1,pqrs1,userblah
abcd2::102:user1,xyz2,otheruser,pqrs1,xyz4

Related

Perl Regex Match and loop HTML Comments

I have a log file with data in format :
<!-- 12/15/16 01:02:27:950.125
DATA1 -->
<!-- 12/15/16 01:02:27:950.373
DATA2 -->
<!-- 12/15/16 01:02:27:950.921
DATA3: Text1 -->
<!-- 12/15/16 01:02:27:951.066
DATA4: Text2 -->
I need to extract and loop all the data inside the comments.
I am reading the file and saving data as one string.
I have tried a few solutions but I am getting "undef" on match
use strict;
use Data::Dumper;
use File::Basename;
use Time::HiRes qw( usleep ualarm gettimeofday tv_interval );
use Date::Format;
use DateTime;
use warnings;
.
.
.
# Read the whole log into one string, then try to collect and dump the
# contents of every <!-- ... --> comment found in it.
if ( open(ORIGFILE, $filepath) ) {
    my @wrp_record_content = <ORIGFILE>;
    # my $content = join('', @wrp_record_content);
    # my @matches = $content =~ s/<!--(.*)-->//g;
    # my $data;
    # while ( <ORIGFILE> ) {
    # $data .= $_;
    # }
    # while ( $data =~ m/<!--(.*)-->/g ) {
    # print Dumper('===DATA===');
    # print Dumper($data);
    # }
    my $content = join('', @wrp_record_content);
    #print Dumper('------CONTENT------');
    #print Dumper($content);
    #print Dumper('------ CONTENT ENDED ------');
    my @matches;
    # NOTE(review): kept as posted. The pattern below has no capturing
    # parentheses, so $1 is not set by it - this is the behaviour the
    # question asks about.
    while ($content =~ /<!--.*?-->/gs) {
        push @matches, $1;
    }
    foreach my $m (@matches) {
        print Dumper('===MATCH===', "\n");
        print Dumper($m);
    }
}
Can someone please guide on where it is going wrong?
There is nothing in $1. You must add capturing parentheses to your regex pattern
$content =~ /<!--(.*?)-->/gs
You have done it correctly in the loop that you commented out!

Perl: Trying to speed up parsing a delimited file

I have a large flat text file with lines that hold name/value pairs ("varname=value"). These pairs are separated by a multi-character delimiter. So a single line in this file might look like this:
var1=value1|^|var2=value2|^|var3=value3|^|var4=value4
Each line holds about 50 name/value pairs.
I need to iterate through the lines of this file (there are about 100,000 lines) and store the name/value pairs in a hash so that
$field{'var1'} = value1
$field{'var2'} = value2
etc...
What I did was this:
# $line holds a single line from the file
# Break the record into "name=value" chunks on the literal "|^|" delimiter,
# then split each chunk on "=" and store the pair in %hash.
my @fields = split( /\Q|^|\E/, $line );
foreach my $field (@fields) {
    my ($name, $value) = split( /=/, $field );
    $hash{$name} = $value;
}
Doing this for each line of the entire file takes (on my PC) about 2 seconds. This doesn't seem like a long time, but I really want to speed this up by quite a bit.
Of this 2 seconds, the first split takes about 0.6 seconds, while the foreach loop takes about 1.4 seconds. So I thought I'd get rid of the foreach loop and put it all in a single split:
%hash = split( /\Q|^|\E|=/, $line );
Much to my surprise, parsing the entire file this way took a full second longer! My question isn't really why this takes longer (although it would be a nice bonus to understand why), but my question is if there are any other (faster) ways to get the job done.
Thanks in advance.
------ Edit below this line ------
I just found out that changing this:
%hash = split( /\Q|^|\E|=/, $line );
into this:
$line =~ s/\Q|^|\E/=/g;
%hash = split( /=/, $line );
makes it three times faster! Parsing the entire file this way now takes just over a second...
------ Snippet below this line ------
use strict;
use Time::HiRes qw( time );
my $line = "a=1|^|b=2|^|c=3|^|d=4|^|e=5|^|f=6|^|g=7|^|h=8|^|i=9|^|j=10|^|k=11|^|l=12|^|m=13|^|n=14|^|o=15|^|p=16|^|q=17|^|r=18|^|s=19|^|t=20|^|u=21|^|v=22|^|w=23|^|x=24|^|y=25|^|z=26|^|aa=27|^|ab=28|^|ac=29|^|ad=30|^|ae=31|^|af=32|^|ag=33|^|ah=34|^|ai=35|^|aj=36|^|ak=37|^|al=38|^|am=39|^|an=40|^|ao=41|^|ap=42|^|aq=43|^|ar=44|^|as=45|^|at=46|^|au=47|^|av=48|^|aw=49|^|ax=50";
ResetTimer();
my %hash;
# First timed variant: split the record on the "|^|" delimiter, then split
# every chunk on "=" and store the pair. Repeated 100000 times for timing.
for( my $i = 1; $i <= 100000; $i++ ) {
    my @fields = split( /\Q|^|\E/, $line );
    foreach my $field (@fields) {
        my ($name, $value) = split( /=/, $field );
        $hash{$name} = $value;
    }
}
print Elapsed() . "\n";
ResetTimer();
%hash = ();
for( my $i = 1; $i <= 100000; $i++ ) {
%hash = split( /\Q|^|\E|=/, $line );
}
print Elapsed() . "\n";
ResetTimer();
%hash = ();
for( my $i = 1; $i<=100000; $i++ ) {
$line =~ s/\Q|^|\E/=/g;
%hash = split( /=/, $line );
}
print Elapsed() . "\n";
################################################################################################################################
BEGIN {
    # Timestamp of the most recent ResetTimer() call; private to the two
    # subs below because it is declared inside this BEGIN block.
    my $timer_origin;

    # Start (or restart) the stopwatch and return the timestamp it was
    # started at.
    sub ResetTimer {
        return $timer_origin = time();
    }

    # Return the time that has passed since the last ResetTimer() call.
    sub Elapsed {
        my $now = time();
        return $now - $timer_origin;
    }
}
I can't easily answer your performance question, because I'd need a test case. But I'd guess that it's to do with how the regular expression is being processed.
You can see what that's doing with use re 'debug';, and that'll print the regular expression steps.
But for the broader question - I'd probably just tackle it with a global (assuming your data is as simple as the example):
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;

# For each input record, collect every word=word pair with a single global
# match; the flat list of captures becomes the key/value pairs of the hash.
while ( my $record = <DATA> ) {
    my %row = $record =~ m/(\w+)=(\w+)/g;
    print Dumper( \%row );
}
__DATA__
var1=value1|^|var2=value2|^|var3=value3|^|var4=value4
You can use lookahead/behind to match delimiters if you've got more complicated things in there, but because it's one regex per line, you're invoking the regex engine less often, and that'll probably be faster. (But I can't tell you for sure without a test case).
If your data is more complicated, then perhaps:
my %row = s/\Q|^|\E/\n/rg =~ m/(.*)=(.*)/g;
This will 'force' splitting the input into a new line, and then match 'anything' = 'anything'. But that's probably overkill unless your values include whitespace/pipes/metachars.
With editing your test case to use Benchmark:
#!/usr/bin/env perl
use strict;
use warnings;
use Benchmark qw ( cmpthese );

# One sample record: 50 "name=value" pairs joined by the literal "|^|"
# delimiter. Every benchmarked sub reads this shared string and must leave
# it unchanged; otherwise the candidates timed afterwards would be parsing
# different input and the comparison would be meaningless.
my $line =
"a=1|^|b=2|^|c=3|^|d=4|^|e=5|^|f=6|^|g=7|^|h=8|^|i=9|^|j=10|^|k=11|^|l=12|^|m=13|^|n=14|^|o=15|^|p=16|^|q=17|^|r=18|^|s=19|^|t=20|^|u=21|^|v=22|^|w=23|^|x=24|^|y=25|^|z=26|^|aa=27|^|ab=28|^|ac=29|^|ad=30|^|ae=31|^|af=32|^|ag=33|^|ah=34|^|ai=35|^|aj=36|^|ak=37|^|al=38|^|am=39|^|an=40|^|ao=41|^|ap=42|^|aq=43|^|ar=44|^|as=45|^|at=46|^|au=47|^|av=48|^|aw=49|^|ax=50";

# Split on the delimiter, then split each chunk on "=".
sub double_split {
    my %hash;
    my @fields = split( /\Q|^|\E/, $line );
    foreach my $field (@fields) {
        my ( $name, $value ) = split( /=/, $field );
        $hash{$name} = $value;
    }
}

# One split whose pattern is an alternation of the delimiter and "=".
sub single_split {
    my %hash = split( /\Q|^|\E|=/, $line );
}

# Turn every delimiter into "=", then split on "=" only. The /r flag makes
# the substitution return a modified copy, so the shared $line keeps its
# delimiters for the other candidates.
sub re_replace_then_split {
    my %hash = split( /=/, $line =~ s/\Q|^|\E/=/gr );
}

# One global match that captures word=word pairs directly.
sub single_regex {
    my %hash = $line =~ m/(\w+)=(\w+)/g;
}

# Put each pair on its own line (on a copy), then match anything=anything.
sub compound {
    my %hash = $line =~ s/\Q|^|\E/\n/rg =~ m/(.*)=(.*)/g;
}

cmpthese(
    1_000_000,
    {   "Double Split"                 => \&double_split,
        "single split with regex"      => \&single_split,
        "Replace then split"           => \&re_replace_then_split,
        "Single Regex"                 => \&single_regex,
        "regex to linefeed them match" => \&compound
    }
);
Looks like the results come out like:
Rate Double Split single split with regex Single Regex Replace then split regex to linefeed them match
Double Split 18325/s -- -4% -34% -56% -97%
single split with regex 19050/s 4% -- -31% -54% -97%
Single Regex 27607/s 51% 45% -- -34% -96%
Replace then split 41733/s 128% 119% 51% -- -93%
regex to linefeed them match 641026/s 3398% 3265% 2222% 1436% --
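... I'm a bit suspicious of that last, because that's absurdly faster. There's probably caching of results happening there. (A more likely explanation: if one of the benchmarked subs rewrites the shared $line in place, the benchmarks that run after it see a line with no |^| delimiters left, so the last pattern matches only once per call and is not doing comparable work.)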
But looking at it, what's slowing you down is the alternation in the regex:
# Variant under test: one split whose pattern alternates between the
# literal "|^|" delimiter and "=".
sub single_split_with_alt {
    my %pairs = split /\Q|^|\E|=/, $line;
}
# Variant under test: one split on any run of "|", "^" or "=" characters,
# expressed as a character class instead of an alternation.
sub single_split {
    my %pairs = split /[\|\^\=]+/, $line;
}
(I know that latter might not be quite what you want, but it's for illustrative purposes)
Gives:
Rate alternation single split
alternation 19135/s -- -37%
single split 30239/s 58% --
But there does come a point where this is moot, because your limiting factor is disk IO, not CPU.

perl count line in double looping, if match regular expression plus 1

I open a file by putting the line to an array. Inside this file based on the regular expression that contains a duplicate value. If the regular expression is a match I want to count it. The regular expression may look like this
$b =~ /\/([^\/]+)##/. I want to match $1 value.
# Read every line of abc.txt into @array. The handle is lexical to the do
# block, so it is closed as soon as the block is left.
my @array = do
{
    # Double quotes so "\n" is a real newline, and $! so the message says
    # why the open failed.
    open my $FH, '<', 'abc.txt' or die "unable to open the file: $!\n";
    <$FH>;
};
Below is the way I do, it will get the same line in my file. Thank for help.
# Compare the "/name##" component of every line with that of every other
# line and count the matches in $conflictTemp (reset for each outer line).
# NOTE(review): kept as posted apart from the restored array sigils. $b and
# $c are loop aliases of the array elements, so the assignments from $1
# overwrite the array contents while the loops run; $1 also keeps its old
# value when a match fails. Presumably this is related to the behaviour the
# question describes - confirm before reuse.
foreach my $b (@array)
{
    $conflictTemp = 0;
    $b =~ /\/([^\/]+)##/;
    $b = $1;
    #print "$b\n";
    foreach my $c (@array)
    {
        $c =~ /\/([^\/]+)##/;
        $c = $1;
        if($b eq $c)
        {
            $conflictTemp ++;
            #print "$b , $c \n"
            #if($conflictTemp > 1)
            #{
            # $conflict ++;
            #}
        }
    }
}
Below is the some sample data, two sentences are duplicates
/a/b/c/d/code/Debug/atlantis_digital/c/d/code/Debug/atlantis_digital.map##/main/place.09/2
/a/b/c/d/code/C5537_mem_map.cmd##/main/place.09/0
/a/b/c/d/code/.settings/org.eclipse.cdt.managedbuilder.core.prefs##/main/4
/a/b/c/d/code/.project_initial##/main/2
/a/b/c/d/code/.project##/main/CSS5/5
/a/b/c/d/code/.cproject##/main/CSS5/10
/a/b/c/d/code/.cdtproject##/main/place.09/0
/a/b/c/d/code/.cdtproject##/main/place.09/0
/a/b/c/d/code/.cdtbuild_initial##/main/2
/a/b/c/d/code/.**cdtbuild##**/main/CSS5/2
/a/b/c/d/code/.**cdtbuild##**/main/CSS5/2
/a/b/c/d/code/.ccsproject##/main/CSS5/3
It looks like you're trying to iterate each element of the array, select some data via pattern match, and then count dupes. Is that correct?
Would it not be easier to:
# Count how often each "/name##" component occurs in the input.
my %count_of;
while ( <$FH> ) {
    # Lines without the marker are skipped; otherwise the capture would be
    # undefined and be counted under an empty key with a warning.
    my ( $val ) = /\/([^\/]+)##/ or next;
    $count_of{$val}++;
}
And then, for the variables that have more than one (e.g. there's a duplicate):
print join "\n", grep { $count_of{$_} > 1 } keys %count_of;
Alternatively, if you're just wanting to play 'spot the dupe':
#!/usr/bin/env perl
use strict;
use warnings;

# Captures the path component that sits directly in front of the "##" marker.
my $name_before_marker = qr/\/([^\/]+)##/;

# Number of times each captured name has been met so far.
my %times_seen;

# Echo a line only when its name was already seen on an earlier line;
# lines without the marker are ignored.
while ( my $record = <DATA> ) {
    next unless $record =~ $name_before_marker;
    my $name = $1;
    print $record if $times_seen{$name}++;
}
__DATA__
/a/b/c/d/code/Debug/atlantis_digital/c/d/code/Debug/atlantis_digital.map##/main/place.09/2
/a/b/c/d/code/C5537_mem_map.cmd##/main/place.09/0
/a/b/c/d/code/.settings/org.eclipse.cdt.managedbuilder.core.prefs##/main/4
/a/b/c/d/code/.project_initial##/main/2
/a/b/c/d/code/.project##/main/CSS5/5
/a/b/c/d/code/.cproject##/main/CSS5/10
/a/b/c/d/code/.cdtproject##/main/place.09/0
/a/b/c/d/code/.cdtproject##/main/place.09/0
/a/b/c/d/code/.cdtbuild_initial##/main/2
/a/b/c/d/code/.cdtbuild##/main/CSS5/2
/a/b/c/d/code/.cdtbuild##/main/CSS5/2
/a/b/c/d/code/.ccsproject##/main/CSS5/3
The problem has been solved by the previous answer - I just want to offer an alternate flavour that;
Spells out the regex
Uses the %seen hash to record the line the pattern first appears; to enable
slightly more detailed reporting
use v5.12;
use warnings;

# Matches the last path component in front of the "##" marker. The hashes
# are backslash-escaped because, with the x flag, a bare hash character
# starts a comment and the marker would silently drop out of the pattern.
my $regex = qr/
    \/          # A literal slash followed by
    (           # Capture group 1 ...
      [^\/]+    # ... anything that's not a slash
    )           # close capture group 1
    \#\#        # Must be immediately followed by two literal hash characters
/x;

# First input line number on which each captured name was seen.
my %line_num;

while (<>) {
    next unless /$regex/;
    my $pattern = $1;
    if ( $line_num{$pattern} ) {
        # Seen before: report the first line and the current one.
        say "'$pattern' appears on lines ", $line_num{$pattern}, " and $.";
        next;
    }
    $line_num{$pattern} = $.;    # Record the line number
}
# Ran on data above will produce;
# '.cdtproject' appears on lines 7 and 8
# '.cdtbuild' appears on lines 10 and 11

Extract word before the 1st occurrence of a special string

I have an array that contains elements like
# Sample rows; the text wanted from each one is whatever precedes the
# first "&&&" (or the whole row when there is none).
@array=("link_dm &&& drv_ena&&&1",
"txp_n_los|rx_n_lost",
"eof &&& 2 &&& length =!!!drv!!!0");
I want to get all the characters before the first "&&&", and if the element doesn't have a "&&&", then I need to extract the entire element.
This is what I want to extract:
link_dm
txp_n_los|rx_n_lost
eof
I used
# NOTE(review): kept as posted apart from the restored array sigils. The
# greedy (.*) runs up to the last "&&&" in the row, which is the result
# the question is asking about.
foreach my $row (@array){
    if($row =~ /^(.*)\&{3}/){
        push @firstelements,$1;
    }
}
But I'm getting
link_dm &&& drv_ena
txp_n_los|rx_n_lost
eof &&& 2
Can somebody please suggest how I can achieve this?
Perhaps just splitting would be helpful:
use strict;
use warnings;

my @array = (
    "link_dm &&& drv_ena&&&1",
    "txp_n_los|rx_n_lost",
    "eof &&& 2 &&& length =!!!drv!!!0"
);

foreach my $row (@array){
    # Limit 2: only the text in front of the first "&&&" is needed, so the
    # rest of the row is left unsplit. The \s* swallows padding before the
    # delimiter so the result has no trailing blanks.
    my ($chars) = split /\s*&{3}/, $row, 2;
    # An empty row splits into an empty list; print it as an empty line.
    $chars = '' unless defined $chars;
    print $chars, "\n"
}
Output:
link_dm
txp_n_los|rx_n_lost
eof
You can write:
# Text before the first "&&&" (padding removed), or the whole row when the
# row contains no "&&&".
@firstelements = map { m/^(.*?) *&&&/ ? $1 : $_ } @array;
Or, if you prefer foreach over map and if over ?::
foreach my $row (@array){
    # Non-greedy capture stops at the first "&&&" rather than the last;
    # " *" drops the padding in front of it.
    if($row =~ /^(.*?) *&&&/) {
        push @firstelements, $1;
    } else {
        # No "&&&" in this row: keep the whole row.
        push @firstelements, $row;
    }
}
# Print the run of non-space characters that is followed either by the end
# of the row or by optional blanks and a "&&&" with the rest of the row.
for (@array) {
    print "$1\n" if /([^ ]*)(?: *[&]{3}.*)?$/;
}
If you're using regular expressions, use the minimum spanning pattern: .*?. See perldoc perlre http://perldoc.perl.org/perlre.html
#!/usr/bin/env perl
use strict;
use warnings;
# --------------------------------------
use charnames qw( :full :short );
use English qw( -no_match_vars ); # Avoids regex performance penalty
use Data::Dumper;
# Make Data::Dumper pretty
$Data::Dumper::Sortkeys = 1;
$Data::Dumper::Indent = 1;
# Set maximum depth for Data::Dumper, zero means unlimited
local $Data::Dumper::Maxdepth = 0;
# conditional compile DEBUGging statements
# See http://lookatperl.blogspot.ca/2013/07/a-look-at-conditional-compiling-of.html
use constant DEBUG => $ENV{DEBUG};
# --------------------------------------
my @array = (
    "link_dm &&& drv_ena&&&1",
    "txp_n_los|rx_n_lost",
    "eof &&& 2 &&& length =!!!drv!!!0",
);

# Text in front of the first '&&&' of every row, or the whole row when the
# row has no '&&&'.
my @first_elements = ();
for my $line ( @array ){
    # check for '&&&'
    # The list assignment is true whenever the pattern matched, even if
    # the captured text itself is empty.
    if( my ( $first_element ) = $line =~ m{ \A (.*?) \s* \&{3} }msx ){
        push @first_elements, $first_element;
    }else{
        push @first_elements, $line;
    }
}
print Dumper \@first_elements;

Perl Regular Expression to insert/substitute in a string at specific places

Given a url the following regular expression is able insert/substitute in words at certain points in the urls.
Code:
#!/usr/bin/perl
use strict;
use warnings;
#use diagnostics;

# Words to insert into each URL, one pass per word.
my @insert_words = qw/HELLO GOODBYE/;

# The file-level $word and $match declarations of the original were never
# read (both are re-declared inside the loops), so they are dropped here.
while (<DATA>) {
    chomp;
    foreach my $word (@insert_words)
    {
        # Number of path segments to step over before inserting the word;
        # it grows until the substitution no longer matches.
        my $repeat = 1;
        # Work on a copy ($match) so the URL in $_ stays intact for the
        # next repetition and the next word.
        while ((my $match=$_) =~ s|(?<![/])(?:[/](?![/])[^/]*){$repeat}[^/]*\K|$word|)
        {
            print "$match\n";
            $repeat++;
        }
        print "\n";
    }
}
__DATA__
http://www.stackoverflow.com/dog/cat/rabbit/
http://www.superuser.co.uk/dog/cat/rabbit/hamster/
10.15.16.17/dog/cat/rabbit/
The output given (for the first example url in __DATA__ with the HELLO word):
http://www.stackoverflow.com/dogHELLO/cat/rabbit/
http://www.stackoverflow.com/dog/catHELLO/rabbit/
http://www.stackoverflow.com/dog/cat/rabbitHELLO/
http://www.stackoverflow.com/dog/cat/rabbit/HELLO
Where I am now stuck:
I would now like to alter the regular expression so that the output will look like what is shown below:
http://www.stackoverflow.com/dogHELLO/cat/rabbit/
http://www.stackoverflow.com/dog/catHELLO/rabbit/
http://www.stackoverflow.com/dog/cat/rabbitHELLO/
http://www.stackoverflow.com/dog/cat/rabbit/HELLO
#above is what it already does at the moment
#below is what i also want it to be able to do as well
http://www.stackoverflow.com/HELLOdog/cat/rabbit/ #<-puts the word at the start of the string
http://www.stackoverflow.com/dog/HELLOcat/rabbit/
http://www.stackoverflow.com/dog/cat/HELLOrabbit/
http://www.stackoverflow.com/dog/cat/rabbit/HELLO
http://www.stackoverflow.com/HELLO/cat/rabbit/ #<- now also replaces the string with the word
http://www.stackoverflow.com/dog/HELLO/rabbit/
http://www.stackoverflow.com/dog/cat/HELLO/
http://www.stackoverflow.com/dog/cat/rabbit/HELLO
But I am having trouble getting it to automatically do this within the one regular expression.
Any help with this matter would be highly appreciated, many thanks
One solution:
use strict;
use warnings;
use URI qw( );

# Words to insert into, or substitute for, each path segment.
my @insert_words = qw( HELLO );

while (<DATA>) {
    chomp;
    my $url = URI->new($_);
    my $path = $url->path();
    for (@insert_words) {
        # Use package vars to communicate with /(?{})/ blocks.
        local our $insert_word = $_;
        local our @paths;
        # The trailing (?!) always fails, which makes the regex engine
        # backtrack through every way of splitting the path into
        # prefix / segment / suffix; the code block runs once per split
        # and records the rewritten paths.
        $path =~ m{
            ^(.*/)([^/]*)((?:/.*)?)\z
            (?{
                push @paths, "$1$insert_word$2$3";
                if (length($2)) {
                    push @paths, "$1$insert_word$3";
                    push @paths, "$1$2$insert_word$3";
                }
            })
            (?!)
        }x;
        for (@paths) {
            $url->path($_);
            print "$url\n";
        }
    }
}
__DATA__
http://www.stackoverflow.com/dog/cat/rabbit/
http://www.superuser.co.uk/dog/cat/rabbit/hamster/
http://10.15.16.17/dog/cat/rabbit/
Without crazy regexes:
use strict;
use warnings;
use URI qw( );

# Words to insert into, or substitute for, each path segment.
my @insert_words = qw( HELLO );

while (<DATA>) {
    chomp;
    my $url = URI->new($_);
    my $path = $url->path();
    for my $insert_word (@insert_words) {
        # Every segment that follows a slash, including an empty one after
        # a trailing slash.
        my @parts = $path =~ m{/([^/]*)}g;
        my @paths;
        for my $part_idx (0..$#parts) {
            my $orig_part = $parts[$part_idx];
            # Restore this element automatically at the end of the
            # iteration, so each variant starts from the original parts.
            local $parts[$part_idx];
            {
                # Word in front of the segment.
                $parts[$part_idx] = $insert_word . $orig_part;
                push @paths, join '', map "/$_", @parts;
            }
            if (length($orig_part)) {
                {
                    # Word instead of the segment.
                    $parts[$part_idx] = $insert_word;
                    push @paths, join '', map "/$_", @parts;
                }
                {
                    # Word behind the segment.
                    $parts[$part_idx] = $orig_part . $insert_word;
                    push @paths, join '', map "/$_", @parts;
                }
            }
        }
        for (@paths) {
            $url->path($_);
            print "$url\n";
        }
    }
}
__DATA__
http://www.stackoverflow.com/dog/cat/rabbit/
http://www.superuser.co.uk/dog/cat/rabbit/hamster/
http://10.15.16.17/dog/cat/rabbit/
one more solution:
#!/usr/bin/perl
use strict;
use warnings;

# Words to insert in front of and behind each path segment.
my @insert_words = qw/HELLO GOODBYE/;

while (<DATA>) {
    chomp;
    # Find the first single slash (not part of "//") and the segment after
    # it. Lines without one are skipped, because the match variables used
    # below would otherwise still hold values from an earlier line.
    next unless /(?<![\/])(?:[\/](?![\/])[^\/]*)/p;
    my $begin_part = ${^PREMATCH};
    my $tail = ${^MATCH} . ${^POSTMATCH};
    # Limit -1 keeps the empty chunk after a trailing slash, so the slash
    # is still there when the chunks are joined again.
    my @tail_chunks = split /\//, $tail, -1;
    foreach my $word (@insert_words) {
        # Chunk 0 is the empty string in front of the leading slash.
        for my $index (1..$#tail_chunks) {
            my @new_tail = @tail_chunks;
            $new_tail[$index] = $word . $tail_chunks[$index];
            my $str = $begin_part . join "/", @new_tail;
            print $str, "\n";
            # For an empty chunk, prefixing and suffixing give the same
            # string; print it only once.
            next unless length $tail_chunks[$index];
            $new_tail[$index] = $tail_chunks[$index] . $word;
            $str = $begin_part . join "/", @new_tail;
            print $str, "\n";
        }
        print "\n";
    }
}
__DATA__
http://www.stackoverflow.com/dog/cat/rabbit/
http://www.superuser.co.uk/dog/cat/rabbit/hamster/
10.15.16.17/dog/cat/rabbit/