Find files matching patterns in Perl - regex

I am taking the user input via -f option, and whatever he enters, accordingly files are being searched recursively.
My problem is: When user enters "tmp*", then also it searches for "abctmp", "xyztmp" etc. What I want to do is, only files starting with tmp should come.
In short, whatever user enters accordingly files should be pushed to array.
Currently I am doing this, but I am sure there's some classy, short way to do it.
#! /perl/bin/perl
use strict;
use warnings;
use File::Find;
use getopt::Long;
my $filename="tmp*.txt";
find( { wanted => \&wanted,
preprocess => \&dir_search,
}, '.');
sub wanted{
my $regex;
my $myop;
my #mylist;
my $firstchar= substr($filename, 0,1); # I am checking first character.
# Whether it's ".*tmp" or just "tmp*"
if($filename=~ m/[^a-zA-Z0-9_]/g){ #If contain wildcard
if($firstchar eq "."){ # first character "."
my $myop = substr($filename, 1,1);
my $frag = substr($filename,2);
$filename = $frag;
$regex = '\b(\w' . ${myop}. ${filename}. '\w*)\b';
# Has to find whatever comes before 'tmp', too
} else {
$regex = '\b(' . ${myop}. ${filename}. '\w*)\b';
# Like, "tmp+.txt" Only search for patterns starting with tmp
}
if($_ =~ /$regex/) {
push(#mylist, $_);
}
} else {
if($_ eq $filename) { #If no wildcard, match the exact name only.
push(#mylist, $_);
}
}
}
sub dir_search {
my (#entries) = #_;
if ($File::Find::dir eq './a') {
#entries = grep { ((-d && $_ eq 'g') ||
((-d && $_ eq 'h') ||
(!(-d && $_ eq 'x')))) } #entries;
# Want from 'g' and 'h' folders only, not from 'x' folder
}
return #entries;
}
And another thing is, I want to search for only '.txt' files. Where should I put that condition?

#!/perl/bin/perl
sub rec_dir {
($dir,$tmpfile_ref) = #_;
opendir(CURRENT, $dir);
#files = readdir(CURRENT);
closedir(CURRENT);
foreach $file (#files) {
if( $file eq ".." || $file eq "." ) { next; }
if( -d $dir."/".$file ) { rec_dir($dir."/".$file,$tmpfile_ref); }
elsif( $file =~ /^tmp/ && $file =~ /\.txf$/ ) { push(#{$tmpfile_ref},$dir."/".$file); }
}
}
#matching_files = ();
$start_dir = ".";
rec_dir($start_dir,\#matching_files);
foreach $file (#matching_files) { print($file."\n"); }
I didn't test it. Barring typographical errors I think it will work.

Related

Dynamic replacing and removing C code (using Perl)

I come to you with my question: what is the best way to replace/remove pieces of C code dynamically? I already did something in Perl using regular expressions and reading what to replace/remove from a configuration file but I can't make it dynamic.
Code:
#autoflush output so it will not interfere with calling application
local $| = 1;
##
#libraries
use warnings;
use strict;
use Switch;
use Cwd;
use File::Find;
##
#global variables
my #config;
my $file;
my $directory;
my $result;
##
if(#ARGV < 1)
{
$directory = cwd();
}
else
{
$directory = $ARGV[0];
}
$result = $directory . "\\result";
if(! -d $result)
{
mkdir ($result);
}
open LOG, ">", $result . "\\log.log";
sub start
{
my $configFile = $_[0];
open CONFIG, $configFile;
local $/;
my $conf = <CONFIG>;
close CONFIG;
foreach my $line (split(/\n\*/, $conf))
{
if(index($line, "*") == 0)
{
$line = substr($line, 1);
}
setConfig($line);
}
processFiles();
}
sub setConfig
{
my $line = $_[0];
my $count = () = $line =~ /\s*==>\s*/;
switch($count)
{
case 0
{
remove($line);
}
case 1
{
replace($line);
}
}
}
sub addSlashes
{
$_[0] =~ s/([\.\\\/\+\*\?\[\^\]\(\)\{\}\=\!\<\>\|\:\-])/\\$1/xg;
if($_[1] == 1)
{
$_[0] =~ s/([\$])/\\$1/xg;
}
return;
}
sub remove
{
my $line = $_[0];
addSlashes($line, 1);
$line =~ s/(\\\$){3}/\.\+/g;
$config[#config][0] = qr/$line/;
$config[#config - 1][1] = q("");
$line = "\\(" . $line . "\\(";
$config[#config - 1][2] = qr/$line/;
}
sub replace
{
my $line = $_[0];
my #split = split(/\s*==>\s*/, $line);
my $original = $split[0];
my $replace = $split[1];
my $regex;
addSlashes($original, 1);
addSlashes($replace, 0);
my $counter = 1;
while($original =~ /\\\$([\d]{1,3})\\\$/g)
{
if($1 <= $counter && $1 > 0)
{
$counter++;
}
else
{
print "Invalid format\n";
return;
}
}
if($counter == 1)
{
$config[#config][0] = qr/$original/;
$config[#config - 1][1] = q(") . $replace . q(");
$original = "\\(" . $original . "\\(";
$config[#config - 1][2] = qr/$original/;
return;
}
while($replace =~ /\$([\d]{1,3})\$/g)
{
if($1 <= 0 && $1 >= $counter)
{
print "Invalid format\n";
return;
}
}
$original =~ s/\\\$\d{1,3}\\\$/\(\.\+\?\)/xg;
$original =~ s/\?\)$/\)/xg;
$replace =~ s/\$(\d{1,3})\$/\$$1/xg;
$config[#config][0] = qr/$original/;
$config[#config - 1][1] = q(") . $replace . q(");
$original = "\\(" . $original . "\\(";
$config[#config - 1][2] = qr/$original/;
}
sub processFiles
{
my #files = grep { ! -d } glob "$directory\\*";
foreach my $file (#files)
{
if($file =~ /\.(h|c)$/)
{
process($file);
}
}
}
sub process
{
my $file = $_[0];
open READ, $file;
local $/;
my $text = <READ>;
close READ;
print LOG "\n--> $file <--\n";
for(my $i = 0; $i < #config; $i++)
{
my $original = $config[$i][0];
my $replace = $config[$i][1];
my $log = $config[$i][2];
while($text =~ /$log/g)
{
print LOG $log . " ----> " . $1 . "\n";
}
$text =~ s/$original/$replace/eeg;
print LOG "\n";
}
$file = $result . substr($file, rindex($file, "\\"));
open WRITE, ">", $file;
print WRITE $text;
close WRITE;
}
start("qm2asil.cfg");
close LOG;
Configuration file content:
*static
*GET_$1$() ==> $1$
*GET_$1$($2$) ==> $1$[$2$]
*SET_$1$($2$,$3$); ==> $1$[$2$] = $3$;
*SET_$1$($2$); ==> $1$ = $2$;
The idea is that there are already a few rules to replace/remove and they work but can exist more complex rules that I couldn't manage.
Example:
SET_VAR1((i),(u8)(((s32)(((s32)GET_VAR2 ((i))) != 0)) && ((s32)((u8)(((s32) (((s32)VAR3[i]) != 0)) ^ ((s32)(((s32) VAR4[i]) != 0)))))));
I want to remove SET function and make it an assignment to the variable (VAR1[i] = ...). This is one of many variations of things that need to be removed/replaced.
What do you advise me to do? Can I make it work using Perl and regex or I should reorientate to another method and/or programming language?
EDIT: I already create regexes based on the configuration file but I have problem matching unknown expressions (currently I use .+). The main idea is that I want to keep the configuration as simple I can.
regexes

Improve performance of Perl search file script

I have recently noticed that a quick script I had written in Perl that was designed to be used on sub 10MB files has been modified, re-tasked and used in 40MB+ text files with significant performance issues in a batch environment.
The jobs have been running for about 12 hours per run when encountering a large text file and I am wondering how do I improve the perfomance of the code? Should I slurp the file into memory and if I do it will break the jobs reliance on the line numbers in the file. Any constructive thought would be greatly appreciated, I know the job is looping through the file too many times but how to reduce that?
#!/usr/bin/perl
use strict;
use warnings;
my $filename = "$ARGV[0]"; # This is needed for regular batch use
my $cancfile = "$ARGV[1]"; # This is needed for regular batch use
my #num =();
open(FILE, "<", "$filename") || error("Cannot open file ($!)");
while (<FILE>)
{
push (#num, $.) if (/^P\|/)
}
close FILE;
my $start;
my $end;
my $loop = scalar(#num);
my $counter =1;
my $test;
open (OUTCANC, ">>$cancfile") || error ("Could not open file: ($!)");
#Lets print out the letters minus the CANCEL letters
for ( 1 .. $loop )
{
$start = shift(#num) if ( ! $start );
$end = shift(#num);
my $next = $end;
$end--;
my $exclude = "FALSE";
open(FILE, "<", "$filename") || error("Cannot open file ($!)");
while (<FILE>)
{
my $line = $_;
$test = $. if ( eof );
if ( $. == $start && $line =~ /^P\|[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]\|1I\|IR\|/)
{
print OUTCANC "$line";
$exclude = "TRUECANC";
next;
}
if ( $. >= $start && $. <= $end && $exclude =~ "TRUECANC")
{
print OUTCANC "$line";
} elsif ( $. >= $start && $. <= $end && $exclude =~ "FALSE"){
print $_;
}
}
close FILE;
$end = ++$test if ( $end < $start );
$start = $next if ($next);
}
#Lets print the last letter in the file
my $exclude = "FALSE";
open(FILE, "<", "$filename") || error("Cannot open file ($!)");
while (<FILE>)
{
my $line = $_;
if ( $. == $start && $line =~ /^P\|[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]\|1I\|IR\|/)
{
$exclude = "TRUECANC";
next;
}
if ( $. >= $start && $. <= $end && $exclude =~ "TRUECANC")
{
print OUTCANC "$line";
} elsif ( $. >= $start && $. <= $end && $exclude =~ "FALSE"){
print $_;
}
}
close FILE;
close OUTCANC;
#----------------------------------------------------------------
sub message
{
my $m = shift or return;
print("$m\n");
}
sub error
{
my $e = shift || 'unknown error';
print("$0: $e\n");
exit 0;
}
There are some things that could speed the script up, like removing unneccessary regex usage.
/^P\|/ is equivalent to "P|" eq substr $_, 0, 2.
$foo =~ "BAR" could be -1 != index $foo, "BAR".
Then there is some repeated code. Factoring that out into a sub will not increase performance per se, but makes it easier to reason about the behaviour of the script.
There are a lot of unneccessary stringifications like "$filename" โ€“ $filename alone is sufficient.
But the worst offender would be this:
for ( 1 .. $loop ) {
...
open FILE, "<", $filename or ...
while (<FILE>) {
...
}
...
}
You only need to read that file in once, preferably into an array. You can the loop over the indices:
for ( 1 .. $loop ) {
...
for my $i (0 .. $#file_contents) {
my $line = $file_contents[$i];
... # swap $. for $i, but avoid off-by-one error
}
...
}
Disk IO is slow, so cache where you can!
I also see that you are using the $exclude variable as a boolean with the values FALSE and TRUECANC. Why not 0 and 1, so you can use it directly in a conditional?
You can factor out common tests in if/elsif:
if (FOO && BAR) { THING_A }
elsif (FOO && BAZ) { THING_B }
should be
if (FOO) {
if (BAR) { THING_A }
elsif (BAZ) { THING_B }
}
The $. == $start && $line =~ /^P\|.../ test may be silly, because $start contains only the numbers of lines that start with P| โ€“ so the regex may be sufficient here.
Edit
If I have understood the script correctly then the following should yield a significant performance increase:
#!/usr/bin/perl
use strict;
use warnings;
my ($filename, $cancfile) = #ARGV;
open my $fh, "<", $filename or die "$0: Couldn't open $filename: $!";
my (#num, #lines);
while (<$fh>)
{
push #lines, $_;
push #num, $#lines if "P|" eq substr $_, 0, 2;
}
open my $outcanc, ">>", $cancfile or die "$0: Couldn't open $cancfile: $!";
for my $i ( 0 .. $#num )
{
my $start = $num[$i];
my $end = ($num[$i+1] // #lines) - 1;
# pre v5.10:
# my $end = (defined $num[$i+1] ? $num[$i+1] : #lines) - 1
if ($lines[$start] =~ /^P[|][0-9]{9}[|]1I[|]IR[|]/) {
print {$outcanc} #lines[$start .. $end];
} else {
print STDOUT #lines[$start .. $end];
}
}
The script is cleaned up. The file is cached in an array. Only the parts of the array are iterated that are actually needed โ€“ we are down to O(n) from the previous O(n ยท m).
For your future scripts: Proving behaviour around loops and mutating variables is not impossible, but tedious and annoying. Realizing that
for (1 .. #num) {
$start = shift #num unless $next; # aka "do this only in the first iteration"
$next = shift #num:
$end = $next - 1:
while (<FH>) {
...
$test = $. if eof
...
}
$end = ++test if $end < $start;
$start = $next if $next;
}
is actually all about circumventing a possible undef in the 2nd shift takes some time. Instead of testing for eof in the inner loop, we can just pick the line number after the loop, so we don't need $test. Then we get:
$start = shift #num;
for my $i (1 .. #num) {
$end = $num[$i] - 1:
while (<FH>) { ... }
$end = $. + 1 if $end < $start; # $end < $start only true if not defined $num[$i]
$start = $num[$i] if $num[$i];
}
After translating $i down by one we confine the out-of-bounds problem to one point only:
for my $i (0 .. $#num) {
$start = $num[$i];
$end = $num[$i+1] - 1; # HERE: $end = -1 if $i == $#num
while (<FH>) { ... }
}
$end = $. + 1 if $end < $start;
After replacing the file reading with an array (careful, there is a difference of one between the array index and the line number), we see that the final file reading loop can be avoided if we pull that iteration into the for loop, because we know how many lines there are in total. So to say, we do
$end = ($num[$i+1] // $last_line_number) - 1;
Hopefully my cleaned up code is indeed equivalent to the original.

List files that exist only on a given directory?

I am trying to list out the files which are only in directory ./a/b or ./a/d. Now I am explicitly specifying by using (-d && $_ =~ "b") || (-d && $_ =~ "d"). Is there any way I can put needed folders in an array?
use File::Find;
my $filename = "h*.txt";
print ("Now it's:", $filename);
find({
wanted => \&wanted,
preprocess => \&dir_preprocess,
}, './a');
sub dir_preprocess {
my (#entries) = #_;
#my #tmparr=("d","b"); This isn't working
if ( $File::Find::dir eq './a' ) {
#entries = grep { (-d && $_ =~ "b") || (-d && $_ =~ "d") }#entries;
}
return #entries;
}
my #mylist;
sub wanted{
if($_ =~ $filename) {
push(#mylist, $_);
}
}
print ("It's:", #mylist);
You can use | as "or" in a regex expression. (-d && $_ =~ /(b|d)/) would match either b or d. You could put the names in an array and then use join to generate the regex you need.
You might want to do things like prepend the path upto that depth, else you might get at different levels of the heirarchy. Also you can add $ at the end to indicate that it comes at the end.

(La)Tex math parsing for C/C++

I would like to convert parse (la)tex math expressions, and convert them to (any kind of!) scripting language expression, so I can evaluate expressions.
What libraries do you recommend ?
May be it will help - take a look at TeXmacs, especially at a way it interacts with computer algebra systems.
Here is a set of possible options from a similar question. https://tex.stackexchange.com/questions/4223/what-parsers-for-latex-mathematics-exist-outside-of-the-tex-engines
I think that Perl would make a fine choice for something like this, acting on text is one of its fortes.
Here is some info on how to make an exclusive flip-flop test (to find the context between \begin{} and \end{} without keeping those lines), http://www.effectiveperlprogramming.com/2010/11/make-exclusive-flip-flop-operators/
EDIT: So this problem has started me going. Here is a first attempt to create something here is my "math.pl" which takes a .tex file as an arguement (i.e. $./math.pl test.tex).
#!/usr/bin/env perl
use strict;
use warnings;
use Text::Balanced qw/extract_multiple extract_bracketed/;
my $re_num = qr/[+\-\dE\.]/;
my $file = shift;
open( my $fh, '<', $file);
#parsing this out for more than just the equation environment might be easier using Text::Balanced too.
my #equations;
my $current_equation = '';
while(<$fh>) {
my $test;
next unless ($test = /\\begin\{equation\}/ .. /\\end\{equation\}/);
if ($test !~ /(^1|E0)$/ ) {
chomp;
$current_equation .= $_;
} elsif ($test =~ /E0$/) {
#print $current_equation . "\n";
push #equations, {eq => $current_equation};
$current_equation = '';
}
}
foreach my $eq (#equations) {
print "Full Equation: " . $eq->{'eq'} . "\n";
solve($eq);
print "Result: " . $eq->{'value'} . "\n\n";
}
sub solve {
my $eq = shift;
print $eq->{'eq'} . "\n";
parse($eq);
compute($eq);
print "intermediate result: " . $eq->{'value'} . "\n";
}
sub parse {
my $eq = shift;
my ($command,#fields) = extract_multiple(
$eq->{'eq'}, [ sub { extract_bracketed(shift,'{}') } ]
);
$command =~ s/^\\//;
print "command: " . $command . "\n";
#fields = map { s/^\{\ *//; s/\ *\}$//; print "arg: $_\n"; {value => $_}; } #fields;
($eq->{'command'}, #{ $eq->{'args'} }) = ($command, #fields);
}
sub compute {
my ($eq) = #_;
#check arguements ...
foreach my $arg (#{$eq->{'args'}}) {
#if arguement is a number, continue
if ($arg->{'value'} =~ /^$re_num$/) {
next;
#if the arguement is a simple mathematical operation, do it and continue
} elsif ($arg->{'value'} =~ /^($re_num)\ *(?:\ |\*|\\times)?\ *($re_num)$/) {
$arg->{'value'} = $1 * $2;
} elsif ($arg->{'value'} =~ /^($re_num)\ *(?:\+)?\ *($re_num)$/) {
$arg->{'value'} = $1 + $2;
} elsif ($arg->{'value'} =~ /^($re_num)\ *(?:\-)?\ *($re_num)$/) {
$arg->{'value'} = $1 - $2;
} elsif ($arg->{'value'} =~ /^($re_num)\ *(?:\/)?\ *($re_num)$/) {
$arg->{'value'} = $1 / $2;
} else {
#parse it and calc it as if it were its own equation.
$arg->{'eq'} = $arg->{'value'};
solve($arg);
}
}
my #args = #{$eq->{'args'}};
## add command processing here
# frac
if ($eq->{'command'} eq 'frac') {
$eq->{'value'} = $args[0]->{'value'} / $args[1]->{'value'};
return;
}
}
and here is a sample test.tex:
\documentclass{article}
\begin{document}
Hello World!
\begin{equation}
\frac{\frac{1}{3}}{2}
\end{equation}
\end{document}
Maybe using boost::spirit in order to tokenize the expression. You will need to define a huge grammar!
Use a parser generator to create an appropriate parser. Try ANTLR for this, as it includes an IDE for the Grammar, which is very helpful. Using tree rewrite rules, you can then convert the parse tree to an abstract syntax tree.
Start perhaps with the expression evaluator from ANTLR tutorial. I think this is reasonably close enough.

Masking a string in perl using a mask string

I have a string such as 'xxox-x' that I want to mask each line in a file against as such:
x's are ignored (or just set to a known value)
o's remain unchanged
the - is a variable length field that will keep everything else unchanged
therefore mask 'xxox-x' against 'deadbeef' would yield 'xxaxbeex'
the same mask 'xxox-x' against 'deadabbabeef' would yield 'xxaxabbabeex'
How can I do this succinctly preferrably using s operator?
$mask =~ s/-/'o' x (length $str - length $mask)/e;
$str =~ s/(.)/substr($mask, pos $str, 1) eq 'o' ? $1 : 'x'/eg;
$ perl -pe 's/^..(.).(.+).$/xx$1x$2x/;'
deadbeef
xxaxbeex
deadabbabeef
xxaxabbabeex
Compile your pattern into a Perl sub:
sub compile {
use feature 'switch';
my($pattern) = #_;
die "illegal pattern" unless $pattern =~ /^[-xo]+$/;
my($search,$replace);
my $i = 0;
for (split //, $pattern) {
given ($_) {
when ("x") {
$search .= "."; $replace .= "x";
}
when ("o") {
$search .= "(?<sub$i>.)";
$replace .= "\$+{sub$i}";
++$i;
}
when ("-") {
$search .= "(?<sub$i>.*)";
$replace .= "\$+{sub$i}";
++$i;
}
}
}
my $code = q{
sub {
local($_) = #_;
s/^SEARCH$/REPLACE/s;
$_;
}
};
$code =~ s/SEARCH/$search/;
$code =~ s/REPLACE/$replace/;
#print $code;
local $#;
my $sub = eval $code;
die $# if $#;
$sub;
}
To be more concise, you could write
sub _patref { '$+{sub' . $_[0]++ . '}' }
sub compile {
my($pattern) = #_;
die "illegal pattern" unless $pattern =~ /^[-xo]+$/;
my %gen = (
'x' => sub { $_[1] .= '.'; $_[2] .= 'x' },
'o' => sub { $_[1] .= "(?<sub$_[0]>.)"; $_[2] .= &_patref },
'-' => sub { $_[1] .= "(?<sub$_[0]>.*)"; $_[2] .= &_patref },
);
my($i,$search,$replace) = (0,"","");
$gen{$1}->($i,$search,$replace)
while $pattern =~ /(.)/g;
eval "sub { local(\$_) = \#_; s/\\A$search\\z/$replace/; \$_ }"
or die $#;
}
Testing it:
use v5.10;
my $replace = compile "xxox-x";
my #tests = (
[ deadbeef => "xxaxbeex" ],
[ deadabbabeef => "xxaxabbabeex" ],
);
for (#tests) {
my($input,$expect) = #$_;
my $got = $replace->($input);
print "$input => $got : ", ($got eq $expect ? "PASS" : "FAIL"), "\n";
}
Output:
deadbeef => xxaxbeex : PASS
deadabbabeef => xxaxabbabeex : PASS
Note that you'll need Perl 5.10.x for given ... when.
x can be translated to . and o to (.) whereas - becomes (.+?):
#!/usr/bin/perl
use strict; use warnings;
my %s = qw( deadbeef xxaxbeex deadabbabeef xxaxabbabeex);
for my $k ( keys %s ) {
(my $x = $k) =~ s/^..(.).(.+?).\z/xx$1x$2x/;
print +($x eq $s{$k} ? 'good' : 'bad'), "\n";
}
heres a quick stab at a regex generator.. maybe somebody can refactor something pretty from it?
#!/usr/bin/perl
use strict;
use Test::Most qw( no_plan );
my $mask = 'xxox-x';
is( mask( $mask, 'deadbeef' ), 'xxaxbeex' );
is( mask( $mask, 'deadabbabeef' ), 'xxaxabbabeex' );
sub mask {
my ($mask, $string) = #_;
my $regex = $mask;
my $capture_index = 1;
my $mask_rules = {
'x' => '.',
'o' => '(.)',
'-' => '(.+)',
};
$regex =~ s/$_/$mask_rules->{$_}/g for keys %$mask_rules;
$mask =~ s/$_/$mask_rules->{$_}/g for keys %$mask_rules;
$mask =~ s/\./x/g;
$mask =~ s/\([^)]+\)/'$' . $capture_index++/eg;
eval " \$string =~ s/^$regex\$/$mask/ ";
$string;
}
Here's a character by character solution using substr rather that split. It should be efficient for long strings since it skips processing the middle part of the string (when there is a dash).
sub apply_mask {
my $mask = shift;
my $string = shift;
my ($head, $tail) = split /-/, $mask;
for( 0 .. length($head) - 1 ) {
my $m = substr $head, $_, 1;
next if $m eq 'o';
die "Bad char $m\n" if $m ne 'x';
substr($string, $_, 1) = 'x';
}
return $string unless defined $tail;
$tail = reverse $tail;
my $last_char = length($string) - 1;
for( 0 .. length($tail) - 1 ) {
my $m = substr $tail, $_, 1;
next if $m eq 'o';
die "Bad char $m\n" if $m ne 'x';
substr($string, $last_char - $_, 1) = 'x';
}
return $string;
}
sub mask {
local $_ = $_[0];
my $mask = $_[1];
$mask =~ s/-/'o' x (length($_)-(length($mask)-1))/e;
s/(.)/substr($mask, pos, 1) eq 'o' && $1/eg;
return $_;
}
Used tidbits from a couple answers ... this is what I ended up with.
EDIT: update from comments