Regex to Parse Mail Subject with multiple encoding - regex

There!
I want to match all the Inline Encodings in one Mail-Subject and build the Subject String in utf8.
Some Examples:
[Listname | Topic123] =?utf-8?Q?encodedtext?=
=?iso-8859-1?q?this=20is=20some=20text?=
Klartext-Betreff
[Listname | Topic123] =?utf-8?Q?encodedtext?= =?iso-8859-1?q?this=20is=20some=20text?=
=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
I also got a mail with two different encodings (example in Last line).
In E-Mails it could also be possible, that the subject is split in multiple lines where each line (except the first) starts with at least one whitespace
So I am looking for a regex, that parses:
Part+
Where Part is one of:
Text with spaces
=?charset?encoding?encoded-text?=
I think it woll go to something like:
ENC = (=\?)([A-Za-z0-9-]*)(\?)([A-Za-z0-9-]*)(?)([Any Character])(\?=)
Part = any character that doesnt match to ENC or ENC

function decode ($string, $source_enc, $dest_enc)
{
$parts = preg_split (
'/=\?([^?]+)\?([^?]+)\?([^?]+)\?=/',
$string,
-1, PREG_SPLIT_DELIM_CAPTURE);
$result = "";
for ($i = 0; $i < count ($parts); $i++)
{
$part = $parts [$i];
if ($i % 4 == 0)
$result .= iconv ($source_enc, $dest_enc, $part);
else
{
$charset = $parts [$i++];
$encoding = $parts [$i++];
$text = $parts [$i];
if ($encoding == 'Q' || $encoding == 'q')
$text = quoted_printable_decode ($text);
else if ($encoding == 'B' || $encoding == 'b')
$text = base64_decode ($text);
$result .= iconv ($charset, $dest_enc, $text);
}
}
return $result;
}
echo (decode ("=?utf-8?Q?encodedtext?= =?iso-8859-1?q?this=20is=20some=20text?=
=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=",
"ISO-8859-1", "ISO-8859-1"));
Output for me is:
encodedtext this is some text If you can read this yo u understand the example.

Related

Dynamic replacing and removing C code (using Perl)

I come to you with my question: what is the best way to replace/remove pieces of C code dynamically? I already did something in Perl using regular expressions and reading what to replace/remove from a configuration file but I can't make it dynamic.
Code:
#autoflush output so it will not interfere with calling application
local $| = 1;
##
#libraries
use warnings;
use strict;
use Switch;
use Cwd;
use File::Find;
##
#global variables
my #config;
my $file;
my $directory;
my $result;
##
if(#ARGV < 1)
{
$directory = cwd();
}
else
{
$directory = $ARGV[0];
}
$result = $directory . "\\result";
if(! -d $result)
{
mkdir ($result);
}
open LOG, ">", $result . "\\log.log";
sub start
{
my $configFile = $_[0];
open CONFIG, $configFile;
local $/;
my $conf = <CONFIG>;
close CONFIG;
foreach my $line (split(/\n\*/, $conf))
{
if(index($line, "*") == 0)
{
$line = substr($line, 1);
}
setConfig($line);
}
processFiles();
}
sub setConfig
{
my $line = $_[0];
my $count = () = $line =~ /\s*==>\s*/;
switch($count)
{
case 0
{
remove($line);
}
case 1
{
replace($line);
}
}
}
sub addSlashes
{
$_[0] =~ s/([\.\\\/\+\*\?\[\^\]\(\)\{\}\=\!\<\>\|\:\-])/\\$1/xg;
if($_[1] == 1)
{
$_[0] =~ s/([\$])/\\$1/xg;
}
return;
}
sub remove
{
my $line = $_[0];
addSlashes($line, 1);
$line =~ s/(\\\$){3}/\.\+/g;
$config[#config][0] = qr/$line/;
$config[#config - 1][1] = q("");
$line = "\\(" . $line . "\\(";
$config[#config - 1][2] = qr/$line/;
}
sub replace
{
my $line = $_[0];
my #split = split(/\s*==>\s*/, $line);
my $original = $split[0];
my $replace = $split[1];
my $regex;
addSlashes($original, 1);
addSlashes($replace, 0);
my $counter = 1;
while($original =~ /\\\$([\d]{1,3})\\\$/g)
{
if($1 <= $counter && $1 > 0)
{
$counter++;
}
else
{
print "Invalid format\n";
return;
}
}
if($counter == 1)
{
$config[#config][0] = qr/$original/;
$config[#config - 1][1] = q(") . $replace . q(");
$original = "\\(" . $original . "\\(";
$config[#config - 1][2] = qr/$original/;
return;
}
while($replace =~ /\$([\d]{1,3})\$/g)
{
if($1 <= 0 && $1 >= $counter)
{
print "Invalid format\n";
return;
}
}
$original =~ s/\\\$\d{1,3}\\\$/\(\.\+\?\)/xg;
$original =~ s/\?\)$/\)/xg;
$replace =~ s/\$(\d{1,3})\$/\$$1/xg;
$config[#config][0] = qr/$original/;
$config[#config - 1][1] = q(") . $replace . q(");
$original = "\\(" . $original . "\\(";
$config[#config - 1][2] = qr/$original/;
}
sub processFiles
{
my #files = grep { ! -d } glob "$directory\\*";
foreach my $file (#files)
{
if($file =~ /\.(h|c)$/)
{
process($file);
}
}
}
sub process
{
my $file = $_[0];
open READ, $file;
local $/;
my $text = <READ>;
close READ;
print LOG "\n--> $file <--\n";
for(my $i = 0; $i < #config; $i++)
{
my $original = $config[$i][0];
my $replace = $config[$i][1];
my $log = $config[$i][2];
while($text =~ /$log/g)
{
print LOG $log . " ----> " . $1 . "\n";
}
$text =~ s/$original/$replace/eeg;
print LOG "\n";
}
$file = $result . substr($file, rindex($file, "\\"));
open WRITE, ">", $file;
print WRITE $text;
close WRITE;
}
start("qm2asil.cfg");
close LOG;
Configuration file content:
*static
*GET_$1$() ==> $1$
*GET_$1$($2$) ==> $1$[$2$]
*SET_$1$($2$,$3$); ==> $1$[$2$] = $3$;
*SET_$1$($2$); ==> $1$ = $2$;
The idea is that there are already a few rules to replace/remove and they work but can exist more complex rules that I couldn't manage.
Example:
SET_VAR1((i),(u8)(((s32)(((s32)GET_VAR2 ((i))) != 0)) && ((s32)((u8)(((s32) (((s32)VAR3[i]) != 0)) ^ ((s32)(((s32) VAR4[i]) != 0)))))));
I want to remove SET function and make it an assignment to the variable (VAR1[i] = ...). This is one of many variations of things that need to be removed/replaced.
What do you advise me to do? Can I make it work using Perl and regex or I should reorientate to another method and/or programming language?
EDIT: I already create regexes based on the configuration file but I have problem matching unknown expressions (currently I use .+). The main idea is that I want to keep the configuration as simple I can.
regexes

Find matching between 2 files (how to improve efficiency)

#file1 contains only startpoint-endpoint pair, each indices represent each pair. file2 is a text file, for #file2 each indices represents each line. I am trying to search each pair from #file1 in #file2 line by line. When the exact match is found, I would then try to extract information1 from file2 and print it out. But for now, I am trying to search for the matched pair in file2. The format of the matching pattern is as below:
Match case
From $file1[0]
Startpoint: /source/in_out/map (positive-triggered)
Endpoint: /output/end/scan_all (positive-triggered)
match if file2 contains:
Line with other stuff
Startpoint: /source/in_out/map (positive-triggered)
Endpoint: /output/end/scan_all (positive-triggered)
information1:
information2:
Lines with other stuff
Unmatch Case:
From file1:
Startpoint: /source/in_out/map (positive-triggered)
Endpoint: /output/end/scan_all (positive-triggered)
From file2:
Startpoint: /source/in_out/map (positive-triggered)
Endpoint: /different endpoint pair/ (positive-triggered)
information1:
information2:
For text files2, I stored it in #file2. For files1, I have successfully extracted and stored every Startpoint and the next line Endpoint as the format above in #file1. (No problem in extracting and storing each pair, so I wont be showing the code for this, it took around 4mins here) Then I split each element of #address, which are the startpoint and endpoint. Checking line by line in files2, if startpoint match, then I will move on next line to check endpoint, it is only considered match if the next line after Startpoint match the Endpoint, else try to search again until the end line of files2. This script does the job but it took 3 and a half hours to complete(there are around 60k pairs from file1 and 800k lines to check in file2). Is there any other efficient way to do this?
I am new in Perl scripting, I apologize for any silly mistakes, both in my explanation and my coding.
Here's the codes:
#!usr/bin/perl
use warnings;
my $report = '/home/dir/file2';
open ( $DATA,$report ) || die "Error when opening";
chomp (#file2 = <$DATA>);
#No problem in extracting Start-Endpoint pair from file1 into #file1, so I wont include
#the code for this
$size = scalar#file1;
$size2 = scalar#file2;
for ( $total=0; $total<$size; $total++ ) {
my #file1_split = split('\n',$file1[$total]);
chomp #file1_split;
my $match_endpoint = 0;
my $split = 0;
LABEL2: for ( $count=0; $count<$size2; $count++ ) {
if ( $match_endpoint == 1) {
if ( grep { $_ eq "file1_split[$split]" } $file2[$count] )
print"Pair($total):Match Pair\n";
last LABEL2; #move on to check next start-endpoint
#pair
}
else {
$split = 0; #reset back to check the same startpoint
and continue searching until match found or end line of file2
$match_endpoint = 0;
}
}
elsif ( grep { $_ eq "$address_array[$split]"} $array[$count] )
{
$match_endpoint = 1;#enable search for endpoint in next line
$split = 1; #move on next line to match endpoint
next;
}
elsif ( $count==$size2-1 ) {
print"no matching found for Path($total)\n";
}
}
}
If I'm understanding what your code is trying to do,
it looks like it would be more efficient to do it this way:
my %split=#file1;
my %total;
#total{#file1}=(0..$#file1);
my $split;
for( #file2 ){
if( $split ){
if( $_ eq $split ){
print"Pair($total{$split}):Match Pair\n";
}else{
$split{$split}="";
}
}
$split=$split{$_};
delete $split{$_};
}
for( keys %split ){
print"no matching found for Path($total{$_})\n";
}
If I have understood your spec (show matches), I'm betting this will complete in less than 5 seconds, unless you're using an old Dell D333. To further minimize the response time, you would write some extra code to drive the while loop by the hash with the fewest keys (you implied file1). If you use references to hashes, then you can write a small if-else statement to swap the hash references without having to code duplicate while statements.
use strict;
use warnings;
sub makeHash($) {
my ($filename) = #_;
open(DATA, $filename) || die;
my %result;
my ($start, $line);
while (<DATA>) {
if ($_ =~ /^Startpoint: (.*)/) {
$start = $1; # captured group in regular expression
$line = $.; # current line number
} elsif ($_ =~ /^Endpoint: (.*)/) {
my $end = $1;
if (defined $line && $. == ($line + 1)) {
my $key = "$start::$end";
# can distinguish start and end lines if necessary
$result{$key} = {start=>$start, end=>$end, line=>$line};
}
}
}
close(DATA);
return %result;
}
my %file1 = makeHash("file1");
my %file2 = makeHash("file2");
my $fmt = "%10s %10s %s\n";
my $nmatches = 0;
printf $fmt, "File1", "File2", "Key";
while (my ($key, $f1h) = each %file1) {
my $f2h = $file2{$key};
if (defined $f2h) {
# You have access to hash members start and end if you need to distinguish further
printf $fmt, $f1h->{line}, $f2h->{line}, $key;
$nmatches++;
}
}
print "Found $nmatches matches\n";
Below, is my test data generator(thanks). I generated a worst-case scenario of 1,000,000 matches between two equal files. The matching code above finished on my MBP in under 20 seconds using the generated test data.
use strict;
use warnings;
sub rndStr { join'', #_[ map{ rand #_ } 1 .. shift ] }
open(F1, ">file1") || die;
open(F2, ">file2") || die;
for (1..1000000) {
my $start = rndStr(30, 'A'..'Z');
my $end = rndStr(30, 'A'..'Z');
print F1 "Startpoint: $start\n";
print F1 "Endpoint: $end\n";
print F2 "Startpoint: $start\n";
print F2 "Endpoint: $end\n";
}
close(F1);
close(F2);

Character match count between strings in Perl

I have a string (say string 1) that needs to be matched to another string (string2). Both the strings will have the same length and are case in-sensitive.
I want to print the number of character matches between both the strings.
E.g.: String 1: stranger
String 2: strangem
Match count = 7
I tried this:
$string1 = "stranger";
$string2 = "strangem";
my $count = $string1 =~ m/string2/ig;
print "$count\n";
How can I fix this?
Exclusive or, then count the null characters (where the strings were the same):
my $string1 = "stranger";
my $string2 = "strangem";
my $count = ( lc $string1 ^ lc $string2 ) =~ tr/\0//;
print "$count\n";
I missed the "case in-sensitive" bit.
You can use substr for that:
#!/usr/bin/perl
use warnings;
use strict;
my $string1=lc('stranger');
my $string2=lc('strangem');
my $count=0;
for (0..length($string1)-1) {
$count++ if substr($string1,$_,1) eq substr($string2,$_,1);
}
print $count; #prints 7
Or you can use split to get all characters as an array, and loop:
#!/usr/bin/perl
use warnings;
use strict;
my $string1=lc('stranger');
my $string2=lc('strangem');
my $count=0;
my #chars1=split//,$string1;
my #chars2=split//,$string2;
for (0..$#chars1) {
$count++ if $chars1[$_] eq $chars2[$_];
}
print $count; #prints 7
(fc gives more accurate results than lc, but I went for backwards compatibility.)
Not tested
sub cm
{
my #a = shift;
my #b = shift;
# First match prefix of string:
my $n = 0;
while ($n < $#a && $n < $#b && $a[$n] eq $b[$n]) {
++$n;
}
# Then skip one char on either side, and recurse.
if ($n < $#a && $n < $#b) {
# Match rest by skipping one place:
my $n2best = 0;
my $n2a = cm(splice(#a, $n), splice(#b, $n + 1));
$n2best = $n2a;
my $n2b = cm(splice(#a, $n + 1), splice(#b, $n));
$n2best = $n2b if $n2b > $n2best;
my $n2c = cm(splice(#a, $n + 1), splice(#b, $n + 1));
$n2best = $n2c if $n2c > $n2best;
$n += $n2best;
}
return $n;
}
sub count_matches
{
my $a = shift;
my $b = shift;
my #a_chars = split //, $a;
my #b_chars = split //, $b;
return cm(#a_chars, #b_chars);
}
print count_matches('stranger', 'strangem')

How to match sequence group?

say, the given string is abcwhateverdefwhatever34567whatever012 How to match those group which are in sequence like match abc, def, 34567,012?
the regex i have now is (.)\1{2,} but it matches the same characters but not in sequence
If you're still looking for PHP code.
function getSequence($str) {
$prev = 0; $next = 0; $length = strlen($str);
$temp = "";
for($i = 0; $i < $length; $i++) {
$next = ord($str[$i]);
if ($next == $prev + 1) {
$temp .= $str[$i];
} else {
if (strlen($temp) > 1) $result[] = $temp;
$temp = $str[$i];
}
$prev = $next;
}
if (strlen($temp) > 1) $result[] = $temp;
return $result;
}
$str = "abcwhateverdefwhatever34567whatever012";
print_r(getSequence($str));
Here's a solution that solves the problem with regex. It's not very efficient though and I wouldn't recommend it.
from re import findall, X
text = "abcwhateverdefwhatever34567whatever012"
reg = r"""
(?:
(?:0(?=1))|
(?:(?<=0)1)|(?:1(?=2))|
(?:(?<=1)2)|(?:2(?=3))|
(?:(?<=2)3)|(?:3(?=4))|
(?:(?<=3)4)|(?:4(?=5))|
(?:(?<=4)5)|(?:5(?=6))|
(?:(?<=5)6)|(?:6(?=7))|
(?:(?<=6)7)|(?:7(?=8))|
(?:(?<=7)8)|(?:8(?=9))|
(?:(?<=8)9)|
(?:a(?=b))|
(?:(?<=a)b)|(?:b(?=c))|
(?:(?<=b)c)|(?:c(?=d))|
(?:(?<=c)d)|(?:d(?=e))|
(?:(?<=d)e)|(?:e(?=f))|
(?:(?<=e)f)
){1,}
"""
print findall(reg, text, X)
The result is:
['abc', 'def', '34567', '012']
As you can see I only added the numbers and the first 6 letters in the alphabet. It's should be fairly obvious how to continue.

Find files matching patterns in Perl

I am taking the user input via -f option, and whatever he enters, accordingly files are being searched recursively.
My problem is: When user enters "tmp*", then also it searches for "abctmp", "xyztmp" etc. What I want to do is, only files starting with tmp should come.
In short, whatever user enters accordingly files should be pushed to array.
Currently I am doing this, but I am sure there's some classy, short way to do it.
#! /perl/bin/perl
use strict;
use warnings;
use File::Find;
use getopt::Long;
my $filename="tmp*.txt";
find( { wanted => \&wanted,
preprocess => \&dir_search,
}, '.');
sub wanted{
my $regex;
my $myop;
my #mylist;
my $firstchar= substr($filename, 0,1); # I am checking first character.
# Whether it's ".*tmp" or just "tmp*"
if($filename=~ m/[^a-zA-Z0-9_]/g){ #If contain wildcard
if($firstchar eq "."){ # first character "."
my $myop = substr($filename, 1,1);
my $frag = substr($filename,2);
$filename = $frag;
$regex = '\b(\w' . ${myop}. ${filename}. '\w*)\b';
# Has to find whatever comes before 'tmp', too
} else {
$regex = '\b(' . ${myop}. ${filename}. '\w*)\b';
# Like, "tmp+.txt" Only search for patterns starting with tmp
}
if($_ =~ /$regex/) {
push(#mylist, $_);
}
} else {
if($_ eq $filename) { #If no wildcard, match the exact name only.
push(#mylist, $_);
}
}
}
sub dir_search {
my (#entries) = #_;
if ($File::Find::dir eq './a') {
#entries = grep { ((-d && $_ eq 'g') ||
((-d && $_ eq 'h') ||
(!(-d && $_ eq 'x')))) } #entries;
# Want from 'g' and 'h' folders only, not from 'x' folder
}
return #entries;
}
And another thing is, I want to search for only '.txt' files. Where should I put that condition?
#!/perl/bin/perl
sub rec_dir {
($dir,$tmpfile_ref) = #_;
opendir(CURRENT, $dir);
#files = readdir(CURRENT);
closedir(CURRENT);
foreach $file (#files) {
if( $file eq ".." || $file eq "." ) { next; }
if( -d $dir."/".$file ) { rec_dir($dir."/".$file,$tmpfile_ref); }
elsif( $file =~ /^tmp/ && $file =~ /\.txf$/ ) { push(#{$tmpfile_ref},$dir."/".$file); }
}
}
#matching_files = ();
$start_dir = ".";
rec_dir($start_dir,\#matching_files);
foreach $file (#matching_files) { print($file."\n"); }
I didn't test it. Barring typographical errors I think it will work.