Handling different escaping sequences? - regex

I'm using ANTLR with Presto grammar in order to parse SQL queries.
This is the original string definition I've used to parse queries:
// Single-quoted string literal: an opening ', any number of body items, a closing '.
STRING
: '\'' ( '\\' . // match \ followed by any single character (backslash escape)
| ~[\\'] // match anything other than \ and '
| '\'\'' // match ''
)*
'\''
;
This worked ok for most queries until I saw queries with different escaping rules. For example:
select
table1(replace(replace(some_col,'\\'',''),'\"' ,'')) as features
from table1
So I've modified my String definition and now it looks like:
// Second attempt at the single-quoted string rule: adds an alternative that is
// gated by a semantic predicate (HelperUtils.isNeedSpecialEscaping is a helper
// that is not shown here).
STRING
: '\'' ( '\\' . // match \ followed by any char
| '\\\\' . {HelperUtils.isNeedSpecialEscaping(this)}? // match \\ followed by any char, only when the predicate holds
| ~[\\'] // match anything other than \ and '
| '\'\'' // match ''
)*
'\''
;
However, this won't work for the query mentioned above as I'm getting
'\\'',''),'
as a single string.
The predicate returns True for the query above.
Any idea how can I handle this query as well?
Thanks,
Nir.

In the end I was able to solve it. This is the expression I was using:
// Final single-quoted string rule. HelperUtils.isNeedSpecialEscaping is a helper
// that is not shown here; the predicate selects between two escaping styles.
STRING
: '\'' ( '\\\\' . {HelperUtils.isNeedSpecialEscaping(this)}? // \\ followed by any char, only when the predicate holds
| '\\' (~[\\] | . {!HelperUtils.isNeedSpecialEscaping(this)}?) // \ followed by a non-backslash char, or by any char when the predicate does not hold
| ~[\\'] // match anything other than \ and '
| '\'\'' // match ''
)*
'\''
;

// Small test grammar: one or more (possibly nested) replace(...) calls whose
// last two arguments are single-quoted strings using mixed escaping styles.
grammar Question;

// Entry rule. The @init action runs when the rule is entered and prints a
// marker line so it is obvious which build of the grammar is being run.
sql
@init {System.out.println("Question last update 2352");}
: replace+ EOF
;

// A call such as replace(...) or replace1(...).
replace
: REPLACE '(' expr ')'
;

// First argument is either a nested replace call or an identifier; it is
// followed by exactly two string arguments.
expr
: ( replace | ID ) ',' STRING ',' STRING
;

// The keyword, optionally followed by a single digit (replace1, replace2, ...).
REPLACE : 'replace' DIGIT? ;
ID : [a-zA-Z0-9_]+ ;
DIGIT : [0-9] ;

// String alternatives, from most specific to most general.
STRING : '\'' '\\\\\'' '\'' // '\\''
| '\'' '\'\'' '\'' // ''''
| '\'' ~[\\']* '\'\'' ~[\\']* '\'' // 'it is 8 o''clock'
| '\'' .*? '\'' ; // anything else up to the next quote (non-greedy)

// Line ends and blanks are kept on the hidden channel rather than dropped.
NL : '\r'? '\n' -> channel(HIDDEN) ;
WS : [ \t]+ -> channel(HIDDEN) ;
File input.txt (not having more examples, I can only guess) :
replace1(replace(some_col,'\\'',''),'\"' ,'')
replace2(some_col,'''','')
replace3(some_col,'abc\tdef\tghi','xyz')
replace4(some_col,'abc\ndef','xyz')
replace5(some_col,'it is 8 o''clock','8')
Execution :
$ alias a4='java -jar /usr/local/lib/antlr-4.9-complete.jar'
$ alias grun='java org.antlr.v4.gui.TestRig'
$ a4 Question.g4
$ javac Question*.java
$ grun Question sql -tokens input.txt
[#0,0:7='replace1',<REPLACE>,1:0]
[#1,8:8='(',<'('>,1:8]
[#2,9:15='replace',<REPLACE>,1:9]
[#3,16:16='(',<'('>,1:16]
[#4,17:24='some_col',<ID>,1:17]
[#5,25:25=',',<','>,1:25]
[#6,26:30=''\\''',<STRING>,1:26]
[#7,31:31=',',<','>,1:31]
[#8,32:33='''',<STRING>,1:32]
[#9,34:34=')',<')'>,1:34]
[#10,35:35=',',<','>,1:35]
[#11,36:39=''\"'',<STRING>,1:36]
[#12,40:40=' ',<WS>,channel=1,1:40]
[#13,41:41=',',<','>,1:41]
[#14,42:43='''',<STRING>,1:42]
[#15,44:44=')',<')'>,1:44]
[#16,45:45='\n',<NL>,channel=1,1:45]
[#17,46:53='replace2',<REPLACE>,2:0]
[#18,54:54='(',<'('>,2:8]
[#19,55:62='some_col',<ID>,2:9]
[#20,63:63=',',<','>,2:17]
[#21,64:67='''''',<STRING>,2:18]
[#22,68:68=',',<','>,2:22]
[#23,69:70='''',<STRING>,2:23]
[#24,71:71=')',<')'>,2:25]
[#25,72:72='\n',<NL>,channel=1,2:26]
[#26,73:80='replace3',<REPLACE>,3:0]
[#27,81:81='(',<'('>,3:8]
[#28,82:89='some_col',<ID>,3:9]
[#29,90:90=',',<','>,3:17]
[#30,91:105=''abc\tdef\tghi'',<STRING>,3:18]
[#31,106:106=',',<','>,3:33]
[#32,107:111=''xyz'',<STRING>,3:34]
[#33,112:112=')',<')'>,3:39]
[#34,113:113='\n',<NL>,channel=1,3:40]
[#35,114:121='replace4',<REPLACE>,4:0]
[#36,122:122='(',<'('>,4:8]
[#37,123:130='some_col',<ID>,4:9]
[#38,131:131=',',<','>,4:17]
[#39,132:141=''abc\ndef'',<STRING>,4:18]
[#40,142:142=',',<','>,4:28]
[#41,143:147=''xyz'',<STRING>,4:29]
[#42,148:148=')',<')'>,4:34]
[#43,149:149='\n',<NL>,channel=1,4:35]
[#44,150:157='replace5',<REPLACE>,5:0]
[#45,158:158='(',<'('>,5:8]
[#46,159:166='some_col',<ID>,5:9]
[#47,167:167=',',<','>,5:17]
[#48,168:185=''it is 8 o''clock'',<STRING>,5:18]
[#49,186:186=',',<','>,5:36]
[#50,187:189=''8'',<STRING>,5:37]
[#51,190:190=')',<')'>,5:40]
[#52,191:191='\n',<NL>,channel=1,5:41]
[#53,192:191='<EOF>',<EOF>,6:0]
Question last update 2352

Related

How to replace multiple sub occurrences of string

I have one use case where I can have a text string which can contain anything. What I want to achieve is to replace a certain pattern within that given string.
Let's say I have given string as
:es1
es2
aaes1aa
:es3,
es1:
ees1,
ees1
{
"es1 :
What I am trying to do is replace every es1 in this string, but with one condition: it has to be either preceded or followed by one of [, | ; | : | " | ' | \\ | \s]. So :es1, es1,, es1: and so on are accepted, but eees1sss is not.
I tried ([, | ; | : | " | ' | \\ | \s])(es1)([, | ; | : | " | ' | , | \s]) something like this but I don't think it's what I need.
Go program:
match := regexp.MustCompile(`([, | ; | : | " | ' | \\ | \s])(es1)([, | ; | : | " | ' | , | \s])`)
test := `:es1
es2
aaes1aa
:es3,
es1:
ees1,
ees1
{
"es1 :`
fmt.Println(match.ReplaceAllString(test, "$1es4$3"))
output:
es2
aaes1aa
:es3,
:
ees1,
ees1
{
:
I was expecting my output to be more like
:es4
es2
aaes1aa
:es3,
es4:
ees1,
ees1
{
"es4 :
The solution provided below is not well tested against all possibilities, but it seems to be working.
package main
import (
"fmt"
"regexp"
)
// main rewrites "es1" to "es4" in a fixed multi-line sample and prints the
// result. An occurrence is rewritten when it is preceded by a character from
// the first bracket class, or when it sits at the very start of the input and
// is followed by a character from the second bracket class. The neighbouring
// character is captured and written back around the replacement.
func main() {
	const sample = `:es1
es2
aaes1aa
:es3,
es1:
ees1,
ees1
{
"es1 :`
	// Group 1 is the character before es1, group 2 the character after it.
	re := regexp.MustCompile(`([, | ; | : | " | ' | \\ | \s])es1|^es1([, | ; | : | " | ' | , | \s])`)
	rewritten := re.ReplaceAllString(sample, "${1}es4${2}")
	fmt.Println(rewritten)
}
https://play.golang.org/p/E8lb9vmM_Sa
You can use
package main
import (
"fmt"
"regexp"
)
// main rewrites the whole word "es1" to "es4" wherever it is directly
// preceded or directly followed by a delimiter (comma, semicolon, colon,
// double or single quote, backslash or whitespace) and prints the result.
func main() {
	const sample = ":es1\n es2\n aaes1aa\n :es3,\n es1:\n ees1,\n ees1 \n \n {\n \"es1 :"
	// Group 1 holds a delimiter found before es1, group 2 one found after it;
	// whichever group did not take part in the match expands to "".
	delimited := regexp.MustCompile(`([,;:"'\\\s])es1\b|\bes1([,;:"'\\\s])`)
	rewritten := delimited.ReplaceAllString(sample, "${1}es4$2")
	fmt.Println(rewritten)
}
See the Go demo and the regex demo. Note that the spaces and | chars inside square brackets are meaningful and match these chars literally, thus, you need to remove them all from your pattern.
The regex matches:
([,;:"'\\\s])es1\b - Group 1: a comma, or a semi-colon, colon, double or single quotation mark, backslash or whitespace; then es1 as a whole word (\b is a word boundary)
| - or
\bes1 - a whole word es1
([,;:"'\\\s]) - Group 2: a comma, or a semi-colon, colon, double or single quotation mark, backslash or whitespace

Getting Regex error on VB.NET on a specific string

Can someone help me please? I want to understand what could have made this regex match stop before showing whether there's a match or not.
So, I'm using Regex and it looks like I'm stuck in an infinite loop.
Here's the pattern and the string to match. A match shouldn't be found in this case; I verified that manually.
pattern = "((^)|([A-z0-9_]*[A-z][A-z0-9_]*\((([A-z0-9_]*[A-z][A-z0-9_]*\.[A-z0-9_]*[A-z][A-z0-9_]*)|([0-9]+(\.[0-9]+)?))(\ *[\+\-\*\/]\ *(([A-z0-9_]*[A-z][A-z0-9_]*\.[A-z0-9_]*[A-z][A-z0-9_]*)|([0-9]+(\.[0-9]+)?)))*\)\ *[\+\-\*\/]\ *)+)(\()*((([0-9]+(\.[0-9]+)?)\ *[\+\-\*\/]\ *)*([A-z0-9_]*[A-z][A-z0-9_]*\.[A-z0-9_]*[A-z][A-z0-9_]*))"
string = "(SUM(UTRANCELL.PMNORABESTABLISHATTEMPTPACKETSTREAM)+SUM(UTRANCELL.PMNORABESTABLISHATTEMPTPACKETSTREAM128)+SUM(UTRANCELL.PMNORABESTABLISHATTEMPTPACKETINTERACTIVE)+SUM(UTRANCELL.PMNORABESTATTEMPTSSTREAMHS))"
Here's the grammar i used to build the regex:
<formula_com_erro> ::= ( <inicio_linha> | (<subformula> ['+'|'-'|'*'|'/'])+ ) {'('} ( { (<constante_numerica>) ('+'|'-'|'*'|'/') } ( <alias> '.' <contador> ) )
<formula_com_erro> ::= ((^)|([A-z0-9_]*[A-z][A-z0-9_]*\((([A-z0-9_]*[A-z][A-z0-9_]*\.[A-z0-9_]*[A-z][A-z0-9_]*)|([0-9]+(\.[0-9]+)?))(\ *[\+\-\*\/]\ *(([A-z0-9_]*[A-z][A-z0-9_]*\.[A-z0-9_]*[A-z][A-z0-9_]*)|([0-9]+(\.[0-9]+)?)))*\)\ *[\+\-\*\/]\ *)+)(\()*((([0-9]+(\.[0-9]+)?)\ *[\+\-\*\/]\ *)*([A-z0-9_]*[A-z][A-z0-9_]*\.[A-z0-9_]*[A-z][A-z0-9_]*))
<subformula> ::= <nome_funcao> '(' (<alias> '.' <contador>) | (<constante_numerica>) {['+'|'-'|'*'|'/'] ((<alias> '.' <contador>) | (<constante_numerica>))} ')'
<subformula> ::= [A-z0-9_]*[A-z][A-z0-9_]*\((([A-z0-9_]*[A-z][A-z0-9_]*\.[A-z0-9_]*[A-z][A-z0-9_]*)|([0-9]+(\.[0-9]+)?))(\ *[\+\-\*\/]\ *(([A-z0-9_]*[A-z][A-z0-9_]*\.[A-z0-9_]*[A-z][A-z0-9_]*)|([0-9]+(\.[0-9]+)?)))*\)
<nome_funcao> ::= ( <letra> | <digito> | '_' )+
<nome_funcao> ::= [A-z0-9_]*[A-z][A-z0-9_]*
<alias> ::= ( <letra> | <digito> | '_' )+
<alias> ::= [A-z0-9_]*[A-z][A-z0-9_]*
<contador> ::= ( <letra> | <digito> | '_' )+
<contador> ::= [A-z0-9_]*[A-z][A-z0-9_]*
<constante_numerica> ::= ( <digito> )+
<constante_numerica> ::= [0-9]+(\.[0-9]+)?

Replace special characters except the following ,.#

I'm looking for an option to remove special characters from a file except for the following 3 items ,.#
The following awk command gets close but it removes all punctuation.
awk '{gsub(/[[:punct:]]/,"",except(".","#",","))}1' test.csv > test2.csv
Any ideas...
There are no opposite character classes in POSIX and no lookarounds to restrict a more generic pattern with some exceptions. The only way is to spell out the POSIX character class.
According to Character Classes and Bracket Expressions:
‘[:punct:]’
Punctuation characters; in the ‘C’ locale and ASCII character encoding, this is ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.
You may use
/[!-+\/:-?[-`{-~-]/
See the regex demo.
Legend:
All 3 of these approaches will work in any locale and will work for any character class by just changing the class name and will work for other bracket expressions or strings etc.:
1) Just look for any punct but only change it if it's not one of the chars you don't want changed:
$ echo 'a.b?c#d#e,f' |
awk '{
# Walk the record one punctuation character at a time, building the result in "new".
new = ""
while ( match($0,/[[:punct:]]/) ) {
# chr is the punctuation character that match() just located.
chr = substr($0,RSTART,1)
# Copy the text before it, then keep chr only if it is one of , . #
new = new substr($0,1,RSTART-1) (chr ~ /[,.#]/ ? chr : "")
# Drop the consumed prefix so the next match() starts after chr.
$0 = substr($0,RSTART+RLENGTH)
}
# What is left in $0 contains no punctuation; append it unchanged.
print new $0
}'
a.bcd#e,f
2) Turn the chars you don't want changed into other strings first then turn them back afterwards:
$ echo 'a.b?c#d#e,f' |
awk '{
# Escape "a" itself first, then encode the three characters to keep as a-prefixed tokens.
gsub(/a/,"aA"); gsub(/,/,"aB"); gsub(/\./,"aC"); gsub(/#/,"aD")
# Every punctuation character still present is unwanted; delete it.
gsub(/[[:punct:]]/,"")
# Decode in the reverse order, restoring "a" last.
gsub(/aD/,"#"); gsub(/aC/,"."); gsub(/aB/,","); gsub(/aA/,"a")
print
}'
a.bcd#e,f
Changing a into aA and back is what guarantees that the strings you create when converting the #, etc. are strings that cannot exist elsewhere in the input at that time and that's why you can safely convert them back afterwards.
3) Suffix the puncts with the RS value, then remove the RS suffix from the chars you don't want changed, then change the remaining RS-suffixed puncts:
$ echo 'a.b?c#d#e,f' |
awk '{
# Tag every punctuation character by appending RS (the record separator) to it.
gsub(/[[:punct:]]/,"&"RS)
# Untag the characters to keep: strip the RS that follows , . or # (gensub is GNU awk only).
$0 = gensub("([,.#])"RS,"\\1","g")
# Anything still tagged is unwanted punctuation; remove it together with its tag.
gsub("[[:punct:]]"RS,"")
print
}'
a.bcd#e,f
That one uses GNU awk for gensub(), with other awks you'd need match()+substr().

How to parse/identify double quoted string from the big expression using MARPA:R2 perl

Problem in parsing/identifying double quoted string from the big expression.
use strict;
use Marpa::R2;
use Data::Dumper;
my $grammar = Marpa::R2::Scanless::G->new({
default_action => '[values]',
source => \(<<'END_OF_SOURCE'),
:start ::= expression
expression ::= expression OP expression
expression ::= expression COMMA expression
expression ::= func LPAREN PARAM RPAREN
expression ::= PARAM
PARAM ::= STRING | REGEX_STRING
:discard ~ sp
sp ~ [\s]+
COMMA ~ [,]
STRING ~ [^ \/\(\),&:\"~]+
REGEX_STRING ~ yet to identify
OP ~ ' - ' | '&'
LPAREN ~ '('
RPAREN ~ ')'
func ~ 'func'
END_OF_SOURCE
});
my $recce = Marpa::R2::Scanless::R->new({grammar => $grammar});
my $input1 = "func(foo)&func(bar)"; -> able to parse it properly by parsing foo and bar as STRING LEXEME.
my $input2 = "\"foo\""; -> Here, I want to parse foo as regex_string LEXEME. REGEX_STRING is something which is enclosed in double quotes.
my $input3 = "func(\"foo\") - func(\"bar\")"; -> Here, func should be taken as func LEXEME, ( should be LPAREN, ) should be RPAREN, foo as REGEX_STRING, - as OP and same for func(\"bar\")
my $input4 = "func(\"foo\")"; -> Here, func should be taken as func LEXEME, ( should be LPAREN, ) should be RPAREN, foo as REGEX_STRING
print "Trying to parse:\n$input\n\n";
$recce->read(\$input);
my $value_ref = ${$recce->value};
print "Output:\n".Dumper($value_ref);
What did i try :
1st method:
My REGEX_STRING should be something : REGEX_STRING -> ~ '\"([^:]*?)\"'
If i try putting above REGEX_STRING in the code with input expression as my $input4 = "func(\"foo\")"; i get error like :
Error in SLIF parse: No lexeme found at line 1, column 5
* String before error: func(
* The error was at line 1, column 5, and at character 0x0022 '"', ...
* here: "foo")
Marpa::R2 exception
2nd method:
Tried including a rule like :
PARAM ::= STRING | REGEX_STRING
REGEX_STRING ::= '"' QUOTED_STRING '"'
STRING ~ [^ \/\(\),&:\"~]+
QUOTED_STRING ~ [^ ,&:\"~]+
The problem here is-> Input is given using:
my $input4 = "func(\"foo\")";
So, here it gives error because there are now two ways to parse this expression, either whole thing between double quotes which is func(\"foo\")
is taken as QUOTED_STRING or func should be taken as func LEXEME and so on.
Please help how do i fix this thing.
use 5.026;
use strictures;
use Data::Dumper qw(Dumper);
use Marpa::R2 qw();
# Build the SLIF grammar. Nodes are blessed into the "parsetree" package so the
# dumped parse tree shows which rule or lexeme produced each node.
my $grammar = Marpa::R2::Scanless::G->new({
    bless_package => 'parsetree',
    # The grammar text is a here-document whose terminator is the empty string,
    # so it extends to the first blank line. That blank line must be present,
    # otherwise the closing "});" and the rest of the script are read as
    # grammar text.
    source => \<<'',
:default ::= action => [values] bless => ::lhs
lexeme default = bless => ::name latm => 1
:start ::= expression
expression ::= expression OP expression
expression ::= expression COMMA expression
expression ::= func LPAREN PARAM RPAREN
expression ::= PARAM
PARAM ::= STRING | REGEXSTRING
:discard ~ sp
sp ~ [\s]+
COMMA ~ [,]
STRING ~ [^ \/\(\),&:\"~]+
REGEXSTRING ::= '"' QUOTEDSTRING '"'
QUOTEDSTRING ~ [^ ,&:\"~]+
OP ~ ' - ' | '&'
LPAREN ~ '('
RPAREN ~ ')'
func ~ 'func'

});

# say $grammar->show_rules;

# Parse each sample input with a fresh recognizer and dump the resulting tree.
for my $input (
    'func(foo)&func(bar)', '"foo"', 'func("foo") - func("bar")', 'func("foo")'
) {
    my $r = Marpa::R2::Scanless::R->new({
        grammar => $grammar,
        # trace_terminals => 1
    });
    $r->read(\$input);
    say "# $input";
    say Dumper $r->value;
}
The 2nd method posted in the question worked for me. I just had to include:
lexeme default = latm => 1
in my code.

Regex to capture everything inside ( ... ) but also handle cases where ( is used inside

I am trying to capture using perl regex the data found inside this:
variable_myname(variable_data);
So I used:
variable_([A-Za-z_]+)(\s+)?\((.*?)\)
This allowed me to capture the myname of the variable (which is also prefixed by variable_) as well as the data inside the (...).
However, this doesn't work if the user uses the (allowed) syntax of:
variable_oneexp("This is a value ( ... ) ");
Which because of ", the ( and ) should be ignored.
Same behavior should be handled if ' is used:
variable_twoexp('This is a value ( ... ) ');
Finally, this behavior should also supported:
variable_threeexp('This is a value ' + ' another string ');
Though, I don't think the last example makes a difference for the regex.
Some pointers/assistance is appreciated.
You could use a negated class instead of the lazy .*?, then use some alternation to match anything between single/double quotes:
variable_([A-Za-z_]+)\s*\(((?:'[^']+'|"[^"]+"|[^()])+)\)
regex101 demo
I also removed the capture group around \s+, and turned it into \s* since I don't believe you need the spaces in a capture group. Revert it back if you need it.
I don't know your use cases, so it's possible that a simple regex would suffice for you. However, it's possible to more completely match this. The following demonstrates the OP's regex, Jerry's regex, and my own against 6 different example functions. The final 3 examples are ones that Jerry's simpler solution will fail on.
I've included spacing in the regex using the /x modifier to make it easier to read:
use strict;
use warnings;
# Run three candidate regexes against every sample line in the DATA section
# and print what each one captures: $1 is the name after "variable_", $2 is
# the text between the outer parentheses.
while (<DATA>) {
chomp;
# Echo the input line, prefixed with its line number ($.).
print "$. - $_\n";
# OP: the original pattern from the question; lazy .*? stops at the first ")"
if (/variable_([A-Za-z_]+)\s*\((.*?)\)/) {
printf "OP: <%s>, <%s>\n", $1, $2;
}
# A1: Jerry's answer; quoted strings or non-parenthesis characters, with no
# handling of backslash escapes or nested parentheses
if (/variable_([A-Za-z_]+)\s*\(((?:'[^']+'|"[^"]+"|[^()])+)\)/) {
printf "A1: <%s>, <%s>\n", $1, $2;
}
# A2: covers all standard single- and double-quoted strings (including
# backslash escapes) and nested parentheses, by recursing into group 2 via (?2)
if (/variable_([A-Za-z_]+)\s*\((
(?:
(?> [^'"()]+ )
|
' (?: (?>[^'\\]+) | \\. )* '
|
" (?: (?>[^"\\]+) | \\. )* "
|
\( (?2) \)
)*
)\)/x) {
printf "A2: <%s>, <%s>\n", $1, $2;
}
# Blank line between the results for consecutive inputs.
print "\n";
}
__DATA__
variable_oneexp("This is a value ( ... ) ");
variable_twoexp('This is a value ( ... ) ');
variable_threeexp('This is a value ' + ' another string ');
variable_fourexp(' \') <-- a paren');
variable_fiveexp(mysub('value'), 'value2');
variable_sixexp('This is a )(value ', variable_five("testing ()"), "st\")ring", (3-2)/1);
Outputs:
1 - variable_oneexp("This is a value ( ... ) ");
OP: <oneexp>, <"This is a value ( ... >
A1: <oneexp>, <"This is a value ( ... ) ">
A2: <oneexp>, <"This is a value ( ... ) ">
2 - variable_twoexp('This is a value ( ... ) ');
OP: <twoexp>, <'This is a value ( ... >
A1: <twoexp>, <'This is a value ( ... ) '>
A2: <twoexp>, <'This is a value ( ... ) '>
3 - variable_threeexp('This is a value ' + ' another string ');
OP: <threeexp>, <'This is a value ' + ' another string '>
A1: <threeexp>, <'This is a value ' + ' another string '>
A2: <threeexp>, <'This is a value ' + ' another string '>
4 - variable_fourexp(' \') <-- a paren');
OP: <fourexp>, <' \'>
A1: <fourexp>, <' \'>
A2: <fourexp>, <' \') <-- a paren'>
5 - variable_fiveexp(mysub('value'), 'value2');
OP: <fiveexp>, <mysub('value'>
A2: <fiveexp>, <mysub('value'), 'value2'>
6 - variable_sixexp('This is a )(value ', variable_five("testing ()"), "st\")ring", (3-2)/1);
OP: <sixexp>, <'This is a >
A1: <sixexp>, <'This is a >
A2: <sixexp>, <'This is a )(value ', variable_five("testing ()"), "st\")ring", (3-2)/1>