OCaml lexing and parsing with whitespace sensitivity

OCaml lexing and parsing with whitespace sensitivity - ocaml

I'm building a parser with Sedlex and Menhir, where I have a function definition as:
(* Lexer *)
/* Characters with codes 160 through 255. */
let non_ascii = [%sedlex.regexp? '\160' .. '\255'];
/* An escape: either `unicode` (defined elsewhere), or a backslash followed by
   any character other than CR, LF, character 12, or a `hex_digit`
   (defined elsewhere). */
let escape = [%sedlex.regexp?
unicode | ('\\', Compl('\r' | '\n' | '\012' | hex_digit))
];
/* Characters allowed in the first position of an identifier. */
let ident_start = [%sedlex.regexp?
'_' | 'a' .. 'z' | 'A' .. 'Z' | '$' | non_ascii | escape
];
/* Characters allowed in an identifier: the `ident_start` set without '$',
   plus digits and '-'. */
let ident_char = [%sedlex.regexp?
'_' | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | non_ascii | escape];
/* Reads the next token from the Sedlex buffer `buf`.
   `white_space`, `ident`, `number` and `get_dimension` are defined elsewhere. */
let rec get_next_token = buf => {
switch%sedlex (buf) {
/* Whitespace is dropped here by recursing to the next token, so the parser
   never learns whether two tokens were separated by a space. */
| white_space => get_next_token(buf)
| eof => EOF
| ';' => SEMI_COLON
| '}' => RIGHT_BRACE
| '{' => LEFT_BRACE
| ':' => COLON
| '(' => LEFT_PAREN
| ')' => RIGHT_PAREN
| '[' => LEFT_BRACKET
| ']' => RIGHT_BRACKET
| '%' => PERCENTAGE
| '&' => AMPERSAND
/* The matched lexeme is read back as a Latin-1 string. */
| ident => IDENT(Sedlexing.latin1(buf))
/* The token for a number is produced by get_dimension, which receives both
   the matched lexeme and the buffer. */
| number => get_dimension(Sedlexing.latin1(buf), buf)
/* Any other input fails the assertion. */
| _ => assert(false)
};
}
/* Runs the Menhir `parser` over `buf`.
   The most recently lexed (token, start, end) triple is remembered so that a
   parse failure can be reported against it. LexingError is re-raised as-is;
   every other exception is replaced by ParseError carrying that triple. */
let parse = (buf, parser) => {
  let most_recent = ref((Parser.EOF, Lexing.dummy_pos, Lexing.dummy_pos));
  /* Token supplier handed to Menhir: lex one token, record it, return it. */
  let supply = () => {
    let triple = get_next_token_with_location(buf);
    most_recent := triple;
    triple;
  };
  try(MenhirLib.Convert.Simplified.traditional2revised(parser, supply)) {
  | LexingError(_) as lexing_failure => raise(lexing_failure)
  | _ => raise(ParseError(most_recent^))
  };
};
(* Parser *)
%token <string> IDENT
%token LEFT_PAREN
%token RIGHT_PAREN
(* A function call: IDENT, LEFT_PAREN, zero or more `exprs`, RIGHT_PAREN.
   Produces Texp_function holding the name and the argument list, each paired
   with a location built by Lex_buffer.make_loc from its start/end positions.
   This rule only sees the token sequence, so it cannot tell whether the input
   had whitespace between the identifier and the opening parenthesis. *)
function_expr:
| i = IDENT; LEFT_PAREN; xs = list(exprs); RIGHT_PAREN {
Texp_function (
(i, Lex_buffer.make_loc $startpos(i) $endpos(i)),
(xs, Lex_buffer.make_loc $startpos(xs) $endpos(xs))
)
}
And I have a simple ident definition that is:
| i = IDENT {Texp_ident i, Lex_buffer.make_loc $startpos(i) $endpos(i) }
Functions can't have a space between the ident and the LEFT_PAREN; how can I define that?
I want `and func(1, 2, 3)` to produce a list of expressions such as [Texp_ident "and"; Texp_function("func", [...])], but it actually produces [Texp_function("and", ["func"; ...])], since the parser doesn't care about the space between the ident and the LEFT_PAREN. How can I fix that?

Related

Error in defining closing parenthesis in grammar for C

I am trying to define a simple grammar for C which I am using in Lark. The problem is, I defined the closing parenthesis ("}" or ")") as a terminal in the grammar but it is throwing an error as "No terminal matches ')' in the current parser context". In many example grammar rules that I saw, the closing parenthesis was defined as terminals. How do I resolve this issue?
Here is the code:
from lark import Lark
g=r'''start : header_files
preprocessor_commands : "#include" | definition
definition : def string (header_files)* program
def : "#define" | "#undef" | "#ifdef" | "#ifndef" | "#if" | "#else" | "#elif" | "#endif" | "#error" | "#pragma"
header_files : preprocessor_commands file_names (header_files)* program
file_names : "<stdio.h>" | "<math.h>" | "<conio.h>" | "<stdlib.h>" | "<string.h>" | "<ctype.h>" | "<time.h>" | "<float.h>" | "<limits.h>" | "<wctype.h>"
program : data_type func "(" var ")" "{" (codeblock) close_par | data_type func "()" "{" (codeblock) close_par
data_type : "void" | "int" | "float" | "double" | "long" | "char" | "string" | "long long" | "unsigned_int"
func : "main" | string
var : string
codeblock : "return" var term | "return" const term | "return" func term | "return" "(" expressions ")" term | declarations | expressions | statements | call | print
declarations : data_type var assign const ";" (declarations)* codeblock | data_type var assign var ";" (declarations)* codeblock | data_type var ("," var)* ";" (declarations)* codeblock
expressions : arithmetic | bitwise | assignment
arithmetic : add | sub | mul | div | mod | unary
add : const "+" (arithmetic)* | var "+" (arithmetic)*
sub : const "-" (arithmetic)* | var "-" (arithmetic)*
mul : const "*" (arithmetic)* | var "*" (arithmetic)*
div : const "/" (arithmetic)* | var "/" (arithmetic)*
mod : const "%" (arithmetic)* | var "%" (arithmetic)*
unary : inc | dec
inc : "++" var | var "++"
dec : "--" var | var "--"
bitwise : and | or | xor | boc | ls | rs
and : const "&" (bitwise)* | var "&" (bitwise)*
or : const "|" (bitwise)* | var "|" (bitwise)*
xor : const "^" (bitwise)* | var "^" (bitwise)*
boc : var assign "~" const | var assign "~" var
ls : const "<<" const | var "<<" const
rs : const ">>" const | var ">>" const
assignment : assign | "*=" | "/=" | "%=" | "+=" | "-=" | "<<=" | ">>=" | "&=" | "^=" | "|="
assign : "="
statements : if | switch | loop
if : ("if" "(" logical close_par codeblock)+ ("elseif" "(" logical close_par codeblock)* ["else" codeblock]
logical : land | lor | lnot | equ | gre | les | greq | leeq | neq
land : const "&&" (logical)* | var "&&" (logical)*
lor : const "||" (logical)* | var "||" (logical)*
lnot : "!" (logical)+ | "!" (arithmetic)+ | "!" var | "!" const
equ : const "==" (logical)* | var "==" (logical)*
gre : const ">" (logical)* | var ">" (logical)*
les : const "<" (logical)* | var "<" (logical)*
greq : const ">=" (logical)* | var ">=" (logical)*
leeq : const "<=" (logical)* | var "<=" (logical)*
neq : const "!=" (logical)* | var "!=" (logical)*
switch : "switch" ("(" expressions ")") "{" (switch_case)* ["default" ":" codeblock] close_par
switch_case : "case" const ":" codeblock (switch_case)*
loop : for | while | do_while
for : "for" "(" [[data_type] var assign const] ";" [logical] ";" [arithmetic] ")" "{" codeblock "}" | "for" "(" [[data_type] var assign const] ";" [logical] ";" [arithmetic] ")" codeblock
while : "while" "(" logical ")" "{" codeblock "}" | "while" "(" logical ")" codeblock
do_while : "do" "{" codeblock "}" "while" "(" logical ")"
call : func "(" var ("," var)* ")" term | var assign func "(" var ("," var)* ")" term | func "(" ")" term | var assign func "(" ")" term
print : "printf" "(" (dcstring)* ")" term
%import common.SIGNED_NUMBER
const : SIGNED_NUMBER
term : ";" codeblock
tpar : ")" | "}" | ")" codeblock
digit : "0".."9"
nz_dig : "1".."9"
integer : (digit)* (nz_dig)+ | "-" (digit)* (nz_dig)+
decimal : (digit)+ "." (digit)+ | "-" (digit)+ "." (digit)+
letter : "a".."z" | "A".."Z"
char : letter | SIGNED_NUMBER
string : /[a-zA-Z0-9_.-]{2,}/ | (char)*
dcstring : /"[^"]*"/
close_par : "}" | ")" | "]"
WHITESPACE: " " | "\t" | "\f" | "\n"
%ignore WHITESPACE+
COMMENT: "//" /[^\n]/* | "/*" /(\S|\s)*?/ "*/"
%ignore COMMENT
'''
# Build an Earley parser from the grammar text held in `g`.
parser=Lark(grammar=g,parser="earley")
# Sample C program used as the parse input.
code='''#include<stdio.h>
#define PI 3.14
void main()
{
int a,b; long c=1;
if(a==b || b==c)
return 2;
}
'''
# Parse the sample and print the resulting tree using its pretty() rendering.
print(parser.parse(code).pretty())
This is the error :

How to handle multi-line rules for parsing BNF grammar using boost spirit qi

Assuming I have a BNF grammar like this
<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= a | b | c | d | e
| f | g | h | i
<digit> ::= 0 | 1 | 2 | 3 |
4
If you look at the <letter> rule, its continuation starts with the | but that of the <digit> rule starts with the production with | appearing at the end of the previous line. I also don't want to use a particular symbol to represent the end of a rule.
How do I check whether a rule has ended when using Boost Spirit Qi for the implementation?
I have just gone through the tutorial on the boost page and wondering how I am going to handle this.

Wikipedia
BNF syntax can only represent a rule in one line, whereas in EBNF a terminating character, the semicolon character “;” marks the end of a rule.
So the simple answer is: the input isn't BNF.
If you want to support it anyway (at your own peril :)) you'll have to make it so. So, let's write a simplistic BNF grammar, literally mapping from the Wikipedia BNF:
<syntax> ::= <rule> | <rule> <syntax>
<rule> ::= <opt-whitespace> "<" <rule-name> ">" <opt-whitespace> "::=" <opt-whitespace> <expression> <line-end>
<opt-whitespace> ::= " " <opt-whitespace> | ""
<expression> ::= <list> | <list> <opt-whitespace> "|" <opt-whitespace> <expression>
<line-end> ::= <opt-whitespace> <EOL> | <line-end> <line-end>
<list> ::= <term> | <term> <opt-whitespace> <list>
<term> ::= <literal> | "<" <rule-name> ">"
<literal> ::= '"' <text1> '"' | "'" <text2> "'"
<text1> ::= "" | <character1> <text1>
<text2> ::= '' | <character2> <text2>
<character> ::= <letter> | <digit> | <symbol>
<letter> ::= "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" | "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
<digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
<symbol> ::= "|" | " " | "!" | "#" | "$" | "%" | "&" | "(" | ")" | "*" | "+" | "," | "-" | "." | "/" | ":" | ";" | ">" | "=" | "<" | "?" | "#" | "[" | "\" | "]" | "^" | "_" | "`" | "{" | "}" | "~"
<character1> ::= <character> | "'"
<character2> ::= <character> | '"'
<rule-name> ::= <letter> | <rule-name> <rule-char>
<rule-char> ::= <letter> | <digit> | "-"
It could look like this:
// Qi grammar for a simplistic BNF dialect; the parsed result is an Ast::Syntax.
template <typename Iterator>
struct BNF: qi::grammar<Iterator, Ast::Syntax()> {
BNF(): BNF::base_type(start) {
using namespace qi;
// A syntax is a list of rules separated by one or more end-of-lines,
// parsed with `blank` as the skipper.
start = skip(blank) [ _rule % +eol ];
// <name> ::= expression
_rule = _rule_name >> "::=" >> _expression;
// Alternatives separated by '|'.
_expression = _list % '|';
// One alternative is a sequence of one or more terms.
_list = +_term;
// A term is a quoted literal or a <rule-name> reference.
_term = _literal | _rule_name;
// Double- or single-quoted text; the closing quote character is excluded
// from the characters accepted inside the quotes.
_literal = '"' >> *(_character - '"') >> '"'
| "'" >> *(_character - "'") >> "'";
// Characters accepted inside a literal: alphanumerics plus the listed symbols.
_character = alnum | char_("\"'| !#$%&()*+,./:;>=<?#]\\^_`{}~[-");
// '<', a letter, then any number of alphanumerics or '-', then '>'.
_rule_name = '<' >> (alpha >> *(alnum | char_('-'))) >> '>';
BOOST_SPIRIT_DEBUG_NODES(
(_rule)(_expression)(_list)(_term)
(_literal)(_character)
(_rule_name))
}
private:
// The start rule declares no skipper; it installs one itself via skip(blank).
qi::rule<Iterator, Ast::Syntax()> start;
// These rules are declared with the blank skipper.
qi::rule<Iterator, Ast::Rule(), qi::blank_type> _rule;
qi::rule<Iterator, Ast::Expression(), qi::blank_type> _expression;
qi::rule<Iterator, Ast::List(), qi::blank_type> _list;
// lexemes (declared without a skipper)
qi::rule<Iterator, Ast::Term()> _term;
qi::rule<Iterator, Ast::Name()> _rule_name;
qi::rule<Iterator, std::string()> _literal;
qi::rule<Iterator, char()> _character;
};
Now it will parse your sample (corrected to be BNF):
std::string const input = R"(<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i"
<digit> ::= "0" | "1" | "2" | "3" | "4"
)";
Live On Compiler Explorer
Prints:
code ::= {<letter>, <digit>} | {<letter>, <digit>, <code>}
letter ::= {a} | {b} | {c} | {d} | {e} | {f} | {g} | {h} | {i}
digit ::= {0} | {1} | {2} | {3} | {4}
Remaining: "
"
Support Line-Wrapped Rules
The best way is to not accept them - since the grammar wasn't designed for it unlike e.g. EBNF.
You can force the issue by doing a negative look-ahead in the skipper:
_skipper = blank | (eol >> !_rule);
start = skip(_skipper) [ _rule % +eol ];
For technical reasons (Boost spirit skipper issues) that doesn't compile, so we need to feed it a placeholder skipper inside the look-ahead:
_blank = blank;
_skipper = blank | (eol >> !skip(_blank.alias()) [ _rule ]);
start = skip(_skipper.alias()) [ _rule % +eol ];
Now it parses the same but with various line-breaks:
std::string const input = R"(<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= "a" | "b" | "c" | "d" | "e"
| "f" | "g" | "h" | "i"
<digit> ::= "0" | "1" | "2" | "3" |
"4"
)";
Printing:
code ::= {<letter>, <digit>} | {<letter>, <digit>, <code>}
letter ::= {a} | {b} | {c} | {d} | {e} | {f} | {g} | {h} | {i}
digit ::= {0} | {1} | {2} | {3} | {4}
FULL LISTING
Compiler Explorer
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/adapted.hpp>
#include <fmt/ranges.h>
#include <fmt/ostream.h>
#include <iomanip>
namespace qi = boost::spirit::qi;
// AST types produced by the BNF grammar.
namespace Ast {
// A rule name: a std::string that prints wrapped in angle brackets.
struct Name : std::string {
using std::string::string;
using std::string::operator=;
friend std::ostream& operator<<(std::ostream& os, Name const& n) {
// Stream the raw characters via c_str() so the const char* overload is
// used rather than this operator again.
return os << '<' << n.c_str() << '>';
}
};
// A term is either a rule-name reference or a literal string.
using Term = boost::variant<Name, std::string>;
// One alternative: a sequence of terms.
using List = std::list<Term>;
// All alternatives of a rule.
using Expression = std::list<List>;
// A single production: name ::= rhs.
struct Rule {
Name name; // lhs
Expression rhs;
};
// A whole grammar: the list of its rules.
using Syntax = std::list<Rule>;
}
BOOST_FUSION_ADAPT_STRUCT(Ast::Rule, name, rhs)
namespace Parser {
// Qi grammar for a simplistic BNF dialect that also accepts rules wrapped
// across several lines; the parsed result is an Ast::Syntax.
template <typename Iterator>
struct BNF: qi::grammar<Iterator, Ast::Syntax()> {
BNF(): BNF::base_type(start) {
using namespace qi;
// Plain blank skipper, used inside the look-ahead below.
_blank = blank;
// Skip blanks, and also skip an end-of-line unless a rule can be parsed
// right after it (negative look-ahead). This is what allows a rule's
// right-hand side to continue on the next line.
_skipper = blank | (eol >> !skip(_blank.alias()) [ _rule ]);
// A syntax is a list of rules separated by one or more end-of-lines.
start = skip(_skipper.alias()) [ _rule % +eol ];
// <name> ::= expression
_rule = _rule_name >> "::=" >> _expression;
// Alternatives separated by '|'.
_expression = _list % '|';
// One alternative is a sequence of one or more terms.
_list = +_term;
// A term is a quoted literal or a <rule-name> reference.
_term = _literal | _rule_name;
// Double- or single-quoted text; the closing quote character is excluded
// from the characters accepted inside the quotes.
_literal = '"' >> *(_character - '"') >> '"'
| "'" >> *(_character - "'") >> "'";
// Characters accepted inside a literal: alphanumerics plus the listed symbols.
_character = alnum | char_("\"'| !#$%&()*+,./:;>=<?#]\\^_`{}~[-");
// '<', a letter, then any number of alphanumerics or '-', then '>'.
_rule_name = '<' >> (alpha >> *(alnum | char_('-'))) >> '>';
BOOST_SPIRIT_DEBUG_NODES(
(_rule)(_expression)(_list)(_term)
(_literal)(_character)
(_rule_name))
}
private:
// The skipper is itself a rule (with no attribute and no skipper of its own).
using Skipper = qi::rule<Iterator>;
Skipper _skipper, _blank;
// The start rule declares no skipper; it installs one itself via skip(...).
qi::rule<Iterator, Ast::Syntax()> start;
// These rules are declared with the custom Skipper.
qi::rule<Iterator, Ast::Rule(), Skipper> _rule;
qi::rule<Iterator, Ast::Expression(), Skipper> _expression;
qi::rule<Iterator, Ast::List(), Skipper> _list;
// lexemes (declared without a skipper)
qi::rule<Iterator, Ast::Term()> _term;
qi::rule<Iterator, Ast::Name()> _rule_name;
qi::rule<Iterator, std::string()> _literal;
qi::rule<Iterator, char()> _character;
};
}
// Demo driver: parses a sample grammar containing line-wrapped rules and
// prints each parsed rule, or "Failed" if parsing does not succeed.
int main() {
Parser::BNF<std::string::const_iterator> const parser;
// Sample input: <letter> continues on a line starting with '|', and <digit>
// continues after a trailing '|'.
std::string const input = R"(<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= "a" | "b" | "c" | "d" | "e"
| "f" | "g" | "h" | "i"
<digit> ::= "0" | "1" | "2" | "3" |
"4"
)";
auto it = input.begin(), itEnd = input.end();
Ast::Syntax syntax;
if (parse(it, itEnd, parser, syntax)) {
for (auto& rule : syntax)
fmt::print("{} ::= {}\n", rule.name, fmt::join(rule.rhs, " | "));
} else {
std::cout << "Failed\n";
}
// Report any input left between `it` and the end after the parse call.
if (it != itEnd)
std::cout << "Remaining: " << std::quoted(std::string(it, itEnd)) << "\n";
}
Also Live On Coliru (without libfmt)

How to parse/identify double quoted string from the big expression using MARPA:R2 perl

Problem in parsing/identifying double quoted string from the big expression.
use strict;
use Marpa::R2;
use Data::Dumper;
my $grammar = Marpa::R2::Scanless::G->new({
default_action => '[values]',
source => \(<<'END_OF_SOURCE'),
:start ::= expression
expression ::= expression OP expression
expression ::= expression COMMA expression
expression ::= func LPAREN PARAM RPAREN
expression ::= PARAM
PARAM ::= STRING | REGEX_STRING
:discard ~ sp
sp ~ [\s]+
COMMA ~ [,]
STRING ~ [^ \/\(\),&:\"~]+
REGEX_STRING ~ yet to identify
OP ~ ' - ' | '&'
LPAREN ~ '('
RPAREN ~ ')'
func ~ 'func'
END_OF_SOURCE
});
my $recce = Marpa::R2::Scanless::R->new({grammar => $grammar});
my $input1 = "func(foo)&func(bar)"; -> able to parse it properly by parsing foo and bar as STRING LEXEME.
my $input2 = "\"foo\""; -> Here, I want to parse foo as regex_string LEXEME. REGEX_STRING is something which is enclosed in double quotes.
my $input3 = "func(\"foo\") - func(\"bar\")"; -> Here, func should be taken as func LEXEME, ( should be LPAREN, ) should be RPAREN, foo as REGEX_STRING, - as OP and same for func(\"bar\")
my $input4 = "func(\"foo\")"; -> Here, func should be taken as func LEXEME, ( should be LPAREN, ) should be RPAREN, foo as REGEX_STRING
print "Trying to parse:\n$input\n\n";
$recce->read(\$input);
my $value_ref = ${$recce->value};
print "Output:\n".Dumper($value_ref);
What did i try :
1st method:
My REGEX_STRING should be something : REGEX_STRING -> ~ '\"([^:]*?)\"'
If i try putting above REGEX_STRING in the code with input expression as my $input4 = "func(\"foo\")"; i get error like :
Error in SLIF parse: No lexeme found at line 1, column 5
* String before error: func(
* The error was at line 1, column 5, and at character 0x0022 '"', ...
* here: "foo")
Marpa::R2 exception
2nd method:
Tried including a rule like :
PARAM ::= STRING | REGEX_STRING
REGEX_STRING ::= '"' QUOTED_STRING '"'
STRING ~ [^ \/\(\),&:\"~]+
QUOTED_STRING ~ [^ ,&:\"~]+
The problem here is-> Input is given using:
my $input4 = "func(\"foo\")";
So, here it gives error because there are now two ways to parse this expression, either whole thing between double quotes which is func(\"foo\")
is taken as QUOTED_STRING or func should be taken as func LEXEME and so on.
Please help how do i fix this thing.

use 5.026;
use strictures;
use Data::Dumper qw(Dumper);
use Marpa::R2 qw();
# Build the SLIF grammar. Rule results are arrays of child values blessed into
# parsetree::<lhs>; lexemes are blessed by name.
my $grammar = Marpa::R2::Scanless::G->new({
    bless_package => 'parsetree',
    # Use an explicit heredoc terminator. An empty-string terminator (<<'')
    # ends at the first blank line, so the grammar text silently runs on into
    # the following code if that blank line is ever stripped.
    source        => \<<'END_OF_SOURCE',
:default ::= action => [values] bless => ::lhs
lexeme default = bless => ::name latm => 1
:start ::= expression
expression ::= expression OP expression
expression ::= expression COMMA expression
expression ::= func LPAREN PARAM RPAREN
expression ::= PARAM
PARAM ::= STRING | REGEXSTRING
:discard ~ sp
sp ~ [\s]+
COMMA ~ [,]
STRING ~ [^ \/\(\),&:\"~]+
REGEXSTRING ::= '"' QUOTEDSTRING '"'
QUOTEDSTRING ~ [^ ,&:\"~]+
OP ~ ' - ' | '&'
LPAREN ~ '('
RPAREN ~ ')'
func ~ 'func'
END_OF_SOURCE
});
# say $grammar->show_rules;
# Sample inputs: bare strings, a quoted string, and func(...) calls joined by operators.
my @samples = (
    'func(foo)&func(bar)',
    '"foo"',
    'func("foo") - func("bar")',
    'func("foo")',
);

# Parse each sample with a fresh recognizer and dump the resulting value.
foreach my $text (@samples) {
    # Add `trace_terminals => 1` to the options below to trace lexing.
    my $recognizer = Marpa::R2::Scanless::R->new({ grammar => $grammar });
    $recognizer->read(\$text);
    say "# $text";
    say Dumper($recognizer->value);
}

2nd method posted in question worked for me. I just have to include :
lexeme default = latm => 1
in my code.

Micro Parser Combinators (mpc) runs in endless loop

I am writing a compiler in C++ (using Visual Studio) for a small scripting language and I use this C parsing library.
So, I followed the instructions from the documentation and ended up with this piece of code:
/* Builds the mpc parsers for the scripting language, parses one sample
   program, prints either the AST or the parse error, then releases the parsers. */
int main()
{
/* One named parser per grammar rule; the names match the rule names below. */
mpc_parser_t* Int = mpc_new("int");
mpc_parser_t* Char = mpc_new("char");
mpc_parser_t* String = mpc_new("string");
mpc_parser_t* Id = mpc_new("id");
mpc_parser_t* Type = mpc_new("type");
mpc_parser_t* Formal = mpc_new("formal");
mpc_parser_t* Header = mpc_new("header");
mpc_parser_t* FuncDecl = mpc_new("funcdecl");
mpc_parser_t* VarDef = mpc_new("vardef");
mpc_parser_t* Expr = mpc_new("expr");
mpc_parser_t* Call = mpc_new("call");
mpc_parser_t* Atom = mpc_new("atom");
mpc_parser_t* Simple = mpc_new("simple");
mpc_parser_t* SimpleList = mpc_new("simplelist");
mpc_parser_t* Stmt = mpc_new("stmt");
mpc_parser_t* FuncDef = mpc_new("funcdef");
mpc_parser_t* Program = mpc_new("program");
/* Define them with the following Language */
/* NOTE(review): the rules `type`, `expr` and `atom` each have an alternative
   that begins with the rule itself (<type> '[' ']', <expr> (...) <expr>,
   <atom> '[' <expr> ']'), i.e. they are left-recursive. The mpc README says
   left recursion is not supported and sends the parser into an infinite
   loop -- this looks like the cause of the hang in mpc_parse; confirm. */
mpca_lang(MPCA_LANG_DEFAULT,
" \
int : /-?[0-9]+/ ; \
char : /'[a-zA-Z0-9!##$%^&*()\\_+-,.\\/<>?;'|\"`~]'/ ; \
string : /\"(\\\\.|[^\"])*\"/ ; \
id : /[a-zA-Z][a-zA-Z0-9_-]*/ ; \
type : \"int\" | \"bool\" | \"char\" | <type> '[' ']' | \"list\" '[' <type> ']' ; \
formal : (\"ref\")? <type> <id> (',' <id>)* ; \
header : <type>? <id> '(' (<formal> (';' <formal>)*)? ')' ; \
funcdecl : \"decl\" <header> ; \
vardef : <type> <id> (',' <id>)* ; \
expr : <atom> | <int> | <char> | '(' <expr> ')' \
| ('+' | '-') <expr> | <expr> ('+' | '-' | '*' | '/' | \"mod\") <expr> \
| <expr> ('=' | \"<>\" | '<' | '>' | \"<=\" | \">=\") <expr> \
| \"true\" | \"false\" | \"not\" <expr> | <expr> (\"and\" | \"or\") <expr> \
| \"new\" <type> '[' <expr> ']' | \"nil\" | \"nil?\" '(' <expr> ')' \
| <expr> '#' <expr> | \"head\" '(' <expr> ')' | \"tail\" '(' <expr> ')' ; \
call : <id> '(' (<expr> (',' <expr>)*)? ')' ; \
atom : <id> | <string> | <atom> '[' <expr> ']' | <call> ; \
simple : \"skip\" | <atom> \":=\" <expr> | <call> ; \
simplelist : <simple> (',' <simple>)* ; \
stmt : <simple> | \"exit\" | \"return\" <expr> \
| \"if\" <expr> ':' <stmt>+ (\"elif\" <expr> ':' <stmt>+)* \
(\"else\" ':' <stmt>+)? \"end\" \
| \"for\" <simplelist> ';' <expr> ';' <simplelist> ':' <stmt>+ \"end\" ; \
funcdef : \"def\" <header> ':' (<funcdef> | <funcdecl> | <vardef>)* <stmt>+ \"end\" ; \
program : /^/ <funcdef> /$/ ; \
",
Int, Char, String, Id, Type, Formal, Header, FuncDecl, VarDef, Expr,
Call, Atom, Simple, SimpleList, Stmt, FuncDef, Program);
mpc_result_t r;
/* NOTE(review): a string literal bound to a non-const char*; standard C++11
   and later reject this, so `const char*` is likely wanted -- confirm. */
char* input = "def hey () : return 1 end";
/* On success r.output holds the AST; on failure r.error holds the error. */
if(mpc_parse("input", input, Program, &r))
{
mpc_ast_print((mpc_ast_t*)r.output);
mpc_ast_delete((mpc_ast_t*)r.output);
}
else
{
mpc_err_print(r.error);
mpc_err_delete(r.error);
}
PAUSE("Press any key to continue . . .");
/* Undefine and Delete our Parsers */
/* 17 is the number of parsers passed in the argument list. */
mpc_cleanup(17, Int, Char, String, Id, Type, Formal, Header, FuncDecl, VarDef, Expr,
Call, Atom, Simple, SimpleList, Stmt, FuncDef, Program);
return 0;
}
The problem is that I run into a huge loop in mpc_parse. That loop never actually reaches the end. After some time I get this exception:
Unhandled exception at 0x00CBBC9C in TonyCC.exe: 0xC0000005: Access violation reading location 0x0000000C.
I don't know why. I suspect there is something wrong with my grammar but I cannot figure out what.
If someone has used this library before, do you have any idea what the problem might be?
Note: I know it is difficult to read the grammar from C code so here is an image of the grammar:

Regular Expression Period Issue

((https?|ftp)://|www.)(\S+[^.*])
I would like this expression to check for . in succession to each other. If it finds two or more periods back to back, the expression should fail. On the other hand, if it succeeds, I want it to match every character and/or symbol up until the first white space encountered.
In other words:
www.yahoo..com should fail
On a related note: I realize that this expression is very basic in terms of judging valid URL structure. I have another "more intelligent" regular expression in place that precedes the one above. The purpose of the posted one is meant to check the validity of the URL that is passed from the initial regular expression via preg_match_all.

You may want to check out FILTER_VALIDATE_URL (see http://php.net/manual/en/book.filter.php) instead of using a regex to validate your URLs.
Here's example usage:
// Validate a URL with PHP's built-in filter extension instead of a hand-written regex.
$url = "http://www.example.com";
// The truthiness of filter_var()'s result decides which message is printed.
echo filter_var($url, FILTER_VALIDATE_URL) ? "URL is valid" : "URL is not valid";

You can do something like this:
((https?|ftp)\:\/\/|www.)((?:[\w\-]+\.)*[\w\-]+)
This will not yet check for valid URLs, even if you skip double dots. I'd advise not to use regex if the language you're using (PHP?) has other means of validating an URL.
The RFC states the following:
; URL schemeparts for ip based protocols:
ip-schemepart = "//" login [ "/" urlpath ]
login = [ user [ ":" password ] "#" ] hostport
hostport = host [ ":" port ]
host = hostname | hostnumber
hostname = *[ domainlabel "." ] toplabel
domainlabel = alphadigit | alphadigit *[ alphadigit | "-" ] alphadigit
toplabel = alpha | alpha *[ alphadigit | "-" ] alphadigit
alphadigit = alpha | digit
hostnumber = digits "." digits "." digits "." digits
port = digits
user = *[ uchar | ";" | "?" | "&" | "=" ]
password = *[ uchar | ";" | "?" | "&" | "=" ]
urlpath = *xchar ; depends on protocol see section 3.1
; HTTP
httpurl = "http://" hostport [ "/" hpath [ "?" search ]]
hpath = hsegment *[ "/" hsegment ]
hsegment = *[ uchar | ";" | ":" | "#" | "&" | "=" ]
search = *[ uchar | ";" | ":" | "#" | "&" | "=" ]
; Miscellaneous definitions
lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" |
"i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" |
"q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" |
"y" | "z"
hialpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
"J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
"S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
alpha = lowalpha | hialpha
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
"8" | "9"
safe = "$" | "-" | "_" | "." | "+"
extra = "!" | "*" | "'" | "(" | ")" | ","
national = "{" | "}" | "|" | "\" | "^" | "~" | "[" | "]" | "`"
punctuation = "<" | ">" | "#" | "%" | <">
reserved = ";" | "/" | "?" | ":" | "#" | "&" | "="
hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
"a" | "b" | "c" | "d" | "e" | "f"
escape = "%" hex hex
unreserved = alpha | digit | safe | extra
uchar = unreserved | escape
xchar = unreserved | reserved | escape
digits = 1*digit

Using negative lookahead is an easy way if your engine supports it:
(?!.*\.\.)((https?|ftp)\:\/\/|www.)(\S+[^.*])
Otherwise, you have to be more specific:
^((https?|ftp)\:\/\/|www.)((\.[^.]|[^.\s])+[^.*])($|\s+)

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

OCaml lexing and parsing with whitespace sensitivity - ocaml

Related

Error in defining closing parenthesis in grammar for C

How to handle multi-line rules for parsing BNF grammar using boost spirit qi

How to parse/identify double quoted string from the big expression using MARPA:R2 perl

Micro Parser Combinators (mpc) runs in endless loop

Regular Expression Period Issue

Categories

Resources