Error in defining closing parenthesis in grammar for C - regex

I am trying to define a simple grammar for C which I am using in Lark. The problem is, I defined the closing parenthesis ("}" or ")") as a terminal in the grammar but it is throwing an error as "No terminal matches ')' in the current parser context". In many example grammar rules that I saw, the closing parenthesis was defined as terminals. How do I resolve this issue?
Here is the code:
from lark import Lark
g=r'''start : header_files
preprocessor_commands : "#include" | definition
definition : def string (header_files)* program
def : "#define" | "#undef" | "#ifdef" | "#ifndef" | "#if" | "#else" | "#elif" | "#endif" | "#error" | "#pragma"
header_files : preprocessor_commands file_names (header_files)* program
file_names : "<stdio.h>" | "<math.h>" | "<conio.h>" | "<stdlib.h>" | "<string.h>" | "<ctype.h>" | "<time.h>" | "<float.h>" | "<limits.h>" | "<wctype.h>"
program : data_type func "(" var ")" "{" (codeblock) close_par | data_type func "()" "{" (codeblock) close_par
data_type : "void" | "int" | "float" | "double" | "long" | "char" | "string" | "long long" | "unsigned_int"
func : "main" | string
var : string
codeblock : "return" var term | "return" const term | "return" func term | "return" "(" expressions ")" term | declarations | expressions | statements | call | print
declarations : data_type var assign const ";" (declarations)* codeblock | data_type var assign var ";" (declarations)* codeblock | data_type var ("," var)* ";" (declarations)* codeblock
expressions : arithmetic | bitwise | assignment
arithmetic : add | sub | mul | div | mod | unary
add : const "+" (arithmetic)* | var "+" (arithmetic)*
sub : const "-" (arithmetic)* | var "-" (arithmetic)*
mul : const "*" (arithmetic)* | var "*" (arithmetic)*
div : const "/" (arithmetic)* | var "/" (arithmetic)*
mod : const "%" (arithmetic)* | var "%" (arithmetic)*
unary : inc | dec
inc : "++" var | var "++"
dec : "--" var | var "--"
bitwise : and | or | xor | boc | ls | rs
and : const "&" (bitwise)* | var "&" (bitwise)*
or : const "|" (bitwise)* | var "|" (bitwise)*
xor : const "^" (bitwise)* | var "^" (bitwise)*
boc : var assign "~" const | var assign "~" var
ls : const "<<" const | var "<<" const
rs : const ">>" const | var ">>" const
assignment : assign | "*=" | "/=" | "%=" | "+=" | "-=" | "<<=" | ">>=" | "&=" | "^=" | "|="
assign : "="
statements : if | switch | loop
if : ("if" "(" logical close_par codeblock)+ ("elseif" "(" logical close_par codeblock)* ["else" codeblock]
logical : land | lor | lnot | equ | gre | les | greq | leeq | neq
land : const "&&" (logical)* | var "&&" (logical)*
lor : const "||" (logical)* | var "||" (logical)*
lnot : "!" (logical)+ | "!" (arithmetic)+ | "!" var | "!" const
equ : const "==" (logical)* | var "==" (logical)*
gre : const ">" (logical)* | var ">" (logical)*
les : const "<" (logical)* | var "<" (logical)*
greq : const ">=" (logical)* | var ">=" (logical)*
leeq : const "<=" (logical)* | var "<=" (logical)*
neq : const "!=" (logical)* | var "!=" (logical)*
switch : "switch" ("(" expressions ")") "{" (switch_case)* ["default" ":" codeblock] close_par
switch_case : "case" const ":" codeblock (switch_case)*
loop : for | while | do_while
for : "for" "(" [[data_type] var assign const] ";" [logical] ";" [arithmetic] ")" "{" codeblock "}" | "for" "(" [[data_type] var assign const] ";" [logical] ";" [arithmetic] ")" codeblock
while : "while" "(" logical ")" "{" codeblock "}" | "while" "(" logical ")" codeblock
do_while : "do" "{" codeblock "}" "while" "(" logical ")"
call : func "(" var ("," var)* ")" term | var assign func "(" var ("," var)* ")" term | func "(" ")" term | var assign func "(" ")" term
print : "printf" "(" (dcstring)* ")" term
%import common.SIGNED_NUMBER
const : SIGNED_NUMBER
term : ";" codeblock
tpar : ")" | "}" | ")" codeblock
digit : "0".."9"
nz_dig : "1".."9"
integer : (digit)* (nz_dig)+ | "-" (digit)* (nz_dig)+
decimal : (digit)+ "." (digit)+ | "-" (digit)+ "." (digit)+
letter : "a".."z" | "A".."Z"
char : letter | SIGNED_NUMBER
string : /[a-zA-Z0-9_.-]{2,}/ | (char)*
dcstring : /"[^"]*"/
close_par : "}" | ")" | "]"
WHITESPACE: " " | "\t" | "\f" | "\n"
%ignore WHITESPACE+
COMMENT: "//" /[^\n]/* | "/*" /(\S|\s)*?/ "*/"
%ignore COMMENT
'''
parser=Lark(grammar=g,parser="earley")
code='''#include<stdio.h>
#define PI 3.14
void main()
{
int a,b; long c=1;
if(a==b || b==c)
return 2;
}
'''
print(parser.parse(code).pretty())
This is the error :

Related

OCaml lexing and parsing with whitespace sensitivity

im building a parser with Sedlex and Menhir, where i have a function definition as:
(* Lexer *)
let non_ascii = [%sedlex.regexp? '\160' .. '\255'];
let escape = [%sedlex.regexp?
unicode | ('\\', Compl('\r' | '\n' | '\012' | hex_digit))
];
let ident_start = [%sedlex.regexp?
'_' | 'a' .. 'z' | 'A' .. 'Z' | '$' | non_ascii | escape
];
let ident_char = [%sedlex.regexp?
'_' | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | non_ascii | escape];
let rec get_next_token = buf => {
switch%sedlex (buf) {
| white_space => get_next_token(buf)
| eof => EOF
| ';' => SEMI_COLON
| '}' => RIGHT_BRACE
| '{' => LEFT_BRACE
| ':' => COLON
| '(' => LEFT_PAREN
| ')' => RIGHT_PAREN
| '[' => LEFT_BRACKET
| ']' => RIGHT_BRACKET
| '%' => PERCENTAGE
| '&' => AMPERSAND
| ident => IDENT(Sedlexing.latin1(buf))
| number => get_dimension(Sedlexing.latin1(buf), buf)
| _ => assert(false)
};
}
let parse = (buf, parser) => {
let last_token = ref((Parser.EOF, Lexing.dummy_pos, Lexing.dummy_pos));
let next_token = () => {
last_token := get_next_token_with_location(buf);
last_token^;
};
try(MenhirLib.Convert.Simplified.traditional2revised(parser, next_token)) {
| LexingError(_) as e => raise(e)
| _ => raise(ParseError(last_token^))
};
};
(* Parser *)
%token <string> IDENT
%token LEFT_PAREN
%token RIGHT_PAREN
function_expr:
| i = IDENT; LEFT_PAREN; xs = list(exprs); RIGHT_PAREN {
Texp_function (
(i, Lex_buffer.make_loc $startpos(i) $endpos(i)),
(xs, Lex_buffer.make_loc $startpos(xs) $endpos(xs))
)
}
and i have a simple ident definition that is:
| i = IDENT {Texp_ident i, Lex_buffer.make_loc $startpos(i) $endpos(i) }
functions cant have a space between the ident and the LEFT_PAREN, how can i define it?
i want that and func(1, 2, 3) produces a list of expressions as [Texp_ident "and"; Texp_function("func", [...])], but it is actually producing: [Texp_function("and", ["func"; ...])]. since it doesnt care about the space between ident and LEFT_PAREN. how can i fix that?

How to handle multi-line rules for gor parsing bnf grammar using boost spirit qi

Assuming I have a BNF grammar like this
<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= a | b | c | d | e
| f | g | h | i
<digit> ::= 0 | 1 | 2 | 3 |
4
If you look at the <letter> rule, its continuation starts with the | but that of the <digit> rule starts with the production with | appearing at the end of the previous line. I also don't want to use a particular symbol to represent the end of a rule.
How do check if a rule as ended using the Boost Spirit Qi for implementation.
I have just gone through the tutorial on the boost page and wondering how I am going to handle this.
Wikipedia
BNF syntax can only represent a rule in one line, whereas in EBNF a terminating character, the semicolon character “;” marks the end of a rule.
So the simple answer is: the input isn't BNF.
Iff you want to support it anyways (at your own peril :)) you'll have to make it so. So, let's write a simplistic BFN grammar, literally mapping from Wikipedia BNF
<syntax> ::= <rule> | <rule> <syntax>
<rule> ::= <opt-whitespace> "<" <rule-name> ">" <opt-whitespace> "::=" <opt-whitespace> <expression> <line-end>
<opt-whitespace> ::= " " <opt-whitespace> | ""
<expression> ::= <list> | <list> <opt-whitespace> "|" <opt-whitespace> <expression>
<line-end> ::= <opt-whitespace> <EOL> | <line-end> <line-end>
<list> ::= <term> | <term> <opt-whitespace> <list>
<term> ::= <literal> | "<" <rule-name> ">"
<literal> ::= '"' <text1> '"' | "'" <text2> "'"
<text1> ::= "" | <character1> <text1>
<text2> ::= '' | <character2> <text2>
<character> ::= <letter> | <digit> | <symbol>
<letter> ::= "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" | "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
<digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
<symbol> ::= "|" | " " | "!" | "#" | "$" | "%" | "&" | "(" | ")" | "*" | "+" | "," | "-" | "." | "/" | ":" | ";" | ">" | "=" | "<" | "?" | "#" | "[" | "\" | "]" | "^" | "_" | "`" | "{" | "}" | "~"
<character1> ::= <character> | "'"
<character2> ::= <character> | '"'
<rule-name> ::= <letter> | <rule-name> <rule-char>
<rule-char> ::= <letter> | <digit> | "-"
It could look like this:
template <typename Iterator>
struct BNF: qi::grammar<Iterator, Ast::Syntax()> {
BNF(): BNF::base_type(start) {
using namespace qi;
start = skip(blank) [ _rule % +eol ];
_rule = _rule_name >> "::=" >> _expression;
_expression = _list % '|';
_list = +_term;
_term = _literal | _rule_name;
_literal = '"' >> *(_character - '"') >> '"'
| "'" >> *(_character - "'") >> "'";
_character = alnum | char_("\"'| !#$%&()*+,./:;>=<?#]\\^_`{}~[-");
_rule_name = '<' >> (alpha >> *(alnum | char_('-'))) >> '>';
BOOST_SPIRIT_DEBUG_NODES(
(_rule)(_expression)(_list)(_term)
(_literal)(_character)
(_rule_name))
}
private:
qi::rule<Iterator, Ast::Syntax()> start;
qi::rule<Iterator, Ast::Rule(), qi::blank_type> _rule;
qi::rule<Iterator, Ast::Expression(), qi::blank_type> _expression;
qi::rule<Iterator, Ast::List(), qi::blank_type> _list;
// lexemes
qi::rule<Iterator, Ast::Term()> _term;
qi::rule<Iterator, Ast::Name()> _rule_name;
qi::rule<Iterator, std::string()> _literal;
qi::rule<Iterator, char()> _character;
};
Now it will parse your sample (corrected to be BNF):
std::string const input = R"(<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i"
<digit> ::= "0" | "1" | "2" | "3" | "4"
)";
Live On Compiler Explorer
Prints:
code ::= {<letter>, <digit>} | {<letter>, <digit>, <code>}
letter ::= {a} | {b} | {c} | {d} | {e} | {f} | {g} | {h} | {i}
digit ::= {0} | {1} | {2} | {3} | {4}
Remaining: "
"
Support Line-Wrapped Rules
The best way is to not accept them - since the grammar wasn't designed for it unlike e.g. EBNF.
You can force the issue by doing a negative look-ahead in the skipper:
_skipper = blank | (eol >> !_rule);
start = skip(_skipper) [ _rule % +eol ];
For technical reasons (Boost spirit skipper issues) that doesn't compile, so we need to feed it a placeholder skipper inside the look-ahead:
_blank = blank;
_skipper = blank | (eol >> !skip(_blank.alias()) [ _rule ]);
start = skip(_skipper.alias()) [ _rule % +eol ];
Now it parses the same but with various line-breaks:
std::string const input = R"(<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= "a" | "b" | "c" | "d" | "e"
| "f" | "g" | "h" | "i"
<digit> ::= "0" | "1" | "2" | "3" |
"4"
)";
Printing:
code ::= {<letter>, <digit>} | {<letter>, <digit>, <code>}
letter ::= {a} | {b} | {c} | {d} | {e} | {f} | {g} | {h} | {i}
digit ::= {0} | {1} | {2} | {3} | {4}
FULL LISTING
Compiler Explorer
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/adapted.hpp>
#include <fmt/ranges.h>
#include <fmt/ostream.h>
#include <iomanip>
namespace qi = boost::spirit::qi;
namespace Ast {
struct Name : std::string {
using std::string::string;
using std::string::operator=;
friend std::ostream& operator<<(std::ostream& os, Name const& n) {
return os << '<' << n.c_str() << '>';
}
};
using Term = boost::variant<Name, std::string>;
using List = std::list<Term>;
using Expression = std::list<List>;
struct Rule {
Name name; // lhs
Expression rhs;
};
using Syntax = std::list<Rule>;
}
BOOST_FUSION_ADAPT_STRUCT(Ast::Rule, name, rhs)
namespace Parser {
template <typename Iterator>
struct BNF: qi::grammar<Iterator, Ast::Syntax()> {
BNF(): BNF::base_type(start) {
using namespace qi;
_blank = blank;
_skipper = blank | (eol >> !skip(_blank.alias()) [ _rule ]);
start = skip(_skipper.alias()) [ _rule % +eol ];
_rule = _rule_name >> "::=" >> _expression;
_expression = _list % '|';
_list = +_term;
_term = _literal | _rule_name;
_literal = '"' >> *(_character - '"') >> '"'
| "'" >> *(_character - "'") >> "'";
_character = alnum | char_("\"'| !#$%&()*+,./:;>=<?#]\\^_`{}~[-");
_rule_name = '<' >> (alpha >> *(alnum | char_('-'))) >> '>';
BOOST_SPIRIT_DEBUG_NODES(
(_rule)(_expression)(_list)(_term)
(_literal)(_character)
(_rule_name))
}
private:
using Skipper = qi::rule<Iterator>;
Skipper _skipper, _blank;
qi::rule<Iterator, Ast::Syntax()> start;
qi::rule<Iterator, Ast::Rule(), Skipper> _rule;
qi::rule<Iterator, Ast::Expression(), Skipper> _expression;
qi::rule<Iterator, Ast::List(), Skipper> _list;
// lexemes
qi::rule<Iterator, Ast::Term()> _term;
qi::rule<Iterator, Ast::Name()> _rule_name;
qi::rule<Iterator, std::string()> _literal;
qi::rule<Iterator, char()> _character;
};
}
int main() {
Parser::BNF<std::string::const_iterator> const parser;
std::string const input = R"(<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= "a" | "b" | "c" | "d" | "e"
| "f" | "g" | "h" | "i"
<digit> ::= "0" | "1" | "2" | "3" |
"4"
)";
auto it = input.begin(), itEnd = input.end();
Ast::Syntax syntax;
if (parse(it, itEnd, parser, syntax)) {
for (auto& rule : syntax)
fmt::print("{} ::= {}\n", rule.name, fmt::join(rule.rhs, " | "));
} else {
std::cout << "Failed\n";
}
if (it != itEnd)
std::cout << "Remaining: " << std::quoted(std::string(it, itEnd)) << "\n";
}
Also Live On Coliru (without libfmt)

use of undeclared identifier `yylex` and `yyin`?

Here's my simple project source code:
bison.y
flex.l
flex_bison.cpp
flex.l:
%option noyywrap
%{
#include <string>
#include <cstring>
#include "bison.tab.hpp"
#define FT_SAVE_TOKEN yylval.literal = strndup(yytext, yyleng)
#define FT_TOKEN(t) (yylval.token = t)
%}
%%
"True" return FT_TRUE;
"False" return FT_FALSE;
"let" return FT_LET;
"Nil" return FT_NIL;
"if" return FT_IF;
"elseif" return FT_ELSEIF;
"else" return FT_ELSE;
"switch" return FT_SWITCH;
"case" return FT_CASE;
"otherwise" return FT_OTHERWISE;
"for" return FT_FOR;
"while" return FT_WHILE;
"break" return FT_BREAK;
"continue" return FT_CONTINUE;
"func" return FT_FUNC;
"class" return FT_CLASS;
"type" return FT_TYPE;
"isinstance" return FT_ISINSTANCE;
"import" return FT_IMPORT;
"return" return FT_RETURN;
"void" return FT_VOID;
"and" return FT_LOGICALAND;
"or" return FT_LOGICALOR;
"not" return FT_LOGICALNOT;
"int" return FT_INTEGER_KEYWORD;
"uint" return FT_UNSIGNED_INTEGER_KEYWORD;
"double" return FT_DOUBLE_KEYWORD;
[ \t\v\n\f\r] ;
[a-zA-Z_][a-zA-Z0-9_]* FT_SAVE_TOKEN; return FT_IDENTIFIER;
[0-9]+"."[0-9]+([Ee][+-]?[0-9]+)? FT_SAVE_TOKEN; return FT_DOUBLE;
[0-9]+([Ee][+-]?[0-9]+)? FT_SAVE_TOKEN; return FT_INTEGER;
\"(\\.|[^\\"])*\" FT_SAVE_TOKEN; return FT_STRING;
"+" return FT_TOKEN(FT_ADD);
"-" return FT_TOKEN(FT_SUB);
"*" return FT_TOKEN(FT_MUL);
"/" return FT_TOKEN(FT_DIV);
"%" return FT_TOKEN(FT_MOD);
"!" return FT_TOKEN(FT_BITNOT);
"&" return FT_TOKEN(FT_BITAND);
"|" return FT_TOKEN(FT_BITOR);
"~" return FT_TOKEN(FT_BITCOMPLEMENT);
"^" return FT_TOKEN(FT_BITXOR);
"=" return FT_TOKEN(FT_ASSIGN);
"+=" return FT_TOKEN(FT_ADDASSIGN);
"-=" return FT_TOKEN(FT_SUBASSIGN);
"*=" return FT_TOKEN(FT_MULASSIGN);
"/=" return FT_TOKEN(FT_DIVASSIGN);
"%=" return FT_TOKEN(FT_MODASSIGN);
"==" return FT_TOKEN(FT_EQ);
"!=" return FT_TOKEN(FT_NEQ);
"<" return FT_TOKEN(FT_LT);
"<=" return FT_TOKEN(FT_LE);
">" return FT_TOKEN(FT_GT);
">=" return FT_TOKEN(FT_GE);
"(" return FT_TOKEN(FT_LPAREN);
")" return FT_TOKEN(FT_RPAREN);
"[" return FT_TOKEN(FT_LBRACKET);
"]" return FT_TOKEN(FT_RBRACKET);
"{" return FT_TOKEN(FT_LBRACE);
"}" return FT_TOKEN(FT_RBRACE);
"," return FT_TOKEN(FT_COMMA);
";" return FT_TOKEN(FT_SEMI);
"?" return FT_TOKEN(FT_QUESTION);
":" return FT_TOKEN(FT_COLON);
"." return FT_TOKEN(FT_DOT);
. printf("Unknown token!n"); yyterminate();
%%
bison.y:
%{
#include <string>
#include <cstring>
#include <cstdio>
extern FILE *yyin;
extern int yylex();
void yyerror(const char *s) { printf("yyerror: %s\n", s); }
%}
/* Represents the many different ways we can access our data */
%union {
char *literal;
int token;
}
/* union.token: eof, keyword */
%token <token> FT_EOF
%token <token> FT_TRUE FT_FALSE FT_LET FT_NIL FT_IF FT_ELSEIF FT_ELSE FT_FOR FT_WHILE FT_BREAK FT_CONTINUE FT_SWITCH FT_CASE FT_OTHERWISE
%token <token> FT_FUNC FT_CLASS FT_TYPE FT_ISINSTANCE FT_IMPORT FT_RETURN FT_VOID FT_LOGICALAND FT_LOGICALOR FT_LOGICALNOT
%token <token> FT_INTEGER_KEYWORD FT_UNSIGNED_INTEGER_KEYWORD FT_DOUBLE_KEYWORD
/* union.literal, identifier, integer, double number, string */
%token <literal> FT_IDENTIFIER FT_INTEGER FT_DOUBLE FT_STRING
/* union.token: operator, comparator, punctuation */
%token <token> FT_ADD FT_SUB FT_MUL FT_DIV FT_MOD FT_BITNOT FT_BITAND FT_BITOR FT_BITCOMPLEMENT FT_BITXOR
%token <token> FT_ASSIGN FT_ADDASSIGN FT_SUBASSIGN FT_MULASSIGN FT_DIVASSIGN FT_MODASSIGN FT_EQ FT_NEQ FT_LT FT_LE FT_GT FT_GE
%token <token> FT_LPAREN FT_RPAREN FT_LBRACKET FT_RBRACKET FT_LBRACE FT_RBRACE FT_COMMA FT_SEMI FT_QUESTION FT_COLON FT_DOT
/*
%type <ident> ident
%type <expr> numeric expr
%type <varvec> func_decl_args
%type <exprvec> call_args
%type <block> program stmts block
%type <stmt> stmt var_decl func_decl
%type <token> comparison
*/
/* operator/comparator precedence */
%left FT_DOT FT_LPAREN FT_RPAREN FT_LBRACKET FT_RBRACKET
%left FT_MUL FT_DIV FT_MOD
%left FT_ADD FT_SUB
%left FT_LT FT_LE FT_GT FT_GE FT_EQ FT_NEQ
%left FT_BITNOT FT_BITAND FT_BITOR FT_BITXOR FT_BITCOMPLEMENT
%left FT_LOGICALNOT FT_LOGICALAND FT_LOGICALOR
%left FT_ASSIGN FT_ADDASSIGN FT_SUBASSIGN FT_MULASSIGN FT_DIVASSIGN FT_MODASSIGN
/*
%start program
*/
%%
primary_expression : FT_IDENTIFIER
| FT_INTEGER
| FT_DOUBLE
| FT_STRING
| '(' expression ')'
;
postfix_expression : primary_expression
/*| postfix_expression '[' expression ']'*/
| postfix_expression '(' ')'
| postfix_expression '(' argument_expression_list ')'
/*| postfix_expression '.' IDENTIFIER*/
;
argument_expression_list : assignment_expression
| argument_expression_list ',' assignment_expression
;
unary_expression : postfix_expression
| unary_operator postfix_expression
;
unary_operator : FT_BITAND
| FT_BITOR
| FT_BITNOT
| FT_BITCOMPLEMENT
| FT_BITXOR
| FT_ADD
| FT_SUB
;
/*
cast_expression : unary_expression
| '(' type_name ')' cast_expression
;
*/
multiplicative_expression : unary_expression
| multiplicative_expression FT_MUL unary_expression
| multiplicative_expression FT_DIV unary_expression
| multiplicative_expression FT_MOD unary_expression
;
additive_expression : multiplicative_expression
| additive_expression FT_ADD multiplicative_expression
| additive_expression FT_SUB multiplicative_expression
;
/*
shift_expression : additive_expression
| shift_expression '<<' additive_expression
| shift_expression '>>' additive_expression
;
*/
relational_expression : additive_expression
| relational_expression FT_LT additive_expression
| relational_expression FT_LE additive_expression
| relational_expression FT_GT additive_expression
| relational_expression FT_GE additive_expression
;
equality_expression : relational_expression
| equality_expression FT_EQ relational_expression
| equality_expression FT_NEQ relational_expression
;
/*
bit_and_expression : equality_expression
| bitand_expression '&' equality_expression
;
bit_xor_expression : bit_and_expression
| bit_xor_expression '^' bit_and_expression
;
bit_or_expression : bit_xor_expression
| bit_or_expression '|' bit_xor_expression
;
*/
logical_not_expression : equality_expression
| logical_not_expression FT_LOGICALNOT equality_expression
;
logical_and_expression : logical_not_expression
| logical_and_expression FT_LOGICALAND logical_not_expression
;
logical_or_expression : logical_and_expression
| logical_or_expression FT_LOGICALOR logical_and_expression
;
assignment_expression : logical_or_expression
| unary_expression assignment_operator assignment_expression
;
assignment_operator : FT_ASSIGN
| FT_MULASSIGN
| FT_DIVASSIGN
| FT_MODASSIGN
| FT_ADDASSIGN
| FT_SUBASSIGN
;
constant_expression : logical_or_expression
;
expression : assignment_expression
;
unit : external_declaration
| unit external_declaration
;
external_declaration : function_declaration
| declaration
;
declaration : FT_LET declaration_init_list FT_SEMI
;
declaration_init_list : declaration_init
| declaration_init_list declaration_init
;
declaration_init : FT_IDENTIFIER FT_ASSIGN constant_expression
;
function_declaration : FT_FUNC FT_IDENTIFIER FT_LPAREN function_arg_list FT_RPAREN compound_statement
| FT_FUNC FT_IDENTIFIER FT_LPAREN FT_RPAREN compound_statement
;
function_arg_list : function_arg
| function_arg_list FT_COMMA function_arg
;
function_arg : FT_IDENTIFIER
;
compound_statement : FT_LBRACE FT_RBRACE
| FT_LBRACE statement_list FT_RBRACE
| FT_LBRACE declaration_list FT_RBRACE
;
statement_list : statement
| statement_list statement
;
declaration_list : declaration
| declaration_list declaration
;
statement : compound_statement
| expression_statement
| selection_statement
| iteration_statement
| jump_statement
;
expression_statement : FT_SEMI
| expression FT_SEMI
;
selection_statement : FT_IF FT_LPAREN expression FT_RPAREN statement
| FT_IF FT_LPAREN expression FT_RPAREN statement FT_ELSE statement
| FT_SWITCH FT_LPAREN expression FT_RPAREN statement
;
iteration_statement : FT_WHILE FT_LPAREN expression FT_RPAREN statement
| FT_FOR FT_LPAREN expression_statement expression_statement FT_RPAREN statement
| FT_FOR FT_LPAREN expression_statement expression_statement expression FT_RPAREN statement
;
jump_statement : FT_CONTINUE FT_SEMI
| FT_BREAK FT_SEMI
| FT_RETURN FT_SEMI
| FT_RETURN expression FT_SEMI
;
%%
flex_bison.cpp:
#include "bison.tab.hpp"
#include <cstdio>
int main(int argc, char **argv) {
if (argc <= 1) {
printf("error! filename missing!\n");
return 0;
}
FILE *fp = fopen(argv[1], "r");
yyin = fp;
int t;
while ((t = yylex()) != 0) {
printf("token: %d", t);
if (t == FT_IDENTIFIER || t == FT_INTEGER || t == FT_DOUBLE ||
t == FT_STRING) {
printf("literal: %s\n", yylval.literal);
} else {
printf("\n");
}
}
fclose(fp);
return 0;
}
generate code through commands:
$ flex -o flex.yy.cpp flex.l
$ bison -d -o bison.tab.hpp bison.y
$ g++ -o test.exe bison.tab.cpp flex.yy.cpp flex_bison.cpp
Here's the error message:
use of undeclared identifier 'yyin'
use of undeclared identifier 'yylex`
Do I have to define yyin and yylex before main function with code below?
extern FILE *yyin;
extern int yylex(void);
Even if the quoted code would be added to the output file, you still would get complaints about undefined yylex and yyin. Your code only declares these things, it does not define them. Neither yacc nor bison define these for you, you have to provide those functions yourself (you can use for example lex or flex to generate them). Take a look at bison documentation. The RPN example has a short example for a yylex function.
Edit after question was edited: as per the documentation of bison, the header generated from the -d option
Pretend that ‘%defines’ was specified, i.e., write an extra output
file containing macro definitions for the token type names defined
in the grammar, as well as a few other declarations.
This does not include the things you specified in the prologue, in particular not the declaration of yyin and yylex. If you need these declarations in multiple files then you may want to declare them in a separate header file and include that header file from bison.y and all other files that require that declaration (like flex_bison.cpp).

code optimization

I must write a function "to_string" wich receives this datatype
datatype prop = Atom of string | Not of prop | And of prop*prop | Or of prop*prop;
and returns a string.
Example
show
And(Atom("saturday"),Atom("night")) =
"(saturday & night)"
My function is working but I have 2 problems.
the interpreter tells me -> Warning: match nonexhaustive
I think i can write the function with locals functions for all the types (Not, And, Or) and avoid duplicate code but I don't know how.
there is my code
datatype prop = Atom of string | Not of prop | And of prop*prop | Or of prop*prop;
fun show(Atom(alpha)) = alpha
| show(Not(Atom(alpha))) = "(- "^alpha^" )"
| show(Or(Atom(alpha),Atom(beta))) = "( "^alpha^" | "^beta^" )"
| show(Not(Or(Atom(alpha),Atom(beta)))) = "(- ( "^alpha^" | "^beta^" ))"
| show(Or(Not(Atom(alpha)),Atom(beta))) = "( (-"^alpha^") | "^beta^" )"
| show(Or(Atom(alpha),Not(Atom(beta)))) = "( "^alpha^" | (-"^beta^") )"
| show(Or(Not(Atom(alpha)),Not(Atom(beta)))) = "( (-"^alpha^") | (-"^beta^") )"
| show(And(Atom(alpha),Atom(beta))) = "( "^alpha^" & "^beta^" )"
| show(Not(And(Atom(alpha),Atom(beta)))) = "(- ( "^alpha^" & "^beta^" ))"
| show(And(Not(Atom(alpha)),Atom(beta))) = "( (-"^alpha^") & "^beta^" )"
| show(And(Atom(alpha),Not(Atom(beta)))) = "( "^alpha^" & (-"^beta^") )"
| show(And(Not(Atom(alpha)),Not(Atom(beta)))) = "( (-"^alpha^") & (-"^beta^") )";
Thanks a lot for your help.
The general rule is as follows: if you have a recursive data type, you should use a recursive function to transform it.
Your match expression is not exhaustive because there are a lot of variants you can't handle - i.e. And(And(Atom("a"), Atom("b")), Atom("c")).
You should rewrite the function with recursive calls to itself - i.e. replace Not(Atom(alpha)) match with Not(expr):
show(Not(expr)) = "(- " ^ show(expr) ^ " )"
I'm sure you can figure out the rest (you'll have two recursive calls for and/or).

Regular Expression Period Issue

((https?|ftp)://|www.)(\S+[^.*])
I would like this expression to check for . in succession to each other. If it finds two or more periods back to back, the expression should fail. On the other hand, if it succeeds, I want it to match every character and/or symbol up until the first white space encountered.
In other words:
www.yahoo..com should fail
On a related note: I realize that this expression is very basic in terms of judging valid URL structure. I have another "more intelligent" regular expression in place that precedes the one above. The purpose of the posted one is meant to check the validity of the URL that is passed from the initial regular expression via preg_match_all.
You may awnt to check out FILTER_VALIDATE_URL with http://php.net/manual/en/book.filter.php instead of using Regex to validate your URLS.
Here's example usage:
$url = "http://www.example.com";
if(!filter_var($url, FILTER_VALIDATE_URL))
{
echo "URL is not valid";
}
else
{
echo "URL is valid";
}
You can do something like this:
((https?|ftp)\:\/\/|www.)((?:[\w\-]+\.)*[\w\-]+)
This will not yet check for valid URLs, even if you skip double dots. I'd advise not to use regex if the language you're using (PHP?) has other means of validating an URL.
The RFC states the following:
; URL schemeparts for ip based protocols:
ip-schemepart = "//" login [ "/" urlpath ]
login = [ user [ ":" password ] "#" ] hostport
hostport = host [ ":" port ]
host = hostname | hostnumber
hostname = *[ domainlabel "." ] toplabel
domainlabel = alphadigit | alphadigit *[ alphadigit | "-" ] alphadigit
toplabel = alpha | alpha *[ alphadigit | "-" ] alphadigit
alphadigit = alpha | digit
hostnumber = digits "." digits "." digits "." digits
port = digits
user = *[ uchar | ";" | "?" | "&" | "=" ]
password = *[ uchar | ";" | "?" | "&" | "=" ]
urlpath = *xchar ; depends on protocol see section 3.1
; HTTP
httpurl = "http://" hostport [ "/" hpath [ "?" search ]]
hpath = hsegment *[ "/" hsegment ]
hsegment = *[ uchar | ";" | ":" | "#" | "&" | "=" ]
search = *[ uchar | ";" | ":" | "#" | "&" | "=" ]
; Miscellaneous definitions
lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" |
"i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" |
"q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" |
"y" | "z"
hialpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
"J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
"S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
alpha = lowalpha | hialpha
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
"8" | "9"
safe = "$" | "-" | "_" | "." | "+"
extra = "!" | "*" | "'" | "(" | ")" | ","
national = "{" | "}" | "|" | "\" | "^" | "~" | "[" | "]" | "`"
punctuation = "<" | ">" | "#" | "%" | <">
reserved = ";" | "/" | "?" | ":" | "#" | "&" | "="
hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
"a" | "b" | "c" | "d" | "e" | "f"
escape = "%" hex hex
unreserved = alpha | digit | safe | extra
uchar = unreserved | escape
xchar = unreserved | reserved | escape
digits = 1*digit
Using negative lookahead is an easy way if your engine supports it:
(?!.*\.\.)((https?|ftp)\:\/\/|www.)(\S+[^.*])
Otherwise, you have to be more specific:
^((https?|ftp)\:\/\/|www.)((\.[^.]|[^.\s])+[^.*])($|\s+)