Micro Parser Combinators (mpc) runs in endless loop - c++

I am writing a compiler in C++ (using Visual Studio) for a small scripting language and I use this C parsing library.
So, I followed the instructions from the documentation and ended up with this piece of code:
/* Demo driver: declares the grammar's parsers, defines them from a textual
   grammar via mpca_lang, parses one hard-coded sample program and prints the
   resulting AST or the parse error. */
int main()
{
/* One named parser handle per rule of the grammar below. */
mpc_parser_t* Int = mpc_new("int");
mpc_parser_t* Char = mpc_new("char");
mpc_parser_t* String = mpc_new("string");
mpc_parser_t* Id = mpc_new("id");
mpc_parser_t* Type = mpc_new("type");
mpc_parser_t* Formal = mpc_new("formal");
mpc_parser_t* Header = mpc_new("header");
mpc_parser_t* FuncDecl = mpc_new("funcdecl");
mpc_parser_t* VarDef = mpc_new("vardef");
mpc_parser_t* Expr = mpc_new("expr");
mpc_parser_t* Call = mpc_new("call");
mpc_parser_t* Atom = mpc_new("atom");
mpc_parser_t* Simple = mpc_new("simple");
mpc_parser_t* SimpleList = mpc_new("simplelist");
mpc_parser_t* Stmt = mpc_new("stmt");
mpc_parser_t* FuncDef = mpc_new("funcdef");
mpc_parser_t* Program = mpc_new("program");
/* Define them with the following Language */
/* NOTE(review): the rules type, expr and atom each have alternatives that begin
   with the rule itself (e.g. <expr> '#' <expr>, <atom> '[' <expr> ']'), i.e. the
   grammar is left-recursive. Presumably this is why mpc_parse never terminates;
   confirm against the mpc documentation on left recursion. */
mpca_lang(MPCA_LANG_DEFAULT,
" \
int : /-?[0-9]+/ ; \
char : /'[a-zA-Z0-9!##$%^&*()\\_+-,.\\/<>?;'|\"`~]'/ ; \
string : /\"(\\\\.|[^\"])*\"/ ; \
id : /[a-zA-Z][a-zA-Z0-9_-]*/ ; \
type : \"int\" | \"bool\" | \"char\" | <type> '[' ']' | \"list\" '[' <type> ']' ; \
formal : (\"ref\")? <type> <id> (',' <id>)* ; \
header : <type>? <id> '(' (<formal> (';' <formal>)*)? ')' ; \
funcdecl : \"decl\" <header> ; \
vardef : <type> <id> (',' <id>)* ; \
expr : <atom> | <int> | <char> | '(' <expr> ')' \
| ('+' | '-') <expr> | <expr> ('+' | '-' | '*' | '/' | \"mod\") <expr> \
| <expr> ('=' | \"<>\" | '<' | '>' | \"<=\" | \">=\") <expr> \
| \"true\" | \"false\" | \"not\" <expr> | <expr> (\"and\" | \"or\") <expr> \
| \"new\" <type> '[' <expr> ']' | \"nil\" | \"nil?\" '(' <expr> ')' \
| <expr> '#' <expr> | \"head\" '(' <expr> ')' | \"tail\" '(' <expr> ')' ; \
call : <id> '(' (<expr> (',' <expr>)*)? ')' ; \
atom : <id> | <string> | <atom> '[' <expr> ']' | <call> ; \
simple : \"skip\" | <atom> \":=\" <expr> | <call> ; \
simplelist : <simple> (',' <simple>)* ; \
stmt : <simple> | \"exit\" | \"return\" <expr> \
| \"if\" <expr> ':' <stmt>+ (\"elif\" <expr> ':' <stmt>+)* \
(\"else\" ':' <stmt>+)? \"end\" \
| \"for\" <simplelist> ';' <expr> ';' <simplelist> ':' <stmt>+ \"end\" ; \
funcdef : \"def\" <header> ':' (<funcdef> | <funcdecl> | <vardef>)* <stmt>+ \"end\" ; \
program : /^/ <funcdef> /$/ ; \
",
Int, Char, String, Id, Type, Formal, Header, FuncDecl, VarDef, Expr,
Call, Atom, Simple, SimpleList, Stmt, FuncDef, Program);
/* Parse a fixed sample program with the Program parser. On success r.output is
   used as the AST; on failure r.error is used as the error. */
mpc_result_t r;
char* input = "def hey () : return 1 end";
if(mpc_parse("input", input, Program, &r))
{
mpc_ast_print((mpc_ast_t*)r.output);
mpc_ast_delete((mpc_ast_t*)r.output);
}
else
{
mpc_err_print(r.error);
mpc_err_delete(r.error);
}
/* NOTE(review): PAUSE is not defined in the code shown here; presumably a
   project macro that waits for a key press -- confirm. */
PAUSE("Press any key to continue . . .");
/* Undefine and Delete our Parsers */
/* 17 is the number of parser arguments that follow. */
mpc_cleanup(17, Int, Char, String, Id, Type, Formal, Header, FuncDecl, VarDef, Expr,
Call, Atom, Simple, SimpleList, Stmt, FuncDef, Program);
return 0;
}
The problem is that I run into a huge loop in mpc_parse. That loop never actually reaches the end. After some time I get this exception:
Unhandled exception at 0x00CBBC9C in TonyCC.exe: 0xC0000005: Access violation reading location 0x0000000C.
I don't know why. I suspect there is something wrong with my grammar but I cannot figure out what.
If someone has used this library before, do you have any idea what the problem might be?
Note: I know it is difficult to read the grammar from C code so here is an image of the grammar:

Related

How to handle multi-line rules for parsing a BNF grammar using Boost Spirit Qi

Assuming I have a BNF grammar like this
<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= a | b | c | d | e
| f | g | h | i
<digit> ::= 0 | 1 | 2 | 3 |
4
If you look at the <letter> rule, its continuation starts with the | but that of the <digit> rule starts with the production with | appearing at the end of the previous line. I also don't want to use a particular symbol to represent the end of a rule.
How do I check whether a rule has ended when using Boost Spirit Qi for the implementation?
I have just gone through the tutorial on the boost page and wondering how I am going to handle this.
Wikipedia
BNF syntax can only represent a rule in one line, whereas in EBNF a terminating character, the semicolon character “;” marks the end of a rule.
So the simple answer is: the input isn't BNF.
Iff you want to support it anyway (at your own peril :)) you'll have to make it so. So, let's write a simplistic BNF grammar, literally mapping from the Wikipedia BNF:
<syntax> ::= <rule> | <rule> <syntax>
<rule> ::= <opt-whitespace> "<" <rule-name> ">" <opt-whitespace> "::=" <opt-whitespace> <expression> <line-end>
<opt-whitespace> ::= " " <opt-whitespace> | ""
<expression> ::= <list> | <list> <opt-whitespace> "|" <opt-whitespace> <expression>
<line-end> ::= <opt-whitespace> <EOL> | <line-end> <line-end>
<list> ::= <term> | <term> <opt-whitespace> <list>
<term> ::= <literal> | "<" <rule-name> ">"
<literal> ::= '"' <text1> '"' | "'" <text2> "'"
<text1> ::= "" | <character1> <text1>
<text2> ::= '' | <character2> <text2>
<character> ::= <letter> | <digit> | <symbol>
<letter> ::= "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" | "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
<digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
<symbol> ::= "|" | " " | "!" | "#" | "$" | "%" | "&" | "(" | ")" | "*" | "+" | "," | "-" | "." | "/" | ":" | ";" | ">" | "=" | "<" | "?" | "#" | "[" | "\" | "]" | "^" | "_" | "`" | "{" | "}" | "~"
<character1> ::= <character> | "'"
<character2> ::= <character> | '"'
<rule-name> ::= <letter> | <rule-name> <rule-char>
<rule-char> ::= <letter> | <digit> | "-"
It could look like this:
// Spirit Qi grammar for BNF text: parses a sequence of rules into an Ast::Syntax.
template <typename Iterator>
struct BNF: qi::grammar<Iterator, Ast::Syntax()> {
BNF(): BNF::base_type(start) {
using namespace qi;
// Rules are separated by one or more end-of-lines; `blank` is the skipper
// used while parsing them.
start = skip(blank) [ _rule % +eol ];
// <name> ::= expression
_rule = _rule_name >> "::=" >> _expression;
// Alternatives separated by '|'.
_expression = _list % '|';
// One alternative: one or more terms in sequence.
_list = +_term;
// A term is a quoted literal or a <rule-name> reference.
_term = _literal | _rule_name;
// Literal text in double quotes or in single quotes; the delimiting quote
// character is excluded from the contents.
_literal = '"' >> *(_character - '"') >> '"'
| "'" >> *(_character - "'") >> "'";
// Characters allowed inside a literal: alphanumerics plus the listed punctuation.
_character = alnum | char_("\"'| !#$%&()*+,./:;>=<?#]\\^_`{}~[-");
// '<', a letter, then any number of letters, digits or '-', then '>'.
_rule_name = '<' >> (alpha >> *(alnum | char_('-'))) >> '>';
// Register the rules with Spirit's debug facility.
BOOST_SPIRIT_DEBUG_NODES(
(_rule)(_expression)(_list)(_term)
(_literal)(_character)
(_rule_name))
}
private:
qi::rule<Iterator, Ast::Syntax()> start;
// These rules are declared with qi::blank_type as their skipper.
qi::rule<Iterator, Ast::Rule(), qi::blank_type> _rule;
qi::rule<Iterator, Ast::Expression(), qi::blank_type> _expression;
qi::rule<Iterator, Ast::List(), qi::blank_type> _list;
// lexemes: rules declared without a skipper type
qi::rule<Iterator, Ast::Term()> _term;
qi::rule<Iterator, Ast::Name()> _rule_name;
qi::rule<Iterator, std::string()> _literal;
qi::rule<Iterator, char()> _character;
};
Now it will parse your sample (corrected to be BNF):
std::string const input = R"(<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i"
<digit> ::= "0" | "1" | "2" | "3" | "4"
)";
Live On Compiler Explorer
Prints:
code ::= {<letter>, <digit>} | {<letter>, <digit>, <code>}
letter ::= {a} | {b} | {c} | {d} | {e} | {f} | {g} | {h} | {i}
digit ::= {0} | {1} | {2} | {3} | {4}
Remaining: "
"
Support Line-Wrapped Rules
The best way is to not accept them - since the grammar wasn't designed for it unlike e.g. EBNF.
You can force the issue by doing a negative look-ahead in the skipper:
_skipper = blank | (eol >> !_rule);
start = skip(_skipper) [ _rule % +eol ];
For technical reasons (Boost spirit skipper issues) that doesn't compile, so we need to feed it a placeholder skipper inside the look-ahead:
_blank = blank;
_skipper = blank | (eol >> !skip(_blank.alias()) [ _rule ]);
start = skip(_skipper.alias()) [ _rule % +eol ];
Now it parses the same but with various line-breaks:
std::string const input = R"(<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= "a" | "b" | "c" | "d" | "e"
| "f" | "g" | "h" | "i"
<digit> ::= "0" | "1" | "2" | "3" |
"4"
)";
Printing:
code ::= {<letter>, <digit>} | {<letter>, <digit>, <code>}
letter ::= {a} | {b} | {c} | {d} | {e} | {f} | {g} | {h} | {i}
digit ::= {0} | {1} | {2} | {3} | {4}
FULL LISTING
Compiler Explorer
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/adapted.hpp>
#include <fmt/ranges.h>
#include <fmt/ostream.h>
#include <iomanip>
namespace qi = boost::spirit::qi;
// AST types produced by the BNF parser.
namespace Ast {
// A rule name: a std::string that streams wrapped in angle brackets.
struct Name : std::string {
using std::string::string;
using std::string::operator=;
friend std::ostream& operator<<(std::ostream& os, Name const& n) {
// Stream the raw characters via c_str() so the output is <name>.
return os << '<' << n.c_str() << '>';
}
};
// A term is either a reference to a rule (Name) or literal text (std::string).
using Term = boost::variant<Name, std::string>;
// A sequence of terms, i.e. one alternative of a rule.
using List = std::list<Term>;
// The alternatives of a rule.
using Expression = std::list<List>;
// One grammar rule: a name and its alternatives.
struct Rule {
Name name; // lhs
Expression rhs; // alternatives on the right-hand side
};
// A whole grammar: a list of rules.
using Syntax = std::list<Rule>;
}
BOOST_FUSION_ADAPT_STRUCT(Ast::Rule, name, rhs)
namespace Parser {
// Spirit Qi grammar for BNF text that also accepts rules wrapped over several
// lines: parses a sequence of rules into an Ast::Syntax.
template <typename Iterator>
struct BNF: qi::grammar<Iterator, Ast::Syntax()> {
BNF(): BNF::base_type(start) {
using namespace qi;
// Plain blank skipper, used as a placeholder inside the look-ahead below.
_blank = blank;
// Skip blanks, and also an end-of-line unless a new rule starts right after
// it (negative look-ahead); this lets a rule continue on the next line.
_skipper = blank | (eol >> !skip(_blank.alias()) [ _rule ]);
// Rules are separated by one or more end-of-lines that the skipper did not consume.
start = skip(_skipper.alias()) [ _rule % +eol ];
// <name> ::= expression
_rule = _rule_name >> "::=" >> _expression;
// Alternatives separated by '|'.
_expression = _list % '|';
// One alternative: one or more terms in sequence.
_list = +_term;
// A term is a quoted literal or a <rule-name> reference.
_term = _literal | _rule_name;
// Literal text in double quotes or in single quotes; the delimiting quote
// character is excluded from the contents.
_literal = '"' >> *(_character - '"') >> '"'
| "'" >> *(_character - "'") >> "'";
// Characters allowed inside a literal: alphanumerics plus the listed punctuation.
_character = alnum | char_("\"'| !#$%&()*+,./:;>=<?#]\\^_`{}~[-");
// '<', a letter, then any number of letters, digits or '-', then '>'.
_rule_name = '<' >> (alpha >> *(alnum | char_('-'))) >> '>';
// Register the rules with Spirit's debug facility.
BOOST_SPIRIT_DEBUG_NODES(
(_rule)(_expression)(_list)(_term)
(_literal)(_character)
(_rule_name))
}
private:
// The skipper is itself a rule so it can refer to _rule in its look-ahead.
using Skipper = qi::rule<Iterator>;
Skipper _skipper, _blank;
qi::rule<Iterator, Ast::Syntax()> start;
qi::rule<Iterator, Ast::Rule(), Skipper> _rule;
qi::rule<Iterator, Ast::Expression(), Skipper> _expression;
qi::rule<Iterator, Ast::List(), Skipper> _list;
// lexemes: rules declared without a skipper type
qi::rule<Iterator, Ast::Term()> _term;
qi::rule<Iterator, Ast::Name()> _rule_name;
qi::rule<Iterator, std::string()> _literal;
qi::rule<Iterator, char()> _character;
};
}
int main() {
Parser::BNF<std::string::const_iterator> const parser;
std::string const input = R"(<code> ::= <letter><digit> | <letter><digit><code>
<letter> ::= "a" | "b" | "c" | "d" | "e"
| "f" | "g" | "h" | "i"
<digit> ::= "0" | "1" | "2" | "3" |
"4"
)";
auto it = input.begin(), itEnd = input.end();
Ast::Syntax syntax;
if (parse(it, itEnd, parser, syntax)) {
for (auto& rule : syntax)
fmt::print("{} ::= {}\n", rule.name, fmt::join(rule.rhs, " | "));
} else {
std::cout << "Failed\n";
}
if (it != itEnd)
std::cout << "Remaining: " << std::quoted(std::string(it, itEnd)) << "\n";
}
Also Live On Coliru (without libfmt)

use of undeclared identifier `yylex` and `yyin`?

Here's my simple project source code:
bison.y
flex.l
flex_bison.cpp
flex.l:
%option noyywrap
%{
#include <string>
#include <cstring>
#include "bison.tab.hpp"
#define FT_SAVE_TOKEN yylval.literal = strndup(yytext, yyleng)
#define FT_TOKEN(t) (yylval.token = t)
%}
%%
"True" return FT_TRUE;
"False" return FT_FALSE;
"let" return FT_LET;
"Nil" return FT_NIL;
"if" return FT_IF;
"elseif" return FT_ELSEIF;
"else" return FT_ELSE;
"switch" return FT_SWITCH;
"case" return FT_CASE;
"otherwise" return FT_OTHERWISE;
"for" return FT_FOR;
"while" return FT_WHILE;
"break" return FT_BREAK;
"continue" return FT_CONTINUE;
"func" return FT_FUNC;
"class" return FT_CLASS;
"type" return FT_TYPE;
"isinstance" return FT_ISINSTANCE;
"import" return FT_IMPORT;
"return" return FT_RETURN;
"void" return FT_VOID;
"and" return FT_LOGICALAND;
"or" return FT_LOGICALOR;
"not" return FT_LOGICALNOT;
"int" return FT_INTEGER_KEYWORD;
"uint" return FT_UNSIGNED_INTEGER_KEYWORD;
"double" return FT_DOUBLE_KEYWORD;
[ \t\v\n\f\r] ;
[a-zA-Z_][a-zA-Z0-9_]* FT_SAVE_TOKEN; return FT_IDENTIFIER;
[0-9]+"."[0-9]+([Ee][+-]?[0-9]+)? FT_SAVE_TOKEN; return FT_DOUBLE;
[0-9]+([Ee][+-]?[0-9]+)? FT_SAVE_TOKEN; return FT_INTEGER;
\"(\\.|[^\\"])*\" FT_SAVE_TOKEN; return FT_STRING;
"+" return FT_TOKEN(FT_ADD);
"-" return FT_TOKEN(FT_SUB);
"*" return FT_TOKEN(FT_MUL);
"/" return FT_TOKEN(FT_DIV);
"%" return FT_TOKEN(FT_MOD);
"!" return FT_TOKEN(FT_BITNOT);
"&" return FT_TOKEN(FT_BITAND);
"|" return FT_TOKEN(FT_BITOR);
"~" return FT_TOKEN(FT_BITCOMPLEMENT);
"^" return FT_TOKEN(FT_BITXOR);
"=" return FT_TOKEN(FT_ASSIGN);
"+=" return FT_TOKEN(FT_ADDASSIGN);
"-=" return FT_TOKEN(FT_SUBASSIGN);
"*=" return FT_TOKEN(FT_MULASSIGN);
"/=" return FT_TOKEN(FT_DIVASSIGN);
"%=" return FT_TOKEN(FT_MODASSIGN);
"==" return FT_TOKEN(FT_EQ);
"!=" return FT_TOKEN(FT_NEQ);
"<" return FT_TOKEN(FT_LT);
"<=" return FT_TOKEN(FT_LE);
">" return FT_TOKEN(FT_GT);
">=" return FT_TOKEN(FT_GE);
"(" return FT_TOKEN(FT_LPAREN);
")" return FT_TOKEN(FT_RPAREN);
"[" return FT_TOKEN(FT_LBRACKET);
"]" return FT_TOKEN(FT_RBRACKET);
"{" return FT_TOKEN(FT_LBRACE);
"}" return FT_TOKEN(FT_RBRACE);
"," return FT_TOKEN(FT_COMMA);
";" return FT_TOKEN(FT_SEMI);
"?" return FT_TOKEN(FT_QUESTION);
":" return FT_TOKEN(FT_COLON);
"." return FT_TOKEN(FT_DOT);
. { /* No earlier rule matched this character: report it and end scanning. */
    printf("Unknown token!\n"); yyterminate(); }
%%
bison.y:
%{
#include <string>
#include <cstring>
#include <cstdio>
extern FILE *yyin;
extern int yylex();
void yyerror(const char *s) { printf("yyerror: %s\n", s); }
%}
/* Represents the many different ways we can access our data */
%union {
char *literal;
int token;
}
/* union.token: eof, keyword */
%token <token> FT_EOF
%token <token> FT_TRUE FT_FALSE FT_LET FT_NIL FT_IF FT_ELSEIF FT_ELSE FT_FOR FT_WHILE FT_BREAK FT_CONTINUE FT_SWITCH FT_CASE FT_OTHERWISE
%token <token> FT_FUNC FT_CLASS FT_TYPE FT_ISINSTANCE FT_IMPORT FT_RETURN FT_VOID FT_LOGICALAND FT_LOGICALOR FT_LOGICALNOT
%token <token> FT_INTEGER_KEYWORD FT_UNSIGNED_INTEGER_KEYWORD FT_DOUBLE_KEYWORD
/* union.literal, identifier, integer, double number, string */
%token <literal> FT_IDENTIFIER FT_INTEGER FT_DOUBLE FT_STRING
/* union.token: operator, comparator, punctuation */
%token <token> FT_ADD FT_SUB FT_MUL FT_DIV FT_MOD FT_BITNOT FT_BITAND FT_BITOR FT_BITCOMPLEMENT FT_BITXOR
%token <token> FT_ASSIGN FT_ADDASSIGN FT_SUBASSIGN FT_MULASSIGN FT_DIVASSIGN FT_MODASSIGN FT_EQ FT_NEQ FT_LT FT_LE FT_GT FT_GE
%token <token> FT_LPAREN FT_RPAREN FT_LBRACKET FT_RBRACKET FT_LBRACE FT_RBRACE FT_COMMA FT_SEMI FT_QUESTION FT_COLON FT_DOT
/*
%type <ident> ident
%type <expr> numeric expr
%type <varvec> func_decl_args
%type <exprvec> call_args
%type <block> program stmts block
%type <stmt> stmt var_decl func_decl
%type <token> comparison
*/
/* operator/comparator precedence */
%left FT_DOT FT_LPAREN FT_RPAREN FT_LBRACKET FT_RBRACKET
%left FT_MUL FT_DIV FT_MOD
%left FT_ADD FT_SUB
%left FT_LT FT_LE FT_GT FT_GE FT_EQ FT_NEQ
%left FT_BITNOT FT_BITAND FT_BITOR FT_BITXOR FT_BITCOMPLEMENT
%left FT_LOGICALNOT FT_LOGICALAND FT_LOGICALOR
%left FT_ASSIGN FT_ADDASSIGN FT_SUBASSIGN FT_MULASSIGN FT_DIVASSIGN FT_MODASSIGN
/*
%start program
*/
%%
primary_expression : FT_IDENTIFIER
| FT_INTEGER
| FT_DOUBLE
| FT_STRING
| '(' expression ')'
;
postfix_expression : primary_expression
/*| postfix_expression '[' expression ']'*/
| postfix_expression '(' ')'
| postfix_expression '(' argument_expression_list ')'
/*| postfix_expression '.' IDENTIFIER*/
;
argument_expression_list : assignment_expression
| argument_expression_list ',' assignment_expression
;
unary_expression : postfix_expression
| unary_operator postfix_expression
;
unary_operator : FT_BITAND
| FT_BITOR
| FT_BITNOT
| FT_BITCOMPLEMENT
| FT_BITXOR
| FT_ADD
| FT_SUB
;
/*
cast_expression : unary_expression
| '(' type_name ')' cast_expression
;
*/
multiplicative_expression : unary_expression
| multiplicative_expression FT_MUL unary_expression
| multiplicative_expression FT_DIV unary_expression
| multiplicative_expression FT_MOD unary_expression
;
additive_expression : multiplicative_expression
| additive_expression FT_ADD multiplicative_expression
| additive_expression FT_SUB multiplicative_expression
;
/*
shift_expression : additive_expression
| shift_expression '<<' additive_expression
| shift_expression '>>' additive_expression
;
*/
relational_expression : additive_expression
| relational_expression FT_LT additive_expression
| relational_expression FT_LE additive_expression
| relational_expression FT_GT additive_expression
| relational_expression FT_GE additive_expression
;
equality_expression : relational_expression
| equality_expression FT_EQ relational_expression
| equality_expression FT_NEQ relational_expression
;
/*
bit_and_expression : equality_expression
| bitand_expression '&' equality_expression
;
bit_xor_expression : bit_and_expression
| bit_xor_expression '^' bit_and_expression
;
bit_or_expression : bit_xor_expression
| bit_or_expression '|' bit_xor_expression
;
*/
logical_not_expression : equality_expression
| logical_not_expression FT_LOGICALNOT equality_expression
;
logical_and_expression : logical_not_expression
| logical_and_expression FT_LOGICALAND logical_not_expression
;
logical_or_expression : logical_and_expression
| logical_or_expression FT_LOGICALOR logical_and_expression
;
assignment_expression : logical_or_expression
| unary_expression assignment_operator assignment_expression
;
assignment_operator : FT_ASSIGN
| FT_MULASSIGN
| FT_DIVASSIGN
| FT_MODASSIGN
| FT_ADDASSIGN
| FT_SUBASSIGN
;
constant_expression : logical_or_expression
;
expression : assignment_expression
;
unit : external_declaration
| unit external_declaration
;
external_declaration : function_declaration
| declaration
;
declaration : FT_LET declaration_init_list FT_SEMI
;
declaration_init_list : declaration_init
| declaration_init_list declaration_init
;
declaration_init : FT_IDENTIFIER FT_ASSIGN constant_expression
;
function_declaration : FT_FUNC FT_IDENTIFIER FT_LPAREN function_arg_list FT_RPAREN compound_statement
| FT_FUNC FT_IDENTIFIER FT_LPAREN FT_RPAREN compound_statement
;
function_arg_list : function_arg
| function_arg_list FT_COMMA function_arg
;
function_arg : FT_IDENTIFIER
;
compound_statement : FT_LBRACE FT_RBRACE
| FT_LBRACE statement_list FT_RBRACE
| FT_LBRACE declaration_list FT_RBRACE
;
statement_list : statement
| statement_list statement
;
declaration_list : declaration
| declaration_list declaration
;
statement : compound_statement
| expression_statement
| selection_statement
| iteration_statement
| jump_statement
;
expression_statement : FT_SEMI
| expression FT_SEMI
;
selection_statement : FT_IF FT_LPAREN expression FT_RPAREN statement
| FT_IF FT_LPAREN expression FT_RPAREN statement FT_ELSE statement
| FT_SWITCH FT_LPAREN expression FT_RPAREN statement
;
iteration_statement : FT_WHILE FT_LPAREN expression FT_RPAREN statement
| FT_FOR FT_LPAREN expression_statement expression_statement FT_RPAREN statement
| FT_FOR FT_LPAREN expression_statement expression_statement expression FT_RPAREN statement
;
jump_statement : FT_CONTINUE FT_SEMI
| FT_BREAK FT_SEMI
| FT_RETURN FT_SEMI
| FT_RETURN expression FT_SEMI
;
%%
flex_bison.cpp:
#include "bison.tab.hpp"
#include <cstdio>
/*
 * Token dump driver: opens the file named by argv[1], runs the scanner over it
 * and prints every token code, plus the literal text for tokens that carry one.
 * Returns 0 on success, 1 if the file name is missing or the file cannot be
 * opened.
 */
int main(int argc, char **argv) {
  if (argc <= 1) {
    printf("error! filename missing!\n");
    return 1;
  }
  FILE *fp = fopen(argv[1], "r");
  if (!fp) {
    /* Without this check yyin would be NULL and fclose(NULL) would follow. */
    perror(argv[1]);
    return 1;
  }
  yyin = fp;
  int t;
  /* yylex() returns 0 at end of input. */
  while ((t = yylex()) != 0) {
    printf("token: %d", t);
    if (t == FT_IDENTIFIER || t == FT_INTEGER || t == FT_DOUBLE ||
        t == FT_STRING) {
      printf("literal: %s\n", yylval.literal);
    } else {
      printf("\n");
    }
  }
  fclose(fp);
  return 0;
}
generate code through commands:
$ flex -o flex.yy.cpp flex.l
$ bison -d -o bison.tab.hpp bison.y
$ g++ -o test.exe bison.tab.cpp flex.yy.cpp flex_bison.cpp
Here's the error message:
use of undeclared identifier 'yyin'
use of undeclared identifier 'yylex'
Do I have to define yyin and yylex before main function with code below?
extern FILE *yyin;
extern int yylex(void);
Even if the quoted code would be added to the output file, you still would get complaints about undefined yylex and yyin. Your code only declares these things, it does not define them. Neither yacc nor bison define these for you, you have to provide those functions yourself (you can use for example lex or flex to generate them). Take a look at bison documentation. The RPN example has a short example for a yylex function.
Edit after question was edited: as per the documentation of bison, the header generated from the -d option
Pretend that ‘%defines’ was specified, i.e., write an extra output
file containing macro definitions for the token type names defined
in the grammar, as well as a few other declarations.
This does not include the things you specified in the prologue, in particular not the declaration of yyin and yylex. If you need these declarations in multiple files then you may want to declare them in a separate header file and include that header file from bison.y and all other files that require that declaration (like flex_bison.cpp).

Bison error recovery suggestion

I'm trying to integrate error recovery in my grammar. From the bison manual, the simplest error recovery would be skip the current line. But in my flex file, I have no action regarding the newline so the parser would not know about it. So I want the parser to ignore everything until it encounters a semicolon in case of an error.
I have the following grammar:
start : program;
program : program unit
| unit
;
unit : var_declaration
| func_declaration
| func_definition
;
func_declaration : type_specifier ID LPAREN parameter_list RPAREN SEMICOLON
| type_specifier ID LPAREN RPAREN SEMICOLON
;
func_definition : type_specifier ID LPAREN parameter_list RPAREN compound_statement
| type_specifier ID LPAREN RPAREN compound_statement
;
parameter_list : parameter_list COMMA type_specifier ID
| parameter_list COMMA type_specifier
| type_specifier ID
| type_specifier
;
compound_statement : LCURL statements RCURL
| LCURL RCURL
;
var_declaration : type_specifier declaration_list SEMICOLON
;
type_specifier : INT
| FLOAT
| VOID
;
declaration_list : declaration_list COMMA ID
| declaration_list COMMA ID LTHIRD CONST_INT RTHIRD
| ID
| ID LTHIRD CONST_INT RTHIRD
;
statements : statement
| statements statement
;
statement : var_declaration
| expression_statement
| compound_statement
| FOR LPAREN expression_statement expression_statement expression RPAREN statement
| IF LPAREN expression RPAREN statement
| IF LPAREN expression RPAREN statement ELSE statement
| WHILE LPAREN expression RPAREN statement
| PRINTLN LPAREN ID RPAREN SEMICOLON
| RETURN expression SEMICOLON
;
expression_statement : SEMICOLON
| expression SEMICOLON
;
variable : ID
| ID LTHIRD expression RTHIRD
;
expression : logic_expression
| variable ASSIGNOP logic_expression
;
logic_expression : rel_expression
| rel_expression LOGICOP rel_expression
;
rel_expression : simple_expression
| simple_expression RELOP simple_expression
;
simple_expression : term
| simple_expression ADDOP term
;
term : unary_expression
| term MULOP unary_expression
;
unary_expression : ADDOP unary_expression
| NOT unary_expression
| factor
;
factor : variable
| ID LPAREN argument_list RPAREN
| LPAREN expression RPAREN
| CONST_INT
| CONST_FLOAT
| variable INCOP
| variable DECOP
;
argument_list : arguments
|
;
arguments : arguments COMMA logic_expression
| logic_expression
;
I'm currently working on the following input:
int main(){
int a[2],c,i,j ; float c;
a[2.5]=1;
i=2.3
j=2%3.7;
a=4;
func(a);
b=8;
return 0;
}
When the parser encounters i = 2.3, it won't stop parsing but rather continue doing so after reporting a syntax error.
Based on the grammar, where should I put my error production so that the parser can continue parsing without any conflict ? And possibly shed some light on other syntax errors like missing a RPAREN or Curly braces ? How should I approach to add the error production for a given grammar ?

Bison parser doesn't recognize the "New" keyword [closed]

Closed. This question is not reproducible or was caused by typos. It is not currently accepting answers.
This question was caused by a typo or a problem that can no longer be reproduced. While similar questions may be on-topic here, this one was resolved in a way less likely to help future readers.
Closed 4 years ago.
Improve this question
I am trying to build a simple compiler, and I am at the stage of testing the Bison parser I created recently against some sample .decaf files. The parser works well with all keywords, the grammar's terminal and non-terminal tokens/types, and the rest of the grammar rules and actions. There is only one problem: my parser does not recognize the New keyword/operator. Whenever a statement includes the New keyword, it results in an error in the output!
Defining New as a terminal token
%token T_New
CFG grammar rule and action for Expr that also includes rule and action for T_New
Expr : LValue '=' Expr { $$=new AssignExpr($1,new Operator(#2,"="),$3); }
| '(' Expr ')' { $$=$2; }
| Expr '+' Expr { $$=new ArithmeticExpr($1,new Operator(#2,"+"),$3); }
| Expr '-' Expr { $$=new ArithmeticExpr($1,new Operator(#2,"-"),$3); }
| Expr '*' Expr { $$=new ArithmeticExpr($1,new Operator(#2,"*"),$3); }
| Expr '/' Expr { $$=new ArithmeticExpr($1,new Operator(#2,"/"),$3); }
| Expr '%' Expr { $$=new ArithmeticExpr($1,new Operator(#2,"%"),$3); }
| '-' Expr %prec T_UnaryMinus { $$=new ArithmeticExpr(new Operator(#1,"-"),$2); }
| Expr T_And Expr { $$=new LogicalExpr($1,new Operator(#2,"&&"),$3); }
| Expr T_Or Expr { $$=new LogicalExpr($1,new Operator(#2,"||"),$3); }
| Expr '<' Expr { $$=new RelationalExpr($1,new Operator(#2,"<"),$3); }
| Expr T_LessEqual Expr { $$=new RelationalExpr($1,new Operator(#2,"<="),$3); }
| Expr '>' Expr { $$=new RelationalExpr($1,new Operator(#2,">"),$3); }
| Expr T_GreaterEqual Expr { $$=new RelationalExpr($1,new Operator(#2,">="),$3); }
| Expr T_Equal Expr { $$=new EqualityExpr($1,new Operator(#2,"=="),$3); }
| Expr T_NotEqual Expr { $$=new EqualityExpr($1,new Operator(#2,"!="),$3); }
| '!' Expr { $$=new LogicalExpr(new Operator(#1, "!"), $2); }
| T_ReadInteger '(' ')' { $$=new ReadIntegerExpr(#1); }
| T_ReadLine '(' ')' { $$=new ReadLineExpr(#1); }
| T_New Identifier { $$=new NewExpr(#2,new NamedType($2)); }
| T_NewArray '(' Expr ',' Type ')' { $$=new NewArrayExpr(#1,$3,$5); }
| LValue { $$=$1; }
| T_This { $$=new This(#1); }
| Call { $$=$1; }
| Constant { $$=$1; }
;
for example I have this sample file interface.decaf for testing and it has a main function as below:
void main() {
Colorable s;
Color green;
green = New(Color);
green.SetRGB(0, 0, 255);
s = New(Rectangle);
s.SetColor(green);
}
But when I run my parser over this sample file in the terminal I get this error:
*** Error line 33.
green = New(Color);
*** syntax error
I tried with other sample files and noticed that any file that has a statement that mentions 'New' keyword returns the same error.
I got a hint from this question that the New keyword is probably being mixed up between C and C++ and that's why it's not recognized by bison, but I am still not able to figure out how to fix this! Can anyone help, please?
Your grammar has a rule
| T_New Identifier { ...
matching a New keyword followed immediately by an identifier. However, your examples all have parenthesis around the identifier:
green = New(Color)
s = New(Rectangle)
thus the syntax error you are seeing -- the input has a ( where the grammar expects an identifier...

How to solve "syntax error at line 1 "

Hi I've used lex and yacc to create my own programming language syntax (sort of) , but no matter how my grammar rules are put , it gives me syntax error at the same first line.
This is my lex code of regular expressions:
%{
#include <stdio.h>
#include "y.tab.h"
%}
%option noyywrap
punct [.]
virgula [,]
numar [0-9]+
numar2 [0-9]
%%
"&librarie=>" {return INCLUDE;}
"stringuri"|"vectori"|"mape"|"matematica" {return LIBRARII;}
"intreg"|"caracter"|"string"|"natural" {return TIPVAR;}
"real" {return REAL;}
"daca" {return DACA;}
"pentru" {return PENTRU;}
"cat_timp" {return CATTIMP;}
def_variabla_globala {return VARGLOBALE;}
def_variabla_locala {return VARLOCALE;}
structura_obiect {return STRUCTURA;}
"procedura[]" {return PROCEDURA;}
start_program {return START;}
stop_program {return STOP;}
inceput_bloc_if {return INBLOCIF;}
sfarsit_bloc_if {return SFBLOCIF;}
atunci {return ATUNCI;}
altfel {return ALTFEL;}
from {return FROM;}
to {return TO;}
smaller_than {return MAIMIC;}
greater_than {return MAIMARE;}
equal_to {return EGAL;}
different_than {return DIFERIT;}
inceput_bloc_for {return INBLOCFOR;}
sfarsit_bloc_for {return SFBLOCFOR;}
inceput_bloc_cat_timp {return INBLOCCATTIMP;}
sfarsit_bloc_cat_timp {return SFBLOCCATTIMP;}
executa {return EXECUTA;}
suma {return SUM;}
invers {return INV;}
oglindit {return OGL;}
"<-" {return ASIGNARE;}
[a-zA-Z][a-zA-Z0-9]* {return ID;REJECT;}
{numar} {return NUMAR;REJECT;}
[0-9]{punct}{numar} {return NUMARREAL;}
{numar}|({virgula}{numar})* {return VECTASIGN;}
({numar2}{punct}{numar})|({virgula}({numar2}{punct}{numar}))* {return VECTASIGNREAL;}
[a-zA-Z][a-zA-Z ]* {return CUVANT;REJECT;}
afisare {return AFISARE;}
[ \t] ;
\n {yylineno++;}
. {return yytext[0];}
%%
I used REJECT at those 2 regex , because it gave me a warning , several of my rules were having conflicts with each other.
My grammar rules :
%{
#include <stdio.h>
extern FILE* yyin;
extern char* yytext;
extern int yylineno;
%}
%token INCLUDE LIBRARII ID TIPVAR CUVCHEIE REAL NUMARREAL FROM TO VECTASIGNREAL VARGLOBALE VARLOCALE VECTASIGN STRUCTURA PROCEDURA START STOP DACA PENTRU CATTIMP INBLOCIF SFBLOCIF ATUNCI ALTFEL MAIMIC MAIMARE EGAL DIFERIT INBLOCFOR SFBLOCFOR INBLOCCATTIMP SFBLOCCATTIMP EXECUTA SUM INV OGL ASIGNARE NUMAR CUVANT AFISARE
%start progr
%left '+' '-'
%left '*' '/'
%%
progr: headere declaratii program {printf("correct syntax");}
;
headere : header
|headere header
;
header : INCLUDE LIBRARII
;
declaratii : declaratie ';'
| declaratii declaratie ';'
;
declaratie : VARGLOBALE TIPVAR ID
| VARGLOBALE TIPVAR ID '[' NUMAR ']'
| VARGLOBALE TIPVAR ID ASIGNARE NUMAR
| VARGLOBALE TIPVAR ID ASIGNARE '#' CUVANT '#'
| VARGLOBALE TIPVAR ID '[' NUMAR ']' ASIGNARE '[' VECTASIGN ']'
| VARGLOBALE REAL ID
| VARGLOBALE REAL ID '[' NUMAR ']'
| VARGLOBALE REAL ID ASIGNARE NUMARREAL
| VARGLOBALE REAL ID '[' NUMAR ']' ASIGNARE '[' VECTASIGNREAL ']'
;
program : PROCEDURA START bloc STOP
;
bloc : declaratiile instructiuni
;
declaratiile : declaratia ';'
| declaratiile declaratia ';'
;
declaratia : VARLOCALE TIPVAR ID
| VARLOCALE TIPVAR ID '[' NUMAR ']'
| VARLOCALE TIPVAR ID ASIGNARE NUMAR
| VARLOCALE TIPVAR ID ASIGNARE '#' CUVANT '#'
| VARLOCALE TIPVAR ID '[' NUMAR ']' ASIGNARE '[' VECTASIGN ']'
| VARLOCALE REAL ID
| VARLOCALE REAL ID '[' NUMAR ']'
| VARLOCALE REAL ID ASIGNARE NUMARREAL
| VARLOCALE REAL ID '[' NUMAR ']' ASIGNARE '[' VECTASIGNREAL ']'
;
instructiuni : instructiune ';'
| instructiuni instructiune ';'
;
instructiune : instructiune_simpla ';'
| instructiune_compusa ';'
;
instructiune_simpla : ID ASIGNARE expresie ';'
;
expresie : expresie '+' expresie
| expresie '-' expresie
| expresie '*' expresie
| expresie '/' expresie
| functie
| NUMAR
| ID
;
functie : INV '(' expresie ')'
| SUM '(' expresie ',' expresie ',' expresie ',' expresie ',' expresie ')'
| OGL '(' expresie ')'
| AFISARE '(' NUMAR ')'
| AFISARE '(' ID ')'
| AFISARE '(' CUVANT ')'
;
instructiune_compusa : DACA conditie ATUNCI
INBLOCIF
instructiuni
SFBLOCIF
ALTFEL
INBLOCIF
instructiuni
SFBLOCIF
|PENTRU ID FROM NUMAR TO NUMAR EXECUTA
INBLOCFOR
instructiuni
SFBLOCFOR
|CATTIMP ID conditie NUMAR
INBLOCCATTIMP
instructiuni
SFBLOCCATTIMP
;
conditie : MAIMARE
| MAIMIC
| EGAL
;
%%
/* Error callback called by the generated parser: prints the message together
   with the scanner's current line number. Always returns 0. */
int yyerror(char * s){
printf("error: %s line:%d\n",s,yylineno);
return 0; /* the function is declared int, so it must return a value */
}
/* Parses the file named by argv[1].
   Returns yyparse()'s status (0 on success), or 1 when the file name is
   missing or the file cannot be opened. */
int main(int argc, char** argv){
if (argc < 2) {
fprintf(stderr, "usage: %s <source-file>\n", argc > 0 ? argv[0] : "parser");
return 1;
}
yyin=fopen(argv[1],"r");
if (yyin == NULL) {
/* Do not hand a NULL stream to the scanner. */
perror(argv[1]);
return 1;
}
int status = yyparse();
fclose(yyin);
return status;
}
And this is the file that I test with , and it should return that the program has a correct syntax.
&librarie=>stringuri
&librarie=>vectori
def_variabila_globala intreg var1<-23;
def_variabila_globala natural vect[50]<-{1,3,51,2,421,12,43};
def_variabila_globala real a<-12.5;
def_variabila_globala caracter ch<-#x#;
def_variabila_globala string s<-#alabala portocala#;
structura_obiect persoana
~
real inaltime;
natural varsta;
~
procedura[]
start_program
def_variabila_locala intreg negativ,pozitiv,s,contor1,contor2<-5;
def_variabila_locala adunari_scaderi;
def_variabila_locala inmultiri_impartiri;
def_variabila_locala ad_scd_inm_imp;
persoana p1;
p1#inaltime<-1.82;
p1#varsta<-20;
adunari_scaderi<-243-12+43-12+11+31-124;
afisare<adunari_scaderi>;
inmultiri_impartiri<-3*4/2/2*3/9;
afisare<inmultiri_impartiri>;
ad_scd_inm_imp<-4*2-3+5/2*5;
afisare<ad_scd_inm_imp>;
negativ<-invers<2>;
afisare<negativ>;
daca(invers<inmultiri_impartiri> smaller_than 0)
antunci
inceput_bloc_if
pentru contor1 from 0 to 10 executa
inceput_bloc_for
afiseaza<#for1#>;
afiseaza<#for2#>;
sfarsit_bloc_for
sfarsit_bloc_if
altfel
inceput_bloc_if
afiseaza<#if#>;
sfarsit_bloc_if
cat_timp contor2 greater_than 0
inceput_bloc_cat_timp
afisare<#cattimp#>;
contor<-contor-1;
sfarsit_bloc_cat_timp
pozitiv<-invers<-2>;
afisare<pozitiv>;
var1<-oglindit<321>;
afisare<var1>;
s<-suma<1,3,12,31,oglindit<123-2>>;
afisare<s>;
afisare<#ProgramTerminat#>;
stop_program
I don't know where it could be wrong , is because of the lexical rules , are they interfering with each other?
Thank you.
EDIT
error message : error: syntax error at line:1
The exact problem is that the parser sees a non valid rule at my "&librarie=>stringuri" declaration , in my file. (line 1)