I'm playing around with Flex, BISON and LLVM, creating my own programming language to understand how compilers work better. I've got a basic parser working and a syntax going off of this series of blog posts http://gnuu.org/2009/09/18/writing-your-own-toy-compiler/
However, I can't figure out how to extend it to include strings, such as string myVar = "testing 123".
Here's my list of tokens...
[ \t\n] ;
"extern" return TOKEN(TEXTERN);
"return" return TOKEN(TRETURN);
[a-zA-Z_][a-zA-Z0-9_]* SAVE_TOKEN; return TIDENTIFIER;
[0-9]+\.[0-9]* SAVE_TOKEN; return TDOUBLE;
[0-9]+ SAVE_TOKEN; return TINTEGER;
\"[^\n"]+\" SAVE_TOKEN; return TSTRING;
"=" return TOKEN(TEQUAL);
"==" return TOKEN(TCEQ);
"!=" return TOKEN(TCNE);
"<" return TOKEN(TCLT);
"<=" return TOKEN(TCLE);
">" return TOKEN(TCGT);
">=" return TOKEN(TCGE);
"(" return TOKEN(TLPAREN);
")" return TOKEN(TRPAREN);
"{" return TOKEN(TLBRACE);
"}" return TOKEN(TRBRACE);
"=>" return TOKEN(TCLO);
"co" return TOKEN(TCO);
"const" return TOKEN(TCONST);
"let" return TOKEN(TLET);
"." return TOKEN(TDOT);
"," return TOKEN(TCOMMA);
"+" return TOKEN(TPLUS);
"-" return TOKEN(TMINUS);
"*" return TOKEN(TMUL);
"/" return TOKEN(TDIV);
. printf("Unknown token!\n"); yyterminate();
Here's my parser...
%{
#include "node.h"
#include <cstdio>
#include <cstdlib>
NBlock *programBlock; /* the top level root node of our final AST */
extern int yylex();
void yyerror(const char *s) { std::printf("Error: %s\n", s);std::exit(1); }
%}
/* Represents the many different ways we can access our data */
%union {
Node *node;
NBlock *block;
NExpression *expr;
NStatement *stmt;
NIdentifier *ident;
NVariableDeclaration *var_decl;
std::vector<NVariableDeclaration*> *varvec;
std::vector<NExpression*> *exprvec;
std::string *string;
int token;
}
/* Define our terminal symbols (tokens). This should
match our tokens.l lex file. We also define the node type
they represent.
*/
%token <string> TIDENTIFIER TINTEGER TDOUBLE TSTRING
%token <token> TCEQ TCNE TCLT TCLE TCGT TCGE TEQUAL
%token <token> TLPAREN TRPAREN TLBRACE TRBRACE TCOMMA TDOT TCLO TCO TCONST TLET
%token <token> TPLUS TMINUS TMUL TDIV
%token <token> TRETURN TEXTERN
/* Define the type of node our nonterminal symbols represent.
The types refer to the %union declaration above. Ex: when
we call an ident (defined by union type ident) we are really
calling an (NIdentifier*). It makes the compiler happy.
*/
%type <ident> ident
%type <expr> numeric expr string
%type <varvec> func_decl_args
%type <exprvec> call_args
%type <block> program stmts block
%type <stmt> stmt var_decl func_decl extern_decl const_func_decl let_func_decl
%type <token> comparison
/* Operator precedence for mathematical operators */
%left TPLUS TMINUS
%left TMUL TDIV
%start program
%%
program : stmts { programBlock = $1; }
;
stmts : stmt { $$ = new NBlock(); $$->statements.push_back($<stmt>1); }
| stmts stmt { $1->statements.push_back($<stmt>2); }
;
stmt : var_decl | func_decl | extern_decl | const_func_decl | let_func_decl
| expr { $$ = new NExpressionStatement(*$1); }
| TRETURN expr { $$ = new NReturnStatement(*$2); }
;
block : TLBRACE stmts TRBRACE { $$ = $2; }
| TLBRACE TRBRACE { $$ = new NBlock(); }
;
var_decl : ident ident { $$ = new NVariableDeclaration(*$1, *$2); }
| ident ident TEQUAL expr { $$ = new NVariableDeclaration(*$1, *$2, $4); }
;
extern_decl : TEXTERN ident ident TLPAREN func_decl_args TRPAREN
{ $$ = new NExternDeclaration(*$2, *$3, *$5); delete $5; }
;
const_func_decl : TCONST ident ident TEQUAL TLPAREN func_decl_args TRPAREN TCLO block
{ $$ = new NFunctionDeclaration(*$2, *$3, *$6, *$9); delete $6; }
;
let_func_decl : TLET ident ident TEQUAL TLPAREN func_decl_args TRPAREN TCLO block
{ $$ = new NFunctionDeclaration(*$2, *$3, *$6, *$9); delete $6; }
;
func_decl : ident ident TEQUAL TLPAREN func_decl_args TRPAREN TCLO block
{ $$ = new NFunctionDeclaration(*$1, *$2, *$5, *$8); delete $5; }
;
func_decl_args : /*blank*/ { $$ = new VariableList(); }
| var_decl { $$ = new VariableList(); $$->push_back($<var_decl>1); }
| func_decl_args TCOMMA var_decl { $1->push_back($<var_decl>3); }
;
ident : TIDENTIFIER { $$ = new NIdentifier(*$1); delete $1; }
;
string : TSTRING { $$ = new NString($1->c_str()); delete $1; }
;
numeric : TINTEGER { $$ = new NInteger(atol($1->c_str())); delete $1; }
| TDOUBLE { $$ = new NDouble(atof($1->c_str())); delete $1; }
;
expr : ident TEQUAL expr { $$ = new NAssignment(*$<ident>1, *$3); }
| ident TLPAREN call_args TRPAREN { $$ = new NMethodCall(*$1, *$3); delete $3; }
| ident { $<ident>$ = $1; }
| numeric
| expr TMUL expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
| expr TDIV expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
| expr TPLUS expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
| expr TMINUS expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
| expr comparison expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
| TLPAREN expr TRPAREN { $$ = $2; }
;
call_args : /*blank*/ { $$ = new ExpressionList(); }
| expr { $$ = new ExpressionList(); $$->push_back($1); }
| call_args TCOMMA expr { $1->push_back($3); }
;
comparison : TCEQ | TCNE | TCLT | TCLE | TCGT | TCGE;
%%
Finally, here's my code generation C++ code for strings...
Value* NString::codeGen(CodeGenContext& context)
{
// Generate the type for the global var
ArrayType* ArrayTy_0 = ArrayType::get(IntegerType::get(getGlobalContext(), 8), value.size() +1 );
// create global var which holds the constant string.
GlobalVariable* gvar_array__str = new GlobalVariable(*context.module,
/*Type=*/ArrayTy_0,
/*isConstant=*/true,
GlobalValue::PrivateLinkage,
/*Initializer=*/0, // has initializer, specified below
".str");
gvar_array__str->setAlignment(1);
// create the contents for the string global.
Constant* const_array_str = ConstantDataArray::getString(getGlobalContext(), value);
// Initialize the global with the string
gvar_array__str->setInitializer(const_array_str);
// generate access pointer to the string
std::vector<Constant*> const_ptr_8_indices;
ConstantInt* const_int = ConstantInt::get(getGlobalContext(), APInt(64, StringRef("0"), 10));
const_ptr_8_indices.push_back(const_int);
const_ptr_8_indices.push_back(const_int);
Constant* const_ptr_8 = ConstantExpr::getGetElementPtr(ArrayTy_0, gvar_array__str, const_ptr_8_indices);
return const_ptr_8;
}
In my own syntax, when I run... int myInt = 123 that works fine, however, string myString = "123" that triggers a syntax error.
Related
Hi I have a scenario where bison will successfully parse my input if there is a space separating a grammar...
Here is the situation: I am attempting to declare a variable:
int a = 31 ;
This yyin parses successfully
int a = 31;
Does not parse successfully
The error I receive is:
syntax error, unexpected $end, expecting TSEMI
Here is the section of the bison code
%token <string> TIDENTIFIER TINTEGER TDOUBLE
%token <token> TCEQUAL TCNE TCLT TCLE TCGT TCGE TASSIGN
%token <token> TLPAREN TRPAREN TLBRACE TRBRACE TCOMMA TDOT TSEMI
%token <token> TPLUS TMINUS TMUL TDIV
...
var_decl : ident ident TSEMI { $$ = new VarDel($1, $2); }
| ident ident TASSIGN expr TSEMI {$$ = new VarDel($1, $2, $4);}
;
ident : TIDENTIFIER { $$ = new Var($1->c_str()); delete $1; }
;
expr : ident { $<ident>$ = $1; }
| numeric
;
numeric : TINTEGER { $$ = new Num(atol($1->c_str())); delete $1; }
| TDOUBLE { $$ = new Num(atof($1->c_str())); delete $1; }
;
And here is a section of my flex file
[ \t\n] ;
[a-zA-Z_][a-zA-Z0-9_]* SAVE_TOKEN; return TIDENTIFIER;
[0-9]+.[0-9]* SAVE_TOKEN; return TDOUBLE;
[0-9]+ SAVE_TOKEN; return TINTEGER;
"=" return TOKEN(TASSIGN);
"==" return TOKEN(TCEQUAL);
"!=" return TOKEN(TCNE);
"<" return TOKEN(TCLT);
"<=" return TOKEN(TCLE);
">" return TOKEN(TCGT);
">=" return TOKEN(TCGE);
"(" return TOKEN(TLPAREN);
")" return TOKEN(TRPAREN);
"{" return TOKEN(TLBRACE);
"}" return TOKEN(TRBRACE);
"." return TOKEN(TDOT);
"," return TOKEN(TCOMMA);
"+" return TOKEN(TPLUS);
"-" return TOKEN(TMINUS);
";" return TOKEN(TSEMI);
"*" return TOKEN(TMUL);
"/" return TOKEN(TDIV);
. printf("Unknown token!n"); yyterminate();
Why is it parsing successfully when there is a space but not when there is one?
Thanks
[0-9]+.[0-9]* should be [0-9]+\.[0-9]*. As written it matches 31;.
You would do well to enable flex debugging (the -d command-line flag) to see how it tokenises. Also, using atof silently hides the fact that the token is not a valid number. Consider using a safer string→number converter; you'll find one in the C++ standard library; in C, it would be strtod followed by a check that endptr is at the the end. (And you could do this conversion in the lexer, avoiding the unnecessary allocation and deallocation of a string.)
Here's my simple project source code:
bison.y
flex.l
flex_bison.cpp
flex.l:
%option noyywrap
%{
#include <string>
#include <cstring>
#include "bison.tab.hpp"
#define FT_SAVE_TOKEN yylval.literal = strndup(yytext, yyleng)
#define FT_TOKEN(t) (yylval.token = t)
%}
%%
"True" return FT_TRUE;
"False" return FT_FALSE;
"let" return FT_LET;
"Nil" return FT_NIL;
"if" return FT_IF;
"elseif" return FT_ELSEIF;
"else" return FT_ELSE;
"switch" return FT_SWITCH;
"case" return FT_CASE;
"otherwise" return FT_OTHERWISE;
"for" return FT_FOR;
"while" return FT_WHILE;
"break" return FT_BREAK;
"continue" return FT_CONTINUE;
"func" return FT_FUNC;
"class" return FT_CLASS;
"type" return FT_TYPE;
"isinstance" return FT_ISINSTANCE;
"import" return FT_IMPORT;
"return" return FT_RETURN;
"void" return FT_VOID;
"and" return FT_LOGICALAND;
"or" return FT_LOGICALOR;
"not" return FT_LOGICALNOT;
"int" return FT_INTEGER_KEYWORD;
"uint" return FT_UNSIGNED_INTEGER_KEYWORD;
"double" return FT_DOUBLE_KEYWORD;
[ \t\v\n\f\r] ;
[a-zA-Z_][a-zA-Z0-9_]* FT_SAVE_TOKEN; return FT_IDENTIFIER;
[0-9]+"."[0-9]+([Ee][+-]?[0-9]+)? FT_SAVE_TOKEN; return FT_DOUBLE;
[0-9]+([Ee][+-]?[0-9]+)? FT_SAVE_TOKEN; return FT_INTEGER;
\"(\\.|[^\\"])*\" FT_SAVE_TOKEN; return FT_STRING;
"+" return FT_TOKEN(FT_ADD);
"-" return FT_TOKEN(FT_SUB);
"*" return FT_TOKEN(FT_MUL);
"/" return FT_TOKEN(FT_DIV);
"%" return FT_TOKEN(FT_MOD);
"!" return FT_TOKEN(FT_BITNOT);
"&" return FT_TOKEN(FT_BITAND);
"|" return FT_TOKEN(FT_BITOR);
"~" return FT_TOKEN(FT_BITCOMPLEMENT);
"^" return FT_TOKEN(FT_BITXOR);
"=" return FT_TOKEN(FT_ASSIGN);
"+=" return FT_TOKEN(FT_ADDASSIGN);
"-=" return FT_TOKEN(FT_SUBASSIGN);
"*=" return FT_TOKEN(FT_MULASSIGN);
"/=" return FT_TOKEN(FT_DIVASSIGN);
"%=" return FT_TOKEN(FT_MODASSIGN);
"==" return FT_TOKEN(FT_EQ);
"!=" return FT_TOKEN(FT_NEQ);
"<" return FT_TOKEN(FT_LT);
"<=" return FT_TOKEN(FT_LE);
">" return FT_TOKEN(FT_GT);
">=" return FT_TOKEN(FT_GE);
"(" return FT_TOKEN(FT_LPAREN);
")" return FT_TOKEN(FT_RPAREN);
"[" return FT_TOKEN(FT_LBRACKET);
"]" return FT_TOKEN(FT_RBRACKET);
"{" return FT_TOKEN(FT_LBRACE);
"}" return FT_TOKEN(FT_RBRACE);
"," return FT_TOKEN(FT_COMMA);
";" return FT_TOKEN(FT_SEMI);
"?" return FT_TOKEN(FT_QUESTION);
":" return FT_TOKEN(FT_COLON);
"." return FT_TOKEN(FT_DOT);
. printf("Unknown token!n"); yyterminate();
%%
bison.y:
%{
#include <string>
#include <cstring>
#include <cstdio>
extern FILE *yyin;
extern int yylex();
void yyerror(const char *s) { printf("yyerror: %s\n", s); }
%}
/* Represents the many different ways we can access our data */
%union {
char *literal;
int token;
}
/* union.token: eof, keyword */
%token <token> FT_EOF
%token <token> FT_TRUE FT_FALSE FT_LET FT_NIL FT_IF FT_ELSEIF FT_ELSE FT_FOR FT_WHILE FT_BREAK FT_CONTINUE FT_SWITCH FT_CASE FT_OTHERWISE
%token <token> FT_FUNC FT_CLASS FT_TYPE FT_ISINSTANCE FT_IMPORT FT_RETURN FT_VOID FT_LOGICALAND FT_LOGICALOR FT_LOGICALNOT
%token <token> FT_INTEGER_KEYWORD FT_UNSIGNED_INTEGER_KEYWORD FT_DOUBLE_KEYWORD
/* union.literal, identifier, integer, double number, string */
%token <literal> FT_IDENTIFIER FT_INTEGER FT_DOUBLE FT_STRING
/* union.token: operator, comparator, punctuation */
%token <token> FT_ADD FT_SUB FT_MUL FT_DIV FT_MOD FT_BITNOT FT_BITAND FT_BITOR FT_BITCOMPLEMENT FT_BITXOR
%token <token> FT_ASSIGN FT_ADDASSIGN FT_SUBASSIGN FT_MULASSIGN FT_DIVASSIGN FT_MODASSIGN FT_EQ FT_NEQ FT_LT FT_LE FT_GT FT_GE
%token <token> FT_LPAREN FT_RPAREN FT_LBRACKET FT_RBRACKET FT_LBRACE FT_RBRACE FT_COMMA FT_SEMI FT_QUESTION FT_COLON FT_DOT
/*
%type <ident> ident
%type <expr> numeric expr
%type <varvec> func_decl_args
%type <exprvec> call_args
%type <block> program stmts block
%type <stmt> stmt var_decl func_decl
%type <token> comparison
*/
/* operator/comparator precedence */
%left FT_DOT FT_LPAREN FT_RPAREN FT_LBRACKET FT_RBRACKET
%left FT_MUL FT_DIV FT_MOD
%left FT_ADD FT_SUB
%left FT_LT FT_LE FT_GT FT_GE FT_EQ FT_NEQ
%left FT_BITNOT FT_BITAND FT_BITOR FT_BITXOR FT_BITCOMPLEMENT
%left FT_LOGICALNOT FT_LOGICALAND FT_LOGICALOR
%left FT_ASSIGN FT_ADDASSIGN FT_SUBASSIGN FT_MULASSIGN FT_DIVASSIGN FT_MODASSIGN
/*
%start program
*/
%%
primary_expression : FT_IDENTIFIER
| FT_INTEGER
| FT_DOUBLE
| FT_STRING
| '(' expression ')'
;
postfix_expression : primary_expression
/*| postfix_expression '[' expression ']'*/
| postfix_expression '(' ')'
| postfix_expression '(' argument_expression_list ')'
/*| postfix_expression '.' IDENTIFIER*/
;
argument_expression_list : assignment_expression
| argument_expression_list ',' assignment_expression
;
unary_expression : postfix_expression
| unary_operator postfix_expression
;
unary_operator : FT_BITAND
| FT_BITOR
| FT_BITNOT
| FT_BITCOMPLEMENT
| FT_BITXOR
| FT_ADD
| FT_SUB
;
/*
cast_expression : unary_expression
| '(' type_name ')' cast_expression
;
*/
multiplicative_expression : unary_expression
| multiplicative_expression FT_MUL unary_expression
| multiplicative_expression FT_DIV unary_expression
| multiplicative_expression FT_MOD unary_expression
;
additive_expression : multiplicative_expression
| additive_expression FT_ADD multiplicative_expression
| additive_expression FT_SUB multiplicative_expression
;
/*
shift_expression : additive_expression
| shift_expression '<<' additive_expression
| shift_expression '>>' additive_expression
;
*/
relational_expression : additive_expression
| relational_expression FT_LT additive_expression
| relational_expression FT_LE additive_expression
| relational_expression FT_GT additive_expression
| relational_expression FT_GE additive_expression
;
equality_expression : relational_expression
| equality_expression FT_EQ relational_expression
| equality_expression FT_NEQ relational_expression
;
/*
bit_and_expression : equality_expression
| bitand_expression '&' equality_expression
;
bit_xor_expression : bit_and_expression
| bit_xor_expression '^' bit_and_expression
;
bit_or_expression : bit_xor_expression
| bit_or_expression '|' bit_xor_expression
;
*/
logical_not_expression : equality_expression
| logical_not_expression FT_LOGICALNOT equality_expression
;
logical_and_expression : logical_not_expression
| logical_and_expression FT_LOGICALAND logical_not_expression
;
logical_or_expression : logical_and_expression
| logical_or_expression FT_LOGICALOR logical_and_expression
;
assignment_expression : logical_or_expression
| unary_expression assignment_operator assignment_expression
;
assignment_operator : FT_ASSIGN
| FT_MULASSIGN
| FT_DIVASSIGN
| FT_MODASSIGN
| FT_ADDASSIGN
| FT_SUBASSIGN
;
constant_expression : logical_or_expression
;
expression : assignment_expression
;
unit : external_declaration
| unit external_declaration
;
external_declaration : function_declaration
| declaration
;
declaration : FT_LET declaration_init_list FT_SEMI
;
declaration_init_list : declaration_init
| declaration_init_list declaration_init
;
declaration_init : FT_IDENTIFIER FT_ASSIGN constant_expression
;
function_declaration : FT_FUNC FT_IDENTIFIER FT_LPAREN function_arg_list FT_RPAREN compound_statement
| FT_FUNC FT_IDENTIFIER FT_LPAREN FT_RPAREN compound_statement
;
function_arg_list : function_arg
| function_arg_list FT_COMMA function_arg
;
function_arg : FT_IDENTIFIER
;
compound_statement : FT_LBRACE FT_RBRACE
| FT_LBRACE statement_list FT_RBRACE
| FT_LBRACE declaration_list FT_RBRACE
;
statement_list : statement
| statement_list statement
;
declaration_list : declaration
| declaration_list declaration
;
statement : compound_statement
| expression_statement
| selection_statement
| iteration_statement
| jump_statement
;
expression_statement : FT_SEMI
| expression FT_SEMI
;
selection_statement : FT_IF FT_LPAREN expression FT_RPAREN statement
| FT_IF FT_LPAREN expression FT_RPAREN statement FT_ELSE statement
| FT_SWITCH FT_LPAREN expression FT_RPAREN statement
;
iteration_statement : FT_WHILE FT_LPAREN expression FT_RPAREN statement
| FT_FOR FT_LPAREN expression_statement expression_statement FT_RPAREN statement
| FT_FOR FT_LPAREN expression_statement expression_statement expression FT_RPAREN statement
;
jump_statement : FT_CONTINUE FT_SEMI
| FT_BREAK FT_SEMI
| FT_RETURN FT_SEMI
| FT_RETURN expression FT_SEMI
;
%%
flex_bison.cpp:
#include "bison.tab.hpp"
#include <cstdio>
int main(int argc, char **argv) {
if (argc <= 1) {
printf("error! filename missing!\n");
return 0;
}
FILE *fp = fopen(argv[1], "r");
yyin = fp;
int t;
while ((t = yylex()) != 0) {
printf("token: %d", t);
if (t == FT_IDENTIFIER || t == FT_INTEGER || t == FT_DOUBLE ||
t == FT_STRING) {
printf("literal: %s\n", yylval.literal);
} else {
printf("\n");
}
}
fclose(fp);
return 0;
}
generate code through commands:
$ flex -o flex.yy.cpp flex.l
$ bison -d -o bison.tab.hpp bison.y
$ g++ -o test.exe bison.tab.cpp flex.yy.cpp flex_bison.cpp
Here's the error message:
use of undeclared identifier 'yyin'
use of undeclared identifier 'yylex`
Do I have to define yyin and yylex before main function with code below?
extern FILE *yyin;
extern int yylex(void);
Even if the quoted code would be added to the output file, you still would get complaints about undefined yylex and yyin. Your code only declares these things, it does not define them. Neither yacc nor bison define these for you, you have to provide those functions yourself (you can use for example lex or flex to generate them). Take a look at bison documentation. The RPN example has a short example for a yylex function.
Edit after question was edited: as per the documentation of bison, the header generated from the -d option
Pretend that ‘%defines’ was specified, i.e., write an extra output
file containing macro definitions for the token type names defined
in the grammar, as well as a few other declarations.
This does not include the things you specified in the prologue, in particular not the declaration of yyin and yylex. If you need these declarations in multiple files then you may want to declare them in a separate header file and include that header file from bison.y and all other files that require that declaration (like flex_bison.cpp).
I have to implement a parser of expression tree (like "(a > b) AND (c <= d)") using flex and bison, but I fails to solve type errors...
The fatal errors occur during the g++ compilation of the parser.y.c file (which is generated by this command : "bison -o parser.y.c -d parser.y") :
parser.y:54:36: erreur: request for member ‘nodeVal’ in ‘*(yyvsp + -8u)’, which is of pointer type ‘Node*’ (maybe you meant to use ‘->’ ?)
parser.y:58:14: erreur: request for member ‘nodeVal’ in ‘yyval’, which is of pointer type ‘Node*’ (maybe you meant to use ‘->’ ?)
parser.y:58:55: erreur: request for member ‘strVal’ in ‘*(yyvsp + -16u)’, which is of pointer type ‘Node*’ (maybe you meant to use ‘->’ ?)
parser.y:58:82: erreur: request for member ‘strVal’ in ‘* yyvsp’, which is of pointer type ‘Node*’ (maybe you meant to use ‘->’ ?)
parser.y:59:14: erreur: request for member ‘nodeVal’ in ‘yyval’, which is of pointer type ‘Node*’ (maybe you meant to use ‘->’ ?)
parser.y:59:55: erreur: request for member ‘strVal’ in ‘*(yyvsp + -16u)’, which is of pointer type ‘Node*’ (maybe you meant to use ‘->’ ?)
There is also a warning that I don't understand :
parser.lex:35: warning, la règle ne peut être pairée [english : "rule cannot be matched"]
I hope someone can help me !
Here, the parser.y file :
%{
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <iostream>
#include "Node.h"
#include "parser.lex.h"
#define YYSTYPE Node*
int yyerror(char *s) {
printf("%s\n",s);
}
extern "C++"
{
int yyparse(void);
int yylex(void);
Node * rootNode;
}
%}
%union {
Node * nodeVal;
char * strVal;
}
%token <strVal> IDENT
%token <strVal> LT GT LE GE EQ NE
%token <strVal> AND OR
%token <strVal> LEFT_PARENTHESIS RIGHT_PARENTHESIS
%token FIN
%left LT GT LE GE EQ NE
%left AND OR
%type<nodeVal> Expression
%start Input
%%
Input:
/* Vide */
| Input Ligne
;
Ligne:
FIN
| Expression FIN { rootNode = $1; }
;
Expression:
IDENT LT IDENT { $$=new Node("<", $1, $3); }
| IDENT GT IDENT { $$=new Node(">", $1, $3); }
| IDENT LE IDENT { $$=new Node("<=", $1, $3); }
| IDENT GE IDENT { $$=new Node(">=", $1, $3); }
| IDENT EQ IDENT { $$=new Node("=", $1, $3); }
| IDENT NE IDENT { $$=new Node("!=", $1, $3); }
| Expression AND Expression { $$=new Node("AND", $1, $3); }
| Expression OR Expression { $$=new Node("OR", $1, $3); }
| LEFT_PARENTHESIS Expression RIGHT_PARENTHESIS { $$=$2; }
;
%%
void parse_string(const std::string & str)
{
yy_scan_string(str.c_str());
yyparse();
}
then the parser.lex file :
%{
#define YYSTYPE Node*
#include <cstdlib>
#include "BooleanNode.h"
#include "AttributeNode.h"
#include "parser.y.h"
extern "C++"
{
int yylex(void);
}
%}
%option noyywrap
blancs [ \t]+
ident [a-zA-Z_]{1}[a-zA-Z0-9_]*
%%
{ident} { return(IDENT); }
"<" return(LT);
">" return(GT);
"<=" return(LE);
">=" return(GE);
"=" return(EQ);
"!=" return(NE);
"AND" return(AND);
"OR" return(OR);
"(" return(LEFT_PARENTHESIS);
")" return(RIGHT_PARENTHESIS);
"\n" return(FIN);
and finally the Node.h file :
#ifndef _NODE_H_
#define _NODE_H_
#include <string>
#include <iostream>
class Node
{
public:
enum E_op
{
AND = 0,
OR,
LT,
GT,
LE,
GE,
EQ,
NE
};
Node(const std::string & op)
{
_op = op;
}
Node(const std::string & op, const std::string & left, const std::string & right)
{
_op = op;
}
Node(const std::string & op, Node * left, Node * right)
{
_op = op;
}
virtual ~Node()
{
}
virtual void print() {}
protected:
std::string _op;
};
#endif
UPDATE
Thanks to Jonathan Leffler and some others corrections (char* instead of std::string in %union), the compilation goes well but the result is not what I expected.
With the "foo < bar" expression, the "IDENT LT IDENT" directive is executed but the value of $1 and $3 is NULL...
** NEW UPDATE **
I corrected the error by splitting the Expression directive :
Expression:
id LT id { $$ = new Node("<", $1, $3); }
| id GT id { $$ = new Node(">", $1, $3); }
| id LE id { $$ = new Node("<=", $1, $3); }
| id GE id { $$ = new Node(">=", $1, $3); }
| id EQ id { $$ = new Node("=", $1, $3); }
| id NE id { $$ = new Node("!=", $1, $3); }
| Expression AND Expression { $$ = new Node("AND", $1, $3); }
| Expression OR Expression { $$ = new Node("OR", $1, $3); }
| LEFT_PARENTHESIS Expression RIGHT_PARENTHESIS { $$ = $2; }
;
id:
IDENT { $$ = strdup(yytext); }
The problem is that you've declared that the Yacc stack contains Node * elements via the #define YYSTYPE Node * define, but your %union, %token and %type declarations say that there are StrVal and NodeVal types within the union.
IIRC, you only use YYSTYPE when you do not use %union. Removing that line should resolve the other problems.
Is there a way to specify that a Bison rule should NOT match if the lookahead token is a given value?
I currently have the following Bison grammar (simplified):
var_decl:
type ident
{
$$ = new NVariableDeclaration(*$1, *$2);
} |
type ident ASSIGN_EQUAL expr
{
$$ = new NVariableDeclaration(*$1, *$2, $4);
} |
type CURVED_OPEN STAR ident CURVED_CLOSE CURVED_OPEN func_decl_args CURVED_CLOSE
{
$$ = new NVariableDeclaration(*(new NFunctionPointerType(*$1, *$7)) /* TODO: free this memory */, *$4);
} |
type CURVED_OPEN STAR ident CURVED_CLOSE CURVED_OPEN func_decl_args CURVED_CLOSE ASSIGN_EQUAL expr
{
$$ = new NVariableDeclaration(*(new NFunctionPointerType(*$1, *$7)) /* TODO: free this memory */, *$4, $10);
} ;
...
deref:
STAR ident
{
$$ = new NDereferenceOperator(*$<ident>2);
} |
...
type:
ident
{
$$ = new NType($<type>1->name, 0, false);
delete $1;
} |
... ;
...
expr:
deref
{
$$ = $1;
} |
...
ident
{
$<ident>$ = $1;
} |
...
ident CURVED_OPEN call_args CURVED_CLOSE
{
$$ = new NMethodCall(*$1, *$3);
delete $3;
} |
...
CURVED_OPEN expr CURVED_CLOSE
{
$$ = $2;
} ;
...
call_args:
/* empty */
{
$$ = new ExpressionList();
} |
expr
{
$$ = new ExpressionList();
$$->push_back($1);
} |
call_args COMMA expr
{
$1->push_back($3);
} ;
The problem is that when parsing:
void (*ident)(char* some_arg);
It's seeing void (*ident) and deducing that it must be a function call instead of a function declaration. Is there a way I can tell Bison that it should favour looking ahead to match var_decl instead of reducing *ident and void into derefs and exprs?
any identifier can be a type
That's exactly the problem. LALR(1) grammars for C-like languages (or languages with C-like syntax for types) need to differentiate types and other identifiers at the token level. That is, you need IDENT and TYPEIDENT be two different tokens. (You will have to feed data about identifiers from the compiler back to the tokenizer). It's the most standard way to disambiguate the otherwise ambiguous grammar.
Update See, for instance, this ANSI C grammar for Yacc.
I was teaching myself Bison and headed over to wikipedia for the same and copy-pasted the entire code from the example that was put there [ http://en.wikipedia.org/wiki/GNU_Bison ]. It compiled and works perfect. Then, I OOPed it by adding in a bit of C++. Here's is my new Parser.y file:
%{
#include "TypeParser.h"
#include "ParserParam.h"
#include "addition.h"
%}
%define api.pure
%left '+' TOKEN_PLUS
%left '*' TOKEN_MULTIPLY
%left '-' TOKEN_SUBTRACT
%left '/' TOKEN_DIVIDE
%left '^' TOKEN_EXP
%token TOKEN_LPAREN
%token TOKEN_RPAREN
%token TOKEN_PLUS
%token TOKEN_MULTIPLY
%token <value> TOKEN_NUMBER
%type <expression> expr
%%
input:
expr { ((SParserParam*)data)->expression = $1; }
;
expr:
expr TOKEN_PLUS expr { $$ = new Addition($1, $2); }
| expr TOKEN_MULTIPLY expr { $$ = new Multiplication($1, $2); }
| expr TOKEN_SUBTRACT expr { $$ = new Addition($1, $2); }
| expr TOKEN_DIVIDE expr { $$ = new Multiplication($1, $2); }
| expr TOKEN_EXP expr { $$ = new Addition($1, $2); }
| TOKEN_LPAREN expr TOKEN_RPAREN { $$ = $2; }
| TOKEN_NUMBER { $$ = new Value($1); }
;
%%
But then I keep getting the following errors:
Parser.y:33.52-53: $2 of `expr' has no declared type
Parser.y:34.62-63: $2 of `expr' has no declared type
Parser.y:35.56-57: $2 of `expr' has no declared type
Parser.y:36.60-61: $2 of `expr' has no declared type
Parser.y:37.52-53: $2 of `expr' has no declared type
How do I resolve it? I mean, what have I changed that is causing this? I haven't changed anything from the wikipedia code, the %type% declaration is still there [The union has the same members, with type changed from SExpression to Expression.]. All classes i.e. Addition, Expression, Multiplication are defined and declared. I don't think that is what is causing the problem here, but just saying.
And why exactly does it have a problem only with $2. Even $1 is of type expr, then why do I not get any errors for $1?
Any help is appreciated...
In the rule expr TOKEN_PLUS expr $1 is the first expression, $2 is TOKEN_PLUS, and $3 is the second expression. See the bison manual.
So the semantic action needs to change from your { $$ = new Addition($1, $2); } to { $$ = new Addition($1, $3); }.