I'm new to flex and bison. I want to write a compiler that reads a C program and translates it into commands for my processor, which are similar to assembly. I downloaded a pre-written compiler that uses flex and bison. I need to change scanner.l and parser.y so that it can process the asm commands embedded in my C code, such as asm [asm command1 \n asm command2 \n asm command3 \n ...]. Which definitions and rules should I add to these two files?
scanner.l:
%{
/* Declarations copied verbatim into the generated scanner. */
#include "scanner.h"
#include "y.tab.h"
#include <stdio.h>
#include <stdlib.h>
/* Capacity, in bytes, of the buffer that collects a string literal. */
#define MAX_STR_CONST 1000
/* Scratch buffer that the <str> rules fill while a string literal is scanned. */
char string_buf[MAX_STR_CONST];
/* Write cursor into string_buf; reset to the start when an opening quote is seen. */
char *string_buf_ptr;
/* Scanner position (line and column), both starting at 1; advanced by updatePosition(). */
int line_num = 1;
int line_pos = 1;
void updatePosition();
/* Hook updatePosition() into flex's YY_USER_ACTION so yylloc is refreshed for each matched pattern. */
#define YY_USER_ACTION updatePosition();
%}
NUMBER (0)|([1-9][0-9]*)
HEXNUM ((0x)|(0X))([a-fA-F0-9]+)
IDENT [a-zA-Z_][a-zA-Z0-9_]*
%x comment
%x str
%option noyywrap
%option yylineno
%option nounput
%%
\" string_buf_ptr = string_buf; BEGIN(str);
<str>{
\" { /* saw closing quote - all done */
BEGIN(INITIAL);
*string_buf_ptr = '\0';
/* return string constant token type and
* value to parser
*/
yylval.strConst = new std::string(string_buf);
return T_STR_CONST;
}
\n {
/* error - unterminated string constant */
/* generate error message */
yyerror("Unterminated string constant.");
}
<<EOF>> { return T_UNTERM_STRING; }
\\[0-7]{1,3} {
    /* Octal escape sequence: convert the one to three digits that follow
     * the backslash (yytext + 1 skips the backslash itself). */
    unsigned int result;   /* %o stores into an unsigned int */
    (void) sscanf( yytext + 1, "%o", &result );
    if ( result > 0xff ) {
        /* error, constant is out-of-bounds for a single character */
        yyerror("Octal escape sequence out of range.");
    } else {
        /* in range: append the decoded character to the literal */
        *string_buf_ptr++ = static_cast<char>(result);
    }
}
\\[0-9]+ {
/* generate error - bad escape sequence; something
* like '\48' or '\0777777'
*/
yyerror("Bad string escape sequence.");
}
\\n *string_buf_ptr++ = '\n';
\\t *string_buf_ptr++ = '\t';
\\r *string_buf_ptr++ = '\r';
\\b *string_buf_ptr++ = '\b';
\\f *string_buf_ptr++ = '\f';
\\(.|\n) *string_buf_ptr++ = yytext[1];
[^\\\n\"]+ {
char *yptr = yytext;
while ( *yptr )
*string_buf_ptr++ = *yptr++;
}
}
"/*" BEGIN(comment);
<comment>{
[^*\n]* /* eat anything that's not a '*' */
"*"+[^*/\n]* /* eat up '*'s not followed by '/'s */
\n
<<EOF>> { return T_UNTERM_COMMENT; }
"*"+"/" BEGIN(INITIAL);
}
"do" { return T_DO; }
"while" { return T_WHILE; }
"for" { return T_FOR; }
"if" { return T_IF; }
"else" { return T_ELSE; }
"int" { return T_INT_TYPE; }
"string" { return T_STRING_TYPE; }
"void" { return T_VOID_TYPE; }
"struct" { return T_STRUCT; }
"return" { return T_RETURN; }
"switch" { return T_SWITCH; }
"case" { return T_CASE; }
"default" { return T_DEFAULT; }
"break" { return T_BREAK; }
"continue" { return T_CONTINUE; }
"sizeof" { return T_SIZEOF; }
"{" { return '{'; }
"}" { return '}'; }
"(" { return '('; }
")" { return ')'; }
"[" { return '['; }
"]" { return ']'; }
"+" { return '+'; }
"-" { return '-'; }
"*" { return '*'; }
"/" { return '/'; }
"%" { return '%'; }
"=" { return '='; }
">" { return '>'; }
"<" { return '<'; }
"!" { return '!'; }
"|" { return '|'; }
"&" { return '&'; }
"^" { return '^'; }
"~" { return '~'; }
"." { return '.'; }
":" { return ':'; }
";" { return ';'; }
"," { return ','; }
"<<" { return T_LEFT_SHIFT; }
">>" { return T_RIGHT_SHIFT; }
"&&" { return T_BOOL_AND; }
"||" { return T_BOOL_OR; }
"+=" { return T_PLUS_EQUALS; }
"-=" { return T_MINUS_EQUALS; }
"*=" { return T_STAR_EQUALS; }
"/=" { return T_DIV_EQUALS; }
"%=" { return T_MOD_EQUALS; }
"==" { return T_EQUAL; }
"<=" { return T_LESS_OR_EQUAL; }
">=" { return T_GREATER_OR_EQUAL; }
"!=" { return T_NOT_EQUAL; }
"|=" { return T_BIT_OR_EQUALS; }
"&=" { return T_BIT_AND_EQUALS; }
"^=" { return T_BIT_XOR_EQUALS; }
"~=" { return T_BIT_NOT_EQUALS; }
"->" { return T_ARROW; }
"<<=" { return T_LEFT_SHIFT_EQUALS; }
">>=" { return T_RIGHT_SHIFT_EQUALS; }
"++" { return T_PLUS_PLUS; }
"--" { return T_MINUS_MINUS; }
" "|"\t"|"\r"|"\n"|"const" {}
{HEXNUM} { yylval.intConst = std::strtoul(yytext, NULL, 0); return T_INT_CONST; }
{NUMBER} { yylval.intConst = atoi(yytext); return T_INT_CONST; }
{IDENT} { yylval.ident = new std::string(yytext); return T_IDENT; }
. {{ char err[] = "Unknown Character: a"; err[strlen(err)-1] = *yytext; yyerror(err); }}
%%
/**
* This function is called on every token, and updates the yylloc global variable, which stores the
* location/position of the current token.
*/
void updatePosition() {
yylloc.first_line = line_num;
yylloc.first_column = line_pos;
char* text = yytext;
while(*text != '\0') {
if(*text == '\n') {
line_num++;
line_pos = 1;
} else {
line_pos++;
}
text++;
}
yylloc.last_line = line_num;
yylloc.last_column = line_pos;
}
parser.y:
%code requires {
#include "Declaration.h"
#include "Expression.h"
#include "Statement.h"
#include "Type.h"
#include "Parser.h"
#include "Util.h"
extern Program* program_out;
}
%locations
%define parse.lac full
%error-verbose
%{
#include "Parser.h"
#include "scanner.h"
#include <string>
#include <iostream>
#include "Type.h"
%}
//%parse-param {Program*& out}
%union {
char* cstr;
std::string* ident;
std::string* strConst;
unsigned int intConst;
Type* type;
std::vector<Declaration*>* declareList;
Declaration* declare;
ConstantExpression* constant;
std::vector<FunctionParameter*>* paramList;
FunctionParameter* param;
std::vector<StructMember*>* structMemberList;
StructMember* structMember;
StatementBlock* statementBlock;
Statement* statement;
std::vector<Statement*>* statementList;
Expression* expression;
std::vector<Expression*>* expressionList;
}
%type <type> type
%type <cstr> root
%type <declareList> root_declare_list
%type <declare> root_declare
%type <constant> constant
%type <paramList> param_list non_empty_param_list
%type <param> param
%type <structMemberList> struct_list
%type <structMember> struct_member;
%type <statementBlock> statement_block
%type <statementList> statement_list
%type <statement> statement
%type <expression> expression
%type <expressionList> argument_list non_empty_argument_list
%token <ident> T_IDENT
%token <strConst> T_STR_CONST
%token <intConst> T_INT_CONST
%token T_IF T_ELSE T_FOR T_WHILE T_DO T_SIZEOF
%token T_INT_TYPE T_STRING_TYPE T_VOID_TYPE T_STRUCT
%token T_RETURN T_SWITCH T_CASE T_DEFAULT T_BREAK T_CONTINUE
%token T_BOOL_OR T_BOOL_AND
%token T_LEFT_SHIFT T_RIGHT_SHIFT T_PLUS_EQUALS T_MINUS_EQUALS
%token T_STAR_EQUALS T_DIV_EQUALS T_MOD_EQUALS T_EQUAL
%token T_LESS_OR_EQUAL T_GREATER_OR_EQUAL T_NOT_EQUAL
%token T_BIT_OR_EQUALS T_BIT_AND_EQUALS T_BIT_XOR_EQUALS
%token T_BIT_NOT_EQUALS T_ARROW T_LEFT_SHIFT_EQUALS
%token T_RIGHT_SHIFT_EQUALS T_PLUS_PLUS T_MINUS_MINUS
%token T_UNTERM_STRING T_UNTERM_COMMENT
/* tokens for precedence */
%token PREC_ADDRESS PREC_DEREFERENCE PREC_UNARY_MINUS PREC_UNARY_PLUS
%token PREC_SUFFIX_PLUS_PLUS PREC_SUFFIX_MINUS_MINUS
%token PREC_PREFIX_PLUS_PLUS PREC_PREFIX_MINUS_MINUS
%token PREC_APPLICATION
/*
 * Operator precedence table. Each %left/%right/%nonassoc line is one
 * precedence level; later lines bind tighter than earlier ones.
 */
/* lowest precedence */
%left ','
/* All assignment operators share a single right-associative level, as in C,
 * so that "a = b += c" groups as "a = (b += c)". */
%right '=' T_PLUS_EQUALS T_MINUS_EQUALS T_STAR_EQUALS T_DIV_EQUALS T_MOD_EQUALS T_LEFT_SHIFT_EQUALS T_RIGHT_SHIFT_EQUALS T_BIT_AND_EQUALS T_BIT_XOR_EQUALS T_BIT_OR_EQUALS
%left T_BOOL_OR
%left T_BOOL_AND
%left '|'
%left '^'
%left '&'
%left T_EQUAL T_NOT_EQUAL
/* The four relational operators share one left-associative level, as in C,
 * so that "a > b < c" groups as "(a > b) < c". */
%left '<' '>' T_LESS_OR_EQUAL T_GREATER_OR_EQUAL
%left T_LEFT_SHIFT T_RIGHT_SHIFT
%left '+' '-'
%left '*' '/' '%'
/* Prefix operators; the PREC_* names are referenced from %prec in the rules. */
%right PREC_ADDRESS
%right PREC_DEREFERENCE
%right '!' '~'
%right PREC_UNARY_PLUS PREC_UNARY_MINUS
%right PREC_PREFIX_PLUS_PLUS PREC_PREFIX_MINUS_MINUS
%right T_PLUS_PLUS T_MINUS_MINUS
/* Postfix operators: member access, subscript, call, suffix ++/--. */
%left T_ARROW
%left '.'
%left '['
%left PREC_APPLICATION
%left PREC_SUFFIX_PLUS_PLUS PREC_SUFFIX_MINUS_MINUS
/* T_ELSE outranks T_IF so that an "else" is shifted onto the nearest "if". */
%nonassoc T_IF
%nonassoc T_ELSE
/* highest precedence */
%%
root:
root_declare_list { $$ = NULL; program_out = new Program(#$, *$1); delete $1; }
;
root_declare_list:
root_declare_list root_declare { $$ = $1; $1->push_back($2); }
| { $$ = new std::vector<Declaration*>(); }
;
root_declare:
type T_IDENT '(' param_list ')' ';' { $$ = new FunctionPrototype(#$, $1, *$2, *$4); delete $2; delete $4; }
| type T_IDENT '(' param_list ')' statement_block { $$ = new FunctionDeclaration(#$, $1, *$2, *$4, $6); delete $2; delete $4; }
| type T_IDENT ';' { $$ = new GlobalVarDeclaration(#$, $1, *$2); delete $2; }
| type T_IDENT '[' T_INT_CONST ']' ';' { $$ = new GlobalArrayDeclaration(#$, $1, *$2, $4); delete $2; }
| type T_IDENT '=' constant ';' { $$ = new GlobalVarDeclarationInit(#$, $1, *$2, $4); delete $2; }
| T_STRUCT T_IDENT '{' struct_list '}' ';' { $$ = new StructDeclaration(#$, *$2, *$4); delete $2; delete $4; }
| T_STRUCT T_IDENT ';' { $$ = new StructPredeclaration(#$, *$2); delete $2; }
;
constant:
T_INT_CONST { $$ = new IntConstantExpression(#$, $1); }
| T_STR_CONST { $$ = new StringConstantExpression(#$, *$1); delete $1; }
;
param_list:
non_empty_param_list { $$ = $1; }
| { $$ = new std::vector<FunctionParameter*>(); }
;
non_empty_param_list:
non_empty_param_list ',' param { $$ = $1; $1->push_back($3); }
| param { $$ = new std::vector<FunctionParameter*>({$1}); }
;
param:
type T_IDENT { $$ = new FunctionParameter(#$, $1, *$2); delete $2; }
;
struct_list:
struct_list struct_member ';' { $$ = $1; $1->push_back($2); }
| { $$ = new std::vector<StructMember*>(); }
;
struct_member:
type T_IDENT { $$ = new StructMember(#$, $1, *$2); delete $2; }
;
type:
type '*' { $$ = new PointerType($1); }
| T_STRUCT T_IDENT { $$ = new StructType(*$2); delete $2; }
| T_INT_TYPE { $$ = new IntType(); }
| T_VOID_TYPE { $$ = new VoidType(); }
| T_STRING_TYPE { $$ = new StringType(); }
;
statement_block:
'{' statement_list '}' { $$ = new StatementBlock(#$, *$2); delete $2; }
;
statement_list:
statement_list statement { $$ = $1; $1->push_back($2); }
| { $$ = new std::vector<Statement*>(); }
;
statement:
expression ';' { $$ = $1; }
| type T_IDENT ';' { $$ = new VarDeclaration(#$, $1, *$2); delete $2; }
| type T_IDENT '=' expression ';' { $$ = new VarDeclarationInit(#$, $1, *$2, $4); delete $2; }
| type T_IDENT '[' T_INT_CONST ']' ';' { $$ = new ArrayDeclaration(#$, $1, *$2, $4); delete $2; }
| T_WHILE '(' expression ')' statement { $$ = new WhileStatement(#$, $3, $5); }
| T_DO statement T_WHILE '(' expression ')' ';' { $$ = new DoWhileStatement(#$, $2, $5); }
| T_FOR '(' expression ';' expression ';' expression ')' statement { $$ = new ForStatement(#$, $3, $5, $7, $9); }
| statement_block { $$ = $1; }
| T_IF '(' expression ')' statement %prec T_IF { $$ = new IfStatement(#$, $3, $5); }
| T_IF '(' expression ')' statement T_ELSE statement { $$ = new IfElseStatement(#$, $3, $5, $7); }
| T_BREAK ';' { $$ = new BreakStatement(#$); }
| T_CONTINUE ';' { $$ = new ContinueStatement(#$); }
| T_SWITCH '(' expression ')' '{' statement_list '}' { $$ = new SwitchStatement(#$, $3, *$6); delete $6; }
| T_CASE T_INT_CONST ':' { $$ = new CaseStatement(#$, $2); }
| T_DEFAULT ':' { $$ = new DefaultStatement(#$); }
| T_RETURN expression ';' { $$ = new ReturnStatement(#$, $2); }
;
expression:
expression '=' expression { $$ = new AssignExpression(#$, $1, $3); }
| expression T_PLUS_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "+", $3)); }
| expression T_MINUS_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "-", $3)); }
| expression T_STAR_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "*", $3)); }
| expression T_DIV_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "/", $3)); }
| expression T_MOD_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "%", $3)); }
| expression T_BIT_AND_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "&", $3)); }
| expression T_BIT_OR_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "|", $3)); }
| expression T_BIT_XOR_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "^", $3)); }
| expression T_LEFT_SHIFT_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "<<", $3)); }
| expression T_RIGHT_SHIFT_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), ">>", $3)); }
| expression T_PLUS_PLUS %prec PREC_SUFFIX_PLUS_PLUS { $$ = new UnaryAssignExpression(#$, $1, "++"); }
| T_PLUS_PLUS expression %prec PREC_PREFIX_PLUS_PLUS { $$ = new UnaryAssignExpression(#$, "++", $2); }
| expression T_MINUS_MINUS %prec PREC_SUFFIX_MINUS_MINUS { $$ = new UnaryAssignExpression(#$, $1, "--"); }
| T_MINUS_MINUS expression %prec PREC_PREFIX_MINUS_MINUS { $$ = new UnaryAssignExpression(#$, "--", $2); }
| constant { $$ = $1; }
| '(' expression ')' { $$ = $2; }
| T_IDENT '(' argument_list ')' %prec PREC_APPLICATION { $$ = new FunctionCallExpression(#$, *$1, *$3); delete $1; delete $3; }
| T_SIZEOF '(' type ')' { $$ = new SizeofExpression(#$, $3); }
| '!' expression { $$ = new UnaryOperatorExpression(#$, "!", $2); }
| '~' expression { $$ = new UnaryOperatorExpression(#$, "~", $2); }
| '+' expression %prec PREC_UNARY_PLUS { $$ = new UnaryOperatorExpression(#$, "+", $2); }
| '-' expression %prec PREC_UNARY_MINUS { $$ = new UnaryOperatorExpression(#$, "-", $2); }
| '*' expression %prec PREC_DEREFERENCE { $$ = new ArraySubscriptExpression(#$, $2, new IntConstantExpression(#2, 0)); }
| '&' expression %prec PREC_ADDRESS { $$ = new UnaryOperatorExpression(#$, "&", $2); }
| expression '+' expression { $$ = new BinaryOperatorExpression(#$, $1, "+", $3); }
| expression '-' expression { $$ = new BinaryOperatorExpression(#$, $1, "-", $3); }
| expression '*' expression { $$ = new BinaryOperatorExpression(#$, $1, "*", $3); }
| expression '/' expression { $$ = new BinaryOperatorExpression(#$, $1, "/", $3); }
| expression '%' expression { $$ = new BinaryOperatorExpression(#$, $1, "%", $3); }
| expression '&' expression { $$ = new BinaryOperatorExpression(#$, $1, "&", $3); }
| expression '|' expression { $$ = new BinaryOperatorExpression(#$, $1, "|", $3); }
| expression '^' expression { $$ = new BinaryOperatorExpression(#$, $1, "^", $3); }
| expression T_BOOL_AND expression { $$ = new BinaryOperatorExpression(#$, $1, "&&", $3); }
| expression T_BOOL_OR expression { $$ = new BinaryOperatorExpression(#$, $1, "||", $3); }
| expression T_LEFT_SHIFT expression { $$ = new BinaryOperatorExpression(#$, $1, "<<", $3); }
| expression T_RIGHT_SHIFT expression { $$ = new BinaryOperatorExpression(#$, $1, ">>", $3); }
| expression T_EQUAL expression { $$ = new BinaryOperatorConditionExpression(#$, $1, "==", $3); }
| expression T_NOT_EQUAL expression { $$ = new BinaryOperatorConditionExpression(#$, $1, "!=", $3); }
| expression '<' expression { $$ = new BinaryOperatorConditionExpression(#$, $1, "<", $3); }
| expression '>' expression { $$ = new BinaryOperatorConditionExpression(#$, $1, ">", $3); }
| expression T_LESS_OR_EQUAL expression { $$ = new BinaryOperatorConditionExpression(#$, $1, "<=", $3); }
| expression T_GREATER_OR_EQUAL expression { $$ = new BinaryOperatorConditionExpression(#$, $1, ">=", $3); }
| T_IDENT { $$ = new VarExpression(#$, *$1); delete $1; }
| expression '.' T_IDENT { $$ = new StructMemberExpression(#$, $1, *$3); delete $3; }
| expression T_ARROW T_IDENT { $$ = new StructMemberExpression(#$, $1, *$3); delete $3; }
| expression '[' expression ']' { $$ = new ArraySubscriptExpression(#$, $1, $3); }
;
argument_list:
non_empty_argument_list { $$ = $1; }
| { $$ = new std::vector<Expression*>(); }
;
non_empty_argument_list:
non_empty_argument_list ',' expression { $$ = $1; $1->push_back($3); }
| expression { $$ = new std::vector<Expression*>({$1}); }
;
I created a small compiler and need help to fix it.
Code of my compiler:
t.l:
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "y.tab.h"
%}
%x DOUBLE_QUOTES
%%
<INITIAL>[sS][hH][oO][wW] {return show;}
<INITIAL>[a-zA-Z] {yylval.id=yytext[0];return identifier;}
<INITIAL>[0-9]+ {yylval.num=atoi(yytext);return number;}
<INITIAL>[\-\+\=\;\*\/] {return yytext[0];}
<INITIAL>["] {
    /* opening quote: switch to the exclusive string state */
    printf("(STRING_OPEN) ");
    BEGIN(DOUBLE_QUOTES);
}
<DOUBLE_QUOTES>[^"]+ {
    /* everything up to the closing quote is the text of the literal */
    printf("(STRING:%s) ", yytext);
}
<DOUBLE_QUOTES>["] {
    /* closing quote: return to normal scanning */
    printf("(STRING_CLOSE) ");
    BEGIN(INITIAL);
}
%%
int yywrap (void) {return 1;}
t.y:
%{
void yyerror(char *s);
#include <stdio.h>
#include <stdlib.h>
int symbols[52];
int symbolVal(char symbol);
void updateSymbolVal(char symbol,int val);
%}
%union {int num;char id;}
%start line
%token show
%token <num> number
%token <id> identifier
%type <num> line exp term factor
%type <id> assignment
%%
/* A program is one or more ';'-terminated statements: assignments or show commands. */
line : assignment ';' {;}
| show exp ';' {printf("showing : %d\n",$2);}
| line assignment ';' {;}
| line show exp ';' {printf("showing : %d\n",$3);}
;
/* Stores the value of the expression in the single-letter variable. */
assignment: identifier '=' exp {updateSymbolVal($1,$3);}
;
/* Additive level: binds loosest, groups left to right. */
exp : term {$$ = $1;}
| exp '+' term {$$ = $1 + $3;}
| exp '-' term {$$ = $1 - $3;}
;
/* Multiplicative level: binds tighter than + and -, groups left to right. */
term : factor {$$ = $1;}
| term '*' factor {$$ = $1 * $3;}
| term '/' factor {
    /* integer division by zero is undefined in C: report it and yield 0 */
    if ($3 == 0) { yyerror("division by zero"); $$ = 0; }
    else { $$ = $1 / $3; }
  }
;
/* Operands: a literal number or the current value of a variable. */
factor : number {$$ = $1;}
| identifier {$$ = symbolVal($1);}
;
%%
/*
 * Maps a single-letter identifier to its slot in the 52-entry symbol table:
 * 'A'..'Z' -> 0..25 and 'a'..'z' -> 26..51.
 * Returns -1 for any character that is not an ASCII letter.
 *
 * Plain range checks are used instead of islower()/isupper(): this file does
 * not include <ctype.h>, and those functions must not be given a negative char.
 */
int computerSymbolIndex(char token)
{
    if (token >= 'a' && token <= 'z')
    {
        return token - 'a' + 26;
    }
    if (token >= 'A' && token <= 'Z')
    {
        return token - 'A';
    }
    return -1;
}
/*
 * Returns the value currently stored for the variable `symbol`.
 * A symbol that is not a letter has no slot; 0 is returned for it.
 */
int symbolVal(char symbol)
{
    int bucket = computerSymbolIndex(symbol);
    if (bucket < 0)
    {
        /* computerSymbolIndex() reported "no slot": never index symbols[-1] */
        return 0;
    }
    return symbols[bucket];
}
/*
 * Stores `val` as the new value of the variable `symbol`.
 * A symbol that is not a letter has no slot; the store is skipped and a
 * diagnostic is printed.
 */
void updateSymbolVal(char symbol,int val)
{
    int bucket = computerSymbolIndex(symbol);
    if (bucket < 0)
    {
        /* computerSymbolIndex() reported "no slot": never write symbols[-1] */
        printf("cannot assign to '%c': not a variable name\n", symbol);
        return;
    }
    symbols[bucket] = val;
}
/* Prints the banner, clears all 52 variable slots, then runs the parser. */
int main (void) {
    int slot = 0;

    printf("Created By BoxWeb Inc\n");
    while (slot < 52)
    {
        symbols[slot] = 0;
        ++slot;
    }
    /* the parser's status becomes the process exit code */
    return yyparse();
}
/*
 * Error callback invoked by the parser; prints the message `s`.
 * The format consumes exactly one argument (the previous "%s at %s" form
 * read a second string that was never passed).
 */
void yyerror (char *s) {printf("-%s !\n", s);}
command for test compiler :
show 5+5;
show 5*2;
show 5+5-2*2/1;
I need to extend it so that it can also print strings, for example:
show "hello" . " " . "mr";//hello mr
show 5+5 . " ?";//10 ?
and more....
In the lexer I use :
<INITIAL>["] {
printf("(STRING_OPEN) ");
BEGIN(DOUBLE_QUOTES);
}
<DOUBLE_QUOTES>["] {
printf("(STRING_CLOSE) ");
BEGIN(INITIAL);
printf("(STRING:%S) ",yytext[1]);
}
but I don't know how to use this in the parser.
Please help me to complete this compiler.
Let's simplify it for a moment to just one possible operation.
We have the following grammar:
assignment: '$' identifier '=' exp ';' {updateSymbolVal($2,$4); }
;
exp: number {$$ = createExp($1);}
| string {$$ = createExp($1);}
| exp '+' exp {$$ = addExp($1,$3);}
;
Since the expression can be many different things, we can't just store it in an integer; we need a more complex structure, something like this:
// Tag recording which kind of value an Exp holds.
enum expType {NUMBER, STRING};
// Value of an evaluated expression: a type tag plus one field per kind of value.
struct Exp{
expType type; // tells which of the two fields below was filled in
double number; // numeric value; set by createExp(int)
std::string str; // text value; set by createExp(std::string)
};
Then we make the functions to create your expressions:
// Builds a heap-allocated numeric expression holding `v`; the caller receives the raw pointer.
Exp* createExp(int v){
    Exp *node = new Exp();
    node->number = v;
    node->type = NUMBER;
    return node;
}
// Builds a heap-allocated string expression holding a copy of `s`; the caller receives the raw pointer.
Exp* createExp(std::string s){
    Exp *node = new Exp();
    node->str = s;
    node->type = STRING;
    return node;
}
And then to do all your calculations and assignment you will always have to check the type.
// Adds two numeric expressions and returns a newly allocated result.
// Returns nullptr (after printing a message) when either operand is missing
// or is not a NUMBER, so the caller never receives an indeterminate pointer.
Exp* addExp(Exp *a, Exp *b){
    if(a == nullptr || b == nullptr || a->type != NUMBER || b->type != NUMBER){
        std::cout << "some nice error message\n";
        return nullptr;
    }
    Exp *c = new Exp();   // the result needs its own storage
    c->type = NUMBER;     // assignment, not comparison
    c->number = a->number + b->number;
    return c;
}
Same with the assign function
// Stores the value of `e` under `identifier`, choosing the numeric or the
// string table according to the expression's type tag.
void updateSymbolVal(const std::string &identifier, Exp *e){
    switch(e->type){
    case NUMBER:
        myNumbers[identifier] = e->number;
        break;
    case STRING:
        myStrings[identifier] = e->str;
        break;
    default:
        // any other tag value: nothing is stored
        break;
    }
}
Of course you could also make a map/vector/array of the struct Exp if you need to do some more manipulations with it. Or just hand it over to the next level.
Edit for the question of multi-language support
As written in the comment I refer to this question Flex(lexer) support for unicode. To simplify it to your need here you can make it like this.
/* UTF-8 building blocks: ASC = ASCII word character, U = continuation byte, U2/U3/U4 = lead byte of a 2-, 3- or 4-byte sequence */
ASC [a-zA-Z_0-9]
U [\x80-\xbf]
U2 [\xc2-\xdf]
U3 [\xe0-\xef]
U4 [\xf0-\xf4]
UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
{UANY}+ {
    /* The braces expand the UANY definition; a bare UANY+ would only match the literal text "UANY". */
    /* NOTE(review): only the first byte is kept here; presumably the whole of yytext should be stored once the token carries text - confirm against the parser's %union. */
    yylval.id = yytext[0]; return string;
}
For some reason or another, bison doesn't want to do any evaluation. Compilation of all files goes smoothly and the program runs. When I enter the expression 4+5 and press return, it creates tokens for 4 + 5 respectively. I can even put in some printf into the places where bison recognizes the attributes of each token including the plus (43).
However the program never evaluates this production expr '+' term { $$ = $1 + $3; }. It's simply never called at least to my knowledge and even if it was this production assign '\n' { printf("%d\n", $1); } never prints out the value. Upon ^D to quit, it fires void yyerror(const char *).
Any help on this matter is much appreciated. Thanks!
//FLEX
%{
//#include <stdio.h>
#include "y.tab.h"
%}
%option noyywrap
letter [A-Za-z]
digit [0-9]
space [ \t]
var {letter}
int {digit}+
ws {space}+
%%
{var} { yylval = (int)yytext[0]; return VAR; }
{int} { yylval = atoi(yytext); return CONST; }
{ws} { }
. { return (int)yytext[0]; }
%%
/* nothing */
.
//BISON
%{
//INCLUDE
//#include <ctype.h>
//DEFINE
#define YYDEBUG 1
//PROTOTYPE
void yyerror(const char *);
void print_welcome();
int get_val(int);
void set_val(int, int);
%}
%token CONST
%token VAR
%%
session
: { print_welcome(); }
eval
;
eval
: eval line
|
;
line
: assign '\n' { printf("%d\n", $1); }
;
assign
: VAR '=' expr { set_val($1, $3); $$ = $3; }
| expr { $$ = $1; }
;
expr
: expr '+' term { $$ = $1 + $3; }
| expr '-' term { $$ = $1 - $3; }
| term { $$ = $1; }
;
term
: term '*' factor { $$ = $1 * $3; }
| term '/' factor { $$ = $1 / $3; }
| term '%' factor { $$ = $1 % $3; }
| factor { $$ = $1; }
;
factor
: '(' expr ')' { $$ = $2; }
| CONST { $$ = $1; }
| VAR { $$ = get_val($1); }
;
%%
/* Parser error callback: writes the message and a newline to stderr. */
void yyerror(const char * s) {
    fprintf(stderr, "%s\n", s);
}
/* Prints the two-line greeting followed by a blank line. */
void print_welcome()
{
    static const char *const banner[] = {
        "Welcome to the Simple Expression Evaluator.\n",
        "Enter one expression per line, end with ^D\n\n"
    };
    unsigned int row;

    for (row = 0; row < sizeof banner / sizeof banner[0]; ++row)
    {
        printf("%s", banner[row]);
    }
}
/*
 * Variable storage. The scanner accepts both upper- and lower-case letters
 * as variables, so every letter gets its own slot:
 * 'A'..'Z' -> 0..25 and 'a'..'z' -> 26..51.
 */
static int val_tab[52];

/* Maps a variable character to its val_tab index, or -1 if it is not a letter. */
static int val_index(int var)
{
    if (var >= 'A' && var <= 'Z')
    {
        return var - 'A';
    }
    if (var >= 'a' && var <= 'z')
    {
        return var - 'a' + 26;
    }
    return -1;
}

/* Returns the value stored for variable `var`, or 0 when `var` is not a letter. */
int get_val(int var)
{
    const int slot = val_index(var);
    return slot < 0 ? 0 : val_tab[slot];
}

/* Stores `val` for variable `var`; does nothing when `var` is not a letter. */
void set_val(int var, int val)
{
    const int slot = val_index(var);
    if (slot >= 0)
    {
        val_tab[slot] = val;
    }
}
.
//MAIN
//PROTOTYPE
/* Entry point of the generated parser. */
int yyparse();

/* Runs the parser once with tracing switched off; always exits with status 0. */
int main()
{
    extern int yydebug;

    yydebug = 0;
    (void) yyparse();   /* the parse result is intentionally not used */
    return 0;
}
Your lex file does not have any rule which matches \n, because in lex/flex, . matches any character except line-end. The default rule for lex (or flex) echoes and otherwise ignores the matched character, so that's what happens to the \n. Since the parser won't be able to accept a line unless it sees a \n token, it will eventually be forced to present you with a syntax error.
So you need to change the rule
. { return (int)yytext[0]; }
to
.|\n { return (int)yytext[0]; }
(I wouldn't have bothered with the cast to int but it's certainly not doing any harm, so I left it in.)
I have a lex file , a yacc file and main.cpp file.
My main.cpp looks like
int main(int argc, char **argv)
{
if (argc == 1)
{ int token;
curr_filename = "<stdin>";
yyin = stdin;
yyparse();
}
else
{
for (int i = 1; i < argc; ++i)
{
curr_filename = argv[i];
yyin = std::fopen(argv[i], "r");
if (yyin)
{
yyparse();
std::fclose(yyin);
}
else
{
utility::print_error(argv[i], "cannot be opened");
}
}
}
if (yynerrs > 0)
{
std::cerr << "Compilation halted due to lexical or syntax errors.\n";
exit(1);
}
This works for parsing. But now I also want to print the tokens generated by the lex file, so I made a small amendment that calls yylex() as follows:
int main(int argc, char **argv)
{
if (argc == 1)
{ int token;
curr_filename = "<stdin>";
yyin = stdin;
// calling yylex to get token
while(token= yylex())
{
switch(token){
case 258 :
std::cout << "class" ;
default :
std::cout << "token " ;
}
yyparse();
}
//rest of the code same
But nothing gets printed to the output.
How can I get the tokens printed to standard output or to a file?
flex file
%option noyywrap
%option yylineno
%{
// Declarations copied verbatim into the generated scanner.
#include "flexbison.hpp"
#include "tokentable.hpp"
#include "symboltable.hpp"
#include "y.tab.h"
#include <stdio.h>
// Stamp every matched token's location with the scanner's current yylineno.
#define YY_USER_ACTION yylloc.first_line = yylloc.last_line = yylineno;
// Size of string_buf in bytes; the string rules report "String constant too long" once strlen(string_buf) reaches MAX_STR_CONST - 1.
static const int MAX_STR_CONST = 1025;
char string_buf[MAX_STR_CONST]; // buffer to store string constants encountered in source file
char *string_buf_ptr; // write cursor into string_buf; reset when an opening quote is seen
int num_comment = 0; // count to keep track how many opening comment tokens have been encountered
std::size_t curr_lineno = 0; // keep track of current line number of source file
bool str_too_long = false; // used to handle string constant size error check
%}
%x COMMENT
%x LINECOMMENT
%x STRING
DARROW =>
%%
"(*" {
BEGIN(COMMENT);
num_comment++;
}
"*)" {
if (num_comment <= 0) {
yylval.error_msg = "Unmatched *)";
return ERROR;
}
}
<COMMENT>"*)" {
num_comment--;
if (num_comment < 0) {
yylval.error_msg = "Unmatched *)";
return ERROR;
}
if (num_comment == 0) {
BEGIN(INITIAL);
}
}
<COMMENT>"(*" {
num_comment++;
}
<COMMENT>[^\n] {
// eat everything within comments
}
<COMMENT>\n {
++curr_lineno;
}
"--"[^\n]* {
BEGIN(LINECOMMENT);
}
<LINECOMMENT>\n {
++curr_lineno;
BEGIN(INITIAL);
}
<COMMENT><<EOF>> {
BEGIN(INITIAL);
yylval.error_msg = "EOF in comment";
return ERROR;
}
"=>" {
return DARROW;
}
(?i:class) {
return CLASS;
}
(?i:else) {
return ELSE;
}
(?i:in) {
return IN;
}
(?i:then) {
return THEN;
}
(?i:fi) {
return FI;
}
(?i:if) {
return IF;
}
(?i:inherits) {
return INHERITS;
}
(?i:let) {
return LET;
}
(?i:loop) {
return LOOP;
}
(?i:pool) {
return POOL;
}
(?i:while) {
return WHILE;
}
(?i:case) {
return CASE;
}
(?i:esac) {
return ESAC;
}
(?i:of) {
return OF;
}
(?i:new) {
return NEW;
}
(?i:isvoid) {
return ISVOID;
}
(?i:not) {
return NOT;
}
t(?i:rue) {
yylval.boolean = true;
return BOOL_CONST;
}
f(?i:alse) {
yylval.boolean = false;
return BOOL_CONST;
}
[0-9]+ {
yylval.symbol = inttable().add(yytext);
return INT_CONST;
}
"<=" {
return LE;
}
"<-" {
return ASSIGN;
}
[A-Z][a-zA-Z0-9_]* {
yylval.symbol = idtable().add(yytext);
return TYPEID;
}
[a-z][a-zA-Z0-9_]* {
yylval.symbol = idtable().add(yytext);
return OBJECTID;
}
";"|","|"{"|"}"|":"|"("|")"|"+"|"-"|"*"|"/"|"="|"~"|"<"|"."|"#" {
return *yytext;
}
\n {
++curr_lineno;
}
[ \f\r\t\v] {
// eat whitespace
}
/*
* String constants (C syntax)
* Escape sequence \c is accepted for all characters c. Except for
* \n \t \b \f, the result is c.
*
*/
\" {
BEGIN(STRING);
string_buf_ptr = string_buf;
memset(string_buf, 0, MAX_STR_CONST);
}
<STRING>\" {
BEGIN(INITIAL);
yylval.symbol = stringtable().add(string_buf);
return STR_CONST;
}
<STRING>\0[^\n]*\" {
BEGIN(INITIAL);
if (str_too_long) {
str_too_long = false;
}
else {
yylval.error_msg = "String contains null character";
return ERROR;
}
}
<STRING>\0[^"]*\n {
if (str_too_long) {
yyinput(); /* eat quote */
BEGIN(INITIAL);
str_too_long = false;
}
else {
if (yytext[yyleng - 1] != '\\') {
BEGIN(INITIAL);
yylval.error_msg = "String contains null character";
return ERROR;
}
}
}
<STRING><<EOF>> {
BEGIN(INITIAL);
yylval.error_msg = "EOF in string constant";
return ERROR;
}
<STRING>\\ {
if (strlen(string_buf) >= MAX_STR_CONST - 1) {
str_too_long = true;
unput('\0');
yylval.error_msg = "String constant too long";
return ERROR;
}
char ahead = yyinput();
switch (ahead) {
case 'b':
*string_buf_ptr++ = '\b';
break;
case 't':
*string_buf_ptr++ = '\t';
break;
case 'n':
*string_buf_ptr++ = '\n';
break;
case 'f':
*string_buf_ptr++ = '\f';
break;
case '\n':
++curr_lineno;
*string_buf_ptr++ = '\n';
break;
case '\0':
unput(ahead);
break;
default:
*string_buf_ptr++ = ahead;
}
}
<STRING>\n {
++curr_lineno;
BEGIN(INITIAL);
yylval.error_msg = "Unterminated string constant";
return ERROR;
}
<STRING>. {
if (strlen(string_buf) >= MAX_STR_CONST - 1) {
str_too_long = true;
unput('\0');
yylval.error_msg = "String constant too long";
return ERROR;
}
*string_buf_ptr++ = *yytext;
}
. /* error for invalid tokens */ {
yylval.error_msg = std::string(yytext) + " is not a valid character in the current context.";
return ERROR;
}
%%
bison file
%{
#include "flexbison.hpp"
#include "symboltable.hpp"
#include "tokentable.hpp"
#include "ast.hpp"
#include <iostream>
// convinience function for setting location of each ast node
#define SETLOC(lval,node) (lval)->setloc((node).first_line, curr_filename)
// both defined in main.cpp
extern ProgramPtr ast_root;
extern std::string curr_filename;
// both defined in lexer
extern int yylex();
extern int yylineno;
void yyerror(char *);
%}
%token CLASS 258 ELSE 259 FI 260 IF 261 IN 262
%token INHERITS 263 LET 264 LOOP 265 POOL 266 THEN 267 WHILE 268
%token CASE 269 ESAC 270 OF 271 DARROW 272 NEW 273 ISVOID 274
%token <symbol> STR_CONST 275 INT_CONST 276
%token <boolean> BOOL_CONST 277
%token <symbol> TYPEID 278 OBJECTID 279
%token ASSIGN 280 NOT 281 LE 282 ERROR 283
%type <program> program
%type <clazz> class
%type <classes> class_list
%type <attribute> attribute
%type <attributes> attribute_list
%type <method> method
%type <methods> method_list
%type <expression> expression
%type <expression> let_expr
%type <expressions> expression_list
%type <expressions> method_expr_list
%type <formal> formal
%type <formals> formal_list
%type <branch> case
%type <cases> case_list
%nonassoc '='
%left LET
%right ASSIGN
%left NOT
%left '+' '-'
%left '*' '/'
%left ISVOID
%left '~'
%left '#'
%left '.'
%nonassoc LE '<'
%%
program : class_list { #$ = #1; ast_root = std::make_shared<Program>($1); }
;
class_list : class { $$ = Classes(); $$.push_back($1); }
| class_list class { $$.push_back($2); }
;
class : CLASS TYPEID '{' attribute_list method_list '}' ';' { $$ = std::make_shared<Class>($2, idtable().add("Object"), $4, $5); SETLOC($$, #1); }
| CLASS TYPEID INHERITS TYPEID '{' attribute_list method_list '}' ';' { $$ = std::make_shared<Class>($2, $4, $6, $7); SETLOC($$, #1); }
| error ';' { yyerrok; }
;
attribute_list : attribute ';' { $$ = Attributes(); $$.push_back($1); }
| attribute_list attribute ';' { $$.push_back($2); }
| error ';' { yyerrok; }
;
attribute : OBJECTID ':' TYPEID { $$ = std::make_shared<Attribute>($1, $3, std::make_shared<NoExpr>()); SETLOC($$, #1); }
| OBJECTID ':' TYPEID ASSIGN expression { $$ = std::make_shared<Attribute>($1, $3, $5); SETLOC($$, #5); }
;
method_list : method ';' { $$ = Methods(); $$.push_back($1); }
| method_list method ';' { $$.push_back($2); }
| error ';' { yyerrok; }
;
method : OBJECTID '(' formal_list ')' ':' TYPEID '{' expression '}' { $$ = std::make_shared<Method>($1, $6, $3, $8); SETLOC($$, #1); }
| OBJECTID '(' ')' ':' TYPEID '{' expression '}' { $$ = std::make_shared<Method>($1, $5, Formals(), $7); SETLOC($$, #1); }
;
formal_list : formal { $$ = Formals(); $$.push_back($1); }
| formal_list ',' formal { $$.push_back($3); }
;
formal : OBJECTID ':' TYPEID { $$ = std::make_shared<Formal>($1, $3); SETLOC($$, #1); }
;
case_list : case { $$ = Cases(); $$.push_back($1); }
| case_list case { $$.push_back($2); }
;
case : OBJECTID ':' TYPEID DARROW expression ';' { $$ = std::make_shared<CaseBranch>($1, $3, $5); SETLOC($$, #5); }
;
method_expr_list : expression { $$ = Expressions(); $$.push_back($1); }
| method_expr_list ',' expression { $$.push_back($3); }
;
expression_list : expression ';' { $$ = Expressions(); $$.push_back($1); }
| expression_list expression ';' { $$.push_back($2); }
| error ';' { yyerrok; }
;
let_expr : OBJECTID ':' TYPEID IN expression %prec LET { $$ = std::make_shared<Let>($1, $3, std::make_shared<NoExpr>(), $5); SETLOC($$, #5); }
| OBJECTID ':' TYPEID ASSIGN expression IN expression %prec LET { $$ = std::make_shared<Let>($1, $3, $5, $7); SETLOC($$, #5); }
| OBJECTID ':' TYPEID ',' let_expr { $$ = std::make_shared<Let>($1, $3, std::make_shared<NoExpr>(), $5); SETLOC($$, #5); }
| OBJECTID ':' TYPEID ASSIGN expression ',' let_expr { $$ = std::make_shared<Let>($1, $3, $5, $7); SETLOC($$, #4); }
| error ',' let_expr { yyerrok; }
;
expression : OBJECTID ASSIGN expression { $$ = std::make_shared<Assign>($1, $3); SETLOC($$, #3); }
| expression '.' OBJECTID '(' method_expr_list ')' { $$ = std::make_shared<DynamicDispatch>($1, $3, $5); SETLOC($$, #1); }
| expression '.' OBJECTID '(' ')' { $$ = std::make_shared<DynamicDispatch>($1, $3, Expressions()); SETLOC($$, #1); }
| expression '#' TYPEID '.' OBJECTID '(' method_expr_list ')' { $$ = std::make_shared<StaticDispatch>($1, $3, $5, $7); SETLOC($$, #1); }
| expression '#' TYPEID '.' OBJECTID '(' ')' { $$ = std::make_shared<StaticDispatch>($1, $3, $5, Expressions()); SETLOC($$, #1);}
| OBJECTID '(' method_expr_list ')' { $$ = std::make_shared<DynamicDispatch>(std::make_shared<Object>(idtable().add("self")), $1, $3);
SETLOC($$, #1); }
| OBJECTID '(' ')' { $$ = std::make_shared<DynamicDispatch>(std::make_shared<Object>(idtable().add("self")), $1, Expressions());
SETLOC($$, #1); }
| IF expression THEN expression ELSE expression FI { $$ = std::make_shared<If>($2, $4, $6); SETLOC($$, #2); }
| WHILE expression LOOP expression POOL { $$ = std::make_shared<While>($2, $4); SETLOC($$, #2); }
| '{' expression_list '}' { $$ = std::make_shared<Block>($2); SETLOC($$, #2); }
| LET let_expr { $$ = $2; SETLOC($$, #2); }
| CASE expression OF case_list ESAC { $$ = std::make_shared<Case>($2, $4); SETLOC($$, #2); }
| NEW TYPEID { $$ = std::make_shared<New>($2); SETLOC($$, #2); }
| ISVOID expression { $$ = std::make_shared<IsVoid>($2); SETLOC($$, #2); }
| expression '+' expression { $$ = std::make_shared<Plus>($1, $3); SETLOC($$, #1); }
| expression '-' expression { $$ = std::make_shared<Sub>($1, $3); SETLOC($$, #1); }
| expression '*' expression { $$ = std::make_shared<Mul>($1, $3); SETLOC($$, #1); }
| expression '/' expression { $$ = std::make_shared<Div>($1, $3); SETLOC($$, #1); }
| '~' expression { $$ = std::make_shared<Complement>($2); SETLOC($$, #2); }
| expression '<' expression { $$ = std::make_shared<LessThan>($1, $3); SETLOC($$, #1); }
| expression LE expression { $$ = std::make_shared<LessThanEqualTo>($1, $3); SETLOC($$, #1); }
| expression '=' expression { $$ = std::make_shared<EqualTo>($1, $3); SETLOC($$, #1); }
| NOT expression { $$ = std::make_shared<Not>($2); SETLOC($$, #2); }
| '(' expression ')' { $$ = $2; SETLOC($$, #2); }
| OBJECTID { $$ = std::make_shared<Object>($1); SETLOC($$, #1); }
| INT_CONST { $$ = std::make_shared<IntConst>($1); SETLOC($$, #1); }
| STR_CONST { $$ = std::make_shared<StringConst>($1); SETLOC($$, #1); }
| BOOL_CONST { $$ = std::make_shared<BoolConst>($1); SETLOC($$, #1); }
;
%%
// Utility function converting a bison token code into a printable string,
// used by yyerror() to produce more helpful syntax-error messages.
//
// token: the token code to describe (e.g. the current lookahead, yychar).
// Returns:
//   - the source spelling for keyword/operator tokens ("class", "<-", ...),
//   - "NAME = value" for value-carrying tokens, where the value is read from
//     the global yylval,
//   - "EOF" for token code 0 (the end-of-input code),
//   - otherwise the token code interpreted as a single character, which
//     covers literal tokens such as '+' or '{'.
std::string convert_token(int token)
{
    std::string rep;
    switch (token)
    {
    // Token code 0 means end of input; converting it to a char would embed a
    // NUL byte in the diagnostic, so name it explicitly.
    case 0: rep = "EOF"; break;
    case CLASS: rep = "class"; break;
    case ELSE: rep = "else"; break;
    case FI: rep = "fi"; break;
    case IF: rep = "if"; break;
    case IN: rep = "in"; break;
    case INHERITS: rep = "inherits"; break;
    case LET: rep = "let"; break;
    case LOOP: rep = "loop"; break;
    case POOL: rep = "pool"; break;
    case THEN: rep = "then"; break;
    case WHILE: rep = "while"; break;
    case CASE: rep = "case"; break;
    case ESAC: rep = "esac"; break;
    case OF: rep = "of"; break;
    case DARROW: rep = "=>"; break;
    case NEW: rep = "new"; break;
    case ISVOID: rep = "isvoid"; break;
    case ASSIGN: rep = "<-"; break;
    case NOT: rep = "not"; break;
    case LE: rep = "<="; break;
    case STR_CONST: rep = "STR_CONST = " + yylval.symbol.get_val(); break;
    case INT_CONST: rep = "INT_CONST = " + yylval.symbol.get_val(); break;
    // Adding a bool/int directly to a string literal is pointer arithmetic,
    // not concatenation, so spell the value out explicitly.
    // NOTE(review): assumes yylval.boolean is a bool-like flag -- confirm
    // against the semantic value type declaration.
    case BOOL_CONST: rep = std::string("BOOL_CONST = ") + (yylval.boolean ? "true" : "false"); break;
    case TYPEID: rep = "TYPEID = " + yylval.symbol.get_val(); break;
    case OBJECTID: rep = "OBJECTID = " + yylval.symbol.get_val(); break;
    // Single-character literal tokens use their character code as token code.
    default: rep = static_cast<char>(token);
    }
    return rep;
}
void yyerror(char *)
{
if (yylval.error_msg.length() <= 0)
std::cerr << curr_filename << ":" << yylineno << ": " << "error: " << "syntax error near or at character or token '" << convert_token(yychar) << "'\n";
else
std::cerr << curr_filename << ":" << yylineno << ": " << "error: " << yylval.error_msg << "\n";
}
I'm not sure why you don't see any output, but I didn't look through all that code. If you call yylex from main, that will read and effectively discard one token. Then when you call yyparse, yyparse will call yylex itself until yylex returns 0. Presumably (but not certainly), the next time you call yylex from the while loop in main, it will again return 0 and the loop will end. The result should be that one word is printed from the while loop, followed by whatever output is produced by your yyparse (if any), which will possibly signal a syntax error since it never sees the first token from the input.
I doubt that is what you wanted to do, but it's not totally clear.
If you want to see the tokens as they are being lexed, then insert a statement to print the token in each lex action. Alternatively, tell flex to call the scanning function something else, like yylex_internal, and create your own function called yylex() which calls yylex_internal() and then prints the result before returning it.
If, as seems likely, you are only interested in this for debugging purposes, then you'd probably be better off using the -d command line option to flex, which will generate debugging output automatically. It might not be exactly the debugging format you want, but it's a lot easier to do and undo :)
To change the name of the yylex function generated by flex, insert something like the following in the code block at the top of the .l file:
#define YY_DECL int yylex_internal()
The flex-generated file declares the scanning function as follows:
YY_DECL {
/* body of function */
}
So you can rename the function or add arguments, or even change the return type by defining the YY_DECL macro. See the Generated Scanner section of the flex manual.
By the way, it's not generally considered good style to manually number all the terminal tokens, even though bison allows you to do it. You should just let bison number them, and include the definitions in a source file by #include "y.tab.h" (or whatever you've called the bison header file; you can easily change the name by using the -o option).