I'm new to flex and bison. I want to write a compiler that read C program and translate it to my processor commands that are similar to assembly. I downloaded a pre-written compiler that uses flex and bison. I should change the scanner.l and parser.y as it can process the asm commands that are inside my C code like asm [asm command1 \n asm command2 \n asm command3 \n ...]. Which definitions and rules should I add to these two files?
scanner.l:
%{
#include "scanner.h"
#include "y.tab.h"
#include <stdio.h>
#include <stdlib.h>
#define MAX_STR_CONST 1000
char string_buf[MAX_STR_CONST];
char *string_buf_ptr;
int line_num = 1;
int line_pos = 1;
void updatePosition();
#define YY_USER_ACTION updatePosition();
%}
NUMBER (0)|([1-9][0-9]*)
HEXNUM ((0x)|(0X))([a-fA-F0-9]+)
IDENT [a-zA-Z_][a-zA-Z0-9_]*
%x comment
%x str
%option noyywrap
%option yylineno
%option nounput
%%
\" string_buf_ptr = string_buf; BEGIN(str);
<str>{
\" { /* saw closing quote - all done */
BEGIN(INITIAL);
*string_buf_ptr = '\0';
/* return string constant token type and
* value to parser
*/
yylval.strConst = new std::string(string_buf);
return T_STR_CONST;
}
\n {
/* error - unterminated string constant */
/* generate error message */
yyerror("Unterminated string constant.");
}
<<EOF>> { return T_UNTERM_STRING; }
\\[0-7]{1,3} {
/* octal escape sequence */
int result;
(void) sscanf( yytext + 1, "%o", &result );
if ( result > 0xff )
/* error, constant is out-of-bounds */
*string_buf_ptr++ = result;
}
\\[0-9]+ {
/* generate error - bad escape sequence; something
* like '\48' or '\0777777'
*/
yyerror("Bad string escape sequence.");
}
\\n *string_buf_ptr++ = '\n';
\\t *string_buf_ptr++ = '\t';
\\r *string_buf_ptr++ = '\r';
\\b *string_buf_ptr++ = '\b';
\\f *string_buf_ptr++ = '\f';
\\(.|\n) *string_buf_ptr++ = yytext[1];
[^\\\n\"]+ {
char *yptr = yytext;
while ( *yptr )
*string_buf_ptr++ = *yptr++;
}
}
"/*" BEGIN(comment);
<comment>{
[^*\n]* /* eat anything that's not a '*' */
"*"+[^*/\n]* /* eat up '*'s not followed by '/'s */
\n
<<EOF>> { return T_UNTERM_COMMENT; }
"*"+"/" BEGIN(INITIAL);
}
"do" { return T_DO; }
"while" { return T_WHILE; }
"for" { return T_FOR; }
"if" { return T_IF; }
"else" { return T_ELSE; }
"int" { return T_INT_TYPE; }
"string" { return T_STRING_TYPE; }
"void" { return T_VOID_TYPE; }
"struct" { return T_STRUCT; }
"return" { return T_RETURN; }
"switch" { return T_SWITCH; }
"case" { return T_CASE; }
"default" { return T_DEFAULT; }
"break" { return T_BREAK; }
"continue" { return T_CONTINUE; }
"sizeof" { return T_SIZEOF; }
"{" { return '{'; }
"}" { return '}'; }
"(" { return '('; }
")" { return ')'; }
"[" { return '['; }
"]" { return ']'; }
"+" { return '+'; }
"-" { return '-'; }
"*" { return '*'; }
"/" { return '/'; }
"%" { return '%'; }
"=" { return '='; }
">" { return '>'; }
"<" { return '<'; }
"!" { return '!'; }
"|" { return '|'; }
"&" { return '&'; }
"^" { return '^'; }
"~" { return '~'; }
"." { return '.'; }
":" { return ':'; }
";" { return ';'; }
"," { return ','; }
"<<" { return T_LEFT_SHIFT; }
">>" { return T_RIGHT_SHIFT; }
"&&" { return T_BOOL_AND; }
"||" { return T_BOOL_OR; }
"+=" { return T_PLUS_EQUALS; }
"-=" { return T_MINUS_EQUALS; }
"*=" { return T_STAR_EQUALS; }
"/=" { return T_DIV_EQUALS; }
"%=" { return T_MOD_EQUALS; }
"==" { return T_EQUAL; }
"<=" { return T_LESS_OR_EQUAL; }
">=" { return T_GREATER_OR_EQUAL; }
"!=" { return T_NOT_EQUAL; }
"|=" { return T_BIT_OR_EQUALS; }
"&=" { return T_BIT_AND_EQUALS; }
"^=" { return T_BIT_XOR_EQUALS; }
"~=" { return T_BIT_NOT_EQUALS; }
"->" { return T_ARROW; }
"<<=" { return T_LEFT_SHIFT_EQUALS; }
">>=" { return T_RIGHT_SHIFT_EQUALS; }
"++" { return T_PLUS_PLUS; }
"--" { return T_MINUS_MINUS; }
" "|"\t"|"\r"|"\n"|"const" {}
{HEXNUM} { yylval.intConst = std::strtoul(yytext, NULL, 0); return T_INT_CONST; }
{NUMBER} { yylval.intConst = atoi(yytext); return T_INT_CONST; }
{IDENT} { yylval.ident = new std::string(yytext); return T_IDENT; }
. {{ char err[] = "Unknown Character: a"; err[strlen(err)-1] = *yytext; yyerror(err); }}
%%
/**
* This function is called on every token, and updates the yylloc global variable, which stores the
* location/position of the current token.
*/
void updatePosition() {
yylloc.first_line = line_num;
yylloc.first_column = line_pos;
char* text = yytext;
while(*text != '\0') {
if(*text == '\n') {
line_num++;
line_pos = 1;
} else {
line_pos++;
}
text++;
}
yylloc.last_line = line_num;
yylloc.last_column = line_pos;
}
parser.y:
%code requires {
#include "Declaration.h"
#include "Expression.h"
#include "Statement.h"
#include "Type.h"
#include "Parser.h"
#include "Util.h"
extern Program* program_out;
}
%locations
%define parse.lac full
%error-verbose
%{
#include "Parser.h"
#include "scanner.h"
#include <string>
#include <iostream>
#include "Type.h"
%}
//%parse-param {Program*& out}
%union {
char* cstr;
std::string* ident;
std::string* strConst;
unsigned int intConst;
Type* type;
std::vector<Declaration*>* declareList;
Declaration* declare;
ConstantExpression* constant;
std::vector<FunctionParameter*>* paramList;
FunctionParameter* param;
std::vector<StructMember*>* structMemberList;
StructMember* structMember;
StatementBlock* statementBlock;
Statement* statement;
std::vector<Statement*>* statementList;
Expression* expression;
std::vector<Expression*>* expressionList;
}
%type <type> type
%type <cstr> root
%type <declareList> root_declare_list
%type <declare> root_declare
%type <constant> constant
%type <paramList> param_list non_empty_param_list
%type <param> param
%type <structMemberList> struct_list
%type <structMember> struct_member;
%type <statementBlock> statement_block
%type <statementList> statement_list
%type <statement> statement
%type <expression> expression
%type <expressionList> argument_list non_empty_argument_list
%token <ident> T_IDENT
%token <strConst> T_STR_CONST
%token <intConst> T_INT_CONST
%token T_IF T_ELSE T_FOR T_WHILE T_DO T_SIZEOF
%token T_INT_TYPE T_STRING_TYPE T_VOID_TYPE T_STRUCT
%token T_RETURN T_SWITCH T_CASE T_DEFAULT T_BREAK T_CONTINUE
%token T_BOOL_OR T_BOOL_AND
%token T_LEFT_SHIFT T_RIGHT_SHIFT T_PLUS_EQUALS T_MINUS_EQUALS
%token T_STAR_EQUALS T_DIV_EQUALS T_MOD_EQUALS T_EQUAL
%token T_LESS_OR_EQUAL T_GREATER_OR_EQUAL T_NOT_EQUAL
%token T_BIT_OR_EQUALS T_BIT_AND_EQUALS T_BIT_XOR_EQUALS
%token T_BIT_NOT_EQUALS T_ARROW T_LEFT_SHIFT_EQUALS
%token T_RIGHT_SHIFT_EQUALS T_PLUS_PLUS T_MINUS_MINUS
%token T_UNTERM_STRING T_UNTERM_COMMENT
/* tokens for precedence */
%token PREC_ADDRESS PREC_DEREFERENCE PREC_UNARY_MINUS PREC_UNARY_PLUS
%token PREC_SUFFIX_PLUS_PLUS PREC_SUFFIX_MINUS_MINUS
%token PREC_PREFIX_PLUS_PLUS PREC_PREFIX_MINUS_MINUS
%token PREC_APPLICATION
/* lowest precedence */
%left ','
%right T_BIT_AND_EQUALS T_BIT_XOR_EQUALS T_BIT_OR_EQUALS
%right T_LEFT_SHIFT_EQUALS T_RIGHT_SHIFT_EQUALS
%right T_STAR_EQUALS T_DIV_EQUALS T_MOD_EQUALS
%right T_PLUS_EQUALS T_MINUS_EQUALS
%right '='
%left T_BOOL_OR
%left T_BOOL_AND
%left '|'
%left '^'
%left '&'
%left T_EQUAL T_NOT_EQUAL
%left '>' T_GREATER_OR_EQUAL
%left '<' T_LESS_OR_EQUAL
%left T_LEFT_SHIFT T_RIGHT_SHIFT
%left '+' '-'
%left '*' '/' '%'
%right PREC_ADDRESS
%right PREC_DEREFERENCE
%right '!' '~'
%right PREC_UNARY_PLUS PREC_UNARY_MINUS
%right PREC_PREFIX_PLUS_PLUS PREC_PREFIX_MINUS_MINUS
%right T_PLUS_PLUS T_MINUS_MINUS
%left T_ARROW
%left '.'
%left '['
%left PREC_APPLICATION
%left PREC_SUFFIX_PLUS_PLUS PREC_SUFFIX_MINUS_MINUS
%nonassoc T_IF
%nonassoc T_ELSE
/* highest precedence */
%%
root:
root_declare_list { $$ = NULL; program_out = new Program(#$, *$1); delete $1; }
;
root_declare_list:
root_declare_list root_declare { $$ = $1; $1->push_back($2); }
| { $$ = new std::vector<Declaration*>(); }
;
root_declare:
type T_IDENT '(' param_list ')' ';' { $$ = new FunctionPrototype(#$, $1, *$2, *$4); delete $2; delete $4; }
| type T_IDENT '(' param_list ')' statement_block { $$ = new FunctionDeclaration(#$, $1, *$2, *$4, $6); delete $2; delete $4; }
| type T_IDENT ';' { $$ = new GlobalVarDeclaration(#$, $1, *$2); delete $2; }
| type T_IDENT '[' T_INT_CONST ']' ';' { $$ = new GlobalArrayDeclaration(#$, $1, *$2, $4); delete $2; }
| type T_IDENT '=' constant ';' { $$ = new GlobalVarDeclarationInit(#$, $1, *$2, $4); delete $2; }
| T_STRUCT T_IDENT '{' struct_list '}' ';' { $$ = new StructDeclaration(#$, *$2, *$4); delete $2; delete $4; }
| T_STRUCT T_IDENT ';' { $$ = new StructPredeclaration(#$, *$2); delete $2; }
;
constant:
T_INT_CONST { $$ = new IntConstantExpression(#$, $1); }
| T_STR_CONST { $$ = new StringConstantExpression(#$, *$1); delete $1; }
;
param_list:
non_empty_param_list { $$ = $1; }
| { $$ = new std::vector<FunctionParameter*>(); }
;
non_empty_param_list:
non_empty_param_list ',' param { $$ = $1; $1->push_back($3); }
| param { $$ = new std::vector<FunctionParameter*>({$1}); }
;
param:
type T_IDENT { $$ = new FunctionParameter(#$, $1, *$2); delete $2; }
;
struct_list:
struct_list struct_member ';' { $$ = $1; $1->push_back($2); }
| { $$ = new std::vector<StructMember*>(); }
;
struct_member:
type T_IDENT { $$ = new StructMember(#$, $1, *$2); delete $2; }
;
type:
type '*' { $$ = new PointerType($1); }
| T_STRUCT T_IDENT { $$ = new StructType(*$2); delete $2; }
| T_INT_TYPE { $$ = new IntType(); }
| T_VOID_TYPE { $$ = new VoidType(); }
| T_STRING_TYPE { $$ = new StringType(); }
;
statement_block:
'{' statement_list '}' { $$ = new StatementBlock(#$, *$2); delete $2; }
;
statement_list:
statement_list statement { $$ = $1; $1->push_back($2); }
| { $$ = new std::vector<Statement*>(); }
;
statement:
expression ';' { $$ = $1; }
| type T_IDENT ';' { $$ = new VarDeclaration(#$, $1, *$2); delete $2; }
| type T_IDENT '=' expression ';' { $$ = new VarDeclarationInit(#$, $1, *$2, $4); delete $2; }
| type T_IDENT '[' T_INT_CONST ']' ';' { $$ = new ArrayDeclaration(#$, $1, *$2, $4); delete $2; }
| T_WHILE '(' expression ')' statement { $$ = new WhileStatement(#$, $3, $5); }
| T_DO statement T_WHILE '(' expression ')' ';' { $$ = new DoWhileStatement(#$, $2, $5); }
| T_FOR '(' expression ';' expression ';' expression ')' statement { $$ = new ForStatement(#$, $3, $5, $7, $9); }
| statement_block { $$ = $1; }
| T_IF '(' expression ')' statement %prec T_IF { $$ = new IfStatement(#$, $3, $5); }
| T_IF '(' expression ')' statement T_ELSE statement { $$ = new IfElseStatement(#$, $3, $5, $7); }
| T_BREAK ';' { $$ = new BreakStatement(#$); }
| T_CONTINUE ';' { $$ = new ContinueStatement(#$); }
| T_SWITCH '(' expression ')' '{' statement_list '}' { $$ = new SwitchStatement(#$, $3, *$6); delete $6; }
| T_CASE T_INT_CONST ':' { $$ = new CaseStatement(#$, $2); }
| T_DEFAULT ':' { $$ = new DefaultStatement(#$); }
| T_RETURN expression ';' { $$ = new ReturnStatement(#$, $2); }
;
expression:
expression '=' expression { $$ = new AssignExpression(#$, $1, $3); }
| expression T_PLUS_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "+", $3)); }
| expression T_MINUS_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "-", $3)); }
| expression T_STAR_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "*", $3)); }
| expression T_DIV_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "/", $3)); }
| expression T_MOD_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "%", $3)); }
| expression T_BIT_AND_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "&", $3)); }
| expression T_BIT_OR_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "|", $3)); }
| expression T_BIT_XOR_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "^", $3)); }
| expression T_LEFT_SHIFT_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), "<<", $3)); }
| expression T_RIGHT_SHIFT_EQUALS expression { $$ = new AssignExpression(#$, $1, new BinaryOperatorExpression(#3, $1->clone(), ">>", $3)); }
| expression T_PLUS_PLUS %prec PREC_SUFFIX_PLUS_PLUS { $$ = new UnaryAssignExpression(#$, $1, "++"); }
| T_PLUS_PLUS expression %prec PREC_PREFIX_PLUS_PLUS { $$ = new UnaryAssignExpression(#$, "++", $2); }
| expression T_MINUS_MINUS %prec PREC_SUFFIX_MINUS_MINUS { $$ = new UnaryAssignExpression(#$, $1, "--"); }
| T_MINUS_MINUS expression %prec PREC_PREFIX_MINUS_MINUS { $$ = new UnaryAssignExpression(#$, "--", $2); }
| constant { $$ = $1; }
| '(' expression ')' { $$ = $2; }
| T_IDENT '(' argument_list ')' %prec PREC_APPLICATION { $$ = new FunctionCallExpression(#$, *$1, *$3); delete $1; delete $3; }
| T_SIZEOF '(' type ')' { $$ = new SizeofExpression(#$, $3); }
| '!' expression { $$ = new UnaryOperatorExpression(#$, "!", $2); }
| '~' expression { $$ = new UnaryOperatorExpression(#$, "~", $2); }
| '+' expression %prec PREC_UNARY_PLUS { $$ = new UnaryOperatorExpression(#$, "+", $2); }
| '-' expression %prec PREC_UNARY_MINUS { $$ = new UnaryOperatorExpression(#$, "-", $2); }
| '*' expression %prec PREC_DEREFERENCE { $$ = new ArraySubscriptExpression(#$, $2, new IntConstantExpression(#2, 0)); }
| '&' expression %prec PREC_ADDRESS { $$ = new UnaryOperatorExpression(#$, "&", $2); }
| expression '+' expression { $$ = new BinaryOperatorExpression(#$, $1, "+", $3); }
| expression '-' expression { $$ = new BinaryOperatorExpression(#$, $1, "-", $3); }
| expression '*' expression { $$ = new BinaryOperatorExpression(#$, $1, "*", $3); }
| expression '/' expression { $$ = new BinaryOperatorExpression(#$, $1, "/", $3); }
| expression '%' expression { $$ = new BinaryOperatorExpression(#$, $1, "%", $3); }
| expression '&' expression { $$ = new BinaryOperatorExpression(#$, $1, "&", $3); }
| expression '|' expression { $$ = new BinaryOperatorExpression(#$, $1, "|", $3); }
| expression '^' expression { $$ = new BinaryOperatorExpression(#$, $1, "^", $3); }
| expression T_BOOL_AND expression { $$ = new BinaryOperatorExpression(#$, $1, "&&", $3); }
| expression T_BOOL_OR expression { $$ = new BinaryOperatorExpression(#$, $1, "||", $3); }
| expression T_LEFT_SHIFT expression { $$ = new BinaryOperatorExpression(#$, $1, "<<", $3); }
| expression T_RIGHT_SHIFT expression { $$ = new BinaryOperatorExpression(#$, $1, ">>", $3); }
| expression T_EQUAL expression { $$ = new BinaryOperatorConditionExpression(#$, $1, "==", $3); }
| expression T_NOT_EQUAL expression { $$ = new BinaryOperatorConditionExpression(#$, $1, "!=", $3); }
| expression '<' expression { $$ = new BinaryOperatorConditionExpression(#$, $1, "<", $3); }
| expression '>' expression { $$ = new BinaryOperatorConditionExpression(#$, $1, ">", $3); }
| expression T_LESS_OR_EQUAL expression { $$ = new BinaryOperatorConditionExpression(#$, $1, "<=", $3); }
| expression T_GREATER_OR_EQUAL expression { $$ = new BinaryOperatorConditionExpression(#$, $1, ">=", $3); }
| T_IDENT { $$ = new VarExpression(#$, *$1); delete $1; }
| expression '.' T_IDENT { $$ = new StructMemberExpression(#$, $1, *$3); delete $3; }
| expression T_ARROW T_IDENT { $$ = new StructMemberExpression(#$, $1, *$3); delete $3; }
| expression '[' expression ']' { $$ = new ArraySubscriptExpression(#$, $1, $3); }
;
argument_list:
non_empty_argument_list { $$ = $1; }
| { $$ = new std::vector<Expression*>(); }
;
non_empty_argument_list:
non_empty_argument_list ',' expression { $$ = $1; $1->push_back($3); }
| expression { $$ = new std::vector<Expression*>({$1}); }
;
I created a small compiler and need help to fix it.
Code of my compiler:
t.l:
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "y.tab.h"
%}
%x DOUBLE_QUOTES
%%
<INITIAL>[s|S][h|H][o|O][w|W] {return show;}
<INITIAL>[a-zA-z] {yylval.id=yytext[0];return identifier;}
<INITIAL>[0-9]+ {yylval.num=atoi(yytext);return number;}
<INITIAL>[\-\+\=\;\*\/] {return yytext[0];}
<INITIAL>["] {
printf("(STRING_OPEN) ");
BEGIN(DOUBLE_QUOTES);
}
<DOUBLE_QUOTES>["] {
printf("(STRING_CLOSE) ");
BEGIN(INITIAL);
printf("(STRING:%S) ",yytext[1]);
}
%%
int yywrap (void) {return 1;}
t.y:
%{
void yyerror(char *s);
#include <stdio.h>
#include <stdlib.h>
int symbols[52];
int symbolVal(char symbol);
void updateSymbolVal(char symbol,int val);
%}
%union {int num;char id;}
%start line
%token show
%token <num> number
%token <id> identifier
%type <num> line exp term
%type <id> assignment
%%
line : assignment ';' {;}
| show exp ';' {printf("showing : %d\n",$2);}
| line assignment ';' {;}
| line show exp ';' {printf("showing : %d\n",$3);}
;
assignment: identifier '=' exp {updateSymbolVal($1,$3);}
;
exp : term {$$ = $1;}
| exp '+' term {$$ = $1 + $3;}
| exp '-' term {$$ = $1 - $3;}
| exp '*' term {$$ = $1 * $3;}
| exp '/' term {$$ = $1 / $3;}
;
term : number {$$ = $1;}
| identifier {$$ = symbolVal($1);}
%%
int computerSymbolIndex(char token)
{
int idx=-1;
if(islower(token))
{
idx=token-'a'+26;
}
else if(isupper(token))
{
idx = token - 'A';
}
return idx;
}
int symbolVal(char symbol)
{
int bucket = computerSymbolIndex(symbol);
return symbols[bucket];
}
void updateSymbolVal(char symbol,int val)
{
int bucket = computerSymbolIndex(symbol);
symbols[bucket] = val;
}
int main (void) {
printf("Created By BoxWeb Inc\n");
int i;
for(i=0;i<52;i++)
{
symbols[i]=0;
}
return yyparse();
}
void yyerror (char *s) {printf("-%s at %s !\n",s );}
command for test compiler :
show 5+5;
show 5*2;
show 5+5-2*2/1;
i need to upgrade to (want can print string):
show "hello" . " " . "mr";//hello mr
show 5+5 . " ?";//10 ?
and more....
In the lexer I use :
<INITIAL>["] {
printf("(STRING_OPEN) ");
BEGIN(DOUBLE_QUOTES);
}
<DOUBLE_QUOTES>["] {
printf("(STRING_CLOSE) ");
BEGIN(INITIAL);
printf("(STRING:%S) ",yytext[1]);
}
but I don't know how use this in a parser.
Please help me to complete this compiler.
Lets simplify it for a moment to just one possible operation
We have the following grammar
assignment: '$' identifier '=' exp ';' {updateSymbolVal($2,$4); }
;
exp: number {$$ = createExp($1);}
| string {$$ = createExp($1);}
| exp '+' exp {$$ = addExp($1,$3);}
;
Since the expression can be many different things we can't just save it in a integer but need a more complex structure, something like this:
enum expType {NUMBER, STRING};
struct Exp{
expType type;
double number;
std::string str;
};
Then we make the functions to create your expressions:
Exp* createExp(int v){
Exp *e = new Exp();
e->type = NUMBER;
e->number = v;
return e;
}
Exp* createExp(std::string s){
Exp *e = new Exp();
e->type = STRING;
e->str = s;
return e;
}
And then to do all your calculations and assignment you will always have to check the type.
Exp* addExp(Exp *a, Exp *b){
Exp *c;
if(a->type == NUMBER && b->type == NUMBER){
c->type == NUMBER;
c->number == a->number + b->number;
}
else{
std::cout << "some nice error message\n";
}
return c;
}
Same with the assign function
void updateSymbolVal(const std::string &identifier, Exp *e){
if(e->type == NUMBER){
myNumbers[identifier] = e->number;
}
if(e->type == STRING){
myStrings[identifier] = e->str;
}
}
Of course you could also make a map/vector/array of the struct Exp if you need to do some more manipulations with it. Or just hand it over to the next level.
Edit for the question of multi-language support
As written in the comment I refer to this question Flex(lexer) support for unicode. To simplify it to your need here you can make it like this.
ASC [a-zA-Z_0-9]
U [\x80-\xbf]
U2 [\xc2-\xdf]
U3 [\xe0-\xef]
U4 [\xf0-\xf4]
UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
UANY+ {yylval.id = yytext[0]; return string;}
I made a .l and a .y files for the parsing and the calculation result for my new language: it is working fine! A string like this:
SET(IFEL(MAJEQ(IFEL(EQ(VAL(16),MUL(VAL(2),VAL(8))),VAL(11),VAL(10)),VAL(10)),MUL(VAL(3),VAL(4)),SUB(VAL(6),VAL(2))))
is correctly parsed and calculated by my two files:
%{
#include <stdio.h>
#include <string>
#include <cstring>
using namespace std;
extern int yylex();
extern void yyerror(char*);
%}
//Symbols
%union
{
char *str_val;
int int_val;
};
%token OPEN;
%token CLOSE;
%token SET;
%token STORE;
%token MUL;
%token ADD;
%token DIV;
%token SUB;
%token ABS;
%token IFEL;
%token AND;
%token OR;
%token NOT;
%token MAJEQ;
%token MINEQ;
%token MAJ;
%token MIN;
%token EQ;
%token GET;
%token S; /* separator */
%token VAR;
%token VAL;
%token <int_val> NUMBER
%token <str_val> IDENTIFIER
%type <int_val> Exp
%type <int_val> Cond
%type <int_val> Check
%type <int_val> Var
%start Expression
%%
Expression:
/* empty */
| SET OPEN Exp CLOSE
{
printf("value set %d\n",$3);
}
| STORE OPEN VAR OPEN IDENTIFIER CLOSE S Exp CLOSE
{
printf("var %s set on %d\n",$5,$8);
}
;
Exp:
Var
| IFEL OPEN Cond S Exp S Exp CLOSE
{
if($3==1){
$$ = $5;
}else{
$$ = $7;
}
}
| ADD OPEN Exp S Exp CLOSE
{
$$ = $3+$5;
}
| SUB OPEN Exp S Exp CLOSE
{
$$ = $3-$5;
}
| MUL OPEN Exp S Exp CLOSE
{
$$ = $3*$5;
}
| DIV OPEN Exp S Exp CLOSE
{
$$ = $3/$5; //TBD check div 0
}
| ABS OPEN Exp CLOSE
{
$$ = $3; //TBD
}
;
Cond:
NOT OPEN Cond CLOSE
{
int result = $3;
if(result==1) $$ = 0;
else $$ = 1;
}
| AND OPEN Cond S Cond CLOSE
{
int result1 = $3;
int result2 = $5;
if(result1==1 && result2==1) $$ = 1;
else $$ = 0;
}
| OR OPEN Cond S Cond CLOSE
{
int result1 = $3;
int result2 = $5;
if(result1==1 || result2==1) $$ = 1;
else $$ = 0;
}
| Check
;
Check:
MAJ OPEN Exp S Exp CLOSE
{
int val1 = $3;
int val2 = $5;
if(val1>val2) $$ = 1;
else $$ = 0;
}
| MIN OPEN Exp S Exp CLOSE
{
int val1 = $3;
int val2 = $5;
if(val1<val2) $$ = 1;
else $$ = 0;
}
| EQ OPEN Exp S Exp CLOSE
{
int val1 = $3;
int val2 = $5;
if(val1==val2) $$ = 1;
else $$ = 0;
}
| MAJEQ OPEN Exp S Exp CLOSE
{
int val1 = $3;
int val2 = $5;
if(val1>=val2) $$ = 1;
else $$ = 0;
}
| MINEQ OPEN Exp S Exp CLOSE
{
int val1 = $3;
int val2 = $5;
if(val1<=val2) $$ = 1;
else $$ = 0;
}
;
Var:
VAR OPEN IDENTIFIER CLOSE
{
$$ = atoi($3); //TBD
}
| VAL OPEN NUMBER CLOSE
{
$$ = $3;
}
| GET OPEN CLOSE
{
$$ = 11; //TBD
}
;
%%
and
%{
#include <string>
#include "expression.tab.h"
void yyerror(char*);
extern void printVars();
int yyparse(void);
%}
%%
[ \t\n]+ { /* ignore */ };
"(" return(OPEN);
")" return(CLOSE);
"SET" return(SET);
"STORE" return(STORE);
"MUL" return(MUL);
"ADD" return(ADD);
"DIV" return(DIV);
"SUB" return(SUB);
"ABS" return(ABS);
"IFEL" return(IFEL);
"NOT" return(NOT);
"AND" return(AND);
"OR" return(OR);
"MAJEQ" return(MAJEQ);
"MINEQ" return(MINEQ);
"MAJ" return(MAJ);
"MIN" return(MIN);
"EQ" return(EQ);
"VAR" return(VAR);
"VAL" return(VAL);
"GET" return(GET);
"," return(S);
[[:digit:]]+ { yylval.int_val = atoi(yytext); return NUMBER;}
[[:alnum:]]+ { yylval.str_val = strdup(yytext); return IDENTIFIER;}
. return yytext[0];
%%
void yyerror(char *s){
printf("<ERR> %s at %s in this line:\n", s, yytext);
}
int yywrap (void){
}
int main(int num_args, char** args){
if(num_args != 2) {printf("usage: ./parser filename\n"); exit(0);}
FILE* file = fopen(args[1],"r");
if(file == NULL) {printf("couldn't open %s\n",args[1]); exit(0);}
yyin = file;
yyparse();
fclose(file);
}
But actually the value inside Var in input as you can see will not be static but should be dynamic. So my next step is to modify the project: instead of calculating should write a C++ code in order to make the calculation dynamic.
My questions:
1) do you have a better solution instead of concatenate every step a char * for making the code?
2) If not, can you help me to find a smart way to concatenate all the strings and solving the following error that I face while compiling:
expression.y:75:43: error: invalid operands of types ‘const char [2]’
and ‘char*’ to binary ‘operator+’ $$ = "("+$3+"-"+$5+")";
... I would like to don't use the "malloc" every time...
char* str;
str = malloc(1+strlen(text1)+strlen(text2));
strcpy(str, text1);
strcat(str, text2);
is there any smarter way? Following the flex and bison modified files:
expression.l
%{
#include <string>
#include "expression.tab.h"
void yyerror(char*);
extern void printVars();
int yyparse(void);
%}
%%
[ \t\n]+ { /* ignore */ };
"(" return(OPEN);
")" return(CLOSE);
"SET" return(SET);
"STORE" return(STORE);
"MUL" return(MUL);
"ADD" return(ADD);
"DIV" return(DIV);
"SUB" return(SUB);
"ABS" return(ABS);
"IFEL" return(IFEL);
"NOT" return(NOT);
"AND" return(AND);
"OR" return(OR);
"MAJEQ" return(MAJEQ);
"MINEQ" return(MINEQ);
"MAJ" return(MAJ);
"MIN" return(MIN);
"EQ" return(EQ);
"VAR" return(VAR);
"VAL" return(VAL);
"GET" return(GET);
"," return(S);
([a-z0-9]+)|([0-9]+.[0-9]+) { yylval.str_val = strdup(yytext); return IDENTIFIER;}
. return yytext[0];
%%
void yyerror(char *s){
printf("<ERR> %s at %s in this line:\n", s, yytext);
}
int yywrap (void){
}
int main(int num_args, char** args){
if(num_args != 2) {printf("usage: ./parser filename\n"); exit(0);}
FILE* file = fopen(args[1],"r");
if(file == NULL) {printf("couldn't open %s\n",args[1]); exit(0);}
yyin = file;
yyparse();
fclose(file);
}
expression.y
%{
#include <stdio.h>
#include <string>
#include <cstring>
using namespace std;
extern int yylex();
extern void yyerror(char*);
%}
//Symbols
%union
{
char *str_val;
int int_val;
};
%token OPEN;
%token CLOSE;
%token SET;
%token STORE;
%token MUL;
%token ADD;
%token DIV;
%token SUB;
%token ABS;
%token IFEL;
%token AND;
%token OR;
%token NOT;
%token MAJEQ;
%token MINEQ;
%token MAJ;
%token MIN;
%token EQ;
%token GET;
%token S; /* separator */
%token VAR;
%token VAL;
%token <str_val> IDENTIFIER
%type <str_val> Exp
%type <str_val> Cond
%type <str_val> Check
%type <str_val> Var
%start Expression
%%
Expression:
/* empty */
| SET OPEN Exp CLOSE
{
printf("%s\n",$3);
}
| STORE OPEN VAR OPEN IDENTIFIER CLOSE S Exp CLOSE
{
printf("var %s with following code:\n%s\n",$5,$8);
}
;
Exp:
Var
| IFEL OPEN Cond S Exp S Exp CLOSE
{
$$ = "("+$3+" == 'true') ? ("+$5+") : ("+$7+")";
}
| ADD OPEN Exp S Exp CLOSE
{
$$ = "("+$3+"+"+$5+")";
}
| SUB OPEN Exp S Exp CLOSE
{
$$ = "("+$3+"-"+$5+")";
}
| MUL OPEN Exp S Exp CLOSE
{
$$ = "("+$3+"*"+$5+")";
}
| DIV OPEN Exp S Exp CLOSE
{
$$ = "("+$3+"/"+$5+")"; //TBD check div 0
}
| ABS OPEN Exp CLOSE
{
$$ = "("+$3+">0) ? "+$3+" : "(+$3+"*(-1))";
}
;
Cond:
NOT OPEN Cond CLOSE
{
$$ = "("+$3+"=='true') ? 'false' : 'true'";
}
| AND OPEN Cond S Cond CLOSE
{
$$ = "("+$3+"=='true' && "+$5+"=='true') ? 'true' : 'false'";
}
| OR OPEN Cond S Cond CLOSE
{
$$ = "("+$3+"=='true' || "+$5+"=='true') ? 'true' : 'false'";
}
| Check
;
Check:
MAJ OPEN Exp S Exp CLOSE
{
$$ = "("+$3+">"+$5+") ? 'true' : 'false'";
}
| MIN OPEN Exp S Exp CLOSE
{
$$ = "("+$3+"<"+$5+") ? 'true' : 'false'";
}
| EQ OPEN Exp S Exp CLOSE
{
$$ = "("+$3+"=="+$5+") ? 'true' : 'false'";
}
| MAJEQ OPEN Exp S Exp CLOSE
{
$$ = "("+$3+">="+$5+") ? 'true' : 'false'";
}
| MINEQ OPEN Exp S Exp CLOSE
{
$$ = "("+$3+"<="+$5+") ? 'true' : 'false'";
}
;
Var:
VAR OPEN IDENTIFIER CLOSE
{
//TBD check if variable exists in the engine
$$ = $3;
}
| VAL OPEN IDENTIFIER CLOSE
{
//TBD check correct value
$$ = $3;
}
| GET OPEN CLOSE
{
$$ = "getField()"; //TBD to implement in the engine
}
;
%%
It's difficult to do string concatenation without some form of memory allocation. Of course, it is possible avoid avoid malloc -- you could use new instead, or hide the memory allocation inside of a std::string or std::stringstream -- but in the end, you're going to have to deal with dynamic memory allocation, and furthermore with releasing the memory when you no longer need it.
It's worth noting that your (correct) use of strdup in your scanner action for IDENTIFIER is a memory leak, because you never free the allocated memory. So you already need to deal with this issue.
As you note, doing string concatenation in C can be pretty clunky. In a case like this, it's worth the trouble to reduce the clunkiness. My preferred solution is my wrapper function concatf, whose prototype is just like printf except that it returns a malloc'd character string instead of printing. (See this answer for implementations on various platforms).
With the help of this function, it would be possible to write:
Exp:
Var
| IFEL OPEN Cond S Exp S Exp CLOSE
{
$$ = concatf("(%s == 'true') ? (%s) : (%s)", $3, $5, $7);
}
Note that x == 'true' is not valid C++. You probably meant == true, but that's a dangerous idiom; better is an explicit cast to bool (although that's actually redundant in the context of the ternary operator), so I think you actually want
$$ = concatf("bool(%s) ? (%s) : (%s)", $3, $5, $7);
or just
$$ = concatf("(%s) ? (%s) : (%s)", $3, $5, $7);
But, as mentioned above, that results in memory leaks because the malloc'd strings are never freed. So let's fix that. First, in each action, it is necessary to explicitly free all malloc'd values which are never used again. In simple cases like yours, that will be all malloc'd values, except for unit productions in which the malloc'd value is just assigned to a different non-terminal. Since all IDENTIFIER have semantic values created by strdup, it's reasonable to assume that all str_val values have been malloc'd (and this needs to be a constraint; if you ever create a str_val value from a literal character string, you'll end up with a problem). Now, we can write the rule:
Exp:
Var { /* No free needed; this is a unit production */ }
| IFEL OPEN Cond S Exp S Exp CLOSE
{
$$ = concatf("(%s) ? (%s) : (%s)", $3, $5, $7);
free($3); free($5); free($7);
}
Another example. Note the added strdup in the last rule.
Var:
VAR OPEN IDENTIFIER CLOSE
{
$$ = $3; /* No free needed; value is moved on the stack */
}
| VAL OPEN IDENTIFIER CLOSE
{
$$ = $3; /* As above */
}
| GET OPEN CLOSE
{
$$ = strdup("getField()"); /* str_val's must be malloc'd */
}
;
(There are alternatives to calling strdup on literals, but usually the use case is uncommon, and the overhead is slight.)
That style will handle all cases where rule actions are executed, but there are also occasions when bison will discard values from the stack without every invoking a rule. That will happen during error recovery, and at the end of an unsuccessful parse when the parser stack is non-empty. To assist with this case, bison lets you declare a destructor action, which will be invoked on each stack value which it discards. In this case, the declaration is almost trivial:
%destructor { free($$); } <str_val>
Well... I solved the issue in this way:
...
Exp:
Var
| IFEL OPEN Cond S Exp S Exp CLOSE
{
string t1 = $3;
string t2 = $5;
string t3 = $7;
string result = "("+t1+" == 'true') ? ("+t2+") : ("+t3+")";
$$ = (char*)result.c_str();
}
...
It is working fine...
For some reason or another, bison doesn't want to do any evaluation. Compilation of all files goes smoothly and the program runs. When I enter the expression 4+5 and press return, it creates tokens for 4 + 5 respectively. I can even put in some printf into the places where bison recognizes the attributes of each token including the plus (43).
However the program never evaluates this production expr '+' term { $$ = $1 + $3; }. It's simply never called at least to my knowledge and even if it was this production assign '\n' { printf("%d\n", $1); } never prints out the value. Upon ^D to quit, it fires void yyerror(const char *).
Any help on this matter is much appreciated. Thanks!
//FLEX
%{
//#include <stdio.h>
#include "y.tab.h"
%}
%option noyywrap
letter [A-Za-z]
digit [0-9]
space [ \t]
var {letter}
int {digit}+
ws {space}+
%%
{var} { yylval = (int)yytext[0]; return VAR; }
{int} { yylval = atoi(yytext); return CONST; }
{ws} { }
. { return (int)yytext[0]; }
%%
/* nothing */
.
//BISON
%{
//INCLUDE
//#include <ctype.h>
//DEFINE
#define YYDEBUG 1
//PROTOTYPE
void yyerror(const char *);
void print_welcome();
int get_val(int);
void set_val(int, int);
%}
%token CONST
%token VAR
%%
session
: { print_welcome(); }
eval
;
eval
: eval line
|
;
line
: assign '\n' { printf("%d\n", $1); }
;
assign
: VAR '=' expr { set_val($1, $3); $$ = $3; }
| expr { $$ = $1; }
;
expr
: expr '+' term { $$ = $1 + $3; }
| expr '-' term { $$ = $1 - $3; }
| term { $$ = $1; }
;
term
: term '*' factor { $$ = $1 * $3; }
| term '/' factor { $$ = $1 / $3; }
| term '%' factor { $$ = $1 % $3; }
| factor { $$ = $1; }
;
factor
: '(' expr ')' { $$ = $2; }
| CONST { $$ = $1; }
| VAR { $$ = get_val($1); }
;
%%
void yyerror(const char * s)
{
fprintf(stderr, "%s\n", s);
}
void print_welcome()
{
printf("Welcome to the Simple Expression Evaluator.\n");
printf("Enter one expression per line, end with ^D\n\n");
}
static int val_tab[26];
int get_val(int var)
{
return val_tab[var - 'A'];
}
void set_val(int var, int val)
{
val_tab[var - 'A'] = val;
}
.
//MAIN
//PROTOTYPE
int yyparse();
int main()
{
extern int yydebug;
yydebug = 0;
yyparse();
return 0;
}
Your lex file does not have any rule which matches \n, because in lex/flex, . matches any character except line-end. The default rule for lex (or flex) echoes and otherwise ignores the matched character, so that's what happens to the \n. Since the parser won't be able to accept a line unless it sees a \n token, it will eventually be forced to present you with a syntax error.
So you need to change the rule
. { return (int)yytext[0]; }
to
.|\n { return (int)yytext[0]; }
(I wouldn't have bothered with the cast to int but it's certainly not doing any harm, so I left it in.)