Lex only detects symbols when there is whitespace between them - regex

I want Lex, when given an input of "foo+1", to first return the identifier "foo", then the character '+', and then the integer 1. This works if I lex "foo + 1", but for some reason with the grammar I have, it doesn't work if I omit the spaces, and it skips over the '+', just returning "foo" and then 1. I can't figure out why. Is there anything here that seems problematic?
%{
/* Prologue: copied verbatim into the generated scanner source. */
#include "expression.h"
#include "picoScanner.h"
/* Not referenced by any rule in this listing. */
static int block_comment_num = 0;
/* Decodes the text of a character literal; defined after the second %%. */
static char to_char(char *str);
/* Current line number; bumped by the newline rule below. */
int yylineno = 0;
/* The LINE_COMMENT and BLOCK_COMMENT start conditions declared below
   have no rules in this listing. */
%}
%option nodefault yyclass="FlexScanner" noyywrap c++
%x LINE_COMMENT
%x BLOCK_COMMENT
%%
    /* Reserved words come first: when a keyword and one of the identifier
       patterns match the same length of text, flex picks the rule that is
       listed earliest. */
Any { return pico::BisonParser::token::ANY; }
Int { return pico::BisonParser::token::INT; }
Float { return pico::BisonParser::token::FLOAT; }
Char { return pico::BisonParser::token::CHAR; }
List { return pico::BisonParser::token::LIST; }
Array { return pico::BisonParser::token::ARRAY; }
Table { return pico::BisonParser::token::TABLE; }
alg { return pico::BisonParser::token::ALG; }
if { return pico::BisonParser::token::IF; }
then { return pico::BisonParser::token::THEN; }
else { return pico::BisonParser::token::ELSE; }
is { return pico::BisonParser::token::IS; }
or { return pico::BisonParser::token::OR; }
and { return pico::BisonParser::token::AND; }
not { return pico::BisonParser::token::NOT; }
when { return pico::BisonParser::token::WHEN; }
[A-Z][a-zA-Z0-9_]* { yylval->strval = new std::string(yytext);
return pico::BisonParser::token::TYPENAME; }
[a-z_][a-zA-Z0-9_]* { printf("saw '%s'\n", yytext); yylval->strval = new std::string(yytext);
return pico::BisonParser::token::ID; }
"==" { return pico::BisonParser::token::EQ; }
"<=" { return pico::BisonParser::token::LEQ; }
">=" { return pico::BisonParser::token::GEQ; }
"!=" { return pico::BisonParser::token::NEQ; }
"->" { return pico::BisonParser::token::RETURN; }
[\+\-\*/%] { return yytext[0]; }
    /* NOTE(review): the optional sign lets this rule match +1 or -1 as a
       single token. Flex prefers the longest match, so in foo+1 the text +1
       beats the one-character operator rule above and no + token is
       returned. */
[-+]?[0-9]+ { yylval->ival = atoi(yytext);
return pico::BisonParser::token::INT_LITERAL; }
    /* NOTE(review): a bare digit run also matches the first alternative
       here, but it ties with the integer rule above, which is listed first
       and therefore wins. */
([0-9]+|([0-9]*\.[0-9]+)([eE][-+]?[0-9]+)?) { yylval->fval = atof(yytext);
return pico::BisonParser::token::FLOAT_LITERAL; }
\"(\\.|[^\\"])*\" { yylval->strval = new std::string(strndup(yytext+1, strlen(yytext) - 2));
return pico::BisonParser::token::STRING_LITERAL; }
\'(\\.|[^\\'])*\' { yylval->cval = to_char(yytext+1);
return pico::BisonParser::token::CHAR_LITERAL; }
[ \t\r]+ { /* ignore */ }
\n { yylineno++; }
    /* Catch-all: any other single character is logged and returned as its
       own token. */
. { printf("~~~~~~~~~~munched %s\n", yytext); return yytext[0]; }
%%
/*
 * Decode the text of a character literal into the char it denotes.
 *
 * str points just past the opening quote of the matched literal (the scanner
 * rule passes yytext+1), so the closing quote is still part of the string.
 *
 * Returns the literal character, or the character selected by a backslash
 * escape. Any error is reported on stderr and terminates the process with
 * exit(1); the function never returns an error value.
 */
static char to_char(char *str) {
    /* At most the closing quote is left, i.e. the literal had no body. */
    if (strlen(str) <= 1) {
        fprintf(stderr, "Error: empty character constant (line %d)\n", yylineno);
        exit(1);
    }

    /* Ordinary character: it stands for itself. */
    if (str[0] != '\\') {
        return str[0];
    }

    /*
     * Escape sequence. The length check above already guarantees that str[1]
     * exists, so no further length test is needed here.
     */
    switch (str[1]) {
    case 'n': return '\n';
    case 'r': return '\r';
    case 't': return '\t';
    case 'a': return '\a';
    case 'b': return '\b';
    case 'f': return '\f';
    case 'v': return '\v';
    case '\'': return '\'';
    case '"': return '"';
    case '\\': return '\\';
    case '?': return '\?';
    case 'x':
        fprintf(stderr, "Error: unicode not yet supported (line %d)\n", yylineno);
        exit(1);
    default:
        fprintf(stderr, "Error: unrecognized escape sequence '\\%c' (line %d)\n",
                str[1], yylineno);
        exit(1);
    }
}

I am not familiar with lex, but I'm pretty sure the following causes the error:
[-+]?[0-9]+ { yylval->ival = atoi(yytext);
return pico::BisonParser::token::INT_LITERAL; }
foo is parsed as an identifier, but then "+1" is parsed as an int literal (due to the atoi conversion, the sign is discarded).
It is probably a good idea to only consider unsigned numeric literals at a lexer level, and handle signs at the level of the parser (treating the + and - tokens differently depending on their context).
Not only does this resolve the ambiguity, but it also enables you to "correctly" (in the sense that these are legal in C, C++, Java etc.) parse integer literals such as - 5 instead of -5.
Moreover: are the escaping backslashes in the arithmetic operator rule really necessary? Afaik, the only characters with special meaning inside a character class are -, ^, and ] (but I might be wrong).

Looks to me like it's matching foo+1 as foo and +1 (an INT_LITERAL). See related thread: Is it possible to set priorities for rules to avoid the "longest-earliest" matching pattern?
You could add an explicit rule to match + as a token, otherwise it sounds like Lex is going to take the longest match it can (+1 is longer than +).

Related

How to perform other operations when my tokenizer recognizes a token?

I have written a simple tokenizer that will split a command line into separate lines each containing a single word. I am trying to ...
Make the program close if the first word of a command line is "quit"
Recognize instructions such as "Pickup", "Save", and "Go" in which the compiler will then look to the next token.
My idea has been to use a simple switch with cases to check for these commands, but I cannot figure out where to place it.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/* The command line currently being tokenized. */
char command[256];
/* Scan position inside command; getToken() resumes from here. */
int commandIndex;
/* Storage for the word returned by getToken(); overwritten on every call. */
char token[32];

/*
 * Returns 1 when character is a blank, a tab, or any other character that
 * compares below ' ' (this includes the terminating NUL), and 0 otherwise.
 */
int isWhiteSpace(char character) {
    return (character == ' ' || character == '\t' || character < ' ') ? 1 : 0;
}

/*
 * Extracts the next whitespace-delimited word from command, starting at
 * commandIndex, and advances commandIndex past it.
 *
 * Returns a pointer to the shared token buffer. The buffer holds an empty
 * string when no word is left. A word longer than the buffer is truncated to
 * sizeof(token) - 1 characters; the remainder of that word is still consumed
 * so that the next call starts at the following word.
 */
char* getToken() {
    const int commandSize = (int)sizeof(command);
    const int tokenCapacity = (int)sizeof(token) - 1; /* keep room for the NUL */
    int index = 0;

    /* Skip white space. */
    while (commandIndex < commandSize && isWhiteSpace(command[commandIndex])) {
        commandIndex++;
    }

    /* If at end of line, return an empty token. */
    if (commandIndex >= commandSize) {
        token[0] = 0;
        return token;
    }

    /* Capture the token, never writing past the end of the token buffer. */
    while (commandIndex < commandSize && !isWhiteSpace(command[commandIndex])) {
        if (index < tokenCapacity) {
            token[index] = command[commandIndex];
            index++;
        }
        commandIndex++;
    }
    token[index] = 0;
    return token;
}
void main() {
printf("Zeta - Version 2.0\n");
while(1) {
printf("Command: ");
gets_s(command);
commandIndex = 0;
char* token = getToken();
while (strcmp(token,"") != 0) {
printf("%s\n", token);
token = getToken();
}
}
}
A little reorganization of the loop you have in main will do it.
// Command loop: reads a line, takes its first word as the command and
// dispatches on it until "quit" is entered.
int main() {
printf("Zeta - Version 2.0\n");
bool done = false;
while (!done) {
printf("Command: ");
gets_s(command);
commandIndex = 0;
// Only the first word of the line selects the command.
char* token = getToken();
if (strcmp(token, "quit") == 0) {
// Ends the loop once this iteration finishes.
done = true;
} else if (strcmp(token, "pickup") == 0) {
doPickup();
} else if (strcmp(token, "save") == 0) {
// The next word on the same line is handed to doSave as the file name.
char * filename = getToken();
doSave(filename);
} ...
}
return 0;
}
You can't use a switch statement with strings, so you just use a bunch of if ... else if ... statements to check for each command. There are other approaches, but this one required the fewest changes from the code you already have.
In the example, under the handling for "save" I showed how you can just call getToken again to get the next token on the same command line.
(Note that I also fixed the return value for main. Some compilers will let you use void, but that's not standard so it's best if you don't do that.)

how to convert returned ascii arithmetic symbols, so it can be used with operands

How could I use arithmetic operation symbols if its assigned to variable and need to return for evaluation of simple arithmetic problems --> from the way its constructed in the code below. How? or any other suggestions welcomed thanks in advance
int arithmeticType();
/*
 * Asks the user for an operation type and prints the chosen symbol.
 */
int main() {
    /*
     * Seed the generator before arithmeticType() runs: its "mixture" option
     * calls rand(), so seeding afterwards would have no effect on the choice.
     */
    srand(time(NULL) );
    int arithmeticSymbol = arithmeticType();
    int x, y;
    int result = 0;
    printf( "Type is: %d\n", arithmeticSymbol );
    //how to get result of (x + y)using arithmeticSymbol?????
    printf( "The result %d %d %d", 4, arithmeticSymbol, 3 );
    return 0;
}
/*
 * Prompts for the kind of arithmetic operation and reads the choice from
 * standard input.
 *
 * Returns the character code of the chosen operator: '+', '-' or '*'.
 * Choice 4 picks one of the three with rand(). For any other choice, or when
 * no number can be read, "Wrong input" is printed and 0 is returned, so the
 * caller never receives an indeterminate value.
 */
int arithmeticType() {
    int type = 0;

    printf("Select the type of arithmetic operation to perform:\n"
           "\t1. Addition.\n\t2. Subtraction.\n\t3. Multiplication.\n\t"
           "4. Mixture of all three. --> ");

    /* A failed read is handled exactly like an out-of-range choice. */
    if (scanf("%d", &type) != 1) {
        type = 0;
    }

    switch ( type ) {
    case 1:
        return '+';
    case 2:
        return '-';
    case 3:
        return '*';
    case 4:
        /* Index 0..2 maps to '+', '-', '*' in that order. */
        return "+-*"[rand() % 3];
    default:
        printf("Wrong input");
        return 0;
    }
}
Write a function to apply the operator. You also need to consider your types - if you just use integers, things like 3/2 may not do what you expect...
If it was me I'd probably use an enum instead of the ASCII operator code.
/*
 * Applies the operator whose character code is op to op1 and op2.
 *
 * Supports '+', '-' and '*'. Any other operator code yields 0.
 */
int getResult(int op1, int op, int op2)
{
    switch (op) {
    case '+':
        return op1 + op2;
    case '-':
        return op1 - op2;
    case '*':
        return op1 * op2;
    default:
        return 0;
    }
}
You would call it like: printf( "The result of %d %c %d is: %d", 4, getOperation, 3, getResult(4, getOperation, 3) ); (Note that printing the operator with %d won't do what you expect, and the name getOperation is very misleading.)

c++ The evaluation of expression. How to exit the while loop?

I use stack to evaluate an expression.
The most important function is below:
// Evaluates an infix expression read one character at a time from standard
// input, using an operator stack (OPTR) and an operand stack (OPND).
// '#' is pushed as the bottom marker of OPTR; the loop runs until the current
// input character and the top of OPTR are both '#'.
// Returns the value left on top of the operand stack.
double Expression_Eval()
{
SeqStack<char,100> OPTR;
SeqStack<double,100> OPND;
OPTR.Push('#');
char ch;
ch=getchar();
while (ch!='#' || OPTR.GetTop()!='#')
{
// Every character InOPTR rejects is taken as a one-digit operand (ch-'0').
// NOTE(review): this covers non-digits as well (a newline, for instance),
// and multi-digit numbers are pushed one digit at a time.
if (!InOPTR(ch))
{
int n=ch-'0';
double num=(double)n;
OPND.Push(num);
ch=getchar();
}
else
{
// Compare the operator on top of the stack with the incoming one.
// NOTE(review): there is no default case, so a Precede result other than
// '<', '=' or '>' leaves ch and both stacks unchanged and the loop repeats.
char pre_op=OPTR.GetTop();
switch (Precede(pre_op, ch))
{
// Incoming operator binds tighter: stack it and read on.
case '<': OPTR.Push(ch);
ch=getchar();
break;
// Matching pair: drop the stacked operator and read on.
case '=': OPTR.Pop();
ch=getchar();
break;
// Stacked operator goes first: apply it to the two topmost operands.
// NOTE(review): a new character is read afterwards, so the operator held
// in ch that triggered this reduction is never pushed - confirm intent.
case '>': double b=OPND.Pop();
double a=OPND.Pop();
pre_op=OPTR.Pop();
OPND.Push(Operate(a, pre_op, b));
ch=getchar();
break;
}
}
}
return OPND.GetTop();
}
Then, when I input 8/(5-3)#, it will not print the result.
I think the loop termination condition ch!='#' || OPTR.GetTop()!='#' is wrong.
When I press Enter, getchar() get the last char is CR but not #.
But, I don't know how to revise it to make my program work.
The other part of my program is below:
#include<iostream>
using namespace std;
// Fixed-capacity stack stored in an in-object array.
//
// DataType  - element type
// StackSize - maximum number of elements
//
// Every misuse (push on a full stack, pop or top on an empty one) is
// reported by throwing the string literal "error".
template<typename DataType,int StackSize>
class SeqStack
{
private:
    DataType data[StackSize];
    int top;    // index of the topmost element, -1 while the stack is empty
public:
    SeqStack() : top(-1) {}
    ~SeqStack() {}

    // Puts x on top of the stack; throws if the stack is already full.
    void Push(DataType x)
    {
        if (top == StackSize-1)
            throw "error";
        data[++top] = x;
    }

    // Removes and returns the topmost element; throws if the stack is empty.
    DataType Pop()
    {
        if (top == -1)
            throw "error";
        return data[top--];
    }

    // Returns the topmost element without removing it; throws if the stack
    // is empty, so control can no longer fall off the end of the function
    // without returning a value.
    DataType GetTop()
    {
        if (top == -1)
            throw "error";
        return data[top];
    }
};
// Tells whether ch is one of the characters handled as an operator:
// anything in the range '(' .. '+' (that is '(', ')', '*', '+'), or '-',
// or '/'.
bool InOPTR(char ch)
{
    const bool parenStarOrPlus = (ch >= '(' && ch <= '+');
    return parenStarOrPlus || ch == '-' || ch == '/';
}
// Maps an operator character to its row/column in the precedence table,
// or -1 when the character is not one of + - * / ( ) #.
static int OperatorIndex(char op)
{
    switch (op)
    {
    case '+': return 0;
    case '-': return 1;
    case '*': return 2;
    case '/': return 3;
    case '(': return 4;
    case ')': return 5;
    case '#': return 6;
    default:  return -1;
    }
}

// Looks up the precedence relation between op1 (the operator on top of the
// stack) and op2 (the incoming operator).
//
// Returns '<', '=' or '>' as stored in the table. The table entry '#' marks
// a combination that has no relation; the same '#' is returned when either
// argument is not a known operator, instead of indexing the table with an
// uninitialised value.
char Precede(char op1, char op2)
{
    static const char pri[7][7]={ {'>','>','<','<','<','>','>'}
                                , {'>','>','<','<','<','>','>'}
                                , {'>','>','>','>','<','>','>'}
                                , {'>','>','>','>','<','>','>'}
                                , {'<','<','<','<','<','=','#'}
                                , {'>','>','>','>','#','>','>'}
                                , {'<','<','<','<','<','#','='} };
    const int m = OperatorIndex(op1);
    const int n = OperatorIndex(op2);
    if (m < 0 || n < 0)
        return '#';
    return pri[m][n];
}
// Applies the binary operator op to a and b and returns the result.
//
// Supports + - * /. Any other operator throws the string literal "error"
// (the same convention SeqStack uses) instead of returning an uninitialised
// value.
double Operate(double a, char op, double b)
{
    switch(op)
    {
    case '+': return a+b;
    case '-': return a-b;
    case '*': return a*b;
    case '/': return a/b;
    default:  throw "error";
    }
}
// Evaluates one expression from standard input and prints its value.
int main()
{
    const double value = Expression_Eval();
    cout << value << endl;
    return 0;
}
Problem seem to be that '#' is considered a number, but it should be considered an operation:
Use:
// Tells whether ch is one of the characters handled as an operator:
// anything in the range '(' .. '+' (that is '(', ')', '*', '+'), or '-',
// '/', or the end marker '#'.
bool InOPTR(char ch) {
    const bool parenStarOrPlus = (ch >= '(' && ch <= '+');
    return parenStarOrPlus || ch == '-' || ch == '/' || ch == '#';
}
Note that '#' is ASCII 35, which is not covered by the range '(' to '+' [40-43].
Hope this helps.
You need to consume carriage return or newline character after getchar(); which comes into play when you press enter button.
One trick is as below.
ch=getchar();
getchar(); //this getchar to consume CR.
since you have used ch = getchar() many times you have to use above solution at many places.
Better solution to this problem will be to enter string instead of entering single character using getchar()...
Hope you got what I am trying to say...

regex (in flex) for complete general sentence

I am defining tokens inside flex as
%%
#[^\\\" \n\(\),=\{\}#~]+ {yylval.sval = strdup(yytext + 1); return ENTRYTYPE;}
[A-Za-z][A-Za-z0-9:"]* { yylval.sval = strdup(yytext); return KEY; }
\"([^"]|\\.)*\"|\{([^"]|\\.)*\} { yylval.sval = strdup(yytext); return VALUE; }
[ \t\n] ; /* ignore whitespace */
[{}=,] { return *yytext; }
. { fprintf(stderr, "Unrecognized character %c in input\n", *yytext); }
%%
(Though, not a good way)
The problem is the VALUE variable are doing fine for a quoted string, of the form "some quote"; but not for the form when they are enclosed by braces (of the form {some sentences}) as tried.
What is messy there?
I think that you want this, instead:
\"([^"]|\\.)*\"|\{([^\}]|\\.)*\} { yylval.sval = strdup(yytext); return VALUE; }
Even better, the following will be clearer and easier to maintain:
\"([^"]|\\.)*\" { yylval.sval = strdup(yytext); return VALUE; }
\{([^\}]|\\.)*\} { yylval.sval = strdup(yytext); return VALUE; }
Update
I have escaped the right brace in the character class expressions.

A recursive parser help

I'm supposed to write a C++ program that accepts 01 repeated one or more times, for example 01010101, but not 0, 1, 10, 011 or 110. Can someone help me figure out what I need to do to fix the problem? Sorry guys, the code isn't working right. I pushed ctrl+k and posted the code, but everything wasn't in place.
What I was trying to do is this: when someone enters 1 it prints invalid; if they enter 0 it prints invalid; if they enter 10 it prints invalid; if they enter 01 it prints valid; if they enter 0101 it prints valid. So 0 always has to come first and must always be followed by 1. Another example: 0101010101 prints valid.
Thanks Seth :). I removed the links
[From seth.arnold: I removed commented-out code and indented the code to follow some kind of logical pattern. Feel free to replace this with your code if you wish, indent each line by four spaces to properly format it.]
#include <iostream>
#include<stdlib.h> // for the exit(1) function
using namespace std;
char text[300];
char ToBeChecked;
char lexical(); //identify the characters
void SProd();
void BProd();
// Reads one word from standard input, runs the parser over it and reports
// whether the whole word was consumed.
int main(){
    cout<<"Enter some strings only 1 and 0 (max. 299 characters)"<<endl;
    // Bound the extraction to the buffer: with a width set, operator>> stores
    // at most width-1 characters plus the terminating NUL.
    cin.width(sizeof(text));
    cin>>text;
    ToBeChecked = lexical(); //identify the character; find the first letter and give it to ToBeChecked
    SProd();
    // The input is accepted only when parsing stopped at the end of the text.
    if(ToBeChecked == '\0')
        cout<<"Valid"<<endl;
    else
        cout<<"Invalid"<<endl;
    cin.get();
    return 0;
}
// Returns the next character of text. The position is kept in a function-
// local static that starts at -1, so the first call yields text[0] and each
// later call yields the following character.
char lexical(){
    static int position = -1;
    return text[++position];
}
// S -> '0' B
// Expects the current character to be '0', consumes it and continues with
// BProd. Prints "Invalid" and exits the program on any other character.
void SProd(){
    if(ToBeChecked != '0'){
        cout<<"Invalid"<<endl;
        exit(1);
    }
    ToBeChecked = lexical();    // consume the '0' before handing over to B
    BProd();
}

// B -> '1' S | '1'
// Expects the current character to be '1' and consumes it. If another '0'
// follows, a further "01" pair is parsed; otherwise the function returns and
// leaves the current character for the caller to check against the end of
// the input. Prints "Invalid" and exits the program when '1' is missing.
void BProd(){
    if(ToBeChecked != '1'){
        cout<<"Invalid"<<endl;
        exit(1);
    }
    ToBeChecked = lexical();    // consume the '1'
    if(ToBeChecked == '0')
        SProd();
}
Have a look at Bjarne Stroustrup's book Programming: Principles and Practice Using C++, chapters 6-7.
You will have to write the grammar, you need to know how to:
Distinguish a rule from a token
Put one rule after another (sequencing)
Express alternative patterns (alternation)
Express a repeating pattern
(repetition)
Recognize the grammar rule to start
with
For example - you will have to have a token class:
// One lexical unit: a kind tag plus, for numbers, the numeric value.
class Token {
public:
    char kind;      // which kind of token this is
    double value;   // the number, when the token is a number; 0 otherwise

    // Token that carries only a kind; its value is 0.
    Token(char ch) : Token(ch, 0) { }

    // Token that carries both a kind and a value.
    Token(char ch, double val) : kind(ch), value(val) { }
};
Then Token stream class:
// Source of Tokens with room to hold one Token that was handed back.
class Token_stream {
public:
Token_stream(); // creates a stream whose put-back slot is empty
Token get(); // returns the next Token (defined outside the class)
void putback(Token t); // stores t so that the next get() returns it
private:
bool full; // true while buffer holds a put-back Token
Token buffer; // the Token stored by putback(); meaningful only while full is true
};
Identify default constructor for Token stream:
// Creates a stream with an empty put-back slot.
Token_stream::Token_stream()
    : full(false),  // nothing has been put back yet
      buffer(0)     // placeholder Token; not read while full is false
{
}
Then create a putback() function, you will need that to put back the character you read from the iostream if it is of interest to you, and function who specializes in extraction of that particular character will be called:
// Stores t so that the next get() returns it.
// Throws std::runtime_error when a Token is already stored: the slot holds
// one Token only.
void Token_stream::putback(Token t)
{
    if (!full) {
        buffer = t;     // remember the Token
        full = true;    // mark the slot as occupied
        return;
    }
    throw std::runtime_error("putback() into a full buffer");
}
Then in Token_stream::get() you will have to define the rules for what is important to you and what you want to include, omit or report as an error:
// Returns the next Token.
//
// A Token stored by putback() is returned first. Otherwise one character is
// read from cin (leading whitespace is skipped by >>):
//   - operator, bracket and command characters become a Token of that kind;
//   - a digit or '.' starts a floating-point number, returned as kind '8';
//   - anything else throws std::runtime_error("Bad token").
Token Token_stream::get()
{
    if (full) { // do we already have a Token ready?
        // remove token from buffer
        full = false;
        return buffer;
    }

    // Initialised so that a failed read (end of input, for example) reaches
    // the default case below instead of switching on an indeterminate value.
    char ch = 0;
    cin >> ch; // note that >> skips whitespace (space, newline, tab, etc.)

    switch (ch) {
    case '=': // for "print"
    case 'x': // for "quit"
    case '(': case ')': case '{': case '}': case '+': case '-': case '*': case '/': case '!':
        return Token(ch); // let each character represent itself
    case '.':
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9': // '8' was missing from this list
    {
        cin.putback(ch); // put the first character back so the number is read whole
        double val = 0;
        cin >> val; // read a floating-point number
        return Token('8', val); // let '8' represent "a number"
    }
    default:
        throw std::runtime_error("Bad token");
    }
}
In this version of Token_stream::get() we are interested in numbers, mathematical operators and brackets. So you will have to change that case statement to get either '1' or '0', and ignore everything else, or throw, it is up to you, I don't know what is exactly you need to do.
Then create a grammar function. You will have to establish a hierarchy of functions that call one another if you want, for example, one character to be processed before another. But if you only need to read sequentially, you can have only one function. Anyway, I include three functions from the calculator example, where you have +,-,*,/,(,),{,}. As you see, this example needs to identify what each token is in order to call the right function before the other one, e.g. multiplication before subtraction.
primary() function deals with numbers and parentheses:
// Parses a primary: a number, or an expression enclosed in ( ) or { }.
// Returns its value. Throws std::runtime_error when the closing bracket is
// missing or when the next Token cannot start a primary.
double primary()
{
    Token t = ts.get();

    // '8' is the kind used for numbers.
    if (t.kind == '8')
        return t.value;

    if (t.kind != '(' && t.kind != '{')
        throw std::runtime_error("primary expected");

    // Bracketed expression: evaluate the inside, then demand the matching
    // closing bracket.
    const bool round = (t.kind == '(');
    double d = expression();
    t = ts.get();
    if (round) {
        if (t.kind != ')') throw std::runtime_error("')' expected");
    } else {
        if (t.kind != '}') throw std::runtime_error("'}' expected");
    }
    return d;
}
term() function deals with multiplication and division:
// deal with *, /, and %
double term()
{
double left = primary();
Token t = ts.get(); // get the next token from token stream
while(true) {
switch (t.kind) {
case '*':
left *= primary();
t = ts.get();
break;
case '/':
{
double d = primary();
if (d == 0) throw std::runtime_error("divide by zero");
left /= d;
t = ts.get();
break;
}
default:
ts.putback(t); // put t back into the token stream
return left;
}
}
}
expression() deals with addition and subtraction:
double expression()
{
double left = term(); // read and evaluate a Term
Token t = ts.get(); // get the next token from token stream
while(true) {
switch(t.kind) {
case '+':
left += term(); // evaluate Term and add
t = ts.get();
break;
case '-':
left -= term(); // evaluate Term and subtract
t = ts.get();
break;
default:
ts.putback(t); // put t back into the token stream
return left; // finally: no more + or -: return the answer
}
}
}
And finally our calling function:
// Runs the calculator loop on standard input.
//
// Returns 0 after a normal exit, 1 when an exception derived from exception
// was caught, and 2 for any other exception.
int callDrill_01(void)
try
{
    std::cout << "Welcome to simple calculator." << std::endl;
    std::cout << "Please enter expressions using floating-point numbers." << std::endl;
    std::cout << "The arithmetic operators available are: + - * / ( ) { } = e(x)it." << std::endl;
    double val = 0;
    while (cin) {
        Token t = ts.get();
        if (t.kind == 'x') break;   // 'x' for quit
        if (t.kind == '=') {        // '=' for "print now"
            cout << "=" << val << '\n';
        }else{
            ts.putback(t);
        }
        val = expression();
    }
    keep_window_open();
    return 0;   // normal exit: the function must not fall off its end
}
catch (exception& e) {
    cerr << "error: " << e.what() << '\n';
    keep_window_open();
    return 1;
}
catch (...) {
    cerr << "Oops: unknown exception!\n";
    keep_window_open();
    return 2;
}
This should give you an idea of how recursive parser are created. Your head is probably spinning. I suggest you to find that book that I mentioned and read those chapters. It will help you in the future.
#include <iostream>
//#include<stdlib.h> // for the exit(1) function
using namespace std;
char text[300];
char ToBeChecked;
char lexical(); //identify the characters
void SProd();
void BProd();
// Reads one word from standard input, runs the parser over it and reports
// whether the whole word was consumed.
int main(){
    cout<<"Enter some strings (max. 299 characters)"<<endl;
    // Bound the extraction to the buffer: with a width set, operator>> stores
    // at most width-1 characters plus the terminating NUL.
    cin.width(sizeof(text));
    cin>>text;
    ToBeChecked = lexical(); //identify the character; find the first letter and give it to ToBeChecked
    SProd();
    // The input is accepted only when parsing stopped at the end of the text.
    if(ToBeChecked == '\0')
        cout<<"Valid"<<endl;
    else
        cout<<"Invalid"<<endl;
    cin.get();
    return 0;
}
// Returns the next character of text. The position is kept in a function-
// local static that starts at -1, so the first call yields text[0] and each
// later call yields the following character.
char lexical(){
    static int position = -1;
    return text[++position];
}
// Does nothing when the current character is 'a'. For any other character
// it runs BProd and then advances to the next character.
void SProd(){
    if(ToBeChecked == 'a')
        return;
    BProd();
    ToBeChecked = lexical();
}
// Does nothing unless the current character is 'b'. On 'b' it advances to
// the next character and then runs SProd.
void BProd(){
    if(ToBeChecked != 'b')
        return;
    ToBeChecked = lexical();
    SProd();
}