Regular expression for string excluding literal quotation marks - regex

I have the following config file that I am trying to parse.
[ main ]
e_type=0x1B
username="username"
appname="applicationname"
In the lex file (test.l) shown below, the regular expression for STR is \"[^\"]*\", so it matches everything between a pair of double quotes, including the quotes themselves. When I access the value of "username" or "applicationname" in the parser file through the $N variable, it contains the literal string with its quotation marks. I just want
username and applicationname, i.e. without the surrounding quotation marks.
Is there a standard way to achieve this?
I have the following lex file (test.l)
%option noyywrap
%option yylineno
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "y.tab.h"
int yylinenu = 1;
int yycolno=1;
/**
* Forward declerations
**/
void Number ();
void HexaNumber ();
unsigned char getHexaLex (char c);
unsigned int strtol16 (char * str);
%}
%option nounput
%option noinput
%option case-insensitive
/*-----------------------------------------------------------------
Some macros (standard regular expressions)
------------------------------------------------------------------*/
DIGIT [0-9]
HEXALETTER [a-fA-F]
HEXANUMBER [0][x](({DIGIT}|{HEXALETTER})+)
NUM {DIGIT}+
HEXA ({DIGIT}|{HEXALETTER}|[*])
STR \"[^\"]*\"
WSPACE [ \t]*
NEWLINE [\n\r]
/*----------------------------------------------------------------
The lexer rules
------------------------------------------------------------------*/
%%
e_type { yylval.str = yytext; return T_E_TYPE; }
main { yylval.str = yytext; return T_MAIN_SECTION;}
{HEXANUMBER} { yylval.n = atoi(yytext); HexaNumber(); return T_NUMBER; }
= { return T_EQUAL; }
"[" { return T_OPEN_BRACKET; }
"]" { return T_CLOSE_BRACKET;}
appname { Custom_tag(); return T_APPNAME; }
username { Custom_tag(); return T_APPNAME; }
[^\t\n\r] { }
{WSPACE} { } /* whitespace: (do nothing) */
{NEWLINE} {
yylinenu++;
return T_EOL;
}
{STR} { Generic_string(); return T_STRING;}
%%
/* Parse the text of the current token as a decimal number and store the
 * result in the semantic value's numeric field. */
void Number ()
{
    const long parsed = atol(yytext);
    yylval.n = parsed;
}
/* Store a heap-allocated copy of the whole matched token (exactly as it
 * appears in yytext, terminating NUL included) in yylval.str. */
void Generic_string()
{
    const size_t bytes = strlen(yytext) + 1;   /* text plus terminating NUL */
    yylval.str = malloc(bytes);
    memcpy(yylval.str, yytext, bytes);
}

You have a pointer to the matched token (yytext) and its length (yyleng), so it is quite simple to remove the quotes:
/* Store a heap-allocated copy of the matched string token, without its
 * surrounding double quotes, in yylval.str.  The STR pattern always matches
 * an opening and a closing quote, so yyleng is at least 2 here.
 * If memory cannot be allocated, yylval.str is left NULL. */
void Generic_string() {
    size_t content_len = (size_t)yyleng - 2;       /* token length minus both quotes */
    yylval.str = malloc(content_len + 1);          /* +1 for the terminating NUL */
    if (yylval.str == NULL)
        return;                                    /* nothing to copy into */
    memcpy(yylval.str, yytext + 1, content_len);   /* skip the opening quote */
    yylval.str[content_len] = '\0';
}
Personally, I'd suggest avoiding use of global variables in Generic_string, both to simplify future implementation of a reentrant scanner, and to make the process a bit more flexible:
{STR} { yylval.str = duplicate_segment(yytext + 1, yyleng - 2);
return T_STRING;
}
/* ... */
/* Return a newly allocated, NUL-terminated copy of the first token_length
 * bytes of token.  Returns NULL if token is NULL, token_length is negative,
 * or memory cannot be allocated.  The caller must free() the result. */
char* duplicate_segment(const char* token, int token_length) {
    char* copy;
    if (token == NULL || token_length < 0)
        return NULL;
    /* The cast keeps this valid whether the scanner is built as C or as C++. */
    copy = (char*)malloc((size_t)token_length + 1);
    if (copy == NULL)
        return NULL;   /* report the failure instead of writing through NULL */
    memcpy(copy, token, (size_t)token_length);
    copy[token_length] = '\0';
    return copy;
}

Related

A function uses a different amount of memory when I call it with the same conditions the second time

I have a problem with C++ and memory. Here's the pseudocode:
main.cpp
#include <iostream>
#include "seq.h"
// Runs SnpSite::test() twice on the same input file so that the memory use
// of the first and the second call can be compared.
int main(int argc, char *argv[]) {
    SnpSite snp_site("/mnt/c/Users/manht/Downloads/s_typhi_wong_holt.aln.gz");
    for (int run = 0; run < 2; ++run) {
        snp_site.test();
    }
    return 0;
}
seq.h
#include "file_handler.h"
#include <stdio.h>
// Reads samples from one sequence file through a FileHandler.
class SnpSite {
private:
string inputfile; // path passed to the constructor; opened by test()
FileHandler fh; // reader that test() opens, reads from and closes
public:
SnpSite(char* _inputfile); // stores _inputfile as the path to read
int is_unknown(char base); // NOTE(review): no definition is visible in this excerpt
void test(); // opens inputfile, reads one sample name and sequence, closes it
};
seq.cpp
#include "seq.h"
// Remember the path of the input file; nothing is opened until test().
// A null _inputfile is stored as an empty path.
SnpSite::SnpSite(char* _inputfile)
    : inputfile(_inputfile ? _inputfile : ""),  // std::string cannot be built from a null pointer
      fh()                                      // constructed once, not default-constructed and then overwritten by a copy
{
}
void SnpSite::test() {
string sample_name, seq;
this->fh.open(this->inputfile.c_str());
this->fh.assign_next_sample_to(&sample_name, &seq);
this->fh.close();
}
file_handler.h
#ifndef SEQ_H_
#include <zlib.h>
#include <utility>
#include <ctype.h>
#include "my_string.h"
#include <string>
using namespace std;
#define SEQ_H_
typedef bool (*match_func)(int c, int delimiter);
// Buffered reader over a file opened with zlib's gzopen(): bytes are pulled
// from the file in chunks of up to 2048 and handed out through a cursor
// into `buffer`.
class FileHandler {
private:
gzFile file; // handle assigned by open() from gzopen()
char buffer[2048]; // Static allocation for better performance.
int buffer_start, buffer_end; // cursor index into buffer / value returned by the last gzread()
bool eof; // set once gzread() returns 0
void get_until(int delimiter, string *s); // appends bytes to *s up to a delimiter
public:
FileHandler();
FileHandler(int _buffer_size); // NOTE(review): declared, but no definition appears in this excerpt
void open(const char* filename);
void close();
void assign_next_sample_to(string *name, string *seq); // reads the next sample name and sequence
int next_char(); // advances the cursor and returns the byte under it
bool is_eof();
};
#endif
file_handler.cpp
#include "file_handler.h"
// Start with no file attached, an all-zero buffer and an empty read window,
// so that every member has a defined value before open() is called.
FileHandler::FileHandler()
    : file(),            // null handle until open()
      buffer(),          // zero-filled
      buffer_start(-1),
      buffer_end(-1),    // start >= end forces a refill on the first read
      eof(false)
{
}
void FileHandler::open(const char* filename) {
file = gzopen(filename, "r");
eof = false;
}
// Release the zlib handle and forget it, so that a repeated close() or a read
// after close() hands zlib a null handle (which it rejects with an error code)
// instead of a dangling pointer.
void FileHandler::close() {
    gzclose(file);
    file = NULL;
}
int FileHandler::next_char() {
    /*
    Advance the cursor (buffer_start) by one and return the character there,
    refilling the buffer from the file when the cursor would move past the
    last valid byte.  When nothing more can be read (end of file or read
    error) `eof` is set and -1 is returned.
    */
    if (buffer_start + 1 >= buffer_end) {   // the next index would be outside the valid data
        buffer_end = gzread(file, buffer, 2048);
        buffer_start = -1;
        if (buffer_end <= 0) {              // 0 = end of file, -1 = read error
            eof = true;
            buffer_start = 0;               // same cursor position the unconditional ++ used to leave
            return -1;                      // never hand out a stale buffer byte
        }
    }
    return buffer[++buffer_start];
}
bool FileHandler::is_eof() {
// True once a read has returned no data (the flag is set in next_char() and get_until()).
return eof;
}
#define SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define SEP_TAB 1 // isspace() && !' '
#define SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define SEP_MAX 2
// list of function to compare c and delimiter, need exactly 2 arguments.
// Delimiter predicate for SEP_SPACE: true for any isspace() character.
// `delimter` is unused; it exists so that all predicates share one signature.
bool match_space(int c, int delimter) {
    // Bytes read from a plain-char buffer can be negative, and passing a
    // negative value (other than EOF) to isspace() is undefined behaviour.
    return isspace(static_cast<unsigned char>(c)) != 0;
}
// Delimiter predicate for SEP_TAB: true for any isspace() character except
// the plain space.  `delimter` is unused.
bool match_tab(int c, int delimter) {
    // Convert through unsigned char first: a negative value other than EOF
    // is undefined behaviour for isspace().
    return c != ' ' && isspace(static_cast<unsigned char>(c)) != 0;
}
// Delimiter predicate for SEP_LINE: true only for the line-feed character
// (a preceding '\r' is not treated as part of the separator).  `delimter`
// is unused.
bool match_newline(int c, int delimter) {
    const bool is_line_feed = ('\n' == c);
    return is_line_feed;
}
// Delimiter predicate for a literal delimiter: true when the character is
// exactly the delimiter value.
bool match_char(int c, int delimter) {
    const bool same = (delimter == c);
    return same;
}
// Delimiter predicate that rejects every character; both arguments are ignored.
bool no_match(int c, int delimiter) {
    const bool matched = false;
    return matched;
}
// end list.
void FileHandler::get_until(int delimiter, string *s) {
    /*
    Append bytes to *s, starting at buffer[buffer_start], until a byte that
    matches `delimiter` is found or the input ends.  `delimiter` is SEP_SPACE,
    SEP_TAB or SEP_LINE, or any value above SEP_MAX, which is compared as a
    literal character.  On return buffer_start is the index of the matching
    byte, which is not consumed; if the input ends first, `eof` is set.
    */
    match_func match; // predicate that decides whether a byte ends the field
    switch (delimiter) {
        case SEP_SPACE:
            match = match_space;
            break;
        case SEP_TAB:
            match = match_tab;
            break;
        case SEP_LINE:
            match = match_newline;
            break;
        default:
            // Values above SEP_MAX are literal characters; anything else
            // never matches, so reading continues to the end of the input.
            match = (delimiter > SEP_MAX) ? match_char : no_match;
    }
    if (buffer_start < 0) buffer_start = 0; // never index in front of the buffer
    for (;;) {
        if (buffer_start >= buffer_end) {
            // Window exhausted: refill before looking at any byte, so stale
            // data beyond the valid range is never examined.
            buffer_end = gzread(file, buffer, 2048);
            buffer_start = 0;
            if (buffer_end <= 0) { // 0 = end of file, -1 = read error
                eof = true;
                break;
            }
        }
        int i = buffer_start;
        // Bounds check first, then the predicate.
        while (i < buffer_end && !match(buffer[i], delimiter)) i++;
        s->append(buffer + buffer_start, i - buffer_start);
        buffer_start = i;
        if (i < buffer_end) break; // stopped on a delimiter, not on the end of the window
    }
}
/*
Read the next sample from the file: its name goes to *name and its sequence
to *seq (both are emptied first).
(Note: this function does not read the quality scores of a QUAL file.)
*/
void FileHandler::assign_next_sample_to(string *name, string *seq) {
/* Discard whatever the caller's strings held before. */
name->erase();
seq->erase();
int c;
// Skip characters until a record marker ('>' or '#') or the end of input.
while (!eof && (c = next_char()) != '>' && c != '#') {} // read until meet sample name
// The cursor is still on the marker, so as written the marker character is
// the first character appended to *name; reading stops at the first whitespace.
get_until(SEP_SPACE, name); // get sample name
// Collect sequence lines until the next marker ('>', '#' or '+') or the end of input.
while (!eof && (c = next_char()) != '>' && c != '#' && c != '+') {
if (c == '\n') continue; // line break between sequence lines: nothing to append
get_until(SEP_LINE, seq); // read sequence
}
// next_char() pre-increments the cursor, so stepping back one position makes
// the character that ended the loop be returned again by the next call.
buffer_start--; // step back to the end of sequence
}
I don't use any dynamic allocation, and when I traced memory usage by PID in htop, I found something that I can't explain:
The first time I call test():
At the beginning of the function, my process uses 6168 KBytes.
At the end of the function, my process uses 13998 Kbytes.
The second time I call test():
At the beginning of the function, my process uses 6304 Kbytes.
At the end of the function, my process uses 21664 Kbytes.
The length of the seq variable is 4809037 and sample_name is 11 in both cases. I don't understand why memory usage is so different between them. Hope someone can find out and explain it to me, it helps me a lot. Thanks
This happens because of this line:
s->append((char*)(buffer + buffer_start), i - buffer_start);
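std::string stores its characters in dynamically allocated memory, so this code does use dynamic allocation even though it never calls new or malloc itself. Every time an append exceeds the string's current capacity, a new, larger memory block is allocated and the existing contents are copied into it; how much memory the process holds while a 4.8 MB sequence is built this way depends on how the allocator hands out and recycles those growing blocks, which need not be the same on every call. You can read more about this here: Chapter 4. Optimize String Use: A Case Study.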

How to restart start state data in flex

I've created a start condition (for strings) in flex, and a single parse works fine. However,
when I parse the same input a second time, the elements matched through the start condition (the quoted strings) vanish from the result.
How can I solve this?
Please help me.
flex file
%option stack noyywrap
%{
extern int lineNumber; // definie dans prog.y, utilise par notre code pour \n
#include "h5parse.hpp"
#include <iostream>
#include <fstream>
using namespace std;
extern string initialdata;
extern string pdata;
extern bool loop;
string val;
string compile(string content);
string compilefile(string path);
void runwithargs(int argc ,char ** argv);
int saveoutput(string compileddata ,string outputpath="");
%}
%x strenv
i_command #include
e_command #extends
l_command #layout
f_command #field
command {i_command}|{e_command}|{l_command}|{f_command}
%%
"\"" { /* opening quote: start collecting a string */ val.clear(); BEGIN(strenv); }
<strenv>"\"" { /* closing quote: pass the collected text on, truncated to the size of yylval.str */ BEGIN(INITIAL); snprintf(yylval.str, sizeof yylval.str, "%s", val.c_str()); return(STRING); }
<strenv><<EOF>> { /* unterminated string at end of input: return what was collected */ BEGIN(INITIAL); snprintf(yylval.str, sizeof yylval.str, "%s", val.c_str()); return(STRING); }
<strenv>.|\n { /* any character, a newline included, belongs to the string */ val+=yytext[0]; }
{command} { snprintf(yylval.str, sizeof yylval.str, "%s", yytext); return (COMMAND); }
"(" { return LPAREN; }
")" { return RPAREN; }
"{" { return LBRACE; }
"}" { return RBRACE; }
.|\n { yylval.c=yytext[0]; return TXT; }
%%
//our main function
// Entry point: process the command line when arguments were given, then run
// the shell command "pause" before exiting.
int main(int argc,char ** argv)
{
    const bool has_arguments = argc > 1;
    if (has_arguments)
    {
        runwithargs(argc, argv);
    }
    system("pause");
    return 0;
}
//run h5A by using arguments
// Handles the only supported invocation, "program <file>": the file is
// compiled and the result saved under the default output path.  Any other
// argument count does nothing.
void runwithargs(int argc ,char ** argv)
{
    if (argc != 2)
    {
        return;
    }
    saveoutput(compilefile(argv[1]));
}
//assemble a string
// Run the parser over `content` and return the text it produced.  The output
// of one pass becomes the input of the next for as long as the parser reports
// (through `loop`) that it expanded a command.
string compile(string content)
{
    do
    {
        loop=false;
        pdata.clear();
        YY_BUFFER_STATE pass_buffer = yy_scan_string(content.c_str());
        yyparse();
        // yy_scan_string() allocates a new scan buffer on every call; release
        // it once the pass is finished so repeated passes do not leak.
        yy_delete_buffer(pass_buffer);
        content=pdata;
    }while(loop);
    return content;
}
//assemble file
// Read the whole file at `path` and return the result of compile() on its
// contents.  Returns an empty string if the file cannot be opened.
string compilefile(string path)
{
    // Opened positioned at the end so that tellg() yields the file size.
    ifstream inputfile(path.c_str(),ios::in|ios::binary|ios::ate);
    if(!inputfile)
    {
        cout<<"cannot open file : "<<path<<endl;
        return string();
    }
    const streamoff length = inputfile.tellg();
    if(length<0)
    {
        cout<<"cannot read file : "<<path<<endl;
        return string();
    }
    // A std::string owns the bytes (nothing to delete) and carries its own
    // length, so no terminating NUL has to be present in the file data.
    string data(static_cast<string::size_type>(length), '\0');
    inputfile.seekg(0, std::ios::beg);
    if(length>0)
    {
        inputfile.read(&data[0], length);
        data.resize(static_cast<string::size_type>(inputfile.gcount())); // keep only what was really read
    }
    inputfile.close();
    cout<<"start assembly : "<<path<<endl;
    return compile(data);
}
//save assembled file to a specified path
// Write `compileddata` to `outputpath` (to "output" when the path is empty)
// and echo it on the console.  Returns 0 on success and 1 if the file cannot
// be opened or written.
int saveoutput(string compileddata ,string outputpath)
{
    if(outputpath.empty())
    {
        outputpath="output";
    }
    // Open the path that is reported below, not a fixed file name.
    ofstream outputfile(outputpath.c_str());
    if(!outputfile)
    {
        cout<<"cannot write output file : "<<outputpath<<endl;
        return 1;
    }
    outputfile<<compileddata;
    outputfile.close();
    if(!outputfile)
    {
        cout<<"cannot write output file : "<<outputpath<<endl;
        return 1;
    }
    cout<<compileddata<<endl;
    cout<<"operation terminated successfuly , output at :"
    <<outputpath<<endl;
    return 0;
}
bison file
%{
#include <stdio.h>
#include <iostream>
#include<fstream>
#include<map>
using namespace std;
typedef void* yyscan_t;
int lineNumber; // notre compteur de lignes
map <string,string> clayouts;
void yyerror ( char const *msg);
typedef union YYSTYPE YYSTYPE;
void yyerror ( char const *msg);
int yylex();
bool loop;
string pdata="";
%}
/* token definition */
%token STRING
%token COMMAND
%token LPAREN RPAREN LBRACE RBRACE
%token TXT
%union { char c; char str [0Xfff]; double real; int integer; }
%type<c> TXT;
%type<str> STRING COMMAND;
%start program
%%
program:value | command_call |txt | program program ;
value: STRING {pdata+='\"'+$1+'\"'; };
command_call : COMMAND LPAREN STRING RPAREN {
    /* The first symbol is the command name as scanned (for example
       "#include"); the third is the text of its quoted argument. */
    const string command($1);
    if(command=="#field")
    {
        cout<<"define field :"<<$3;
    }
    else if(command=="#include")
    {
        /* Append the whole file named by the argument to the output.  The
           bytes are read into a std::string sized from the file, so nothing
           is leaked and no terminating NUL is assumed; a file that cannot be
           opened contributes nothing. */
        ifstream t;
        t.open($3);
        if(t)
        {
            t.seekg(0, std::ios::end);
            const streamoff length = t.tellg();
            t.seekg(0, std::ios::beg);
            if(length>0)
            {
                string contents(static_cast<string::size_type>(length), '\0');
                t.read(&contents[0], length);
                contents.resize(static_cast<string::size_type>(t.gcount())); /* keep only what was really read */
                pdata+=contents;
            }
        }
    }
    else if (command=="#layout")
    {
        cout<<"define layout for field "<<$3;
    }
    else if (command=="#repeat")
    {
        cout<<"reapeat instruction"<<$3;
    }
    else
    {
        /* Any other command is only reported; its argument file is not read. */
        cout<<"extend with : "<<$3;
    }
    /* A command was consumed, so ask the driver for another pass. */
    loop=true;
};//LPAREN RPAREN ;
txt: TXT {pdata+=$1;};
%%
// Error callback for the generated parser: writes the message to standard
// output exactly as received, without adding a newline.
void yyerror (const char *msg)
{
    std::cout << msg;
}
This is the output.
Please help me understand why the strings disappear.
Here is the full code in my repository.
Thanks in advance.
Nothing here is disappearing and you're not parsing the same string twice.
The second parse is on a new string which you yourself created, consisting of data copied during the first parse. So they're different strings, and neither Flex nor Bison know about any relationship between them.
The reason that the second string does not contain the same data as the first string is simple: you didn't copy all of the data. Anything you don't copy "disappears".
In particular, your scanner only sends the data between double quotes to the parser. The parser attempts to add the double quotes, but it doesn't manage because the line:
pdata+='\"'+$1+'\"';
means
pdata += ('\"' + $1 + '\"');
Since character literals are integers and $1 is an array of characters, which decays to a character pointer, that is the same as:
pdata += &$1[68]; // '\"' is 34
which is really undefined behaviour unless $1 has at least 67 characters, but in practice will be an empty string because Bison zero initializes stack values. (You shouldn't depend on that, though.)
In short, the second time you parse, the double quoted strings are not present, something you could easily have noted by debugging your parser actions.
Honestly, I don't think this is an appropriate architecture for a macro preprocessor. In general, you should let Flex handle reading from a file; it's good at doing that. Also, the Flex manual illustrates a couple of ways to handle "include files", and macro expansions can be incorporated using a similar technique.
Moreover, using a semantic value which occupies 4kb is not a good way of managing memory. It can easily result in blowing up the parser stack. And constantly converting back and forth between std::string and C-style null-terminated arrays is also extremely inefficient.
But those are different questions.

G++ isn't recognizing my c++ classes when compiling my flex and bison code

I am writing a lexer and parser combo that is storing arm8 assembly code in a data structure for further testing. However, when compiling the code, the compiler doesn't recognize my imported classes as legitimate data types.
I have been following this guide: https://gnuu.org/2009/09/18/writing-your-own-toy-compiler/ to some extent. I tried changing the output settings of bison to produce a C++ file and it partially fixed the problem, but it opened a whole other can of worms I'm hoping to avoid. All of the guides I've looked at use C++ code in the process, and I don't really understand why it fails here.
assembly_bison.y:
%{
#include <cstdio>
#include <iostream>
#include <string>
#include "instructionds.h"
#include "AssemblyBlock.h"
using namespace std;
extern int yylex();
extern int yyparse();
extern FILE *yyin;
AssemblyBlock *assembly = new AssemblyBlock();
STP *input;
void yyerror(const char *s);
%}
%union {
long long imm;
std::string *string;
int token;
}
%token STP INSTRUCTION
%token STACKPOINTER "sp"
%token <imm> IMMEDIATE
%token <string> DIRECTIVE LABEL INLINELABEL
%token <token> REGISTER64 REGISTER32
%token <token> COMMA ","
%token <token> BANG "!"
%token <token> OPENBRACKET "["
%token <token> CLOSEBRACKET "]"
%%
document:
document line
| /* empty */
;
line:
LABEL
| DIRECTIVE {/* */}
| LABEL instruction
| instruction
;
instruction:
stp
;
stp:
STP REGISTER64 "," REGISTER64 "," "[" "sp" "," IMMEDIATE "]" "!"
{
input = new STP(true, true, $2, $4, -1, $9);
assembly->insert(input);
}
%%
// Parse the assembly source file with the generated parser.
// Returns -1 if the file cannot be opened, 0 otherwise.
int main(int, char**) {
    // Open a file handle to the input file:
    const char *input_path = "Hello_World_Assembly_Code.asm";
    FILE *myfile = fopen(input_path, "r");
    // Make sure it is valid, and name the file that actually failed:
    if (!myfile) {
        cout << "I can't open " << input_path << "!" << endl;
        return -1;
    }
    // Set Flex to read from it instead of defaulting to STDIN:
    yyin = myfile;
    // Parse through the input:
    yyparse();
    // yyerror() exits on a syntax error, so this point is reached only after
    // the parser has returned; release the file handle.
    fclose(myfile);
    return 0;
}
// Error callback for the generated parser: print the message and terminate
// the program with exit status -1.
void yyerror(const char *s) {
    std::cout << "EEK, parse error! Message: " << s << std::endl;
    // No recovery is attempted; stop right away.
    exit(-1);
}
assembly_lexer.l
%{
#include <cstdio>
#include <string>
#include "instructionds.h"
#include "AssemblyBlock.h"
#include "parser.hpp"
#define SAVE_TOKEN yylval.string = new std::string(yytext, yyleng)
#define TOKEN(t) (yylval.token = t)
%}
%option noyywrap
delim [ \t\n]
ws [delim+]
letter [A-Za-z]
digit [0-9]
id {letter}({letter}|{digit})*
alphanumeric [A-Za-z0-9]
%%
{delim} {/* no action and return */}
\.L[A-Z0-9]*: { SAVE_TOKEN; return LABEL; }
\.[a-z0-9_]+.* { SAVE_TOKEN; return DIRECTIVE; }
{alphanumeric}+\: { SAVE_TOKEN; return LABEL; }
stp { return STP; }
add { return INSTRUCTION; }
adrp { return INSTRUCTION; }
bl { return INSTRUCTION; }
mov { return INSTRUCTION; }
ldp { return INSTRUCTION; }
ret { return INSTRUCTION; }
sp { return STACKPOINTER; }
x{digit}+ { /* yytext is a char*, not a std::string: skip the 'x' and convert the digits; the register tokens carry the <token> member */ yylval.token = atoi(yytext + 1); return REGISTER64; }
w{digit}+ { /* same as above for the 'w' registers */ yylval.token = atoi(yytext + 1); return REGISTER32; }
, { return TOKEN(COMMA); }
\.L[A-Z0-9]* { /* INLINELABEL is declared with the <string> member in the grammar */ SAVE_TOKEN; return INLINELABEL; }
\[ { return TOKEN(OPENBRACKET); }
\] { return TOKEN(CLOSEBRACKET); }
:{id}: { }
#?[+-]?{digit}+ { /* skip an optional leading '#', then convert the signed number */ yylval.imm = strtoll(yytext + (yytext[0] == '#' ? 1 : 0), NULL, 10); return IMMEDIATE; }
{alphanumeric}+ { SAVE_TOKEN; return LABEL; }
! { return TOKEN(BANG); }
%%
instructionds.h:
#pragma once
// Base class of every parsed instruction.  Instructions are stored and
// handled through Instruction pointers, so the destructor is virtual:
// destroying a derived object through such a pointer is then well defined.
class Instruction {
public:
    virtual ~Instruction() {}
    // Prints the instruction; derived classes provide their own version.
    virtual void print();
};
// Instruction record for the `stp` instruction; the parser's stp rule creates
// one with `new STP(...)` and inserts it into the assembly block.
class STP : public Instruction{
private:
//Possible inputs
int Rn1; // presumably the first register number (constructor argument n1) — TODO confirm in instructionds.cpp
int Rn2; // presumably the second register number (constructor argument n2) — TODO confirm
int Xn; // NOTE(review): the parser passes -1 here; meaning not visible in this excerpt
bool SP; // NOTE(review): not a constructor argument; how it is set is not visible here
long long immediate; // presumably the immediate operand (the parser passes the IMMEDIATE value) — TODO confirm
//Instruction Modes
bool is64;
bool isPreindex;
public:
// Argument order as used by the parser: (is64, isPreindex, n1, n2, Xn, immediate).
STP(bool is64, bool isPreindex, int n1, int n2, int Xn, long long immediate);
void print(); // same signature as the virtual Instruction::print()
};
AssemblyBlock.h:
#pragma once
#include "instructionds.h"
// Node of a doubly linked list of instructions (AssemblyBlock::head points to one).
struct InstStruct {
Instruction* line; // instruction held by this node
struct InstStruct *prev; // previous node in the list
struct InstStruct *next; // next node in the list
};
// Collection of instructions kept as a linked list of InstStruct nodes.
// The member functions are defined in a .cpp file that is not part of this excerpt.
class AssemblyBlock {
private:
struct InstStruct *head; // entry point into the node list
public:
AssemblyBlock();
void insert(Instruction *inst); // called by the parser for every instruction it builds
void display(); // NOTE(review): presumably prints the stored instructions — confirm in AssemblyBlock.cpp
};
I can add the .cpp files for the classes if necessary later.
When I compile the code using the following commands, I get these errors. The compiler doesn't seem to read the headers. I used a test file to make sure that the classes I built work outside of bison and everything worked perfectly. If anyone knows more about this, I really appreciate your help.
mattersonline@mattersonline-VirtualBox:~/Documents/flex/soonergy$ bison -d -o parser.cpp assembly_bison.y
assembly_bison.y: warning: 1 shift/reduce conflict [-Wconflicts-sr]
mattersonline@mattersonline-VirtualBox:~/Documents/flex/soonergy$ flex -o tokens.cpp assembly_lexer.l
mattersonline@mattersonline-VirtualBox:~/Documents/flex/soonergy$ g++ -o parser parser.cpp tokens.cpp AssemblyBlock.cpp instructionds.cpp
assembly_bison.y:15:5: error: ‘STP’ does not name a type
STP *input;
^~~
assembly_bison.y: In function ‘int yyparse()’:
assembly_bison.y:56:13: error: ‘input’ was not declared in this scope
input = new STP(true, true, $2, $4, -1, $9);
^~~~~
assembly_bison.y:56:13: note: suggested alternative: ‘ino_t’
input = new STP(true, true, $2, $4, -1, $9);
^~~~~
ino_t
assembly_bison.y:56:25: error: expected type-specifier before ‘STP’
input = new STP(true, true, $2, $4, -1, $9);
^~~
};
The issue seems to be that STP is both the name of a type and the name of a token in the grammar. Renaming your type or token to something else should fix this.
Hope this helps!

Why does this scanner not eat whitespaces?

These are my lexer definitions; there are many lexer definitions, but these are mine. I have several regexes that try to capture and ignore whitespace in the sample below. The error I get is that at line 1:14 there is a $undefined symbol with ASCII value 32,
also known as a space.
OIL_VERSION = "314159OS";
CPU AT91SAM7S256
{
//Test Coment
OS HTOSEK
{
STATUS = EXTENDED;
STARTUPHOOK = TRUE;
ERRORHOOK = FALSE;
SHUTDOWNHOOK = FALSE;
PRETASKHOOK = FALSE;
POSTTASKHOOK = FALSE;
USEGETSERVICEID = FALSE;
USEPARAMETERACCESS = FALSE;
USERESSCHEDULER = FALSE;
USR_STACK_SIZE=3000;
};
/* Definition of application mode */
APPMODE appmode1{};
/* Definition of resource */
RESOURCE resource1
{
RESOURCEPROPERTY = STANDARD;
};
/* Definition of event */
EVENT event1
{
MASK = AUTO;
};
...
Lex Capture Definitions:
%{ /*** C/C++ Declarations ***/
#define MAX_INCLUDE_DEPTH 16
#include <string>
#include <sstream>
#define SSTR( x ) dynamic_cast< std::ostringstream & >( \
( std::ostringstream() << std::dec << x ) ).str()
#include "scanner.h"
/* import the parser's token type into a local typedef */
typedef implementation::Parser::token token;
typedef implementation::Parser::token_type token_type;
/* By default yylex returns int, we use token_type. Unfortunately yyterminate
* by default returns 0, which is not of token_type. */
#define yyterminate() return token::END
/* This disables inclusion of unistd.h, which is not available under Visual C++
* on Win32. The C++ scanner uses STL streams instead. */
#define YY_NO_UNISTD_H
static int once = 0;
static int lineno = 1;
// Advance the file-local line counter by one.
static void nextLine()
{
    lineno += 1;
}
// Convert a decimal string with at most one leading '+' or '-' to an int.
// The sign is stripped here and the remainder is handed to atoi(); a leading
// '-' negates whatever atoi() returns for the rest.
int fromInt(char *s)
{
    const bool negative = (s[0] == '-');
    const bool has_sign = negative || (s[0] == '+');
    const int magnitude = atoi(has_sign ? s + 1 : s);
    return negative ? -magnitude : magnitude;
}
// Convert a hexadecimal string (an optional "0x" prefix is accepted by
// strtol with base 16) to an int.
int fromHex(char *s)
{
    const long parsed = strtol(s, NULL, 16);
    return static_cast<int>(parsed);
}
int LineCounter=0;
%}
/*** Flex Declarations and Options ***/
/* enable c++ scanner class generation */
%option c++
/* change the name of the scanner class. results in "ExampleFlexLexer" */
%option prefix="Example"
/* the manual says "somewhat more optimized" */
%option batch
/* enable scanner to generate debug output. disable this for release
* versions. */
%option debug
/* no support for include files is planned */
%option yywrap nounput
/* enables the use of start condition stacks */
%option stack
%x C_COMMENT
%x incl
/* The following paragraph suffices to track locations accurately. Each time
* yylex is invoked, the begin position is moved onto the end position. */
%{
#define YY_USER_ACTION yylloc->columns(yyleng);
%}
%% /*** Regular Expressions Part ***/
/* code to place at the beginning of yylex() */
%{
// reset location
yylloc->step();
%}
"/*" { BEGIN(C_COMMENT); }
<C_COMMENT>"*/" { BEGIN(INITIAL); }
<C_COMMENT>. { }
"=" { return(token::EQ);
}
"[" { return(token::LBRACK);
}
"]" { return(token::RBRACK);
}
"OS" { return(token::OSEK);
}
"EVENT" { return(token::EVENT);
}
"TASK" { return(token::TASK);
}
"ALARM" { return(token::ALARM);
}
"COUNTER" { return(token::COUNTER);
}
"OIL_VERSION" { return(token::OIL_VERSION);
}
"APPMODE" { return(token::APPMODE);
}
"CPU" { return (token::CPU);
}
"true"|"TRUE" { yylval->integerVal =1; return(token::VAL_BOOL);
}
"false"|"FALSE" { yylval->integerVal =0; return(token::VAL_BOOL);
}
"BOOLEAN" { return(token::BOOLEAN);
}
"INT" { return(token::INT);
}
"{" { return(token::LBRACE);
}
"}" { return(token::RBRACE);
}
":" { return(token::COLON);
}
"," { return(token::COMMA);
}
";" { return(token::SEMI);
}
([_A-Za-z])([a-zA-Z0-9!^_])* {yylval->stringVal = new std::string(yytext, yyleng);
return(token::STRING);
}
(([+-])?([0-9])*) {yylval->integerVal = fromInt( yytext );
return(token::NUMERAL);
}
(("0x")([0-9ABCDEFabcdef])*) {yylval->integerVal = fromHex( yytext );
return(token::NUMERAL);
}
(([-+]?[1-9][0-9]+\.[0-9]*)|([-+]?[0-9]*\.[0-9]+)|([-+]?[1-9]+))([eE][-+]?[0-9]+)?(f)? { yylval->doubleVal=atof(yytext);
return (token::VAL_FLOAT);
}
[\n\r]+ {
//yylloc->lines(yyleng);
yylloc->step();
LineCounter++;
//return token::EOL;
}
[\r\n]+ {
//yylloc->lines(yyleng);
yylloc->step();LineCounter++;
//return token::EOL;
}
[\t\r]+ { /* gobble up white-spaces */ yylloc->step(); }
[\s]+ { yylloc->step(); }
\"([^\"])*\" {
yytext[yyleng-1]= 0;
yylval->stringVal = new std::string( yytext, yyleng);
return(token::STRING);
}
. {
unsigned int temp;
temp= (unsigned int)(*yytext);
std::stringstream str2;
str2<<temp;
std::cout<<"Unknown character"<<*yytext<<" as Asci-value : "<<str2.str()<<std::endl;
return static_cast<token_type>(*yytext);
}
%% /*** Additional Code ***/
namespace implementation {
// Pass the input and output streams on to the generated ExampleFlexLexer base.
Scanner::Scanner(std::istream* in,
std::ostream* out)
: ExampleFlexLexer(in, out)
{
}
// Nothing to release here.
Scanner::~Scanner()
{
}
// Switch the scanner's debug flag (yy_flex_debug) on or off.
void Scanner::set_debug(bool b)
{
yy_flex_debug = b;
}
}
/* This implementation of ExampleFlexLexer::yylex() is required to fill the
* vtable of the class ExampleFlexLexer. We define the scanner's main yylex
* function via YY_DECL to reside in the Scanner class instead. */
#ifdef yylex
#undef yylex
#endif
int ExampleFlexLexer::yylex()
{
// Does no scanning: it only reports on stderr that it was called and returns 0.
std::cerr << "in ExampleFlexLexer::yylex() !" << std::endl;
return 0;
}
/* When the scanner receives an end-of-file indication from YY_INPUT, it then
* checks the yywrap() function. If yywrap() returns false (zero), then it is
* assumed that the function has gone ahead and set up `yyin' to point to
* another input file, and scanning continues. If it returns true (non-zero),
* then the scanner terminates, returning 0 to its caller. */
int ExampleFlexLexer::yywrap()
{
// Always non-zero: no further input is ever set up, so scanning ends at end of file.
return 1;
}
I modified the last rule so that it no longer tries to cast unknown text and instead just prints the ASCII values of the characters it captures, resulting in 32 47 47 32, i.e. " // ".
I will try to print out the stream next.
flex does not implement perlisms such as \s. The only backslash escape sequences it recognizes are standard C escapes such as \n. If you want to recognise a space character, use " " (or include the space in a character class, for example [ \t]+).
By the way, [\n\r]+ and [\r\n]+ recognize exactly the same thing: one or more repetitions of a single character which is either a newline or a return. So the second such rule will never match. I think flex will warn you about that.

regex (in flex) for complete general sentence

I am defining tokens inside flex as
%%
#[^\\\" \n\(\),=\{\}#~]+ {yylval.sval = strdup(yytext + 1); return ENTRYTYPE;}
[A-Za-z][A-Za-z0-9:"]* { yylval.sval = strdup(yytext); return KEY; }
\"([^"]|\\.)*\"|\{([^"]|\\.)*\} { yylval.sval = strdup(yytext); return VALUE; }
[ \t\n] ; /* ignore whitespace */
[{}=,] { return *yytext; }
. { fprintf(stderr, "Unrecognized character %c in input\n", *yytext); }
%%
(This is probably not a good way to do it.)
The problem is that the VALUE rule works fine for a quoted string of the form "some quote", but not for a value enclosed in braces (of the form {some sentences}), which the second alternative is meant to match.
What is wrong there?
I think that you want this, instead:
\"([^"]|\\.)*\"|\{([^\}]|\\.)*\} { yylval.sval = strdup(yytext); return VALUE; }
Even better, the following will be clearer and easier to maintain:
\"([^"\\]|\\(.|\n))*\" { /* a backslash is matched only as part of an escape pair, so an escaped quote cannot end the value and an escaped backslash cannot swallow the closing quote */ yylval.sval = strdup(yytext); return VALUE; }
\{([^\}\\]|\\(.|\n))*\} { /* same escape handling for brace-delimited values */ yylval.sval = strdup(yytext); return VALUE; }
Update
I have escaped the right brace in the character class expressions.