Spirit Lex: Which token definition generated this token? - c++

Sorry if this is a newbie question, but I need to know which token definition produced a certain token. When I print the token ID, I just get an integer. I need to know which regex generated this token.
Edit:
Here's how I define my tokens:
template <typename LexerT>
class Tokens: public lex::lexer<LexerT>
{
public:
Tokens(const std::string& input):
lineNo_(1)
{
using boost::spirit::lex::_start;
using boost::spirit::lex::_end;
using boost::spirit::lex::_pass;
using boost::phoenix::ref;
using boost::phoenix::construct;
// macros
this->self.add_pattern
("EXP", "(e|E)(\\+|-)?\\d+")
("SUFFIX", "[yzafpnumkKMGTPEZY]")
("INTEGER", "-?\\d+")
("FLOAT", "-?(((\\d+)|(\\d*\\.\\d+)|(\\d+\\.\\d*))({EXP}|{SUFFIX})?)")
("SYMBOL", "[a-zA-Z_?#](\\w|\\?|#)*")
("STRING", "\\\"([^\\\"]|\\\\\\\")*\\\"");
// whitespaces and comments
whitespaces_ = "\\s+";
comments_ = "(;[^\\n]*\\n)|(\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/)";
// literals
integer_ = "{INTEGER}";
float_ = "{FLOAT}";
symbol_ = "{SYMBOL}";
string_ = "{STRING}";
// operators
quote_ = "'";
backquote_ = '`';
// ... other tokens
// whitespace and comment rules
this->self += whitespaces_ [ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore];
this->self += comments_ [ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore];
// literal rules
this->self += integer_ | float_ | string_ | symbol_;
// this->self += ... other tokens
}
~Tokens() {}
size_t lineNo() { return lineNo_; }
private:
// ignored tokens
lex::token_def<lex::omit> whitespaces_, comments_;
// literal tokens
lex::token_def<int> integer_;
lex::token_def<std::string> float_, symbol_, string_;
// operator tokens
lex::token_def<> quote_, backquote_;
// ... other token definitions of type lex::token_def<>
// current line number
size_t lineNo_;
};
Thanks,
Haitham

From the docs http://www.boost.org/doc/libs/1_49_0/libs/spirit/doc/html/spirit/lex/tutorials/lexer_quickstart2.html:
To ensure every token gets assigned a id the Spirit.Lex library internally assigns unique numbers to the token definitions, starting with the constant defined by boost::spirit::lex::min_token_id
So you can in fact get the token id incrementally assigned. However, to make things a little bit more friendly/robust, I'd suggest makeing a helper function to determine the name of the token, so you can do something like this:
while (iter != end && token_is_valid(*iter))
{
std::cout << "Token: " <<
(iter->id() - lex::min_token_id) << ": " <<
toklexer.nameof(iter) << " ('" << iter->value() << "')\n";
++iter;
}
if (iter == end) { std::cout << "lineNo: " << toklexer.lineNo() << "\n"; }
Which, for input like:
const std::string str = "symbol \"string\" \n"
"this /* is a comment */\n"
"31415926E-7 123";
Would print:
Token: 5: symbol_ ('symbol')
Token: 4: string_ ('"string"')
Token: 5: symbol_ ('this')
Token: 3: float_ ('31415926E-7')
Token: 2: integer_ ('123')
lineNo: 3
Notes
I don't think it is possible to identify down to the pattern expression, since the information is not exposed and no longer available once the token is returned by the lexer
I think I remember seeing tokens with debug info (similar to qi::rule<>::name()?) but I can't currently find the documentation for it. The implementation of the Tokens::nameof(It) function would be greatly simplified if you could reuse the debug-name.
CODE
Fully working demo code (slightly adapted to Boost 1_49-1_57, GCC -std=c++0x):
Live On Coliru
#define BOOST_RESULT_OF_USE_DECLTYPE
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/phoenix/function/adapt_callable.hpp>
namespace qi = boost::spirit::qi;
namespace lex = boost::spirit::lex;
namespace phx = boost::phoenix;
///////////////////////////////////////////////////////////////////////////
// irrelevant for question: needed this locally to make it work with my boost
// version
namespace detail {
struct count {
template<class It1, class It2, class T> struct result { typedef ptrdiff_t type; };
template<class It1, class It2, class T>
typename result<It1, It2, T>::type operator()(It1 f, It2 l, T const& x) const {
return std::count(f, l, x);
}
};
}
BOOST_PHOENIX_ADAPT_CALLABLE(count, detail::count, 3);
///////////////////////////////////////////////////////////////////////////
template <typename LexerT>
class Tokens: public lex::lexer<LexerT>
{
public:
Tokens():
lineNo_(1)
{
using lex::_start;
using lex::_end;
using lex::_pass;
using phx::ref;
// macros
this->self.add_pattern
("EXP", "(e|E)(\\+|-)?\\d+")
("SUFFIX", "[yzafpnumkKMGTPEZY]")
("INTEGER", "-?\\d+")
("FLOAT", "-?(((\\d+)|(\\d*\\.\\d+)|(\\d+\\.\\d*))({EXP}|{SUFFIX})?)")
("SYMBOL", "[a-zA-Z_?#](\\w|\\?|#)*")
("STRING", "\\\"([^\\\"]|\\\\\\\")*\\\"");
// whitespaces and comments
whitespaces_ = "\\s+";
comments_ = "(;[^\\n]*\\n)|(\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/)";
// literals
integer_ = "{INTEGER}";
float_ = "{FLOAT}";
symbol_ = "{SYMBOL}";
string_ = "{STRING}";
// operators
quote_ = "'";
backquote_ = '`';
// ... other tokens
// whitespace and comment rules
//this->self.add(whitespaces_, 1001)
//(comments_, 1002);
this->self = whitespaces_ [phx::ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore]
| comments_ [phx::ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore];
// literal rules
this->self += integer_ | float_ | string_ | symbol_;
// this->self += ... other tokens
}
template <typename TokIter>
std::string nameof(TokIter it)
{
if (it->id() == whitespaces_.id()) return "whitespaces_";
if (it->id() == comments_.id()) return "comments_";
if (it->id() == integer_.id()) return "integer_";
if (it->id() == float_.id()) return "float_";
if (it->id() == symbol_.id()) return "symbol_";
if (it->id() == string_.id()) return "string_";
if (it->id() == quote_.id()) return "quote_";
if (it->id() == backquote_.id()) return "backquote_";
return "other";
}
~Tokens() {}
size_t lineNo() { return lineNo_; }
private:
// ignored tokens
lex::token_def</*lex::omit*/> whitespaces_, comments_;
// literal tokens
lex::token_def<int> integer_;
lex::token_def<std::string> float_, symbol_, string_;
// operator tokens
lex::token_def<> quote_, backquote_;
// ... other token definitions of type lex::token_def<>
// current line number
size_t lineNo_;
};
int main()
{
const std::string str = "symbol \"string\" \n"
"this /* is a comment */\n"
"31415926E-7 123";
typedef lex::lexertl::token<char const*> token_type;
typedef lex::lexertl::actor_lexer<token_type> lexer_type;
Tokens<lexer_type> toklexer;
char const* first = str.c_str();
char const* last = &first[str.size()];
lexer_type::iterator_type iter = toklexer.begin(first, last);
lexer_type::iterator_type end = toklexer.end();
while (iter != end && token_is_valid(*iter))
{
std::cout << "Token: " <<
(iter->id() - lex::min_token_id) << ": " <<
toklexer.nameof(iter) << " ('" << iter->value() << "')\n";
++iter;
}
if (iter == end) { std::cout << "lineNo: " << toklexer.lineNo() << "\n"; }
else {
std::string rest(first, last);
std::cout << "Lexical analysis failed\n" << "stopped at: \""
<< rest << "\"\n";
}
return 0;
}

Related

boost::spirit::karma grammar: Comma delimited output from struct with optionals attributes

I need a comma delimited output from a struct with optionals. For example, if I have this struct:
MyStruct
{
boost::optional<std::string> one;
boost::optional<int> two;
boost::optional<float> three;
};
An output like: { "string", 1, 3.0 } or { "string" } or { 1, 3.0 } and so on.
Now, I have code like this:
struct MyStruct
{
boost::optional<std::string> one;
boost::optional<int> two;
boost::optional<float> three;
};
BOOST_FUSION_ADAPT_STRUCT
(MyStruct,
one,
two,
three)
template<typename Iterator>
struct MyKarmaGrammar : boost::spirit::karma::grammar<Iterator, MyStruct()>
{
MyKarmaGrammar() : MyKarmaGrammar::base_type(request_)
{
using namespace std::literals::string_literals;
namespace karma = boost::spirit::karma;
using karma::int_;
using karma::double_;
using karma::string;
using karma::lit;
using karma::_r1;
key_ = '"' << string(_r1) << '"';
str_prop_ = key_(_r1) << ':'
<< string
;
int_prop_ = key_(_r1) << ':'
<< int_
;
dbl_prop_ = key_(_r1) << ':'
<< double_
;
//REQUEST
request_ = '{'
<< -str_prop_("one"s) <<
-int_prop_("two"s) <<
-dbl_prop_("three"s)
<< '}'
;
}
private:
//GENERAL RULES
boost::spirit::karma::rule<Iterator, void(std::string)> key_;
boost::spirit::karma::rule<Iterator, double(std::string)> dbl_prop_;
boost::spirit::karma::rule<Iterator, int(std::string)> int_prop_;
boost::spirit::karma::rule<Iterator, std::string(std::string)> str_prop_;
//REQUEST
boost::spirit::karma::rule<Iterator, MyStruct()> request_;
};
int main()
{
using namespace std::literals::string_literals;
MyStruct request = {std::string("one"), 2, 3.1};
std::string generated;
std::back_insert_iterator<std::string> sink(generated);
MyKarmaGrammar<std::back_insert_iterator<std::string>> serializer;
boost::spirit::karma::generate(sink, serializer, request);
std::cout << generated << std::endl;
}
This works but I need a comma delimited output. I tried with a grammar like:
request_ = '{'
<< (str_prop_("one"s) |
int_prop_("two"s) |
dbl_prop_("three"s)) % ','
<< '}'
;
But I receive this compile error:
/usr/include/boost/spirit/home/support/container.hpp:194:52: error: no type named ‘const_iterator’ in ‘struct MyStruct’
typedef typename Container::const_iterator type;
thanks!
Your struct is not a container, therefore list-operator% will not work. The documentation states it expects the attribute to be a container type.
So, just like in the Qi counterpart I showed you to create a conditional delim production:
delim = (&qi::lit('}')) | ',';
You'd need something similar here. However, everything about it is reversed. Instead of "detecting" the end of the input sequence from the presence of a {, we need to track the absense of preceding field from "not having output a field since opening brace yet".
That's a bit trickier since the required state cannot come from the same source as the input. We'll use a parser-member for simplicity here¹:
private:
bool _is_first_field;
Now, when we generate the opening brace, we want to initialize that to true:
auto _f = px::ref(_is_first_field); // short-hand
request_ %= lit('{') [ _f = true ]
Note: Use of %= instead of = tells Spirit that we want automatic attribute propagation to happen, in spite of the presence of a Semantic Action ([ _f = true ]).
Now, we need to generate the delimiter:
delim = eps(_f) | ", ";
Simple. Usage is also simple, except we'll want to conditionally reset the _f:
auto reset = boost::proto::deep_copy(eps [ _f = false ]);
str_prop_ %= (delim << key_(_r1) << string << reset) | "";
int_prop_ %= (delim << key_(_r1) << int_ << reset) | "";
dbl_prop_ %= (delim << key_(_r1) << double_ << reset) | "";
A very subtle point here is that I changed to the declared rule attribute types from T to optional<T>. This allows Karma to do the magic to fail the value generator if it's empty (boost::none), and skipping the reset!
ka::rule<Iterator, boost::optional<double>(std::string)> dbl_prop_;
ka::rule<Iterator, boost::optional<int>(std::string)> int_prop_;
ka::rule<Iterator, boost::optional<std::string>(std::string)> str_prop_;
Now, let's put together some testcases:
Test Cases
Live On Coliru
#include "iostream"
#include <boost/optional/optional_io.hpp>
#include <boost/fusion/include/io.hpp>
#include <boost/spirit/include/karma.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <string>
struct MyStruct {
boost::optional<std::string> one;
boost::optional<int> two;
boost::optional<double> three;
};
BOOST_FUSION_ADAPT_STRUCT(MyStruct, one, two, three)
namespace ka = boost::spirit::karma;
namespace px = boost::phoenix;
template<typename Iterator>
struct MyKarmaGrammar : ka::grammar<Iterator, MyStruct()> {
MyKarmaGrammar() : MyKarmaGrammar::base_type(request_) {
using namespace std::literals::string_literals;
using ka::int_;
using ka::double_;
using ka::string;
using ka::lit;
using ka::eps;
using ka::_r1;
auto _f = px::ref(_is_first_field);
auto reset = boost::proto::deep_copy(eps [ _f = false ]);
key_ = '"' << string(_r1) << "\":";
delim = eps(_f) | ", ";
str_prop_ %= (delim << key_(_r1) << string << reset) | "";
int_prop_ %= (delim << key_(_r1) << int_ << reset) | "";
dbl_prop_ %= (delim << key_(_r1) << double_ << reset) | "";
//REQUEST
request_ %= lit('{') [ _f = true ]
<< str_prop_("one"s) <<
int_prop_("two"s) <<
dbl_prop_("three"s)
<< '}';
}
private:
bool _is_first_field = true;
//GENERAL RULES
ka::rule<Iterator, void(std::string)> key_;
ka::rule<Iterator, boost::optional<double>(std::string)> dbl_prop_;
ka::rule<Iterator, boost::optional<int>(std::string)> int_prop_;
ka::rule<Iterator, boost::optional<std::string>(std::string)> str_prop_;
ka::rule<Iterator> delim;
//REQUEST
ka::rule<Iterator, MyStruct()> request_;
};
template <typename T> std::array<boost::optional<T>, 2> option(T const& v) {
return { { v, boost::none } };
}
int main() {
using namespace std::literals::string_literals;
for (auto a : option("one"s))
for (auto b : option(2))
for (auto c : option(3.1))
for (auto request : { MyStruct { a, b, c } }) {
std::string generated;
std::back_insert_iterator<std::string> sink(generated);
MyKarmaGrammar<std::back_insert_iterator<std::string>> serializer;
ka::generate(sink, serializer, request);
std::cout << boost::fusion::as_vector(request) << ":\t" << generated << "\n";
}
}
Printing:
( one 2 3.1): {"one":one, "two":2, "three":3.1}
( one 2 --): {"one":one, "two":2}
( one -- 3.1): {"one":one, "three":3.1}
( one -- --): {"one":one}
(-- 2 3.1): {"two":2, "three":3.1}
(-- 2 --): {"two":2}
(-- -- 3.1): {"three":3.1}
(-- -- --): {}
¹ Note this limits re-entrant use of the parser, as well as making it non-const etc. karma::locals are the true answer to that, adding a little more complexity

How to tokenize C++ using Boost Regex

I'm currently working on a tokenizer for a class using boost regex. I'm not too familiar with boost, so I may be way off base on what I have so far but anyway, here is what I'm using:
regex re("[\\s*,()=;<>\+-]{1,2}");
sregex_token_iterator i(text.begin(), text.end(), re, -1);
sregex_token_iterator j;
sregex_token_iterator begin(text.begin(), text.end(), re), end;
unsigned count = 0;
while(i != j)
{
if(*i != ' ' && *i != '\n')
{
count++;
cout << "From i - " << count << " " << *i << endl;
}
i++;
if(*begin != ' ' && *begin != '\n')
{
count++;
cout << "Form j - " << count << " " << *begin << endl;
}
begin++;
}
cout << "There were " << count << " tokens found." << endl;
So, basically, I'm using the spaces and the symbols as delimiters, but I'm still outputting both (since I still want the symbols to be tokens). Like I said, I'm not extremely familiar with boost, so I'm not positive if I'm taking the right approach.
My end goal is to split a file that has a simple c++ block of code and tokenize it, here's the example file I am using:
#define MAX 5
int main(int argc)
{
for(int i = 0; i < MAX; i ++)
{
cout << "i is equal to " << i << endl;
}
return 0;
}
I'm having trouble with the fact that it is counting next lines and blank spaces as tokens, and I need them to be thrown away really. Also, I'm having a hard time with the "++" token, I can't seem to figure out the right expression for it to count "++".
Any help would be greatly appreciated!
Thanks!
Tim
First off,
Boost has Boost Wave which has (several, I think) ready-made tokenizers for C++ source
Boost has Spirit Lex which is a lexer that can tokenize based on regex patterns and some state support. It allows both dynamic lexer tables and statically generated lexer tables
In case you're interested in using Lex I ran a quick & dirty finger exercise for myself: it tokenizes itself Live On Coliru.
Notes:
A Lex tokenizer plays nicely with Boost Spirit Qi for parsing (though in all honesty, I prefer doing Spirit grammars directly on the source iterators).
It exposes an iterator interface, allthough my example leverages the callback interface to display the tokens:
int main()
{
typedef boost::spirit::istream_iterator It;
typedef lex::lexertl::token<It, boost::mpl::vector<int, double>, boost::mpl::true_ > token_type;
tokens<lex::lexertl::actor_lexer<token_type> > lexer;
std::ifstream ifs("main.cpp");
ifs >> std::noskipws;
It first(ifs), last;
bool ok = lex::tokenize(first, last, lexer, process_token());
std::cout << "\nTokenization " << (ok?"succeeded":"failed") << "; remaining input: '" << std::string(first,last) << "'\n";
}
Which is tokenized in the output as (trimming the preceding output):
[int][main][(][)][{][typedef][boost][::][spirit][::][istream_iterator][It][;][typedef][lex][::][lexertl][::][token][<][It][,][boost][::][mpl][::][vector][<][int][,][double][>][,][boost][::][mpl][::][true_][>][token_type][;][tokens][<][lex][::][lexertl][::][actor_lexer][<][token_type][>][>][lexer][;][std][::][ifstream][ifs][(]["main.cpp"][)][;][ifs][>>][std][::][noskipws][;][It][first][(][ifs][)][,][last][;][bool][ok][=][lex][::][tokenize][(][first][,][last][,][lexer][,][process_token][(][)][)][;][std][::][cout][<<]["\nTokenization "][<<][(][ok][?]["succeeded"][:]["failed"][)][<<]["; remaining input: '"][<<][std][::][string][(][first][,][last][)][<<]["'\n"][;][}]
Tokenization succeeded; remaining input: ''
You should actually want a different lexer state for parsing the preprocessor directives (line-ends become meaningful and several other expressions/keywords are valid). In real life, there's often a separate preprocessor step doing its own lexing here. (The fallout of this can be seen when lexing the include file specifications, e.g.)
ordering of tokens in the lexer is critical for the result
in this sample, you'd always match the & token as a binop_. You'd
probably want to match a ampersand_ token and decide at parse time
whether it's a binary operator (bitwise-and), unary operator (adress-of), reference type-qualifier etc. C++ is really interesting to parse :|
Comments are supported!
digraphs/trigraphs are not supported :)
pragmas, line/file directives etc. are unsupported
All in all, this should be pretty usable if you wanted to make, say, a simple syntax highlighter or formatter. Anything beyond that should require some more parsing/semantic analysis.
Full Listing:
#include <boost/spirit/include/support_istream_iterator.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <fstream>
#include <sstream>
#include <boost/lexical_cast.hpp>
namespace lex = boost::spirit::lex;
template <typename Lexer>
struct tokens : lex::lexer<Lexer>
{
tokens()
{
pound_ = "#";
define_ = "define";
if_ = "if";
else_ = "else";
endif_ = "endif";
ifdef_ = "ifdef";
ifndef_ = "ifndef";
defined_ = "defined";
keyword_ = "for|break|continue|while|do|switch|case|default|if|else|return|goto|throw|catch"
"static|volatile|auto|void|int|char|signed|unsigned|long|double|float|"
"delete|new|virtual|override|final|"
"typename|template|using|namespace|extern|\"C\"|"
"friend|public|private|protected|"
"class|struct|enum|"
"register|thread_local|noexcept|constexpr";
scope_ = "::";
dot_ = '.';
arrow_ = "->";
star_ = '*';
popen_ = '(';
pclose_ = ')';
bopen_ = '{';
bclose_ = '}';
iopen_ = '[';
iclose_ = ']';
colon_ = ':';
semic_ = ';';
comma_ = ',';
tern_q_ = '?';
relop_ = "==|!=|<=|>=|<|>";
assign_ = '=';
incr_ = "\\+\\+";
decr_ = "--";
binop_ = "[-+/%&|^]|>>|<<";
unop_ = "[-+~!]";
real_ = "[-+]?[0-9]+(e[-+]?[0-9]+)?f?";
int_ = "[-+]?[0-9]+";
identifier_ = "[a-zA-Z_][a-zA-Z0-9_]*";
ws_ = "[ \\t\\r\\n]";
line_comment_ = "\\/\\/.*?[\\r\\n]";
block_comment_ = "\\/\\*.*?\\*\\/";
this->self.add_pattern
("SCHAR", "\\\\(x[0-9a-fA-F][0-9a-fA-F]|[\\\\\"'0tbrn])|[^\"\\\\'\\r\\n]")
;
string_lit = "\\\"('|{SCHAR})*?\\\"";
char_lit = "'(\\\"|{SCHAR})'";
this->self +=
pound_ | define_ | if_ | else_ | endif_ | ifdef_ | ifndef_ | defined_
| keyword_ | scope_ | dot_ | arrow_ | star_ | popen_ | pclose_ | bopen_ | bclose_ | iopen_ | iclose_ | colon_ | semic_ | comma_ | tern_q_
| relop_ | assign_ | incr_ | decr_ | binop_ | unop_
| int_ | real_ | identifier_ | string_lit | char_lit
// ignore whitespace and comments
| ws_ [ lex::_pass = lex::pass_flags::pass_ignore ]
| line_comment_ [ lex::_pass = lex::pass_flags::pass_ignore ]
| block_comment_[ lex::_pass = lex::pass_flags::pass_ignore ]
;
}
private:
lex::token_def<> pound_, define_, if_, else_, endif_, ifdef_, ifndef_, defined_;
lex::token_def<> keyword_, scope_, dot_, arrow_, star_, popen_, pclose_, bopen_, bclose_, iopen_, iclose_, colon_, semic_, comma_, tern_q_;
lex::token_def<> relop_, assign_, incr_, decr_, binop_, unop_;
lex::token_def<int> int_;
lex::token_def<double> real_;
lex::token_def<> identifier_, string_lit, char_lit;
lex::token_def<lex::omit> ws_, line_comment_, block_comment_;
};
struct token_value : boost::static_visitor<std::string>
{
template <typename... T> // the token value can be a variant over any of the exposed attribute types
std::string operator()(boost::variant<T...> const& v) const {
return boost::apply_visitor(*this, v);
}
template <typename T> // the default value is a pair of iterators into the source sequence
std::string operator()(boost::iterator_range<T> const& v) const {
return { v.begin(), v.end() };
}
template <typename T>
std::string operator()(T const& v) const {
// not taken unless used in Spirit Qi rules, I guess
return std::string("attr<") + typeid(v).name() + ">(" + boost::lexical_cast<std::string>(v) + ")";
}
};
struct process_token
{
template <typename T>
bool operator()(T const& token) const {
std::cout << '[' /*<< token.id() << ":" */<< print(token.value()) << "]";
return true;
}
token_value print;
};
#if 0
std::string read(std::string fname)
{
std::ifstream ifs(fname);
std::ostringstream oss;
oss << ifs.rdbuf();
return oss.str();
}
#endif
int main()
{
typedef boost::spirit::istream_iterator It;
typedef lex::lexertl::token<It, boost::mpl::vector<int, double>, boost::mpl::true_ > token_type;
tokens<lex::lexertl::actor_lexer<token_type> > lexer;
std::ifstream ifs("main.cpp");
ifs >> std::noskipws;
It first(ifs), last;
bool ok = lex::tokenize(first, last, lexer, process_token());
std::cout << "\nTokenization " << (ok?"succeeded":"failed") << "; remaining input: '" << std::string(first,last) << "'\n";
}

boost::spirit access position iterator from semantic actions

Lets say I have code like this (line numbers for reference):
1:
2:function FuncName_1 {
3: var Var_1 = 3;
4: var Var_2 = 4;
5: ...
I want to write a grammar that parses such text, puts all indentifiers (function and variable names) infos into a tree (utree?).
Each node should preserve: line_num, column_num and symbol value. example:
root: FuncName_1 (line:2,col:10)
children[0]: Var_1 (line:3, col:8)
children[1]: Var_1 (line:4, col:9)
I want to put it into the tree because I plan to traverse through that tree and for each node I must know the 'context': (all parent nodes of current nodes).
E.g, while processing node with Var_1, I must know that this is a name for local variable for function FuncName_1 (that is currently being processed as node, but one level earlier)
I cannot figure out few things
Can this be done in Spirit with semantic actions and utree's ? Or should I use variant<> trees ?
How to pass to the node those three informations (column,line,symbol_name) at the same time ? I know I must use pos_iterator as iterator type for grammar but how to access those information in sematic action ?
I'm a newbie in Boost so I read the Spirit documentaiton over and over, I try to google my problems but I somehow cannot put all the pieces together ot find the solution. Seems like there was no one me with such use case like mine before (or I'm just not able to find it)
Looks like the only solutions with position iterator are the ones with parsing error handling, but this is not the case I'm interested in.
The code that only parses the code I was taking about is below but I dont know how to move forward with it.
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/support_line_pos_iterator.hpp>
namespace qi = boost::spirit::qi;
typedef boost::spirit::line_pos_iterator<std::string::const_iterator> pos_iterator_t;
template<typename Iterator=pos_iterator_t, typename Skipper=qi::space_type>
struct ParseGrammar: public qi::grammar<Iterator, Skipper>
{
ParseGrammar():ParseGrammar::base_type(SourceCode)
{
using namespace qi;
KeywordFunction = lit("function");
KeywordVar = lit("var");
SemiColon = lit(';');
Identifier = lexeme [alpha >> *(alnum | '_')];
VarAssignemnt = KeywordVar >> Identifier >> char_('=') >> int_ >> SemiColon;
SourceCode = KeywordFunction >> Identifier >> '{' >> *VarAssignemnt >> '}';
}
qi::rule<Iterator, Skipper> SourceCode;
qi::rule<Iterator > KeywordFunction;
qi::rule<Iterator, Skipper> VarAssignemnt;
qi::rule<Iterator> KeywordVar;
qi::rule<Iterator> SemiColon;
qi::rule<Iterator > Identifier;
};
int main()
{
std::string const content = "function FuncName_1 {\n var Var_1 = 3;\n var Var_2 = 4; }";
pos_iterator_t first(content.begin()), iter = first, last(content.end());
ParseGrammar<pos_iterator_t> resolver; // Our parser
bool ok = phrase_parse(iter,
last,
resolver,
qi::space);
std::cout << std::boolalpha;
std::cout << "\nok : " << ok << std::endl;
std::cout << "full : " << (iter == last) << std::endl;
if(ok && iter == last)
{
std::cout << "OK: Parsing fully succeeded\n\n";
}
else
{
int line = get_line(iter);
int column = get_column(first, iter);
std::cout << "-------------------------\n";
std::cout << "ERROR: Parsing failed or not complete\n";
std::cout << "stopped at: " << line << ":" << column << "\n";
std::cout << "remaining: '" << std::string(iter, last) << "'\n";
std::cout << "-------------------------\n";
}
return 0;
}
This has been a fun exercise, where I finally put together a working demo of on_success[1] to annotate AST nodes.
Let's assume we want an AST like:
namespace ast
{
struct LocationInfo {
unsigned line, column, length;
};
struct Identifier : LocationInfo {
std::string name;
};
struct VarAssignment : LocationInfo {
Identifier id;
int value;
};
struct SourceCode : LocationInfo {
Identifier function;
std::vector<VarAssignment> assignments;
};
}
I know, 'location information' is probably overkill for the SourceCode node, but you know... Anyways, to make it easy to assign attributes to these nodes without requiring semantic actions or lots of specifically crafted constructors:
#include <boost/fusion/adapted/struct.hpp>
BOOST_FUSION_ADAPT_STRUCT(ast::Identifier, (std::string, name))
BOOST_FUSION_ADAPT_STRUCT(ast::VarAssignment, (ast::Identifier, id)(int, value))
BOOST_FUSION_ADAPT_STRUCT(ast::SourceCode, (ast::Identifier, function)(std::vector<ast::VarAssignment>, assignments))
There. Now we can declare the rules to expose these attributes:
qi::rule<Iterator, ast::SourceCode(), Skipper> SourceCode;
qi::rule<Iterator, ast::VarAssignment(), Skipper> VarAssignment;
qi::rule<Iterator, ast::Identifier()> Identifier;
// no skipper, no attributes:
qi::rule<Iterator> KeywordFunction, KeywordVar, SemiColon;
We don't (essentially) modify the grammar, at all: attribute propagation is "just automatic"[2] :
KeywordFunction = lit("function");
KeywordVar = lit("var");
SemiColon = lit(';');
Identifier = as_string [ alpha >> *(alnum | char_("_")) ];
VarAssignment = KeywordVar >> Identifier >> '=' >> int_ >> SemiColon;
SourceCode = KeywordFunction >> Identifier >> '{' >> *VarAssignment >> '}';
The magic
How do we get the source location information attached to our nodes?
auto set_location_info = annotate(_val, _1, _3);
on_success(Identifier, set_location_info);
on_success(VarAssignment, set_location_info);
on_success(SourceCode, set_location_info);
Now, annotate is just a lazy version of a calleable that is defined as:
template<typename It>
struct annotation_f {
typedef void result_type;
annotation_f(It first) : first(first) {}
It const first;
template<typename Val, typename First, typename Last>
void operator()(Val& v, First f, Last l) const {
do_annotate(v, f, l, first);
}
private:
void static do_annotate(ast::LocationInfo& li, It f, It l, It first) {
using std::distance;
li.line = get_line(f);
li.column = get_column(first, f);
li.length = distance(f, l);
}
static void do_annotate(...) { }
};
Due to way in which get_column works, the functor is stateful (as it remembers the start iterator)[3]. As you can see do_annotate just accepts anything that derives from LocationInfo.
Now, the proof of the pudding:
std::string const content = "function FuncName_1 {\n var Var_1 = 3;\n var Var_2 = 4; }";
pos_iterator_t first(content.begin()), iter = first, last(content.end());
ParseGrammar<pos_iterator_t> resolver(first); // Our parser
ast::SourceCode program;
bool ok = phrase_parse(iter,
last,
resolver,
qi::space,
program);
std::cout << std::boolalpha;
std::cout << "ok : " << ok << std::endl;
std::cout << "full: " << (iter == last) << std::endl;
if(ok && iter == last)
{
std::cout << "OK: Parsing fully succeeded\n\n";
std::cout << "Function name: " << program.function.name << " (see L" << program.printLoc() << ")\n";
for (auto const& va : program.assignments)
std::cout << "variable " << va.id.name << " assigned value " << va.value << " at L" << va.printLoc() << "\n";
}
else
{
int line = get_line(iter);
int column = get_column(first, iter);
std::cout << "-------------------------\n";
std::cout << "ERROR: Parsing failed or not complete\n";
std::cout << "stopped at: " << line << ":" << column << "\n";
std::cout << "remaining: '" << std::string(iter, last) << "'\n";
std::cout << "-------------------------\n";
}
This prints:
ok : true
full: true
OK: Parsing fully succeeded
Function name: FuncName_1 (see L1:1:56)
variable Var_1 assigned value 3 at L2:3:14
variable Var_2 assigned value 4 at L3:3:15
Full Demo Program
See it Live On Coliru
Also showing:
error handling, e.g.:
Error: expecting "=" in line 3:
var Var_2 - 4; }
^---- here
ok : false
full: false
-------------------------
ERROR: Parsing failed or not complete
stopped at: 1:1
remaining: 'function FuncName_1 {
var Var_1 = 3;
var Var_2 - 4; }'
-------------------------
BOOST_SPIRIT_DEBUG macros
A bit of a hacky way to conveniently stream the LocationInfo part of any AST node, sorry :)
//#define BOOST_SPIRIT_DEBUG
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/support_line_pos_iterator.hpp>
#include <iomanip>
namespace qi = boost::spirit::qi;
namespace phx= boost::phoenix;
typedef boost::spirit::line_pos_iterator<std::string::const_iterator> pos_iterator_t;
namespace ast
{
namespace manip { struct LocationInfoPrinter; }
struct LocationInfo {
unsigned line, column, length;
manip::LocationInfoPrinter printLoc() const;
};
struct Identifier : LocationInfo {
std::string name;
};
struct VarAssignment : LocationInfo {
Identifier id;
int value;
};
struct SourceCode : LocationInfo {
Identifier function;
std::vector<VarAssignment> assignments;
};
///////////////////////////////////////////////////////////////////////////
// Completely unnecessary tweak to get a "poor man's" io manipulator going
// so we can do `std::cout << x.printLoc()` on types of `x` deriving from
// LocationInfo
namespace manip {
struct LocationInfoPrinter {
LocationInfoPrinter(LocationInfo const& ref) : ref(ref) {}
LocationInfo const& ref;
friend std::ostream& operator<<(std::ostream& os, LocationInfoPrinter const& lip) {
return os << lip.ref.line << ':' << lip.ref.column << ':' << lip.ref.length;
}
};
}
manip::LocationInfoPrinter LocationInfo::printLoc() const { return { *this }; }
// feel free to disregard this hack
///////////////////////////////////////////////////////////////////////////
}
BOOST_FUSION_ADAPT_STRUCT(ast::Identifier, (std::string, name))
BOOST_FUSION_ADAPT_STRUCT(ast::VarAssignment, (ast::Identifier, id)(int, value))
BOOST_FUSION_ADAPT_STRUCT(ast::SourceCode, (ast::Identifier, function)(std::vector<ast::VarAssignment>, assignments))
struct error_handler_f {
typedef qi::error_handler_result result_type;
template<typename T1, typename T2, typename T3, typename T4>
qi::error_handler_result operator()(T1 b, T2 e, T3 where, T4 const& what) const {
std::cerr << "Error: expecting " << what << " in line " << get_line(where) << ": \n"
<< std::string(b,e) << "\n"
<< std::setw(std::distance(b, where)) << '^' << "---- here\n";
return qi::fail;
}
};
template<typename It>
struct annotation_f {
typedef void result_type;
annotation_f(It first) : first(first) {}
It const first;
template<typename Val, typename First, typename Last>
void operator()(Val& v, First f, Last l) const {
do_annotate(v, f, l, first);
}
private:
void static do_annotate(ast::LocationInfo& li, It f, It l, It first) {
using std::distance;
li.line = get_line(f);
li.column = get_column(first, f);
li.length = distance(f, l);
}
static void do_annotate(...) {}
};
template<typename Iterator=pos_iterator_t, typename Skipper=qi::space_type>
struct ParseGrammar: public qi::grammar<Iterator, ast::SourceCode(), Skipper>
{
ParseGrammar(Iterator first) :
ParseGrammar::base_type(SourceCode),
annotate(first)
{
using namespace qi;
KeywordFunction = lit("function");
KeywordVar = lit("var");
SemiColon = lit(';');
Identifier = as_string [ alpha >> *(alnum | char_("_")) ];
VarAssignment = KeywordVar > Identifier > '=' > int_ > SemiColon; // note: expectation points
SourceCode = KeywordFunction >> Identifier >> '{' >> *VarAssignment >> '}';
on_error<fail>(VarAssignment, handler(_1, _2, _3, _4));
on_error<fail>(SourceCode, handler(_1, _2, _3, _4));
auto set_location_info = annotate(_val, _1, _3);
on_success(Identifier, set_location_info);
on_success(VarAssignment, set_location_info);
on_success(SourceCode, set_location_info);
BOOST_SPIRIT_DEBUG_NODES((KeywordFunction)(KeywordVar)(SemiColon)(Identifier)(VarAssignment)(SourceCode))
}
phx::function<error_handler_f> handler;
phx::function<annotation_f<Iterator>> annotate;
qi::rule<Iterator, ast::SourceCode(), Skipper> SourceCode;
qi::rule<Iterator, ast::VarAssignment(), Skipper> VarAssignment;
qi::rule<Iterator, ast::Identifier()> Identifier;
// no skipper, no attributes:
qi::rule<Iterator> KeywordFunction, KeywordVar, SemiColon;
};
int main()
{
std::string const content = "function FuncName_1 {\n var Var_1 = 3;\n var Var_2 - 4; }";
pos_iterator_t first(content.begin()), iter = first, last(content.end());
ParseGrammar<pos_iterator_t> resolver(first); // Our parser
ast::SourceCode program;
bool ok = phrase_parse(iter,
last,
resolver,
qi::space,
program);
std::cout << std::boolalpha;
std::cout << "ok : " << ok << std::endl;
std::cout << "full: " << (iter == last) << std::endl;
if(ok && iter == last)
{
std::cout << "OK: Parsing fully succeeded\n\n";
std::cout << "Function name: " << program.function.name << " (see L" << program.printLoc() << ")\n";
for (auto const& va : program.assignments)
std::cout << "variable " << va.id.name << " assigned value " << va.value << " at L" << va.printLoc() << "\n";
}
else
{
int line = get_line(iter);
int column = get_column(first, iter);
std::cout << "-------------------------\n";
std::cout << "ERROR: Parsing failed or not complete\n";
std::cout << "stopped at: " << line << ":" << column << "\n";
std::cout << "remaining: '" << std::string(iter, last) << "'\n";
std::cout << "-------------------------\n";
}
return 0;
}
[1] sadly un(der)documented, except for the conjure sample(s)
[2] well, I used as_string to get proper assignment to Identifier without too much work
[3] There could be smarter ways about this in terms of performance, but for now, let's keep it simple

Boost Spirit Signals Successful Parsing Despite Token Being Incomplete

I have a very simple path construct that I am trying to parse with boost spirit.lex.
We have the following grammar:
token := [a-z]+
path := (token : path) | (token)
So we're just talking about colon separated lower-case ASCII strings here.
I have three examples "xyz", "abc:xyz", "abc:xyz:".
The first two should be deemed valid. The third one, which has a trailing colon, should not be deemed valid. Unfortunately the parser I have recognizes all three as being valid. The grammar should not allow an empty token, but apparently spirit is doing just that. What am I missing to get the third one rejected?
Also, if you read the code below, in comments there is another version of the parser that demands that all paths end with semi-colons. I can get appropriate behavior when I activate those lines, (i.e. rejection of "abc:xyz:;"), but this is not really what I want.
Anyone have any ideas?
Thanks.
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <iostream>
#include <string>
using namespace boost::spirit;
using boost::phoenix::val;
template<typename Lexer>
struct PathTokens : boost::spirit::lex::lexer<Lexer>
{
PathTokens()
{
identifier = "[a-z]+";
separator = ":";
this->self.add
(identifier)
(separator)
(';')
;
}
boost::spirit::lex::token_def<std::string> identifier, separator;
};
template <typename Iterator>
struct PathGrammar
: boost::spirit::qi::grammar<Iterator>
{
template <typename TokenDef>
PathGrammar(TokenDef const& tok)
: PathGrammar::base_type(path)
{
using boost::spirit::_val;
path
=
(token >> tok.separator >> path)[std::cerr << _1 << "\n"]
|
//(token >> ';')[std::cerr << _1 << "\n"]
(token)[std::cerr << _1 << "\n"]
;
token
= (tok.identifier) [_val=_1]
;
}
boost::spirit::qi::rule<Iterator> path;
boost::spirit::qi::rule<Iterator, std::string()> token;
};
int main()
{
typedef std::string::iterator BaseIteratorType;
typedef boost::spirit::lex::lexertl::token<BaseIteratorType, boost::mpl::vector<std::string> > TokenType;
typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
typedef PathTokens<LexerType>::iterator_type TokensIterator;
typedef std::vector<std::string> Tests;
Tests paths;
paths.push_back("abc");
paths.push_back("abc:xyz");
paths.push_back("abc:xyz:");
/*
paths.clear();
paths.push_back("abc;");
paths.push_back("abc:xyz;");
paths.push_back("abc:xyz:;");
*/
for ( Tests::iterator iter = paths.begin(); iter != paths.end(); ++iter )
{
std::string str = *iter;
std::cerr << "*****" << str << "*****\n";
PathTokens<LexerType> tokens;
PathGrammar<TokensIterator> grammar(tokens);
BaseIteratorType first = str.begin();
BaseIteratorType last = str.end();
bool r = boost::spirit::lex::tokenize_and_parse(first, last, tokens, grammar);
std::cerr << r << " " << (first==last) << "\n";
}
}
I addition to to what llonesmiz already said, here's a trick using qi::eoi that I sometimes use:
path = (
(token >> tok.separator >> path) [std::cerr << _1 << "\n"]
| token [std::cerr << _1 << "\n"]
) >> eoi;
This makes the grammar require eoi (end-of-input) at the end of a successful match. This leads to the desired result:
http://liveworkspace.org/code/23a7adb11889bbb2825097d7c553f71d
*****abc*****
abc
1 1
*****abc:xyz*****
xyz
abc
1 1
*****abc:xyz:*****
xyz
abc
0 1
The problem lies in the meaning of first and last after your call to tokenize_and_parse. first==last checks if your string has been completely tokenized, you can't infer anything about grammar. If you isolate the parsing like this, you obtain the expected result:
PathTokens<LexerType> tokens;
PathGrammar<TokensIterator> grammar(tokens);
BaseIteratorType first = str.begin();
BaseIteratorType last = str.end();
LexerType::iterator_type lexfirst = tokens.begin(first,last);
LexerType::iterator_type lexlast = tokens.end();
bool r = parse(lexfirst, lexlast, grammar);
std::cerr << r << " " << (lexfirst==lexlast) << "\n";
This is what I finally ended up with. It uses the suggestions from both #sehe and #llonesmiz. Note the conversion to std::wstring and the use of actions in the grammar definition, which were not present in the original post.
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/bind.hpp>
#include <iostream>
#include <string>
//
// This example uses boost spirit to parse a simple
// colon-delimited grammar.
//
// The grammar we want to recognize is:
// identifier := [a-z]+
// separator = :
// path= (identifier separator path) | identifier
//
// From the boost spirit perspective this example shows
// a few things I found hard to come by when building my
// first parser.
// 1. How to flag an incomplete token at the end of input
// as an error. (use of boost::spirit::eoi)
// 2. How to bind an action on an instance of an object
// that is taken as input to the parser.
// 3. Use of std::wstring.
// 4. Use of the lexer iterator.
//
// This using directive will cause issues with boost::bind
// when referencing placeholders such as _1.
// using namespace boost::spirit;
//! A class that tokenizes our input.
template<typename Lexer>
struct Tokens : boost::spirit::lex::lexer<Lexer>
{
Tokens()
{
identifier = L"[a-z]+";
separator = L":";
this->self.add
(identifier)
(separator)
;
}
boost::spirit::lex::token_def<std::wstring, wchar_t> identifier, separator;
};
//! This class provides a callback that echoes strings to stderr.
struct Echo
{
void echo(boost::fusion::vector<std::wstring> const& t) const
{
using namespace boost::fusion;
std::wcerr << at_c<0>(t) << "\n";
}
};
//! The definition of our grammar, as described above.
template <typename Iterator>
struct Grammar : boost::spirit::qi::grammar<Iterator>
{
template <typename TokenDef>
Grammar(TokenDef const& tok, Echo const& e)
: Grammar::base_type(path)
{
using boost::spirit::_val;
path
=
((token >> tok.separator >> path)[boost::bind(&Echo::echo, e,::_1)]
|
(token)[boost::bind(&Echo::echo, &e, ::_1)]
) >> boost::spirit::eoi; // Look for end of input.
token
= (tok.identifier) [_val=boost::spirit::qi::_1]
;
}
boost::spirit::qi::rule<Iterator> path;
boost::spirit::qi::rule<Iterator, std::wstring()> token;
};
int main()
{
// A set of typedefs to make things a little clearer. This stuff is
// well described in the boost spirit documentation/examples.
typedef std::wstring::iterator BaseIteratorType;
typedef boost::spirit::lex::lexertl::token<BaseIteratorType, boost::mpl::vector<std::wstring> > TokenType;
typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
typedef Tokens<LexerType>::iterator_type TokensIterator;
typedef LexerType::iterator_type LexerIterator;
// Define some paths to parse.
typedef std::vector<std::wstring> Tests;
Tests paths;
paths.push_back(L"abc");
paths.push_back(L"abc:xyz");
paths.push_back(L"abc:xyz:");
paths.push_back(L":");
// Parse 'em.
for ( Tests::iterator iter = paths.begin(); iter != paths.end(); ++iter )
{
std::wstring str = *iter;
std::wcerr << L"*****" << str << L"*****\n";
Echo e;
Tokens<LexerType> tokens;
Grammar<TokensIterator> grammar(tokens, e);
BaseIteratorType first = str.begin();
BaseIteratorType last = str.end();
// Have the lexer consume our string.
LexerIterator lexFirst = tokens.begin(first, last);
LexerIterator lexLast = tokens.end();
// Have the parser consume the output of the lexer.
bool r = boost::spirit::qi::parse(lexFirst, lexLast, grammar);
// Print the status and whether or note all output of the lexer
// was processed.
std::wcerr << r << L" " << (lexFirst==lexLast) << L"\n";
}
}

Cross-platform way to get line number of an INI file where given option was found

Looking for some C++ library (like boost::program_options) that is able to return line number of an INI file, where the given option or section was found.
Use cases:
I ask that library to find value "vvv" in a section "[SSS]". Library returns line number where "vvv" in section "[SSS]" is found, or -1. It gives me an ability to say "line 55: vvv must be < 256".
I iterate INI file for sections and validate their names. When some wild secsion is found, i tell: "line 55: section [Hahaha] is unknown".
update: i know about "INI is older than mammoth", but currently i have to port large windows project to cross-platform and cannot get rid of .ini files soon.
Once again, took the opportunity to play with Boost Spirit. This time I got to play with line_pos_iterator.
Here is the fruit of my labour: https://gist.github.com/1425972
When POSITIONINFO == 0
input is streaming
output is raw strings (well, map<string, map<string, string> > for the sections)
When POSITIONINFO == 1
input is buffered
output is textnode_t:
struct textnode_t {
int sline, eline, scol, ecol;
string_t text;
};
This means that the resulting map<textnode_t, map<textnode_t, textnode_t> > is able to report exactly what (line,col) start and end points mark the individual text nodes. See test output for a demo
Comments (#, /* ... */ style) have been implemented
Whitespace is 'tolerated'
name = value # use a comment to force inclusion of trailing whitespace
alternative = escape\ with slash\
De-escaping of the slashes is left as an exercise
Errors are also reported with full position info if enabled
NOTE C++11 support is NOT required, but I used it to dump the result of the parse. I'm too lazy to write it with C++03 verbose iterator style. :)
All code, makefile, example.ini can be found here: https://gist.github.com/1425972
Code
/* inireader.h */
#pragma once
#define POSITIONINFO 0
#include <map>
#include <string>
#include <iterator>
#include <boost/tuple/tuple_comparison.hpp>
template <typename S=std::string, typename Cmp=std::less<S> >
class IniFile
{
public:
IniFile(Cmp cmp=Cmp()) : _cmp(cmp)
{}
IniFile(const std::string& filename, Cmp cmp=Cmp()) : _cmp(cmp)
{ open(filename); }
void open(const std::string& filename);
typedef S string_t;
#if POSITIONINFO
struct textnode_t
{
int sline, eline,
scol, ecol;
string_t text;
operator const string_t&() const { return text; }
friend std::ostream& operator<<(std::ostream& os, const textnode_t& t)
{
os << "[L:" << t.sline << ",C" << t.scol << " .. L" << t.eline << ",C" << t.ecol << ":";
for (typename string_t::const_iterator it=t.text.begin(); it!=t.text.end(); ++it)
switch (*it)
{
case '\r' : os << "\\r"; break;
case '\n' : os << "\\n"; break;
case '\t' : os << "\\t"; break;
case '\0' : os << "\\0"; break;
default: os << *it ; break;
}
return os << "]";
}
bool operator<(const textnode_t& o) const
{ return boost::tie(text/*, sline, eline, scol, ecol*/) <
boost::tie(o.text/*, o.sline, o.eline, o.scol, o.ecol*/); }
textnode_t() : sline(0), eline(0), scol(0), ecol(0) { }
};
#else
typedef string_t textnode_t;
#endif
typedef std::pair<textnode_t, textnode_t> keyvalue_t;
typedef std::map<textnode_t, textnode_t> section_t;
typedef std::map<textnode_t, section_t> sections_t;
private:
Cmp _cmp;
};
///////////////////////////////////////
// template implementation
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/support_istream_iterator.hpp>
#include <boost/spirit/include/support_line_pos_iterator.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/adapted/std_pair.hpp>
#include <fstream>
namespace qi = boost::spirit::qi;
namespace phx= boost::phoenix;
namespace inireader
{
struct printer
{
printer(std::ostream& os) : _os(os) {}
std::ostream& _os;
typedef boost::spirit::utf8_string string;
void element(string const& tag, string const& value, int depth) const
{
for (int i = 0; i < (depth*4); ++i) // indent to depth
_os << ' ';
_os << "tag: " << tag;
if (value != "")
_os << ", value: " << value;
_os << std::endl;
}
};
void print_info(std::ostream& os, boost::spirit::info const& what)
{
using boost::spirit::basic_info_walker;
printer pr(os);
basic_info_walker<printer> walker(pr, what.tag, 0);
boost::apply_visitor(walker, what.value);
}
template <typename It, typename Skipper, typename Ini>
struct Grammar : qi::grammar<It, typename Ini::sections_t(), Skipper>
{
typedef typename Ini::string_t string_t;
typedef typename Ini::textnode_t textnode_t;
struct textbuilder
{
template <typename> struct result { typedef textnode_t type; };
textbuilder(It begin) : _begin(begin) { }
textnode_t operator()(const boost::iterator_range<It>& iters) const
{
#if !POSITIONINFO
return textnode_t(std::begin(iters), std::end(iters));
#else
using boost::spirit::get_line;
using boost::spirit::get_line_start;
using boost::spirit::get_column;
textnode_t element;
element.text = string_t (std::begin(iters) , std::end(iters));
element.sline = get_line (std::begin(iters));
element.eline = get_line (std::end(iters));
It sol = get_line_start (_begin , std::begin(iters));
element.scol = get_column (sol , std::begin(iters));
element.ecol = get_column (sol , std::end(iters));
return element;
#endif
}
private:
const It _begin;
} makenode;
Grammar(It begin) : Grammar::base_type(inifile), makenode(begin)
{
using namespace qi;
txt_ch = (lit('\\') > char_) | (char_ - (eol | '#' | "/*"));
key = raw [ lexeme [ +(txt_ch - char_("=")) ] ] [ _val = phx::bind(makenode, _1) ];
value = raw [ lexeme [ +txt_ch ] ] [ _val = phx::bind(makenode, _1) ];
pair %= key > '=' > value;
heading = ('[' > raw [ +~char_(']') ] > ']') [ _val = phx::bind(makenode, _1) ];
section %= heading >> +eol >> -((pair-heading) % +eol);
inifile %= -(section % +eol) >> *eol > eoi;
comment =
('#' >> *(char_ - eol))
| ("/*" > *(char_ - "*/") > "*/");
//BOOST_SPIRIT_DEBUG_NODE(comment);
//BOOST_SPIRIT_DEBUG_NODE(txt_ch);
BOOST_SPIRIT_DEBUG_NODE(heading);
BOOST_SPIRIT_DEBUG_NODE(section);
BOOST_SPIRIT_DEBUG_NODE(key);
BOOST_SPIRIT_DEBUG_NODE(value);
BOOST_SPIRIT_DEBUG_NODE(pair);
BOOST_SPIRIT_DEBUG_NODE(inifile);
}
typedef typename Ini::keyvalue_t keyvalue_t;
typedef typename Ini::section_t section_t;
typedef typename Ini::sections_t sections_t;
typedef typename string_t::value_type Char;
qi::rule<It> comment;
qi::rule<It, Char()> txt_ch;
qi::rule<It, textnode_t(), Skipper> key, value, heading;
qi::rule<It, keyvalue_t(), Skipper> pair;
qi::rule<It, std::pair<textnode_t, section_t>(), Skipper> section;
qi::rule<It, sections_t(), Skipper> inifile;
};
template <typename It, typename Builder>
typename Builder::template result<void>::type
fragment(const It& first, const It& last, const Builder& builder)
{
size_t len = std::distance(first, last);
It frag_end = first;
std::advance(frag_end, std::min(10ul, len));
return builder(boost::iterator_range<It>(first, frag_end));
}
}
template <typename S, typename Cmp>
void IniFile<S, Cmp>::open(const std::string& filename)
{
using namespace qi;
std::ifstream ifs(filename.c_str());
ifs.unsetf(std::ios::skipws);
#if POSITIONINFO
typedef std::string::const_iterator RawIt;
typedef boost::spirit::line_pos_iterator<RawIt> It;
typedef rule<It> Skipper;
std::string buffer(std::istreambuf_iterator<char>(ifs), (std::istreambuf_iterator<char>()));
It f(buffer.begin()), l(buffer.end());
#else
typedef boost::spirit::istream_iterator It;
typedef rule<It> Skipper;
It f(ifs), l;
#endif
inireader::Grammar<It, Skipper, IniFile<S, Cmp> > grammar(f);
Skipper skip = char_(" \t") | grammar.comment;
try
{
sections_t data;
bool ok = phrase_parse(f, l, grammar, skip, data);
if (ok)
{
std::cout << "Parse success!" << std::endl;
///////// C++11 specific features for quick display //////////
for (auto& section : data)
{
std::cout << "[" << section.first << "]" << std::endl;
for (auto& pair : section.second)
std::cout << pair.first << " = " << pair.second << std::endl;
///////// End C++11 specific /////////////////////////////////
}
} else
{
std::cerr << "Parse failed" << std::endl;
}
} catch (const qi::expectation_failure<It>& e)
{
std::cerr << "Exception: " << e.what() <<
" " << inireader::fragment(e.first, e.last, grammar.makenode) << "... ";
inireader::print_info(std::cerr, e.what_);
}
if (f!=l)
{
std::cerr << "Stopped at: '" << inireader::fragment(f, l, grammar.makenode) << "'" << std::endl;
}
}
Demo Input
[Cat1]
name1=100 #skipped
name2=200 \#not \\skipped
name3= dhfj dhjgfd/* skipped
*/
[Cat_2]
UsagePage=9
Usage=19
Offset=0x1204
/*
[Cat_2_bak]
UsagePage=9
Usage=19
Offset=0x1204
*/
[Cat_3]
UsagePage=12
Usage=39
#Usage4=39
Offset=0x12304
Demo Output (POSITIONINFO == 0)
Parse success!
[Cat1]
name1 = 100
name2 = 200 \#not \\skipped
name3 = dhfj dhjgfd
[Cat_2]
Offset = 0x1204
Usage = 19
UsagePage = 9
[Cat_3]
Offset = 0x12304
Usage = 39
UsagePage = 12
Demo Output (POSITIONINFO == 1)
Parse success!
[[L:1,C2 .. L1,C6:Cat1]]
[L:2,C2 .. L2,C7:name1] = [L:2,C8 .. L2,C12:100 ]
[L:6,C2 .. L6,C7:name2] = [L:6,C8 .. L6,C27:200 \#not \\skipped]
[L:7,C2 .. L7,C7:name3] = [L:7,C11 .. L7,C22:dhfj dhjgfd]
[[L:13,C3 .. L13,C8:Cat_2]]
[L:16,C2 .. L16,C8:Offset] = [L:16,C9 .. L16,C15:0x1204]
[L:15,C2 .. L15,C7:Usage] = [L:15,C8 .. L15,C10:19]
[L:14,C2 .. L14,C11:UsagePage] = [L:14,C12 .. L14,C13:9]
[[L:25,C3 .. L25,C8:Cat_3]]
[L:29,C2 .. L29,C8:Offset] = [L:29,C9 .. L29,C16:0x12304]
[L:27,C2 .. L27,C7:Usage] = [L:27,C8 .. L27,C10:39]
[L:26,C2 .. L26,C11:UsagePage] = [L:26,C12 .. L26,C14:12]