How does one prevent X3 symbol parsers from matching partial tokens? In the example below, I want to match "foo", but not "foobar". I tried throwing the symbol parser in a lexeme directive as one would for an identifier, but then nothing matches.
Thanks for any insights!
#include <string>
#include <iostream>
#include <iomanip>
#include <boost/spirit/home/x3.hpp>
int main() {
boost::spirit::x3::symbols<int> sym;
sym.add("foo", 1);
for (std::string const input : {
"foo",
"foobar",
"barfoo"
})
{
using namespace boost::spirit::x3;
std::cout << "\nParsing " << std::left << std::setw(20) << ("'" + input + "':");
int v;
auto iter = input.begin();
auto end = input.end();
bool ok;
{
// what's right rule??
// this matches nothing
// auto r = lexeme[sym - alnum];
// this matchs prefix strings
auto r = sym;
ok = phrase_parse(iter, end, r, space, v);
}
if (ok) {
std::cout << v << " Remaining: " << std::string(iter, end);
} else {
std::cout << "Parse failed";
}
}
}
Qi used to have distinct in their repository.
X3 doesn't.
The thing that solves it for the case you showed is a simple lookahead assertion:
auto r = lexeme [ sym >> !alnum ];
You could make a distinct helper easily too, e.g.:
auto kw = [](auto p) { return lexeme [ p >> !(alnum | '_') ]; };
Now you can just parse kw(sym).
Live On Coliru
#include <iostream>
#include <boost/spirit/home/x3.hpp>
int main() {
boost::spirit::x3::symbols<int> sym;
sym.add("foo", 1);
for (std::string const input : { "foo", "foobar", "barfoo" }) {
std::cout << "\nParsing '" << input << "': ";
auto iter = input.begin();
auto const end = input.end();
int v = -1;
bool ok;
{
using namespace boost::spirit::x3;
auto kw = [](auto p) { return lexeme [ p >> !(alnum | '_') ]; };
ok = phrase_parse(iter, end, kw(sym), space, v);
}
if (ok) {
std::cout << v << " Remaining: '" << std::string(iter, end) << "'\n";
} else {
std::cout << "Parse failed";
}
}
}
Prints
Parsing 'foo': 1 Remaining: ''
Parsing 'foobar': Parse failed
Parsing 'barfoo': Parse failed
Related
I'm trying to parse this type of string
1.2e3ex
1.2e3 ex
And have set up
x3::float_ >> "ex"
Unfortunately, this fails to parse
1ex
Full example code:
#include <iostream>
#include <boost/spirit/home/x3.hpp>
namespace x3 = boost::spirit::x3;
const auto parser = x3::float_ >> "em";
int main()
{
std::string input = "1em";
auto first = input.begin();
auto last = input.end();
float value{};
bool result = x3::phrase_parse(first, last, parser, x3::blank, value);
if(result)
{
if(first == last)
std::cout << "parse succesful: " << value << '\n';
else
std::cout << "incomplete parse: " << value << '\n';
}
else
std::cout << "parse unsuccesful\n";
}
Available live on Coliru as well.
It seems I'd need to jump through some hoops,
struct non_scientific_float_policy : x3::real_policies<float>
{
template <typename Iterator>
static bool parse_exp(Iterator& first, Iterator const& last)
{
return false;
}
};
const auto non_scientific_float = x3::real_parser<float, non_scientific_float_policy>{};
and provide an alternative:
const auto parser = non_scientific_float >> "em" | x3::float_ >> "em";
Is there no other way?
You can solve the issue by tuning the real policy parse_exp in such way that exponent detection must expect not only [eE] character but [eE][-+]?[0-9].
#include <iostream>
#include <boost/spirit/home/x3.hpp>
namespace x3 = boost::spirit::x3;
template <typename T>
struct alt_real_policies : x3::real_policies<T>
{
template <typename Iterator>
static bool parse_exp(Iterator& first, Iterator const& last)
{
Iterator save = first;
if (x3::real_policies<T>::parse_exp(first, last)) {
Iterator iter = first;
if (x3::extract_int<x3::unused_type, 10, 1, 1>::call(iter, last, x3::unused))
return true;
}
first = save;
return false;
}
};
const x3::real_parser<float, alt_real_policies<float>> altfloat;
const auto parser = altfloat >> "em";
int main()
{
std::string input = "1em";
auto first = input.begin();
auto last = input.end();
float value{};
bool result = x3::phrase_parse(first, last, parser, x3::blank, value);
if (result)
{
if (first == last)
std::cout << "parse succesful: " << value << '\n';
else
std::cout << "incomplete parse: " << value << '\n';
}
else
std::cout << "parse unsuccesful\n";
}
http://coliru.stacked-crooked.com/a/f60f334c960cb602
You can use an alternative policy for non-greedy parsing of the exponent. The very simplest I can think of is:
Live On Coliru
#include <boost/spirit/home/x3.hpp>
#include <iostream>
namespace x3 = boost::spirit::x3;
template <typename T>
struct no_exponent : x3::real_policies<T> {
template <typename It>
static bool parse_exp(It, It) { return false; }
};
x3::real_parser<double, no_exponent<double> > noexp_;
const auto parser = (x3::float_ | noexp_) >> "em";
int main() {
std::string input = "-1.67em";
auto first = input.begin();
auto last = input.end();
float value{};
bool result = x3::phrase_parse(first, last, parser, x3::blank, value);
if (result) {
if (first == last)
std::cout << "parse succesful: " << value << '\n';
else
std::cout << "incomplete parse: " << value << '\n';
} else
{
std::cout << "parse unsuccesful\n";
}
}
Printing:
parse succesful: -1.67
I have a string (nested strings even) that are formatted like a C++ braced initializer list. I want to tokenize them one level at a time into a vector of strings.
So when I input "{one, two, three}" to the function should output a three element vector
"one",
"two",
"three"
To complicate this, it needs to support quoted tokens and preserve nested lists:
Input String: "{one, {2, \"three four\"}}, \"five, six\", {\"seven, eight\"}}"
Output is a four element vector:
"one",
"{2, \"three four\"}",
"five, six",
"{\"seven, eight\"}"
I've looked at a few other SO posts:
Using Boost Tokenizer escaped_list_separator with different parameters
Boost split not traversing inside of parenthesis or braces
And used those to start a solution, but this seems slightly too complicated for the tokenizer (because of the braces):
#include <boost/algorithm/string.hpp>
#include <boost/tokenizer.hpp>
std::vector<std::string> TokenizeBracedList(const std::string& x)
{
std::vector<std::string> tokens;
std::string separator1("");
std::string separator2(",\n\t\r");
std::string separator3("\"\'");
boost::escaped_list_separator<char> elements(separator1, separator2, separator3);
boost::tokenizer<boost::escaped_list_separator<char>> tokenizer(x, elements);
for(auto i = std::begin(tokenizer); i != std::end(tokenizer); ++i)
{
auto token = *i;
boost::algorithm::trim(token);
tokens.push_back(token);
}
return tokens;
}
With this, even in the trivial case, it doesn't strip the opening and closing braces.
Boost and C++17 are fair game for a solution.
Simple (Flat) Take
Defining a flat data structure like:
using token = std::string;
using tokens = std::vector<token>;
We can define an X3 parser like:
namespace Parser {
using namespace boost::spirit::x3;
rule<struct list_, token> item;
auto quoted = lexeme [ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ];
auto bare = lexeme [ +(graph-','-'}') ];
auto list = '{' >> (item % ',') >> '}';
auto sublist = raw [ list ];
auto item_def = sublist | quoted | bare;
BOOST_SPIRIT_DEFINE(item)
}
Live On Wandbox
#include <boost/spirit/home/x3.hpp>
#include <iostream>
#include <iomanip>
using token = std::string;
using tokens = std::vector<token>;
namespace x3 = boost::spirit::x3;
namespace Parser {
using namespace boost::spirit::x3;
rule<struct list_, token> item;
auto quoted = lexeme [ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ];
auto bare = lexeme [ +(graph-','-'}') ];
auto list = '{' >> (item % ',') >> '}';
auto sublist = raw [ list ];
auto item_def = sublist | quoted | bare;
BOOST_SPIRIT_DEFINE(item)
}
int main() {
for (std::string const input : {
R"({one, "five, six"})",
R"({one, {2, "three four"}, "five, six", {"seven, eight"}})",
})
{
auto f = input.begin(), l = input.end();
std::vector<std::string> parsed;
bool ok = phrase_parse(f, l, Parser::list, x3::space, parsed);
if (ok) {
std::cout << "Parsed: " << parsed.size() << " elements\n";
for (auto& el : parsed) {
std::cout << " - " << std::quoted(el, '\'') << "\n";
}
} else {
std::cout << "Parse failed\n";
}
if (f != l)
std::cout << "Remaining unparsed: " << std::quoted(std::string{f, l}) << "\n";
}
}
Prints
Parsed: 2 elements
- 'one'
- 'five, six'
Parsed: 4 elements
- 'one'
- '{2, "three four"}'
- 'five, six'
- '{"seven, eight"}'
Nested Data
Changing the datastructure to be a bit more specific/realistic:
namespace ast {
using value = boost::make_recursive_variant<
double,
std::string,
std::vector<boost::recursive_variant_>
>::type;
using list = std::vector<value>;
}
Now we can change the grammar, as we no longer need to treat sublist as if it is a string:
namespace Parser {
using namespace boost::spirit::x3;
rule<struct item_, ast::value> item;
auto quoted = lexeme [ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ];
auto bare = lexeme [ +(graph-','-'}') ];
auto list = x3::rule<struct list_, ast::list> {"list" }
= '{' >> (item % ',') >> '}';
auto item_def = list | double_ | quoted | bare;
BOOST_SPIRIT_DEFINE(item)
}
Everything "still works": Live On Wandbox
#include <boost/spirit/home/x3.hpp>
#include <iostream>
#include <iomanip>
namespace ast {
using value = boost::make_recursive_variant<
double,
std::string,
std::vector<boost::recursive_variant_>
>::type;
using list = std::vector<value>;
}
namespace x3 = boost::spirit::x3;
namespace Parser {
using namespace boost::spirit::x3;
rule<struct item_, ast::value> item;
auto quoted = lexeme [ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ];
auto bare = lexeme [ +(graph-','-'}') ];
auto list = x3::rule<struct list_, ast::list> {"list" }
= '{' >> (item % ',') >> '}';
auto item_def = list | double_ | quoted | bare;
BOOST_SPIRIT_DEFINE(item)
}
struct pretty_printer {
using result_type = void;
std::ostream& _os;
int _indent;
pretty_printer(std::ostream& os, int indent = 0) : _os(os), _indent(indent) {}
void operator()(ast::value const& v) { boost::apply_visitor(*this, v); }
void operator()(double v) { _os << v; }
void operator()(std::string s) { _os << std::quoted(s); }
void operator()(ast::list const& l) {
_os << "{\n";
_indent += 2;
for (auto& item : l) {
_os << std::setw(_indent) << "";
operator()(item);
_os << ",\n";
}
_indent -= 2;
_os << std::setw(_indent) << "" << "}";
}
};
int main() {
pretty_printer print{std::cout};
for (std::string const input : {
R"({one, "five, six"})",
R"({one, {2, "three four"}, "five, six", {"seven, eight"}})",
})
{
auto f = input.begin(), l = input.end();
ast::value parsed;
bool ok = phrase_parse(f, l, Parser::item, x3::space, parsed);
if (ok) {
std::cout << "Parsed: ";
print(parsed);
std::cout << "\n";
} else {
std::cout << "Parse failed\n";
}
if (f != l)
std::cout << "Remaining unparsed: " << std::quoted(std::string{f, l}) << "\n";
}
}
Prints:
Parsed: {
"one",
"five, six",
}
Parsed: {
"one",
{
2,
"three four",
},
"five, six",
{
"seven, eight",
},
}
I have the following code:
int main()
{
string s = "server ('m1.labs.teradata.com') username ('use\\')r_*5') password('u\" er 5') dbname ('default')";
regex re("(\'[!-~]+\')");
sregex_token_iterator i(s.begin(), s.end(), re, 1);
sregex_token_iterator j;
unsigned count = 0;
while(i != j)
{
cout << "the token is "<<*i<< endl;
count++;
}
cout << "There were " << count << " tokens found." << endl;
return 0;
}
Using the above regex, I wanted to extract the string between the paranthesis and single quote:, The out put should look like :
the token is 'm1.labs.teradata.com'
the token is 'use\')r_*5'
the token is 'u" er 5'
the token is 'default'
There were 4 tokens found.
Basically, the regex supposed to extract everything between " (' " and " ') ". It can be anything space , special character, quote or a closing parathesis.
I has earlier used the following regex:
boost::regex re_arg_values("(\'[!-~]+\')");
But is was not accepting space. Please can someone help me out with this. Thanks in advance.
Here's a sample of using Spirit X3 to create grammar to actually parse this. I'd like to parse into a map of (key->value) pairs, which makes a lot more sense than just blindly assuming the names are always the same:
using Config = std::map<std::string, std::string>;
using Entry = std::pair<std::string, std::string>;
Now, we setup some grammar rules using X3:
namespace parser {
using namespace boost::spirit::x3;
auto value = quoted("'") | quoted('"');
auto key = lexeme[+alpha];
auto pair = key >> '(' >> value >> ')';
auto config = skip(space) [ *as<Entry>(pair) ];
}
The helpers as<> and quoted are simple lambdas:
template <typename T> auto as = [](auto p) { return rule<struct _, T> {} = p; };
auto quoted = [](auto q) { return lexeme[q >> *('\\' >> char_ | char_ - q) >> q]; };
Now we can parse the string into a map directly:
Config parse_config(std::string const& cfg) {
Config parsed;
auto f = cfg.begin(), l = cfg.end();
if (!parse(f, l, parser::config, parsed))
throw std::invalid_argument("Parse failed at " + std::string(f,l));
return parsed;
}
And the demo program
int main() {
Config cfg = parse_config("server ('m1.labs.teradata.com') username ('use\\')r_*5') password('u\" er 5') dbname ('default')");
for (auto& setting : cfg)
std::cout << "Key " << setting.first << " has value " << setting.second << "\n";
}
Prints
Key dbname has value default
Key password has value u" er 5
Key server has value m1.labs.teradata.com
Key username has value use')r_*5
LIVE DEMO
Live On Coliru
#include <iostream>
#include <boost/spirit/home/x3.hpp>
#include <boost/fusion/adapted/std_pair.hpp>
#include <map>
using Config = std::map<std::string, std::string>;
using Entry = std::pair<std::string, std::string>;
namespace parser {
using namespace boost::spirit::x3;
template <typename T> auto as = [](auto p) { return rule<struct _, T> {} = p; };
auto quoted = [](auto q) { return lexeme[q >> *(('\\' >> char_) | (char_ - q)) >> q]; };
auto value = quoted("'") | quoted('"');
auto key = lexeme[+alpha];
auto pair = key >> '(' >> value >> ')';
auto config = skip(space) [ *as<Entry>(pair) ];
}
Config parse_config(std::string const& cfg) {
Config parsed;
auto f = cfg.begin(), l = cfg.end();
if (!parse(f, l, parser::config, parsed))
throw std::invalid_argument("Parse failed at " + std::string(f,l));
return parsed;
}
int main() {
Config cfg = parse_config("server ('m1.labs.teradata.com') username ('use\\')r_*5') password('u\" er 5') dbname ('default')");
for (auto& setting : cfg)
std::cout << "Key " << setting.first << " has value " << setting.second << "\n";
}
Bonus
If you want to learn how to extract the raw input: just try
auto source = skip(space) [ *raw [ pair ] ];
as in this:
using RawSettings = std::vector<std::string>;
RawSettings parse_raw_config(std::string const& cfg) {
RawSettings parsed;
auto f = cfg.begin(), l = cfg.end();
if (!parse(f, l, parser::source, parsed))
throw std::invalid_argument("Parse failed at " + std::string(f,l));
return parsed;
}
int main() {
for (auto& setting : parse_raw_config(text))
std::cout << "Raw: " << setting << "\n";
}
Which prints: Live On Coliru
Raw: server ('m1.labs.teradata.com')
Raw: username ('use\')r_*5')
Raw: password('u" er 5')
Raw: dbname ('default')
Fixing a few syntax and style issues:
you need to escape \ in C strings
you had a " in s, making a syntax error
#include <boost/regex.hpp>
#include <boost/range/iterator_range.hpp>
#include <iostream>
int main() {
std::string s = "server ('m1.labs.teradata.com') username ('use\')r_*5') password('u' er 5') dbname ('default')";
boost::regex re(R"(('([^'\\]*(?:\\[\s\S][^'\\]*)*)'))");
size_t count = 0;
for (auto tok : boost::make_iterator_range(boost::sregex_token_iterator(s.begin(), s.end(), re, 1), {})) {
std::cout << "Token " << ++count << " is " << tok << "\n";
}
}
Prints
Token 1 is 'm1.labs.teradata.com'
Token 2 is 'use'
Token 3 is ') password('
Token 4 is ' er 5'
Token 5 is 'default'
My problem is the following. I have an ast node which is defined as like the following:
struct foo_node{
std::vector<std::string> value;
}
and I have a parser like this for parsing into the struct, which works fine:
typedef x3::rule<struct foo_node_class, foo_node> foo_node_type;
const foo_node_type foo_node = "foo_node";
auto const foo_node_def = "(" >> +x3::string("bar") >> ")";
Now I want to achieve that the parser also parses "bar", without brackets, but only if its a single bar. I tried to do it like this:
auto const foo_node_def = x3::string("bar")
| "(" > +x3::string("bar") > ")";
but this gives me a compile time error, since x3::string("bar") returns a string and not a std::vector<std::string>.
My question is, how can I achieve, that the x3::string("bar") parser (and every other parser which returns a string) parses into a vector?
The way to parse a single element and expose it as a single-element container attribute is x3::repeat(1) [ p ]:
Live On Coliru
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/home/x3.hpp>
#include <iostream>
namespace x3 = boost::spirit::x3;
struct foo_node {
std::vector<std::string> value;
};
BOOST_FUSION_ADAPT_STRUCT(foo_node, value)
namespace rules {
auto const bar
= x3::string("bar");
auto const foo_node
= '(' >> +bar >> ')'
| x3::repeat(1) [ +bar ]
;
}
int main() {
for (std::string const input : {
"bar",
"(bar)",
"(barbar)",
})
{
auto f = input.begin(), l = input.end();
foo_node data;
bool ok = x3::parse(f, l, rules::foo_node, data);
if (ok) {
std::cout << "Parse success: " << data.value.size() << " elements\n";
} else {
std::cout << "Parse failed\n";
}
if (f != l)
std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
}
}
Prints
Parse success: 1 elements
Parse success: 1 elements
Parse success: 2 elements
I am currentl adding expectation points to my grammar in X3.
Now I came accross an rule, which looks like this.
auto const id_string = +x3::char("A-Za-z0-9_);
auto const nested_identifier_def =
x3::lexeme[
*(id_string >> "::")
>> *(id_string >> ".")
>> id_string
];
I am wondering how I can add conditional expectation points to this rule.
Like "if there is a "::" then there musst follow an id_string" or "when there is a . then there musst follow an id_string"
and so on.
How can I achieve such a behaviour for such a rule?
I'd write it exactly the way you intend it:
auto const identifier
= lexeme [+char_("A-Za-z0-9_")];
auto const qualified_id
= identifier >> *("::" > identifier);
auto const simple_expression // only member expressions supported now
= qualified_id >> *('.' > identifier);
With a corresponding AST:
namespace AST {
using identifier = std::string;
struct qualified_id : std::vector<identifier> { using std::vector<identifier>::vector; };
struct simple_expression {
qualified_id lhs;
std::vector<identifier> rhs;
};
}
LIVE DEMO
Live On Coliru
#include <iostream>
#include <string>
#include <vector>
namespace AST {
using identifier = std::string;
struct qualified_id : std::vector<identifier> { using std::vector<identifier>::vector; };
struct simple_expression {
qualified_id lhs;
std::vector<identifier> rhs;
};
}
#include <boost/fusion/adapted.hpp>
BOOST_FUSION_ADAPT_STRUCT(AST::simple_expression, lhs, rhs)
#include <boost/spirit/home/x3.hpp>
namespace Parser {
using namespace boost::spirit::x3;
auto const identifier
= rule<struct identifier_, AST::identifier> {}
= lexeme [+char_("A-Za-z0-9_")];
auto const qualified_id
= rule<struct qualified_id_, AST::qualified_id> {}
= identifier >> *("::" > identifier);
auto const simple_expression // only member expressions supported now
= rule<struct simple_expression_, AST::simple_expression> {}
= qualified_id >> *('.' > identifier);
}
int main() {
using It = std::string::const_iterator;
for (std::string const input : { "foo", "foo::bar", "foo.member", "foo::bar.member.subobject" }) {
It f = input.begin(), l = input.end();
AST::simple_expression data;
bool ok = phrase_parse(f, l, Parser::simple_expression, boost::spirit::x3::space, data);
if (ok) {
std::cout << "Parse success: ";
for (auto& el : data.lhs) std::cout << "::" << el;
for (auto& el : data.rhs) std::cout << "." << el;
std::cout << "\n";
}
else {
std::cout << "Parse failure ('" << input << "')\n";
}
if (f != l)
std::cout << "Remaining unparsed input: '" << std::string(f, l) << "'\n";
}
}
Prints
Parse success: ::foo
Parse success: ::foo::bar
Parse success: ::foo.member
Parse success: ::foo::bar.member.subobject