Boost.Spirit X3 compile time explodes with recursive rule

Boost.Spirit X3 compile time explodes with recursive rule - c++

The following program takes 10s to compile. When I change the parenProcess rule below to '(' >> process >> ')' the compiler spends CPU but does not seem to finish. (I tried making a smaller reproducible program -- by removing rules between the process and parenProcess, but then the compile time no longer exploded).
How do I fix the compile (time) when embedding process instead?
(Minor other question: is there a nicer way to make rule 'x' and 'xActual'?)
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <iostream>
#include <string>
#include <vector>
namespace wccs_parser {
namespace x3 = boost::spirit::x3;
namespace ascii = x3::ascii;
using x3::long_;
using x3::ulong_;
using x3::lexeme;
//--- Ast structures
/// A channel reference: a label, optionally marked as complemented by a
/// trailing '!' in the input.
struct AstChannel {
    /// Label text of the channel.
    std::string label;
    /// True when the label was followed by '!'. Initialised so that a
    /// default-constructed channel never carries an indeterminate flag.
    bool complement = false;
};
/// An action: a channel paired with an unsigned weight.
struct AstAction {
    /// Channel the action is performed on.
    AstChannel channel;
    /// Weight of the action; zero until a parser (or caller) fills it in, so a
    /// default-constructed action never holds an indeterminate value.
    uint32_t weight = 0;
};
// One renaming pair, written `from => to` in the grammar.
struct AstRenaming {
// Label on the left-hand side of "=>".
std::string from;
// Label on the right-hand side of "=>".
std::string to;
};
// Forward declarations: every process node type is named here so that the
// variant below can list all of them before any of them is defined.
struct AstNullProcess;
struct AstActionPrefixProcess;
struct AstChoiceProcess;
struct AstCompositionProcess;
struct AstRestrictionProcess;
struct AstRenamingProcess;
struct AstConstantProcess;
// Sum type over all process nodes. Each alternative is wrapped in
// x3::forward_ast because the node types are still incomplete at this point
// (several of them contain an AstAnyProcess themselves).
using AstAnyProcess = x3::variant<
x3::forward_ast<AstNullProcess>,
x3::forward_ast<AstActionPrefixProcess>,
x3::forward_ast<AstChoiceProcess>,
x3::forward_ast<AstCompositionProcess>,
x3::forward_ast<AstRestrictionProcess>,
x3::forward_ast<AstRenamingProcess>,
x3::forward_ast<AstConstantProcess>
>;
// The null process; carries no data.
struct AstNullProcess {};
// An action followed by the process that continues after it.
struct AstActionPrefixProcess {
AstAction action;
AstAnyProcess subProcess;
};
// A list of alternative sub-processes (separated by '+' in the grammar).
struct AstChoiceProcess {
std::vector<AstAnyProcess> subProcesses;
};
// A list of composed sub-processes (separated by '|' in the grammar).
struct AstCompositionProcess {
std::vector<AstAnyProcess> subProcesses;
};
// A sub-process together with the list of labels given inside the braces.
struct AstRestrictionProcess {
AstAnyProcess subProcess;
std::vector<std::string> labels;
};
// A sub-process together with the list of renaming pairs given inside the brackets.
struct AstRenamingProcess {
AstAnyProcess subProcess;
std::vector<AstRenaming> renamings;
};
// A reference to a process by its constant name.
struct AstConstantProcess {
std::string processName;
};
} // End namespace
// Fusion adaptations: expose each AST struct as a sequence of its members, in
// the listed order, so that sequence parsers can fill them member by member.
// These are invoked outside the wccs_parser namespace, with fully qualified names.
BOOST_FUSION_ADAPT_STRUCT(wccs_parser::AstChannel, label, complement)
BOOST_FUSION_ADAPT_STRUCT(wccs_parser::AstAction, channel, weight)
BOOST_FUSION_ADAPT_STRUCT(wccs_parser::AstRenaming, from, to)
BOOST_FUSION_ADAPT_STRUCT(wccs_parser::AstActionPrefixProcess, action, subProcess)
BOOST_FUSION_ADAPT_STRUCT(wccs_parser::AstChoiceProcess, subProcesses)
BOOST_FUSION_ADAPT_STRUCT(wccs_parser::AstCompositionProcess, subProcesses)
BOOST_FUSION_ADAPT_STRUCT(wccs_parser::AstRestrictionProcess, subProcess, labels)
BOOST_FUSION_ADAPT_STRUCT(wccs_parser::AstRenamingProcess, subProcess, renamings)
BOOST_FUSION_ADAPT_STRUCT(wccs_parser::AstConstantProcess, processName)
namespace wccs_parser {
//--- Rules
// Lexeme for constant names: one upper-case letter followed by any number of alphanumerics.
auto const constantName = x3::rule<struct constantRule, std::string> {"constantName"} =
x3::lexeme[ascii::upper >> *(ascii::alnum)];
// Lexeme for labels: one lower-case letter followed by any number of alphanumerics.
auto const label = x3::rule<struct labelRule, std::string> {"label"} =
x3::lexeme[ascii::lower >> *(ascii::alnum)];
// A label with an optional trailing '!'; x3::matches supplies the bool for AstChannel::complement.
auto const channel = x3::rule<struct channelRule, AstChannel> {"channel"} =
label >> x3::matches['!'];
// An action literal: '<' channel ',' unsigned integer '>'.
auto const action = x3::rule<struct actionRule, AstAction> {"action"} =
'<' >> channel >> ',' >> ulong_ >> '>';
// A renaming pair: label "=>" label, joined with the expectation operator.
// NOTE(review): unlike its neighbours this rule object is not declared const.
auto renamingPair = x3::rule<struct renamingPairRule, AstRenaming> {"renamingPair"} =
label > "=>" > label;
// Top-level recursive rule; its definition (process_def) is supplied at the end of this namespace.
x3::rule<struct processRule, AstAnyProcess> process{"process"};
// The literal '0'; x3::attr supplies the AstNullProcess attribute value.
auto const nullProcess = x3::rule<struct nullProcessRule, AstNullProcess> {"nullProcess"} = '0' >> x3::attr(AstNullProcess());
// NOTE(review): the tag `constantRule` is also used by constantName above; the
// two rule types still differ because their attribute types differ.
auto const constant = x3::rule<struct constantRule, AstConstantProcess> {"constant"} = constantName;
/// HERE:
// This is the rule the question is about: it embeds nullProcess, not process.
auto const parenProcess = '(' > nullProcess > ')';
// Tightest-binding level: parenthesised process, null process or named constant.
auto const primitive = x3::rule<struct primitiveRule, AstAnyProcess> {"primitive"} =
parenProcess
| nullProcess
| constant;
// A primitive followed by a backslash and a brace-enclosed, comma-separated label list.
auto const restrictionActual = x3::rule<struct restrictionActual, AstRestrictionProcess> {"restrictionActual"} =
primitive >> '\\' >> '{' >> label % ',' >> '}';
// Either a bare primitive (one not followed by a backslash) or a full restriction.
auto const restriction = x3::rule<struct restrictionRule, AstAnyProcess> {"restriction"} =
primitive >> !x3::lit('\\')
| restrictionActual;
// A restriction followed by a bracket-enclosed, comma-separated list of renaming pairs.
auto const renamingActual = x3::rule<struct renamingActualRule, AstRenamingProcess> {"renamingActual"} =
restriction >> '[' >> renamingPair % ',' >> ']';
// Either a bare restriction (one not followed by '[') or a full renaming.
auto const renaming = x3::rule<struct renamingRule, AstAnyProcess> {"renaming"} =
restriction >> !x3::lit('[')
| renamingActual;
// Recursive rule, declared before use so that actionPrefixActual can refer to it.
x3::rule<struct actionPrefixingRule, AstAnyProcess> actionPrefix{"actionPrefix"};
// action '.' actionPrefix, joined with expectation operators.
auto const actionPrefixActual = x3::rule<struct actionPrefixActualRule, AstActionPrefixProcess> {"actionPrefixActual"} =
action > ('.' > actionPrefix);
// Definition picked up by BOOST_SPIRIT_DEFINE(actionPrefix) through the `_def` suffix.
auto const actionPrefix_def =
actionPrefixActual
| renaming;
BOOST_SPIRIT_DEFINE(actionPrefix)
// One or more actionPrefix items separated by '|'.
// NOTE(review): the tag `choiceActualrule` is shared with choiceActual below.
auto const compositionActual = x3::rule<struct choiceActualrule, AstCompositionProcess> {"compositionActual"} =
actionPrefix % '|';
// Either a single actionPrefix (one not followed by '|') or a '|'-separated list.
auto const composition = x3::rule<struct compositionRule, AstAnyProcess> {"composition"} =
actionPrefix >> !x3::lit('|')
| compositionActual;
// One or more compositions separated by '+'.
auto const choiceActual = x3::rule<struct choiceActualrule, AstChoiceProcess> {"choiceActual"} =
composition % '+';
// Either a single composition (one not followed by '+') or a '+'-separated list.
auto const choice = x3::rule<struct choiceRule, AstAnyProcess> {"choice"} =
composition >> !x3::lit('+')
| choiceActual;
// `process` is defined as the loosest-binding level, choice.
auto const process_def = choice;
BOOST_SPIRIT_DEFINE(process)
// Entry point: parses a process with ASCII whitespace as the skipper.
auto const entry = x3::skip(ascii::space) [process];
} //End namespace
int main() {
std::string str("0 + (0)");
wccs_parser::AstAnyProcess root;
auto iter = str.begin();
auto end = str.end();
bool r = parse(iter, end, wccs_parser::entry, root);
if (r) {
std::cout << str << std::endl << std::endl << " Parses OK: " << std::endl;
}
else {
std::cout << "Parsing failed\n";
}
if (iter != end) std::cout << "Partial match" << std::endl;
return 0;
}

This is a known problem. CppEvans (?) on the mailing list claims to have a workaround on a branch, but that branch is far behind and the changes very intrusive, so I can't vet it/vouch for it.
So, the right recourse would be to post on the mailing list in a bid to get the main developer(s) involved, and raise awareness of this stopping issue.
Regardless, without changing the behaviour of your code, you can use a shorthand:
// Variable template: `rule<T>` is a callable that builds an x3::rule whose
// attribute type is T. The rule name defaults to typeid(T).name().
template <typename T> auto rule = [](const char* name = typeid(T).name()) {
// Local empty struct used as the rule's tag type.
struct _{};
return x3::rule<_, T> {name};
};
// `as<T>(p)` wraps parser `p` in a default-named rule<T>, coercing its attribute to T.
template <typename T> auto as = [](auto p) { return rule<T>() = p; };
This will make it much more convenient to write the repetitive Ast coercions:
// Same grammar as before, rewritten with the as<T>() / rule<T>() helpers.
// Lexemes and small attribute-coerced building blocks:
auto constantName = as<std::string>(x3::lexeme[ascii::upper >> *(ascii::alnum)]);
auto label = as<std::string>(x3::lexeme[ascii::lower >> *(ascii::alnum)]);
auto channel = as<AstChannel>(label >> x3::matches['!']);
auto action = as<AstAction>('<' >> channel >> ',' >> x3::ulong_ >> '>');
auto renamingPair = as<AstRenaming>(label > "=>" > label);
// The '0' literal is wrapped in x3::omit here instead of being followed by x3::attr(...).
auto nullProcess = as<AstNullProcess>(x3::omit['0']);
auto constant = as<AstConstantProcess>(constantName);
// Still embeds nullProcess rather than process, as in the question.
auto parenProcess = '(' > nullProcess > ')';
// Rules that yield the AstAnyProcess variant are built with rule<T>("name") directly.
auto primitive = rule<AstAnyProcess> ("primitive")
= parenProcess
| nullProcess
| constant;
auto restrictionActual = as<AstRestrictionProcess>(primitive >> '\\' >> '{' >> label % ',' >> '}');
auto restriction = rule<AstAnyProcess> ("restriction")
= primitive >> !x3::lit('\\')
| restrictionActual
;
auto renamingActual = as<AstRenamingProcess>(restriction >> '[' >> renamingPair % ',' >> ']');
auto renaming = rule<AstAnyProcess> ("renaming")
= restriction >> !x3::lit('[')
| renamingActual
;
// NOTE(review): `actionPrefix` and `process` are not declared in this snippet;
// presumably their declarations are carried over from the original program.
auto actionPrefixActual = as<AstActionPrefixProcess>(action > ('.' > actionPrefix));
auto actionPrefix_def = actionPrefixActual | renaming;
auto compositionActual = as<AstCompositionProcess>(actionPrefix % '|');
auto composition = rule<AstAnyProcess> ("composition")
= actionPrefix >> !x3::lit('|')
| compositionActual
;
auto choiceActual = as<AstChoiceProcess>(composition % '+');
auto choice = rule<AstAnyProcess> ("choice")
= composition >> !x3::lit('+')
| choiceActual
;
auto process_def = choice;
// Both recursive rules are defined in a single macro invocation.
BOOST_SPIRIT_DEFINE(actionPrefix, process)
auto const entry = x3::skip(ascii::space) [process];
Program still runs with same output.

Related

boost x3 grammar for structs with multiple constructors

Trying to figure out how to parse structs that have multiple constructors or overloaded constructors. For example in this case, a range struct that contains either a range or a singleton case where the start/end of the range is equal.
case 1: look like
"start-stop"
case 2:
"start"
For the range case
auto range_constraint = x3::rule<struct test_struct, MyRange>{} = (x3::int_ >> x3::lit("-") >> x3::int_);
works
but
auto range_constraint = x3::rule<struct test_struct, MyRange>{} = x3::int_ | (x3::int_ >> x3::lit("-") >> x3::int_);
unsurprisingly, won't match the signature and fails to compile.
Not sure what the fix is?
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/home/x3.hpp>
#include <iostream>
namespace x3 = boost::spirit::x3;
/// Integer range with two construction forms: `MyRange(start, end)` and the
/// single-value form `MyRange(start)`, which stores `start` in both members.
struct MyRange
{
    size_t start;
    size_t end;

    /// @param start first value of the range
    /// @param end   last value; 0 (the default) means "not given" and makes the
    ///              range collapse to the single value `start`
    // little bit weird because should be end+1, but w/e
    explicit MyRange(size_t start, size_t end = 0) : start(start), end(end)
    {
        if (end == 0)
        {
            this->end = start;
        }
    }
};
BOOST_FUSION_ADAPT_STRUCT(MyRange, start, end)
// BOOST_FUSION_ADAPT_STRUCT(MyRange, start)
//
/// Runs the range grammar over a few sample inputs and prints, for each one,
/// the input between backticks followed by the parse result.
int main()
{
    // Only the two-integer form is active; the alternative with a leading
    // single-integer branch is the one that does not compile.
    auto range_constraint = x3::rule<struct test_struct, MyRange>{} = (x3::int_ >> x3::lit("-") >> x3::int_);
    // auto range_constraint = x3::rule<struct test_struct, MyRange>{} = x3::int_ | (x3::int_ >> x3::lit("-") >> x3::int_);

    for (std::string input : {"1-2", "1", "1-", "garbage"})
    {
        auto cursor = input.begin();
        auto stop = input.end();
        bool matched = x3::phrase_parse(cursor, stop, range_constraint, x3::ascii::space);

        std::cout << "`" << input << "`";
        std::cout << "-> " << matched << std::endl;
    }
    return 0;
}

It's important to realize that sequence adaptation by definition uses default construction with subsequent sequence element assignment.
Another issue is branch ordering in PEG grammars: alternatives are tried in order, and int_ will always succeed wherever int_ >> '-' >> int_ would, so with int_ listed first you would never match the range version.
Finally, to parse size_t usually prefer uint_/uint_parser<size_t> :)
Things That Don't Work
There are several ways to skin this cat. For one, there's BOOST_FUSION_ADAPT_STRUCT_NAMED, which would allow you to do
BOOST_FUSION_ADAPT_STRUCT_NAMED(MyRange, Range, start, end)
BOOST_FUSION_ADAPT_STRUCT_NAMED(MyRange, SingletonRange, start)
So one pretty elaborate approach would be to spell it out:
auto range = x3::rule<struct _, Range>{} = uint_ >> '-' >> uint_;
auto singleton = x3::rule<struct _, SingletonRange>{} = uint_;
auto rule = x3::rule<struct _, MyRange>{} = range | singleton;
TIL that this doesn't even compile; apparently Qi was different: Live On Coliru
X3 requires the attribute to be default-constructible whereas Qi would attempt to bind to the passed-in attribute reference first.
Even in the Qi version you can see that the fact that Fusion sequences will be default-constructed-then-memberwise-assigned leads to results you didn't expect or want:
`1-2` -> true
-- [1,NIL)
`1` -> true
-- [1,NIL)
`1-` -> true
-- [1,NIL)
`garbage` -> false
What Works
Instead of doing the complicated things, do the simple thing. Anytime you see an optional value you can usually provide a default value. Alternatively you can not use Sequence adaptation at all, and go straight to semantic actions.
Semantic Actions
The simplest way would be to have specific branches:
auto assign1 = [](auto& ctx) { _val(ctx) = MyRange(_attr(ctx)); };
auto assign2 = [](auto& ctx) { _val(ctx) = MyRange(at_c<0>(_attr(ctx)), at_c<1>(_attr(ctx))); };
auto rule = x3::rule<void, MyRange>{} =
(uint_ >> '-' >> uint_)[assign2] | uint_[assign1];
Slightly more advanced, but more efficient:
auto assign1 = [](auto& ctx) { _val(ctx) = MyRange(_attr(ctx)); };
auto assign2 = [](auto& ctx) { _val(ctx) = MyRange(_val(ctx).start, _attr(ctx)); };
auto rule = x3::rule<void, MyRange>{} = uint_[assign1] >> -('-' >> uint_[assign2]);
Lastly, we can move towards defaulting the optional end:
auto rule = x3::rule<void, MyRange>{} =
(uint_ >> ('-' >> uint_ | x3::attr(MyRange::unspecified))) //
[assign];
Now the semantic action will have to deal with the variant end type:
auto assign = [](auto& ctx) {
auto start = at_c<0>(_attr(ctx));
_val(ctx) = apply_visitor( //
[=](auto end) { return MyRange(start, end); }, //
at_c<1>(_attr(ctx)));
};
Also Live On Coliru
Simplify?
I'd consider modeling the range explicitly as having an optional end:
// Range whose end is optional.
struct MyRange {
MyRange() = default;
// Builds a range starting at `s` with an optional end `e`; asserts that a
// supplied end is not smaller than the start.
MyRange(size_t s, boost::optional<size_t> e = {}) : start(s), end(e) {
assert(!e || *e >= s);
}
// end - start when an end is present, otherwise 1.
size_t size() const { return end? *end - start : 1; }
// True when size() is 0.
bool empty() const { return size() == 0; }
size_t start = 0;
// NOTE(review): the in-class default is written as 0 while the constructor's
// default argument is an empty optional — confirm which is intended for a
// default-constructed range.
boost::optional<size_t> end = 0;
};
Now you can directly use the optional to construct:
auto assign = [](auto& ctx) {
_val(ctx) = MyRange(at_c<0>(_attr(ctx)), at_c<1>(_attr(ctx)));
};
auto rule = x3::rule<void, MyRange>{} = (uint_ >> -('-' >> uint_))[assign];
Actually, here we can go back to using adapted sequences, although with different semantics:
Live On Coliru
#include <boost/fusion/adapted.hpp>
#include <boost/spirit/home/x3.hpp>
#include <iomanip>
#include <iostream>
namespace x3 = boost::spirit::x3;
// Plain aggregate version of the range, suitable for Fusion adaptation:
// a start value plus an optional end.
struct MyRange {
size_t start = 0;
// NOTE(review): defaulted with `= 0` rather than left empty — confirm that this
// is the intended state when the grammar's optional end does not match.
boost::optional<size_t> end = 0;
};
/// Streams a range as "[start,end)" when an end is present and as "[start,)"
/// when it is not. Returns the stream to allow chaining.
static inline std::ostream& operator<<(std::ostream& os, MyRange const& mr) {
    if (!mr.end) {
        return os << "[" << mr.start << ",)";
    }
    return os << "[" << mr.start << "," << *mr.end << ")";
}
BOOST_FUSION_ADAPT_STRUCT(MyRange, start, end)
int main() {
x3::uint_parser<size_t> uint_;
auto rule = x3::rule<void, MyRange>{} = uint_ >> -('-' >> uint_);
for (std::string const input : {"1-2", "1", "1-", "garbage"}) {
MyRange into;
auto success = phrase_parse(input.begin(), input.end(), rule, x3::space, into);
std::cout << quoted(input, '`') << " -> " << std::boolalpha << success
<< std::endl;
if (success) {
std::cout << " -- " << into << "\n";
}
}
}
Summarizing
I hope these strategies give you all the things you needed. Pay close attention to the semantics of your range. Specifically, I never paid any attention to the difference between "1" and "1-". You might want one to be [1,2) and the other to be [1,inf), both to be equivalent, or the second one might even be considered invalid?
Stepping back even further, I'd suggest that maybe you just needed
using Bound = std::optional<size_t>;
using MyRange = std::pair<Bound, Bound>;
Which you could parse directly with:
auto boundary = -x3::uint_parser<size_t>{};
auto rule = x3::rule<void, MyRange>{} = boundary >> '-' >> boundary;
It would allow for more inputs:
for (std::string const input : {"-2", "1-2", "1", "1-", "garbage"}) {
MyRange into;
auto success = phrase_parse(input.begin(), input.end(), rule, x3::space, into);
std::cout << quoted(input, '`') << " -> " << std::boolalpha << success
<< std::endl;
if (success) {
std::cout << " -- " << into << "\n";
}
}
Prints: Live On Coliru
`-2` -> true
-- [,2)
`1-2` -> true
-- [1,2)
`1` -> false
`1-` -> true
-- [1,)
`garbage` -> false

Cleanest way to handle both quoted and unquoted strings in Spirit.X3

Buon giorno,
I have to parse something such as:
foo: 123
"bar": 456
The quotes should be removed if they are here. I tried:
((+x3::alnum) | ('"' >> (+x3::alnum) >> '"'))
But the parser actions for this are of type variant<string, string> ; is there a way to make it so that the parser understands that those two are equivalent, and for my action to only get a single std::string as argument in its call?
edit: minimal repro (live on godbolt: https://gcc.godbolt.org/z/GcE8Pj4r5) :
#include <boost/spirit/home/x3.hpp>
using namespace boost::spirit;
// action handlers
/// Receiver for parser events; the EVENT macro fetches an instance of this
/// type from the parse context and calls one of its members.
struct handlers {
    /// Callback for a parsed member name. Deliberately does nothing in this
    /// minimal reproduction.
    void create_member(const std::string& str)
    {
        static_cast<void>(str);
    }
};
// rules
// Rule declared without an attribute type; the semantic action does all the work.
static const x3::rule<struct id_obj_mem> obj_mem = "obj_mem";
// EVENT(e) expands to a semantic action that fetches the `handlers` object from
// the parse context and calls its member `e` with the parsed attribute.
#define EVENT(e) ([](auto& ctx) { x3::get<handlers>(ctx).e(x3::_attr(ctx)); })
// Comma-separated list of `name : 123` entries, where name is either bare or
// double-quoted alphanumerics. The alternative of the two name forms is what
// the question is about.
static const auto obj_mem_def = ((
((+x3::alnum) | ('"' >> (+x3::alnum) >> '"'))
>> ':' >> x3::lit("123"))[EVENT(create_member)] % ',');
BOOST_SPIRIT_DEFINE(obj_mem)
// execution
int main()
{
handlers r;
std::string str = "foo: 123";
auto first = str.begin();
auto last = str.end();
bool res = phrase_parse(
first,
last,
boost::spirit::x3::with<handlers>(r)[obj_mem_def],
boost::spirit::x3::ascii::space);
}

I too consider this a kind of defect. X3 is definitely less "friendly" in terms of the synthesized attribute types. I guess it's just a tacit side-effect of being more core-language oriented, where attribute assignment is effectively done via default "visitor" actions.
Although I understand the value of keeping the magic to a minimum, and staying close to "pure C++", I vastly prefer the Qi way of synthesizing attributes here. I believe it has proven a hard problem to fix, as this problem has been coming/going in some iterations of X3.
I've long decided to basically fix it myself with variations of this idiom:
// Function object that coerces any parser to attribute type T by assigning it
// to an unnamed x3::rule with that attribute.
template <typename T> struct as_type {
// NOTE(review): an `auto` function parameter outside a lambda needs C++20 (or a compiler extension).
auto operator()(auto p) const { return x3::rule<struct Tag, T>{} = p; }
};
// Ready-made instance for the common std::string case.
static constexpr as_type<std::string> as_string{};
Now I'd write that as:
auto quoted = '"' >> +x3::alnum >> '"';
auto name = as_string(+x3::alnum | quoted);
auto prop = (name >> ':' >> "123")[EVENT(create_member)] % ',';
That will compile no problem:
Live On Coliru
#include <boost/spirit/home/x3.hpp>
#include <iomanip>
#include <iostream>
namespace x3 = boost::spirit::x3;
/// Receiver for parser events; the EVENT macro fetches an instance of this
/// type from the parse context and calls one of its members.
struct handlers {
    /// Logs the function name and the quoted member name to std::cerr,
    /// followed by a newline.
    void create_member(std::string const& str) {
        std::ostream& log = std::cerr;
        log << __FUNCTION__;
        log << " " << std::quoted(str);
        log << "\n";
    }
};
namespace Parser {
// EVENT(e) expands to a semantic action that fetches the `handlers` object from
// the parse context and calls its member `e` with the parsed attribute.
#define EVENT(e) ([](auto& ctx) { get<handlers>(ctx).e(_attr(ctx)); })
// Function object that coerces any parser to attribute type T by assigning it
// to an unnamed x3::rule with that attribute.
template <typename T> struct as_type {
auto operator()(auto p) const { return x3::rule<struct Tag, T>{} = p; }
};
static constexpr as_type<std::string> as_string{};
// Double-quoted alphanumerics; the quote characters are literals and so are not part of the attribute.
auto quoted = '"' >> +x3::alnum >> '"';
// Bare or quoted name, coerced to a single std::string attribute.
auto name = as_string(+x3::alnum | quoted);
// Comma-separated `name : 123` entries; each one triggers create_member.
auto prop = (name >> ':' >> "123")[EVENT(create_member)] % ',';
// Whole grammar with whitespace skipping built in, so plain parse() can be used.
auto grammar = x3::skip(x3::space)[prop];
} // namespace Parser
/// Parses a fixed sample line with Parser::grammar, routing events to a
/// `handlers` instance supplied through x3::with.
///
/// @return 0 when the parse succeeds, 1 otherwise (the conventional process
///         exit status; the original returned the inverse).
int main() {
    handlers r;
    std::string const str = "foo: 123";
    auto first = str.begin(), last = str.end();
    bool res = parse(first, last, x3::with<handlers>(r)[Parser::grammar]);
    return res ? 0 : 1;
}
Prints
create_member "foo"
Interesting Links
Spirit X3, How to get attribute type to match rule type?
Combining rules at runtime and returning rules
spirit x3 cannot propagate attributes of type optional<vector>
etc.

Parsing text including Unicode using spirit crashes with boost-1.78, but OK with boost-1.67, Why?

I wrote the following code which crashes with boost-1.78;
While, I replace std::string input = "geo_dip_subdivision:(+国 -民)"; with std::string input = "geo_dip_subdivision:(+1 -2)";, it runs as expected.
Also, it runs as expected with boost-1.67 and std::string input = "geo_dip_subdivision:(+国 -民)";
So, it is a problem related to Unicode. But I don't know what is the problem, and why it seems running as expected in boost-1.67.
Any help?
#include <string.h>
#define BOOST_SPIRIT_UNICODE
#include <boost/phoenix.hpp>
#include <boost/phoenix/operator.hpp>
#include <boost/spirit/include/qi.hpp>
namespace DB {
using std::vector;
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
// Double-quoted string pattern.
// NOTE(review): because '>' binds tighter than '|', the escaped-quote literal is
// an alternative to the whole `'"' > *(...)` sequence, not to a single character
// inside the repetition — confirm this is intended.
#define str_pattern (('"' > *(qi::unicode::char_ - ('"')) | "\\\"") > '"')
// Opening / closing delimiters of a range. Either bracket kind is accepted on
// either side; nothing here pairs '[' with ']' or '{' with '}'.
#define sym_open (char_('[') | char_('{'))
#define sym_close (char_(']') | char_('}'))
struct query_tree;
typedef boost::variant<std::string, query_tree> node;
/// Boolean query node with three buckets of children: `must`, `must_not` and
/// `should`. The constructors build a new tree from an existing one (whose
/// vectors are moved out of) plus one more operand.
struct query_tree {
    vector<node> must;
    vector<node> must_not;
    vector<node> should;

    query_tree() = default;

    /// Combines `old` with `v` according to `type`.
    /// For type 3, `old`'s should-list (or, when that is empty, its must-list)
    /// becomes the new must-list and the single element held by `v` is appended
    /// to `must` or `must_not`. For any other type all three lists are taken
    /// over from `old` and `v` is stored as a child in the bucket chosen by
    /// `type` (see push_back).
    query_tree(int type, query_tree& old, const query_tree& v)
    {
        if (type != 3) {
            must = std::move(old.must);
            must_not = std::move(old.must_not);
            should = std::move(old.should);
            push_back(type, v);
            return;
        }

        assert(old.should.size() == 0 || old.must.size() == 0);
        assert(v.should.size() + v.must.size() + v.must_not.size() == 1);

        must = std::move(old.should.size() > 0 ? old.should : old.must);
        must_not = std::move(old.must_not);

        if (v.should.size() > 0) {
            must.push_back(v.should[0]);
        } else if (v.must.size() > 0) {
            must.push_back(v.must[0]);
        } else {
            must_not.push_back(v.must_not[0]);
        }
    }

    /// Builds a tree holding the single string `n` in the bucket chosen by `type`.
    query_tree(int type, const std::string& n) { push_back(type, n); }

    /// Appends `v` to `must` (type 0), `must_not` (type 1) or `should` (any other type).
    template <typename T> void push_back(int type, const T& v)
    {
        switch (type) {
        case 0:
            must.push_back(v);
            break;
        case 1:
            must_not.push_back(v);
            break;
        default:
            should.push_back(v);
            break;
        }
    }

    /// Merges two trees: each list is taken over from `old` and then extended
    /// with copies of the corresponding list of `v`.
    query_tree(query_tree& old, const query_tree& v)
    {
        must = std::move(old.must);
        must.insert(must.end(), v.must.begin(), v.must.end());

        must_not = std::move(old.must_not);
        must_not.insert(must_not.end(), v.must_not.begin(), v.must_not.end());

        should = std::move(old.should);
        should.insert(should.end(), v.should.begin(), v.should.end());
    }
};
// Qi grammar that parses a query string into a query_tree. The start rule is `query`.
template <typename It, typename Skipper = qi::space_type>
struct parser : qi::grammar<It, query_tree(), Skipper> {
parser() : parser::base_type(query)
{
using namespace qi;
// part1: raw text made of quoted strings and any characters other than parentheses.
part1 = raw[lexeme[*(str_pattern | qi::unicode::char_ - (char_(')') | char_('(')))]];
// part2 / parenthese: mutually recursive rules that accumulate the text of a
// parenthesised section, including the parentheses, into one string.
part2 = part1[_val = _1] > *(parenthese[_val = _val + _1]) >
(char_(')')[_val = _val + _1] | part2[_val = _val + _1]);
parenthese = char_('(')[_val = _1] > part2[_val = _val + _1];
// range: raw text from an opening bracket/brace to a closing bracket/brace.
range = raw[lexeme[sym_open > *(char_ - sym_close) > sym_close]];
// name: raw text up to and including the ':' that ends a field name.
name = raw[lexeme[+(qi::unicode::char_ - (':' | space | ')')) > ':']];
// other_value: raw run of characters up to whitespace or ')'.
other_value = raw[lexeme[+(qi::unicode::char_ - space - ')')]];
string_value = raw[lexeme[str_pattern]];
// field: the name text followed by the text of its value, concatenated.
field =
name[_val = _1] > (string_value | parenthese | range | other_value)[_val = _val + _1];
group = '(' > query > ')';
must = "+" > (group[_val = _1] | field[_val = phx::construct<query_tree>(0, _1)]);
// NOTE(review): the field branch constructs with type 0 here as well, the same
// value the `must` rule uses — confirm that 1 was not intended.
must_not = (string("-") | string("NOT")) >
(group[_val = _1] | field[_val = phx::construct<query_tree>(0, _1)]);
should = group[_val = _1] | field[_val = phx::construct<query_tree>(2, _1)];
// expr: wraps the operand into the current value using type 0, 1 or 2.
expr = (must[_val = phx::construct<query_tree>(0, _val, _1)] |
must_not[_val = phx::construct<query_tree>(1, _val, _1)] |
should[_val = phx::construct<query_tree>(2, _val, _1)]);
// And: operands joined by "AND" / "&&" are combined with type 3.
And = expr[_val = phx::construct<query_tree>(_val, _1)] >
*((string("AND") | string("&&")) >
expr[_val = phx::construct<query_tree>(3, _val, _1)]);
// Or: operands joined by "OR" / "||" are merged list by list.
Or = And[_val = _1] >
*((string("OR") | string("||")) > And[_val = phx::construct<query_tree>(_val, _1)]);
// query: zero or more Or-expressions, merged into one tree.
query = *(Or[_val = phx::construct<query_tree>(_val, _1)]);
}
private:
// String-valued rules that use the grammar's skipper.
qi::rule<It, std::string(), Skipper> field, name, string_value, other_value;
// String-valued rules declared with qi::no_skip_type in the skipper position.
qi::rule<It, std::string(), qi::no_skip_type> parenthese, part1, part2, range;
// Tree-valued rules.
qi::rule<It, query_tree(), Skipper> must, must_not, should, query, expr, group, And, Or;
};
std::string parse_from_lucene(std::string& input)
{
auto f(std::begin(input)), l(std::end(input));
parser<decltype(f)> p;
std::string str;
try {
query_tree result;
bool ok = qi::phrase_parse(f, l, p, qi::space, result);
if (!ok) {
throw "invalid input: " + input;
}
} catch (const qi::expectation_failure<decltype(f)>& e) {
throw "expectation_failure at '" + std::string(e.first, e.last) + "'\n";
}
return str;
}
};
/// Feeds one sample query containing non-ASCII characters through
/// DB::parse_from_lucene and prints the returned string.
int main()
{
    std::string query = "geo_dip_subdivision:(+国 -民)";
    query = DB::parse_from_lucene(query);
    std::cout << query << std::endl;
    return 0;
}

Mmm. There are many iffy things about the grammar.
over-use of semantic actions (Boost Spirit: "Semantic actions are evil"?)
using qi::no_skip_type as a... skipper?
mixing char_/lit/raw
mixing primitives from unicode and standard encoding
throwing literals
the str_pattern macro had a bug (see fixed below)
the logic to parse open/close braces seems buggy since it doesn't check matching pairs
parsing into a query_tree, but returning a default-constructed str - the output will always be empty?
note: you don't check that the full input is parsed; partial parses may lead to surprising errors
note: some logic in the type==3 constructor seems buggy (should becomes must?)
If you gave me a reference to the grammar documentation and a list of examples that you expect to parse, I think I'd be able to simplify this to about half the code while removing these errors.
Here's an initial set of refactorings I see that were necessary:
Live On Coliru
#define BOOST_SPIRIT_UNICODE
#include <string.h>
#include <boost/phoenix.hpp>
#include <boost/phoenix/operator.hpp>
#include <boost/spirit/include/qi.hpp>
namespace DB {
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
namespace encoding = qi::unicode;
// Double-quoted string: the escaped-quote literal is now an alternative inside
// the repetition, tried before the "any character except a quote" branch.
#define str_pattern ('"' > *("\\\"" | encoding::char_ - ('"')) > '"') // BUG
// Opening / closing delimiters of a range, each written as a character set.
// Either bracket kind is still accepted on either side.
#define sym_open (encoding::char_("[{")) // BUG
#define sym_close (encoding::char_("]}")) // BUG
struct query_tree;
using String = std::u32string;
using node = boost::variant<String, query_tree>;
struct query_tree {
std::vector<node> must;
std::vector<node> must_not;
std::vector<node> should;
query_tree() = default;
query_tree(int type, query_tree& old, const query_tree& v) {
if (type == 3) {
assert(old.should.size() == 0 || old.must.size() == 0);
assert(v.should.size() + v.must.size() + v.must_not.size() == 1);
if (old.should.size() > 0) {
must = std::move(old.should); // PROBABLY BUG?
} else {
must = std::move(old.must);
}
must_not = std::move(old.must_not);
if (v.should.size() > 0) {
must.push_back(v.should[0]); // PROBABLY BUG?
} else if (v.must.size() > 0) {
must.push_back(v.must[0]);
} else {
must_not.push_back(v.must_not[0]);
}
} else {
must = std::move(old.must);
must_not = std::move(old.must_not);
should = std::move(old.should);
push_back(type, v);
}
}
query_tree(int type, const String& n) { push_back(type, n); }
template <typename T> void push_back(int type, const T& v) {
if (type == 0) {
must.push_back(v);
} else if (type == 1) {
must_not.push_back(v);
} else {
should.push_back(v);
}
}
query_tree(query_tree& old, const query_tree& v) {
must = std::move(old.must);
for (size_t i = 0; i < v.must.size(); i++) {
must.push_back(v.must[i]);
}
must_not = std::move(old.must_not);
for (size_t i = 0; i < v.must_not.size(); i++) {
must_not.push_back(v.must_not[i]);
}
should = std::move(old.should);
for (size_t i = 0; i < v.should.size(); i++) {
should.push_back(v.should[i]);
}
}
};
// Qi grammar that parses a query into a query_tree. The public start rule has
// no skipper of its own; it installs the whitespace skipper internally and
// requires the whole input to be consumed (qi::eoi).
template <typename It> struct parser : qi::grammar<It, query_tree()> {
parser() : parser::base_type(start) {
using encoding::char_;
using encoding::space;
using encoding::string;
using qi::lexeme;
using qi::raw;
using namespace qi::labels;
// Reusable semantic actions; deep_copy is applied so the expression templates
// can be stored in `auto` variables.
auto SET = boost::proto::deep_copy(_val = _1);
auto ADD = boost::proto::deep_copy(_val = _val + _1);
// NOTE(review): TREE is defined here but not used in the visible code.
#define TREE(...) _val = phx::construct<query_tree>(__VA_ARGS__)
// part1: raw text made of quoted strings and any characters other than parentheses.
part1 = raw[*(str_pattern | char_ - char_("()"))];
// part2 / parenthese: mutually recursive rules that accumulate the text of a
// parenthesised section, including the parentheses, into one string.
part2 = part1[SET] > *(parenthese[ADD]) >
(char_(')')[ADD] | part2[ADD]);
parenthese = char_('(')[SET] > part2[ADD];
// range: raw text from an opening bracket/brace to a closing bracket/brace.
range = raw[sym_open > *(char_ - sym_close) > sym_close];
// name: raw text up to and including the ':' that ends a field name.
name = raw[+(encoding::char_ - (':' | space | ')')) > ':'];
// other_value: raw run of characters up to whitespace or ')'.
other_value = raw[+(encoding::char_ - space - ')')];
string_value = raw[str_pattern];
// field: the name text followed by the text of its value, concatenated.
field = name[SET] >
(string_value | parenthese | range | other_value)[ADD];
group = '(' > query > ')';
must = "+" > (group[SET] |
field[_val = phx::construct<query_tree>(0, _1)]);
// NOTE(review): the field branch constructs with type 0 here as well, the same
// value the `must` rule uses — confirm that 1 was not intended.
must_not = (string("-") | string("NOT")) >
(group[SET] |
field[_val = phx::construct<query_tree>(0, _1)]);
should =
group[SET] | field[_val = phx::construct<query_tree>(2, _1)];
// expr: wraps the operand into the current value using type 0, 1 or 2.
expr = (must[_val = phx::construct<query_tree>(0, _val, _1)] |
must_not[_val = phx::construct<query_tree>(1, _val, _1)] |
should[_val = phx::construct<query_tree>(2, _val, _1)]);
// And: operands joined by "AND" / "&&" are combined with type 3.
And = expr[_val = phx::construct<query_tree>(_val, _1)] >
*((string("AND") | string("&&")) >
expr[_val = phx::construct<query_tree>(3, _val, _1)]);
// Or: operands joined by "OR" / "||" are merged list by list.
Or = And[SET] >
*((string("OR") | string("||")) >
And[_val = phx::construct<query_tree>(_val, _1)]);
// query: zero or more Or-expressions, merged into one tree.
query = *(Or[_val = phx::construct<query_tree>(_val, _1)]);
// start: run `query` under the whitespace skipper and demand end of input.
start = qi::skip(encoding::space) [ query > qi::eoi ];
}
private:
qi::rule<It, query_tree()> start;
//
// Tree-valued rules, parsed with the whitespace skipper.
qi::rule<It, query_tree(), encoding::space_type> must, must_not, should,
query, expr, group, And, Or;
qi::rule<It, String(), encoding::space_type> field;
// lexemes:
// Declared without a skipper.
qi::rule<It, String()> parenthese, part1, part2, range, name,
string_value, other_value;
};
/// Parses `input` with the query grammar.
///
/// The parsed tree is discarded; the returned string is always the
/// default-constructed (empty) one. The grammar is prefixed with `qi::eps >`,
/// and an expectation failure is rethrown as std::runtime_error.
/// NOTE(review): the error message is built as a std::string from iterators
/// over `String` (std::u32string), so each code unit is narrowed to char —
/// confirm whether a proper UTF-8 conversion is wanted here.
String parse_from_lucene(String const& input) {
    auto cursor = std::begin(input);
    auto stop = std::end(input);
    // One grammar instance is shared by all calls.
    static const parser<decltype(cursor)> grammar{};
    String rendered;
    try {
        query_tree tree;
        qi::parse(cursor, stop, qi::eps > grammar, tree);
    } catch (const qi::expectation_failure<decltype(cursor)>& failure) {
        throw std::runtime_error("expectation_failure at '" +
                                 std::string(failure.first, failure.last) + "'\n");
    }
    return rendered;
}
} // namespace DB
#include <codecvt>
#include <locale>
int main()
{
DB::String input = U"geo_dip_subdivision:(+国 -民)";
input = DB::parse_from_lucene(input);
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
std::cout << converter.to_bytes(input) << std::endl;
}
With no output, as expected.

Boost Spirit X3: Collapsing one-element lists

Say I have a (simplified) recursive grammar like this:
OrExpr := AndExpr % "or"
AndExpr := Term % "and"
Term := ParenExpr | String
ParenExpr := '(' >> OrExpr >> ')'
String := lexeme['"' >> *(char_ - '"') >> '"']
So this works, but the problem is that it will wrap everything in multiple layers of expression. For example, the string "hello" and ("world" or "planet" or "globe") would parse as OrExpr(AndExpr("hello", OrExpr(AndExpr("world"), AndExpr("planet"), AndExpr("globe")))) (playing fast and loose with the syntax, but hopefully you understand). What I'd like is for the one-element nodes to be collapsed into their parent, so it would end up as AndExpr("hello", OrExpr("world", "planet", "globe")).
This can be solved with actions and using a state machine that only constructs the outer object if there's more than one child inside it. But I'm wondering if there's a way to fix this problem without using parser actions?
EDIT: Almost minimal example
Coliru
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <iostream>
namespace x3 = boost::spirit::x3;
namespace burningmime::setmatch::ast
{
// an expression node (either an AND or an OR)
struct Expr;
// child of an expression -- either another expression, or a terminal
// Child of an expression: either a terminal string or a nested Expr. Expr is
// wrapped in x3::forward_ast because it is still incomplete at this point.
struct Node : x3::variant<std::string, x3::forward_ast<Expr>>
{
// Re-export the variant's constructors and assignment operators.
using base_type::base_type;
using base_type::operator=;
};
// tags for expression type
// Operator tag stored in Expr::op; the grammar injects it with x3::attr.
enum OPER
{
OPER_AND = 1,
OPER_OR = 2
};
// see above
// An AND or OR expression: the operator tag plus its operands in order.
struct Expr
{
// Which operator joins the children.
OPER op;
// Operands; each is a terminal string or a nested expression.
std::vector<Node> children;
};
// for debugging purposes; this will print all the expressions
/// Visitor that writes an expression tree to std::cout in the form
/// `And(a, Or(b, c))`. Intended for debugging.
struct AstPrinter
{
    /// Prints "And(" or "Or(", then every child separated by ", ", then ")".
    void operator()(const Expr& node) const
    {
        if (node.op == OPER_AND)
            std::cout << "And(";
        else
            std::cout << "Or(";

        for (std::size_t i = 0; i < node.children.size(); ++i)
        {
            if (i != 0)
                std::cout << ", ";
            // Recurse through the variant: strings and nested expressions both
            // end up in one of the overloads of this visitor.
            boost::apply_visitor(*this, node.children[i]);
        }

        std::cout << ")";
    }

    /// Prints a terminal as-is.
    void operator()(const std::string& node) const
    {
        std::cout << node;
    }
};
}
// these need to be at top-level scope
// basically this adds compile-time type information, so the parser knows where to put various attributes
BOOST_FUSION_ADAPT_STRUCT(burningmime::setmatch::ast::Expr, op, children)
// DECLARE_RULE(NAME, TYPE): declares an x3::rule object called NAME whose tag
// class is also called NAME, whose attribute is TYPE and whose debug name is "NAME".
#define DECLARE_RULE(NAME, TYPE) static const x3::rule<class NAME, TYPE> NAME = #NAME;
// KEYWORD(X): declares kw_X, a case-insensitive parser for the text X.
#define KEYWORD(X) static const auto kw_##X = x3::no_case[#X];
// DEFINE_RULE(NAME, GRAMMAR): declares NAME_def as GRAMMAR and hands NAME to
// BOOST_SPIRIT_DEFINE.
#define DEFINE_RULE(NAME, GRAMMAR) \
static const auto NAME##_def = GRAMMAR; \
BOOST_SPIRIT_DEFINE(NAME)
namespace burningmime::setmatch::parser
{
// we need to pre-declare the rules so they can be used recursively
DECLARE_RULE(Phrase, std::string)
DECLARE_RULE(Term, ast::Node)
DECLARE_RULE(AndExpr, ast::Expr)
DECLARE_RULE(OrExpr, ast::Expr)
DECLARE_RULE(ParenExpr, ast::Expr)
// keywords
KEYWORD(and)
KEYWORD(or)
static const auto lparen = x3::lit('(');
static const auto rparen = x3::lit(')');
// helper parsers
// Everything a bare word must not be: a keyword or a parenthesis.
static const auto keywords = kw_and | kw_or | lparen | rparen;
// A run of characters containing no whitespace and no parentheses.
static const auto word = x3::lexeme[+(x3::char_ - x3::ascii::space - lparen - rparen)];
static const auto bareWord = word - keywords;
// The quotes are matched with x3::char_, so they are part of the attribute.
static const auto quotedString = x3::lexeme[x3::char_('"') >> *(x3::char_ - '"') >> x3::char_('"')];
DEFINE_RULE(Phrase, quotedString | bareWord)
DEFINE_RULE(Term, ParenExpr | Phrase)
DEFINE_RULE(ParenExpr, lparen >> OrExpr >> rparen)
// x3::attr injects the operator tag as the first member of ast::Expr; the list
// that follows fills `children`. A one-element list therefore still produces a
// wrapping Expr, which is the behaviour the question asks about.
DEFINE_RULE(AndExpr, x3::attr(ast::OPER_AND) >> (Term % kw_and))
DEFINE_RULE(OrExpr, x3::attr(ast::OPER_OR) >> (AndExpr % kw_or))
}
namespace burningmime::setmatch
{
void parseRuleFluent(const char* buf)
{
ast::Expr root;
auto start = buf, end = start + strlen(buf);
bool success = x3::phrase_parse(start, end, parser::OrExpr, x3::ascii::space, root);
if(!success || start != end)
throw std::runtime_error(std::string("Could not parse rule: ") + buf);
printf("Result of parsing: %s\n=========================\n", start);
ast::Node root2(root);
boost::apply_visitor(ast::AstPrinter(), root2);
}
}
int main()
{
burningmime::setmatch::parseRuleFluent(R"#("hello" and ("world" or "planet" or "globe"))#");
}

#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <iostream>
namespace x3 = boost::spirit::x3;
namespace burningmime::setmatch::ast
{
    // Declared early because Node (below) must be able to name it.
    struct Expr;

    // A child of an expression: either a terminal string or another expression.
    // x3::forward_ast lets the variant hold the not-yet-complete Expr.
    struct Node : x3::variant<std::string, x3::forward_ast<Expr>>
    {
        using base_type::base_type;
        using base_type::operator=;
    };

    // Operator tag stored in every Expr.
    enum OPER
    {
        OPER_AND = 1,
        OPER_OR = 2
    };

    // An AND or OR node and the operands beneath it.
    struct Expr
    {
        OPER op;
        std::vector<Node> children;
    };

    // Visitor used for debugging; dumps the tree to std::cout, e.g. And(a, Or(b, c)).
    struct AstPrinter
    {
        // Operator name, opening parenthesis, comma-separated operands, closing parenthesis.
        void operator()(const Expr& expr) const
        {
            if(expr.op == OPER_AND)
                std::cout << "And(";
            else
                std::cout << "Or(";

            auto it = expr.children.begin();
            const auto last = expr.children.end();
            if(it != last)
            {
                // The first operand is printed bare; each later one is preceded by ", ".
                boost::apply_visitor(*this, *it);
                for(++it; it != last; ++it)
                {
                    std::cout << ", ";
                    boost::apply_visitor(*this, *it);
                }
            }

            std::cout << ")";
        }

        // A terminal is written out unchanged.
        void operator()(const std::string& text) const
        {
            std::cout << text;
        }
    };
}
// these need to be at top-level scope
// basically this adds compile-time type information, so the parser knows where to put various attributes
// (Expr is exposed as a Fusion sequence whose elements are op and children, in that order)
BOOST_FUSION_ADAPT_STRUCT(burningmime::setmatch::ast::Expr, op, children)
// DECLARE_RULE(NAME, TYPE): declares a static const x3::rule object called NAME with attribute type TYPE.
// The rule's tag type is `class NAME_r` (NAME with an _r suffix), so the tag and the rule object
// have different identifiers; the rule's name string is the stringized NAME.
#define DECLARE_RULE(NAME, TYPE) static const x3::rule<class NAME##_r, TYPE> NAME = #NAME;
// KEYWORD(X): declares kw_X, the stringized X wrapped in x3::no_case.
#define KEYWORD(X) static const auto kw_##X = x3::no_case[#X];
// DEFINE_RULE(NAME, GRAMMAR): stores GRAMMAR in NAME_def, then passes NAME to BOOST_SPIRIT_DEFINE.
// The expansion does not end with a semicolon.
#define DEFINE_RULE(NAME, GRAMMAR) \
static const auto NAME##_def = GRAMMAR; \
BOOST_SPIRIT_DEFINE(NAME)
// Grammar for boolean expressions built from phrases, `and`, `or` and parentheses.
// Every expression rule here produces an ast::Node, so a one-element list can collapse to its element.
namespace burningmime::setmatch::parser
{
// we need to pre-declare the rules so they can be used recursively
// (Term uses ParenExpr, ParenExpr uses OrExpr, OrExpr uses AndExpr, AndExpr uses Term)
DECLARE_RULE(Phrase, std::string)
DECLARE_RULE(Term, ast::Node)
DECLARE_RULE(AndExpr, ast::Node)
DECLARE_RULE(OrExpr, ast::Node)
DECLARE_RULE(ParenExpr, ast::Node)
// keywords (the KEYWORD macro wraps each one in x3::no_case)
KEYWORD(and)
KEYWORD(or)
static const auto lparen = x3::lit('(');
static const auto rparen = x3::lit(')');
// helper parsers
// everything that must not be accepted as a bare word
static const auto keywords = kw_and | kw_or | lparen | rparen;
// one or more characters that are neither ASCII whitespace nor a parenthesis
static const auto word = x3::lexeme[+(x3::char_ - x3::ascii::space - lparen - rparen)];
// a word, refused whenever `keywords` matches at the same position
// NOTE(review): `keywords` has no word-boundary check, so a word that merely starts with
// "and" or "or" (e.g. "android") looks like it would be refused as well -- confirm.
static const auto bareWord = word - keywords;
// a double-quoted string; the quotes are parsed with char_, so they end up in the attribute
static const auto quotedString = x3::lexeme[x3::char_('"') >> *(x3::char_ - '"') >> x3::char_('"')];
// Phrase: a quoted string or a bare word
DEFINE_RULE(Phrase, quotedString | bareWord)
// Term: a parenthesized expression or a phrase
DEFINE_RULE(Term, ParenExpr | Phrase)
// ParenExpr: an OrExpr enclosed in parentheses
DEFINE_RULE(ParenExpr, lparen >> OrExpr >> rparen)
// Semantic action attached to the list parsers below. Op is the operator recorded when the
// list is turned into an ast::Expr.
template <ast::OPER Op>
struct make_node
{
// _attr(ctx) is the parsed list, _val(ctx) is the value of the enclosing rule.
template <typename Context >
void operator()(Context const& ctx) const
{
// exactly one element: pass it through unchanged instead of wrapping it in an Expr
if (_attr(ctx).size() == 1)
_val(ctx) = std::move(_attr(ctx)[0]);
else
// otherwise build an Expr tagged with Op that takes over the whole list
_val(ctx) = ast::Expr{ Op, std::move(_attr(ctx)) };
}
};
// AndExpr: Terms separated by `and`, turned into a Node by make_node<OPER_AND>
DEFINE_RULE(AndExpr, (Term % kw_and)[make_node<ast::OPER_AND>{}])
// OrExpr: AndExprs separated by `or`, turned into a Node by make_node<OPER_OR>
DEFINE_RULE(OrExpr, (AndExpr % kw_or)[make_node<ast::OPER_OR>{}])
}
namespace burningmime::setmatch
{
    // Parses the NUL-terminated rule text in `buf` with parser::OrExpr (skipping ASCII
    // whitespace) and prints the resulting tree with ast::AstPrinter.
    // Throws std::runtime_error when the parse fails or stops before the end of the text.
    void parseRuleFluent(const char* buf)
    {
        const char* cursor = buf;
        const char* inputEnd = buf + strlen(buf);

        ast::Node root;
        const bool matched = x3::phrase_parse(cursor, inputEnd, parser::OrExpr, x3::ascii::space, root);
        const bool fullyConsumed = (cursor == inputEnd);
        if (!(matched && fullyConsumed))
            throw std::runtime_error(std::string("Could not parse rule: ") + buf);

        // cursor now sits on the terminating NUL, so %s prints the (empty) unparsed remainder.
        printf("Result of parsing: %s\n=========================\n", cursor);
        boost::apply_visitor(ast::AstPrinter(), root);
    }
}
int main()
{
burningmime::setmatch::parseRuleFluent(R"#("hello" and ("world" or "planet" or "globe"))#");
}
https://wandbox.org/permlink/kMSHOHG0pgwGr0zv
Output:
Result of parsing:
=========================
And("hello", Or("world", "planet", "globe"))

Optimizing the grammar

I have also asked this question on the Boost.Spirit mailing list:
http://boost.2283326.n4.nabble.com/Spirit-X3-Boost-1-59-Compilation-never-finishes-for-a-recursive-grammar-td4693813.html
I am working on creating an xpath2.0 parser as per the RFC. It's basically a subproject of another project that I am working on.
After some initial success, I made the mistake of writing a bunch of grammar rules and AST types at once instead of compiling and testing at every step. After that I basically had a novel of template error messages to read (my own fault, actually).
Below I present a reduced grammar for XPath (not strictly as per the RFC) whose compilation does not finish; I had to stop the process when my Mac started slowing down after about 7 minutes.
#include <iostream>
#include <string>
#include <vector>
#include <boost/optional.hpp>
#include <boost/optional/optional_io.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/home/x3.hpp>
#include <boost/fusion/include/io.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
namespace x3 = boost::spirit::x3;
// AST node types for the reduced XPath grammar.
namespace ast {
// Forward declarations: both types are referenced through x3::forward_ast before they are defined.
struct or_expression;
struct function_call;
// One (operator, operand) pair in the tail of an operator chain; T is the operand's AST type.
template <typename T>
struct operation_sequence_entry
{
std::string op;
T expr;
};
// A primary expression: holds one of the alternative types listed in the variant.
// NOTE(review): std::string appears twice in the alternative list -- confirm this is intended.
struct primary_expression: x3::variant<
std::string,
x3::forward_ast<or_expression>,
std::string,
int32_t,
uint32_t,
double,
x3::forward_ast<function_call>
>
{
using base_type::base_type;
using base_type::operator=;
};
// A primary expression together with a list of predicates (each predicate is an or_expression).
struct filter_expression
{
primary_expression prim_expr;
std::vector<x3::forward_ast<or_expression>> predicates;
};
// A path expression: either an optional filter expression or an optional primary expression.
struct path_expression: x3::variant<
boost::optional<filter_expression>,
boost::optional<primary_expression>
>
{
using base_type::base_type;
using base_type::operator=;
};
// A union expression is simply a sequence of path expressions.
using union_expression = std::vector<path_expression>;
// Wraps a union expression.
struct unary_expression
{
union_expression expr;
};
// The three structs below share one shape: a left-hand operand followed by
// zero or more (operator, operand) entries.
struct eq_expression
{
using expr_seq_type = operation_sequence_entry<unary_expression>;
unary_expression lhs_expr;
std::vector<expr_seq_type> rhs_expr;
};
struct and_expression
{
using expr_seq_type = operation_sequence_entry<eq_expression>;
eq_expression lhs_expr;
std::vector<expr_seq_type> rhs_expr;
};
struct or_expression
{
using expr_seq_type = operation_sequence_entry<and_expression>;
and_expression lhs_expr;
std::vector<expr_seq_type> rhs_expr;
};
// A function name and its arguments (each argument is an or_expression).
struct function_call
{
std::string func_name;
std::vector<or_expression> args;
};
}
// Fusion adaptations: each one lists a struct's members (type, name) in declaration order
// so the struct can be treated as a Fusion sequence.

// Template adaptation covering every operation_sequence_entry<T>.
BOOST_FUSION_ADAPT_TPL_STRUCT(
(T),
(ast::operation_sequence_entry)(T),
(std::string, op),
(T, expr)
);
// unary_expression: its single member, expr.
BOOST_FUSION_ADAPT_STRUCT(
ast::unary_expression,
(ast::union_expression, expr)
);
// eq_expression: lhs_expr, then the (operator, operand) tail.
BOOST_FUSION_ADAPT_STRUCT(
ast::eq_expression,
(ast::unary_expression, lhs_expr),
(std::vector<typename ast::eq_expression::expr_seq_type>, rhs_expr)
);
// and_expression: lhs_expr, then the (operator, operand) tail.
BOOST_FUSION_ADAPT_STRUCT(
ast::and_expression,
(ast::eq_expression, lhs_expr),
(std::vector<typename ast::and_expression::expr_seq_type>, rhs_expr)
);
// or_expression: lhs_expr, then the (operator, operand) tail.
BOOST_FUSION_ADAPT_STRUCT(
ast::or_expression,
(ast::and_expression, lhs_expr),
(std::vector<typename ast::or_expression::expr_seq_type>, rhs_expr)
);
// function_call: the name, then the argument list.
BOOST_FUSION_ADAPT_STRUCT(
ast::function_call,
(std::string, func_name),
(std::vector<ast::or_expression>, args)
);
// filter_expression: the primary expression, then its predicates.
BOOST_FUSION_ADAPT_STRUCT(
ast::filter_expression,
(ast::primary_expression, prim_expr),
(std::vector<x3::forward_ast<ast::or_expression>>, predicates)
);
// Parser rules for the reduced XPath grammar. Most rules are defined inline at their point of
// declaration; path_expr is declared first and defined at the bottom through BOOST_SPIRIT_DEFINE.
namespace grammar {
// Bring in the spirit parsers
using x3::lexeme;
using x3::alpha;
using x3::alnum;
using x3::ascii::char_;
using x3::ascii::string;
using x3::lit;
using x3::ascii::digit;
using x3::int_;
using x3::uint_;
using x3::double_;
// as<T>(p): wraps parser p in an unnamed rule whose attribute type is T.
template<typename T>
auto as = [](auto p) { return x3::rule<struct _, T>{} = as_parser(p); };
// str_(lit): an x3::string parser for the given C string.
auto str_ = [](const char* lit) { return x3::string(lit); };
// Declared ahead of its definition because union_expr (below) already refers to it.
x3::rule<class path_expr, ast::path_expression> path_expr = "path-expr";
// ncname: one or more characters other than ':', with skipping disabled by lexeme.
// NOTE(review): only ':' is excluded, so whitespace, '/', '$' etc. appear to be consumed too -- confirm.
auto ncname = x3::rule<class ncname, std::string>{"ncname"}
= x3::lexeme[+(char_ - ':')]
;
// qname: either ncname ':' ncname, or a single ncname; both collected into one string.
auto qname = x3::rule<class qname, std::string>{"qname"}
= as<std::string>(ncname >> char_(':') >> ncname)
| as<std::string>(ncname)
;
// union_expr: path expressions separated by '/'.
auto union_expr = x3::rule<class union_expr, ast::union_expression>{"union-expr"}
= path_expr % '/'
;
// unary_expr: an optional '-' followed by a union expression.
auto unary_expr = x3::rule<class unary_expr, ast::unary_expression>{"unary-expr"}
= -x3::lit('-') >> union_expr
;
// equality_expr: a unary expression followed by any number of ("=" | "!=") unary_expr entries.
auto equality_expr = x3::rule<class eq_expr, ast::eq_expression>{"equality-expr"}
= unary_expr
>> *(as<ast::operation_sequence_entry<ast::unary_expression>>
( (str_("=") | str_("!=")) > unary_expr )
)
;
// and_expr: an equality expression followed by any number of "and" equality_expr entries.
auto and_expr = x3::rule<class and_expr, ast::and_expression>{"and-expr"}
= equality_expr
>> *(as<ast::operation_sequence_entry<ast::eq_expression>>
( str_("and") > equality_expr )
)
;
// or_expr: an and-expression followed by any number of "or" and_expr entries.
// Unlike the two rules above, the entry uses >> rather than > after the operator.
auto or_expr = x3::rule<class or_expr, ast::or_expression>{"or-expr"}
= and_expr
>> *(as<ast::operation_sequence_entry<ast::and_expression>>
( str_("or") >> and_expr )
)
;
auto function_name = as<std::string>(qname);
// function_arg is not referenced by any other rule in this listing.
auto function_arg = or_expr;
// function_call: name '(' comma-separated or-expressions ')'.
auto function_call = x3::rule<class func_call, ast::function_call>{"func-call"}
= function_name > '(' > (or_expr % ',') > ')'
;
// prim_expr: "$" qname, a double-quoted string, a parenthesized or-expression, a number,
// or a function call.
// NOTE(review): int_ is tried before uint_ and double_, so the later numeric alternatives look
// unreachable for inputs that begin with an integer -- confirm.
auto prim_expr = x3::rule<class prim_expr, ast::primary_expression>{"prim-expr"}
= ('$' > qname)
| ('"' > *(char_ - '"') > '"')
| ('(' > or_expr > ')')
| (int_ | uint_ | double_)
| function_call
;
// predicate: an or-expression enclosed in square brackets.
auto predicate = '[' > or_expr > ']';
// filter_expr: a primary expression followed by any number of predicates.
auto filter_expr = x3::rule<class filter_expr, ast::filter_expression>{"filter-expr"}
= prim_expr >> *(predicate)
;
// Definition of path_expr: an optional filter expression, an optional "/" or "//",
// and an optional primary expression.
// NOTE(review): "/" is tried before "//", so the "//" alternative looks unreachable -- confirm.
auto path_expr_def = -(filter_expr) >> -(lit("/") | lit("//")) >> -(prim_expr);
BOOST_SPIRIT_DEFINE (path_expr);
}
// Parses a fixed sample expression with grammar::or_expr, skipping whitespace.
// Returns 0 when the whole input was parsed, 1 otherwise.
int main() {
    using x3::space;
    using grammar::or_expr;

    ast::or_expression oexpr;
    const std::string input = "$ab/$cd or $ef";

    // Keep the iterators: phrase_parse advances `first`, and it reports success even
    // when it stops early, so a full match additionally requires first == last.
    auto first = input.begin();
    const auto last = input.end();
    const bool matched = phrase_parse(first, last, or_expr, space, oexpr);

    if (!matched || first != last) {
        std::cout << "Parsing failed miserably!\n";
        return 1;
    }
    return 0;
}
Compiling it as
g++ -std=c++14 -ftemplate-depth=1024 -o rec_ex rec_ex.cc
Compiler : Clang 3.8
Boost Version : 1.59
Based on the template instantiation errors reported at a lower template depth, I am pretty sure that deep recursion is going on somewhere.
Is there any way to optimize the above grammar so that it does not cause this issue?
Thanks.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Boost.Spirit X3 compile time explodes with recursive rule - c++

Related

boost x3 grammar for structs with multiple constructors

Cleanest way to handle both quoted and unquoted strings in Spirit.X3

Parsing text including Unicode using spirit crashes with boost-1.78, but OK with boost-1.67, Why?

Boost Spirit X3: Collapsing one-element lists

Optimizing the grammar

Categories

Resources