Update a parser to admit parentheses within quoted strings - c++
I need to update a parser to support the following new features, but I have not been able to get all of them working at the same time:
The commands must admit an indeterminate number of parameters (> 0).
Parameters might be numbers, unquoted strings or quoted strings.
Parameters are separated by commas.
Within quoted strings, it shall be permitted to use opening/closing parenthesis.
(It is easier to understand these requirements by looking at the source code example.)
My current code, including checks, is as follows:
Godbolt link: https://godbolt.org/z/5d6o53n9h
#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
namespace script
{
    /// One parsed script line: its classification plus its textual arguments.
    struct Command
    {
        /// Kind of script line. Enumerator order is significant (values 0..6).
        enum Type
        {
            NONE,
            WRITE_LOG,
            INSERT_LABEL,
            START_PROCESS,
            END_PROCESS,
            COMMENT,
            FAIL
        };

        Type type = NONE;               ///< Defaults to NONE.
        std::vector<std::string> args;  ///< Arguments exactly as stored by the parser.
    };

    /// A whole script: one Command per line, in source order.
    using Commands = std::vector<Command>;
} // namespace script
// Adapt script::Command as a Boost.Fusion sequence (type, args) so that Spirit
// rules declared with a Command() attribute can fill its members in that order.
BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)
namespace script
{
namespace qi = boost::spirit::qi;
// Spirit.Qi grammar that parses a whole script into Commands, one per line.
// Every line is tried, in order, as: blank line (none), known command,
// "//" comment, and finally a catch-all (fail) that consumes the rest of the line.
template <typename It>
class Parser : public qi::grammar<It, Commands()>
{
private:
// Keyword table: maps the command name as written in the script to its Command::Type.
qi::symbols<char, Command::Type> type;
// Per-line rules; all of them run under a blank skipper (qi::blank_type).
qi::rule<It, Command(), qi::blank_type> none, command, comment, fail;//By its very nature "fail" must be the last one to be checked
// Entry rule: declared without a skipper; it installs one itself via skip(blank).
qi::rule<It, Commands()> start;
public:
// Builds the keyword table and defines all rules.
Parser() : Parser::base_type(start)
{
using namespace qi;//NOTE: "as_string" is necessary in all args due to std::vector<std::string>
// Reusable "empty argument list" attribute; copy() keeps the expression by value
// so that it can be stored in an auto variable and used in several rules.
auto empty_args = copy(attr(std::vector<std::string>{}));
type.add
("WriteLog", Command::WRITE_LOG)
("InsertLabel", Command::INSERT_LABEL)
("StartProcess", Command::START_PROCESS)
("EndProcess", Command::END_PROCESS);
// Blank line: optional blanks followed by end-of-line or end-of-input.
// The & predicate only looks ahead, so the line terminator is not consumed here.
none = omit[*blank] >> &(eol | eoi)
>> attr(Command::NONE)
>> empty_args;//ignore args
// Known command: keyword '(' arg {',' arg} ')'.
// An argument is any non-empty run of characters other than '(' ')' ',' CR LF.
// Quotes get no special treatment, so they stay part of the argument text and
// an argument can never contain parentheses or commas.
command = type >> '('
>> as_string[lexeme[+~char_("(),\r\n")]] % ',' >> ')';
// Comment line: everything after "//" up to the end of the line becomes the single argument.
comment = lit("//")
>> attr(Command::COMMENT)
>> as_string[lexeme[*~char_("\r\n")]];
// Catch-all: swallow the rest of the line and report it as FAIL with no arguments.
fail = omit[*~char_("\r\n")]
>> attr(Command::FAIL)
>> empty_args;//ignore args
// Whole input: lines separated by eol, and the entire input must be consumed (eoi).
start = skip(blank)[(none | command | comment | fail) % eol] >> eoi;
}
};
/// Parses the complete contents of `in` into a list of commands.
///
/// The stream is switched to std::noskipws so that whitespace reaches the
/// grammar unchanged.
///
/// @param in  stream holding the script text
/// @return    the parsed commands
/// @throws std::runtime_error ("command parse error") if the grammar does not match
Commands parse(std::istream& in)
{
    using Iterator = boost::spirit::istream_iterator;
    static const Parser<Iterator> grammar; // constructed once, reused by every call

    in >> std::noskipws; // No white space skipping
    Iterator cursor(in);
    Iterator finish;

    Commands result;
    if (qi::parse(cursor, finish, grammar, result))
        return result;

    throw std::runtime_error("command parse error");
}
}//namespace script
// Sample script fed to script::parse() by main().
// It is written line by line with explicit "\n" so that the two blank lines are
// visible: the checks in main() expect nine entries, with NONE at indices 1 and 5
// and a final NONE produced by the trailing newline.
std::stringstream ss{
    "// just a comment\n"                                  // 0: COMMENT
    "\n"                                                   // 1: NONE (blank line)
    "WriteLog(\"this is a log\")\n"                        // 2: WRITE_LOG
    "WriteLog(\"this is also (in another way) a log\")\n"  // 3: WRITE_LOG
    "WriteLog(\"but this is just a fail)\n"                // 4: FAIL (unterminated quote)
    "\n"                                                   // 5: NONE (blank line)
    "StartProcess(17, \"program.exe\", True)\n"            // 6: START_PROCESS
    "StartProcess(17, \"this_is_a_fail.exe, True)\n"       // 7: FAIL (unterminated quote)
};                                                         // 8: NONE (empty text after the last newline)
int main()
{
using namespace script;
try
{
auto commands = script::parse(ss);
std::array args{ 0, 0, 1, 1, -1, 0, 3, -1, 0 };//Fails may have any number of arguments. It doesn't care. Sets as -1 by convenience flag
std::array types{ Command::COMMENT, Command::NONE, Command::WRITE_LOG, Command::WRITE_LOG, Command::FAIL, Command::NONE, Command::START_PROCESS, Command::FAIL, Command::NONE };
std::cout << std::boolalpha << "size correct? " << (commands.size() == 9) << std::endl;
std::cout << "types correct? " << std::equal(commands.begin(), commands.end(), types.begin(), types.end(), [](auto& cmd, auto& type) { return cmd.type == type; }) << std::endl;
std::cout << "arguments correct? " << std::equal(commands.begin(), commands.end(), args.begin(), args.end(), [](auto& cmd, auto arg) { return cmd.args.size() == arg || arg == -1; }) << std::endl;
}
catch (std::exception const& e)
{
std::cout << e.what() << "\n";
}
}
Any help with this will be appreciated.
You say you want to allow parentheses within quoted strings. But you don't even support quoted strings!
So the problem is your argument rule, which doesn't even exist. It would be roughly this part:
argument = +~char_("(),\r\n");
command = type >> '(' >> argument % ',' >> ')';
Where argument might be declared as
qi::rule<It, Argument()> argument;
In fact, rewriting the tests in an organized fashion, here's what we get right now:
Live On Compiler Explorer
static const Commands expected{
{Command::COMMENT, {"just a comment"}},
{Command::NONE, {}},
{Command::WRITE_LOG, {"this is a log"}},
{Command::WRITE_LOG, {"this is also (in another way) a log"}},
{Command::FAIL, {}},
{Command::NONE, {}},
{Command::START_PROCESS, {"17", "program.exe", "True"}},
{Command::FAIL, {}},
{Command::NONE, {}},
};
try {
auto parsed = script::parse(ss);
fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
(parsed == expected), parsed.size(), expected.size());
for (auto i = 0u; i < std::min(expected.size(), parsed.size()); ++i) {
if (expected[i] != parsed[i]) {
fmt::print("index #{} expected {}\n"
" actual: {}\n",
i, expected[i], parsed[i]);
} else {
fmt::print("index #{} CORRECT ({})\n", i, parsed[i]);
}
}
} catch (std::exception const& e) {
fmt::print("Exception: {}\n", e.what());
}
Prints
Parsed all correct? false -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 expected Command(WRITE_LOG, ["this is a log"])
actual: Command(WRITE_LOG, ["\"this is a log\""])
index #3 expected Command(WRITE_LOG, ["this is also (in another way) a log"])
actual: Command(FAIL, [])
index #4 expected Command(FAIL, [])
actual: Command(WRITE_LOG, ["\"but this is just a fail"])
index #5 CORRECT (Command(NONE, []))
index #6 expected Command(START_PROCESS, ["17", "program.exe", "True"])
actual: Command(START_PROCESS, ["17", "\"program.exe\"", "True"])
index #7 expected Command(FAIL, [])
actual: Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])
index #8 CORRECT (Command(NONE, []))
As you can see, it fails on quoted strings too, at least by my expectation. That's because quoting is a language construct. In the AST (the parsed results) you do not care about how exactly something was written in code. E.g. "hello\ world\041" might be equivalent to "hello world!", so both should result in the argument value hello world!.
So, let's do as we say:
argument = quoted_string | number | boolean | raw_string;
We can add a few rules:
// notice these are lexemes (no internal skipping):
qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;
And define them:
quoted_string = '"' >> *~char_('"') >> '"';
number = raw[double_];
boolean = raw[bool_];
raw_string = +~char_("(),\r\n");
argument = quoted_string | number | boolean | raw_string;
If you want to allow escaped quotes, use something like this:
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
Now, I'd say you probably want Argument to be something like variant<double, std::string, bool>, instead of just std::string.
With only this change, all the problems have practically vanished: Live On Compiler Explorer:
Parsed all correct? false -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
index #4 CORRECT (Command(FAIL, []))
index #5 CORRECT (Command(NONE, []))
index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
index #7 expected Command(FAIL, [])
actual: Command(START_PROCESS, ["17", "this_is_a_fail.exe, True)\n\"this_is_a_fail.exe", "True"])
index #8 CORRECT (Command(NONE, []))
Now, index #7 looks very funky, but it's actually a well-known phenomenon in Spirit¹. Enabling BOOST_SPIRIT_DEBUG demonstrates it:
<argument>
<try>"this_is_a_fail.exe,</try>
<quoted_string>
<try>"this_is_a_fail.exe,</try>
<fail/>
</quoted_string>
<number>
<try>"this_is_a_fail.exe,</try>
<fail/>
</number>
<boolean>
<try>"this_is_a_fail.exe,</try>
<fail/>
</boolean>
<raw_string>
<try>"this_is_a_fail.exe,</try>
<success>, True)</success>
<attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,, , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
</raw_string>
<success>, True)</success>
<attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,, , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
</argument>
So, the string gets accepted as a raw string, even though it started with ". That's easily fixed, but we don't even need to. We could just apply qi::hold to avoid the duplication:
argument = qi::hold[quoted_string] | number | boolean | raw_string;
Result:
actual: Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])
However, if you expect it to fail, fix that other problem:
raw_string = +~char_("\"(),\r\n"); // note the \"
Note: On the off chance that you really only require it to not start with
a quote:
raw_string = !lit('"') >> +~char_("(),\r\n");
I guess by now you see the problem with a "loose rule" like that, so I
don't recommend it.
You could express the requirement another way though, saying: "if an
argument starts with '"' then it MUST be a quoted_string". Use
an expectation point there:
quoted_string = '"' > *('\\' >> char_ | ~char_('"')) > '"';
This has the effect that failure to parse a complete quoted_string
will throw an expectation_failed exception.
Summary / Listing
This is what we end up with:
Live On Compiler Explorer
//#define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <fmt/ranges.h>
namespace script {

    /// A single argument, kept as text.
    using Argument = std::string;
    /// The argument list of one command.
    using Arguments = std::vector<Argument>;

    /// One parsed script line: its classification plus its arguments.
    struct Command {
        /// Kind of script line. Enumerator order is significant (values 0..6).
        enum Type { NONE, WRITE_LOG, INSERT_LABEL, START_PROCESS, END_PROCESS, COMMENT, FAIL };

        Type type = NONE; ///< Defaults to NONE.
        Arguments args;

        /// Member-wise comparison (type first, then args); also provides == and !=.
        auto operator<=>(const Command&) const = default;
    };

    /// A whole script: one Command per line, in source order.
    using Commands = std::vector<Command>;

} // namespace script
// Adapt script::Command as a Boost.Fusion sequence (type, args) so that Spirit
// rules declared with a Command() attribute can fill its members in that order.
BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)
namespace script {
namespace qi = boost::spirit::qi;
/// Spirit.Qi grammar that parses a whole script into Commands, one per line.
///
/// Every line is tried, in order, as: blank line (none), known command,
/// "//" comment, and finally a catch-all (fail) that consumes the rest of the
/// line. Arguments may be quoted strings (which can contain parentheses and
/// commas), numbers, booleans or unquoted text.
template <typename It> class Parser : public qi::grammar<It, Commands()> {
  public:
    Parser() : Parser::base_type(start) {
        using namespace qi;

        // Keyword table: command name as written in the script -> Command::Type.
        type.add                                     //
            ("WriteLog", Command::WRITE_LOG)         //
            ("InsertLabel", Command::INSERT_LABEL)   //
            ("StartProcess", Command::START_PROCESS) //
            ("EndProcess", Command::END_PROCESS);    //

        // Blank line: optional blanks followed by end-of-line or end-of-input.
        // The & predicate only looks ahead, so the terminator is not consumed here.
        none = omit[*blank] >> &(eol | eoi) //
            >> attr(Command{Command::NONE, {}});

        // "..." with backslash escapes; the quotes and the escaping backslash
        // are not part of the resulting argument text.
        quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
        // raw[] keeps the matched source text instead of the converted value.
        number  = raw[double_];
        boolean = raw[bool_];
        // Unquoted text: may not contain a quote, parenthesis, comma or line break.
        raw_string = +~char_("\"(),\r\n");
        // hold[] discards what a failed quoted_string already appended, so the
        // following alternatives start from a clean attribute.
        argument = qi::hold[quoted_string] | number | boolean | raw_string;

        // Known command: keyword '(' argument {',' argument} ')'.
        command = type >> '(' >> argument % ',' >> ')';

        // Comment line: the text after "//" becomes the single argument;
        // as_string makes it one std::string element of args.
        comment = "//"                             //
            >> attr(Command::COMMENT)              //
            >> as_string[lexeme[*~char_("\r\n")]]; //

        // Catch-all: swallow the rest of the line and report FAIL with no arguments.
        fail = omit[*~char_("\r\n")] >> attr(Command{Command::FAIL, {}});

        line = none | command | comment | fail; // keep fail last

        // Whole input: lines separated by eol; the entire input must be consumed.
        start = skip(blank)[line % eol] >> eoi;

        BOOST_SPIRIT_DEBUG_NODES((start)(line)(fail)(comment)(command)(
            argument)(none)(quoted_string)(raw_string)(boolean)(number))
    }

  private:
    qi::symbols<char, Command::Type> type;
    qi::rule<It, Command(), qi::blank_type> line, none, command, comment, fail;
    // notice these are lexemes (no internal skipping):
    qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;
    qi::rule<It, Commands()> start;
};
/// Parses the complete contents of `in` into a list of commands.
///
/// The stream is switched to std::noskipws so that whitespace reaches the
/// grammar unchanged.
///
/// @param in  stream holding the script text
/// @return    the parsed commands
/// @throws std::runtime_error ("command parse error") if the grammar does not match
Commands parse(std::istream& in)
{
    using Iterator = boost::spirit::istream_iterator;
    static const Parser<Iterator> grammar; // constructed once, reused by every call

    Commands result;
    Iterator cursor{in >> std::noskipws};
    bool const matched = qi::parse(cursor, Iterator{}, grammar, result);
    if (!matched)
        throw std::runtime_error("command parse error");
    return result;
}
struct Formatter {
static constexpr auto name(script::Command::Type type) {
return std::array{"NONE", "WRITE_LOG", "INSERT_LABEL",
"START_PROCESS", "END_PROCESS", "COMMENT",
"FAIL"}
.at(static_cast<int>(type));
}
auto parse(auto& ctx) const { return ctx.begin(); }
auto format(script::Command const& cmd, auto& ctx) const {
return format_to(ctx.out(), "Command({}, {})", name(cmd.type), cmd.args);
}
};
} // namespace script
// Make script::Command printable with {fmt}: the specialisation inherits its
// parse()/format() implementation from script::Formatter.
template <> struct fmt::formatter<script::Command> : script::Formatter {};
// Sample script fed to script::parse() by main().
// It is written line by line with explicit "\n" so that the two blank lines are
// visible: `expected` in main() lists nine entries, with NONE at indices 1 and 5
// and a final NONE produced by the trailing newline.
std::stringstream ss{
    "// just a comment\n"                                  // 0: COMMENT
    "\n"                                                   // 1: NONE (blank line)
    "WriteLog(\"this is a log\")\n"                        // 2: WRITE_LOG
    "WriteLog(\"this is also (in another way) a log\")\n"  // 3: WRITE_LOG
    "WriteLog(\"but this is just a fail)\n"                // 4: FAIL (unterminated quote)
    "\n"                                                   // 5: NONE (blank line)
    "StartProcess(17, \"program.exe\", True)\n"            // 6: START_PROCESS
    "StartProcess(17, \"this_is_a_fail.exe, True)\n"       // 7: FAIL (unterminated quote)
};                                                         // 8: NONE (empty text after the last newline)
int main() {
using namespace script;
static const Commands expected{
{Command::COMMENT, {"just a comment"}},
{Command::NONE, {}},
{Command::WRITE_LOG, {"this is a log"}},
{Command::WRITE_LOG, {"this is also (in another way) a log"}},
{Command::FAIL, {}},
{Command::NONE, {}},
{Command::START_PROCESS, {"17", "program.exe", "True"}},
{Command::FAIL, {}},
{Command::NONE, {}},
};
try {
auto parsed = script::parse(ss);
fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
(parsed == expected), parsed.size(), expected.size());
for (auto i = 0u; i < std::min(expected.size(), parsed.size()); ++i) {
if (expected[i] != parsed[i]) {
fmt::print("index #{} expected {}\n"
" actual: {}\n",
i, expected[i], parsed[i]);
} else {
fmt::print("index #{} CORRECT ({})\n", i, parsed[i]);
}
}
} catch (std::exception const& e) {
fmt::print("Exception: {}\n", e.what());
}
}
Prints
Parsed all correct? true -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
index #4 CORRECT (Command(FAIL, []))
index #5 CORRECT (Command(NONE, []))
index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
index #7 CORRECT (Command(FAIL, []))
index #8 CORRECT (Command(NONE, []))
¹ see e.g. boost::spirit alternative parsers return duplicates (which links to three more of the same kind)
Related
Boost spirit core dump on parsing bracketed expression
I have a simplified grammar that should parse a sequence of terminal literals: id, '<', '>' and ":action". I need to allow brackets '(' ')' that do nothing but improve readability. (The full example is at http://coliru.stacked-crooked.com/a/dca93f5c8f37a889 ) Snippet of my grammar: start = expression % eol; expression = (simple_def >> -expression) | (qi::lit('(') > expression > ')'); simple_def = qi::lit('<') [qi::_val = Command::left] | qi::lit('>') [qi::_val = Command::right] | key [qi::_val = Command::id] | qi::lit(":action") [qi::_val = Command::action] ; key = +qi::char_("a-zA-Z_0-9"); When I try to parse: const std::string s = "(a1 > :action)"; everything works like a charm. But when I add a little more complexity with brackets, "(a1 (>) :action)", I get a core dump. Just for information: the core dump happens on coliru, while the MSVC-compiled example merely reports a parse failure. So my questions are: (1) what's wrong with the brackets, and (2) how exactly can brackets be introduced into the expression? P.S. This is a simplified grammar; in reality I have a more complicated case, but this is a minimal reproducible example.
You should just handle the expectation failure: terminate called after throwing an instance of 'boost::wrapexcept<boost::spirit::qi::expectation_failure<__gnu_cxx::__normal_iterator<char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >' what(): boost::spirit::qi::expectation_failure Aborted (core dumped) If you handle the expectation failure, the program will not have to terminate. Fixing The Grammar Your 'nested expression' rule only accepts a single expression. I think that expression = (simple_def >> -expression) is intended to match "1 or more `simple_def`". However, the alternative branch: | ('(' > expression > ')'); doesn't accept the same: it just stops after parsing `)`. This means that your input is simply invalid according to the grammar. I suggest a simplification by expressing intent. You were on the right path with semantic typedefs. Let's avoid the "weasely" Line Of Lines (what even is that?): using Id = std::string; using Line = std::vector<Command>; using Script = std::vector<Line>; And use these typedefs consistently. Now, we can express the grammar as we "think" about it: start = skip(blank)[script]; script = line % eol; line = +simple; simple = group | command; group = '(' > line > ')'; See, by simplifying our mental model and sticking to it, we avoided the entire problem you had a hard time spotting. 
Here's a quick demo that includes error handling, optional debug output, both test cases and encapsulating the skipper as it is part of the grammar: Live On Compiler Explorer #include <fmt/ranges.h> #include <fmt/ostream.h> #include <boost/spirit/include/qi.hpp> #include <boost/spirit/include/phoenix.hpp> namespace qi = boost::spirit::qi; namespace phx = boost::phoenix; enum class Command { id, left, right, action }; static inline std::ostream& operator<<(std::ostream& os, Command cmd) { switch (cmd) { case Command::id: return os << "[ID]"; case Command::left: return os << "[LEFT]"; case Command::right: return os << "[RIGHT]"; case Command::action: return os << "[ACTION]"; } return os << "[???]"; } using Id = std::string; using Line = std::vector<Command>; using Script = std::vector<Line>; template <typename It> struct ExprGrammar : qi::grammar<It, Script()> { ExprGrammar() : ExprGrammar::base_type(start) { using namespace qi; start = skip(blank)[script]; script = line % eol; line = +simple; simple = group | command; group = '(' > line > ')'; command = lit('<') [ _val = Command::left ] | lit('>') [ _val = Command::right ] | key [ _val = Command::id ] | lit(":action") [ _val = Command::action ] ; key = +char_("a-zA-Z_0-9"); BOOST_SPIRIT_DEBUG_NODES((command)(line)(simple)(group)(script)(key)); } private: qi::rule<It, Script()> start; qi::rule<It, Line(), qi::blank_type> line, simple, group; qi::rule<It, Script(), qi::blank_type> script; qi::rule<It, Command(), qi::blank_type> command; // lexemes qi::rule<It, Id()> key; }; int main() { using It = std::string::const_iterator; ExprGrammar<It> const p; for (const std::string s : { "a1 > :action\na1 (>) :action", "(a1 > :action)\n(a1 (>) :action)", "a1 (> :action)", }) { It f(begin(s)), l(end(s)); try { Script parsed; bool ok = qi::parse(f, l, p, parsed); if (ok) { fmt::print("Parsed {}\n", parsed); } else { fmt::print("Parsed failed\n"); } if (f != l) { fmt::print("Remaining unparsed: '{}'\n", std::string(f, l)); } } 
catch (qi::expectation_failure<It> const& ef) { fmt::print("{}\n", ef.what()); // TODO add more details :) } } } Prints Parsed {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}} Parsed {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}} Parsed {{[ID], [RIGHT], [ACTION]}} BONUS However, I think this can all be greatly simplified using qi::symbols for the commands. In fact it looks like you're only tokenizing (you confirm this when you say that the parentheses are not important). line = +simple; simple = group | command | (omit[key] >> attr(Command::id)); group = '(' > line > ')'; key = +char_("a-zA-Z_0-9"); Now you don't need Phoenix at all: Live On Compiler Explorer, printing ok? true {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}} ok? true {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}} ok? true {{[ID], [RIGHT], [ACTION]}} Even Simpler? Since I observe that you're basically tokenizing line-wise, why not simply skip the parentheses, and simplify all the way down to: script = line % eol; line = *(command | omit[key] >> attr(Command::id)); That's all. 
See it Live On Compiler Explorer again: #include <boost/spirit/include/qi.hpp> #include <fmt/ostream.h> #include <fmt/ranges.h> namespace qi = boost::spirit::qi; enum class Command { id, left, right, action }; using Id = std::string; using Line = std::vector<Command>; using Script = std::vector<Line>; static inline std::ostream& operator<<(std::ostream& os, Command cmd) { return os << (std::array{"ID", "LEFT", "RIGHT", "ACTION"}.at(int(cmd))); } template <typename It> struct ExprGrammar : qi::grammar<It, Script()> { ExprGrammar() : ExprGrammar::base_type(start) { using namespace qi; start = skip(skipper.alias())[line % eol]; line = *(command | omit[key] >> attr(Command::id)); key = +char_("a-zA-Z_0-9"); BOOST_SPIRIT_DEBUG_NODES((line)(key)); } private: using Skipper = qi::rule<It>; qi::rule<It, Script()> start; qi::rule<It, Line(), Skipper> line; Skipper skipper = qi::char_(" \t\b\f()"); qi::rule<It /*, Id()*/> key; // omit attribute for efficiency struct cmdsym : qi::symbols<char, Command> { cmdsym() { this->add("<", Command::left) (">", Command::right) (":action", Command::action); } } command; }; int main() { using It = std::string::const_iterator; ExprGrammar<It> const p; for (const std::string s : { "a1 > :action\na1 (>) :action", "(a1 > :action)\n(a1 (>) :action)", "a1 (> :action)", }) try { It f(begin(s)), l(end(s)); Script parsed; bool ok = qi::parse(f, l, p, parsed); fmt::print("ok? {} {}\n", ok, parsed); if (f != l) fmt::print(" -- Remaining '{}'\n", std::string(f, l)); } catch (qi::expectation_failure<It> const& ef) { fmt::print("{}\n", ef.what()); // TODO add more details :) } } Prints ok? true {{ID, RIGHT, ACTION}, {ID, RIGHT, ACTION}} ok? true {{ID, RIGHT, ACTION}, {ID, RIGHT, ACTION}} ok? true {{ID, RIGHT, ACTION}} Note I very subtly changed +() to *() so it would accept empty lines as well. This may or may not be what you want
How to capture the value parsed by a boost::spirit::x3 parser to be used within the body of a semantic action?
I have a parser for string literals, and I'd like to attach a semantic action to the parser that will manipulate the parsed value. It seems that boost::spirit::x3::_val() returns a reference to the parsed value when given the context, but for some reason the parsed string always enters the body of the semantic action as just an empty string, which obviously makes it difficult to read from it. It is the right string though, I've made sure by checking the addresses. Anyone know how I could have a reference to the parsed value within the semantic action attached to the parser? This here is the parser I currently use: x3::lexeme[quote > *("\\\"" >> x3::attr('\"') | ~x3::char_(quote)) > quote] And I'd like to add the semantic action to the end of it. Thank you in advance! EDIT: it seems that whenever I attach any semantic action in general to the parser, the value is nullified. I suppose the question now is how could I access the value before that happens? I just need to be able to manipulate the parsed string before it is given to the AST.
In X3, semantic actions are much simpler. They're unary callables that take just the context. Then you use free functions to extract information from the context: x3::_val(ctx) is like qi::_val x3::_attr(ctx) is like qi::_0 (or qi::_1 for simple parsers) x3::_pass(ctx) is like qi::_pass So, to get your semantic action, you could do: auto qstring = x3::rule<struct rule_type, std::string> {"qstring"} = x3::lexeme[quote > *("\\" >> x3::char_(quote) | ~x3::char_(quote)) > quote] ; Now to make a very odd string rule that reverses the text (after de-escaping) and requires the number of characters to be an odd-number: auto odd_reverse = [](auto& ctx) { auto& attr = x3::_attr(ctx); auto& val = x3::_val(ctx); x3::traits::move_to(attr, val); std::reverse(val.begin(), val.end()); x3::_pass(ctx) = val.size() % 2 == 0; }; auto odd_string = x3::rule<struct odd_type, std::string> {"odd_string"} = qstring [ odd_reverse ] ; DEMO Live On Coliru #include <boost/spirit/home/x3.hpp> #include <iostream> #include <iomanip> int main() { namespace x3 = boost::spirit::x3; auto constexpr quote = '"'; auto qstring = x3::rule<struct rule_type, std::string> {"qstring"} = x3::lexeme[quote > *("\\" >> x3::char_(quote) | ~x3::char_(quote)) > quote] ; auto odd_reverse = [](auto& ctx) { auto& attr = x3::_attr(ctx); auto& val = x3::_val(ctx); x3::traits::move_to(attr, val); std::reverse(val.begin(), val.end()); x3::_pass(ctx) = val.size() % 2 == 0; }; auto odd_string = x3::rule<struct odd_type, std::string> {"odd_string"} = qstring [ odd_reverse ] ; for (std::string const input : { R"("test \"hello\" world")", R"("test \"hello\" world!")", }) { std::string output; auto f = begin(input), l = end(input); if (x3::phrase_parse(f, l, odd_string, x3::blank, output)) { std::cout << "[" << output << "]\n"; } else { std::cout << "Failed\n"; } if (f != l) { std::cout << "Remaining unparsed: " << std::quoted(std::string(f,l)) << "\n"; } } } Printing [dlrow "olleh" tset] Failed Remaining unparsed: "\"test 
\\\"hello\\\" world!\"" UPDATE To the added question: EDIT: it seems that whenever I attach any semantic action in general to the parser, the value is nullified. I suppose the question now is how could I access the value before that happens? I just need to be able to manipulate the parsed string before it is given to the AST. Yes, if you attach an action, automatic attribute propagation is inhibited. This is the same in Qi, where you could assign rules with %= instead of = to force automatic attribute propagation. To get the same effect in X3, use the third template argument to x3::rule: x3::rule<X, T, true> to indicate you want automatic propagation. Really, try not to fight the system. In practice, the automatic transformation system is way more sophisticated than I am willing to re-discover on my own, so I usually post-process the whole AST or at most apply some minor tweaks in an action. See also Boost Spirit: "Semantic actions are evil"?
Regular expression to validate syntax of fields in any order, with acceptable values
Consider the following situation: We want to use a regular expression to validate the syntax of a command with X number of fields - one mandatory, two optional. The three fields can be shown in any order, with any number of spaces separating them, and have limited dictionaries of acceptable values. Mandatory Field: "-foo" Optional Field 1: Can be either of "-handle" "-bar" or "-mustache" Optional Field 2: Can be either of "-meow" "-mix" or "-want" Examples of valid inputs: -foo -foo -bar -foo-want -foo -meow-bar -foo-mix-mustache -handle -foo-meow -mustache-foo -mustache -mix -foo -want-foo -want-meow-foo -want-foo-meow Examples of invalid inputs: woof -handle-meow -ha-foondle meow -foobar stackoverflow - handle -foo -mix -handle -mix -foo -handle -bar -foo -handle -mix -sodium I guess you can say, there are three capture groups, with the first being mandatory and the last two being optional: (\-foo){1} (\-handle|\-bar|\-mustache)? (\-meow|\-mix|\-want)? But I'm not sure how to write it so that these can be in any order, possibly separated by any amount of spaces, and with nothing else. What I have so far is three forward-looking capture groups: (% signs indicating stuff to be completed) ^(?=.*?(foo))(?=.*?(\-handle|\-bar|\-mustache))(?=.*?(\-meow|\-mix|\-want))%Verify that group 1 is present once, optional groups 2 and 3 zero or one times, in any order, with any spaces%$ Adding a new capture group is simple enough, or expanding the acceptable inputs for an existing group, but I'm definitely stumped on the backreferencing, and not quite sure how expanding the checks to accommodate a 4th group would affect the backreferencing. Or would it make more sense to just use something like boost::split or boost::tokenize on the "-" character, then iterate through them, counting the tokens that fit into group 1, 2, 3, and "none of the above," and verifying the counts? It seems like it should be a simple extension or application of a boost library.
You mention boost. Have you looked at program_options? http://www.boost.org/doc/libs/1_55_0/doc/html/program_options/tutorial.html
Indeed, a context-free grammar would be fine. Let's parse your command into a structure like: struct Command { std::string one, two, three; }; Now, when we adapt that as a fusion sequence, we can write a Spirit Qi grammar for it and enjoy automagic attribute propagation: CommandParser() : CommandParser::base_type(start) { using namespace qi; command = field(Ref(&f1)) ^ field(Ref(&f2)) ^ field(Ref(&f3)); field = '-' >> raw[lazy(*_r1)]; f1 += "foo"; f2 += "handle", "bar", "mustache"; f3 += "meow", "mix", "want"; start = skip(blank) [ command >> eoi ] >> eps(is_valid(_val)); } Here, everything is straight-forward: the permutation parser (operator^) allows all three fields in any order. f1, f2, f3 are the accepted symbols (Options, below) for the respective fields. The start rule, finally, adds the skipping of blanks, and checks at the end (have we reached eoi? is the mandatory field present?). Live Demo Live On Coliru #include <boost/fusion/adapted/struct.hpp> struct Command { std::string one, two, three; }; BOOST_FUSION_ADAPT_STRUCT(Command, one, two, three) #include <boost/spirit/include/qi.hpp> #include <boost/spirit/include/phoenix.hpp> namespace qi = boost::spirit::qi; template <typename It> struct CommandParser : qi::grammar<It, Command()> { CommandParser() : CommandParser::base_type(start) { using namespace qi; command = field(Ref(&f1)) ^ field(Ref(&f2)) ^ field(Ref(&f3)); field = '-' >> raw[lazy(*_r1)]; f1 += "foo"; f2 += "handle", "bar", "mustache"; f3 += "meow", "mix", "want"; start = skip(blank) [ command >> eoi ] >> eps(is_valid(_val)); } private: // mandatory field check struct is_valid_f { bool operator()(Command const& cmd) const { return cmd.one.size(); } }; boost::phoenix::function<is_valid_f> is_valid; // rules and skippers using Options = qi::symbols<char>; using Ref = Options const*; using Skipper = qi::blank_type; qi::rule<It, Command()> start; qi::rule<It, Command(), Skipper> command; qi::rule<It, std::string(Ref)> field; // option values Options 
f1, f2, f3; }; boost::optional<Command> parse(std::string const& input) { using It = std::string::const_iterator; Command cmd; bool ok = parse(input.begin(), input.end(), CommandParser<It>{}, cmd); return boost::make_optional(ok, cmd); } #include <iomanip> void run_test(std::string const& input, bool expect_valid) { auto result = parse(input); std::cout << (expect_valid == !!result?"PASS":"FAIL") << "\t" << std::quoted(input) << "\n"; if (result) { using boost::fusion::operator<<; std::cout << " --> Parsed: " << *result << "\n"; } } int main() { char const* valid[] = { "-foo", "-foo -bar", "-foo-want", "-foo -meow-bar", "-foo-mix-mustache", "-handle -foo-meow", "-mustache-foo", "-mustache -mix -foo", "-want-foo", "-want-meow-foo", "-want-foo-meow", }; char const* invalid[] = { "woof", "-handle-meow", "-ha-foondle", "meow", "-foobar", "stackoverflow", "- handle -foo -mix", "-handle -mix", "-foo -handle -bar", "-foo -handle -mix -sodium", }; std::cout << " === Positive test cases:\n"; for (auto test : valid) run_test(test, true); std::cout << " === Negative test cases:\n"; for (auto test : invalid) run_test(test, false); } Prints === Positive test cases: PASS "-foo" --> Parsed: (foo ) PASS "-foo -bar" --> Parsed: (foo bar ) PASS "-foo-want" --> Parsed: (foo want) PASS "-foo -meow-bar" --> Parsed: (foo bar meow) PASS "-foo-mix-mustache" --> Parsed: (foo mustache mix) PASS "-handle -foo-meow" --> Parsed: (foo handle meow) PASS "-mustache-foo" --> Parsed: (foo mustache ) PASS "-mustache -mix -foo" --> Parsed: (foo mustache mix) PASS "-want-foo" --> Parsed: (foo want) FAIL "-want-meow-foo" FAIL "-want-foo-meow" === Negative test cases: PASS "woof" PASS "-handle-meow" PASS "-ha-foondle" PASS "meow" PASS "-foobar" PASS "stackoverflow" PASS "- handle -foo -mix" PASS "-handle -mix" PASS "-foo -handle -bar" PASS "-foo -handle -mix -sodium"
This is a brute force solution which should work for fairly simple cases. The idea is to build up a regular expression out of all the permutations of the order in which these capture groups can appear. In the test data there are only 6 permutations. Obviously this method could get unwieldy pretty quickly. // Build all the permutations into a regex. std::regex const e{[]{ std::string e; char const* grps[] = { "\\s*(-foo)", "\\s*(-handle|-bar|-mustache)?", "\\s*(-meow|-mix|-want)?", }; // initial permutation std::sort(std::begin(grps), std::end(grps)); auto sep = ""; do { e = e + sep + "(?:"; for(auto const* g: grps) e += g; e += ")"; sep = "|"; // separate each permutation with | } while(std::next_permutation(std::begin(grps), std::end(grps))); return e; }(), std::regex_constants::optimize}; // Do some tests std::vector<std::string> const tests = { "-foo", "-foo -bar", "-foo-want", "-foo -meow-bar", "-foo-mix-mustache", "-handle -foo-meow", "-mustache-foo", "-mustache -mix -foo", "-want-foo", "-want-meow-foo", "-want-foo-meow", "woof", "-handle-meow", "-ha-foondle", "meow", "-foobar", "stackoverflow", "- handle -foo -mix", "-handle -mix", "-foo -handle -bar", "-foo -handle -mix -sodium", }; std::smatch m; for(auto const& test: tests) { if(!std::regex_match(test, m, e)) { std::cout << "Invalid: " << test << '\n'; continue; } std::cout << "Valid: " << test << '\n'; }
in boost::spirit::lex, it takes longest time to do first parsing, following parsing will be much shorter
I feed a series of text into my sip parser.the first one takes the longest time, no matter which is the first one.I wonder if there is any initialization work when spirit::lex do the first parsing? template <typename Lexer> struct sip_token : lex::lexer<Lexer> { sip_token() { this->self.add_pattern ("KSIP", "sip:") ("KSIPS", "sips:") ("USERINFO", "[0-9a-zA-Z-_.!~*'()]+(:[0-9a-zA-Z-_.!~*'()&=+$,]*)?#") ("DOMAINLBL", "([0-9a-zA-Z]|([0-9a-zA-Z][0-9a-zA-Z-]*[0-9a-zA-Z]))") ("TOPLBL", "[a-zA-Z]|([a-zA-Z][0-9a-zA-Z-]*[0-9a-zA-Z-])") ("INVITE", "INVITE") ("ACK", "ACK") ("OPTIONS", "OPTIONS") ("BYE", "BYE") ("CANCEL", "CANCEL") ("REGISTER", "REGISTER") ("METHOD", "({INVITE}|{ACK}|{OPTIONS}|{BYE}|{CANCEL}|{REGISTER})") ("SIPVERSION", "SIP\\/[0-9]\\.[0-9]") ("PROTOCOAL", "SIP\\/[^/]+\\/UDP") ("IPV4ADDR", "(\\d{1,3}\\.){3}\\d{1,3}") ("HOSTNAME", "[^ \t\r\n]+") ("SIPURL", "{KSIP}{USERINFO}?{HOSTNAME}(:[0-9]+)?") ("SIPSURL", "{KSIPS}{USERINFO}?{HOSTNAME}(:[0-9]+)?") ("SENTBY", "({HOSTNAME}|{IPV4ADDR})(:[0-9]+)?") ("GENPARM", "[^ ;\\n]+=[^ ;\r\\n]+") ("TOKEN", "[0-9a-zA-Z-.!%*_+~`']+") ("NAMEADDR", "({TOKEN} )?<({SIPURL}|{SIPSURL})>") ("STATUSCODE", "\\d{3}") ("REASONPHRASE", "[0-9a-zA-Z-_.!~*'()&=+$,]*") ("CR", "\\r") ("LF", "\\n") ; this->self.add ("{METHOD} {SIPURL} {SIPVERSION}", T_REQ_LINE) ("{SIPVERSION} {STATUSCODE} {REASONPHRASE}", T_STAT_LINE) ("{CR}?{LF}", T_CRLF) ("Via: {PROTOCOAL} {SENTBY}(;{GENPARM})*", T_VIA) ("To: {NAMEADDR}(;{GENPARM})*", T_TO) ("From: {NAMEADDR}(;{GENPARM})*", T_FROM) ("[0-9a-zA-Z -_.!~*'()&=+$,;/?:#]+", T_OTHER) ; } }; grammar: template <typename Iterator> struct sip_grammar : qi::grammar<Iterator> { template <typename TokenDef> sip_grammar(TokenDef const& tok) : sip_grammar::base_type(start) { using boost::phoenix::ref; using boost::phoenix::size; using boost::spirit::qi::eol; start = request | response; response = stat_line >> *(msg_header) >> qi::token(T_CRLF); request = req_line >> *(msg_header) >> qi::token(T_CRLF); stat_line = 
qi::token(T_STAT_LINE) >> qi::token(T_CRLF); req_line = qi::token(T_REQ_LINE) >> qi::token(T_CRLF); msg_header = (qi::token(T_VIA) | qi::token(T_TO) | qi::token(T_FROM) | qi::token(T_OTHER)) >> qi::token(T_CRLF); } std::size_t c, w, l; qi::rule<Iterator> start, response, request, stat_line, req_line, msg_header; }; timing: gettimeofday(&t1, NULL); bool r = lex::tokenize_and_parse(first, last, siplexer, g); gettimeofday(&t2, NULL); result: pkt1 time=40945(us) pkt2 time=140 pkt3 time=60 pkt4 time=74 pkt5 time=58 pkt6 time=51
Clearly, it does :) Lex will likely generate a DFA (one for each Lexer state, maybe). This is most likely the thing that takes the most time. Use a profiler to be certain :/ Now, you can make sure the tables are initialized before first use, or use the Static Lexer Model to prevent the startup cost. This means you'll write an 'extra' main to generate the DFA as C++ code: #include <boost/spirit/include/lex_lexertl.hpp> #include <boost/spirit/include/lex_generate_static_lexertl.hpp> #include <fstream> #include "sip_token.hpp" using namespace boost::spirit; int main(int argc, char* argv[]) { // create the lexer object instance needed to invoke the generator sip_token<lex::lexertl::lexer<> > my_lexer; // the token definition std::ofstream out(argc < 2 ? "sip_token_static.hpp" : argv[1]); // invoke the generator, passing the token definition, the output stream // and the name suffix of the tables and functions to be generated // // The suffix "sip" used below results in a type lexertl::static_::lexer_sip // to be generated, which needs to be passed as a template parameter to the // lexertl::static_lexer template (see word_count_static.cpp). return lex::lexertl::generate_static_dfa(my_lexer, out, "sip") ? 0 : -1; } An example of the code generated is here (in the word-count example from the tutorial): http://www.boost.org/doc/libs/1_54_0/libs/spirit/example/lex/static_lexer/word_count_static.hpp
Parsing a SQL INSERT with Boost Spirit Classic
I'm trying to learn Boost Spirit and as an exercise, I've tried to parse a SQL INSERT statement using Boost Spirit Classic. This is the string I'm trying to parse: INSERT INTO example_tab (cola, colb, colc, cold) VALUES (vala, valb, valc, vald); From this SELECT example I've created this little grammar: struct microsql_grammar : public grammar<microsql_grammar> { template <typename ScannerT> struct definition { definition(microsql_grammar const& self) { keywords = "insert", "into", "values"; chlit<> LPAREN('('); chlit<> RPAREN(')'); chlit<> SEMI(';'); chlit<> COMMA(','); typedef inhibit_case<strlit<> > token_t; token_t INSERT = as_lower_d["insert"]; token_t INTO = as_lower_d["into"]; token_t VALUES = as_lower_d["values"]; identifier = nocase_d [ lexeme_d [ (alpha_p >> *(alnum_p | '_')) ] ]; string_literal = lexeme_d [ ch_p('\'') >> +( anychar_p - ch_p('\'') ) >> ch_p('\'') ]; program = +(query); query = insert_into_clause >> SEMI; insert_into_clause = insert_clause >> into_clause; insert_clause = INSERT >> INTO >> identifier >> LPAREN >> var_list_clause >> RPAREN; into_clause = VALUES >> LPAREN >> var_list_clause >> RPAREN; var_list_clause = list_p( identifier, COMMA ); } rule<ScannerT> const& start() const { return program; } symbols<> keywords; rule<ScannerT> identifier, string_literal, program, query, insert_into_clause, insert_clause, into_clause, var_list_clause; }; }; Using a minimal to test it: void test_it(const string& my_example) { microsql_grammar g; if (!parse(example.c_str(), g, space_p).full) { // point a - FAIL throw new exception(); } // point b - OK } Unfortunately it always enters the point A and throws the exception. Since I'm new to this, I have no idea where my error lies. I have two questions: What's the proper way to debug parsing errors when using Boost Spirit? Why parsing fails in this example?
To get visibility into what is failing to parse, assign the result of parse to a parse_info<>, then log/examine the parse_info<>::stop field, which in this case should be a const char * pointing at the last byte of your input string that matched your grammar. microsql_grammar g; parse_info<std::string::const_iterator> result = parse(example.begin(), example.end(), g, space_p) if (!result.full) { std::string parsed(example.begin(), result.stop); std::cout << parsed << std::endl; // point a - FAIL } // point b - OK Apologies if this doesn't compile, but should be a starting point.