How to overcome a Boost Spirit AST snafu - c++

For starters, I have an AST in which I have to do a forward declaration, but apparently this is not exactly kosher in the latest C++ compilers?
Overcoming this I can work through the rest of the grammar, I believe. For reference, I am writing the parser more or less faithfully to the Google Protobuf v2 specification.
If memory serves, this has something to do with perhaps introducing a type def? And/or Boost Spirit recursive descent, i.e. recursive_wrapper? But it's been a little while, I'm a bit fuzzy on those details. Would someone mind taking a look?
But for the forward declaration issue I think the posted code is mostly grammar complete. TBD are Protobuf service, rpc, stream, and, of course, comments.
There may be a couple of variant gremlins lurking in there as well I'm not sure what to do with; i.e. how to synthesize a "nil" or empty_statement, for instance, pops up a couple of times throughout the grammatical alternatives.

How does one end up with such a vast body of untested code? I suppose it makes sense to look at a minimized version of this code from scratch and stop at the earliest point it stops working, instead of postponing sanity checks until it's become unmanageable.¹
I'm going to point you at some places where you can see what to do.
Recursive using declaration with boost variant
C++ Mutually Recursive Variant Type (Again)
I have to warn I don't think std::variant or std::optional are supported yet by Qi. I could be wrong.
Review And Fixup Round
I spent entirely too much time trying to fix the many issues, subtle and not so subtle.
I'll be happy to explain a bit, but for now I'm just dropping the result:
Live On Coliru
#define BOOST_SPIRIT_DEBUG
#include <iostream>
#include <string>
#include <vector>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/qi_auto.hpp>
//#include <boost/container/vector.hpp>
namespace AST {
using boost::variant;
using boost::optional;
enum class bool_t { false_, true_ };
enum class syntax_t { proto2 };
using str_t = std::string;
struct full_id_t {
std::string full_id;
};
using int_t = intmax_t;
using float_t = double;
/// See: http://www.boost.org/doc/libs/1_68_0/libs/spirit/example/qi/compiler_tutorial/calc8/ast.hpp
/// Specifically, struct nil {}.
struct empty_statement_t {};
// TODO: TBD: we may need/want to dissect this one still further... i.e. to ident, message/enum-name, etc.
struct element_type_t : std::string {
using std::string::string;
using std::string::operator=;
};
// TODO: TBD: let's not get too fancy with the inheritance, ...
// TODO: TBD: however, scanning the other types, we could potentially do more of it, strategically, here and there
struct msg_type_t : element_type_t {};
struct enum_type_t : element_type_t {};
struct package_t {
std::string full_id;
};
using const_t = variant<full_id_t, int_t, float_t, str_t, bool_t>;
struct import_modifier_t {
std::string val;
};
struct import_t {
optional<import_modifier_t> mod;
std::string target_name;
};
struct option_t {
std::string name;
const_t val;
};
using label_t = std::string;
using type_t = variant<std::string, msg_type_t, enum_type_t>;
// TODO: TBD: could potentially get more meta-dissected based on the specification:
struct field_opt_t {
std::string name;
const_t val;
};
struct field_t {
label_t label; // this would benefit from being an enum instead
type_t type;
std::string name;
int_t number;
std::vector<field_opt_t> opts;
};
// TODO: TBD: add extend_t after msg_t ...
struct field_t;
struct enum_t;
struct msg_t;
struct extend_t;
struct extensions_t;
struct group_t;
struct option_t;
struct oneof_t;
struct map_field_t;
struct reserved_t;
using msg_body_t = std::vector<variant<
field_t,
enum_t,
msg_t,
extend_t,
extensions_t,
group_t,
option_t,
oneof_t,
map_field_t,
reserved_t,
empty_statement_t
>>;
struct group_t {
label_t label;
std::string name;
int_t number;
msg_body_t body;
};
struct oneof_field_t {
type_t type;
std::string name;
int_t number;
optional<std::vector<field_opt_t>> opts;
};
struct oneof_t {
std::string name;
std::vector<variant<oneof_field_t, empty_statement_t>> choices;
};
struct key_type_t {
std::string val;
};
struct map_field_t {
key_type_t key_type;
type_t type;
std::string name;
int_t number;
optional<std::vector<field_opt_t>> opts;
};
struct range_t {
int_t min;
optional<int_t> max;
};
struct extensions_t {
std::vector<range_t> ranges;
};
struct reserved_t {
variant<std::vector<range_t>, std::vector<std::string>> val;
};
struct enum_val_opt_t {
std::string name;
const_t val;
};
struct enum_field_t {
std::string name;
std::string ordinal;
std::vector<enum_val_opt_t> opt; // consistency
};
using enum_body_t = std::vector<variant<option_t, enum_field_t, empty_statement_t> >;
struct enum_t {
std::string name;
enum_body_t body;
};
struct msg_t {
std::string name;
// TODO: TBD: here is another case where forward declaration is necessary in terms of the AST definition.
msg_body_t body;
};
struct extend_t {
using content_t = variant<field_t, group_t, empty_statement_t>;
// TODO: TBD: actually, this use case may beg the question whether
// "message type", et al, in some way deserve a first class definition?
msg_type_t msg_type;
std::vector<content_t> content;
};
struct top_level_def_t {
// TODO: TBD: may add svc_t after extend_t ...
variant<msg_t, enum_t, extend_t> content;
};
struct proto_t {
syntax_t syntax;
std::vector<variant<import_t, package_t, option_t, top_level_def_t, empty_statement_t>> content;
};
template <typename T>
static inline std::ostream& operator<<(std::ostream& os, T const&) {
std::operator<<(os, "[");
std::operator<<(os, typeid(T).name());
std::operator<<(os, "]");
return os;
}
}
BOOST_FUSION_ADAPT_STRUCT(AST::option_t, name, val)
BOOST_FUSION_ADAPT_STRUCT(AST::full_id_t, full_id)
BOOST_FUSION_ADAPT_STRUCT(AST::package_t, full_id)
BOOST_FUSION_ADAPT_STRUCT(AST::import_modifier_t, val)
BOOST_FUSION_ADAPT_STRUCT(AST::import_t, mod, target_name)
BOOST_FUSION_ADAPT_STRUCT(AST::field_opt_t, name, val)
BOOST_FUSION_ADAPT_STRUCT(AST::field_t, label, type, name, number, opts)
BOOST_FUSION_ADAPT_STRUCT(AST::group_t, label, name, number, body)
BOOST_FUSION_ADAPT_STRUCT(AST::oneof_field_t, type, name, number, opts)
BOOST_FUSION_ADAPT_STRUCT(AST::oneof_t, name, choices)
BOOST_FUSION_ADAPT_STRUCT(AST::key_type_t, val)
BOOST_FUSION_ADAPT_STRUCT(AST::map_field_t, key_type, type, name, number, opts)
BOOST_FUSION_ADAPT_STRUCT(AST::range_t, min, max)
BOOST_FUSION_ADAPT_STRUCT(AST::extensions_t, ranges)
BOOST_FUSION_ADAPT_STRUCT(AST::reserved_t, val)
BOOST_FUSION_ADAPT_STRUCT(AST::enum_val_opt_t, name, val)
BOOST_FUSION_ADAPT_STRUCT(AST::enum_field_t, name, ordinal, opt)
BOOST_FUSION_ADAPT_STRUCT(AST::enum_t, name, body)
BOOST_FUSION_ADAPT_STRUCT(AST::msg_t, name, body)
BOOST_FUSION_ADAPT_STRUCT(AST::extend_t, msg_type, content)
BOOST_FUSION_ADAPT_STRUCT(AST::top_level_def_t, content)
BOOST_FUSION_ADAPT_STRUCT(AST::proto_t, syntax, content)
namespace qi = boost::spirit::qi;
template<typename It>
struct ProtoGrammar : qi::grammar<It, AST::proto_t()> {
using char_rule_type = qi::rule<It, char()>;
using string_rule_type = qi::rule<It, std::string()>;
using skipper_type = qi::space_type;
ProtoGrammar() : ProtoGrammar::base_type(start) {
using qi::lit;
using qi::digit;
using qi::lexeme; // redundant, because no rule declares a skipper
using qi::char_;
// Identifiers
id = lexeme[qi::alpha >> *char_("A-Za-z0-9_")];
full_id = id;
msg_name = id;
enum_name = id;
field_name = id;
oneof_name = id;
map_name = id;
service_name = id;
rpc_name = id;
stream_name = id;
// These distincions aren't very useful until in the semantic analysis
// stage. I'd suggest to not conflate that with parsing.
msg_type = qi::as_string[ -char_('.') >> *(qi::hold[id >> char_('.')]) >> msg_name ];
enum_type = qi::as_string[ -char_('.') >> *(qi::hold[id >> char_('.')]) >> enum_name ];
// group_name = lexeme[qi::upper >> *char_("A-Za-z0-9_")];
// simpler:
group_name = &qi::upper >> id;
// Integer literals
oct_lit = &char_('0') >> qi::uint_parser<AST::int_t, 8>{};
hex_lit = qi::no_case["0x"] >> qi::uint_parser<AST::int_t, 16>{};
dec_lit = qi::uint_parser<AST::int_t, 10>{};
int_lit = lexeme[hex_lit | oct_lit | dec_lit]; // ordering is important
// Floating-point literals
float_lit = qi::real_parser<double, qi::strict_real_policies<double> >{};
// String literals
oct_esc = '\\' >> qi::uint_parser<unsigned char, 8, 3, 3>{};
hex_esc = qi::no_case["\\x"] >> qi::uint_parser<unsigned char, 16, 2, 2>{};
// The last bit in this phrase is literally, "Or Any Characters Not in the Sequence" (fixed)
char_val = hex_esc | oct_esc | char_esc | ~char_("\0\n\\");
str_lit = lexeme["'" >> *(char_val - "'") >> "'"]
| lexeme['"' >> *(char_val - '"') >> '"']
;
// Empty Statement - likely redundant
empty_statement = ';' >> qi::attr(AST::empty_statement_t{});
// Constant
const_
= bool_lit
| str_lit
| float_lit // again, ordering is important
| int_lit
| full_id
;
// keyword helper
#define KW(p) (lexeme[(p) >> !(qi::alnum | '_')])
// Syntax
syntax = KW("syntax") >> '=' >> lexeme[ lit("'proto2'") | "\"proto2\"" ] >> ';' >> qi::attr(AST::syntax_t::proto2);
// Import Statement
import_modifier = KW("weak") | KW("public");
import = KW("import") >> -import_modifier >> str_lit >> ';';
// Package
package = KW("package") >> full_id >> ';';
// Option
opt_name = qi::raw[ (id | '(' >> full_id >> ')') >> *('.' >> id) ];
opt = KW("option") >> opt_name >> '=' >> const_ >> ';';
// Fields
field_num = int_lit;
label = KW("required")
| KW("optional")
| KW("repeated")
;
type
= KW(builtin_type)
| msg_type
| enum_type
;
// Normal field
field_opt = opt_name >> '=' >> const_;
field_opts = -('[' >> field_opt % ',' >> ']');
field = label >> type >> field_name >> '=' >> field_num >> field_opts >> ';';
// Group field
group = label >> KW("group") >> group_name >> '=' >> field_num >> msg_body;
// Oneof and oneof field
oneof_field = type >> field_name >> '=' >> field_num >> field_opts >> ';';
oneof = KW("oneof") >> oneof_name >> '{'
>> *(
oneof_field
// TODO: TBD: ditto how to handle "empty" not synthesizing any attributes ...
| empty_statement
) >> '}';
// Map field
key_type = KW(builtin_type);
// mapField = "map" "<" keyType "," type ">" mapName "=" fieldNumber [ "[" fieldOptions "]" ] ";"
map_field = KW("map") >> '<' >> key_type >> ',' >> type >> '>' >> map_name
>> '=' >> field_num >> field_opts >> ';';
// Extensions and Reserved, Extensions ...
range = int_lit >> -(KW("to") >> (int_lit | KW("max")));
ranges = range % ',';
extensions = KW("extensions") >> ranges >> ';';
// Reserved
reserved = KW("reserved") >> (ranges | field_names) >> ';';
field_names = field_name % ',';
// Enum definition
enum_val_opt = opt_name >> '=' >> const_;
enum_val_opts = -('[' >> (enum_val_opt % ',') >> ']');
enum_field = id >> '=' >> int_lit >> enum_val_opts >> ';';
enum_body = '{' >> *(opt | enum_field | empty_statement) >> '}';
enum_ = KW("enum") >> enum_name >> enum_body;
// Message definition
msg = KW("message") >> msg_name >> msg_body;
msg_body = '{' >> *(
field
| enum_
| msg
| extend
| extensions
| group
| opt
| oneof
| map_field
| reserved
//// TODO: TBD: how to "include" an empty statement ... ? "empty" does not synthesize anything, right?
| empty_statement
) >> '}';
// Extend
extend_content = field | group | empty_statement;
extend_contents = '{' >> *extend_content >> '}';
extend = KW("extend") >> msg_type >> extend_contents;
top_level_def = msg | enum_ | extend /*| service*/;
proto = syntax >> *(import | package | opt | top_level_def | empty_statement);
start = qi::skip(qi::space) [ proto ];
BOOST_SPIRIT_DEBUG_NODES(
(id) (full_id) (msg_name) (enum_name) (field_name) (oneof_name)
(map_name) (service_name) (rpc_name) (stream_name) (group_name)
(msg_type) (enum_type)
(oct_lit) (hex_lit) (dec_lit) (int_lit)
(float_lit)
(oct_esc) (hex_esc) (char_val) (str_lit)
(empty_statement)
(const_)
(syntax)
(import_modifier) (import)
(package)
(opt_name) (opt)
(field_num)
(label)
(type)
(field_opt) (field_opts) (field)
(group)
(oneof_field)
(oneof)
(key_type) (map_field)
(range) (ranges) (extensions) (reserved)
(field_names)
(enum_val_opt) (enum_val_opts) (enum_field) (enum_body) (enum_)
(msg) (msg_body)
(extend_content) (extend_contents) (extend)
(top_level_def) (proto))
}
private:
struct escapes_t : qi::symbols<char, char> {
escapes_t() { this->add
("\\a", '\a')
("\\b", '\b')
("\\f", '\f')
("\\n", '\n')
("\\r", '\r')
("\\t", '\t')
("\\v", '\v')
("\\\\", '\\')
("\\'", '\'')
("\\\"", '"');
}
} char_esc;
string_rule_type id, full_id, msg_name, enum_name, field_name, oneof_name,
map_name, service_name, rpc_name, stream_name, group_name;
qi::rule<It, AST::msg_type_t(), skipper_type> msg_type;
qi::rule<It, AST::enum_type_t(), skipper_type> enum_type;
qi::rule<It, AST::int_t()> int_lit, dec_lit, oct_lit, hex_lit;
qi::rule<It, AST::float_t()> float_lit;
/// true | false
struct bool_lit_t : qi::symbols<char, AST::bool_t> {
bool_lit_t() { this->add
("true", AST::bool_t::true_)
("false", AST::bool_t::false_);
}
} bool_lit;
char_rule_type oct_esc, hex_esc, char_val;
qi::rule<It, AST::str_t()> str_lit;
// TODO: TBD: there are moments when this is a case in a variant or vector<variant>
qi::rule<It, AST::empty_statement_t(), skipper_type> empty_statement;
qi::rule<It, AST::const_t(), skipper_type> const_;
/// syntax = {'proto2' | "proto2"} ;
qi::rule<It, AST::syntax_t(), skipper_type> syntax;
/// import [weak|public] <targetName/> ;
qi::rule<It, AST::import_t(), skipper_type> import;
qi::rule<It, AST::import_modifier_t(), skipper_type> import_modifier;
/// package <fullIdent/> ;
qi::rule<It, AST::package_t(), skipper_type> package;
/// option <optionName/> = <const/> ;
qi::rule<It, AST::option_t(), skipper_type> opt;
/// <ident/> | "(" <fullIdent/> ")" ("." <ident/>)*
string_rule_type opt_name;
qi::rule<It, AST::label_t(), skipper_type> label;
qi::rule<It, AST::type_t(), skipper_type> type;
struct builtin_type_t : qi::symbols<char, std::string> {
builtin_type_t() { this->add
("double", "double")
("float", "float")
("int32", "int32")
("int64", "int64")
("uint32", "uint32")
("uint64", "uint64")
("sint32", "sint32")
("sint64", "sint64")
("fixed32", "fixed32")
("fixed64", "fixed64")
("sfixed32", "sfixed32")
("sfixed64", "sfixed64")
("bool", "bool")
("string", "string")
("bytes", "bytes");
}
} builtin_type;
qi::rule<It, AST::int_t()> field_num;
qi::rule<It, AST::field_opt_t(), skipper_type> field_opt;
qi::rule<It, std::vector<AST::field_opt_t>(), skipper_type> field_opts;
qi::rule<It, AST::field_t(), skipper_type> field;
qi::rule<It, AST::group_t(), skipper_type> group;
qi::rule<It, AST::oneof_t(), skipper_type> oneof;
qi::rule<It, AST::oneof_field_t(), skipper_type> oneof_field;
qi::rule<It, AST::key_type_t(), skipper_type> key_type;
qi::rule<It, AST::map_field_t(), skipper_type> map_field;
/// <int/> [ to ( <int/> | "max" ) ]
qi::rule<It, AST::range_t(), skipper_type> range;
qi::rule<It, std::vector<AST::range_t>(), skipper_type> ranges;
/// extensions <ranges/> ;
qi::rule<It, AST::extensions_t(), skipper_type> extensions;
/// reserved <ranges/>|<fieldNames/> ;
qi::rule<It, AST::reserved_t(), skipper_type> reserved;
qi::rule<It, std::vector<std::string>(), skipper_type> field_names;
/// <optionName/> = <constant/>
qi::rule<It, AST::enum_val_opt_t(), skipper_type> enum_val_opt;
qi::rule<It, std::vector<AST::enum_val_opt_t>(), skipper_type> enum_val_opts;
/// <ident/> = <int/> [ +<enumValueOption/> ] ;
qi::rule<It, AST::enum_field_t(), skipper_type> enum_field;
qi::rule<It, AST::enum_body_t(), skipper_type> enum_body;
qi::rule<It, AST::enum_t(), skipper_type> enum_;
// TODO: TBD: continue here: https://developers.google.com/protocol-buffers/docs/reference/proto2-spec#message_definition
/// message <messageName/> <messageBody/>
qi::rule<It, AST::msg_t(), skipper_type> msg;
/// *{ <field/> | <enum/> | <message/> | <extend/> | <extensions/> | <group/>
/// | <option/> | <oneof/> | <mapField/> | <reserved/> | <emptyStatement/> }
qi::rule<It, AST::msg_body_t(), skipper_type> msg_body;
// TODO: TBD: not sure how appropriate it would be to reach these cases, but we'll see what happens...
/// extend <messageType/> *{ <field/> | <group/> | <emptyStatement/> }
qi::rule<It, AST::extend_t::content_t(), skipper_type> extend_content;
qi::rule<It, std::vector<AST::extend_t::content_t>(), skipper_type> extend_contents;
qi::rule<It, AST::extend_t(), skipper_type> extend;
// TODO: TBD: ditto comments in the rule definition section.
// service; rpc; stream;
/// topLevelDef = <message/> | <enum/> | <extend/> | <service/>
qi::rule<It, AST::top_level_def_t(), skipper_type> top_level_def;
/// <syntax/> { <import/> | <package/> | <option/> | <option/> | <emptyStatement/> }
qi::rule<It, AST::proto_t(), skipper_type> proto;
qi::rule<It, AST::proto_t()> start;
};
#include <fstream>
int main() {
std::ifstream ifs("sample.proto");
std::string const input(std::istreambuf_iterator<char>(ifs), {});
using It = std::string::const_iterator;
It f = input.begin(), l = input.end();
ProtoGrammar<It> const g;
AST::proto_t parsed;
bool ok = qi::parse(f, l, g, parsed);
if (ok) {
std::cout << "Parse succeeded\n";
} else {
std::cout << "Parse failed\n";
}
if (f != l) {
std::cout << "Remaining unparsed input: '" << std::string(f,l) << "'\n";
}
}
Which for a sample input of
syntax = "proto2";
import "demo_stuff.proto";
package StackOverflow;
message Sample {
optional StuffMsg foo_list = 1;
optional StuffMsg bar_list = 2;
optional StuffMsg qux_list = 3;
}
message TransportResult {
message Sentinel {}
oneof Chunk {
Sample payload = 1;
Sentinel end_of_stream = 2;
}
}
message ShowTime {
optional uint32 magic = 1 [ default = 0xBDF69E88 ];
repeated string parameters = 2;
optional string version_info = 3;
}
Prints
<proto>
<try>syntax = "proto2";\ni</try>
<syntax>
<try>syntax = "proto2";\ni</try>
<success>\nimport "demo_stuff.</success>
<attributes>[[N3AST8syntax_tE]]</attributes>
</syntax>
<import>
<try>\nimport "demo_stuff.</try>
<import_modifier>
<try> "demo_stuff.proto";</try>
<fail/>
</import_modifier>
<str_lit>
<try>"demo_stuff.proto";\n</try>
[ ...
much
snipped
... ]
<empty_statement>
<try>\n\n</try>
<fail/>
</empty_statement>
<success>\n\n</success>
<attributes>[[[N3AST8syntax_tE], [[[empty], [d, e, m, o, _, s, t, u, f, f, ., p, r, o, t, o]], [[S, t, a, c, k, O, v, e, r, f, l, o, w]], [[[S, a, m, p, l, e], [[[], [S, t, u, f, f, M, s, g], [f, o, o, _, l, i, s, t], 1, []], [[], [S, t, u, f, f, M, s, g], [b, a, r, _, l, i, s, t], 2, []], [[], [S, t, u, f, f, M, s, g], [q, u, x, _, l, i, s, t], 3, []]]]], [[[T, r, a, n, s, p, o, r, t, R, e, s, u, l, t], [[[S, e, n, t, i, n, e, l], []], [[C, h, u, n, k], [[[S, a, m, p, l, e], [p, a, y, l, o, a, d], 1, []], [[S, e, n, t, i, n, e, l], [e, n, d, _, o, f, _, s, t, r, e, a, m], 2, []]]]]]], [[[S, h, o, w, T, i, m, e], [[[], [u, i, n, t, 3, 2], [m, a, g, i, c], 1, [[[d, e, f, a, u, l, t], 3187056264]]], [[], [s, t, r, i, n, g], [p, a, r, a, m, e, t, e, r, s], 2, []], [[], [s, t, r, i, n, g], [v, e, r, s, i, o, n, _, i, n, f, o], 3, []]]]]]]]</attributes>
</proto>
Parse succeeded
Remaining unparsed input: '
'
¹ (Conflating "recursive descent" (a parsing concept) with recursive variants is confusing too).
² Sadly it exceeds the capacity of both Wandbox and Coliru

I'll just summarize a couple of key points I observed digesting. First, wow, some of it I do not think is documented, in Spirit Qi pages, etc, unless you happened to have heard about it through little birds, etc. That to say, thanks so much for the insights!
Interesting, transformed things directly to language level whenever possible. For instance, bool_t, deriving directly from std::string, and even syntax_t, to name a few. Did not think one could even do that from a parser/AST perspective, but it makes sense.
Very interesting, deriving from std::string. As above, did not know that.
struct element_type_t : std::string {
using std::string::string;
using std::string::operator=;
};
In particular, with emphasis on string and operator=, I'm assuming to help the parser rules, attribute propagation, etc.
Yes, I wondered about support for std::optional and std::variant, but it would make sense considering Boost.Spirit maturity. Good points re: leveraging boost constructs of the same in lieu of std.
Did not know you could define aliases. It would make sense instead of defining a first class struct. For instance,
using const_t = variant<full_id_t, int_t, float_t, str_t, bool_t>;
Interesting label_t aliasing. Although I may pursue that being a language level enum with corresponding rule attribution. Still, up-vote for so much effort here.
using label_t = std::string;
Then the forward declaration and alias of the problem area, msg_body_t. INTERESTING I had no idea. Really.
struct field_t;
struct enum_t;
struct msg_t;
struct extend_t;
struct extensions_t;
struct group_t;
struct option_t;
struct oneof_t;
struct map_field_t;
struct reserved_t;
using msg_body_t = std::vector<variant<
field_t,
enum_t,
msg_t,
extend_t,
extensions_t,
group_t,
option_t,
oneof_t,
map_field_t,
reserved_t,
empty_statement_t
>>;
Still, I'm not sure how that avoids the C++ C2079 (VS2017) forward declaration issue? I'll have to double check in my project code, but it ran for you, obviously, so something about that must be more kosher than I am thinking.
BOOST_FUSION_ADAPT_STRUCT(AST::option_t, name, val)
// etc ...
And which simplifies the struct adaptation significantly, I imagine.
Eventually, yes, I would want a skipper involved. I had not quite gotten that far yet when I stumbled on the forward declaration issue.
using skipper_type = qi::space_type;
// ...
start = qi::skip(qi::space) [ proto ];
// ...
qi::rule<It, AST::msg_type_t(), skipper_type> msg_type;
For many of the rule definitions, = or %=? The prevailing wisdom I've heard over the years is to prefer %=. Your thoughts? i.e.
id = lexeme[qi::alpha >> *char_("A-Za-z0-9_")];
// ^, or:
id %= lexeme[qi::alpha >> *char_("A-Za-z0-9_")];
// ^^ ?
Makes sense for these to land in an language friendly AST attribution:
oct_lit = &char_('0') >> qi::uint_parser<AST::int_t, 8>{};
hex_lit = qi::no_case["0x"] >> qi::uint_parser<AST::int_t, 16>{};
dec_lit = qi::uint_parser<AST::int_t, 10>{};
int_lit = lexeme[hex_lit | oct_lit | dec_lit]; // ordering is important
// Yes, I understand why, because 0x... | 0... | dig -> that to say, great point!
I did not spend as much time as I should, perhaps, exploring the bits exposed by Qi here, i.e. qi::upper, etc, but it's a great point:
group_name = &qi::upper >> id;
Did not know this operator was a thing for char_. I do not think it is documented, however, unless you happened to have heard it from little birdies:
// Again, great points re: numerical/parser ordering.
char_val = hex_esc | oct_esc | char_esc | ~char_("\0\n\\");
// ^
Not sure what you mean here, "likely redundant". However, it is VERY interesting that you can produce the attribution here. I like that a lot.
// Empty Statement - likely redundant
empty_statement = ';' >> qi::attr(AST::empty_statement_t{});
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If you mean in terms of whether the semi-colon is redundant, I thought so as well, at first. Then I studied the rest of the grammar, and rather than second guess things, no, I agree with the grammar: "empty statement" really is an empty statement, at least when you accept it in the context of the grammatical alternatives. Sometimes, however, the semi-colon does indicate what you and I thought it was: that is, "end of statement" or eos, which caused me to raise an eyebrow at first as well.
I also did not know you could attribute things directly during a Qi rule. In fact, the general guidance I'd been considering was to avoid semantic actions. But I suppose these are different animals than qi::attr(...) per se.
Nice approach here. Plus it lends consistency to the rule definitions. Cannot up-vote enough for this one, among others.
#define KW(p) (lexeme[(p) >> !(qi::alnum | '_')])
Here I am considering language level enumerated values, but it is interesting nontheless.
label = KW("required")
| KW("optional")
| KW("repeated")
;
Here, the long and the short of it is, fewer rules involved. It is a bit messier in terms of all the strings, etc, but I like that it more or less reads one-for-one with the grammar to inform the definition.
// mapField = "map" "<" keyType "," type ">" mapName "=" fieldNumber [ "[" fieldOptions "]" ] ";"
map_field = KW("map") >> '<' >> key_type >> ',' >> type >> '>' >> map_name
>> '=' >> field_num >> field_opts >> ';';
I wondered about whether Qi symbols might be useful, but I had no idea these bits would be that useful:
struct escapes_t : qi::symbols<char, char> {
escapes_t() { this->add
("\\a", '\a')
("\\b", '\b')
("\\f", '\f')
("\\n", '\n')
("\\r", '\r')
("\\t", '\t')
("\\v", '\v')
("\\\\", '\\')
("\\'", '\'')
("\\\"", '"');
}
} char_esc;
Ditto symbols, up-vote:
struct builtin_type_t : qi::symbols<char, std::string> { /* ... */ };
In summary, very impressed here. Thank you so much for the insights.

There was a slight oversight in the ranges I think. Referring to the proto2 Extensions specification, literally we have:
range = intLit [ "to" ( intLit | "max" ) ]
Then adjusting in the AST:
enum range_max_t { max };
struct range_t {
int_t min;
boost::optional<boost::variant<int_t, range_max_t>> max;
};
And last but not least in the grammar:
range %= int_lit >> -(KW("to") >> (int_lit | KW_ATTR("max", ast::range_max_t::max)));
With the helper:
#define KW_ATTR(p, a) (qi::lexeme[(p) >> !(qi::alnum | '_')] >> qi::attr(a))
Untested, but my confidence is higher today than it was yesterday that this approach is on the right track.
Worst case, if there are any type conflicts between the int_t, which is basically defined as long long and the enumerated type range_max_t, then I could just store the keyword "max" for the same effect.
That's a worst case; I'd like to keep it as simple as possible but not lose sight of the specification at the same time.
Anyway, thanks again for the insights! up-vote

I'm not positive I completely understand this aspect, apart from one builds and the other does not.
With extend_t you introduce a using type alias content_t. I "get it", in the sense that this magically "just works". For instance:
struct extend_t {
using content_t = boost::variant<field_t, group_t, empty_statement_t>;
msg_type_t msg_type;
std::vector<content_t> content;
};
However, contrasting that with a more traditional template inheritance and type definition, I'm not sure why that does not work. For instance:
template<typename Content>
struct has_content {
typedef Content content_type;
content_type content;
};
// It is noteworthy, would need to identify the std::vector::value_type as well...
struct extend_t : has_content<std::vector<boost::variant<field_t, group_t, empty_statement_t>>> {
msg_type_t msg_type;
};
In which case, I start seeing symptoms of the forward declaration in the form of incomplete type errors.
I am hesitant to accept the one as "gospel" as it were without having a better understanding as to why it is.

Related

How to create an optional parser that is able to conditionally drop synthesized items

I am trying to create an optional parser rule. Depending on the value of the first attribute, I want to optionally emits a data.
Example, for the input:
x,2,3
y,3,4
x,5,6
If the first character is a y then the line should be discarded. Otherwise it will be processed. In this example, if the 3rd attribute is >= 4 then it is true. The synthesized attribute should be std::pair<bool, unsigned int> where the unsigned int value is the second attribute.
The parser is:
using namespace qi = boost::spirit::qi;
using Data = std::pair<bool, unsigned>;
BOOST_PHOENIX_ADAPT_FUNCTION(Data, make_pair, std::make_pair, 2);
class DataParser :
public qi::grammar<
std::string::iterator,
boost::spirit::char_encoding::ascii,
boost::spirit::ascii::space_type,
std::vector<Data>()
>
{
qi::rule<iterator_type, encoding_type, bool()> type;
qi::rule<iterator_type, encoding_type, bool()> side;
// doesn't compile: qi::rule<iterator_type, encoding_type, boost::spirit::ascii::space_type, boost::optional<Data>()> line;
qi::rule<iterator_type, encoding_type, boost::spirit::ascii::space_type, qi::locals<bool, unsigned, bool>, Data()> line;
qi::rule<iterator_type, encoding_type, boost::spirit::ascii::space_type, sig_type> start;
public:
DataParser()
: base_type(start)
{
using namespace qi::labels;
type = qi::char_[_val = _1 == 'x'];
side = qi::int_[_val = _1 >= 4];
line %= (qi::omit[type[_a = _1]] >> ',' >> qi::omit[qi::uint_[_b = _1]] >> ',' >> qi::omit[side[_c = _1]])[if_(_a)[_val = make_pair(_c, _b)]];
// doesn't compile: line %= (qi::omit[type[_a = _1]] >> ',' >> qi::omit[qi::uint_[_b = _1]] >> ',' >> qi::omit[side[_c = _1]])[if_(_a)[_val = make_pair(_c, _b)].else_[_val = qi::unused]];
// doesn't compile: line %= (type >> ',' >> qi::uint_ >> ',' >> side)[if_(_1)[_val = make_pair(_3, _2)]];
// doesn't compile: line %= (type >> ',' >> qi::uint_ >> ',' >> side)[if_(_1)[_val = make_pair(_3, _2)].else_[_val = unused]];
start = *line;
}
};
I get: [[false, 2], [false, 0], [true, 5]] where I want to get: [[false, 2], [true, 5]] (the second entry should be discarded).
I tried with boost::optional<Data> for the data rule and also to assign unused to _val but nothing worked.
Edit after fixing the issue with the accepted answer
The new rules are now:
using Data = std::pair<bool, unsigned>;
BOOST_PHOENIX_ADAPT_FUNCTION(Data, make_pair, std::make_pair, 2);
class DataParser :
public qi::grammar<
std::string::iterator,
boost::spirit::char_encoding::ascii,
boost::spirit::ascii::blank_type,
std::vector<Data>()
>
{
using Items = boost::fusion::vector<bool, unsigned, bool>;
qi::rule<iterator_type, encoding_type, bool()> type;
qi::rule<iterator_type, encoding_type, bool()> side;
qi::rule<iterator_type, encoding_type, boost::spirit::ascii::blank_type, Items()> line;
qi::rule<iterator_type, encoding_type, boost::spirit::ascii::blank_type, sig_type> start;
public:
DataParser()
: base_type(start)
{
using namespace qi::labels;
namespace px = boost::phoenix;
type = qi::char_[_val = _1 == 'x'];
side = qi::int_[_val = _1 >= 4];
line = type >> ',' >> qi::uint_ >> ',' >> side;
start = line[if_(_1)[px::push_back(_val, make_pair(_3, _2))]] % qi::eol;
}
};
The key points being to use the semantic action to decide if the synthesized attribute should be added by using all attributes of the previous rule, in this case line.
Okay. You use lots of power-tools. But remember, with great power comes....
In particular, qi::locals, phoenix, semantic actions: they're all complicating life so only use them as a last resort (or when they're a natural fit, which is rarely¹).
Think directly,
start = *line;
line = // ....
When you say
If the first character is a y then the line should be discarded. Otherwise it will be processed.
You can express this directly:
line = !qi::lit('y') >> // ...
Alternatively, spell out what starters to accept:
line = qi::omit[ qi::char_("xz") ] >> // ...
Done.
Straight Forward Mapping
Here I'll cheat by re-ordering the pair<unsigned, bool> so it matches the input order. Now everything works out of the box without "any" magic:
line = !qi::lit('y') >> qi::omit[qi::alnum] >> ',' >> qi::int_ >> ',' >> side;
ignore = +(qi::char_ - qi::eol);
start = qi::skip(qi::blank) [ (line | ignore) % qi::eol ];
However it WILL result in the spurious entries as you noticed: Live On Compiler Explorer
Parsed: {(2, false), (0, false), (5, true)}
Improving
Now you could go hack around things by changing the eol to also eat subsequent lines that don't appear to contain valid data lines. However, it becomes unwieldy, and we still have the desire to flip the pair's members.
So, here's where I think an actrion could be handy:
public:
DataParser() : DataParser::base_type(start) {
using namespace qi::labels;
start = qi::skip(qi::blank) [
(qi::char_ >> ',' >> qi::uint_ >> ',' >> qi::int_) [
_pass = process(_val, _1, _2, _3) ]
% qi::eol ];
}
private:
struct process_f {
template <typename... T>
bool operator()(Datas& into, char id, unsigned type, int side) const {
switch(id) {
case 'z': case 'x':
into.emplace_back(side >= 4, type);
break;
case 'y': // ignore
break;
case 'a':
return false; // fail the rule
}
return true;
}
};
boost::phoenix::function<action_f> process;
You can see, there's a nice separation of concerns now. You parse (char,int,int) and conditionally process it. That's what's keeping this relatively simple compared to your attempts.
Live Demo
Live On Compiler Explorer
#include <boost/fusion/adapted.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <fmt/ranges.h>
namespace qi = boost::spirit::qi;
using Data = std::pair<bool, unsigned>;
using Datas = std::vector<Data>;
template <typename It>
class DataParser : public qi::grammar<It, Datas()> {
using Skipper = qi::blank_type;
qi::rule<It, Datas(), Skipper> line;
qi::rule<It, Datas()> start;
public:
DataParser() : DataParser::base_type(start) {
using namespace qi::labels;
start = qi::skip(qi::blank) [
(qi::char_ >> ',' >> qi::uint_ >> ',' >> qi::int_) [
_pass = process(_val, _1, _2, _3) ]
% qi::eol ];
}
private:
struct process_f {
template <typename... T>
bool operator()(Datas& into, char id, unsigned type, int side) const {
switch(id) {
case 'z': case 'x':
into.emplace_back(side >= 4, type);
break;
case 'y': // ignore
break;
case 'a':
return false; // fail the rule
}
return true;
}
};
boost::phoenix::function<process_f> process;
};
int main() {
using It = std::string::const_iterator;
DataParser<It> p;
for (std::string const input : {
"x,2,3\ny,3,4\nx,5,6",
})
{
auto f = begin(input), l = end(input);
Datas d;
auto ok = qi::parse(f, l, p, d);
if (ok) {
fmt::print("Parsed: {}\n", d);
} else {
fmt::print("Parsed failed\n", d);
}
if (f!=l) {
fmt::print("Remaining unparsed: '{}'\n", std::string(f,l));
}
}
}
Prints
Parsed: {(false, 2), (true, 5)}
¹ Boost Spirit: "Semantic actions are evil"?

Got stuck porting legacy boost::spirit code

I am porting some legacy code from VS2010 & boost1.53 to VS2017 & boost1.71.
I have got stuck last two hours while trying compiling it.
The code is:
#include <string>
#include <vector>
#include <fstream>
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
namespace qi = boost::spirit::qi;
using qi::_1; using qi::_2; using qi::_3; using qi::_4;
enum TYPE { SEND, CHECK, COMMENT };
struct Command
{
TYPE type;
std::string id;
std::string arg1;
std::string arg2;
bool checking;
};
class Parser
{
typedef boost::spirit::istream_iterator It;
typedef std::vector<Command> Commands;
struct deferred_fill
{
template <typename R, typename S, typename T, typename U> struct result { typedef void type; };//Not really sure still necessary
typedef void result_type;//Not really sure still necessary
void operator() (boost::iterator_range<It> const& id, boost::iterator_range<It> const& arg1, bool checking, Command& command) const
{
command.type = TYPE::SEND;
command.id.assign(id.begin(), id.end());
command.arg1.assign(arg1.begin(), arg1.end());
command.checking = checking;
}
};
private:
qi::symbols<char, bool> value;
qi::rule<It> ignore;
qi::rule<It, Command()> send;
qi::rule<It, Commands()> start;
boost::phoenix::function<deferred_fill> fill;
public:
std::vector<Command> commands;
Parser()
{
using namespace qi;
using boost::phoenix::push_back;
value.add("TRUE", true)
("FALSE", false);
send = ("SEND_CONFIRM" >> *blank >> '(' >> *blank >> raw[*~char_(',')] >> ','
>> *blank >> raw[*~char_(',')] >> ','
>> *blank >> value >> *blank >> ')' >> *blank >> ';')[fill(_1, _2, _3, _val)];
ignore = *~char_("\r\n");
start = (send[push_back(_val, _1)] | ignore) % eol;
}
void parse(const std::string& path)
{
std::ifstream in(path, std::ios_base::in);
if (!in) return;
in >> std::noskipws;//No white space skipping
boost::spirit::istream_iterator first(in);
boost::spirit::istream_iterator last;
qi::parse(first, last, start, commands);
}
};
int main(int argc, char* argv[])
{
Parser parser;
parser.parse("file.txt");
return 0;
}
The compiler complains in the next way (only copy first lines):
1>z:\externos\boost_1_71_0\boost\phoenix\core\detail\function_eval.hpp(116): error C2039: 'type': no es un miembro de 'boost::result_of<const Parser::deferred_fill (std::vector<Value,std::allocator<char>> &,std::vector<Value,std::allocator<char>> &,boost::iterator_range<Parser::It> &,Command &)>'
1> with
1> [
1> Value=char
1> ]
1>z:\externos\boost_1_71_0\boost\phoenix\core\detail\function_eval.hpp(114): note: vea la declaración de 'boost::result_of<const Parser::deferred_fill (std::vector<Value,std::allocator<char>> &,std::vector<Value,std::allocator<char>> &,boost::iterator_range<Parser::It> &,Command &)>'
1> with
1> [
1> Value=char
1> ]
1>z:\externos\boost_1_71_0\boost\phoenix\core\detail\function_eval.hpp(89): note: vea la referencia a la creación de instancias de plantilla clase de 'boost::phoenix::detail::function_eval::result_impl<F,void (Head,const boost::phoenix::actor<boost::spirit::argument<1>>&,const boost::phoenix::actor<boost::spirit::argument<2>>&,const boost::phoenix::actor<boost::spirit::attribute<0>>&),const boost::phoenix::vector2<Env,Actions> &>' que se está compilando
1> with
1> [
1> F=const boost::proto::exprns_::basic_expr<boost::proto::tagns_::tag::terminal,boost::proto::argsns_::term<Parser::deferred_fill>,0> &,
1> Head=const boost::phoenix::actor<boost::spirit::argument<0>> &,
1> Env=boost::phoenix::vector4<const boost::phoenix::actor<boost::proto::exprns_::basic_expr<boost::phoenix::detail::tag::function_eval,boost::proto::argsns_::list5<boost::proto::exprns_::basic_expr<boost::proto::tagns_::tag::terminal,boost::proto::argsns_::term<Parser::deferred_fill>,0>,boost::phoenix::actor<boost::spirit::argument<0>>,boost::phoenix::actor<boost::spirit::argument<1>>,boost::phoenix::actor<boost::spirit::argument<2>>,boost::phoenix::actor<boost::spirit::attribute<0>>>,5>> *,boost::fusion::vector<std::vector<char,std::allocator<char>>,std::vector<char,std::allocator<char>>,boost::iterator_range<Parser::It>,std::vector<char,std::allocator<char>>,boost::iterator_range<Parser::It>,std::vector<char,std::allocator<char>>,bool,std::vector<char,std::allocator<char>>,std::vector<char,std::allocator<char>>> &,boost::spirit::context<boost::fusion::cons<Command &,boost::fusion::nil_>,boost::fusion::vector<>> &,bool &> &,
1> Actions=const boost::phoenix::default_actions &
1> ]
I guess that error is related with the use of boost::spirit::istream_iterator, instead of char*, but I cann't figure out how to fix it to work again.
I have run out of ideas, please, anyone can see where my mistake is?
Aw. You're doing awesome things. Sadly/fortunately it's overcomplicated.
So let's first fix, and then simplify.
The Error
It's like you said,
void operator() (boost::iterator_range<It> const& id, boost::iterator_range<It> const& arg1, bool checking, Command& command) const
Doesn't match what actually gets invoked:
void Parser::deferred_fill::operator()(T&& ...) const [with T =
{std::vector<char>&, std::vector<char>&,
boost::iterator_range<boost::spirit::basic_istream_iterator<...> >&,
Command&}]
The reason is NOT the iterator (as you can see it's boost::spirit__istream_iterator alright).
However it's because you're getting other things as attributes. Turns out *blank exposes the attribute as a vector<char>. So you can "fix" that by omit[]-ing those. Let's instead wrap it in an attribute-less rule like ignore so we reduce the clutter.
Now the invocation is with
void Parser::deferred_fill::operator()(T&& ...) const [with T = {boost::iterator_range<It>&, boost::iterator_range<It>&, bool&, Command&}]
So it is compatible and compiles. Parsing:
SEND_CONFIRM("this is the id part", "this is arg1", TRUE);
With
Parser parser;
parser.parse("file.txt");
std::cout << std::boolalpha;
for (auto& cmd : parser.commands) {
std::cout << '{' << cmd.id << ", "
<< cmd.arg1 << ", "
<< cmd.arg2 << ", "
<< cmd.checking << "}\n";
}
Prints
{"this is the id part", "this is arg1", , TRUE}
Let's improve this
This calls for a skipper
This calls for automatic attribute propagation
Some other elements of style
Skippers
Instead of "calling" a skipper explicitly, let's use the built in capability:
rule<It, Attr(), Skipper> x;
defines a rule that skips over inputs sequences matched by a parser of the Skipper type. You need to actually pass in the skipper of that type.
using qi::phrase_parse instead of qi::parse
by using the qi::skip() directive
I always advocate the second approach, because it makes for a friendlier, less error-prone interface.
So declaring the skipper type:
qi::rule<It, Command(), qi::blank_type> send;
We can reduce the rule to:
send = (lit("SEND_CONFIRM") >> '('
>> raw[*~char_(',')] >> ','
>> raw[*~char_(',')] >> ','
>> value >> ')' >> ';')
[fill(_1, _2, _3, _val)];
And than pass a skipper from the start rule:
start = skip(blank) [
(send[push_back(_val, _1)] | ignore) % eol
];
That's all. Still compiles and matches the same.
Live On Coliru
Skipping with Lexemes
Still the same topic, lexemes actually inhibit the skipper¹, so you don't have to raw[]. This changes the exposed attributes to vector<char> as well:
void operator() (std::vector<char> const& id, std::vector<char> const& arg1, bool checking, Command& command) const
Live On Coliru
Automatic Attribute Propagation
Qi has semantic actions, but its real strength is in them being optional: Boost Spirit: "Semantic actions are evil"?
push_back(_val, _1) is actually the automatic attribute propagation semantics anwyays for *p, +p and p % delim² anyways, so just drop it:
start = skip(blank) [
(send | ignore) % eol
];
(note that send|ignore actually synthesizes optional<Command> which is fine for automatic propagation)
std::vector is attribute-compatible with std::string, e.g.. So if we can add a placeholder for arg2 we can match the Command structure layout:
send = lit("SEND_CONFIRM") >> '('
>> attr(SEND) // fill type
>> lexeme[*~char_(',')] >> ','
>> lexeme[*~char_(',')] >> ','
>> attr(std::string()) // fill for arg2
>> value >> ')' >> ';'
;
Now to be able to drop fill and its implementation, we have to adapt Command as a fusion sequence:
BOOST_FUSION_ADAPT_STRUCT(Command, type, id, arg1, arg2, checking)
Elements of Style 1
Using a namespace for your Command types makes it easier to ADL use the operator<< overloads for Commands, se we can just std::cout << cmd;
At this point, it all works in a fraction of the code: Live On Coliru
Elements of Style 2
If you can, make your parser stateless. That means it can be const, so you can:
reuse it without costly construction
the optimizer has more to work with
it's more testable (stateful things are harder to prove idempotent)
So, instead of having commands a member, just return them. While we're at it, we can make parse a static function
Instead of hardcoding the iterator type, it's flexible to have it as a template argument. That way you're not stuck with the overhead of multi_pass_adaptor and istream_iterator if you have a command in a char[] buffer, string or string_view at some point.
Also, deriving your Parser from qi::grammar with a suitable entry-point means you can use it as a parser expression (actually a non-terminal, just like rule<>) as any other parser.
Consider enabling rule debugging (see example)
Full Code
Live On Coliru
#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <fstream>
namespace qi = boost::spirit::qi;
namespace Commands {
enum TYPE { SEND, CHECK, COMMENT };
enum BOOL { FALSE, TRUE };
struct Command {
TYPE type;
std::string id;
std::string arg1;
std::string arg2;
BOOL checking;
};
typedef std::vector<Command> Commands;
// for (debug) output
static inline std::ostream& operator<<(std::ostream& os, TYPE t) {
switch (t) {
case SEND: return os << "SEND";
case CHECK: return os << "CHECK";
case COMMENT: return os << "COMMENT";
}
return os << "(unknown)";
}
static inline std::ostream& operator<<(std::ostream& os, BOOL b) {
return os << (b?"TRUE":"FALSE");
}
using boost::fusion::operator<<;
}
BOOST_FUSION_ADAPT_STRUCT(Commands::Command, type, id, arg1, arg2, checking)
namespace Commands {
template <typename It>
class Parser : public qi::grammar<It, Commands()> {
public:
Commands commands;
Parser() : Parser::base_type(start) {
using namespace qi;
value.add("TRUE", TRUE)
("FALSE", FALSE);
send = lit("SEND_CONFIRM") >> '('
>> attr(SEND) // fill type
>> lexeme[*~char_(',')] >> ','
>> lexeme[*~char_(',')] >> ','
>> attr(std::string()) // fill for arg2
>> value >> ')' >> ';'
;
ignore = +~char_("\r\n");
start = skip(blank) [
(send | ignore) % eol
];
BOOST_SPIRIT_DEBUG_NODES((start)(send)(ignore))
}
private:
qi::symbols<char, BOOL> value;
qi::rule<It> ignore;
qi::rule<It, Command(), qi::blank_type> send;
qi::rule<It, Commands()> start;
};
static Commands parse(std::istream& in) {
using It = boost::spirit::istream_iterator;
static const Parser<It> parser;
It first(in >> std::noskipws), //No white space skipping
last;
Commands commands;
if (!qi::parse(first, last, parser, commands)) {
throw std::runtime_error("command parse error");
}
return commands; // c++11 move semantics
}
}
int main() {
try {
for (auto& cmd : Commands::parse(std::cin))
std::cout << cmd << "\n";
} catch(std::exception const& e) {
std::cout << e.what() << "\n";
}
}
Prints
(SEND "this is the id part" "this is arg1" TRUE)
Or indeed with BOOST_SPIRIT_DEBUG defined:
<start>
<try>SEND_CONFIRM("this i</try>
<send>
<try>SEND_CONFIRM("this i</try>
<success>\n</success>
<attributes>[[SEND, [", t, h, i, s, , i, s, , t, h, e, , i, d, , p, a, r, t, "], [", t, h, i, s, , i, s, , a, r, g, 1, "], [], TRUE]]</attributes>
</send>
<send>
<try></try>
<fail/>
</send>
<ignore>
<try></try>
<fail/>
</ignore>
<success>\n</success>
<attributes>[[[SEND, [", t, h, i, s, , i, s, , t, h, e, , i, d, , p, a, r, t, "], [", t, h, i, s, , i, s, , a, r, g, 1, "], [], TRUE]]]</attributes>
</start>
¹ while pre-skipping as you require; see Boost spirit skipper issues
² (and then some, but let's not digress)

How to keep space character on breaking down input into a sequence of different parts using Alternative Parser?

I want to write a simple C++ parser which extracts the block hierarchy. I am using this rule:
std::string rest_content;
std::vector<boost::tuple<std::string, std::string>> scopes;
qi::rule<It, qi::ascii::space_type> block = *(
r.comment
| r.scope[push_back(boost::phoenix::ref(scopes), _1)]
| qi::char_[boost::phoenix::ref(rest_content) += _1] // rest
);
qi::phrase_parse(first, last,
block,
ascii::space);
which is supposed to break down code into three parts: comment, scope (code surrounded by "{}") and "rest".
The problem is that all the space characters are removed from the "rest". I need those spaces for later parsing (such as extracting identifiers).
I have tried using qi::skip, qi::lexeme and qi::raw to keep spaces:
// one of many failed attempts
qi::rule<It, qi::ascii::space_type> block = qi::lexeme[*(
qi::skip[r.comment]
| qi::skip[r.scope[push_back(boost::phoenix::ref(scopes), _1)]]
| qi::char_[push_back(boost::phoenix::ref(rest_content), _1)]
)];
but it never works.
So how to keep space characters? Any help is welcome. Thanks.
If you're parsing C++ code this way you may be biting off more than you can chew.
I'll answer, but the answer should show you how limited this approach is going to be. Just imagine parsing through
namespace q::x {
namespace y {
struct A {
template <typename = ns1::c<int>, typename...> struct C;
};
template <typename T, typename... Ts>
struct A::C final : ns2::ns3::base<A::C<T, Ts...>, Ts...> {
int simple = [](...) {
enum class X : unsigned { answer = 42, };
struct {
auto operator()(...) -> decltype(auto) {
return static_cast<int>(X::answer);
}
} iife;
return iife();
}("/* }}} */"); // {{{
};
}
}
and getting it right. And yes. that's valid code.
In fact it's so tricky, that it's easy to make "grown compilers" (GCC) trip: https://wandbox.org/permlink/FzcaSl6tbn18jq4f (Clang has no issue: https://wandbox.org/permlink/wu0mFwQiTOogKB5L).
That said, let me refer to my old explanation of how rule declarations and skippers work together: Boost spirit skipper issues
And show an approximation of what I'd do.
The comments
Actually, the comments should be part of your skipper, so let's make it so:
using SkipRule = qi::rule<It>;
SkipRule comment_only
= "//" >> *~qi::char_("\r\n") >> qi::eol
| "/*" >> *(qi::char_ - "*/") >> "*/"
;
Now for general skipping, we want to include whitespace:
SkipRule comment_or_ws
= qi::space | comment_only;
Now we want to parse types and identifiers:
qi::rule<It, std::string()> type
= ( qi::string("struct")
| qi::string("class")
| qi::string("union")
| qi::string("enum") >> -(*comment_or_ws >> qi::string("class"))
| qi::string("namespace")
)
>> !qi::graph // must be followed by whitespace
;
qi::rule<It, std::string()> identifier =
qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9")
;
I've /guessed/ that struct X { }; would be an example of a "scope" for you, and the tuple would contain ("struct", "X").
As a bonus I used attribute adaption of std::pair and show how to insert into a multimap for good measure later on
qi::rule<It, std::pair<std::string, std::string>()> scope
= qi::skip(comment_or_ws.alias()) [
type >> identifier
>> *~qi::char_(";{") // ignore some stuff like base classes
>> qi::omit["{" >> *~qi::char_("}") >> "}" | ';']
];
Note a big short-coming here is that the first non-commented '}' will "end" the scope. That's not how the language works (see the leading example)
Now we can conclude with an improved "block" rule:
qi::rule<It, SkipRule> block
= *(
scope [px::insert(px::ref(scopes), _1)]
| qi::skip(comment_only.alias()) [
qi::as_string[qi::raw[+(qi::char_ - scope)]] [px::ref(rest_content) += _1]
] // rest
);
Note that
- we override the comment_or_ws skipper with comment_only so we don't drop all whitespace from "rest content"
- inversely, we override the skipper to include whitespace inside the scope rule because otherwise the negative scope invocation (char_ - scope) would do the wrong thing because it wouldn't skip whitespace
Full Demo
Live On Coliru
//#define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted/std_pair.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
namespace qi = boost::spirit::qi;
namespace px = boost::phoenix;
int main() {
using It = std::string::const_iterator;
using namespace qi::labels;
std::string rest_content;
std::multimap<std::string, std::string> scopes;
using SkipRule = qi::rule<It>;
SkipRule comment_only
= "//" >> *~qi::char_("\r\n") >> qi::eol
| "/*" >> *(qi::char_ - "*/") >> "*/"
;
SkipRule comment_or_ws
= qi::space | comment_only;
qi::rule<It, std::string()> type
= ( qi::string("struct")
| qi::string("class")
| qi::string("union")
| qi::string("enum") >> -(*comment_or_ws >> qi::string("class"))
| qi::string("namespace")
)
>> !qi::graph // must be followed by whitespace
;
qi::rule<It, std::string()> identifier =
qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9")
;
qi::rule<It, std::pair<std::string, std::string>()> scope
= qi::skip(comment_or_ws.alias()) [
type >> identifier
>> *~qi::char_(";{") // ignore some stuff like base classes
>> qi::omit["{" >> *~qi::char_("}") >> "}" | ';']
];
qi::rule<It, SkipRule> block
= *(
scope [px::insert(px::ref(scopes), _1)]
| qi::skip(comment_only.alias()) [
qi::as_string[qi::raw[+(qi::char_ - scope)]] [px::ref(rest_content) += _1]
] // rest
);
//BOOST_SPIRIT_DEBUG_NODES((block)(scope)(identifier)(type))
std::string const code = R"(
// some random sample "code"
struct base {
std::vector<int> ints;
};
/* class skipped_comment : base { };
*/
namespace q { namespace nested { } } // nested is not supported
class forward_declared;
template <typename T> // actually basically ignored
class
Derived
: base {
std::string more_data_members;
};
enum class MyEnum : int32_t {
foo = 0,
bar, /* whoop } */
qux = foo + bar
};
int main() {
return 0;
}
)";
qi::phrase_parse(begin(code), end(code), block, comment_or_ws);
for (auto& [k,v] : scopes) {
std::cout << k << ": " << v << "\n";
}
std::cout << "------------------ BEGIN REST_CONTENT -----------------\n";
std::cout << rest_content << "\n";
std::cout << "------------------ END REST_CONENT --------------------\n";
}
Which parses the following sample input:
// some random sample "code"
struct base {
std::vector<int> ints;
};
/* class skipped_comment : base { };
*/
namespace q { namespace nested { } } // nested is not supported
class forward_declared;
template <typename T> // actually basically ignored
class
Derived
: base {
std::string more_data_members;
};
enum class MyEnum : int32_t {
foo = 0,
bar, /* whoop } */
qux = foo + bar
};
int main() {
return 0;
}
Printing
class: forward_declared
class: Derived
enumclass: MyEnum
namespace: q
struct: base
------------------ BEGIN REST_CONTENT -----------------
;}template <typename T>;;
int main() {
return 0;
}
------------------ END REST_CONENT --------------------
Conclusion
This result seems a decent pointer to
explain how to tackle the specific hurdle
demonstrate how this approach to parsing is breaking down at the slightest obstacle (namespace a { namespace b { } } for example)
Caveat Emptor

decode http header value fully with boost spirit

Once again, I find myself reaching for boost spirit. Once again I find myself defeated by it.
A HTTP header value takes the general form:
text/html; q=1.0, text/*; q=0.8, image/gif; q=0.6, image/jpeg; q=0.6, image/*; q=0.5, */*; q=0.1
i.e. value *OWS [; *OWS name *OWS [= *OWS possibly_quoted_value] *OWS [...]] *OWS [ , <another value> ...]
so in my mind, this header decodes to:
value[0]:
text/html
params:
name : q
value : 1.0
value[1]:
text/*
params:
name : q
value : 0.8
...
and so on.
I am certain that to anyone who knows how, the boost::spirit::qi syntax for this is trivial.
I humbly ask for your assistance.
for example, here's the outline of the code which decodes the Content-Type header, which is limited to one value of the form type/subtype, with any number of parameters of the form <sp> ; <sp> token=token|quoted_string
template<class Iter>
void parse(ContentType& ct, Iter first, Iter last)
{
ct.mutable_type()->append(to_lower(consume_token(first, last)));
consume_lit(first, last, '/');
ct.mutable_subtype()->append(to_lower(consume_token(first, last)));
while (first != last) {
skipwhite(first, last);
if (consume_char_if(first, last, ';'))
{
auto p = ct.add_parameters();
skipwhite(first, last);
p->set_name(to_lower(consume_token(first, last)));
skipwhite(first, last);
if (consume_char_if(first, last, '='))
{
skipwhite(first, last);
p->set_value(consume_token_or_quoted(first, last));
}
else {
// no value on this parameter
}
}
else if (consume_char_if(first, last, ','))
{
// normally we should get the next value-token here but in the case of Content-Type
// we must barf
throw std::runtime_error("invalid use of ; in Content-Type");
}
}
}
ContentType& populate(ContentType& ct, const std::string& header_value)
{
parse(ct, header_value.begin(), header_value.end());
return ct;
}
OK, after an heroic 24 hours of struggle (well, not really - more like reading the manual over and over again...), I've found a way that works.
I am by no means competent with boost::spirit. If someone out there can improve on this answer, please do post it.
This spirit state machine takes the value of a header (with one, optionally parameterised, value) and turns it into a content_type structure.
My amateur reading of the HTTP standard indicates that some headers have the form (spaces here indicate any amount of white space, values may be quoted or not:
Header-Name: tokena/tokenb [; param1 = "value" [; param2 = value]...]
whereas others have the more general form:
Header-Name: token [; param1 = "value"[; param2 = value]...] [ , token ...]
This code covers the first case - i.e. the HTTP Content-Type header value. I will need to extend it to cater for the Accept header (which can advertise multiple values with parameters) - that will come later.
So here's the code. Please by all means show me how to improve it!!
#define BOOST_SPIRIT_DEBUG
#include <gtest/gtest.h>
#include <boost/spirit/include/qi.hpp>
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_char.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <utility>
#include <vector>
#include <string>
#include <boost/variant.hpp>
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
using unary_parameter = std::string;
struct binary_parameter
{
std::string name;
std::string value;
};
BOOST_FUSION_ADAPT_STRUCT(binary_parameter,
(std::string, name)
(std::string, value))
using parameter = boost::variant<unary_parameter, binary_parameter>;
struct type_subtype
{
std::string type;
std::string subtype;
};
BOOST_FUSION_ADAPT_STRUCT(type_subtype,
(std::string, type)
(std::string, subtype))
using content_type_pair = std::pair<std::string, std::string>;
struct content_type
{
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(content_type,
(type_subtype, type)
(std::vector<parameter>, params))
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using ascii::char_;
using qi::omit;
using qi::eoi;
CR = char_('\r');
LF = char_('\n');
CRLF = CR >> LF;
SP = char_(' ');
HT = char_('\t');
LWS = -CRLF >> +(SP | HT);
UPALPHA = char_('A', 'Z');
LOALPHA = char_('a', 'z');
ALPHA = UPALPHA | LOALPHA;
DIGIT = char_('0', '9');
CTL = char_(0, 31) | char_(127);
QUOT = char_('"');
TEXT = (char_ - CTL) | HT;
separator = char_('(') | ')' | '<' | '>' | '#'
| ',' | ';' | ':' | '\\' | '"'
| '/' | '[' | ']' | '?' | '='
| '{' | '}' | SP | HT;
end_sequence = separator | space;
token = +(char_ - separator);
qdtext = char_ - char_('"') - '\\';
quoted_pair = omit[char_('\\')] >> char_;
quoted_string = omit[char_('"')] >> *(qdtext | quoted_pair) >> omit[char_('"')];
value = quoted_string | token ;
type_subtype_rule = token >> '/' >> token;
name_only = token;
nvp = token >> omit[*SP] >> omit['='] >> omit[*SP] >> value;
any_parameter = omit[*SP] >> omit[char_(';')] >> omit[*SP] >> (nvp | name_only);
content_type_rule = type_subtype_rule >> *any_parameter;
BOOST_SPIRIT_DEBUG_NODES((qdtext)(quoted_pair)(quoted_string)(value)(token)(separator));
}
qi::rule<Iterator, void()> CR, LF, CRLF, SP, HT, LWS, CTL, QUOT;
qi::rule<Iterator, char()> UPALPHA, LOALPHA, ALPHA, DIGIT, TEXT, qdtext, quoted_pair;
qi::rule<Iterator, void()> separator, space, end_sequence;
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
qi::rule<Iterator, unary_parameter()> name_only;
qi::rule<Iterator, binary_parameter()> nvp;
qi::rule<Iterator, parameter()> any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
};
TEST(spirit_test, test1)
{
token_grammar<std::string::const_iterator> grammar{};
std::string test = R"__test(application/json )__test";
content_type ct;
bool r = qi::parse(test.cbegin(), test.cend(), grammar, ct);
EXPECT_EQ("application", ct.type.type);
EXPECT_EQ("json", ct.type.subtype);
EXPECT_EQ(0, ct.params.size());
ct = {};
test = R"__test(text/html ; charset = "ISO-8859-5")__test";
qi::parse(test.cbegin(), test.cend(), grammar, ct);
EXPECT_EQ("text", ct.type.type);
EXPECT_EQ("html", ct.type.subtype);
ASSERT_EQ(1, ct.params.size());
ASSERT_EQ(typeid(binary_parameter), ct.params[0].type());
auto& x = boost::get<binary_parameter>(ct.params[0]);
EXPECT_EQ("charset", x.name);
EXPECT_EQ("ISO-8859-5", x.value);
}
I've taken the code as posted by OP and given it a review.
there's no need to specify void(). In fact it's preferable to use qi::unused_type in such cases, which is what rules will default to if no attribute type is declared.
there no need for char_ if you don't wish to expose the attribute. Use lit instead.
there is no need to wrap every char parser in a rule. That hurts performance. It's best to leave the proto expression tree un-evaluated as long so Qi can optimize parser expressions more, and the compiler can inline more.
Also, Qi doesn't have move semantics on attributes, so avoiding redundant rules eliminates redundant copies of sub-attributes that get concatenated in the containing rules.
Sample alternative spelling (caution, see Assigning parsers to auto variables)
auto CR = qi::lit('\r');
auto LF = qi::lit('\n');
auto CRLF = qi::lit("\r\n");
auto HT = qi::lit('\t');
auto SP = qi::lit(' ');
auto LWS = qi::copy(-CRLF >> +(SP | HT)); // deepcopy
UPALPHA = char_('A', 'Z');
LOALPHA = char_('a', 'z');
ALPHA = UPALPHA | LOALPHA;
DIGIT = char_('0', '9');
//CTL = char_(0, 31) | char_(127);
TEXT = char_("\t\x20-\x7e\x80-\xff");
Since you didn't have to use char_, you also don't have kill the attribute using qi::omit[].
When you are in a Qi domain expression template, raw string/char literals are implicitly wrapped in a qi::lit so, you can simply things like
quoted_pair = omit[char_('\\')] >> char_;
quoted_string = omit[char_('"')] >> *(qdtext | quoted_pair) >> omit[char_('"')];
to just
quoted_pair = '\\' >> char_;
quoted_string = '"' >> *(qdtext | quoted_pair) >> '"';
instead of spelling out skipping spaces with omit[*SP] all the time, just declare the rule with a skipper. Now, you can simplify
nvp = token >> omit[*SP] >> omit['='] >> omit[*SP] >> value;
any_parameter = omit[*SP] >> omit[char_(';')] >> omit[*SP] >> (nvp | name_only);
content_type_rule = type_subtype_rule >> *any_parameter;
to just
nvp = token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces)[*any_parameter];
Note that any subrule invocations of rules that are declared without a skipper are implicitly lexeme: Boost spirit skipper issues
there were many redundant/unused headers
recent compilers + boost versions make BOOST_FUSION_ADAPT_STRUCT much simpler by using decltype
The results of simplifying are much less noisy:
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/include/adapted.hpp>
struct parameter {
boost::optional<std::string> name;
std::string value;
};
struct type_subtype {
std::string type;
std::string subtype;
};
struct content_type {
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(type_subtype, type, subtype)
BOOST_FUSION_ADAPT_STRUCT(content_type, type, params)
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using qi::ascii::char_;
spaces = char_(' ');
token = +~char_( "()<>#,;:\\\"/[]?={} \t");
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
value = quoted_string | token;
type_subtype_rule = token >> '/' >> token;
name_only = token;
nvp = token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces) [*any_parameter];
BOOST_SPIRIT_DEBUG_NODES((nvp)(any_parameter)(content_type_rule)(quoted_string)(token)(value)(type_subtype_rule))
}
private:
using Skipper = qi::space_type;
Skipper spaces;
qi::rule<Iterator, binary_parameter(), Skipper> nvp;
qi::rule<Iterator, parameter(), Skipper> any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
// lexemes
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
qi::rule<Iterator, unary_parameter()> name_only;
};
See it Live On Coliru (with the same test cases)
BONUS
I'd prefer a simpler AST in a case like this. By injecting some attribute values using qi::attr you can avoid using boost::variant and/or even avoid boost::optional:
struct parameter {
bool have_name;
std::string name;
std::string value;
};
struct type_subtype {
std::string type;
std::string subtype;
};
struct content_type {
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(parameter, have_name, name, value)
BOOST_FUSION_ADAPT_STRUCT(type_subtype, type, subtype)
BOOST_FUSION_ADAPT_STRUCT(content_type, type, params)
namespace qi = boost::spirit::qi;
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using qi::ascii::char_;
spaces = char_(' ');
token = +~char_( "()<>#,;:\\\"/[]?={} \t");
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
value = quoted_string | token;
type_subtype_rule = token >> '/' >> token;
name_only = qi::attr(false) >> qi::attr("") >> token;
nvp = qi::attr(true) >> token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces) [*any_parameter];
BOOST_SPIRIT_DEBUG_NODES((nvp)(any_parameter)(content_type_rule)(quoted_string)(token)(value)(type_subtype_rule))
}
private:
using Skipper = qi::space_type;
Skipper spaces;
qi::rule<Iterator, parameter(), Skipper> nvp, name_only, any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
// lexemes
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
};

boost spirit parsing CSV with columns in variable order

I'm trying to parse a CSV file (with header line) using boost spirit.
The csv is not in a constant format. Sometimes there is some extra column or the order of the column is mixed. I'm interested in few columns, whose header name is well known.
For instance my CSV may look like:
Name,Surname,Age
John,Doe,32
Or:
Age,Name
32,John
I want to parse only the content of Name and Age (N.B. Age is integer type). At the moment i come out with a very ugly solution where Spirit parses the first line and creates a vector that contains an enum in the positions i'm interested into. And then i have to do the parsing of the terminal symbols by hand...
enum LineItems {
NAME, AGE, UNUSED
};
struct CsvLine {
string name;
int age;
};
using Column = std::string;
using CsvFile = std::vector<CsvLine>;
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
CsvGrammar() :
CsvGrammar::base_type(start) {
using namespace qi;
static const char colsep = ',';
start = qi::omit[header[qi::_a = qi::_1]] >> eol >> line(_a) % eol;
header = (lit("Name")[phx::push_back(phx::ref(qi::_val), LineItems::NAME)]
| lit("Age")[phx::push_back(phx::ref(qi::_val), LineItems::AGE)]
| column[phx::push_back(phx::ref(qi::_val), LineItems::UNUSED)]) % colsep;
line = (column % colsep)[phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1,
qi::_val)];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
}
void convertFunc(std::vector<string>& columns, std::vector<LineItems>& positions, CsvLine &csvLine) {
//terminal symbol parsing here, and assign to csvLine struct.
...
}
private:
qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
qi::rule<It, std::vector<LineItems>(), qi::blank_type> header;
qi::rule<It, CsvLine(std::vector<LineItems>), qi::blank_type> line;
qi::rule<It, Column(), qi::blank_type> column;
qi::rule<It, std::string()> quoted;
qi::rule<It, qi::blank_type> empty;
};
Here is the full source.
What if the header parser could prepare a vector<rule<...>*> and the "line parser" just use this vector to parse itself? a sort of advanced nabialek trick (i've been trying but i couldn't make it).
Or is there any better way to parse this kind of CSV with Spirit?
(any help is appreciated, thank you in advance)
I'd go with the concept that you have,
I think it's plenty elegant (the qi locals even allow reentrant use of this).
To reduce the cruft in the rules (Boost Spirit: "Semantic actions are evil"?) you could move the "conversion function" off into attribute transformation customization points.
Oops. As commented that was too simple. However, you can still reduce the cruftiness quite a bit. With two simple tweaks, the grammar reads:
item.add("Name", NAME)("Age", AGE);
start = omit[ header[_a=_1] ] >> eol >> line(_a) % eol;
header = (item | omit[column] >> attr(UNUSED)) % colsep;
line = (column % colsep) [convert];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
The tweaks:
using qi::symbols to map from header to LineItem
using a raw semantinc action ([convert]) which directly access the context (see boost spirit semantic action parameters):
struct final {
using Ctx = typename decltype(line)::context_type;
void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
auto& csvLine = boost::fusion::at_c<0>(ctx.attributes);
auto& positions = boost::fusion::at_c<1>(ctx.attributes);
int i =0;
for (LineItems position : positions) {
switch (position) {
case NAME: csvLine.name = columns[i]; break;
case AGE: csvLine.age = atoi(columns[i].c_str()); break;
default: break;
}
i++;
}
pass = true; // returning false fails the `line` rule
}
} convert;
Arguably the upshot is akin to doing auto convert = phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1, qi::_val) but using auto with Proto/Phoenix/Spirit expressions is notoriously error prone (UB due to dangling refs to temporaries from the expression template), so I'd certainly prefer the way shown above.
Live On Coliru
//#define BOOST_SPIRIT_DEBUG
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <iostream>
#include <boost/fusion/include/at_c.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <string>
#include <vector>
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
using std::string;
enum LineItems { NAME, AGE, UNUSED };
struct CsvLine {
string name;
int age;
};
using Column = std::string;
using Columns = std::vector<Column>;
using CsvFile = std::vector<CsvLine>;
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
CsvGrammar() : CsvGrammar::base_type(start) {
using namespace qi;
static const char colsep = ',';
item.add("Name", NAME)("Age", AGE);
start = qi::omit[ header[_a=_1] ] >> eol >> line(_a) % eol;
header = (item | omit[column] >> attr(UNUSED)) % colsep;
line = (column % colsep) [convert];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
BOOST_SPIRIT_DEBUG_NODES((header)(column)(quoted));
}
private:
qi::rule<It, std::vector<LineItems>(), qi::blank_type> header;
qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
qi::rule<It, CsvLine(std::vector<LineItems> const&), qi::blank_type> line;
qi::rule<It, Column(), qi::blank_type> column;
qi::rule<It, std::string()> quoted;
qi::rule<It, qi::blank_type> empty;
qi::symbols<char, LineItems> item;
struct final {
using Ctx = typename decltype(line)::context_type;
void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
auto& csvLine = boost::fusion::at_c<0>(ctx.attributes);
auto& positions = boost::fusion::at_c<1>(ctx.attributes);
int i =0;
for (LineItems position : positions) {
switch (position) {
case NAME: csvLine.name = columns[i]; break;
case AGE: csvLine.age = atoi(columns[i].c_str()); break;
default: break;
}
i++;
}
pass = true; // returning false fails the `line` rule
}
} convert;
};
int main() {
const std::string s = "Surname,Name,Age,\nJohn,Doe,32\nMark,Smith,43";
auto f(begin(s)), l(end(s));
CsvGrammar<std::string::const_iterator> p;
CsvFile parsed;
bool ok = qi::phrase_parse(f, l, p, qi::blank, parsed);
if (ok) {
for (CsvLine line : parsed) {
std::cout << '[' << line.name << ']' << '[' << line.age << ']';
std::cout << std::endl;
}
} else {
std::cout << "Parse failed\n";
}
if (f != l)
std::cout << "Remaining unparsed: '" << std::string(f, l) << "'\n";
}
Prints
[Doe][32]
[Smith][43]