decode http header value fully with boost spirit - c++

Once again, I find myself reaching for boost spirit. Once again I find myself defeated by it.
A HTTP header value takes the general form:
text/html; q=1.0, text/*; q=0.8, image/gif; q=0.6, image/jpeg; q=0.6, image/*; q=0.5, */*; q=0.1
i.e. value *OWS [; *OWS name *OWS [= *OWS possibly_quoted_value] *OWS [...]] *OWS [ , <another value> ...]
so in my mind, this header decodes to:
value[0]:
text/html
params:
name : q
value : 1.0
value[1]:
text/*
params:
name : q
value : 0.8
...
and so on.
I am certain that to anyone who knows how, the boost::spirit::qi syntax for this is trivial.
I humbly ask for your assistance.
for example, here's the outline of the code which decodes the Content-Type header, which is limited to one value of the form type/subtype, with any number of parameters of the form <sp> ; <sp> token=token|quoted_string
template<class Iter>
void parse(ContentType& ct, Iter first, Iter last)
{
ct.mutable_type()->append(to_lower(consume_token(first, last)));
consume_lit(first, last, '/');
ct.mutable_subtype()->append(to_lower(consume_token(first, last)));
while (first != last) {
skipwhite(first, last);
if (consume_char_if(first, last, ';'))
{
auto p = ct.add_parameters();
skipwhite(first, last);
p->set_name(to_lower(consume_token(first, last)));
skipwhite(first, last);
if (consume_char_if(first, last, '='))
{
skipwhite(first, last);
p->set_value(consume_token_or_quoted(first, last));
}
else {
// no value on this parameter
}
}
else if (consume_char_if(first, last, ','))
{
// normally we should get the next value-token here but in the case of Content-Type
// we must barf
throw std::runtime_error("invalid use of ; in Content-Type");
}
}
}
ContentType& populate(ContentType& ct, const std::string& header_value)
{
parse(ct, header_value.begin(), header_value.end());
return ct;
}

OK, after an heroic 24 hours of struggle (well, not really - more like reading the manual over and over again...), I've found a way that works.
I am by no means competent with boost::spirit. If someone out there can improve on this answer, please do post it.
This spirit state machine takes the value of a header (with one, optionally parameterised, value) and turns it into a content_type structure.
My amateur reading of the HTTP standard indicates that some headers have the form (spaces here indicate any amount of white space, values may be quoted or not:
Header-Name: tokena/tokenb [; param1 = "value" [; param2 = value]...]
whereas others have the more general form:
Header-Name: token [; param1 = "value"[; param2 = value]...] [ , token ...]
This code covers the first case - i.e. the HTTP Content-Type header value. I will need to extend it to cater for the Accept header (which can advertise multiple values with parameters) - that will come later.
So here's the code. Please by all means show me how to improve it!!
#define BOOST_SPIRIT_DEBUG
#include <gtest/gtest.h>
#include <boost/spirit/include/qi.hpp>
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_char.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <utility>
#include <vector>
#include <string>
#include <boost/variant.hpp>
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
using unary_parameter = std::string;
struct binary_parameter
{
std::string name;
std::string value;
};
BOOST_FUSION_ADAPT_STRUCT(binary_parameter,
(std::string, name)
(std::string, value))
using parameter = boost::variant<unary_parameter, binary_parameter>;
struct type_subtype
{
std::string type;
std::string subtype;
};
BOOST_FUSION_ADAPT_STRUCT(type_subtype,
(std::string, type)
(std::string, subtype))
using content_type_pair = std::pair<std::string, std::string>;
struct content_type
{
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(content_type,
(type_subtype, type)
(std::vector<parameter>, params))
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using ascii::char_;
using qi::omit;
using qi::eoi;
CR = char_('\r');
LF = char_('\n');
CRLF = CR >> LF;
SP = char_(' ');
HT = char_('\t');
LWS = -CRLF >> +(SP | HT);
UPALPHA = char_('A', 'Z');
LOALPHA = char_('a', 'z');
ALPHA = UPALPHA | LOALPHA;
DIGIT = char_('0', '9');
CTL = char_(0, 31) | char_(127);
QUOT = char_('"');
TEXT = (char_ - CTL) | HT;
separator = char_('(') | ')' | '<' | '>' | '#'
| ',' | ';' | ':' | '\\' | '"'
| '/' | '[' | ']' | '?' | '='
| '{' | '}' | SP | HT;
end_sequence = separator | space;
token = +(char_ - separator);
qdtext = char_ - char_('"') - '\\';
quoted_pair = omit[char_('\\')] >> char_;
quoted_string = omit[char_('"')] >> *(qdtext | quoted_pair) >> omit[char_('"')];
value = quoted_string | token ;
type_subtype_rule = token >> '/' >> token;
name_only = token;
nvp = token >> omit[*SP] >> omit['='] >> omit[*SP] >> value;
any_parameter = omit[*SP] >> omit[char_(';')] >> omit[*SP] >> (nvp | name_only);
content_type_rule = type_subtype_rule >> *any_parameter;
BOOST_SPIRIT_DEBUG_NODES((qdtext)(quoted_pair)(quoted_string)(value)(token)(separator));
}
qi::rule<Iterator, void()> CR, LF, CRLF, SP, HT, LWS, CTL, QUOT;
qi::rule<Iterator, char()> UPALPHA, LOALPHA, ALPHA, DIGIT, TEXT, qdtext, quoted_pair;
qi::rule<Iterator, void()> separator, space, end_sequence;
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
qi::rule<Iterator, unary_parameter()> name_only;
qi::rule<Iterator, binary_parameter()> nvp;
qi::rule<Iterator, parameter()> any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
};
TEST(spirit_test, test1)
{
token_grammar<std::string::const_iterator> grammar{};
std::string test = R"__test(application/json )__test";
content_type ct;
bool r = qi::parse(test.cbegin(), test.cend(), grammar, ct);
EXPECT_EQ("application", ct.type.type);
EXPECT_EQ("json", ct.type.subtype);
EXPECT_EQ(0, ct.params.size());
ct = {};
test = R"__test(text/html ; charset = "ISO-8859-5")__test";
qi::parse(test.cbegin(), test.cend(), grammar, ct);
EXPECT_EQ("text", ct.type.type);
EXPECT_EQ("html", ct.type.subtype);
ASSERT_EQ(1, ct.params.size());
ASSERT_EQ(typeid(binary_parameter), ct.params[0].type());
auto& x = boost::get<binary_parameter>(ct.params[0]);
EXPECT_EQ("charset", x.name);
EXPECT_EQ("ISO-8859-5", x.value);
}

I've taken the code as posted by OP and given it a review.
there's no need to specify void(). In fact it's preferable to use qi::unused_type in such cases, which is what rules will default to if no attribute type is declared.
there no need for char_ if you don't wish to expose the attribute. Use lit instead.
there is no need to wrap every char parser in a rule. That hurts performance. It's best to leave the proto expression tree un-evaluated as long so Qi can optimize parser expressions more, and the compiler can inline more.
Also, Qi doesn't have move semantics on attributes, so avoiding redundant rules eliminates redundant copies of sub-attributes that get concatenated in the containing rules.
Sample alternative spelling (caution, see Assigning parsers to auto variables)
auto CR = qi::lit('\r');
auto LF = qi::lit('\n');
auto CRLF = qi::lit("\r\n");
auto HT = qi::lit('\t');
auto SP = qi::lit(' ');
auto LWS = qi::copy(-CRLF >> +(SP | HT)); // deepcopy
UPALPHA = char_('A', 'Z');
LOALPHA = char_('a', 'z');
ALPHA = UPALPHA | LOALPHA;
DIGIT = char_('0', '9');
//CTL = char_(0, 31) | char_(127);
TEXT = char_("\t\x20-\x7e\x80-\xff");
Since you didn't have to use char_, you also don't have kill the attribute using qi::omit[].
When you are in a Qi domain expression template, raw string/char literals are implicitly wrapped in a qi::lit so, you can simply things like
quoted_pair = omit[char_('\\')] >> char_;
quoted_string = omit[char_('"')] >> *(qdtext | quoted_pair) >> omit[char_('"')];
to just
quoted_pair = '\\' >> char_;
quoted_string = '"' >> *(qdtext | quoted_pair) >> '"';
instead of spelling out skipping spaces with omit[*SP] all the time, just declare the rule with a skipper. Now, you can simplify
nvp = token >> omit[*SP] >> omit['='] >> omit[*SP] >> value;
any_parameter = omit[*SP] >> omit[char_(';')] >> omit[*SP] >> (nvp | name_only);
content_type_rule = type_subtype_rule >> *any_parameter;
to just
nvp = token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces)[*any_parameter];
Note that any subrule invocations of rules that are declared without a skipper are implicitly lexeme: Boost spirit skipper issues
there were many redundant/unused headers
recent compilers + boost versions make BOOST_FUSION_ADAPT_STRUCT much simpler by using decltype
The results of simplifying are much less noisy:
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/include/adapted.hpp>
struct parameter {
boost::optional<std::string> name;
std::string value;
};
struct type_subtype {
std::string type;
std::string subtype;
};
struct content_type {
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(type_subtype, type, subtype)
BOOST_FUSION_ADAPT_STRUCT(content_type, type, params)
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using qi::ascii::char_;
spaces = char_(' ');
token = +~char_( "()<>#,;:\\\"/[]?={} \t");
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
value = quoted_string | token;
type_subtype_rule = token >> '/' >> token;
name_only = token;
nvp = token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces) [*any_parameter];
BOOST_SPIRIT_DEBUG_NODES((nvp)(any_parameter)(content_type_rule)(quoted_string)(token)(value)(type_subtype_rule))
}
private:
using Skipper = qi::space_type;
Skipper spaces;
qi::rule<Iterator, binary_parameter(), Skipper> nvp;
qi::rule<Iterator, parameter(), Skipper> any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
// lexemes
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
qi::rule<Iterator, unary_parameter()> name_only;
};
See it Live On Coliru (with the same test cases)
BONUS
I'd prefer a simpler AST in a case like this. By injecting some attribute values using qi::attr you can avoid using boost::variant and/or even avoid boost::optional:
struct parameter {
bool have_name;
std::string name;
std::string value;
};
struct type_subtype {
std::string type;
std::string subtype;
};
struct content_type {
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(parameter, have_name, name, value)
BOOST_FUSION_ADAPT_STRUCT(type_subtype, type, subtype)
BOOST_FUSION_ADAPT_STRUCT(content_type, type, params)
namespace qi = boost::spirit::qi;
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using qi::ascii::char_;
spaces = char_(' ');
token = +~char_( "()<>#,;:\\\"/[]?={} \t");
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
value = quoted_string | token;
type_subtype_rule = token >> '/' >> token;
name_only = qi::attr(false) >> qi::attr("") >> token;
nvp = qi::attr(true) >> token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces) [*any_parameter];
BOOST_SPIRIT_DEBUG_NODES((nvp)(any_parameter)(content_type_rule)(quoted_string)(token)(value)(type_subtype_rule))
}
private:
using Skipper = qi::space_type;
Skipper spaces;
qi::rule<Iterator, parameter(), Skipper> nvp, name_only, any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
// lexemes
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
};

Related

How to keep space character on breaking down input into a sequence of different parts using Alternative Parser?

I want to write a simple C++ parser which extracts the block hierarchy. I am using this rule:
std::string rest_content;
std::vector<boost::tuple<std::string, std::string>> scopes;
qi::rule<It, qi::ascii::space_type> block = *(
r.comment
| r.scope[push_back(boost::phoenix::ref(scopes), _1)]
| qi::char_[boost::phoenix::ref(rest_content) += _1] // rest
);
qi::phrase_parse(first, last,
block,
ascii::space);
which is supposed to break down code into three parts: comment, scope (code surrounded by "{}") and "rest".
The problem is that all the space characters are removed from the "rest". I need those spaces for later parsing (such as extracting identifiers).
I have tried using qi::skip, qi::lexeme and qi::raw to keep spaces:
// one of many failed attempts
qi::rule<It, qi::ascii::space_type> block = qi::lexeme[*(
qi::skip[r.comment]
| qi::skip[r.scope[push_back(boost::phoenix::ref(scopes), _1)]]
| qi::char_[push_back(boost::phoenix::ref(rest_content), _1)]
)];
but it never works.
So how to keep space characters? Any help is welcome. Thanks.
If you're parsing C++ code this way you may be biting off more than you can chew.
I'll answer, but the answer should show you how limited this approach is going to be. Just imagine parsing through
namespace q::x {
namespace y {
struct A {
template <typename = ns1::c<int>, typename...> struct C;
};
template <typename T, typename... Ts>
struct A::C final : ns2::ns3::base<A::C<T, Ts...>, Ts...> {
int simple = [](...) {
enum class X : unsigned { answer = 42, };
struct {
auto operator()(...) -> decltype(auto) {
return static_cast<int>(X::answer);
}
} iife;
return iife();
}("/* }}} */"); // {{{
};
}
}
and getting it right. And yes. that's valid code.
In fact it's so tricky, that it's easy to make "grown compilers" (GCC) trip: https://wandbox.org/permlink/FzcaSl6tbn18jq4f (Clang has no issue: https://wandbox.org/permlink/wu0mFwQiTOogKB5L).
That said, let me refer to my old explanation of how rule declarations and skippers work together: Boost spirit skipper issues
And show an approximation of what I'd do.
The comments
Actually, the comments should be part of your skipper, so let's make it so:
using SkipRule = qi::rule<It>;
SkipRule comment_only
= "//" >> *~qi::char_("\r\n") >> qi::eol
| "/*" >> *(qi::char_ - "*/") >> "*/"
;
Now for general skipping, we want to include whitespace:
SkipRule comment_or_ws
= qi::space | comment_only;
Now we want to parse types and identifiers:
qi::rule<It, std::string()> type
= ( qi::string("struct")
| qi::string("class")
| qi::string("union")
| qi::string("enum") >> -(*comment_or_ws >> qi::string("class"))
| qi::string("namespace")
)
>> !qi::graph // must be followed by whitespace
;
qi::rule<It, std::string()> identifier =
qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9")
;
I've /guessed/ that struct X { }; would be an example of a "scope" for you, and the tuple would contain ("struct", "X").
As a bonus I used attribute adaption of std::pair and show how to insert into a multimap for good measure later on
qi::rule<It, std::pair<std::string, std::string>()> scope
= qi::skip(comment_or_ws.alias()) [
type >> identifier
>> *~qi::char_(";{") // ignore some stuff like base classes
>> qi::omit["{" >> *~qi::char_("}") >> "}" | ';']
];
Note a big short-coming here is that the first non-commented '}' will "end" the scope. That's not how the language works (see the leading example)
Now we can conclude with an improved "block" rule:
qi::rule<It, SkipRule> block
= *(
scope [px::insert(px::ref(scopes), _1)]
| qi::skip(comment_only.alias()) [
qi::as_string[qi::raw[+(qi::char_ - scope)]] [px::ref(rest_content) += _1]
] // rest
);
Note that
- we override the comment_or_ws skipper with comment_only so we don't drop all whitespace from "rest content"
- inversely, we override the skipper to include whitespace inside the scope rule because otherwise the negative scope invocation (char_ - scope) would do the wrong thing because it wouldn't skip whitespace
Full Demo
Live On Coliru
//#define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted/std_pair.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
namespace qi = boost::spirit::qi;
namespace px = boost::phoenix;
int main() {
using It = std::string::const_iterator;
using namespace qi::labels;
std::string rest_content;
std::multimap<std::string, std::string> scopes;
using SkipRule = qi::rule<It>;
SkipRule comment_only
= "//" >> *~qi::char_("\r\n") >> qi::eol
| "/*" >> *(qi::char_ - "*/") >> "*/"
;
SkipRule comment_or_ws
= qi::space | comment_only;
qi::rule<It, std::string()> type
= ( qi::string("struct")
| qi::string("class")
| qi::string("union")
| qi::string("enum") >> -(*comment_or_ws >> qi::string("class"))
| qi::string("namespace")
)
>> !qi::graph // must be followed by whitespace
;
qi::rule<It, std::string()> identifier =
qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9")
;
qi::rule<It, std::pair<std::string, std::string>()> scope
= qi::skip(comment_or_ws.alias()) [
type >> identifier
>> *~qi::char_(";{") // ignore some stuff like base classes
>> qi::omit["{" >> *~qi::char_("}") >> "}" | ';']
];
qi::rule<It, SkipRule> block
= *(
scope [px::insert(px::ref(scopes), _1)]
| qi::skip(comment_only.alias()) [
qi::as_string[qi::raw[+(qi::char_ - scope)]] [px::ref(rest_content) += _1]
] // rest
);
//BOOST_SPIRIT_DEBUG_NODES((block)(scope)(identifier)(type))
std::string const code = R"(
// some random sample "code"
struct base {
std::vector<int> ints;
};
/* class skipped_comment : base { };
*/
namespace q { namespace nested { } } // nested is not supported
class forward_declared;
template <typename T> // actually basically ignored
class
Derived
: base {
std::string more_data_members;
};
enum class MyEnum : int32_t {
foo = 0,
bar, /* whoop } */
qux = foo + bar
};
int main() {
return 0;
}
)";
qi::phrase_parse(begin(code), end(code), block, comment_or_ws);
for (auto& [k,v] : scopes) {
std::cout << k << ": " << v << "\n";
}
std::cout << "------------------ BEGIN REST_CONTENT -----------------\n";
std::cout << rest_content << "\n";
std::cout << "------------------ END REST_CONENT --------------------\n";
}
Which parses the following sample input:
// some random sample "code"
struct base {
std::vector<int> ints;
};
/* class skipped_comment : base { };
*/
namespace q { namespace nested { } } // nested is not supported
class forward_declared;
template <typename T> // actually basically ignored
class
Derived
: base {
std::string more_data_members;
};
enum class MyEnum : int32_t {
foo = 0,
bar, /* whoop } */
qux = foo + bar
};
int main() {
return 0;
}
Printing
class: forward_declared
class: Derived
enumclass: MyEnum
namespace: q
struct: base
------------------ BEGIN REST_CONTENT -----------------
;}template <typename T>;;
int main() {
return 0;
}
------------------ END REST_CONENT --------------------
Conclusion
This result seems a decent pointer to
explain how to tackle the specific hurdle
demonstrate how this approach to parsing is breaking down at the slightest obstacle (namespace a { namespace b { } } for example)
Caveat Emptor

Cannot get Boost Spirit grammar to use known keys for std::map<>

I seem to be experiencing some mental block with Boost Spirit I just cannot get by. I have a fairly simple grammar I need to handle, where I would like to put the values into a struct, that contains a std::map<> as one of it's members. The key names for the pairs are known up front, so that only those are allowed. There could be one to many keys in the map, in any order with each key name validated via qi.
The grammar looks something like this, as an example.
test .|*|<hostname> add|modify|save ( key [value] key [value] ... ) ;
//
test . add ( a1 ex00
a2 ex01
a3 "ex02,ex03,ex04" );
//
test * modify ( m1 ex10
m2 ex11
m3 "ex12,ex13,ex14"
m4 "abc def ghi" );
//
test 10.0.0.1 clear ( c1
c2
c3 );
In this example the keys for “add” being a1, a2 and a3, likewise for “modify” m1, m2, m3 and m4 and each must contain a value. For “clear” the keys of the map c1, c2 and c3 may not contain a value. Also, let's say for this example you can have up to 10 keys (a1 ... a11, m1 ... m11 and c1 ... c11) any combination of them could be used, in any order, for their corresponding action. Meaning that you cannot use the known key cX for the "add" or mX for "clear"
The structure follows this simple pattern
//
struct test
{
std::string host;
std::string action;
std::map<std::string,std::string> option;
}
So from the above examples, I would expect to have the struct contain ...
// add ...
test.host = .
test.action = add
test.option[0].first = a1
test.option[0].second = ex00
test.option[1].first = a2
test.option[1].second = ex01
test.option[2].first = a3
test.option[2].second = ex02,ex03,ex04
// modify ...
test.host = *
test.action = modify
test.option[0].first = m1
test.option[0].second = ex10
test.option[1].first = m2
test.option[1].second = ex11
test.option[2].first = m3
test.option[2].second = ex12,ex13,ex14
test.option[2].first = m3
test.option[2].second = abc def ghi
// clear ...
test.host = *
test.action = 10.0.0.1
test.option[0].first = c1
test.option[0].second =
test.option[1].first = c2
test.option[1].second =
test.option[2].first = c3
test.option[2].second =
I can get each indivudal part working, standalone, but I cannot seem to them working together. For example I have the host and action working without the map<>.
I’ve adapted a previously posted example from Sehe (here) trying to get this to work (BTW: Sehe has some awesome examples, which I’ve been using as much as the documentation).
Here is an excerpt (obviously not working), but at least shows where I’m trying to go.
namespace ast {
namespace qi = boost::spirit::qi;
//
using unused = qi::unused_type;
//
using string = std::string;
using strings = std::vector<string>;
using list = strings;
using pair = std::pair<string, string>;
using map = std::map<string, string>;
//
struct test
{
using preference = std::map<string,string>;
string host;
string action;
preference option;
};
}
//
BOOST_FUSION_ADAPT_STRUCT( ast::test,
( std::string, host )
( std::string, action ) )
( ast::test::preference, option ) )
//
namespace grammar
{
//
template <typename It>
struct parser
{
//
struct skip : qi::grammar<It>
{
//
skip() : skip::base_type( text )
{
using namespace qi;
// handle all whitespace (" ", \t, ...)
// along with comment lines/blocks
//
// comment blocks: /* ... */
// // ...
// -- ...
// # ...
text = ascii::space
| ( "#" >> *( char_ - eol ) >> ( eoi | eol ) ) // line comment
| ( "--" >> *( char_ - eol ) >> ( eoi | eol ) ) // ...
| ( "//" >> *( char_ - eol ) >> ( eoi | eol ) ) // ...
| ( "/*" >> *( char_ - "*/" ) >> "*/" ); // block comment
//
BOOST_SPIRIT_DEBUG_NODES( ( text ) )
}
//
qi::rule<It> text;
};
//
struct token
{
//
token()
{
using namespace qi;
// common
string = '"' >> *("\\" >> char_ | ~char_('"')) >> '"';
identity = char_("a-zA-Z_") >> *char_("a-zA-Z0-9_");
real = double_;
integer = int_;
//
value = ( string | identity );
// ip target
any = '*';
local = ( char_('.') | fqdn );
fqdn = +char_("a-zA-Z0-9.\\-" ); // consession
ipv4 = +as_string[ octet[ _pass = ( _1 >= 0 && _1 <= 255 ) ] >> '.'
>> octet[ _pass = ( _1 >= 0 && _1 <= 255 ) ] >> '.'
>> octet[ _pass = ( _1 >= 0 && _1 <= 255 ) ] >> '.'
>> octet[ _pass = ( _1 >= 0 && _1 <= 255 ) ] ];
//
target = ( any | local | fqdn | ipv4 );
//
pair = identity >> -( attr( ' ' ) >> value );
map = pair >> *( attr( ' ' ) >> pair );
list = *( value );
//
BOOST_SPIRIT_DEBUG_NODES( ( string )
( identity )
( value )
( real )
( integer )
( any )
( local )
( fqdn )
( ipv4 )
( target )
( pair )
( keyval )
( map )
( list ) )
}
//
qi::rule<It, std::string()> string;
qi::rule<It, std::string()> identity;
qi::rule<It, std::string()> value;
qi::rule<It, double()> real;
qi::rule<It, int()> integer;
qi::uint_parser<unsigned, 10, 1, 3> octet;
qi::rule<It, std::string()> any;
qi::rule<It, std::string()> local;
qi::rule<It, std::string()> fqdn;
qi::rule<It, std::string()> ipv4;
qi::rule<It, std::string()> target;
//
qi::rule<It, ast::map()> map;
qi::rule<It, ast::pair()> pair;
qi::rule<It, ast::pair()> keyval;
qi::rule<It, ast::list()> list;
};
//
struct test : token, qi::grammar<It, ast::test(), skip>
{
//
test() : test::base_type( command_ )
{
using namespace qi;
using namespace qr;
auto kw = qr::distinct( copy( char_( "a-zA-Z0-9_" ) ) );
// not sure how to enforce the "key" names!
key_ = *( '(' >> *value >> ')' );
// tried using token::map ... didn't work ...
//
add_ = ( ( "add" >> attr( ' ' ) ) [ _val = "add" ] );
modify_ = ( ( "modify" >> attr( ' ' ) ) [ _val = "modify" ] );
clear_ = ( ( "clear" >> attr( ' ' ) ) [ _val = "clear" ] );
//
action_ = ( add_ | modify_ | clear_ );
/* *** can't get from A to B here ... not sure what to do *** */
//
command_ = kw[ "test" ]
>> target
>> action_
>> ';';
BOOST_SPIRIT_DEBUG_NODES( ( command_ )
( action_ )
( add_ )
( modify_ )
( clear_ ) )
}
//
private:
//
using token::value;
using token::target;
using token::map;
qi::rule<It, ast::test(), skip> command_;
qi::rule<It, std::string(), skip> action_;
//
qi::rule<It, std::string(), skip> add_;
qi::rule<It, std::string(), skip> modify_;
qi::rule<It, std::string(), skip> clear_;
};
...
};
}
I hope this question isn't too ambiguous and if you need a working example of the problem, I can certainly provide that. Any and all help is greatly appreciated, so thank you in advance!
Notes:
with this
add_ = ( ( "add" >> attr( ' ' ) ) [ _val = "add" ] );
modify_ = ( ( "modify" >> attr( ' ' ) ) [ _val = "modify" ] );
clear_ = ( ( "clear" >> attr( ' ' ) ) [ _val = "clear" ] );
did you mean to require a space? Or are you really just trying to force the struct action field to contain a trailing space (that's what will happen).
If you meant the latter, I'd do that outside of the parser¹.
If you wanted the first, use the kw facility:
add_ = kw["add"] [ _val = "add" ];
modify_ = kw["modify"] [ _val = "modify" ];
clear_ = kw["clear"] [ _val = "clear" ];
In fact, you can simplify that (again, ¹):
add_ = raw[ kw["add"] ];
modify_ = raw[ kw["modify"] ];
clear_ = raw[ kw["clear"] ];
Which also means that you can simplify to
action_ = raw[ kw[lit("add")|"modify"|"clear"] ];
However, getting a bit close to your question, you could also use a symbol parser:
symbols<char> action_sym;
action_sym += "add", "modify", "clear";
//
action_ = raw[ kw[action_sym] ];
Caveat: the symbols needs to be a member so its lifetime extends beyond the constructor.
If you meant to capture the input representation of ipv4 addresses with
ipv4 = +as_string[ octet[ _pass = ( _1 >= 0 && _1 <= 255 ) ] >> '.'
>> octet[ _pass = ( _1 >= 0 && _1 <= 255 ) ] >> '.'
>> octet[ _pass = ( _1 >= 0 && _1 <= 255 ) ] >> '.'
>> octet[ _pass = ( _1 >= 0 && _1 <= 255 ) ] ];
Side note I'm assuming +as_string is a simple mistake and you meant as_string instead.
Simplify:
qi::uint_parser<uint8_t, 10, 1, 3> octet;
This obviates the range checks (see ¹ again):
ipv4 = as_string[ octet >> '.' >> octet >> '.' >> octet >> '.' >> octet ];
However, this would build a 4-char binary string representation of the address. If you wanted that, fine. I doubt it (because you'd have written std::array<uint8_t, 4> or uint64_t, right?). So if you wanted the string, again use raw[]:
ipv4 = raw[ octet >> '.' >> octet >> '.' >> octet >> '.' >> octet ];
Same issue as with number 1.:
pair = identity >> -( attr(' ') >> value );
This time, the problem betrays that the productions should not be in token; Conceptually token-izing precedes parsing and hence I'd keep the tokens skipper-less. kw doesn't really do a lot of good in that context. Instead, I'd move pair, map and list (unused?) into the parser:
pair = kw[identity] >> -value;
map = +pair;
list = *value;
Some examples
There's a very recent example I made about using symbols to parse (here), but this answer comes a lot closer to your question:
How to provider user with autocomplete suggestions for given boost::spirit grammar?
It goes far beyond the scope of your parser because it does all kinds of actions in the grammar, but what it does show is to have generic "lookup-ish" rules that can be parameterized with a particular "symbol set": see the Identifier Lookup section of the answer:
Identifier Lookup
We store "symbol tables" in Domain members _variables and
_functions:
using Domain = qi::symbols<char>; Domain _variables, _functions;
Then we declare some rules that can do lookups on either of them:
// domain identifier lookups
qi::_r1_type _domain;
qi::rule<It, Ast::Identifier(Domain const&)> maybe_known, known,
unknown;
The corresponding declarations will be shown shortly.
Variables are pretty simple:
variable = maybe_known(phx::ref(_variables));
Calls are trickier. If a name is unknown we don't want to assume it
implies a function unless it's followed by a '(' character.
However, if an identifier is a known function name, we want even to
imply the ( (this gives the UX the appearance of autocompletion
where when the user types sqrt, it suggests the next character to be
( magically).
// The heuristics: // - an unknown identifier followed by (
// - an unclosed argument list implies ) call %= (
known(phx::ref(_functions)) // known -> imply the parens
| &(identifier >> '(') >> unknown(phx::ref(_functions))
) >> implied('(') >> -(expression % ',') >> implied(')');
It all builds on known, unknown and maybe_known:
///////////////////////////////
// identifier loopkup, suggesting
{
maybe_known = known(_domain) | unknown(_domain);
// distinct to avoid partially-matching identifiers
using boost::spirit::repository::qi::distinct;
auto kw = distinct(copy(alnum | '_'));
known = raw[kw[lazy(_domain)]];
unknown = raw[identifier[_val=_1]] [suggest_for(_1, _domain)];
}
I think you can use the same approach constructively here. One additional gimmick could be to validate that properties supplied are, in fact, unique.
Demo Work
Combining all the hints above makes it compile and "parse" the test commands:
Live On Coliru
#include <string>
#include <map>
#include <vector>
namespace ast {
//
using string = std::string;
using strings = std::vector<string>;
using list = strings;
using pair = std::pair<string, string>;
using map = std::map<string, string>;
//
struct command {
string host;
string action;
map option;
};
}
#include <boost/fusion/adapted.hpp>
BOOST_FUSION_ADAPT_STRUCT(ast::command, host, action, option)
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/repository/include/qi_distinct.hpp>
namespace grammar
{
namespace qi = boost::spirit::qi;
namespace qr = boost::spirit::repository::qi;
template <typename It>
struct parser
{
struct skip : qi::grammar<It> {
skip() : skip::base_type(text) {
using namespace qi;
// handle all whitespace along with line/block comments
text = ascii::space
| (lit("#")|"--"|"//") >> *(char_ - eol) >> (eoi | eol) // line comment
| "/*" >> *(char_ - "*/") >> "*/"; // block comment
//
BOOST_SPIRIT_DEBUG_NODES((text))
}
private:
qi::rule<It> text;
};
//
struct token {
//
token() {
using namespace qi;
// common
string = '"' >> *("\\" >> char_ | ~char_('"')) >> '"';
identity = char_("a-zA-Z_") >> *char_("a-zA-Z0-9_");
value = string | identity;
// ip target
any = '*';
local = '.' | fqdn;
fqdn = +char_("a-zA-Z0-9.\\-"); // concession
ipv4 = raw [ octet >> '.' >> octet >> '.' >> octet >> '.' >> octet ];
//
target = any | local | fqdn | ipv4;
//
BOOST_SPIRIT_DEBUG_NODES(
(string) (identity) (value)
(any) (local) (fqdn) (ipv4) (target)
)
}
protected:
//
qi::rule<It, std::string()> string;
qi::rule<It, std::string()> identity;
qi::rule<It, std::string()> value;
qi::uint_parser<uint8_t, 10, 1, 3> octet;
qi::rule<It, std::string()> any;
qi::rule<It, std::string()> local;
qi::rule<It, std::string()> fqdn;
qi::rule<It, std::string()> ipv4;
qi::rule<It, std::string()> target;
};
//
struct test : token, qi::grammar<It, ast::command(), skip> {
//
test() : test::base_type(command_)
{
using namespace qi;
auto kw = qr::distinct( copy( char_( "a-zA-Z0-9_" ) ) );
//
action_sym += "add", "modify", "clear";
action_ = raw[ kw[action_sym] ];
//
command_ = kw["test"]
>> target
>> action_
>> '(' >> map >> ')'
>> ';';
//
pair = kw[identity] >> -value;
map = +pair;
list = *value;
BOOST_SPIRIT_DEBUG_NODES(
(command_) (action_)
(pair) (map) (list)
)
}
private:
using token::target;
using token::identity;
using token::value;
qi::symbols<char> action_sym;
//
qi::rule<It, ast::command(), skip> command_;
qi::rule<It, std::string(), skip> action_;
//
qi::rule<It, ast::map(), skip> map;
qi::rule<It, ast::pair(), skip> pair;
qi::rule<It, ast::list(), skip> list;
};
};
}
#include <fstream>
int main() {
using It = boost::spirit::istream_iterator;
using Parser = grammar::parser<It>;
std::ifstream input("input.txt");
It f(input >> std::noskipws), l;
Parser::skip const s{};
Parser::test const p{};
std::vector<ast::command> data;
bool ok = phrase_parse(f, l, *p, s, data);
if (ok) {
std::cout << "Parsed " << data.size() << " commands\n";
} else {
std::cout << "Parsed failed\n";
}
if (f != l) {
std::cout << "Remaining unparsed input: '" << std::string(f,l) << "'\n";
}
}
Prints
Parsed 3 commands
Let's restrict the Keys
Like in the linked answer above, let's pass the map, pair rules the actual key set to get their allowed values from:
using KeySet = qi::symbols<char>;
using KeyRef = KeySet const*;
//
KeySet add_keys, modify_keys, clear_keys;
qi::symbols<char, KeyRef> action_sym;
qi::rule<It, ast::pair(KeyRef), skip> pair;
qi::rule<It, ast::map(KeyRef), skip> map;
Note A key feature used is the associated attribute value with a symbols<> lookup (in this case we associate a KeyRef with an action symbol):
//
add_keys += "a1", "a2", "a3", "a4", "a5", "a6";
modify_keys += "m1", "m2", "m3", "m4";
clear_keys += "c1", "c2", "c3", "c4", "c5";
action_sym.add
("add", &add_keys)
("modify", &modify_keys)
("clear", &clear_keys);
Now the heavy lifting starts.
Using qi::locals<> and inherited attributes
Let's give command_ some local space to store the selected keyset:
qi::rule<It, ast::command(), skip, qi::locals<KeyRef> > command_;
Now we can in principle assignt to it (using the _a placeholder). However, there's some details:
//
qi::_a_type selected;
Always prefer descriptive names :) _a and _r1 get old pretty quick. Things are confusing enough as it is.
command_ %= kw["test"]
>> target
>> raw[ kw[action_sym] [ selected = _1 ] ]
>> '(' >> map(selected) >> ')'
>> ';';
Note: the subtlest detail here is %= instead of = to avoid the suppression of automatic attribute propagation when a semantic action is present (yeah, see ¹ again...)
But all in all, that doesn't read so bad?
//
qi::_r1_type symref;
pair = raw[ kw[lazy(*symref)] ] >> -value;
map = +pair(symref);
And now at least things parse
Almost there
Live On Coliru
//#define BOOST_SPIRIT_DEBUG
#include <string>
#include <map>
#include <vector>
namespace ast {
//
using string = std::string;
using strings = std::vector<string>;
using list = strings;
using pair = std::pair<string, string>;
using map = std::map<string, string>;
//
struct command {
string host;
string action;
map option;
};
}
#include <boost/fusion/adapted.hpp>
BOOST_FUSION_ADAPT_STRUCT(ast::command, host, action, option)
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/repository/include/qi_distinct.hpp>
namespace grammar
{
namespace qi = boost::spirit::qi;
namespace qr = boost::spirit::repository::qi;
template <typename It>
struct parser
{
struct skip : qi::grammar<It> {
skip() : skip::base_type(rule_) {
using namespace qi;
// handle all whitespace along with line/block comments
rule_ = ascii::space
| (lit("#")|"--"|"//") >> *(char_ - eol) >> (eoi | eol) // line comment
| "/*" >> *(char_ - "*/") >> "*/"; // block comment
//
//BOOST_SPIRIT_DEBUG_NODES((skipper))
}
private:
qi::rule<It> rule_;
};
//
struct token {
//
token() {
using namespace qi;
// common
string = '"' >> *("\\" >> char_ | ~char_('"')) >> '"';
identity = char_("a-zA-Z_") >> *char_("a-zA-Z0-9_");
value = string | identity;
// ip target
any = '*';
local = '.' | fqdn;
fqdn = +char_("a-zA-Z0-9.\\-"); // concession
ipv4 = raw [ octet >> '.' >> octet >> '.' >> octet >> '.' >> octet ];
//
target = any | local | fqdn | ipv4;
//
BOOST_SPIRIT_DEBUG_NODES(
(string) (identity) (value)
(any) (local) (fqdn) (ipv4) (target)
)
}
protected:
//
qi::rule<It, std::string()> string;
qi::rule<It, std::string()> identity;
qi::rule<It, std::string()> value;
qi::uint_parser<uint8_t, 10, 1, 3> octet;
qi::rule<It, std::string()> any;
qi::rule<It, std::string()> local;
qi::rule<It, std::string()> fqdn;
qi::rule<It, std::string()> ipv4;
qi::rule<It, std::string()> target;
};
//
struct test : token, qi::grammar<It, ast::command(), skip> {
//
test() : test::base_type(start_)
{
using namespace qi;
auto kw = qr::distinct( copy( char_( "a-zA-Z0-9_" ) ) );
//
add_keys += "a1", "a2", "a3", "a4", "a5", "a6";
modify_keys += "m1", "m2", "m3", "m4";
clear_keys += "c1", "c2", "c3", "c4", "c5";
action_sym.add
("add", &add_keys)
("modify", &modify_keys)
("clear", &clear_keys);
//
qi::_a_type selected;
command_ %= kw["test"]
>> target
>> raw[ kw[action_sym] [ selected = _1 ] ]
>> '(' >> map(selected) >> ')'
>> ';';
//
qi::_r1_type symref;
pair = raw[ kw[lazy(*symref)] ] >> -value;
map = +pair(symref);
list = *value;
start_ = command_;
BOOST_SPIRIT_DEBUG_NODES(
(start_) (command_)
(pair) (map) (list)
)
}
private:
using token::target;
using token::identity;
using token::value;
using KeySet = qi::symbols<char>;
using KeyRef = KeySet const*;
//
qi::rule<It, ast::command(), skip> start_;
qi::rule<It, ast::command(), skip, qi::locals<KeyRef> > command_;
//
KeySet add_keys, modify_keys, clear_keys;
qi::symbols<char, KeyRef> action_sym;
qi::rule<It, ast::pair(KeyRef), skip> pair;
qi::rule<It, ast::map(KeyRef), skip> map;
qi::rule<It, ast::list(), skip> list;
};
};
}
#include <fstream>
int main() {
using It = boost::spirit::istream_iterator;
using Parser = grammar::parser<It>;
std::ifstream input("input.txt");
It f(input >> std::noskipws), l;
Parser::skip const s{};
Parser::test const p{};
std::vector<ast::command> data;
bool ok = phrase_parse(f, l, *p, s, data);
if (ok) {
std::cout << "Parsed " << data.size() << " commands\n";
} else {
std::cout << "Parsed failed\n";
}
if (f != l) {
std::cout << "Remaining unparsed input: '" << std::string(f,l) << "'\n";
}
}
Prints
Parsed 3 commands
HOLD ON, NOT SO FAST! It's wrong
Yeah. If you enable debug, you'll see it parses things oddly:
<attributes>[[[1, 0, ., 0, ., 0, ., 1], [c, l, e, a, r], [[[c, 1], [c, 2]], [[c, 3], []]]]]</attributes>
This is actually "merely" a problem with the grammar. If the grammar cannot see the difference between a key and value then obviously c2 is going to be parsed as the value of property with key c1.
It's up to you to disambiguate the grammar. For now, I'm going to demonstrate a fix using a negative assertion: we only accept values that are not known keys. It's a bit dirty, but might be useful to you for instructional purposes:
key = raw[ kw[lazy(*symref)] ];
pair = key(symref) >> -(!key(symref) >> value);
map = +pair(symref);
Note I factored out the key rule for readability:
Live On Coliru
Parses
<attributes>[[[1, 0, ., 0, ., 0, ., 1], [c, l, e, a, r], [[[c, 1], []], [[c, 2], []], [[c, 3], []]]]]</attributes>
Just what the doctor ordered!
¹ Boost Spirit: "Semantic actions are evil"?

boost spirit - improving error output

This question leads on from its predecessor here: decoding an http header value
The Question:
In my test assertion failure, I am printing out the following contents of error_message:
Error! Expecting <alternative><media_type_no_parameters><media_type_with_parameters> in header value: "text/html garbage ; charset = \"ISO-8859-5\"" at position: 0
Which is unhelpful...
What is the correct way to get a nice syntax error that says:
Error! token_pair has invalid syntax here:
text/html garbage ; charset = "ISO-8859-5"
^ must be eoi or separator of type ;
Background:
HTTP Content-Type in a request has the following form:
type/subtype *( ; param[=param_value]) <eoi>
Where type and subtype many not be quoted or be separated by spaces, param is not quoted, and param_value is both optional and optionally quoted.
Other than between type/subtype spaces or horizontal tabs may be used as white space. There may also be space before type/subtype.
For now I am ignoring the possibility of HTTP line breaks or comments as I understand that they are deprecated.
Summary:
There shall be one type, one subtype and zero or more parameters. type and subtype are HTTP tokens, which is to say that they may not contain delimiters ("/\[]<>,; and so on) or spaces.
Thus, the following header is legal:
text/html ; charset = "ISO-8859-5"
And the following header is illegal:
text/html garbage ; charset = "ISO-8859-5"
^^^^^^^ illegal - must be either ; or <eoi>
The code I am using to parse this (seemingly simple, but actually quite devious) protocol component is below.
Code
My code, adapted from sehe's fantasic answer here
(warning, prerequisites are google test and boost)
//#define BOOST_SPIRIT_DEBUG
#include <boost/config/warning_disable.hpp>
#include <gtest/gtest.h>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/include/adapted.hpp>
#include <utility>
#include <vector>
#include <string>
#include <iostream>
using token_pair = std::pair<std::string, std::string>;
struct parameter {
std::string name;
std::string value;
bool has_value;
};
struct media_type {
token_pair type_subtype;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(parameter, name, value, has_value)
BOOST_FUSION_ADAPT_STRUCT(media_type, type_subtype, params)
namespace qi = boost::spirit::qi;
namespace phoenix = boost::phoenix;
using namespace std::literals;
template<class Iterator>
struct components
{
components()
{
using qi::ascii::char_;
spaces = char_(" \t");
token = +~char_( "()<>#,;:\\\"/[]?={} \t");
token_pair_rule = token >> '/' >> token;
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
value = quoted_string | token;
name_only = token >> qi::attr("") >> qi::attr(false);
nvp = token >> '=' >> value >> qi::attr(true);
any_parameter = ';' >> (nvp | name_only);
some_parameters = +any_parameter;
parameters = *any_parameter;
qi::on_error<qi::fail>(
token,
this->report_error(qi::_1, qi::_2, qi::_3, qi::_4)
);
BOOST_SPIRIT_DEBUG_NODES((token)
(quoted_string)
(value)
(name_only)
(nvp)
(any_parameter)
(parameters)
)
}
protected:
using Skipper = qi::space_type;
Skipper spaces;
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, parameter(), Skipper> nvp, name_only, any_parameter;
qi::rule<Iterator, std::vector<parameter>(), Skipper> parameters, some_parameters;
qi::rule<Iterator, token_pair()> token_pair_rule;
public:
std::string error_message;
protected:
struct ReportError {
// the result type must be explicit for Phoenix
template<typename, typename, typename, typename>
struct result { typedef void type; };
ReportError(std::string& error_message)
: error_message(error_message) {}
// contract the string to the surrounding new-line characters
template<typename Iter>
void operator()(Iter first, Iter last,
Iter error, const qi::info& what) const
{
using namespace std::string_literals;
std::ostringstream ss;
ss << "Error! Expecting "
<< what
<< " in header value: " << std::quoted(std::string(first, last))
<< " at position: " << error - first;
error_message = ss.str();
}
std::string& error_message;
};
const phoenix::function<ReportError> report_error = ReportError(error_message);
};
template<class Iterator>
struct token_grammar
: components<Iterator>
, qi::grammar<Iterator, media_type()>
{
token_grammar() : token_grammar::base_type(media_type_rule)
{
media_type_with_parameters = token_pair_rule >> qi::skip(spaces)[some_parameters];
media_type_no_parameters = token_pair_rule >> qi::attr(std::vector<parameter>()) >> qi::skip(spaces)[qi::eoi];
media_type_rule = qi::eps > (qi::hold[media_type_no_parameters]
| qi::hold[media_type_with_parameters]);
BOOST_SPIRIT_DEBUG_NODES((media_type_with_parameters)
(media_type_no_parameters)
(media_type_rule))
qi::on_error<qi::fail>(
media_type_rule,
this->report_error(qi::_1, qi::_2, qi::_3, qi::_4)
);
}
private:
using Skipper = typename token_grammar::components::Skipper;
using token_grammar::components::spaces;
using token_grammar::components::token;
using token_grammar::components::token_pair_rule;
using token_grammar::components::value;
using token_grammar::components::any_parameter;
using token_grammar::components::parameters;
using token_grammar::components::some_parameters;
public:
qi::rule<Iterator, media_type()> media_type_no_parameters, media_type_with_parameters, media_type_rule;
};
TEST(spirit_test, test1)
{
token_grammar<std::string::const_iterator> grammar{};
auto test = R"__test(application/json )__test"s;
auto ct = media_type {};
bool r = parse(test.cbegin(), test.cend(), grammar, ct);
EXPECT_EQ("application", ct.type_subtype.first);
EXPECT_EQ("json", ct.type_subtype.second);
EXPECT_EQ(0, ct.params.size());
ct = {};
test = R"__test(text/html ; charset = "ISO-8859-5")__test"s;
parse(test.cbegin(), test.cend(), grammar, ct);
EXPECT_EQ("text", ct.type_subtype.first);
EXPECT_EQ("html", ct.type_subtype.second);
ASSERT_EQ(1, ct.params.size());
EXPECT_TRUE(ct.params[0].has_value);
EXPECT_EQ("charset", ct.params[0].name);
EXPECT_EQ("ISO-8859-5", ct.params[0].value);
auto mt = media_type {};
parse(test.cbegin(), test.cend(), grammar.media_type_rule, mt);
EXPECT_EQ("text", mt.type_subtype.first);
EXPECT_EQ("html", mt.type_subtype.second);
EXPECT_EQ(1, mt.params.size());
//
// Introduce a failure case
//
mt = media_type {};
test = R"__test(text/html garbage ; charset = "ISO-8859-5")__test"s;
r = parse(test.cbegin(), test.cend(), grammar.media_type_rule, mt);
EXPECT_FALSE(r);
EXPECT_EQ("", grammar.error_message);
}

boost spirit parsing CSV with columns in variable order

I'm trying to parse a CSV file (with header line) using boost spirit.
The csv is not in a constant format. Sometimes there is some extra column or the order of the column is mixed. I'm interested in few columns, whose header name is well known.
For instance my CSV may look like:
Name,Surname,Age
John,Doe,32
Or:
Age,Name
32,John
I want to parse only the content of Name and Age (N.B. Age is integer type). At the moment i come out with a very ugly solution where Spirit parses the first line and creates a vector that contains an enum in the positions i'm interested into. And then i have to do the parsing of the terminal symbols by hand...
enum LineItems {
NAME, AGE, UNUSED
};
struct CsvLine {
string name;
int age;
};
using Column = std::string;
using CsvFile = std::vector<CsvLine>;
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
CsvGrammar() :
CsvGrammar::base_type(start) {
using namespace qi;
static const char colsep = ',';
start = qi::omit[header[qi::_a = qi::_1]] >> eol >> line(_a) % eol;
header = (lit("Name")[phx::push_back(phx::ref(qi::_val), LineItems::NAME)]
| lit("Age")[phx::push_back(phx::ref(qi::_val), LineItems::AGE)]
| column[phx::push_back(phx::ref(qi::_val), LineItems::UNUSED)]) % colsep;
line = (column % colsep)[phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1,
qi::_val)];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
}
void convertFunc(std::vector<string>& columns, std::vector<LineItems>& positions, CsvLine &csvLine) {
//terminal symbol parsing here, and assign to csvLine struct.
...
}
private:
qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
qi::rule<It, std::vector<LineItems>(), qi::blank_type> header;
qi::rule<It, CsvLine(std::vector<LineItems>), qi::blank_type> line;
qi::rule<It, Column(), qi::blank_type> column;
qi::rule<It, std::string()> quoted;
qi::rule<It, qi::blank_type> empty;
};
Here is the full source.
What if the header parser could prepare a vector<rule<...>*> and the "line parser" just use this vector to parse itself? a sort of advanced nabialek trick (i've been trying but i couldn't make it).
Or is there any better way to parse this kind of CSV with Spirit?
(any help is appreciated, thank you in advance)
I'd go with the concept that you have,
I think it's plenty elegant (the qi locals even allow reentrant use of this).
To reduce the cruft in the rules (Boost Spirit: "Semantic actions are evil"?) you could move the "conversion function" off into attribute transformation customization points.
Oops. As commented that was too simple. However, you can still reduce the cruftiness quite a bit. With two simple tweaks, the grammar reads:
item.add("Name", NAME)("Age", AGE);
start = omit[ header[_a=_1] ] >> eol >> line(_a) % eol;
header = (item | omit[column] >> attr(UNUSED)) % colsep;
line = (column % colsep) [convert];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
The tweaks:
using qi::symbols to map from header to LineItem
using a raw semantinc action ([convert]) which directly access the context (see boost spirit semantic action parameters):
struct final {
using Ctx = typename decltype(line)::context_type;
void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
auto& csvLine = boost::fusion::at_c<0>(ctx.attributes);
auto& positions = boost::fusion::at_c<1>(ctx.attributes);
int i =0;
for (LineItems position : positions) {
switch (position) {
case NAME: csvLine.name = columns[i]; break;
case AGE: csvLine.age = atoi(columns[i].c_str()); break;
default: break;
}
i++;
}
pass = true; // returning false fails the `line` rule
}
} convert;
Arguably the upshot is akin to doing auto convert = phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1, qi::_val) but using auto with Proto/Phoenix/Spirit expressions is notoriously error prone (UB due to dangling refs to temporaries from the expression template), so I'd certainly prefer the way shown above.
Live On Coliru
//#define BOOST_SPIRIT_DEBUG
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <iostream>
#include <boost/fusion/include/at_c.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <string>
#include <vector>
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
using std::string;
enum LineItems { NAME, AGE, UNUSED };
struct CsvLine {
string name;
int age;
};
using Column = std::string;
using Columns = std::vector<Column>;
using CsvFile = std::vector<CsvLine>;
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
CsvGrammar() : CsvGrammar::base_type(start) {
using namespace qi;
static const char colsep = ',';
item.add("Name", NAME)("Age", AGE);
start = qi::omit[ header[_a=_1] ] >> eol >> line(_a) % eol;
header = (item | omit[column] >> attr(UNUSED)) % colsep;
line = (column % colsep) [convert];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
BOOST_SPIRIT_DEBUG_NODES((header)(column)(quoted));
}
private:
qi::rule<It, std::vector<LineItems>(), qi::blank_type> header;
qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
qi::rule<It, CsvLine(std::vector<LineItems> const&), qi::blank_type> line;
qi::rule<It, Column(), qi::blank_type> column;
qi::rule<It, std::string()> quoted;
qi::rule<It, qi::blank_type> empty;
qi::symbols<char, LineItems> item;
struct final {
using Ctx = typename decltype(line)::context_type;
void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
auto& csvLine = boost::fusion::at_c<0>(ctx.attributes);
auto& positions = boost::fusion::at_c<1>(ctx.attributes);
int i =0;
for (LineItems position : positions) {
switch (position) {
case NAME: csvLine.name = columns[i]; break;
case AGE: csvLine.age = atoi(columns[i].c_str()); break;
default: break;
}
i++;
}
pass = true; // returning false fails the `line` rule
}
} convert;
};
int main() {
const std::string s = "Surname,Name,Age,\nJohn,Doe,32\nMark,Smith,43";
auto f(begin(s)), l(end(s));
CsvGrammar<std::string::const_iterator> p;
CsvFile parsed;
bool ok = qi::phrase_parse(f, l, p, qi::blank, parsed);
if (ok) {
for (CsvLine line : parsed) {
std::cout << '[' << line.name << ']' << '[' << line.age << ']';
std::cout << std::endl;
}
} else {
std::cout << "Parse failed\n";
}
if (f != l)
std::cout << "Remaining unparsed: '" << std::string(f, l) << "'\n";
}
Prints
[Doe][32]
[Smith][43]

Composite grammar in Boost::Spirit

I have the following grammar which works as expected.
struct query_term {
std::string term;
bool is_tag;
query_term(const std::string &a, bool tag = false): term(a), is_tag(tag) { } };
template<typename Iterator> struct query_grammar: grammar<Iterator, std::vector<query_term>(), space_type> {
query_grammar():
query_grammar::base_type(query) {
word %= +alnum;
tag = (omit[word >> ':'] >> word[_val = phoenix::construct<query_term>(_1, true)]);
non_tag = word[_val = phoenix::construct<query_term>(_1, false)];
query = (
(omit[word >> ':'] >> word[push_back(_val, phoenix::construct<query_term>(_1, true))])
|
word[push_back(_val,
phoenix::construct<query_term>(_1))
]
) % space;
};
qi::rule<Iterator, std::string(), space_type> word;
qi::rule<Iterator, query_term, space_type> tag;
qi::rule<Iterator, query_term, space_type> non_tag;
qi::rule<Iterator, std::vector<query_term>(), space_type> query; };
But when I replace query with
query = (
tag[phoenix::push_back(_val, _1)]
|
word[push_back(_val,
phoenix::construct<query_term>(_1))
]
) % space;
code doesn't compile. Basically I am trying to split the grammar into components that can be reused within larger grammar. When a word or tag is parsed, create a query_term object with appropriate flag in tag and word rule. Re-use these attributes in query rule.
In the previous version, tag and word rules are inlined in the query grammar.
I am not sure what I am missing here. Any help would be greatly appreciated.
FYI: This is not the final code. I am trying out the rules before using it in production code.
Thanx,
-- baliga
The real issue is that you define the attribute for the tag/non_tag rules as query_term (instead of query_term()).
Some minor issues appear to be:
using word instead of non_tag (exposes a std::string which doesn't convert to a query_type)
using % space with a space skipper doesn't really make sense
you probably wanted lexeme in the word rule because otherwise, it will just keep 'eating' chars regardless of whitespace
Other suggestions:
avoid excess scope of using namespace (or avoid it completely). You will run into hard-to-spot or hard-to-fix conflicts (e.g. boost::cref vs. std::cref, std::string vs. qi::string etc).
try to stay low on the Phoenix usage. In this case, I think you'd be far easier off using qi::attr with an adapted struct.
use BOOST_SPIRIT_DEBUG_* macros to get insight in your parser
Here is the entire grammar, the way I'd suggest it:
template<typename Iterator> struct query_grammar: qi::grammar<Iterator, std::vector<query_term>(), qi::space_type>
{
query_grammar() : query_grammar::base_type(query)
{
using namespace qi;
word = lexeme[ +alnum ];
tag = omit[word >> ':'] >> word >> attr(true);
non_tag = word >> attr(false);
query = *(tag | non_tag);
};
qi::rule<Iterator, std::string() , qi::space_type> word;
qi::rule<Iterator, query_term() , qi::space_type> tag, non_tag;
qi::rule<Iterator, std::vector<query_term>(), qi::space_type> query;
};
A fully working example with output (trivially onelined using karma):
// #define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/karma.hpp>
namespace qi = boost::spirit::qi;
namespace karma = boost::spirit::karma;
struct query_term {
std::string term;
bool is_tag;
};
BOOST_FUSION_ADAPT_STRUCT(query_term, (std::string,term)(bool,is_tag));
template<typename Iterator> struct query_grammar: qi::grammar<Iterator, std::vector<query_term>(), qi::space_type>
{
query_grammar() : query_grammar::base_type(query)
{
using namespace qi;
word = lexeme[ +alnum ];
tag = omit[word >> ':'] >> word >> attr(true);
non_tag = word >> attr(false);
query = *(tag | non_tag);
BOOST_SPIRIT_DEBUG_NODE(word);
BOOST_SPIRIT_DEBUG_NODE(tag);
BOOST_SPIRIT_DEBUG_NODE(non_tag);
BOOST_SPIRIT_DEBUG_NODE(query);
};
qi::rule<Iterator, std::string() , qi::space_type> word;
qi::rule<Iterator, query_term() , qi::space_type> tag, non_tag;
qi::rule<Iterator, std::vector<query_term>(), qi::space_type> query;
};
int main()
{
const std::string input = "apple tag:beer banana grape";
typedef std::string::const_iterator It;
query_grammar<It> parser;
std::vector<query_term> data;
It f(input.begin()), l(input.end());
bool ok = qi::phrase_parse(f, l, parser, qi::space, data);
if (ok)
std::cout << karma::format(karma::delimit [ karma::auto_ ] % karma::eol, data) << '\n';
if (f!=l)
std::cerr << "Unparsed: '" << std::string(f,l) << "'\n";
return ok? 0 : 255;
}
Output:
apple false
beer true
banana false
grape false