I have the following grammar which works as expected.
struct query_term {
std::string term;
bool is_tag;
query_term(const std::string &a, bool tag = false): term(a), is_tag(tag) { } };
template<typename Iterator> struct query_grammar: grammar<Iterator, std::vector<query_term>(), space_type> {
query_grammar():
query_grammar::base_type(query) {
word %= +alnum;
tag = (omit[word >> ':'] >> word[_val = phoenix::construct<query_term>(_1, true)]);
non_tag = word[_val = phoenix::construct<query_term>(_1, false)];
query = (
(omit[word >> ':'] >> word[push_back(_val, phoenix::construct<query_term>(_1, true))])
|
word[push_back(_val,
phoenix::construct<query_term>(_1))
]
) % space;
};
qi::rule<Iterator, std::string(), space_type> word;
qi::rule<Iterator, query_term, space_type> tag;
qi::rule<Iterator, query_term, space_type> non_tag;
qi::rule<Iterator, std::vector<query_term>(), space_type> query; };
But when I replace query with
query = (
tag[phoenix::push_back(_val, _1)]
|
word[push_back(_val,
phoenix::construct<query_term>(_1))
]
) % space;
code doesn't compile. Basically I am trying to split the grammar into components that can be reused within larger grammar. When a word or tag is parsed, create a query_term object with appropriate flag in tag and word rule. Re-use these attributes in query rule.
In the previous version, tag and word rules are inlined in the query grammar.
I am not sure what I am missing here. Any help would be greatly appreciated.
FYI: This is not the final code. I am trying out the rules before using it in production code.
Thanx,
-- baliga
The real issue is that you define the attribute for the tag/non_tag rules as query_term (instead of query_term()).
Some minor issues appear to be:
using word instead of non_tag (exposes a std::string which doesn't convert to a query_type)
using % space with a space skipper doesn't really make sense
you probably wanted lexeme in the word rule because otherwise, it will just keep 'eating' chars regardless of whitespace
Other suggestions:
avoid excess scope of using namespace (or avoid it completely). You will run into hard-to-spot or hard-to-fix conflicts (e.g. boost::cref vs. std::cref, std::string vs. qi::string etc).
try to stay low on the Phoenix usage. In this case, I think you'd be far easier off using qi::attr with an adapted struct.
use BOOST_SPIRIT_DEBUG_* macros to get insight in your parser
Here is the entire grammar, the way I'd suggest it:
template<typename Iterator> struct query_grammar: qi::grammar<Iterator, std::vector<query_term>(), qi::space_type>
{
query_grammar() : query_grammar::base_type(query)
{
using namespace qi;
word = lexeme[ +alnum ];
tag = omit[word >> ':'] >> word >> attr(true);
non_tag = word >> attr(false);
query = *(tag | non_tag);
};
qi::rule<Iterator, std::string() , qi::space_type> word;
qi::rule<Iterator, query_term() , qi::space_type> tag, non_tag;
qi::rule<Iterator, std::vector<query_term>(), qi::space_type> query;
};
A fully working example with output (trivially onelined using karma):
// #define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/karma.hpp>
namespace qi = boost::spirit::qi;
namespace karma = boost::spirit::karma;
struct query_term {
std::string term;
bool is_tag;
};
BOOST_FUSION_ADAPT_STRUCT(query_term, (std::string,term)(bool,is_tag));
template<typename Iterator> struct query_grammar: qi::grammar<Iterator, std::vector<query_term>(), qi::space_type>
{
query_grammar() : query_grammar::base_type(query)
{
using namespace qi;
word = lexeme[ +alnum ];
tag = omit[word >> ':'] >> word >> attr(true);
non_tag = word >> attr(false);
query = *(tag | non_tag);
BOOST_SPIRIT_DEBUG_NODE(word);
BOOST_SPIRIT_DEBUG_NODE(tag);
BOOST_SPIRIT_DEBUG_NODE(non_tag);
BOOST_SPIRIT_DEBUG_NODE(query);
};
qi::rule<Iterator, std::string() , qi::space_type> word;
qi::rule<Iterator, query_term() , qi::space_type> tag, non_tag;
qi::rule<Iterator, std::vector<query_term>(), qi::space_type> query;
};
int main()
{
const std::string input = "apple tag:beer banana grape";
typedef std::string::const_iterator It;
query_grammar<It> parser;
std::vector<query_term> data;
It f(input.begin()), l(input.end());
bool ok = qi::phrase_parse(f, l, parser, qi::space, data);
if (ok)
std::cout << karma::format(karma::delimit [ karma::auto_ ] % karma::eol, data) << '\n';
if (f!=l)
std::cerr << "Unparsed: '" << std::string(f,l) << "'\n";
return ok? 0 : 255;
}
Output:
apple false
beer true
banana false
grape false
Related
Once again, I find myself reaching for boost spirit. Once again I find myself defeated by it.
A HTTP header value takes the general form:
text/html; q=1.0, text/*; q=0.8, image/gif; q=0.6, image/jpeg; q=0.6, image/*; q=0.5, */*; q=0.1
i.e. value *OWS [; *OWS name *OWS [= *OWS possibly_quoted_value] *OWS [...]] *OWS [ , <another value> ...]
so in my mind, this header decodes to:
value[0]:
text/html
params:
name : q
value : 1.0
value[1]:
text/*
params:
name : q
value : 0.8
...
and so on.
I am certain that to anyone who knows how, the boost::spirit::qi syntax for this is trivial.
I humbly ask for your assistance.
for example, here's the outline of the code which decodes the Content-Type header, which is limited to one value of the form type/subtype, with any number of parameters of the form <sp> ; <sp> token=token|quoted_string
template<class Iter>
void parse(ContentType& ct, Iter first, Iter last)
{
ct.mutable_type()->append(to_lower(consume_token(first, last)));
consume_lit(first, last, '/');
ct.mutable_subtype()->append(to_lower(consume_token(first, last)));
while (first != last) {
skipwhite(first, last);
if (consume_char_if(first, last, ';'))
{
auto p = ct.add_parameters();
skipwhite(first, last);
p->set_name(to_lower(consume_token(first, last)));
skipwhite(first, last);
if (consume_char_if(first, last, '='))
{
skipwhite(first, last);
p->set_value(consume_token_or_quoted(first, last));
}
else {
// no value on this parameter
}
}
else if (consume_char_if(first, last, ','))
{
// normally we should get the next value-token here but in the case of Content-Type
// we must barf
throw std::runtime_error("invalid use of ; in Content-Type");
}
}
}
ContentType& populate(ContentType& ct, const std::string& header_value)
{
parse(ct, header_value.begin(), header_value.end());
return ct;
}
OK, after an heroic 24 hours of struggle (well, not really - more like reading the manual over and over again...), I've found a way that works.
I am by no means competent with boost::spirit. If someone out there can improve on this answer, please do post it.
This spirit state machine takes the value of a header (with one, optionally parameterised, value) and turns it into a content_type structure.
My amateur reading of the HTTP standard indicates that some headers have the form (spaces here indicate any amount of white space, values may be quoted or not:
Header-Name: tokena/tokenb [; param1 = "value" [; param2 = value]...]
whereas others have the more general form:
Header-Name: token [; param1 = "value"[; param2 = value]...] [ , token ...]
This code covers the first case - i.e. the HTTP Content-Type header value. I will need to extend it to cater for the Accept header (which can advertise multiple values with parameters) - that will come later.
So here's the code. Please by all means show me how to improve it!!
#define BOOST_SPIRIT_DEBUG
#include <gtest/gtest.h>
#include <boost/spirit/include/qi.hpp>
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_char.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <utility>
#include <vector>
#include <string>
#include <boost/variant.hpp>
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
using unary_parameter = std::string;
struct binary_parameter
{
std::string name;
std::string value;
};
BOOST_FUSION_ADAPT_STRUCT(binary_parameter,
(std::string, name)
(std::string, value))
using parameter = boost::variant<unary_parameter, binary_parameter>;
struct type_subtype
{
std::string type;
std::string subtype;
};
BOOST_FUSION_ADAPT_STRUCT(type_subtype,
(std::string, type)
(std::string, subtype))
using content_type_pair = std::pair<std::string, std::string>;
struct content_type
{
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(content_type,
(type_subtype, type)
(std::vector<parameter>, params))
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using ascii::char_;
using qi::omit;
using qi::eoi;
CR = char_('\r');
LF = char_('\n');
CRLF = CR >> LF;
SP = char_(' ');
HT = char_('\t');
LWS = -CRLF >> +(SP | HT);
UPALPHA = char_('A', 'Z');
LOALPHA = char_('a', 'z');
ALPHA = UPALPHA | LOALPHA;
DIGIT = char_('0', '9');
CTL = char_(0, 31) | char_(127);
QUOT = char_('"');
TEXT = (char_ - CTL) | HT;
separator = char_('(') | ')' | '<' | '>' | '#'
| ',' | ';' | ':' | '\\' | '"'
| '/' | '[' | ']' | '?' | '='
| '{' | '}' | SP | HT;
end_sequence = separator | space;
token = +(char_ - separator);
qdtext = char_ - char_('"') - '\\';
quoted_pair = omit[char_('\\')] >> char_;
quoted_string = omit[char_('"')] >> *(qdtext | quoted_pair) >> omit[char_('"')];
value = quoted_string | token ;
type_subtype_rule = token >> '/' >> token;
name_only = token;
nvp = token >> omit[*SP] >> omit['='] >> omit[*SP] >> value;
any_parameter = omit[*SP] >> omit[char_(';')] >> omit[*SP] >> (nvp | name_only);
content_type_rule = type_subtype_rule >> *any_parameter;
BOOST_SPIRIT_DEBUG_NODES((qdtext)(quoted_pair)(quoted_string)(value)(token)(separator));
}
qi::rule<Iterator, void()> CR, LF, CRLF, SP, HT, LWS, CTL, QUOT;
qi::rule<Iterator, char()> UPALPHA, LOALPHA, ALPHA, DIGIT, TEXT, qdtext, quoted_pair;
qi::rule<Iterator, void()> separator, space, end_sequence;
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
qi::rule<Iterator, unary_parameter()> name_only;
qi::rule<Iterator, binary_parameter()> nvp;
qi::rule<Iterator, parameter()> any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
};
TEST(spirit_test, test1)
{
token_grammar<std::string::const_iterator> grammar{};
std::string test = R"__test(application/json )__test";
content_type ct;
bool r = qi::parse(test.cbegin(), test.cend(), grammar, ct);
EXPECT_EQ("application", ct.type.type);
EXPECT_EQ("json", ct.type.subtype);
EXPECT_EQ(0, ct.params.size());
ct = {};
test = R"__test(text/html ; charset = "ISO-8859-5")__test";
qi::parse(test.cbegin(), test.cend(), grammar, ct);
EXPECT_EQ("text", ct.type.type);
EXPECT_EQ("html", ct.type.subtype);
ASSERT_EQ(1, ct.params.size());
ASSERT_EQ(typeid(binary_parameter), ct.params[0].type());
auto& x = boost::get<binary_parameter>(ct.params[0]);
EXPECT_EQ("charset", x.name);
EXPECT_EQ("ISO-8859-5", x.value);
}
I've taken the code as posted by OP and given it a review.
there's no need to specify void(). In fact it's preferable to use qi::unused_type in such cases, which is what rules will default to if no attribute type is declared.
there no need for char_ if you don't wish to expose the attribute. Use lit instead.
there is no need to wrap every char parser in a rule. That hurts performance. It's best to leave the proto expression tree un-evaluated as long so Qi can optimize parser expressions more, and the compiler can inline more.
Also, Qi doesn't have move semantics on attributes, so avoiding redundant rules eliminates redundant copies of sub-attributes that get concatenated in the containing rules.
Sample alternative spelling (caution, see Assigning parsers to auto variables)
auto CR = qi::lit('\r');
auto LF = qi::lit('\n');
auto CRLF = qi::lit("\r\n");
auto HT = qi::lit('\t');
auto SP = qi::lit(' ');
auto LWS = qi::copy(-CRLF >> +(SP | HT)); // deepcopy
UPALPHA = char_('A', 'Z');
LOALPHA = char_('a', 'z');
ALPHA = UPALPHA | LOALPHA;
DIGIT = char_('0', '9');
//CTL = char_(0, 31) | char_(127);
TEXT = char_("\t\x20-\x7e\x80-\xff");
Since you didn't have to use char_, you also don't have kill the attribute using qi::omit[].
When you are in a Qi domain expression template, raw string/char literals are implicitly wrapped in a qi::lit so, you can simply things like
quoted_pair = omit[char_('\\')] >> char_;
quoted_string = omit[char_('"')] >> *(qdtext | quoted_pair) >> omit[char_('"')];
to just
quoted_pair = '\\' >> char_;
quoted_string = '"' >> *(qdtext | quoted_pair) >> '"';
instead of spelling out skipping spaces with omit[*SP] all the time, just declare the rule with a skipper. Now, you can simplify
nvp = token >> omit[*SP] >> omit['='] >> omit[*SP] >> value;
any_parameter = omit[*SP] >> omit[char_(';')] >> omit[*SP] >> (nvp | name_only);
content_type_rule = type_subtype_rule >> *any_parameter;
to just
nvp = token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces)[*any_parameter];
Note that any subrule invocations of rules that are declared without a skipper are implicitly lexeme: Boost spirit skipper issues
there were many redundant/unused headers
recent compilers + boost versions make BOOST_FUSION_ADAPT_STRUCT much simpler by using decltype
The results of simplifying are much less noisy:
//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/include/adapted.hpp>
struct parameter {
boost::optional<std::string> name;
std::string value;
};
struct type_subtype {
std::string type;
std::string subtype;
};
struct content_type {
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(type_subtype, type, subtype)
BOOST_FUSION_ADAPT_STRUCT(content_type, type, params)
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using qi::ascii::char_;
spaces = char_(' ');
token = +~char_( "()<>#,;:\\\"/[]?={} \t");
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
value = quoted_string | token;
type_subtype_rule = token >> '/' >> token;
name_only = token;
nvp = token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces) [*any_parameter];
BOOST_SPIRIT_DEBUG_NODES((nvp)(any_parameter)(content_type_rule)(quoted_string)(token)(value)(type_subtype_rule))
}
private:
using Skipper = qi::space_type;
Skipper spaces;
qi::rule<Iterator, binary_parameter(), Skipper> nvp;
qi::rule<Iterator, parameter(), Skipper> any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
// lexemes
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
qi::rule<Iterator, unary_parameter()> name_only;
};
See it Live On Coliru (with the same test cases)
BONUS
I'd prefer a simpler AST in a case like this. By injecting some attribute values using qi::attr you can avoid using boost::variant and/or even avoid boost::optional:
struct parameter {
bool have_name;
std::string name;
std::string value;
};
struct type_subtype {
std::string type;
std::string subtype;
};
struct content_type {
type_subtype type;
std::vector<parameter> params;
};
BOOST_FUSION_ADAPT_STRUCT(parameter, have_name, name, value)
BOOST_FUSION_ADAPT_STRUCT(type_subtype, type, subtype)
BOOST_FUSION_ADAPT_STRUCT(content_type, type, params)
namespace qi = boost::spirit::qi;
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
token_grammar() : token_grammar::base_type(content_type_rule)
{
using qi::ascii::char_;
spaces = char_(' ');
token = +~char_( "()<>#,;:\\\"/[]?={} \t");
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
value = quoted_string | token;
type_subtype_rule = token >> '/' >> token;
name_only = qi::attr(false) >> qi::attr("") >> token;
nvp = qi::attr(true) >> token >> '=' >> value;
any_parameter = ';' >> (nvp | name_only);
content_type_rule = type_subtype_rule >> qi::skip(spaces) [*any_parameter];
BOOST_SPIRIT_DEBUG_NODES((nvp)(any_parameter)(content_type_rule)(quoted_string)(token)(value)(type_subtype_rule))
}
private:
using Skipper = qi::space_type;
Skipper spaces;
qi::rule<Iterator, parameter(), Skipper> nvp, name_only, any_parameter;
qi::rule<Iterator, content_type()> content_type_rule;
// lexemes
qi::rule<Iterator, std::string()> quoted_string, token, value;
qi::rule<Iterator, type_subtype()> type_subtype_rule;
};
I am new to spirit and currently trying to parse an ini like file into a struct. Creating the grammar is ok, but the mapping generation is still some kind of magic to me. The file looks like this:
[fine]
#cmp1
#cmp2
muh=b
[fail]
#cmp1
a=b
#cmp2
it works as long as i have the requirements and properties ordered (section "fine") but I can't get it to work if the requirements and properties are interleaved (section "fail"). My struct definition looks like this:
typedef std::map<std::string, std::string> Pairs;
struct Section
{
std::string name;
std::vector<std::string> requirements;
Pairs properties;
};
BOOST_FUSION_ADAPT_STRUCT(
Section,
(std::string, name)
(std::vector<std::string>, requirements)
(Pairs, properties)
)
My current grammar looks like this:
template <typename Iterator, typename Skipper>
struct SectionParser : qi::grammar<Iterator, Section(), Skipper>
{
qi::rule<Iterator, Section(), Skipper> section;
qi::rule<Iterator, std::pair<std::string, std::string>(), Skipper> pair;
qi::rule<Iterator, std::vector<std::string>()> requirements;
qi::rule<Iterator, std::string()> key, value, sectionIdent, name, component;
SectionParser()
: SectionParser::base_type(section, "entity grammar")
{
using namespace qi;
sectionIdent = *(qi::char_ - (qi::lit('[') | qi::lit(']') | qi::eol));
name = *qi::eol > qi::lit('[') > sectionIdent > qi::lit(']') > (qi::eol | qi::eoi);
component = qi::char_('#') > *(qi::char_ - (qi::eol)) > (qi::eol | qi::eoi);
value = *(qi::char_ - (qi::eol | qi::eoi));
key = qi::char_("a-zA-Z_") > *qi::char_("a-zA-Z_0-9");
pair = key > qi::lit('=') > value > (qi::eol | qi::eoi);
section = name >> *component >> *pair;
}
};
Thats how I run the parser:
std::vector<Section> sections;
bool ok = phrase_parse(first, last, (sectionParser % +qi::eol) >> *qi::eol > qi::eoi, qi::blank, sections);
I also have the feeling that i made the line ending handling more complicated than it should be...
After reading parsing into several vector members i found a solution using only semantic actions.
The struct definition stays the same:
struct Section
{
std::string name;
std::vector<std::string> requirements;
Pairs properties;
};
dont use adapt struct anymore.
The grammar changes to:
template <typename Iterator, typename Skipper>
struct SectionParser : qi::grammar<Iterator, Section(), Skipper>
{
qi::rule<Iterator, Section(), Skipper> start;
qi::rule<Iterator, std::string()> value, ident, name, component;
qi::rule<Iterator, std::pair<std::string, std::string>()> pair;
SectionParser()
: SectionParser::base_type(start, "section grammar")
{
auto add_component = phx::push_back(phx::bind(&Section::requirements, qi::_val), qi::_1);
auto add_pair = phx::insert(phx::bind(&Section::properties, qi::_val), qi::_1);
auto set_name = phx::assign(phx::bind(&Section::name, qi::_val), qi::_1);
ident = +qi::char_("a-zA-Z0-9_");
component = qi::char_('#') > ident >> (qi::eol | qi::eoi);
value = *(qi::char_ - (qi::eol | qi::eoi));
pair = ident > qi::lit('=') > value >> (qi::eol | qi::eoi);
name = qi::lit('[') > ident > qi::lit(']') >> (qi::eol | qi::eoi);
start = name[set_name] >> *(component[add_component] | pair[add_pair]);
}
};
I'm trying to parse a CSV file (with header line) using boost spirit.
The csv is not in a constant format. Sometimes there is some extra column or the order of the column is mixed. I'm interested in few columns, whose header name is well known.
For instance my CSV may look like:
Name,Surname,Age
John,Doe,32
Or:
Age,Name
32,John
I want to parse only the content of Name and Age (N.B. Age is integer type). At the moment i come out with a very ugly solution where Spirit parses the first line and creates a vector that contains an enum in the positions i'm interested into. And then i have to do the parsing of the terminal symbols by hand...
enum LineItems {
NAME, AGE, UNUSED
};
struct CsvLine {
string name;
int age;
};
using Column = std::string;
using CsvFile = std::vector<CsvLine>;
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
CsvGrammar() :
CsvGrammar::base_type(start) {
using namespace qi;
static const char colsep = ',';
start = qi::omit[header[qi::_a = qi::_1]] >> eol >> line(_a) % eol;
header = (lit("Name")[phx::push_back(phx::ref(qi::_val), LineItems::NAME)]
| lit("Age")[phx::push_back(phx::ref(qi::_val), LineItems::AGE)]
| column[phx::push_back(phx::ref(qi::_val), LineItems::UNUSED)]) % colsep;
line = (column % colsep)[phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1,
qi::_val)];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
}
void convertFunc(std::vector<string>& columns, std::vector<LineItems>& positions, CsvLine &csvLine) {
//terminal symbol parsing here, and assign to csvLine struct.
...
}
private:
qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
qi::rule<It, std::vector<LineItems>(), qi::blank_type> header;
qi::rule<It, CsvLine(std::vector<LineItems>), qi::blank_type> line;
qi::rule<It, Column(), qi::blank_type> column;
qi::rule<It, std::string()> quoted;
qi::rule<It, qi::blank_type> empty;
};
Here is the full source.
What if the header parser could prepare a vector<rule<...>*> and the "line parser" just use this vector to parse itself? a sort of advanced nabialek trick (i've been trying but i couldn't make it).
Or is there any better way to parse this kind of CSV with Spirit?
(any help is appreciated, thank you in advance)
I'd go with the concept that you have,
I think it's plenty elegant (the qi locals even allow reentrant use of this).
To reduce the cruft in the rules (Boost Spirit: "Semantic actions are evil"?) you could move the "conversion function" off into attribute transformation customization points.
Oops. As commented that was too simple. However, you can still reduce the cruftiness quite a bit. With two simple tweaks, the grammar reads:
item.add("Name", NAME)("Age", AGE);
start = omit[ header[_a=_1] ] >> eol >> line(_a) % eol;
header = (item | omit[column] >> attr(UNUSED)) % colsep;
line = (column % colsep) [convert];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
The tweaks:
using qi::symbols to map from header to LineItem
using a raw semantinc action ([convert]) which directly access the context (see boost spirit semantic action parameters):
struct final {
using Ctx = typename decltype(line)::context_type;
void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
auto& csvLine = boost::fusion::at_c<0>(ctx.attributes);
auto& positions = boost::fusion::at_c<1>(ctx.attributes);
int i =0;
for (LineItems position : positions) {
switch (position) {
case NAME: csvLine.name = columns[i]; break;
case AGE: csvLine.age = atoi(columns[i].c_str()); break;
default: break;
}
i++;
}
pass = true; // returning false fails the `line` rule
}
} convert;
Arguably the upshot is akin to doing auto convert = phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1, qi::_val) but using auto with Proto/Phoenix/Spirit expressions is notoriously error prone (UB due to dangling refs to temporaries from the expression template), so I'd certainly prefer the way shown above.
Live On Coliru
//#define BOOST_SPIRIT_DEBUG
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <iostream>
#include <boost/fusion/include/at_c.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <string>
#include <vector>
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
using std::string;
enum LineItems { NAME, AGE, UNUSED };
struct CsvLine {
string name;
int age;
};
using Column = std::string;
using Columns = std::vector<Column>;
using CsvFile = std::vector<CsvLine>;
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
CsvGrammar() : CsvGrammar::base_type(start) {
using namespace qi;
static const char colsep = ',';
item.add("Name", NAME)("Age", AGE);
start = qi::omit[ header[_a=_1] ] >> eol >> line(_a) % eol;
header = (item | omit[column] >> attr(UNUSED)) % colsep;
line = (column % colsep) [convert];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
BOOST_SPIRIT_DEBUG_NODES((header)(column)(quoted));
}
private:
qi::rule<It, std::vector<LineItems>(), qi::blank_type> header;
qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
qi::rule<It, CsvLine(std::vector<LineItems> const&), qi::blank_type> line;
qi::rule<It, Column(), qi::blank_type> column;
qi::rule<It, std::string()> quoted;
qi::rule<It, qi::blank_type> empty;
qi::symbols<char, LineItems> item;
struct final {
using Ctx = typename decltype(line)::context_type;
void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
auto& csvLine = boost::fusion::at_c<0>(ctx.attributes);
auto& positions = boost::fusion::at_c<1>(ctx.attributes);
int i =0;
for (LineItems position : positions) {
switch (position) {
case NAME: csvLine.name = columns[i]; break;
case AGE: csvLine.age = atoi(columns[i].c_str()); break;
default: break;
}
i++;
}
pass = true; // returning false fails the `line` rule
}
} convert;
};
int main() {
const std::string s = "Surname,Name,Age,\nJohn,Doe,32\nMark,Smith,43";
auto f(begin(s)), l(end(s));
CsvGrammar<std::string::const_iterator> p;
CsvFile parsed;
bool ok = qi::phrase_parse(f, l, p, qi::blank, parsed);
if (ok) {
for (CsvLine line : parsed) {
std::cout << '[' << line.name << ']' << '[' << line.age << ']';
std::cout << std::endl;
}
} else {
std::cout << "Parse failed\n";
}
if (f != l)
std::cout << "Remaining unparsed: '" << std::string(f, l) << "'\n";
}
Prints
[Doe][32]
[Smith][43]
I want to create vector and append values to it (if any) in one spirit rule. Is it possible? I tried something like below but with no success. Read code comments for details please. Thanks.
typedef std::vector<double> number_array;
typedef std::vector<std::string> string_array;
typedef boost::variant<number_array, string_array> node
template<typename Iterator>
struct parser
: qi::grammar<Iterator, node(), ascii::space_type> {
parser(parser_impl* p)
: parser::base_type(expr_, ""),
error_handler(ErrorHandler(p)) {
// Here I want to create vector on the fly
// and append values to newly created vector.
// but this doesn't compile, specifically phoenix::push_back(...)
number_array_ = qi::lit('n[')[qi::_val = construct<number_array>()] >>
-(qi::double_ % ',')[phoenix::push_back(phoenix::ref(qi::_val), qi::_1)] >> ']';
// this doesn't compile too
string_array_ = qi::lit('s[')[qi::_val = construct<string_array>()] >>
-(quoted_string % ',')[phoenix::push_back(phoenix::ref(qi::_val), qi::_1)] >> ']';
quoted_string %= "'" > qi::lexeme[*(qi::char_ - "'")] > "'";
expr_ = number_array_[qi::_val = qi::_1] | string_array_[[qi::_val = qi::_1]];
}
qi::rule<Iterator, number_array(), ascii::space_type> number_array_;
qi::rule<Iterator, string_array(), ascii::space_type> string_array_;
qi::rule<Iterator, std::string(), ascii::space_type> quoted_string;
qi::rule<Iterator, node(), ascii::space_type> expr_;
};
Most important note here is that I think you can just do without all the semantic actions.
They only do what the default attribute rules already do (_val = _1 for scalar attributes, insert(_val, end(_val), _1) for conainer attributes, basically).
This means you can just write the whole shebang as
number_array_ = "n[" >> -(qi::double_ % ',') >> ']';
string_array_ = "s[" >> -(quoted_string % ',') >> ']';
quoted_string = "'" > qi::lexeme[*(qi::char_ - "'")] > "'";
expr_ = number_array_ | string_array_;
And this will work. Note that I fixed the multibyte literals 'n[' and 's[n'.
See also Boost Spirit: "Semantic actions are evil"?
I extended the Mini XML example from the spirit manual.
The grammar describes a xml tag that can be closed with '/>' and has no child nodes or which is closed like in the example with a closing tag '' and can optionally have children.
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/variant.hpp>
#include <boost/variant/recursive_variant.hpp>
struct XmlTree;
typedef boost::variant<boost::recursive_wrapper<XmlTree>, std::string>
mini_xml_node;
typedef std::vector<mini_xml_node> Children;
struct XmlTree
{
std::string name;
Children childs;
};
BOOST_FUSION_ADAPT_STRUCT(
XmlTree,
(std::string, name)
(Children, childs)
)
typedef std::string::const_iterator Iterator;
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace phoenix = boost::phoenix;
class XmlParserGrammar : public qi::grammar<Iterator, XmlTree(), qi::locals<std::string*>, ascii::space_type>
{
public:
XmlParserGrammar() : XmlParserGrammar::base_type(xml, "xml")
{
using qi::lit;
using qi::lexeme;
using qi::attr;
using ascii::space;
using ascii::char_;
using ascii::alnum;
using phoenix::val;
xml %=
startTag[qi::_a = &qi::_1] >>
(
(
lit("/>") > attr(Children()) //can i remove this somehow?
)
|
(
lit(">")
>> *node_
> endTag(*qi::_a)
)
);
startTag %= '<' >> !lit('/') >> lexeme[ +(alnum - (space | '>' | "/>")) ] ;
node_ %= xml | text;
endTag = "</" > lit(qi::_r1) > '>';
text %= lexeme[+(char_ - '<')];
}
private:
qi::rule<Iterator, XmlTree(), qi::locals<std::string*>, ascii::space_type> xml;
qi::rule<Iterator, std::string(), ascii::space_type> startTag;
qi::rule<Iterator, mini_xml_node(), ascii::space_type> node_;
qi::rule<Iterator, void(std::string&), ascii::space_type> endTag;
qi::rule<Iterator, std::string(), ascii::space_type> text;
};
Is it possible to write this rule without the attr(Children()) tag? I think it is more or less a performance lag. I need it to avoid the optional attribute of the alternative parser.
If there are no child tags the attribute should only be an empty vector.
You should be able to write:
xml %= startTag[_a = &_1]
>> attributes
>> ( "/>" >> eps
| ">" >> *node > endTag(*_a)
)
;
That leaves the vector attribute unchanged (and empty).