//  Copyright (c) 2001-2010 Hartmut Kaiser
//
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

//  This example shows how to create a simple lexer recognizing a couple of
//  different tokens and how to use it with a grammar. This example has a
//  heavily backtracking grammar, which makes it a candidate for lexer-based
//  parsing (all tokens are scanned and generated only once, even if
//  backtracking is required), which speeds up the overall parsing process
//  considerably, outweighing the overhead needed for setting up the lexer.
//  Additionally, it demonstrates how to use one of the defined tokens as a
//  parser component in the grammar.
//
//  The grammar recognizes a simple input structure: any number of simple
//  English sentences (statements, questions, and commands) are recognized
//  and counted separately.

// #define BOOST_SPIRIT_DEBUG
// #define BOOST_SPIRIT_LEXERTL_DEBUG

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>

#include <iostream>
#include <fstream>
#include <string>

#include "example.hpp"

using namespace boost::spirit;
using namespace boost::spirit::ascii;
using boost::phoenix::ref;

///////////////////////////////////////////////////////////////////////////////
//  Token definition
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct example2_tokens : lex::lexer<Lexer>
{
    example2_tokens()
    {
        // A 'word' is comprised of one or more letters and an optional
        // apostrophe. If it contains an apostrophe, there may only be one,
        // and the apostrophe must be preceded and followed by at least one
        // letter. For example, "I'm" and "doesn't" meet the definition of
        // 'word' we define below.
        word = "[a-zA-Z]+('[a-zA-Z]+)?";

        // Associate the tokens and the token set with the lexer. Note that
        // single-character token definitions as used below are always
        // interpreted literally and never as special regex characters. This
        // is done so that single characters can be assigned the id of their
        // character code value, allowing them to be referenced as literals
        // in Qi grammars.
        this->self = lex::token_def<>(',') | '!' | '.' | '?' | ' ' | '\n' | word;
    }

    lex::token_def<> word;
};

///////////////////////////////////////////////////////////////////////////////
//  Grammar definition
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator>
struct example2_grammar : qi::grammar<Iterator>
{
    template <typename TokenDef>
    example2_grammar(TokenDef const& tok)
      : example2_grammar::base_type(story)
      , paragraphs(0), commands(0), questions(0), statements(0)
    {
        story
            =  +paragraph
            ;

        paragraph
            =   (   +(   command   [ ++ref(commands) ]
                     |   question  [ ++ref(questions) ]
                     |   statement [ ++ref(statements) ]
                     )
                >>  *char_(' ') >> +char_('\n')
                )
                [ ++ref(paragraphs) ]
            ;

        command
            =  +(tok.word | ' ' | ',') >> '!'
            ;

        question
            =  +(tok.word | ' ' | ',') >> '?'
            ;

        statement
            =  +(tok.word | ' ' | ',') >> '.'
            ;

        BOOST_SPIRIT_DEBUG_NODE(story);
        BOOST_SPIRIT_DEBUG_NODE(paragraph);
        BOOST_SPIRIT_DEBUG_NODE(command);
        BOOST_SPIRIT_DEBUG_NODE(question);
        BOOST_SPIRIT_DEBUG_NODE(statement);
    }

    qi::rule<Iterator> story, paragraph, command, question, statement;
    int paragraphs, commands, questions, statements;
};

///////////////////////////////////////////////////////////////////////////////
int main()
{
    // iterator type used to expose the underlying input stream
    typedef std::string::iterator base_iterator_type;

    // This is the token type to return from the lexer iterator.
    typedef lex::lexertl::token<base_iterator_type> token_type;

    // This is the lexer type to use to tokenize the input.
    // Here we use the lexertl based lexer engine.
    typedef lex::lexertl::lexer<token_type> lexer_type;

    // This is the token definition type (derived from the given lexer type).
    typedef example2_tokens<lexer_type> example2_tokens;

    // this is the iterator type exposed by the lexer
    typedef example2_tokens::iterator_type iterator_type;

    // this is the type of the grammar to parse
    typedef example2_grammar<iterator_type> example2_grammar;

    // now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process
    example2_tokens tokens;                         // Our lexer
    example2_grammar calc(tokens);                  // Our parser

    // read the contents of the given file into memory
    std::string str (read_from_file("example2.input"));

    // At this point we generate the iterator pair used to expose the
    // tokenized input stream.
    std::string::iterator it = str.begin();
    iterator_type iter = tokens.begin(it, str.end());
    iterator_type end = tokens.end();

    // Parsing is done based on the token stream, not the character
    // stream read from the input.
    bool r = qi::parse(iter, end, calc);

    if (r && iter == end)
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing succeeded\n";
        std::cout << "There were "
                  << calc.commands << " commands, "
                  << calc.questions << " questions, and "
                  << calc.statements << " statements.\n";
        std::cout << "-------------------------\n";
    }
    else
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing failed\n";
        std::cout << "-------------------------\n";
    }

    std::cout << "Bye... :-) \n\n";
    return 0;
}
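
///////////////////////////////////////////////////////////////////////////////
//  A quick illustration of a run. Assuming "example2.input" held the
//  (hypothetical, made up for this sketch) single-line paragraph
//
//      I like parsers. Do you? Write one yourself!
//
//  the grammar would match one statement, one question, and one command in a
//  single paragraph, so a successful run would report:
//
//      -------------------------
//      Parsing succeeded
//      There were 1 commands, 1 questions, and 1 statements.
//      -------------------------
//
//  Note that each alternative (command, question, statement) consumes the
//  same leading tokens before checking the terminating punctuation, so the
//  parser backtracks on every sentence; the lexer, however, tokenizes the
//  characters only once, which is exactly the benefit described in the
//  header comment.
///////////////////////////////////////////////////////////////////////////////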