// spirit/example/lex/word_count.cpp
// Copyright (c) 2001-2010 Hartmut Kaiser
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// This example is equivalent to the following lex program:
/*
//[wcp_flex_version
%{
    int c = 0, w = 0, l = 0;
%}
word [^ \t\n]+
eol \n
%%
{word} { ++w; c += yyleng; }
{eol} { ++c; ++l; }
. { ++c; }
%%
main()
{
    yylex();
    printf("%d %d %d\n", l, w, c);
}
//]
*/
// Its purpose is to replicate the word count functionality of the UNIX 'wc'
// command: it prints the number of lines, words, and characters in a file.
//
// The example additionally demonstrates how to use the add_pattern(...)(...)
// syntax to define lexer patterns. These patterns are essentially parameter-
// less 'macros' for regular expressions, which simplify the token definitions
// below.
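//
// For illustration only (not part of the original example), a hypothetical
// pattern for C-style identifiers could be registered and referenced in the
// same way:
//
//     this->self.add_pattern("IDENT", "[a-zA-Z_][a-zA-Z0-9_]*");
//     ident = "{IDENT}";   // 'ident' would be a lex::token_def<std::string>
//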
// #define BOOST_SPIRIT_LEXERTL_DEBUG
#define BOOST_VARIANT_MINIMIZE_SIZE
#include <boost/config/warning_disable.hpp>
//[wcp_includes
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>
#include <boost/spirit/include/phoenix_container.hpp>
//]
#include <iostream>
#include <string>
#include "example.hpp"
//[wcp_namespaces
using namespace boost::spirit;
using namespace boost::spirit::ascii;
//]
///////////////////////////////////////////////////////////////////////////////
// Token definition: We use the lexertl based lexer engine as the underlying
// lexer type.
///////////////////////////////////////////////////////////////////////////////
//[wcp_token_ids
enum tokenids
{
    IDANY = lex::min_token_id + 10
};
//]
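// Note: lex::min_token_id is the first id available for user-defined tokens;
// the offset of 10 is presumably chosen to keep IDANY clear of the ids
// Spirit.Lex assigns automatically to token definitions added without an
// explicit id (such as 'word' below).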
//[wcp_token_definition
template <typename Lexer>
struct word_count_tokens : lex::lexer<Lexer>
{
    word_count_tokens()
    {
        // define patterns (lexer macros) to be used during token definition
        // below
        this->self.add_pattern
            ("WORD", "[^ \t\n]+")
        ;

        // define tokens and associate them with the lexer
        word = "{WORD}";    // reference the pattern 'WORD' as defined above

        // this lexer will recognize 3 token types: words, newlines, and
        // everything else
        this->self.add
            (word)          // no token id is needed here
            ('\n')          // characters are usable as tokens as well
            (".", IDANY)    // string literals will not be escaped by the library
        ;
    }

    // the token 'word' exposes the matched string as its parser attribute
    lex::token_def<std::string> word;
};
//]
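// The three token definitions above correspond one-to-one to the rules of the
// flex program quoted at the top of this file: 'word' matches {word}, '\n'
// matches {eol}, and "." matches any other single character.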
///////////////////////////////////////////////////////////////////////////////
// Grammar definition
///////////////////////////////////////////////////////////////////////////////
//[wcp_grammar_definition
template <typename Iterator>
struct word_count_grammar : qi::grammar<Iterator>
{
    template <typename TokenDef>
    word_count_grammar(TokenDef const& tok)
      : word_count_grammar::base_type(start)
      , c(0), w(0), l(0)
    {
        using boost::phoenix::ref;
        using boost::phoenix::size;
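
        // Count with Phoenix semantic actions: for a word token, _1 refers to
        // its std::string attribute, so size(_1) adds the number of matched
        // characters; a newline or any other character bumps the respective
        // counters.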
        start =  *(   tok.word          [++ref(w), ref(c) += size(_1)]
                  |   lit('\n')         [++ref(c), ++ref(l)]
                  |   qi::token(IDANY)  [++ref(c)]
                  )
              ;
    }

    std::size_t c, w, l;
    qi::rule<Iterator> start;
};
//]
///////////////////////////////////////////////////////////////////////////////
//[wcp_main
int main(int argc, char* argv[])
{
/*<  Define the token type to be used: `std::string` is available as the
     type of the token attribute
>*/  typedef lex::lexertl::token<
        char const*, boost::mpl::vector<std::string>
    > token_type;
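    // The first template parameter is the iterator type of the underlying
    // input (plain 'char const*' here); the mpl::vector lists the attribute
    // types a token may expose (only std::string here, used by 'word').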

/*<  Define the lexer type to use implementing the state machine
>*/  typedef lex::lexertl::lexer<token_type> lexer_type;

/*<  Define the iterator type exposed by the lexer type
>*/  typedef word_count_tokens<lexer_type>::iterator_type iterator_type;

    // now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process
    word_count_tokens<lexer_type> word_count;           // Our lexer
    word_count_grammar<iterator_type> g (word_count);   // Our parser

    // read the file into memory
    std::string str (read_from_file(1 == argc ? "word_count.input" : argv[1]));
    char const* first = str.c_str();
    char const* last = &first[str.size()];

/*<  Parsing is done based on the token stream, not the character
     stream read from the input. The function `tokenize_and_parse()` wraps
     the passed iterator range `[first, last)` with the lexical analyzer and
     uses the token iterators it exposes to parse the token stream.
>*/  bool r = lex::tokenize_and_parse(first, last, word_count, g);
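    // tokenize_and_parse() advances 'first'; on failure it points at the
    // position where tokenization or parsing stopped, so [first, last) is the
    // unconsumed remainder of the input used in the error message below.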

    if (r) {
        std::cout << "lines: " << g.l << ", words: " << g.w
                  << ", characters: " << g.c << "\n";
    }
    else {
        std::string rest(first, last);
        std::cerr << "Parsing failed\n" << "stopped at: \""
                  << rest << "\"\n";
    }
    return 0;
}
//]
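
// Example invocation (assuming the binary was built as 'word_count'):
//
//     ./word_count some_text_file.txt
//
// Without an argument the program falls back to reading 'word_count.input'
// from the current directory; on success it prints a single line of the form
// "lines: <l>, words: <w>, characters: <c>".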