// file_input.hpp // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #ifndef BOOST_LEXER_FILE_INPUT #define BOOST_LEXER_FILE_INPUT #include "char_traits.hpp" // memcpy #include #include #include "size_t.hpp" #include "state_machine.hpp" namespace boost { namespace lexer { template > class basic_file_input { public: class iterator { public: #if defined _MSC_VER && _MSC_VER <= 1200 friend basic_file_input; #else friend class basic_file_input; #endif struct data { std::size_t id; std::size_t unique_id; const CharT *start; const CharT *end; std::size_t state; // Construct in end() state. data () : id (0), unique_id (npos), state (npos) { } bool operator == (const data &rhs_) const { return id == rhs_.id && unique_id == rhs_.unique_id && start == rhs_.start && end == rhs_.end && state == rhs_.state; } }; iterator () : _input (0) { } bool operator == (const iterator &rhs_) const { return _data == rhs_._data; } bool operator != (const iterator &rhs_) const { return !(*this == rhs_); } data &operator * () { return _data; } data *operator -> () { return &_data; } // Let compiler generate operator = (). // prefix version iterator &operator ++ () { next_token (); return *this; } // postfix version iterator operator ++ (int) { iterator iter_ = *this; next_token (); return iter_; } void next_token () { const detail::internals &internals_ = _input->_state_machine->data (); _data.start = _data.end; if (internals_._dfa->size () == 1) { _data.id = _input->next (&internals_._lookup->front ()-> front (), internals_._dfa_alphabet.front (), &internals_._dfa->front ()->front (), _data.start, _data.end, _data.unique_id); } else { _data.id = _input->next (internals_, _data.state, _data.start, _data.end, _data.unique_id); } if (_data.id == 0) { _data.start = 0; _data.end = 0; // Ensure current state matches that returned by end(). _data.state = npos; } } private: // Not owner (obviously!) basic_file_input *_input; data _data; }; #if defined _MSC_VER && _MSC_VER <= 1200 friend iterator; #else friend class iterator; #endif // Make it explict that we are NOT taking a copy of state_machine_! basic_file_input (const basic_state_machine *state_machine_, std::basic_ifstream *is_, const std::streamsize buffer_size_ = 4096, const std::streamsize buffer_increment_ = 1024) : _state_machine (state_machine_), _stream (is_), _buffer_size (buffer_size_), _buffer_increment (buffer_increment_), _buffer (_buffer_size, '!') { _start_buffer = &_buffer.front (); _end_buffer = _start_buffer + _buffer.size (); _start_token = _end_buffer; _end_token = _end_buffer; } iterator begin () { iterator iter_; iter_._input = this; // Over-ride default of 0 (EOF) iter_._data.id = npos; iter_._data.start = 0; iter_._data.end = 0; iter_._data.state = 0; ++iter_; return iter_; } iterator end () { iterator iter_; iter_._input = this; iter_._data.start = 0; iter_._data.end = 0; return iter_; } void flush () { // This temporary is mandatory, otherwise the // pointer calculations won't work! const CharT *temp_ = _end_buffer; _start_token = _end_token = _end_buffer; reload_buffer (temp_, true, _end_token); } private: typedef std::basic_istream istream; typedef std::vector buffer; const basic_state_machine *_state_machine; const std::streamsize _buffer_size; const std::streamsize _buffer_increment; buffer _buffer; CharT *_start_buffer; istream *_stream; const CharT *_start_token; const CharT *_end_token; CharT *_end_buffer; std::size_t next (const detail::internals &internals_, std::size_t &start_state_, const CharT * &start_, const CharT * &end_, std::size_t &unique_id_) { _start_token = _end_token; again: const std::size_t * lookup_ = &internals_._lookup[start_state_]-> front (); std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_]; const std::size_t *dfa_ = &internals_._dfa[start_state_]->front (); const std::size_t *ptr_ = dfa_ + dfa_alphabet_; const CharT *curr_ = _start_token; bool end_state_ = *ptr_ != 0; std::size_t id_ = *(ptr_ + id_index); std::size_t uid_ = *(ptr_ + unique_id_index); const CharT *end_token_ = curr_; for (;;) { if (curr_ >= _end_buffer) { if (!reload_buffer (curr_, end_state_, end_token_)) { // EOF break; } } const std::size_t BOL_state_ = ptr_[bol_index]; const std::size_t EOL_state_ = ptr_[eol_index]; if (BOL_state_ && (_start_token == _start_buffer || *(_start_token - 1) == '\n')) { ptr_ = &dfa_[BOL_state_ * dfa_alphabet_]; } else if (EOL_state_ && *curr_ == '\n') { ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; } else { const std::size_t state_ = ptr_[lookup_[static_cast (*curr_++)]]; if (state_ == 0) { break; } ptr_ = &dfa_[state_ * dfa_alphabet_]; } if (*ptr_) { end_state_ = true; id_ = *(ptr_ + id_index); uid_ = *(ptr_ + unique_id_index); start_state_ = *(ptr_ + state_index); end_token_ = curr_; } } if (_start_token >= _end_buffer) { // No more tokens... unique_id_ = npos; return 0; } const std::size_t EOL_state_ = ptr_[eol_index]; if (EOL_state_ && curr_ == end_) { ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; if (*ptr_) { end_state_ = true; id_ = *(ptr_ + id_index); uid_ = *(ptr_ + unique_id_index); start_state_ = *(ptr_ + state_index); end_token_ = curr_; } } if (end_state_) { // return longest match _end_token = end_token_; if (id_ == 0) goto again; } else { // No match causes char to be skipped _end_token = _start_token + 1; id_ = npos; uid_ = npos; } start_ = _start_token; end_ = _end_token; unique_id_ = uid_; return id_; } std::size_t next (const std::size_t * const lookup_, const std::size_t dfa_alphabet_, const std::size_t * const dfa_, const CharT * &start_, const CharT * &end_, std::size_t &unique_id_) { _start_token = _end_token; const std::size_t *ptr_ = dfa_ + dfa_alphabet_; const CharT *curr_ = _start_token; bool end_state_ = *ptr_ != 0; std::size_t id_ = *(ptr_ + id_index); std::size_t uid_ = *(ptr_ + unique_id_index); const CharT *end_token_ = curr_; for (;;) { if (curr_ >= _end_buffer) { if (!reload_buffer (curr_, end_state_, end_token_)) { // EOF break; } } const std::size_t BOL_state_ = ptr_[bol_index]; const std::size_t EOL_state_ = ptr_[eol_index]; if (BOL_state_ && (_start_token == _start_buffer || *(_start_token - 1) == '\n')) { ptr_ = &dfa_[BOL_state_ * dfa_alphabet_]; } else if (EOL_state_ && *curr_ == '\n') { ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; } else { const std::size_t state_ = ptr_[lookup_[static_cast (*curr_++)]]; if (state_ == 0) { break; } ptr_ = &dfa_[state_ * dfa_alphabet_]; } if (*ptr_) { end_state_ = true; id_ = *(ptr_ + id_index); uid_ = *(ptr_ + unique_id_index); end_token_ = curr_; } } if (_start_token >= _end_buffer) { // No more tokens... unique_id_ = npos; return 0; } const std::size_t EOL_state_ = ptr_[eol_index]; if (EOL_state_ && curr_ == end_) { ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; if (*ptr_) { end_state_ = true; id_ = *(ptr_ + id_index); uid_ = *(ptr_ + unique_id_index); end_token_ = curr_; } } if (end_state_) { // return longest match _end_token = end_token_; } else { // No match causes char to be skipped _end_token = _start_token + 1; id_ = npos; uid_ = npos; } start_ = _start_token; end_ = _end_token; unique_id_ = uid_; return id_; } bool reload_buffer (const CharT * &curr_, const bool end_state_, const CharT * &end_token_) { bool success_ = !_stream->eof (); if (success_) { const CharT *old_start_token_ = _start_token; std::size_t old_size_ = _buffer.size (); std::size_t count_ = 0; if (_start_token - 1 == _start_buffer) { // Run out of buffer space, so increase. _buffer.resize (old_size_ + _buffer_increment, '!'); _start_buffer = &_buffer.front (); _start_token = _start_buffer + 1; _stream->read (_start_buffer + old_size_, _buffer_increment); count_ = _stream->gcount (); _end_buffer = _start_buffer + old_size_ + count_; } else if (_start_token < _end_buffer) { const std::size_t len_ = _end_buffer - _start_token; // Some systems have memcpy in namespace std. using namespace std; memcpy (_start_buffer, _start_token - 1, (len_ + 1) * sizeof (CharT)); _stream->read (_start_buffer + len_ + 1, static_cast (_buffer.size () - len_ - 1)); count_ = _stream->gcount (); _start_token = _start_buffer + 1; _end_buffer = _start_buffer + len_ + 1 + count_; } else { _stream->read (_start_buffer, static_cast (_buffer.size ())); count_ = _stream->gcount (); _start_token = _start_buffer; _end_buffer = _start_buffer + count_; } if (end_state_) { end_token_ = _start_token + (end_token_ - old_start_token_); } curr_ = _start_token + (curr_ - old_start_token_); } return success_; } // Disallow copying of buffer basic_file_input (const basic_file_input &); const basic_file_input &operator = (const basic_file_input &); }; typedef basic_file_input file_input; typedef basic_file_input wfile_input; } } #endif