| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321 | /* * * Copyright (c) 1998-2002 * John Maddock * * Use, modification and distribution are subject to the  * Boost Software License, Version 1.0. (See accompanying file  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) * */ /*  *   LOCATION:    see http://www.boost.org for most recent version.  *   FILE         states.cpp  *   VERSION      see <boost/version.hpp>  *   DESCRIPTION: Declares internal state machine structures.  */#ifndef BOOST_REGEX_V4_STATES_HPP#define BOOST_REGEX_V4_STATES_HPP#ifdef BOOST_MSVC#pragma warning(push)#pragma warning(disable: 4103)#endif#ifdef BOOST_HAS_ABI_HEADERS#  include BOOST_ABI_PREFIX#endif#ifdef BOOST_MSVC#pragma warning(pop)#endifnamespace boost{namespace BOOST_REGEX_DETAIL_NS{/*** mask_type *******************************************************Whenever we have a choice of two alternatives, we use an array of bytesto indicate which of the two alternatives it is possible to take for anygiven input character.  If mask_take is set, then we can take the next state, and if mask_skip is set then we can take the alternative.***********************************************************************/enum mask_type{   mask_take = 1,   mask_skip = 2,   mask_init = 4,   mask_any = mask_skip | mask_take,   mask_all = mask_any};/*** helpers **********************************************************These helpers let us use function overload resolution to detect whetherwe have narrow or wide character strings:***********************************************************************/struct _narrow_type{};struct _wide_type{};template <class charT> struct is_byte;template<>             struct is_byte<char>         { typedef _narrow_type width_type; };template<>             struct is_byte<unsigned char>{ typedef _narrow_type width_type; };template<>             struct is_byte<signed char>  { typedef _narrow_type width_type; };template <class charT> struct is_byte               { typedef _wide_type width_type; };/*** enum syntax_element_type ******************************************Every record in the state machine falls into one of the following types:***********************************************************************/enum syntax_element_type{   // start of a marked sub-expression, or perl-style (?...) extension   syntax_element_startmark = 0,   // end of a marked sub-expression, or perl-style (?...) extension   syntax_element_endmark = syntax_element_startmark + 1,   // any sequence of literal characters   syntax_element_literal = syntax_element_endmark + 1,   // start of line assertion: ^   syntax_element_start_line = syntax_element_literal + 1,   // end of line assertion $   syntax_element_end_line = syntax_element_start_line + 1,   // match any character: .   syntax_element_wild = syntax_element_end_line + 1,   // end of expression: we have a match when we get here   syntax_element_match = syntax_element_wild + 1,   // perl style word boundary: \b   syntax_element_word_boundary = syntax_element_match + 1,   // perl style within word boundary: \B   syntax_element_within_word = syntax_element_word_boundary + 1,   // start of word assertion: \<   syntax_element_word_start = syntax_element_within_word + 1,   // end of word assertion: \>   syntax_element_word_end = syntax_element_word_start + 1,   // start of buffer assertion: \`   syntax_element_buffer_start = syntax_element_word_end + 1,   // end of buffer assertion: \'   syntax_element_buffer_end = syntax_element_buffer_start + 1,   // backreference to previously matched sub-expression   syntax_element_backref = syntax_element_buffer_end + 1,   // either a wide character set [..] or one with multicharacter collating elements:   syntax_element_long_set = syntax_element_backref + 1,   // narrow character set: [...]   syntax_element_set = syntax_element_long_set + 1,   // jump to a new state in the machine:   syntax_element_jump = syntax_element_set + 1,   // choose between two production states:   syntax_element_alt = syntax_element_jump + 1,   // a repeat   syntax_element_rep = syntax_element_alt + 1,   // match a combining character sequence   syntax_element_combining = syntax_element_rep + 1,   // perl style soft buffer end: \z   syntax_element_soft_buffer_end = syntax_element_combining + 1,   // perl style continuation: \G   syntax_element_restart_continue = syntax_element_soft_buffer_end + 1,   // single character repeats:   syntax_element_dot_rep = syntax_element_restart_continue + 1,   syntax_element_char_rep = syntax_element_dot_rep + 1,   syntax_element_short_set_rep = syntax_element_char_rep + 1,   syntax_element_long_set_rep = syntax_element_short_set_rep + 1,   // a backstep for lookbehind repeats:   syntax_element_backstep = syntax_element_long_set_rep + 1,   // an assertion that a mark was matched:   syntax_element_assert_backref = syntax_element_backstep + 1,   syntax_element_toggle_case = syntax_element_assert_backref + 1,   // a recursive expression:   syntax_element_recurse = syntax_element_toggle_case + 1,   // Verbs:   syntax_element_fail = syntax_element_recurse + 1,   syntax_element_accept = syntax_element_fail + 1,   syntax_element_commit = syntax_element_accept + 1,   syntax_element_then = syntax_element_commit + 1};#ifdef BOOST_REGEX_DEBUG// dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversionstd::ostream& operator<<(std::ostream&, syntax_element_type);#endifstruct re_syntax_base;/*** union offset_type ************************************************Points to another state in the machine.  During machine constructionwe use integral offsets, but these are converted to pointers beforeexecution of the machine.***********************************************************************/union offset_type{   re_syntax_base*   p;   std::ptrdiff_t    i;};/*** struct re_syntax_base ********************************************Base class for all states in the machine.***********************************************************************/struct re_syntax_base{   syntax_element_type   type;         // what kind of state this is   offset_type           next;         // next state in the machine};/*** struct re_brace **************************************************A marked parenthesis.***********************************************************************/struct re_brace : public re_syntax_base{   // The index to match, can be zero (don't mark the sub-expression)   // or negative (for perl style (?...) extentions):   int index;   bool icase;};/*** struct re_dot **************************************************Match anything.***********************************************************************/enum{   dont_care = 1,   force_not_newline = 0,   force_newline = 2,   test_not_newline = 2,   test_newline = 3};struct re_dot : public re_syntax_base{   unsigned char mask;};/*** struct re_literal ************************************************A string of literals, following this structure will be an array of characters: charT[length]***********************************************************************/struct re_literal : public re_syntax_base{   unsigned int length;};/*** struct re_case ************************************************Indicates whether we are moving to a case insensive block or not***********************************************************************/struct re_case : public re_syntax_base{   bool icase;};/*** struct re_set_long ***********************************************A wide character set of characters, following this structure will bean array of type charT:First csingles null-terminated stringsThen 2 * cranges NULL terminated stringsThen cequivalents NULL terminated strings***********************************************************************/template <class mask_type>struct re_set_long : public re_syntax_base{   unsigned int            csingles, cranges, cequivalents;   mask_type               cclasses;   mask_type               cnclasses;   bool                    isnot;   bool                    singleton;};/*** struct re_set ****************************************************A set of narrow-characters, matches any of _map which is none-zero***********************************************************************/struct re_set : public re_syntax_base{   unsigned char _map[1 << CHAR_BIT];};/*** struct re_jump ***************************************************Jump to a new location in the machine (not next).***********************************************************************/struct re_jump : public re_syntax_base{   offset_type     alt;                 // location to jump to};/*** struct re_alt ***************************************************Jump to a new location in the machine (possibly next).***********************************************************************/struct re_alt : public re_jump{   unsigned char   _map[1 << CHAR_BIT]; // which characters can take the jump   unsigned int    can_be_null;         // true if we match a NULL string};/*** struct re_repeat *************************************************Repeat a section of the machine***********************************************************************/struct re_repeat : public re_alt{   std::size_t   min, max;  // min and max allowable repeats   int           state_id;        // Unique identifier for this repeat   bool          leading;   // True if this repeat is at the start of the machine (lets us optimize some searches)   bool          greedy;    // True if this is a greedy repeat};/*** struct re_recurse ************************************************Recurse to a particular subexpression.**********************************************************************/struct re_recurse : public re_jump{   int state_id;             // identifier of first nested repeat within the recursion.};/*** struct re_commit *************************************************Used for the PRUNE, SKIP and COMMIT verbs which basically differ only in what happensif no match is found and we start searching forward.**********************************************************************/enum commit_type{   commit_prune,   commit_skip,   commit_commit};struct re_commit : public re_syntax_base{   commit_type action;};/*** enum re_jump_size_type *******************************************Provides compiled size of re_jump structure (allowing for trailing alignment).We provide this so we know how manybytes to insert when constructing the machine(The value of padding_mask is defined in regex_raw_buffer.hpp).***********************************************************************/enum re_jump_size_type{   re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),   re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),   re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)};/*** proc re_is_set_member *********************************************Forward declaration: we'll need this one later...***********************************************************************/template<class charT, class traits>struct regex_data;template <class iterator, class charT, class traits_type, class char_classT>iterator BOOST_REGEX_CALL re_is_set_member(iterator next,                           iterator last,                           const re_set_long<char_classT>* set_,                           const regex_data<charT, traits_type>& e, bool icase);} // namespace BOOST_REGEX_DETAIL_NS} // namespace boost#ifdef BOOST_MSVC#pragma warning(push)#pragma warning(disable: 4103)#endif#ifdef BOOST_HAS_ABI_HEADERS#  include BOOST_ABI_SUFFIX#endif#ifdef BOOST_MSVC#pragma warning(pop)#endif#endif
 |