/***************************************************************************** ** ** SRELL (std::regex-like library) version 4.000 ** ** Copyright (c) 2012-2022, Nozomu Katoo. All rights reserved. ** ** Redistribution and use in source and binary forms, with or without ** modification, are permitted provided that the following conditions are ** met: ** ** 1. Redistributions of source code must retain the above copyright notice, ** this list of conditions and the following disclaimer. ** ** 2. Redistributions in binary form must reproduce the above copyright ** notice, this list of conditions and the following disclaimer in the ** documentation and/or other materials provided with the distribution. ** ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS ** IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ** THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ** PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR ** CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ** EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ** PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ** PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ** LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ** NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ** SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
** ****************************************************************************** **/ #ifndef SRELL_REGEX_TEMPLATE_LIBRARY #define SRELL_REGEX_TEMPLATE_LIBRARY #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cpp_unicode_characters #ifndef SRELL_CPP11_CHAR1632_ENABLED #define SRELL_CPP11_CHAR1632_ENABLED #endif #endif #ifdef __cpp_initializer_lists #include #ifndef SRELL_CPP11_INITIALIZER_LIST_ENABLED #define SRELL_CPP11_INITIALIZER_LIST_ENABLED #endif #endif #ifdef __cpp_rvalue_references #ifndef SRELL_CPP11_MOVE_ENABLED #define SRELL_CPP11_MOVE_ENABLED #endif #endif #ifdef SRELL_CPP11_MOVE_ENABLED #if defined(_MSC_VER) && _MSC_VER < 1900 #define SRELL_NOEXCEPT #else #define SRELL_NOEXCEPT noexcept #endif #endif #ifdef __cpp_char8_t #ifndef SRELL_CPP20_CHAR8_ENABLED #ifdef __cpp_lib_char8_t #define SRELL_CPP20_CHAR8_ENABLED 2 #else #define SRELL_CPP20_CHAR8_ENABLED 1 #endif #endif #endif // The following SRELL_NO_* macros would be useful when wanting to // reduce the size of a binary by turning off some feature(s). #ifdef SRELL_NO_UNICODE_DATA // Prevents Unicode data used for icase (case-insensitive) matching // from being output into a resulting binary. In this case only the // ASCII characters are case-folded when icase matching is performed // (i.e., [A-Z] -> [a-z] only). #define SRELL_NO_UNICODE_ICASE // Disables the Unicode property (\p{...} and \P{...}) and prevents // Unicode property data from being output into a resulting binary. #define SRELL_NO_UNICODE_PROPERTY #endif // Prevents icase matching specific functions into a resulting binary. // In this case the icase flag is ignored and icase matching becomes // unavailable. #ifdef SRELL_NO_ICASE #ifndef SRELL_NO_UNICODE_ICASE #define SRELL_NO_UNICODE_ICASE #endif #endif // This macro might be removed in the future. 
#ifdef SRELL_V1_COMPATIBLE #ifndef SRELL_NO_UNICODE_PROPERTY #define SRELL_NO_UNICODE_PROPERTY #endif #ifndef SRELL_NO_VMODE #define SRELL_NO_VMODE #endif #define SRELL_NO_NAMEDCAPTURE #define SRELL_NO_SINGLELINE #define SRELL_FIXEDWIDTHLOOKBEHIND #endif namespace srell { // ["regex_constants.h" ... namespace regex_constants { enum syntax_option_type { icase = 1 << 0, nosubs = 1 << 1, optimize = 1 << 2, collate = 1 << 3, ECMAScript = 1 << 4, basic = 1 << 5, extended = 1 << 6, awk = 1 << 7, grep = 1 << 8, egrep = 1 << 9, multiline = 1 << 10, // SRELL's extension. dotall = 1 << 11, // singleline. unicodesets = 1 << 12 }; inline syntax_option_type operator&(const syntax_option_type left, const syntax_option_type right) { return static_cast(static_cast(left) & static_cast(right)); } inline syntax_option_type operator|(const syntax_option_type left, const syntax_option_type right) { return static_cast(static_cast(left) | static_cast(right)); } inline syntax_option_type operator^(const syntax_option_type left, const syntax_option_type right) { return static_cast(static_cast(left) ^ static_cast(right)); } inline syntax_option_type operator~(const syntax_option_type b) { return static_cast(~static_cast(b)); } inline syntax_option_type &operator&=(syntax_option_type &left, const syntax_option_type right) { left = left & right; return left; } inline syntax_option_type &operator|=(syntax_option_type &left, const syntax_option_type right) { left = left | right; return left; } inline syntax_option_type &operator^=(syntax_option_type &left, const syntax_option_type right) { left = left ^ right; return left; } } // namespace regex_constants namespace regex_constants { enum match_flag_type { match_default = 0, match_not_bol = 1 << 0, match_not_eol = 1 << 1, match_not_bow = 1 << 2, match_not_eow = 1 << 3, match_any = 1 << 4, match_not_null = 1 << 5, match_continuous = 1 << 6, match_prev_avail = 1 << 7, format_default = 0, format_sed = 1 << 8, format_no_copy = 1 << 9, 
format_first_only = 1 << 10, // For internal use. match_match_ = 1 << 11 }; inline match_flag_type operator&(const match_flag_type left, const match_flag_type right) { return static_cast(static_cast(left) & static_cast(right)); } inline match_flag_type operator|(const match_flag_type left, const match_flag_type right) { return static_cast(static_cast(left) | static_cast(right)); } inline match_flag_type operator^(const match_flag_type left, const match_flag_type right) { return static_cast(static_cast(left) ^ static_cast(right)); } inline match_flag_type operator~(const match_flag_type b) { return static_cast(~static_cast(b)); } inline match_flag_type &operator&=(match_flag_type &left, const match_flag_type right) { left = left & right; return left; } inline match_flag_type &operator|=(match_flag_type &left, const match_flag_type right) { left = left | right; return left; } inline match_flag_type &operator^=(match_flag_type &left, const match_flag_type right) { left = left ^ right; return left; } } // namespace regex_constants // 28.5, regex constants: namespace regex_constants { typedef int error_type; static const error_type error_collate = 100; static const error_type error_ctype = 101; static const error_type error_escape = 102; static const error_type error_backref = 103; static const error_type error_brack = 104; static const error_type error_paren = 105; static const error_type error_brace = 106; static const error_type error_badbrace = 107; static const error_type error_range = 108; static const error_type error_space = 109; static const error_type error_badrepeat = 110; static const error_type error_complexity = 111; static const error_type error_stack = 112; // SRELL's extensions. static const error_type error_utf8 = 113; // The expression contained an invalid UTF-8 sequence. static const error_type error_property = 114; // The expression contained an invalid Unicode property name or value. 
static const error_type error_noescape = 115; // (Only in v-mode) ( ) [ ] { } / - \ | need to be escaped in a character class. static const error_type error_operator = 116; // (Only in v-mode) A character class contained a reserved double punctuation // operator or different types of operators at the same level, such as [ab--cd]. static const error_type error_complement = 117; // (Only in v-mode) \P or a negated character class contained a property of strings. static const error_type error_modifier = 118; // A specific flag modifier appears more then once. #if defined(SRELL_FIXEDWIDTHLOOKBEHIND) static const error_type error_lookbehind = 200; #endif static const error_type error_internal = 999; } // namespace regex_constants // ... "regex_constants.h"] // ["regex_error.hpp" ... // 28.6, class regex_error: class regex_error : public std::runtime_error { public: explicit regex_error(const regex_constants::error_type ecode) : std::runtime_error("regex_error") // added for error C2512. , ecode_(ecode) { } regex_constants::error_type code() const { return ecode_; } private: regex_constants::error_type ecode_; }; // ... "regex_error.hpp"] // ["rei_type.h" ... namespace regex_internal { #if defined(SRELL_CPP11_CHAR1632_ENABLED) typedef char32_t uchar32; #elif defined(UINT_MAX) && UINT_MAX >= 0xFFFFFFFF typedef unsigned int uchar32; #elif defined(ULONG_MAX) && ULONG_MAX >= 0xFFFFFFFF typedef unsigned long uchar32; #else #error could not find a suitable type for 32-bit Unicode integer values. #endif // defined(SRELL_CPP11_CHAR1632_ENABLED) typedef uchar32 uint_l32; // uint_least32. } // regex_internal // ... "rei_type.h"] // ["rei_constants.h" ... 
namespace regex_internal { enum re_state_type { st_character, // 0x00 st_character_class, // 0x01 st_epsilon, // 0x02 st_check_counter, // 0x03 // st_increment_counter, // 0x04 st_decrement_counter, // 0x04 st_save_and_reset_counter, // 0x05 st_restore_counter, // 0x06 st_roundbracket_open, // 0x07 st_roundbracket_pop, // 0x08 st_roundbracket_close, // 0x09 st_repeat_in_push, // 0x0a st_repeat_in_pop, // 0x0b st_check_0_width_repeat, // 0x0c st_backreference, // 0x0d st_lookaround_open, // 0x0e // st_lookaround_pop, // 0x10 st_bol, // 0x0f st_eol, // 0x10 st_boundary, // 0x11 st_success, // 0x12 #if !defined(SRELLDBG_NO_NEXTPOS_OPT) st_move_nextpos, // 0x13 #endif st_lookaround_close = st_success, st_zero_width_boundary = st_lookaround_open, }; // re_state_type namespace constants { static const uchar32 unicode_max_codepoint = 0x10ffff; static const uchar32 invalid_u32value = static_cast(-1); static const uchar32 max_u32value = static_cast(-2); static const uchar32 asc_icase = 0x20; static const uchar32 ccstr_empty = static_cast(-1); static const uint_l32 infinity = static_cast(~0); } // constants namespace meta_char { static const uchar32 mc_exclam = 0x21; // '!' static const uchar32 mc_sharp = 0x23; // '#' static const uchar32 mc_dollar = 0x24; // '$' static const uchar32 mc_rbraop = 0x28; // '(' static const uchar32 mc_rbracl = 0x29; // ')' static const uchar32 mc_astrsk = 0x2a; // '*' static const uchar32 mc_plus = 0x2b; // '+' static const uchar32 mc_comma = 0x2c; // ',' static const uchar32 mc_minus = 0x2d; // '-' static const uchar32 mc_period = 0x2e; // '.' static const uchar32 mc_colon = 0x3a; // ':' static const uchar32 mc_lt = 0x3c; // '<' static const uchar32 mc_eq = 0x3d; // '=' static const uchar32 mc_gt = 0x3e; // '>' static const uchar32 mc_query = 0x3f; // '?' 
static const uchar32 mc_sbraop = 0x5b; // '[' static const uchar32 mc_escape = 0x5c; // '\\' static const uchar32 mc_sbracl = 0x5d; // ']' static const uchar32 mc_caret = 0x5e; // '^' static const uchar32 mc_cbraop = 0x7b; // '{' static const uchar32 mc_bar = 0x7c; // '|' static const uchar32 mc_cbracl = 0x7d; // '}' } // meta_char namespace char_ctrl { static const uchar32 cc_nul = 0x00; // '\0' //0x00:NUL static const uchar32 cc_bs = 0x08; // '\b' //0x08:BS static const uchar32 cc_htab = 0x09; // '\t' //0x09:HT static const uchar32 cc_nl = 0x0a; // '\n' //0x0a:LF static const uchar32 cc_vtab = 0x0b; // '\v' //0x0b:VT static const uchar32 cc_ff = 0x0c; // '\f' //0x0c:FF static const uchar32 cc_cr = 0x0d; // '\r' //0x0d:CR } // char_ctrl namespace char_alnum { static const uchar32 ch_0 = 0x30; // '0' static const uchar32 ch_1 = 0x31; // '1' static const uchar32 ch_7 = 0x37; // '7' static const uchar32 ch_8 = 0x38; // '8' static const uchar32 ch_9 = 0x39; // '9' static const uchar32 ch_A = 0x41; // 'A' static const uchar32 ch_B = 0x42; // 'B' static const uchar32 ch_D = 0x44; // 'D' static const uchar32 ch_F = 0x46; // 'F' static const uchar32 ch_P = 0x50; // 'P' static const uchar32 ch_S = 0x53; // 'S' static const uchar32 ch_W = 0x57; // 'W' static const uchar32 ch_Z = 0x5a; // 'Z' static const uchar32 ch_a = 0x61; // 'a' static const uchar32 ch_b = 0x62; // 'b' static const uchar32 ch_c = 0x63; // 'c' static const uchar32 ch_d = 0x64; // 'd' static const uchar32 ch_f = 0x66; // 'f' static const uchar32 ch_k = 0x6b; // 'k' static const uchar32 ch_n = 0x6e; // 'n' static const uchar32 ch_p = 0x70; // 'p' static const uchar32 ch_q = 0x71; // 'q' static const uchar32 ch_r = 0x72; // 'r' static const uchar32 ch_s = 0x73; // 's' static const uchar32 ch_t = 0x74; // 't' static const uchar32 ch_u = 0x75; // 'u' static const uchar32 ch_v = 0x76; // 'v' static const uchar32 ch_w = 0x77; // 'w' static const uchar32 ch_x = 0x78; // 'x' static const uchar32 ch_z = 0x7a; // 
'z' } // char_alnum namespace char_other { static const uchar32 co_sp = 0x20; // ' ' static const uchar32 co_perc = 0x25; // '%' static const uchar32 co_amp = 0x26; // '&' static const uchar32 co_apos = 0x27; // '\'' static const uchar32 co_slash = 0x2f; // '/' static const uchar32 co_smcln = 0x3b; // ';' static const uchar32 co_atmrk = 0x40; // '@' static const uchar32 co_ll = 0x5f; // '_' static const uchar32 co_grav = 0x60; // '`' static const uchar32 co_tilde = 0x7e; // '~' } // char_other } // namespace regex_internal // ... "rei_constants.h"] // ["rei_utf_traits.hpp" ... namespace regex_internal { template struct utf_traits_core { public: static const std::size_t maxseqlen = 1; static const int utftype = 0; static const std::size_t bitsetsize = 0x100; static const uchar32 bitsetmask = 0xff; static const uchar32 cumask = 0xff; // *iter template static uchar32 codepoint(ForwardIterator begin, const ForwardIterator /* end */) { return static_cast(*begin); // Caller is responsible for begin != end. } // *iter++ template static uchar32 codepoint_inc(ForwardIterator &begin, const ForwardIterator /* end */) { return static_cast(*begin++); // Caller is responsible for begin != end. } // iter2 = iter; return *--iter2; template static uchar32 prevcodepoint(BidirectionalIterator cur, const BidirectionalIterator /* begin */) { return static_cast(*--cur); } // *--iter template static uchar32 dec_codepoint(BidirectionalIterator &cur, const BidirectionalIterator /* begin */) { return static_cast(*--cur); // Caller is responsible for cur != begin. 
} #if !defined(SRELLDBG_NO_BMH) template static bool is_trailing(const charT2 /* cu */) { return false; } #endif // !defined(SRELLDBG_NO_BMH) static uchar32 to_codeunits(charT out[maxseqlen], uchar32 cp) { out[0] = static_cast(cp); return 1; } static uchar32 firstcodeunit(const uchar32 cp) { return cp; } template static bool seek_charboundary(ForwardIterator &begin, const ForwardIterator end) { return begin != end; } }; template const std::size_t utf_traits_core::maxseqlen; template const int utf_traits_core::utftype; template const std::size_t utf_traits_core::bitsetsize; template const uchar32 utf_traits_core::bitsetmask; template const uchar32 utf_traits_core::cumask; // utf_traits_core // common and utf-32. template struct utf_traits : public utf_traits_core { static const int utftype = 32; static const std::size_t bitsetsize = 0x10000; static const uchar32 bitsetmask = 0xffff; static const uchar32 cumask = 0x1fffff; }; template const int utf_traits::utftype; template const std::size_t utf_traits::bitsetsize; template const uchar32 utf_traits::bitsetmask; template const uchar32 utf_traits::cumask; // utf_traits // utf-8 specific. template struct utf8_traits : public utf_traits_core { public: // utf-8 specific. static const std::size_t maxseqlen = 4; static const int utftype = 8; template static uchar32 codepoint(ForwardIterator begin, const ForwardIterator end) { // return codepoint_inc(begin, end); uchar32 codepoint = static_cast(*begin & 0xff); if ((codepoint & 0x80) == 0) // 1 octet. return codepoint; if (++begin != end && (codepoint >= 0xc0 && codepoint <= 0xf7) && (*begin & 0xc0) == 0x80) { codepoint = static_cast((codepoint << 6) | (*begin & 0x3f)); if ((codepoint & 0x800) == 0) // 2 octets. return static_cast(codepoint & 0x7ff); if (++begin != end && (*begin & 0xc0) == 0x80) { codepoint = static_cast((codepoint << 6) | (*begin & 0x3f)); if ((codepoint & 0x10000) == 0) // 3 octets. 
return static_cast(codepoint & 0xffff); if (++begin != end && (*begin & 0xc0) == 0x80) // 4 octets. { codepoint = static_cast((codepoint << 6) | (*begin & 0x3f)); return static_cast(codepoint & 0x1fffff); } } } // else // 80-bf, f8-ff: invalid. return regex_internal::constants::invalid_u32value; } template static uchar32 codepoint_inc(ForwardIterator &begin, const ForwardIterator end) { uchar32 codepoint = static_cast(*begin++ & 0xff); if ((codepoint & 0x80) == 0) // 1 octet. return codepoint; // Expects transformation to (codepoint - 0xc0) <= 0x37 by optimisation. // 0xF7 instead of 0xF4 is for consistency with reverse iterators. if (begin != end && (codepoint >= 0xc0 && codepoint <= 0xf7) && (*begin & 0xc0) == 0x80) // if (begin != end && (0x7f00 & (1 << ((codepoint >> 3) & 0xf))) && (*begin & 0xc0) == 0x80) // c0, c8, d0, d8, e0, e8, f0. { codepoint = static_cast((codepoint << 6) | (*begin++ & 0x3f)); // 11 ?aaa aabb bbbb if ((codepoint & 0x800) == 0) // 2 octets. return static_cast(codepoint & 0x7ff); // c080-c1bf: invalid. 00-7F. // c280-dfbf: valid. 080-7FF. // 11 1aaa aabb bbbb if (begin != end && (*begin & 0xc0) == 0x80) { codepoint = static_cast((codepoint << 6) | (*begin++ & 0x3f)); // 111? aaaa bbbb bbcc cccc if ((codepoint & 0x10000) == 0) // 3 octets. return static_cast(codepoint & 0xffff); // e08080-e09fbf: invalid. 000-7FF. // e0a080-efbfbf: valid. 0800-FFFF. // 1111 0aaa bbbb bbcc cccc if (begin != end && (*begin & 0xc0) == 0x80) // 4 octets. { codepoint = static_cast((codepoint << 6) | (*begin++ & 0x3f)); // f0808080-f08fbfbf: invalid. 0000-FFFF. // f0908080-f3bfbfbf: valid. 10000-FFFFF. // f4808080-f48fbfbf: valid. 100000-10FFFF. // f4908080-f4bfbfbf: invalid. 110000-13FFFF. // f5808080-f7bfbfbf: invalid. 140000-1FFFFF. // 11 110a aabb bbbb cccc ccdd dddd return static_cast(codepoint & 0x1fffff); } } } // else // 80-bf, f8-ff: invalid. 
return regex_internal::constants::invalid_u32value; } template static uchar32 prevcodepoint(BidirectionalIterator cur, const BidirectionalIterator begin) { uchar32 codepoint = static_cast(*--cur); if ((codepoint & 0x80) == 0) return static_cast(codepoint & 0xff); if ((codepoint & 0x40) == 0 && cur != begin) { codepoint = static_cast((codepoint & 0x3f) | (*--cur << 6)); if ((codepoint & 0x3800) == 0x3000) // 2 octets. return static_cast(codepoint & 0x7ff); if ((codepoint & 0x3000) == 0x2000 && cur != begin) { codepoint = static_cast((codepoint & 0xfff) | (*--cur << 12)); if ((codepoint & 0xf0000) == 0xe0000) // 3 octets. return static_cast(codepoint & 0xffff); if ((codepoint & 0xc0000) == 0x80000 && cur != begin) { if ((*--cur & 0xf8) == 0xf0) // 4 octets. return static_cast((codepoint & 0x3ffff) | ((*cur & 7) << 18)); } } } return regex_internal::constants::invalid_u32value; } template static uchar32 dec_codepoint(BidirectionalIterator &cur, const BidirectionalIterator begin) { uchar32 codepoint = static_cast(*--cur); if ((codepoint & 0x80) == 0) return static_cast(codepoint & 0xff); if ((codepoint & 0x40) == 0 && cur != begin) { codepoint = static_cast((codepoint & 0x3f) | (*--cur << 6)); // 11 0bbb bbaa aaaa? if ((codepoint & 0x3800) == 0x3000) // 2 octets. // if ((*cur & 0xe0) == 0xc0) return static_cast(codepoint & 0x7ff); // 10 bbbb bbaa aaaa? if ((codepoint & 0x3000) == 0x2000 && cur != begin) // [\x80-\xbf]{2}. // if ((*cur & 0xc0) == 0x80 && cur != begin) { codepoint = static_cast((codepoint & 0xfff) | (*--cur << 12)); // 1110 cccc bbbb bbaa aaaa? if ((codepoint & 0xf0000) == 0xe0000) // 3 octets. // if ((*cur & 0xf0) == 0xe0) return static_cast(codepoint & 0xffff); // 10cc cccc bbbb bbaa aaaa? if ((codepoint & 0xc0000) == 0x80000 && cur != begin) // [\x80-\xbf]{3}. // if ((*cur & 0xc0) == 0x80 && cur != begin) { if ((*--cur & 0xf8) == 0xf0) // 4 octets. 
return static_cast((codepoint & 0x3ffff) | ((*cur & 7) << 18)); // d ddcc cccc bbbb bbaa aaaa //else // [\0-\xef\xf8-\xff][\x80-\xbf]{3}. // Sequences [\xc0-\xdf][\x80-\xbf] and [\xe0-\xef][\x80-\xbf]{2} are valid. // To give a chance to them, rewinds cur. ++cur; } //else // [\0-\x7f\xc0-\xdf\xf0-\xff][\x80-\xbf]{2}. ++cur; // Sequence [\xc0-\xdf][\x80-\xbf] is valid. Rewinds to give a chance to it. } //else // [\0-\x7f\xe0-\xff][\x80-\xbf]. ++cur; // Rewinds to give a chance to [\0-\x7f]. } //else // [\xc0-\xff]. return regex_internal::constants::invalid_u32value; } #if !defined(SRELLDBG_NO_BMH) template static bool is_trailing(const charT2 cu) { return (cu & 0xc0) == 0x80; } #endif // !defined(SRELLDBG_NO_BMH) static uchar32 to_codeunits(charT out[maxseqlen], uchar32 cp) { if (cp < 0x80) { out[0] = static_cast(cp); return 1; } else if (cp < 0x800) { out[0] = static_cast(((cp >> 6) & 0x1f) | 0xc0); out[1] = static_cast((cp & 0x3f) | 0x80); return 2; } else if (cp < 0x10000) { out[0] = static_cast(((cp >> 12) & 0x0f) | 0xe0); out[1] = static_cast(((cp >> 6) & 0x3f) | 0x80); out[2] = static_cast((cp & 0x3f) | 0x80); return 3; } // else // if (cp < 0x110000) { out[0] = static_cast(((cp >> 18) & 0x07) | 0xf0); out[1] = static_cast(((cp >> 12) & 0x3f) | 0x80); out[2] = static_cast(((cp >> 6) & 0x3f) | 0x80); out[3] = static_cast((cp & 0x3f) | 0x80); return 4; } } static uchar32 firstcodeunit(const uchar32 cp) { if (cp < 0x80) return cp; if (cp < 0x800) return static_cast(((cp >> 6) & 0x1f) | 0xc0); if (cp < 0x10000) return static_cast(((cp >> 12) & 0x0f) | 0xe0); return static_cast(((cp >> 18) & 0x07) | 0xf0); } template static bool seek_charboundary(ForwardIterator &begin, const ForwardIterator end) { for (; begin != end; ++begin) { // if ((*begin & 0xc0) != 0x80 && (*begin & 0xf8) != 0xf8) // 00-7f, c0-f7. if ((*begin & 0xc0) != 0x80) // 00-7f, c0-ff. 
return true; } return false; } }; template const std::size_t utf8_traits::maxseqlen; template const int utf8_traits::utftype; // utf8_traits // utf-16 specific. template struct utf16_traits : public utf_traits_core { public: // utf-16 specific. static const std::size_t maxseqlen = 2; static const int utftype = 16; static const std::size_t bitsetsize = 0x10000; static const uchar32 bitsetmask = 0xffff; static const uchar32 cumask = 0xffff; template static uchar32 codepoint(ForwardIterator begin, const ForwardIterator end) { const uchar32 codeunit = *begin; if ((codeunit & 0xdc00) != 0xd800) return static_cast(codeunit & 0xffff); if (++begin != end && (*begin & 0xdc00) == 0xdc00) return static_cast((((codeunit & 0x3ff) << 10) | (*begin & 0x3ff)) + 0x10000); return static_cast(codeunit & 0xffff); } template static uchar32 codepoint_inc(ForwardIterator &begin, const ForwardIterator end) { const uchar32 codeunit = *begin++; if ((codeunit & 0xdc00) != 0xd800) return static_cast(codeunit & 0xffff); if (begin != end && (*begin & 0xdc00) == 0xdc00) return static_cast((((codeunit & 0x3ff) << 10) | (*begin++ & 0x3ff)) + 0x10000); return static_cast(codeunit & 0xffff); } template static uchar32 prevcodepoint(BidirectionalIterator cur, const BidirectionalIterator begin) { const uchar32 codeunit = *--cur; if ((codeunit & 0xdc00) != 0xdc00 || cur == begin) return static_cast(codeunit & 0xffff); if ((*--cur & 0xdc00) == 0xd800) return static_cast((((*cur & 0x3ff) << 10) | (codeunit & 0x3ff)) + 0x10000); return static_cast(codeunit & 0xffff); } template static uchar32 dec_codepoint(BidirectionalIterator &cur, const BidirectionalIterator begin) { const uchar32 codeunit = *--cur; if ((codeunit & 0xdc00) != 0xdc00 || cur == begin) return static_cast(codeunit & 0xffff); if ((*--cur & 0xdc00) == 0xd800) return static_cast((((*cur & 0x3ff) << 10) | (codeunit & 0x3ff)) + 0x10000); //else // (codeunit & 0xdc00) == 0xdc00 && (*cur & 0xdc00) != 0xd800 ++cur; return static_cast(codeunit & 
0xffff); } #if !defined(SRELLDBG_NO_BMH) template static bool is_trailing(const charT2 cu) { return (cu & 0xdc00) == 0xdc00; } #endif // !defined(SRELLDBG_NO_BMH) static uchar32 to_codeunits(charT out[maxseqlen], uchar32 cp) { if (cp < 0x10000) { out[0] = static_cast(cp); return 1; } // else // if (cp < 0x110000) { cp -= 0x10000; out[0] = static_cast(((cp >> 10) & 0x3ff) | 0xd800); out[1] = static_cast((cp & 0x3ff) | 0xdc00); return 2; } } static uchar32 firstcodeunit(const uchar32 cp) { if (cp < 0x10000) return cp; return static_cast((cp >> 10) + 0xd7c0); // aaaaa bbbbcccc ddddeeee -> AA AAbb bbcc/cc dddd eeee where AAAA = aaaaa - 1. } template static bool seek_charboundary(ForwardIterator &begin, const ForwardIterator end) { for (; begin != end; ++begin) { if ((*begin & 0xdc00) != 0xdc00) return true; } return false; } }; template const std::size_t utf16_traits::maxseqlen; template const int utf16_traits::utftype; template const std::size_t utf16_traits::bitsetsize; template const uchar32 utf16_traits::bitsetmask; template const uchar32 utf16_traits::cumask; // utf16_traits // specialisation for char. template <> struct utf_traits : public utf_traits_core { public: template static uchar32 codepoint(ForwardIterator begin, const ForwardIterator /* end */) { return static_cast(static_cast(*begin)); } template static uchar32 codepoint_inc(ForwardIterator &begin, const ForwardIterator /* end */) { return static_cast(static_cast(*begin++)); } template static uchar32 prevcodepoint(BidirectionalIterator cur, const BidirectionalIterator /* begin */) { return static_cast(static_cast(*--cur)); } template static uchar32 dec_codepoint(BidirectionalIterator &cur, const BidirectionalIterator /* begin */) { return static_cast(static_cast(*--cur)); } #if !defined(SRELLDBG_NO_BMH) #endif // !defined(SRELLDBG_NO_BMH) }; // utf_traits // specialisation for signed char. 
template <> struct utf_traits : public utf_traits { }; // (signed) short, (signed) int, (signed) long, (signed) long long, ... #if defined(SRELL_CPP11_CHAR1632_ENABLED) template <> struct utf_traits : public utf16_traits { }; #endif #if defined(SRELL_CPP20_CHAR8_ENABLED) template <> struct utf_traits : public utf8_traits { }; #endif } // regex_internal // ... "rei_utf_traits.hpp"] // ["regex_traits.hpp" ... // 28.7, class template regex_traits: template struct regex_traits { public: typedef charT char_type; typedef std::basic_string string_type; typedef std::locale locale_type; // typedef bitmask_type char_class_type; typedef int char_class_type; typedef regex_internal::utf_traits utf_traits; public: // regex_traits(); static std::size_t length(const char_type *p) { return std::char_traits::length(p); } charT translate(const charT c) const { return c; } charT translate_nocase(const charT c) const { return c; } template string_type transform(ForwardIterator first, ForwardIterator last) const { return string_type(first, last); } template string_type transform_primary(ForwardIterator first, ForwardIterator last) const { return string_type(first, last); } template string_type lookup_collatename(ForwardIterator first, ForwardIterator last) const { return string_type(first, last); } template char_class_type lookup_classname(ForwardIterator /* first */, ForwardIterator /* last */, bool /* icase */ = false) const { return static_cast(0); } bool isctype(const charT /* c */, const char_class_type /* f */) const { return false; } int value(const charT /* ch */, const int /* radix */) const { return -1; } locale_type imbue(const locale_type /* l */) { return locale_type(); } locale_type getloc() const { return locale_type(); } }; // regex_traits template struct u8regex_traits : public regex_traits { typedef regex_internal::utf8_traits utf_traits; }; template struct u16regex_traits : public regex_traits { typedef regex_internal::utf16_traits utf_traits; }; // ... 
"regex_traits.hpp"] // ["rei_memory.hpp" ... namespace regex_internal { /* * Similar to std::basic_string, except for: * a. only allocates memory, does not initialise it. * b. uses realloc() to avoid moving data as much as possible when * resizing an allocated buffer. */ template class simple_array { public: typedef ElemT value_type; typedef std::size_t size_type; typedef ElemT &reference; typedef const ElemT &const_reference; typedef ElemT *pointer; typedef const ElemT *const_pointer; static const size_type npos = static_cast(-1); public: simple_array() : buffer_(NULL) , size_(0) , capacity_(0) { } simple_array(const size_type initsize) : buffer_(NULL) , size_(0) , capacity_(0) { if (initsize) { buffer_ = static_cast(std::malloc(initsize * sizeof (ElemT))); if (buffer_ != NULL) size_ = capacity_ = initsize; else throw std::bad_alloc(); } } simple_array(const simple_array &right, size_type pos, size_type len = npos) : buffer_(NULL) , size_(0) , capacity_(0) { if (pos > right.size_) pos = right.size_; { const size_type len2 = right.size_ - pos; if (len > len2) len = len2; } if (len) { buffer_ = static_cast(std::malloc(len * sizeof (ElemT))); if (buffer_ != NULL) { for (capacity_ = len; size_ < capacity_;) buffer_[size_++] = right[pos++]; } else { throw std::bad_alloc(); } } } simple_array(const simple_array &right) : buffer_(NULL) , size_(0) , capacity_(0) { operator=(right); } #if defined(SRELL_CPP11_MOVE_ENABLED) simple_array(simple_array &&right) SRELL_NOEXCEPT : buffer_(right.buffer_) , size_(right.size_) , capacity_(right.capacity_) { right.size_ = 0; right.capacity_ = 0; right.buffer_ = NULL; } #endif simple_array &operator=(const simple_array &right) { if (this != &right) { resize(right.size_); for (size_type i = 0; i < right.size_; ++i) buffer_[i] = right.buffer_[i]; } return *this; } #if defined(SRELL_CPP11_MOVE_ENABLED) simple_array &operator=(simple_array &&right) SRELL_NOEXCEPT { if (this != &right) { if (this->buffer_ != NULL) std::free(this->buffer_); 
this->size_ = right.size_; this->capacity_ = right.capacity_; this->buffer_ = right.buffer_; right.size_ = 0; right.capacity_ = 0; right.buffer_ = NULL; } return *this; } #endif ~simple_array() { if (buffer_ != NULL) std::free(buffer_); } size_type size() const { return size_; } void clear() { size_ = 0; } void resize(const size_type newsize) { if (newsize > capacity_) reserve(newsize); size_ = newsize; } void resize(const size_type newsize, const ElemT &type) { size_type oldsize = size_; resize(newsize); for (; oldsize < size_; ++oldsize) buffer_[oldsize] = type; } reference operator[](const size_type pos) { return buffer_[pos]; } const_reference operator[](const size_type pos) const { return buffer_[pos]; } void push_back(const_reference n) { const size_type oldsize = size_; if (++size_ > capacity_) reserve(size_); buffer_[oldsize] = n; } void push_backncr(const ElemT e) { push_back(e); } const_reference back() const { return buffer_[size_ - 1]; } reference back() { return buffer_[size_ - 1]; } void pop_back() { --size_; } simple_array &operator+=(const simple_array &right) { return append(right); } simple_array &append(const size_type size, const ElemT &type) { resize(size_ + size, type); return *this; } simple_array &append(const simple_array &right) { size_type oldsize = size_; resize(size_ + right.size_); for (size_type i = 0; i < right.size_; ++i, ++oldsize) buffer_[oldsize] = right.buffer_[i]; return *this; } simple_array &append(const simple_array &right, size_type pos, size_type len /* = npos */) { { const size_type len2 = right.size_ - pos; if (len > len2) len = len2; } size_type oldsize = size_; resize(size_ + len); len += pos; // end. 
for (; pos < len; ++oldsize, ++pos) buffer_[oldsize] = right.buffer_[pos]; return *this; } void erase(const size_type pos) { if (pos < size_) { std::memmove(buffer_ + pos, buffer_ + pos + 1, (size_ - pos - 1) * sizeof (ElemT)); --size_; } } void erase(const size_type pos, const size_type len) { if (pos < size_) { size_type rmndr = size_ - pos; if (rmndr > len) { rmndr -= len; std::memmove(buffer_ + pos, buffer_ + pos + len, rmndr * sizeof (ElemT)); size_ -= len; } else size_ = pos; } } // For rei_compiler class. void insert(const size_type pos, const ElemT &type) { move_forward(pos, 1); buffer_[pos] = type; } void insert(size_type pos, const simple_array &right) { move_forward(pos, right.size_); for (size_type i = 0; i < right.size_; ++i, ++pos) buffer_[pos] = right.buffer_[i]; } void insert(size_type destpos, const simple_array &right, size_type srcpos, size_type srclen = npos) { { const size_type len2 = right.size_ - srcpos; if (srclen > len2) srclen = len2; } move_forward(destpos, srclen); srclen += srcpos; // srcend. for (; srcpos < srclen; ++destpos, ++srcpos) buffer_[destpos] = right.buffer_[srcpos]; } simple_array &replace(size_type pos, size_type count, const simple_array &right) { if (count < right.size_) move_forward(pos + count, right.size_ - count); else if (count > right.size_) { const pointer base = buffer_ + pos; std::memmove(base + right.size_, base + count, (size_ - pos - count) * sizeof (ElemT)); size_ -= count - right.size_; } for (size_type i = 0; i < right.size_; ++pos, ++i) buffer_[pos] = right[i]; return *this; } size_type find(const value_type c, size_type pos = 0) const { for (; pos <= size_; ++pos) if (buffer_[pos] == c) return pos; return npos; } const_pointer data() const { return buffer_; } int compare(size_type pos, const size_type count1, const_pointer p, const size_type count2) const { size_type count = count1 <= count2 ? 
count1 : count2; for (; count; ++pos, ++p, --count) { const value_type &v = buffer_[pos]; if (v != *p) return v < *p ? -1 : 1; } return count1 == count2 ? 0 : (count1 < count2 ? -1 : 1); } size_type max_size() const { return maxsize_; } void swap(simple_array &right) { if (this != &right) { const pointer tmpbuffer = this->buffer_; const size_type tmpsize = this->size_; const size_type tmpcapacity = this->capacity_; this->buffer_ = right.buffer_; this->size_ = right.size_; this->capacity_ = right.capacity_; right.buffer_ = tmpbuffer; right.size_ = tmpsize; right.capacity_ = tmpcapacity; } } private: void reserve(const size_type newsize) { // if (newsize > capacity_) { if (newsize <= maxsize_) { // capacity_ = newsize + (newsize >> 1); // newsize * 1.5. capacity_ = ((newsize >> 8) + 1) << 8; // Round up to a multiple of 256. if (capacity_ > maxsize_) capacity_ = maxsize_; const size_type newsize_in_byte = capacity_ * sizeof (ElemT); const pointer oldbuffer = buffer_; buffer_ = static_cast(std::realloc(buffer_, newsize_in_byte)); if (buffer_ != NULL) return; // Even if realloc() failed, already-existing buffer remains valid. std::free(oldbuffer); // buffer_ = NULL; size_ = capacity_ = 0; } throw std::bad_alloc(); } } void move_forward(const size_type pos, const size_type count) { const size_type oldsize = size_; resize(size_ + count); if (pos < oldsize) { const pointer base = buffer_ + pos; std::memmove(base + count, base, (oldsize - pos) * sizeof (ElemT)); } } private: pointer buffer_; size_type size_; size_type capacity_; // static const size_type maxsize_ = (npos - sizeof (simple_array)) / sizeof (ElemT); static const size_type maxsize_ = (npos - sizeof (pointer) - sizeof (size_type) * 2) / sizeof (ElemT) / 2; }; template const typename simple_array::size_type simple_array::npos; // simple_array } // namespace regex_internal // ... "rei_memory.hpp"] // ["rei_bitset.hpp" ... namespace regex_internal { // Always uses a heap instead of the stack. 
//  Fixed-size set of Bits bits.
//  Always uses a heap instead of the stack.
//
//  A moved-from bitset holds a NULL buffer; it may only be destroyed,
//  swapped or move-assigned to.
template <std::size_t Bits>
class bitset
{
private:

	typedef unsigned long array_type;

public:

	//  Allocates the storage and clears every bit.
	//  Throws std::bad_alloc when the allocation fails.
	bitset()
		: buffer_(static_cast<array_type *>(std::malloc(size_in_byte_)))
	{
		if (buffer_ != NULL)
		{
			reset();
			return;
		}
		throw std::bad_alloc();
	}

	//  Allocates fresh storage and copies the bits of right into it.
	//  Throws std::bad_alloc when the allocation fails.
	bitset(const bitset &right)
		: buffer_(static_cast<array_type *>(std::malloc(size_in_byte_)))
	{
		if (buffer_ != NULL)
		{
			operator=(right);
			return;
		}
		throw std::bad_alloc();
	}

#if defined(SRELL_CPP11_MOVE_ENABLED)
	//  Takes over the buffer of right and leaves right without storage.
	bitset(bitset &&right) SRELL_NOEXCEPT
		: buffer_(right.buffer_)
	{
		right.buffer_ = NULL;
	}
#endif

	//  Copies every bit of right into the existing storage.
	bitset &operator=(const bitset &right)
	{
		if (this != &right)
		{
//			for (std::size_t i = 0; i < arraylength_; ++i)
//				buffer_[i] = right.buffer_[i];
			std::memcpy(buffer_, right.buffer_, size_in_byte_);
		}
		return *this;
	}

#if defined(SRELL_CPP11_MOVE_ENABLED)
	//  Releases the current storage and takes over the buffer of right.
	bitset &operator=(bitset &&right) SRELL_NOEXCEPT
	{
		if (this != &right)
		{
			if (this->buffer_ != NULL)
				std::free(this->buffer_);

			this->buffer_ = right.buffer_;
			right.buffer_ = NULL;
		}
		return *this;
	}
#endif

	~bitset()
	{
		if (buffer_ != NULL)
			std::free(buffer_);
	}

	//  Clears all bits.
	bitset &reset()
	{
		std::memset(buffer_, 0, size_in_byte_);
		return *this;
	}

	//  Clears one bit.
	bitset &reset(const std::size_t bit)
	{
		buffer_[bit / bits_per_elem_] &= ~mask_of(bit);
		return *this;
	}

	//  Sets one bit.
	bitset &set(const std::size_t bit)
	{
		buffer_[bit / bits_per_elem_] |= mask_of(bit);
		return *this;
	}

#if 0
	void set_range(const std::size_t firstbit, const std::size_t lastbit)
	{
		const std::size_t lastelemidx = lastbit / bits_per_elem_;
		std::size_t firstelemidx = firstbit / bits_per_elem_;
		const array_type lastelemmask = ~(allbits1_ << ((lastbit & bitmask_) + 1));
		array_type ormask = allbits1_ << (firstbit & bitmask_);

		if (firstelemidx < lastelemidx)
		{
			buffer_[firstelemidx] |= ormask;
			ormask = allbits1_;

			for (++firstelemidx; firstelemidx < lastelemidx; ++firstelemidx)
				buffer_[firstelemidx] |= ormask;
		}
		ormask &= lastelemmask;
		buffer_[lastelemidx] |= ormask;
	}
#endif

	//  Returns true when the bit is set.
	bool test(const std::size_t bit) const
	{
		return (buffer_[bit / bits_per_elem_] & mask_of(bit)) != 0;
	}

	//  Same as test().
	bool operator[](const std::size_t bit) const
	{
		return (buffer_[bit / bits_per_elem_] & mask_of(bit)) != 0;
	}

	//  Inverts every storage word, including any padding bits past Bits.
	bitset &flip()
	{
		for (std::size_t i = 0; i < arraylength_; ++i)
			buffer_[i] = ~buffer_[i];
		return *this;
	}

	//  Exchanges the buffers of the two sets.
	void swap(bitset &right)
	{
		if (this != &right)
		{
			array_type *const tmpbuffer = this->buffer_;
			this->buffer_ = right.buffer_;
			right.buffer_ = tmpbuffer;
		}
	}

private:

	//  Single-bit mask for bit within its storage word. The shift is done
	//  in array_type: with the int literal 1 a shift count of 32 or more
	//  (possible whenever array_type is 64 bits wide) is undefined and
	//  makes distinct bits alias each other.
	static array_type mask_of(const std::size_t bit)
	{
		return static_cast<array_type>(1) << (bit & bitmask_);
	}

#if defined(__cpp_constexpr)
	//  Largest power of two, starting from p2, that does not exceed n.
	static constexpr std::size_t pow2leN(const std::size_t n, const std::size_t p2)
	{
		return ((p2 << 1) == 0 || (p2 << 1) > n) ? p2 : pow2leN(n, p2 << 1);
	}
	static const std::size_t bits_per_elem_ = pow2leN(CHAR_BIT * sizeof (array_type), 8);
#else
	static const std::size_t bpe_tmp_ = CHAR_BIT * sizeof (array_type);
	static const std::size_t bits_per_elem_ = bpe_tmp_ >= 64 ? 64 : (bpe_tmp_ >= 32 ? 32 : (bpe_tmp_ >= 16 ? 16 : 8));
#endif
	static const std::size_t bitmask_ = bits_per_elem_ - 1;
	static const std::size_t arraylength_ = (Bits + bitmask_) / bits_per_elem_;
	static const std::size_t size_in_byte_ = arraylength_ * sizeof (array_type);
	static const array_type allbits1_ = ~static_cast<array_type>(0);

	array_type *buffer_;
};
namespace regex_internal { #if !defined(SRELL_NO_UNICODE_ICASE) namespace ucf_constants { #include "srell_ucfdata2.hpp" } // namespace ucf_constants namespace ucf_internal { typedef ucf_constants::unicode_casefolding ucfdata; } // namespace ucf_internal #endif // !defined(SRELL_NO_UNICODE_ICASE) namespace ucf_constants { #if !defined(SRELL_NO_UNICODE_ICASE) static const uchar32 rev_maxset = ucf_internal::ucfdata::rev_maxset; #else static const uchar32 rev_maxset = 2; #endif } // namespace ucf_constants class unicode_case_folding { public: static uchar32 do_casefolding(const uchar32 cp) { #if !defined(SRELL_NO_UNICODE_ICASE) if (cp <= ucf_internal::ucfdata::ucf_maxcodepoint) return cp + ucf_internal::ucfdata::ucf_deltatable[ucf_internal::ucfdata::ucf_segmenttable[cp >> 8] + (cp & 0xff)]; #else if (cp >= char_alnum::ch_A && cp <= char_alnum::ch_Z) // 'A' && 'Z' return static_cast(cp - char_alnum::ch_A + char_alnum::ch_a); // - 'A' + 'a' #endif return cp; } static uchar32 casefoldedcharset(uchar32 out[ucf_constants::rev_maxset], const uchar32 cp) { #if !defined(SRELL_NO_UNICODE_ICASE) uchar32 count = 0; if (cp <= ucf_internal::ucfdata::rev_maxcodepoint) { const uchar32 offset_of_charset = ucf_internal::ucfdata::rev_indextable[ucf_internal::ucfdata::rev_segmenttable[cp >> 8] + (cp & 0xff)]; const uchar32 *ptr = &ucf_internal::ucfdata::rev_charsettable[offset_of_charset]; for (; *ptr != cfcharset_eos_ && count < ucf_constants::rev_maxset; ++ptr, ++count) out[count] = *ptr; } if (count == 0) out[count++] = cp; return count; #else // const uchar32 nocase = static_cast(cp & ~0x20); const uchar32 nocase = static_cast(cp | constants::asc_icase); out[0] = cp; // if (nocase >= char_alnum::ch_A && nocase <= char_alnum::ch_Z) if (nocase >= char_alnum::ch_a && nocase <= char_alnum::ch_z) { out[1] = static_cast(cp ^ constants::asc_icase); return 2; } return 1; #endif } unicode_case_folding &operator=(const unicode_case_folding &) { return *this; } #if 
defined(SRELL_CPP11_MOVE_ENABLED) unicode_case_folding &operator=(unicode_case_folding &&) SRELL_NOEXCEPT { return *this; } #endif void swap(unicode_case_folding & /* right */) { } private: #if !defined(SRELL_NO_UNICODE_ICASE) static const uchar32 cfcharset_eos_ = ucf_internal::ucfdata::eos; #endif public: // For debug. void print_tables() const; }; // unicode_case_folding } // namespace regex_internal // ... "rei_ucf.hpp"] // ["rei_up.hpp" ... namespace regex_internal { #if !defined(SRELL_NO_UNICODE_PROPERTY) namespace up_constants { #include "srell_updata2.hpp" static const uint_l32 error_property = static_cast(-1); } // namespace up_constants namespace up_internal { typedef up_constants::up_type pname_type; typedef const char *pname_string_type; #if defined(SRELL_UPDATA_VERSION) && (SRELL_UPDATA_VERSION >= 200) struct pvalue_type { pname_type pname; uint_l32 pnumber; pname_string_type csstrings; }; #else struct pvalue_type { pname_type pname; pname_string_type csstrings; uint_l32 pnumber; }; #endif struct offset_and_number { std::size_t offset; std::size_t number_of_pairs; }; typedef up_constants::unicode_property_data< pname_string_type, uchar32, pvalue_type, offset_and_number > updata; } // namespace up_internal //template class unicode_property { public: typedef simple_array pstring; unicode_property() { } unicode_property &operator=(const unicode_property &) { return *this; } #if defined(SRELL_CPP11_MOVE_ENABLED) unicode_property &operator=(unicode_property &&) SRELL_NOEXCEPT { return *this; } #endif static uint_l32 lookup_property(const pstring &name, const pstring &value) { pname_type ptype = name.size() ? 
lookup_property_name(name) : up_constants::uptype_general_category; // property_type property_number = lookup_property_value(ptype, value); uint_l32 property_number = lookup_property_value(ptype, value); if (property_number == static_cast(up_constants::upid_unknown) && name.size() == 0) { ptype = up_constants::uptype_binary; property_number = lookup_property_value(ptype, value); } return property_number != static_cast(up_constants::upid_unknown) ? property_number : up_constants::error_property; } static std::size_t ranges_offset(const uint_l32 property_number) { #if defined(SRELL_UPDATA_VERSION) return updata::positiontable[property_number].offset; #else const offset_and_number *const postable = updata::position_table(); return postable[property_number].offset; #endif } static std::size_t number_of_ranges(const uint_l32 property_number) { #if defined(SRELL_UPDATA_VERSION) return updata::positiontable[property_number].number_of_pairs; #else const offset_and_number *const postable = updata::position_table(); return postable[property_number].number_of_pairs; #endif } static const uchar32 *ranges_address(const uint_l32 property_number) { #if defined(SRELL_UPDATA_VERSION) return &updata::rangetable[ranges_offset(property_number) << 1]; #else const uchar32 *const ranges = updata::ranges(); return &ranges[ranges_offset(property_number) << 1]; #endif } static bool is_valid_pno(const uint_l32 pno) { return pno != up_constants::error_property && pno <= max_property_number; } static bool is_pos(const uint_l32 pno) { return pno > max_property_number && pno <= max_pos_number; } private: typedef up_internal::pname_type pname_type; typedef up_internal::pname_string_type pname_string_type; typedef up_internal::pvalue_type pvalue_type; typedef up_internal::offset_and_number offset_and_number; typedef up_internal::updata updata; static pname_type lookup_property_name(const pstring &name) { #if defined(SRELL_UPDATA_VERSION) for (std::size_t pno = 0; *updata::propertynametable[pno]; 
++pno) { if (check_if_included(name, updata::propertynametable[pno])) return static_cast(pno); } #else const pname_string_type *const pname_table = updata::propertyname_table(); for (std::size_t pno = 0; *pname_table[pno]; ++pno) { if (check_if_included(name, pname_table[pno])) return static_cast(pno); } #endif return up_constants::uptype_unknown; } // Checks if value is included in colon-separated strings. static bool check_if_included(const pstring &value, pname_string_type csstrings) { if (static_cast(*csstrings) != meta_char::mc_astrsk) // '*' { while (*csstrings) { const pname_string_type begin = csstrings; for (; static_cast(*csstrings) != meta_char::mc_colon && static_cast(*csstrings) != char_ctrl::cc_nul; ++csstrings); const std::size_t length = csstrings - begin; if (static_cast(value.size()) == length) if (value.compare(0, value.size(), begin, length) == 0) return true; if (static_cast(*csstrings) == meta_char::mc_colon) ++csstrings; } } return false; } static uint_l32 lookup_property_value(const pname_type ptype, const pstring &value) { #if defined(SRELL_UPDATA_VERSION) for (std::size_t pno = 0; *updata::rangenumbertable[pno].csstrings; ++pno) { const pvalue_type &pvalue = updata::rangenumbertable[pno]; if (pvalue.pname == ptype && check_if_included(value, pvalue.csstrings)) return pvalue.pnumber; } #else const pvalue_type *const pvalue_table = updata::rangenumber_table(); for (std::size_t pno = 0; *pvalue_table[pno].csstrings; ++pno) { const pvalue_type &pvalue = pvalue_table[pno]; if (pvalue.pname == ptype && check_if_included(value, pvalue.csstrings)) return pvalue.pnumber; } #endif return static_cast(up_constants::upid_unknown); } private: static const std::size_t max_property_number = static_cast(up_constants::upid_max_property_number); static const std::size_t max_pos_number = static_cast(up_constants::upid_max_pos_number); }; // unicode_property #endif // !defined(SRELL_NO_UNICODE_PROPERTY) } // namespace regex_internal // ... 
"rei_up.hpp"] // ["rei_range_pair.hpp" ... namespace regex_internal { struct range_pair // , public std::pair { uchar32 second; uchar32 first; void set(const uchar32 min, const uchar32 max) { this->first = min; this->second = max; } void set(const uchar32 minmax) { this->first = minmax; this->second = minmax; } bool is_range_valid() const { return first <= second; } bool operator==(const range_pair &right) const { return this->first == right.first && this->second == right.second; } bool operator<(const range_pair &right) const { return this->second < right.first; // This assumes that optimise() has been called. } void swap(range_pair &right) { const range_pair tmp = *this; *this = right; right = tmp; } bool unify_range(const range_pair &right) { range_pair &left = *this; if (right.first <= left.second || left.second + 1 == right.first) // r1 <= l2 || l2+1 == r1 { // l1 l2+1 < r1 r2 excluded. if (left.first <= right.second || right.second + 1 == left.first) // l1 <= r2 || r2+1 == l1 { // r1 r2+1 < l1 l2 excluded. 
if (left.first > right.first) left.first = right.first; if (left.second < right.second) left.second = right.second; return true; } } return false; } }; // range_pair struct range_pair_helper : public range_pair { range_pair_helper(const uchar32 min, const uchar32 max) { this->first = min; this->second = max; } range_pair_helper(const uchar32 minmax) { this->first = minmax; this->second = minmax; } }; // range_pair_helper struct range_pairs // : public simple_array { public: typedef simple_array array_type; typedef array_type::size_type size_type; range_pairs() { } range_pairs(const range_pairs &rp) : rparray_(rp.rparray_) { } range_pairs &operator=(const range_pairs &rp) { rparray_.operator=(rp.rparray_); return *this; } range_pairs(const size_type initsize) : rparray_(initsize) { } range_pairs(const range_pairs &right, size_type pos, size_type size) : rparray_(right.rparray_, pos, size) { } #if defined(SRELL_CPP11_MOVE_ENABLED) range_pairs(range_pairs &&rp) SRELL_NOEXCEPT : rparray_(std::move(rp.rparray_)) { } range_pairs &operator=(range_pairs &&rp) SRELL_NOEXCEPT { rparray_.operator=(std::move(rp.rparray_)); return *this; } #endif void clear() { rparray_.clear(); } size_type size() const { return rparray_.size(); } const range_pair &operator[](const size_type pos) const { return rparray_[pos]; } range_pair &operator[](const size_type pos) { return rparray_[pos]; } void resize(const size_type size) { rparray_.resize(size); } void swap(range_pairs &right) { rparray_.swap(right.rparray_); } void set_solerange(const range_pair &right) { rparray_.clear(); rparray_.push_back(right); } void append_newclass(const range_pairs &right) { rparray_.append(right.rparray_); } void append_newpair(const range_pair &right) { rparray_.push_back(right); } void join(const range_pair &right) { size_type pos = 0; for (; pos < rparray_.size(); ++pos) { range_pair &curpair = rparray_[pos]; if (curpair.unify_range(right)) { for (++pos; pos < rparray_.size();) { if 
(curpair.unify_range(rparray_[pos])) rparray_.erase(pos); else break; } return; } if (right.second < curpair.first) break; } rparray_.insert(pos, right); } void merge(const range_pairs &right) { for (size_type i = 0; i < right.size(); ++i) join(right[i]); } bool same(uchar32 pos, const uchar32 count, const range_pairs &right) const { if (count == right.size()) { for (uchar32 i = 0; i < count; ++i, ++pos) if (!(rparray_[pos] == right[i])) return false; return true; } return false; } int relationship(const range_pairs &right) const { if (rparray_.size() == right.rparray_.size()) { for (size_type i = 0; i < rparray_.size(); ++i) { if (!(this->rparray_[i] == right.rparray_[i])) { if (i == 0) goto check_overlap; return 1; // Overlapped. } } return 0; // Same. } check_overlap: return is_overlap(right) ? 1 : 2; // Overlapped or exclusive. } void negation() { uchar32 begin = 0; range_pairs newpairs; for (size_type i = 0; i < rparray_.size(); ++i) { const range_pair &range = rparray_[i]; if (begin < range.first) newpairs.join(range_pair_helper(begin, range.first - 1)); begin = range.second + 1; } if (begin <= constants::unicode_max_codepoint) newpairs.join(range_pair_helper(begin, constants::unicode_max_codepoint)); *this = newpairs; } bool is_overlap(const range_pairs &right) const { for (size_type i = 0; i < rparray_.size(); ++i) { const range_pair &leftrange = rparray_[i]; for (size_type j = 0; j < right.size(); ++j) { const range_pair &rightrange = right[j]; if (rightrange.first <= leftrange.second) // Excludes l1 l2 < r1 r2. if (leftrange.first <= rightrange.second) // Excludes r1 r2 < l1 l2. 
return true; } } return false; } void load_from_memory(const uchar32 *array, size_type number_of_pairs) { for (; number_of_pairs; --number_of_pairs, array += 2) join(range_pair_helper(array[0], array[1])); } void make_caseunfoldedcharset() { uchar32 table[ucf_constants::rev_maxset] = {}; bitset bs; for (size_type i = 0; i < rparray_.size(); ++i) { const range_pair &range = rparray_[i]; for (uchar32 ucp = range.first; ucp <= range.second; ++ucp) { const uchar32 setnum = unicode_case_folding::casefoldedcharset(table, ucp); for (uchar32 j = 0; j < setnum; ++j) bs.set(table[j]); } } load_from_bitset(bs); } // For updataout.hpp. void remove_range(const range_pair &right) { for (size_type pos = 0; pos < rparray_.size();) { range_pair &left = rparray_[pos]; if (right.first <= left.first && left.first <= right.second) // r1 <= l1 <= r2. { if (left.second > right.second) // r1 <= l1 <= r2 < l2. { left.first = right.second + 1; // carry doesn't happen. ++pos; } else // r1 <= l1 <= l2 <= r2. rparray_.erase(pos); } else if (right.first <= left.second && left.second <= right.second) // r1 <= l2 <= r2. { if (left.first < right.first) // l1 < r1 <= l2 <= r2. { left.second = right.first - 1; ++pos; } else // r1 <= l1 <= l2 <= r2. rparray_.erase(pos); } else if (left.first < right.first && right.second < left.second) // l1 < r1 && r2 < l2. { range_pair newrange(left); left.second = right.first - 1; newrange.first = right.second + 1; rparray_.insert(++pos, newrange); ++pos; } else ++pos; } } // template uchar32 consists_of_one_character(const bool icase) const { if (rparray_.size() >= 1) { uchar32 (*const casefolding_func)(const uchar32) = !icase ? 
do_nothing : unicode_case_folding::do_casefolding; const uchar32 ucp1st = casefolding_func(rparray_[0].first); for (size_type no = 0; no < rparray_.size(); ++no) { const range_pair &cr = rparray_[no]; for (uchar32 ucp = cr.first;; ++ucp) { if (ucp1st != casefolding_func(ucp)) return constants::invalid_u32value; if (ucp == cr.second) break; } } return ucp1st; } return constants::invalid_u32value; } void split_ranges(range_pairs &kept, range_pairs &removed, const range_pairs &rightranges) const { range_pair newpair; kept.rparray_ = this->rparray_; // Subtraction set. removed.clear(); // Intersection set. for (size_type i = 0;; ++i) { RETRY_SAMEINDEXNO: if (i >= kept.rparray_.size()) break; range_pair &left = kept.rparray_[i]; for (size_type j = 0; j < rightranges.rparray_.size(); ++j) { const range_pair &right = rightranges.rparray_[j]; if (right.first <= left.second) // Excludes l1 l2 < r1 r2. { if (left.first <= right.second) // Excludes r1 r2 < l1 l2. { if (left.first < right.first) { if (right.second < left.second) { removed.join(range_pair_helper(right.first, right.second)); newpair.set(right.second + 1, left.second); left.second = right.first - 1; kept.rparray_.insert(i + 1, newpair); } else { removed.join(range_pair_helper(right.first, left.second)); left.second = right.first - 1; } } else if (right.second < left.second) { removed.join(range_pair_helper(left.first, right.second)); left.first = right.second + 1; } else { removed.join(range_pair_helper(left.first, left.second)); kept.rparray_.erase(i); goto RETRY_SAMEINDEXNO; } } } else break; } } } #if defined(SRELLDBG_NO_BITSET) bool is_included(const uchar32 ch) const { #if 01 const range_pair *const end = rparray_.data() + rparray_.size(); for (const range_pair *cur = rparray_.data(); cur != end; ++cur) { if (ch <= cur->second) return ch >= cur->first; #else for (size_type i = 0; i < rparray_.size(); ++i) { if (rparray_[i].is_included(ch)) return true; #endif } return false; } #endif // 
defined(SRELLDBG_NO_BITSET) // For multiple_range_pairs functions. bool is_included_ls(const uchar32 pos, uchar32 count, const uchar32 c) const { const range_pair *cur = &rparray_[pos]; for (; count; ++cur, --count) { if (c <= cur->second) return c >= cur->first; } return false; } bool is_included(const uchar32 pos, uchar32 count, const uchar32 c) const { const range_pair *base = &rparray_[pos]; while (count) { uchar32 mid = count >> 1; const range_pair &rp = base[mid]; if (c <= rp.second) { if (c >= rp.first) return true; count = mid; } else { ++mid; count -= mid; base += mid; } } return false; } void replace(const size_type pos, const size_type count, const range_pairs &right) { rparray_.replace(pos, count, right.rparray_); } #if !defined(SRELLDBG_NO_CCPOS) // For Eytzinger layout functions. bool is_included_el(uchar32 pos, const uchar32 len, const uchar32 c) const { const range_pair *const base = &rparray_[pos]; #if defined(__GNUC__) __builtin_prefetch(base); #endif for (pos = 0; pos < len;) { const range_pair &rp = base[pos]; if (c <= rp.second) { if (c >= rp.first) return true; pos = (pos << 1) + 1; } else { pos = (pos << 1) + 2; } } return false; } uchar32 create_el(const range_pair *srcbase, const uchar32 srcsize) { const uchar32 basepos = static_cast(rparray_.size()); rparray_.resize(basepos + srcsize); set_eytzinger_layout(0, srcbase, srcsize, &rparray_[basepos], 0); return srcsize; } #endif // !defined(SRELLDBG_NO_CCPOS) uint_l32 total_codepoints() const { uint_l32 num = 0; for (size_type no = 0; no < rparray_.size(); ++no) { const range_pair &cr = rparray_[no]; num += cr.second - cr.first + 1; } return num; } private: #if !defined(SRELLDBG_NO_CCPOS) uchar32 set_eytzinger_layout(uchar32 srcpos, const range_pair *const srcbase, const uchar32 srclen, range_pair *const destbase, const uchar32 destpos) { if (destpos < srclen) { const uchar32 nextpos = (destpos << 1) + 1; srcpos = set_eytzinger_layout(srcpos, srcbase, srclen, destbase, nextpos); 
destbase[destpos] = srcbase[srcpos++]; srcpos = set_eytzinger_layout(srcpos, srcbase, srclen, destbase, nextpos + 1); } return srcpos; } #endif // !defined(SRELLDBG_NO_CCPOS) static uchar32 do_nothing(const uchar32 cp) { return cp; } template void load_from_bitset(const BitSetT &bs) { uchar32 begin = constants::invalid_u32value; range_pairs newranges; for (uchar32 ucp = 0;; ++ucp) { if (ucp > constants::unicode_max_codepoint || !bs.test(ucp)) { if (begin != constants::invalid_u32value) { newranges.join(range_pair_helper(begin, ucp - 1)); begin = constants::invalid_u32value; } if (ucp > constants::unicode_max_codepoint) break; } else if (begin == constants::invalid_u32value && bs.test(ucp)) begin = ucp; } rparray_.swap(newranges.rparray_); } array_type rparray_; public: // For debug. void print_pairs(const int, const char *const = NULL, const char *const = NULL) const; }; // range_pairs } // namespace regex_internal // ... "rei_range_pair.hpp"] // ["rei_char_class.hpp" ... namespace regex_internal { #if !defined(SRELL_NO_UNICODE_PROPERTY) // For RegExpIdentifierStart and RegExpIdentifierPart struct identifier_charclass { public: void clear() { char_class_.clear(); char_class_pos_.clear(); } void setup() { if (char_class_pos_.size() == 0) { static const uchar32 additions[] = { // reg_exp_identifier_start, reg_exp_identifier_part. 0x24, 0x24, 0x5f, 0x5f, 0x200c, 0x200d // '$' '_' - }; range_pairs ranges; // For reg_exp_identifier_start. { const uchar32 *const IDs_address = unicode_property::ranges_address(upid_bp_ID_Start); const std::size_t IDs_number = unicode_property::number_of_ranges(upid_bp_ID_Start); ranges.load_from_memory(IDs_address, IDs_number); } ranges.load_from_memory(&additions[0], 2); append_charclass(ranges); // For reg_exp_identifier_part. 
ranges.clear(); { const uchar32 *const IDc_address = unicode_property::ranges_address(upid_bp_ID_Continue); const std::size_t IDc_number = unicode_property::number_of_ranges(upid_bp_ID_Continue); ranges.load_from_memory(IDc_address, IDc_number); } ranges.load_from_memory(&additions[0], 3); append_charclass(ranges); } } bool is_identifier(const uchar32 ch, const bool part) const { const range_pair &rp = char_class_pos_[part ? 1 : 0]; return char_class_.is_included(rp.first, rp.second, ch); } private: void append_charclass(const range_pairs &rps) { char_class_pos_.push_back(range_pair_helper(static_cast(char_class_.size()), static_cast(rps.size()))); char_class_.append_newclass(rps); } range_pairs char_class_; range_pairs::array_type char_class_pos_; // UnicodeIDStart:: // any Unicode code point with the Unicode property "ID_Start" // UnicodeIDContinue:: // any Unicode code point with the Unicode property "ID_Continue" static const uint_l32 upid_bp_ID_Start = static_cast(up_constants::bp_ID_Start); static const uint_l32 upid_bp_ID_Continue = static_cast(up_constants::bp_ID_Continue); }; // identifier_charclass #endif // !defined(SRELL_NO_UNICODE_PROPERTY) class re_character_class { public: enum { // 0 1 2 3 4 5 newline, dotall, space, digit, word, icase_word, // 6 number_of_predefcls }; #if !defined(SRELL_NO_UNICODE_PROPERTY) typedef unicode_property::pstring pstring; #endif re_character_class() { setup_predefinedclass(); } re_character_class &operator=(const re_character_class &that) { if (this != &that) { this->char_class_ = that.char_class_; this->char_class_pos_ = that.char_class_pos_; #if !defined(SRELLDBG_NO_CCPOS) this->char_class_el_ = that.char_class_el_; this->char_class_pos_el_ = that.char_class_pos_el_; #endif } return *this; } #if defined(SRELL_CPP11_MOVE_ENABLED) re_character_class &operator=(re_character_class &&that) SRELL_NOEXCEPT { if (this != &that) { this->char_class_ = std::move(that.char_class_); this->char_class_pos_ = 
std::move(that.char_class_pos_); #if !defined(SRELLDBG_NO_CCPOS) this->char_class_el_ = std::move(that.char_class_el_); this->char_class_pos_el_ = std::move(that.char_class_pos_el_); #endif } return *this; } #endif bool is_included(const uint_l32 class_number, const uchar32 c) const { // return char_class_.is_included(char_class_pos_[class_number], c); const range_pair &rp = char_class_pos_[class_number]; return char_class_.is_included(rp.first, rp.second, c); } #if !defined(SRELLDBG_NO_CCPOS) // bool is_included(const uint_l32 pos, const uint_l32 len, const uchar32 &c) const bool is_included(const uchar32 pos, const uchar32 len, const uchar32 c) const { return char_class_el_.is_included_el(pos, len, c); } #endif void setup_icase_word() { range_pair &icase_pos = char_class_pos_[icase_word]; if (icase_pos.second == char_class_pos_[word].second) { range_pairs icasewordclass(char_class_, icase_pos.first, icase_pos.second); icasewordclass.make_caseunfoldedcharset(); // Includes 017f and 212a so that they and their case-folded // characters 's' and 'k' will be excluded from the character // set that /[\W]/i matches. 
char_class_.replace(icase_pos.first, icase_pos.second, icasewordclass); if (icase_pos.second < static_cast(icasewordclass.size())) { const uchar32 delta = static_cast(icasewordclass.size() - icase_pos.second); for (int i = number_of_predefcls; i < static_cast(char_class_pos_.size()); ++i) char_class_pos_[i].first += delta; } icase_pos.second = static_cast(icasewordclass.size()); } } void clear() { char_class_pos_.resize(number_of_predefcls); uchar32 basesize = 0; for (int i = 0; i < number_of_predefcls; ++i) basesize += char_class_pos_[i].second; char_class_.resize(basesize); #if !defined(SRELLDBG_NO_CCPOS) char_class_el_.clear(); char_class_pos_el_.clear(); #endif } uint_l32 register_newclass(const range_pairs &rps) { for (range_pairs::size_type no = 0; no < char_class_pos_.size(); ++no) { const range_pair &rp = char_class_pos_[no]; if (char_class_.same(rp.first, rp.second, rps)) return static_cast(no); } append_charclass(rps); return static_cast(char_class_pos_.size() - 1); } range_pairs operator[](const uint_l32 no) const { const range_pair &ccpos = char_class_pos_[no]; range_pairs rp(ccpos.second); for (uchar32 i = 0; i < ccpos.second; ++i) rp[i] = char_class_[ccpos.first + i]; return rp; } #if !defined(SRELLDBG_NO_CCPOS) const range_pair &charclasspos(const uint_l32 no) // const { const range_pair &pos = char_class_pos_el_[no]; if (pos.second == 0) finalise(no); return pos; } void finalise() { char_class_el_.clear(); char_class_pos_el_.resize(char_class_pos_.size()); std::memset(&char_class_pos_el_[0], 0, char_class_pos_el_.size() * sizeof (range_pairs::array_type::value_type)); } void finalise(const uint_l32 no) { const range_pair &posinfo = char_class_pos_[no]; range_pair &outpair = char_class_pos_el_[no]; outpair.first = static_cast(char_class_el_.size()); outpair.second = char_class_el_.create_el(&char_class_[posinfo.first], posinfo.second); //arraysize; } #endif // #if !defined(SRELLDBG_NO_CCPOS) void optimise() { } #if !defined(SRELL_NO_UNICODE_PROPERTY) 
uint_l32 get_propertynumber(const pstring &pname, const pstring &pvalue) const { const uint_l32 pno = static_cast(unicode_property::lookup_property(pname, pvalue)); return (pno != up_constants::error_property) ? pno : up_constants::error_property; } bool load_upranges(range_pairs &newranges, const uint_l32 property_number) const { newranges.clear(); if (unicode_property::is_valid_pno(property_number)) { if (property_number == upid_bp_Assigned) { load_updata(newranges, upid_gc_Cn); newranges.negation(); } else load_updata(newranges, property_number); return true; } return false; } // Properties of strings. bool is_pos(const uint_l32 pno) const { return unicode_property::is_pos(pno); } bool get_prawdata(simple_array &seq, uint_l32 property_number) { if (property_number != up_constants::error_property) { if (property_number == upid_bp_Assigned) property_number = upid_gc_Cn; const uchar32 *const address = unicode_property::ranges_address(property_number); // const std::size_t offset = unicode_property::ranges_offset(property_number); const std::size_t number = unicode_property::number_of_ranges(property_number) * 2; seq.resize(number); for (uchar32 i = 0; i < number; ++i) seq[i] = address[i]; return true; } seq.clear(); return false; } #endif // !defined(SRELL_NO_UNICODE_PROPERTY) void swap(re_character_class &right) { if (this != &right) { this->char_class_.swap(right.char_class_); this->char_class_pos_.swap(right.char_class_pos_); #if !defined(SRELLDBG_NO_CCPOS) this->char_class_el_.swap(right.char_class_el_); this->char_class_pos_el_.swap(right.char_class_pos_el_); #endif } } private: #if !defined(SRELL_NO_UNICODE_PROPERTY) void load_updata(range_pairs &newranges, const uint_l32 property_number) const { const uchar32 *const address = unicode_property::ranges_address(property_number); // const std::size_t offset = unicode_property::ranges_offset(property_number); const std::size_t number = unicode_property::number_of_ranges(property_number); 
newranges.load_from_memory(address, number); } #endif // !defined(SRELL_NO_UNICODE_PROPERTY) void append_charclass(const range_pairs &rps) { char_class_pos_.push_back(range_pair_helper(static_cast(char_class_.size()), static_cast(rps.size()))); char_class_.append_newclass(rps); } // The production CharacterClassEscape::s evaluates as follows: // Return the set of characters containing the characters that are on the right-hand side of the WhiteSpace or LineTerminator productions. // WhiteSpace:: // 0009 000B 000C 0020 00A0 FEFF Zs // LineTerminator:: // 000A 000D 2028 2029 void setup_predefinedclass() { #if !defined(SRELL_NO_UNICODE_PROPERTY) const uchar32 *const Zs_address = unicode_property::ranges_address(upid_gc_Zs); // const std::size_t Zs_offset = unicode_property::ranges_offset(upid_gc_Zs); const std::size_t Zs_number = unicode_property::number_of_ranges(upid_gc_Zs); #else static const uchar32 Zs[] = { 0x1680, 0x1680, 0x2000, 0x200a, // 0x2028, 0x2029, 0x202f, 0x202f, 0x205f, 0x205f, 0x3000, 0x3000 }; #endif // defined(SRELL_NO_UNICODE_PROPERTY) static const uchar32 allranges[] = { // dotall. 0x0000, 0x10ffff, // newline. 0x0a, 0x0a, 0x0d, 0x0d, // \n \r // newline, space. 0x2028, 0x2029, // space. 0x09, 0x0d, // \t \n \v \f \r 0x20, 0x20, // ' ' 0xa0, 0xa0, // 0xfeff, 0xfeff, // // digit, word. 0x30, 0x39, // '0'-'9' 0x41, 0x5a, 0x5f, 0x5f, 0x61, 0x7a // 'A'-'Z' '_' 'a'-'z' }; range_pairs ranges; // newline. ranges.load_from_memory(&allranges[2], 3); append_charclass(ranges); // dotall. ranges.clear(); ranges.load_from_memory(&allranges[0], 1); append_charclass(ranges); // space. ranges.clear(); ranges.load_from_memory(&allranges[6], 5); #if !defined(SRELL_NO_UNICODE_PROPERTY) ranges.load_from_memory(Zs_address, Zs_number); #else ranges.load_from_memory(Zs, 5); #endif append_charclass(ranges); // digit. ranges.clear(); ranges.load_from_memory(&allranges[16], 1); append_charclass(ranges); // word. 
ranges.clear(); ranges.load_from_memory(&allranges[16], 4); append_charclass(ranges); // Reservation for icase_word. append_charclass(ranges); } private: range_pairs char_class_; range_pairs::array_type char_class_pos_; #if !defined(SRELLDBG_NO_CCPOS) range_pairs char_class_el_; range_pairs::array_type char_class_pos_el_; #endif #if !defined(SRELL_NO_UNICODE_PROPERTY) static const uint_l32 upid_gc_Zs = static_cast(up_constants::gc_Space_Separator); static const uint_l32 upid_gc_Cn = static_cast(up_constants::gc_Unassigned); static const uint_l32 upid_bp_Assigned = static_cast(up_constants::bp_Assigned); #endif public: // For debug. void print_classes(const int) const; }; // re_character_class } // namespace regex_internal // ... "rei_char_class.hpp"] // ["rei_groupname_mapper.hpp" ... namespace regex_internal { #if !defined(SRELL_NO_NAMEDCAPTURE) template class groupname_mapper { public: typedef simple_array gname_string; typedef typename gname_string::size_type size_type; static const uint_l32 notfound = static_cast(-1); groupname_mapper() { } groupname_mapper(const groupname_mapper &right) : names_(right.names_), keysize_classno_(right.keysize_classno_) { } #if defined(SRELL_CPP11_MOVE_ENABLED) groupname_mapper(groupname_mapper &&right) SRELL_NOEXCEPT : names_(std::move(right.names_)), keysize_classno_(std::move(right.keysize_classno_)) { } #endif groupname_mapper &operator=(const groupname_mapper &right) { if (this != &right) { names_ = right.names_; keysize_classno_ = right.keysize_classno_; } return *this; } #if defined(SRELL_CPP11_MOVE_ENABLED) groupname_mapper &operator=(groupname_mapper &&right) SRELL_NOEXCEPT { if (this != &right) { names_ = std::move(right.names_); keysize_classno_ = std::move(right.keysize_classno_); } return *this; } #endif void clear() { names_.clear(); keysize_classno_.clear(); } uint_l32 operator[](const gname_string &gname) const { uint_l32 pos = 0; for (std::size_t i = 0; i < static_cast(keysize_classno_.size()); i += 2) { const 
uint_l32 keysize = keysize_classno_[i]; if (keysize == static_cast(gname.size()) && sameseq(pos, gname)) return keysize_classno_[++i]; pos += keysize; } return notfound; } gname_string operator[](const uint_l32 indexno) const { uint_l32 pos = 0; for (std::size_t i = 0; i < static_cast(keysize_classno_.size()); ++i) { const uint_l32 keysize = keysize_classno_[i]; const uint_l32 classno = keysize_classno_[++i]; if (classno == indexno) return gname_string(names_, pos, keysize); pos += keysize; } return gname_string(); } size_type size() const { return static_cast(keysize_classno_.size() >> 1); } bool push_back(const gname_string &gname, const uint_l32 class_number) { const uint_l32 num = operator[](gname); if (num == notfound) { names_.append(gname); keysize_classno_.append(1, static_cast(gname.size())); keysize_classno_.append(1, class_number); return true; } return false; // Already exists. } void swap(groupname_mapper &right) { this->names_.swap(right.names_); keysize_classno_.swap(right.keysize_classno_); } private: bool sameseq(size_type pos, const gname_string &gname) const { for (size_type i = 0; i < gname.size(); ++i, ++pos) if (pos >= names_.size() || names_[pos] != gname[i]) return false; return true; } gname_string names_; simple_array keysize_classno_; public: // For debug. void print_mappings(const int) const; }; template const uint_l32 groupname_mapper::notfound; // groupname_mapper #endif // !defined(SRELL_NO_NAMEDCAPTURE) } // namespace regex_internal // ... "rei_groupname_mapper.hpp"] // ["rei_state.hpp" ... namespace regex_internal { struct re_quantifier { // atleast and atmost: for check_counter. // offset and length: for charcter_class. // (Special case 1) in roundbracket_open and roundbracket_pop atleast and atmost represent // the minimum and maximum bracket numbers respectively inside the brackets itself. 
// (Special case 2) in repeat_in_push and repeat_in_pop atleast and atmost represent the // minimum and maximum bracket numbers respectively inside the repetition. union { uint_l32 atleast; // (Special case 3: v1) in lookaround_open represents the number of characters to be rewound. // (Special case 3: v2) in lookaround_open represents: 0=lookaheads, 1=lookbehinds, // 2=matchpointrewinder. // (Special case 4) in NFA_states[0] represents the class number of the first character class. uchar32 offset; }; union { uint_l32 atmost; uchar32 length; }; union { bool is_greedy; uint_l32 padding_; }; void reset(const uint_l32 len = 1) { atleast = atmost = len; is_greedy = true; } void set(const uint_l32 min, const uint_l32 max) { atleast = min; atmost = max; } void set(const uint_l32 min, const uint_l32 max, const bool greedy) { atleast = min; atmost = max; is_greedy = greedy; } void setccpos(const uchar32 o, const uchar32 l) { offset = o; length = l; } bool is_valid() const { return atleast <= atmost && atmost > 0; } void set_infinity() { atmost = constants::infinity; } bool is_infinity() const { return atmost == constants::infinity; } bool is_same() const { return atleast == atmost; } bool is_default() const { return atleast == 1 && atmost == 1; } bool is_asterisk() const { return atleast == 0 && atmost == constants::infinity; } bool is_plus() const { return atleast == 1 && atmost == constants::infinity; } bool is_asterisk_or_plus() const { return atleast <= 1 && atmost == constants::infinity; } bool is_question_or_asterisk() const { return atleast == 0 && (atmost == 1 || atmost == constants::infinity); } bool has_simple_equivalence() const { return (atleast <= 1 && atmost <= 3) || (atleast == 2 && atmost <= 4) || (atleast == atmost && atmost <= 6); } void multiply(const re_quantifier &q) { if (atleast != constants::infinity) { if (q.atleast != constants::infinity) atleast *= q.atleast; else atleast = constants::infinity; } if (atmost != constants::infinity) { if (q.atmost 
!= constants::infinity) atmost *= q.atmost; else atmost = constants::infinity; } } void add(const re_quantifier &q) { if (atleast != constants::infinity) { if (q.atleast != constants::infinity && (atleast + q.atleast) >= atleast) atleast += q.atleast; else atleast = constants::infinity; } if (atmost != constants::infinity) { if (q.atmost != constants::infinity && (atmost + q.atmost) >= atmost) atmost += q.atmost; else atmost = constants::infinity; } } }; // re_quantifier struct re_state { union { uchar32 character; // For character. uint_l32 number; // For character_class, brackets, counter, repeat, backreference. }; re_state_type type; union { std::ptrdiff_t next1; re_state *next_state1; // Points to the next state. // (Special case 1) in lookaround_open points to the next of lookaround_close. }; union { std::ptrdiff_t next2; re_state *next_state2; // character and character_class: points to another possibility, non-backtracking. // epsilon: points to another possibility, backtracking. // save_and_reset_counter, roundbracket_open, and repeat_in_push: points to a // restore state, backtracking. // check_counter: complementary to next1 based on quantifier.is_greedy. // (Special case 1) roundbracket_close, check_0_width_repeat, and backreference: // points to the next state as an exit after 0 width match. // (Special case 2) in NFA_states[0] holds the entry point for match_continuous/regex_match. // (Special case 3) in lookaround_open points to the contents of brackets. }; re_quantifier quantifier; // For check_counter, roundbrackets, repeasts, (?<=...) and (? number of chars to be rewound (for (?<=...) (? 0: lookahead, 1: lookbehind, 2: mprewinder. 
// q.atmost: - // q.greedy: - // is_not/dont_push: not // st_bol, // 0x0f // char/number: - // next1/next2: - // quantifiers: - // is_not/dont_push: - // st_eol, // 0x10 // char/number: - // next1/next2: - // quantifiers: - // is_not/dont_push: - // st_boundary, // 0x11 // char/number: - // next1/next2: - // quantifiers: - // is_not/dont_push: not // st_success, // 0x12 // char/number: - // next1/next2: - // quantifiers: - // is_not/dont_push: - // st_move_nextpos, // 0x13 // char/number: - // next1/next2: - // quantifiers: - // is_not/dont_push: - void reset() { number = 0; type = st_character; next1 = 1; next2 = 0; is_not = false; quantifier.reset(); } bool is_character_or_class() const { return type == st_character || type == st_character_class; } bool has_quantifier() const { // 1. character: size == 1 && type == character, // 2. [...]: size == 1 && type == character_class, // 3. (...): size == ? && type == roundbracket_open, // 4. (?:...): size == ? && type == epsilon && character == ':', // 5. backref: size == ? && type == backreference, // -- assertions boundary -- // 6. lookaround: size == ? && type == lookaround, // 7. assertion: size == 0 && type == one of assertions (^, $, \b and \B). #if !defined(SRELL_ENABLE_GT) return type < st_zero_width_boundary; #else // 5.5. independent: size == ? 
&& type == lookaround && character == '>', return type < st_zero_width_boundary || (type == st_lookaround_open && character == meta_char::mc_gt); #endif } bool is_noncapturinggroup() const { return type == st_epsilon && character == meta_char::mc_colon; } bool has_0widthchecker() const { return type == st_roundbracket_open || type == st_backreference; } bool is_negcharclass() const { return type == st_character_class && is_not; } bool is_branch() const { return type == st_epsilon && next2 != 0 && character == meta_char::mc_bar; // '|' } }; // re_state struct re_flags { // bool i; // bool m; // bool s; #if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) bool back; #endif void reset(const regex_constants::syntax_option_type /* flags */) { // i = (flags & regex_constants::icase) != 0; // Case-insensitive. // m = (flags & regex_constants::multiline) != 0; // s = (flags & regex_constants::dotall) != 0; #if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) back = false; #endif } void restore_from(const re_flags &backup) { #if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) back = backup.back; #endif } }; // re_flags template struct re_compiler_state : public re_flags { bool backref_used; simple_array atleast_widths_of_brackets; #if !defined(SRELL_NO_NAMEDCAPTURE) groupname_mapper unresolved_gnames; #endif #if !defined(SRELL_NO_UNICODE_PROPERTY) identifier_charclass idchecker; #endif void reset(const regex_constants::syntax_option_type flags) { re_flags::reset(flags); backref_used = false; atleast_widths_of_brackets.clear(); #if !defined(SRELL_NO_NAMEDCAPTURE) unresolved_gnames.clear(); #endif #if !defined(SRELL_NO_UNICODE_PROPERTY) // idchecker.clear(); // Keeps data once created. #endif } }; // re_compiler_state } // namespace regex_internal // ... "rei_state.hpp"] // ["rei_search_state.hpp" ... 
template class sub_match /* : std::pair */; namespace regex_internal { //template struct re_state; template struct re_search_state_core { const re_state/* */ *in_NFA_states; BidirectionalIterator in_string; }; template struct re_submatch_core { BidirectionalIterator open_at; BidirectionalIterator close_at; }; template struct re_submatch_type { re_submatch_core core; uint_l32 counter; }; template struct re_search_state_types { typedef re_submatch_core submatch_core; typedef re_submatch_type submatch_type; typedef uint_l32 counter_type; typedef BidirectionalIterator position_type; typedef std::vector submatch_array; typedef re_search_state_core search_core_state; typedef std::vector backtracking_array; typedef std::vector capture_array; typedef simple_array counter_array; typedef std::vector repeat_array; }; template struct re_search_state_types { typedef re_submatch_core submatch_core; typedef re_submatch_type submatch_type; typedef uint_l32 counter_type; typedef const charT2 *position_type; typedef simple_array submatch_array; typedef re_search_state_core search_core_state; typedef simple_array backtracking_array; typedef simple_array capture_array; typedef simple_array repeat_array; typedef simple_array counter_array; }; // re_search_state_types template class re_search_state : public re_search_state_types { private: typedef re_search_state_types base_type; public: typedef typename base_type::submatch_core submatchcore_type; typedef typename base_type::submatch_type submatch_type; typedef typename base_type::counter_type counter_type; typedef typename base_type::position_type position_type; typedef typename base_type::submatch_array submatch_array; typedef typename base_type::search_core_state search_core_state; typedef typename base_type::backtracking_array backtracking_array; typedef typename base_type::capture_array capture_array; typedef typename base_type::counter_array counter_array; typedef typename base_type::repeat_array repeat_array; typedef typename 
backtracking_array::size_type btstack_size_type; public: struct bottom_state { btstack_size_type btstack_size; typename capture_array::size_type capturestack_size; typename counter_array::size_type counterstack_size; typename repeat_array::size_type repeatstack_size; bottom_state( const btstack_size_type bt, const typename capture_array::size_type h, const typename counter_array::size_type c, const typename repeat_array::size_type r) : btstack_size(bt) , capturestack_size(h) , counterstack_size(c) , repeatstack_size(r) { } }; public: search_core_state nth; #if !defined(SRELL_NO_LIMIT_COUNTER) std::size_t failure_counter; #endif BidirectionalIterator srchend; BidirectionalIterator lblim; BidirectionalIterator nextpos; backtracking_array bt_stack; capture_array capture_stack; counter_array counter_stack; repeat_array repeat_stack; submatch_array bracket; counter_array counter; repeat_array repeat; btstack_size_type btstack_size; BidirectionalIterator srchbegin; public: void init ( const BidirectionalIterator begin, const BidirectionalIterator end, const BidirectionalIterator lookbehindlimit, const regex_constants::match_flag_type flags ) { lblim = lookbehindlimit; nextpos = srchbegin = begin; srchend = end; flags_ = flags; } void set_entrypoint(const re_state *const entry) { entry_state_ = entry; } void init_for_automaton ( uint_l32 num_of_submatches, const uint_l32 num_of_counters, const uint_l32 num_of_repeats ) { bracket.resize(num_of_submatches); counter.resize(num_of_counters); repeat.resize(num_of_repeats); nth.in_string = (flags_ & regex_constants::match_continuous) ? srchbegin : srchend; while (num_of_submatches > 1) { submatch_type &br = bracket[--num_of_submatches]; br.core.open_at = br.core.close_at = this->srchend; br.counter = 0; // 15.10.2.9; AtomEscape: // If the regular expression has n or more capturing parentheses // but the nth one is undefined because it hasn't captured anything, // then the backreference always succeeds. 
// C.f., table 27 and 28 on TR1, table 142 and 143 on C++11. } clear_stacks(); } #if defined(SRELL_NO_LIMIT_COUNTER) void reset(/* const BidirectionalIterator start */) #else void reset(/* const BidirectionalIterator start, */ const std::size_t limit) #endif { nth.in_NFA_states = this->entry_state_; bracket[0].core.open_at = nth.in_string; #if !defined(SRELL_NO_LIMIT_COUNTER) failure_counter = limit; #endif } bool is_at_lookbehindlimit() const { return nth.in_string == this->lblim; } bool is_at_srchend() const { return nth.in_string == this->srchend; } bool is_null() const { return nth.in_string == bracket[0].core.open_at; } // regex_constants::match_flag_type flags() const // { // return this->flags_; // } bool match_not_bol_flag() const { if (this->flags_ & regex_constants::match_not_bol) return true; return false; } bool match_not_eol_flag() const { if (this->flags_ & regex_constants::match_not_eol) return true; return false; } bool match_not_bow_flag() const { if (this->flags_ & regex_constants::match_not_bow) return true; return false; } bool match_not_eow_flag() const { if (this->flags_ & regex_constants::match_not_eow) return true; return false; } bool match_prev_avail_flag() const { if (this->flags_ & regex_constants::match_prev_avail) return true; return false; } bool match_not_null_flag() const { if (this->flags_ & regex_constants::match_not_null) return true; return false; } bool match_continuous_flag() const { if (this->flags_ & regex_constants::match_continuous) return true; return false; } bool match_match_flag() const { if (this->flags_ & regex_constants::match_match_) return true; return false; } bool set_bracket0(const BidirectionalIterator begin, const BidirectionalIterator end) { nth.in_string = begin; nextpos = end; return true; } void clear_stacks() { btstack_size = 0; bt_stack.clear(); capture_stack.clear(); repeat_stack.clear(); counter_stack.clear(); } btstack_size_type size() const // For debug. 
{ return bt_stack.size(); } bool is_empty() const // For debug. { if (btstack_size == 0 && bt_stack.size() == 0 && capture_stack.size() == 0 && repeat_stack.size() == 0 && counter_stack.size() == 0) return true; return false; } private: /* const */regex_constants::match_flag_type flags_; const re_state/* */ * /* const */entry_state_; }; // re_search_state } // namespace regex_internal // ... "rei_search_state.hpp"] // ["rei_bmh.hpp" ... namespace regex_internal { #if !defined(SRELLDBG_NO_BMH) template class re_bmh { public: re_bmh() { } re_bmh(const re_bmh &right) { operator=(right); } #if defined(SRELL_CPP11_MOVE_ENABLED) re_bmh(re_bmh &&right) SRELL_NOEXCEPT { operator=(std::move(right)); } #endif re_bmh &operator=(const re_bmh &that) { if (this != &that) { this->u32string_ = that.u32string_; this->bmtable_ = that.bmtable_; this->repseq_ = that.repseq_; } return *this; } #if defined(SRELL_CPP11_MOVE_ENABLED) re_bmh &operator=(re_bmh &&that) SRELL_NOEXCEPT { if (this != &that) { this->u32string_ = std::move(that.u32string_); this->bmtable_ = std::move(that.bmtable_); this->repseq_ = std::move(that.repseq_); } return *this; } #endif void clear() { u32string_.clear(); bmtable_.clear(); repseq_.clear(); } void setup(const simple_array &u32s, const bool icase) { u32string_ = u32s; setup_(); if (!icase) setup_for_casesensitive(); else setup_for_icase(); } template bool do_casesensitivesearch(re_search_state &sstate, const std::random_access_iterator_tag) const { RandomAccessIterator begin = sstate.srchbegin; const RandomAccessIterator end = sstate.srchend; std::size_t offset = static_cast(repseq_.size() - 1); const charT *const relastchar = &repseq_[offset]; for (; static_cast(end - begin) > offset;) { begin += offset; if (*begin == *relastchar) { const charT *re = relastchar; RandomAccessIterator tail = begin; for (; *--re == *--tail;) { if (re == repseq_.data()) return sstate.set_bracket0(tail, ++begin); } } offset = bmtable_[*begin & 0xff]; } return false; } 
template bool do_casesensitivesearch(re_search_state &sstate, const std::bidirectional_iterator_tag) const { BidirectionalIterator begin = sstate.srchbegin; const BidirectionalIterator end = sstate.srchend; std::size_t offset = static_cast(repseq_.size() - 1); const charT *const relastchar = &repseq_[offset]; for (;;) { for (; offset; --offset, ++begin) if (begin == end) return false; if (*begin == *relastchar) { const charT *re = relastchar; BidirectionalIterator tail = begin; for (; *--re == *--tail;) { if (re == repseq_.data()) return sstate.set_bracket0(tail, ++begin); } } offset = bmtable_[*begin & 0xff]; } } template bool do_icasesearch(re_search_state &sstate, const std::random_access_iterator_tag) const { const RandomAccessIterator begin = sstate.srchbegin; const RandomAccessIterator end = sstate.srchend; std::size_t offset = bmtable_[256]; const uchar32 entrychar = u32string_[u32string_.size() - 1]; const uchar32 *const re2ndlastchar = &u32string_[u32string_.size() - 2]; RandomAccessIterator curpos = begin; for (; static_cast(end - curpos) > offset;) { curpos += offset; for (; utf_traits::is_trailing(*curpos);) if (++curpos == end) return false; const uchar32 txtlastchar = utf_traits::codepoint(curpos, end); if (txtlastchar == entrychar || unicode_case_folding::do_casefolding(txtlastchar) == entrychar) { const uchar32 *re = re2ndlastchar; RandomAccessIterator tail = curpos; // for (; *--re == unicode_case_folding::do_casefolding(utf_traits::dec_codepoint(tail, begin));) for (; *re == unicode_case_folding::do_casefolding(utf_traits::dec_codepoint(tail, begin)); --re) { if (re == u32string_.data()) { utf_traits::codepoint_inc(curpos, end); return sstate.set_bracket0(tail, curpos); } if (tail == begin) break; } } offset = bmtable_[txtlastchar & 0xff]; } return false; } template bool do_icasesearch(re_search_state &sstate, const std::bidirectional_iterator_tag) const { const BidirectionalIterator begin = sstate.srchbegin; const BidirectionalIterator end = 
sstate.srchend; if (begin != end) { std::size_t offset = bmtable_[256]; //static_cast(u32string_.size() - 1); const uchar32 entrychar = u32string_[offset]; const uchar32 *const re2ndlastchar = &u32string_[offset - 1]; BidirectionalIterator curpos = begin; for (;;) { for (;;) { if (++curpos == end) return false; if (!utf_traits::is_trailing(*curpos)) if (--offset == 0) break; } // const uchar32 txtlastchar = unicode_case_folding::do_casefolding(utf_traits::codepoint(curpos, end)); const uchar32 txtlastchar = utf_traits::codepoint(curpos, end); // if (txtlastchar == *re2ndlastchar) // if (txtlastchar == *re2ndlastchar || unicode_case_folding::do_casefolding(txtlastchar) == *re2ndlastchar) if (txtlastchar == entrychar || unicode_case_folding::do_casefolding(txtlastchar) == entrychar) { const uchar32 *re = re2ndlastchar; BidirectionalIterator tail = curpos; for (; *re == unicode_case_folding::do_casefolding(utf_traits::dec_codepoint(tail, begin)); --re) { if (re == u32string_.data()) { utf_traits::codepoint_inc(curpos, end); return sstate.set_bracket0(tail, curpos); } if (tail == begin) break; } } offset = bmtable_[txtlastchar & 0xff]; } } return false; } private: void setup_() { bmtable_.resize(257); } void setup_for_casesensitive() { charT mbstr[utf_traits::maxseqlen]; const std::size_t u32str_lastcharpos_ = static_cast(u32string_.size() - 1); repseq_.clear(); for (std::size_t i = 0; i <= u32str_lastcharpos_; ++i) { const uchar32 seqlen = utf_traits::to_codeunits(mbstr, u32string_[i]); for (uchar32 j = 0; j < seqlen; ++j) repseq_.push_back(mbstr[j]); } for (std::size_t i = 0; i < 256; ++i) bmtable_[i] = static_cast(repseq_.size()); const std::size_t repseq_lastcharpos_ = static_cast(repseq_.size() - 1); for (std::size_t i = 0; i < repseq_lastcharpos_; ++i) bmtable_[repseq_[i] & 0xff] = repseq_lastcharpos_ - i; } void setup_for_icase() { charT mbstr[utf_traits::maxseqlen]; uchar32 u32table[ucf_constants::rev_maxset]; const std::size_t u32str_lastcharpos = 
static_cast(u32string_.size() - 1); simple_array minlen(u32string_.size()); std::size_t cu_repseq_lastcharpos = 0; for (std::size_t i = 0; i <= u32str_lastcharpos; ++i) { const uchar32 setnum = unicode_case_folding::casefoldedcharset(u32table, u32string_[i]); uchar32 u32c = u32table[0]; for (uchar32 j = 1; j < setnum; ++j) if (u32c > u32table[j]) u32c = u32table[j]; if (i < u32str_lastcharpos) cu_repseq_lastcharpos += minlen[i] = utf_traits::to_codeunits(mbstr, u32c); } ++cu_repseq_lastcharpos; for (std::size_t i = 0; i < 256; ++i) bmtable_[i] = cu_repseq_lastcharpos; bmtable_[256] = --cu_repseq_lastcharpos; for (std::size_t i = 0; i < u32str_lastcharpos; ++i) { const uchar32 setnum = unicode_case_folding::casefoldedcharset(u32table, u32string_[i]); for (uchar32 j = 0; j < setnum; ++j) bmtable_[u32table[j] & 0xff] = cu_repseq_lastcharpos; cu_repseq_lastcharpos -= minlen[i]; } } public: // For debug. void print_table() const; void print_seq() const; private: simple_array u32string_; // std::size_t bmtable_[256]; simple_array bmtable_; simple_array repseq_; }; // re_bmh #endif // !defined(SRELLDBG_NO_BMH) } // namespace regex_internal // ... "rei_bmh.hpp"] // ["rei_upos.hpp" ... namespace regex_internal { struct posdata_holder { simple_array indices; simple_array seqs; range_pairs ranges; range_pair length; void clear() { indices.clear(); seqs.clear(); ranges.clear(); length.set(1); } bool has_empty() const { return (indices.size() >= 2 && indices[0] != indices[1]) ? 
true : false; } bool has_data() const { return ranges.size() > 0 || indices.size() > 0; } bool may_contain_strings() const { return indices.size() > 0; // >= 2; } void swap(posdata_holder &right) { indices.swap(right.indices); seqs.swap(right.seqs); ranges.swap(right.ranges); length.swap(right.length); } void do_union(const posdata_holder &right) { simple_array curseq; ranges.merge(right.ranges); if (right.has_empty() && !has_empty()) register_emptystring(); for (uchar32 seqlen = 2; seqlen < static_cast(right.indices.size()); ++seqlen) { const uchar32 end = right.indices[seqlen - 1]; uchar32 begin = right.indices[seqlen]; if (begin != end) { const std::size_t complen = seqlen * sizeof (uchar32); ensure_length(seqlen); curseq.resize(seqlen); for (; begin < end;) { const uchar32 inspos = find_seq(&right.seqs[begin], seqlen, complen); if (inspos == indices[seqlen - 1]) { for (uchar32 i = 0; i < seqlen; ++i, ++begin) curseq[i] = right.seqs[begin]; seqs.insert(inspos, curseq); for (uchar32 i = 0; i < seqlen; ++i) indices[i] += seqlen; } else begin += seqlen; } } } check_lengths(); } void do_subtract(const posdata_holder &right) { const uchar32 maxlen = static_cast(indices.size() <= right.indices.size() ? 
indices.size() : right.indices.size()); { range_pairs kept; range_pairs removed; ranges.split_ranges(kept, removed, right.ranges); ranges.swap(kept); } if (right.has_empty() && has_empty()) unregister_emptystring(); for (uchar32 seqlen = 2; seqlen < maxlen; ++seqlen) { const uchar32 end = right.indices[seqlen - 1]; uchar32 begin = right.indices[seqlen]; if (begin != end) { const std::size_t complen = seqlen * sizeof (uchar32); for (; begin < end;) { const uchar32 delpos = find_seq(&right.seqs[begin], seqlen, complen); if (delpos < indices[seqlen - 1]) { seqs.erase(delpos, seqlen); for (uchar32 i = 0; i < seqlen; ++i) indices[i] -= seqlen; } else begin += seqlen; } } } check_lengths(); } void do_and(const posdata_holder &right) { const uchar32 maxlen = static_cast(indices.size() <= right.indices.size() ? indices.size() : right.indices.size()); posdata_holder newpos; simple_array curseq; { range_pairs kept; ranges.split_ranges(kept, newpos.ranges, right.ranges); ranges.swap(newpos.ranges); } if (has_empty() && right.has_empty()) newpos.register_emptystring(); else if (may_contain_strings() || right.may_contain_strings()) ensure_length(1); for (uchar32 seqlen = 2; seqlen < maxlen; ++seqlen) { const uchar32 end = right.indices[seqlen - 1]; uchar32 begin = right.indices[seqlen]; if (begin != end) { const std::size_t complen = seqlen * sizeof (uchar32); const uchar32 myend = indices[seqlen - 1]; curseq.resize(seqlen); for (; begin < end; begin += seqlen) { const uchar32 srcpos = find_seq(&right.seqs[begin], seqlen, complen); if (srcpos < myend) { newpos.ensure_length(seqlen); const uchar32 inspos = newpos.find_seq(&right.seqs[begin], seqlen, complen); if (inspos == newpos.indices[seqlen - 1]) { for (uchar32 i = 0; i < seqlen; ++i) curseq[i] = right.seqs[begin + i]; newpos.seqs.insert(inspos, curseq); for (uchar32 i = 0; i < seqlen; ++i) newpos.indices[i] += seqlen; } } } } } this->indices.swap(newpos.indices); this->seqs.swap(newpos.seqs); check_lengths(); } void 
split_seqs_and_ranges(const simple_array &inseqs, const bool icase, const bool back) { const uchar32 max = static_cast(inseqs.size()); simple_array curseq; clear(); for (uchar32 indx = 0; indx < max;) { const uchar32 elen = inseqs[indx++]; if (elen == 1) // Range. { ranges.join(range_pair_helper(inseqs[indx], inseqs[indx + 1])); indx += 2; } else if (elen == 2) { const uchar32 ucpval = inseqs[indx++]; if (ucpval != constants::ccstr_empty) ranges.join(range_pair_helper(ucpval)); else register_emptystring(); } else if (elen >= 3) { const uchar32 seqlen = elen - 1; ensure_length(seqlen); const uchar32 inspos = indices[seqlen - 1]; curseq.resize(seqlen); if (!back) { for (uchar32 j = 0; j < seqlen; ++j, ++indx) curseq[j] = inseqs[indx]; } else { for (uchar32 j = seqlen; j; ++indx) curseq[--j] = inseqs[indx]; } if (icase) { for (simple_array::size_type i = 0; i < curseq.size(); ++i) curseq[i] = unicode_case_folding::do_casefolding(curseq[i]); } const std::size_t complen = seqlen * sizeof (uchar32); for (uchar32 i = indices[seqlen];; i += seqlen) { if (i == inspos) { seqs.insert(inspos, curseq); for (uchar32 i = 0; i < seqlen; ++i) indices[i] += seqlen; break; } if (std::memcmp(&seqs[i], curseq.data(), complen) == 0) break; } } //elen == 0: Padding. 
} // if (this->is_icase()) if (icase) ranges.make_caseunfoldedcharset(); check_lengths(); } private: void register_emptystring() { if (indices.size() < 2) { indices.resize(2); indices[1] = 0; indices[0] = 1; } else if (indices[0] == indices[1]) { ++indices[0]; } length.first = 0; } void unregister_emptystring() { if (indices.size() >= 2 && indices[0] != indices[1]) indices[0] = indices[1]; } void ensure_length(const uchar32 seqlen) { uchar32 curlen = static_cast(indices.size()); if (seqlen >= curlen) { indices.resize(seqlen + 1); for (; curlen <= seqlen; ++curlen) indices[curlen] = 0; } } uchar32 find_seq(const uchar32 *const seqbegin, const uchar32 seqlen, const std::size_t complen) const { const uchar32 end = indices[seqlen - 1]; for (uchar32 begin = indices[seqlen]; begin < end; begin += seqlen) { if (std::memcmp(seqbegin, &seqs[begin], complen) == 0) return begin; } return end; } void check_lengths() { length.set(constants::max_u32value, 0); for (uchar32 i = 2; i < static_cast(indices.size()); ++i) { if (indices[i] != indices[i - 1]) { if (length.first > i) length.first = i; if (length.second < i) length.second = i; } } if (ranges.size()) { if (length.first > 1) length.first = 1; if (length.second < 1) length.second = 1; } if (has_empty()) length.first = 0; if (length.second == 0) length.first = 0; } }; // posdata_holder } // namespace regex_internal // ... "rei_upos.hpp"] // ["rei_compiler.hpp" ... 
// re_object_core: data shared by a compiled regex object - NFA_states, character_class,
// firstchar_class_bs / firstchar_class (selected by SRELLDBG_NO_BITSET), limit_counter,
// the bracket / counter / repeat counts, soflags, namedcaptures, and a re_bmh object
// owned through the raw pointer bmdata (deleted in the destructor, reset() and
// throw_error()).
// This line holds the member declarations, the constructors / destructor (the copy and
// move constructors start from bmdata == NULL and delegate to operator=) and the
// beginning of reset(flags), which clears the containers, restores limit_counter to
// lcounter_defnum_, restarts number_of_brackets at 1 and stores `flags` in soflags.
// NOTE(review): template argument lists appear to have been lost from this text
// (e.g. "template struct", "static_cast(...)") - confirm against upstream before compiling.
namespace regex_internal { template struct re_object_core { protected: typedef re_state/**/ state_type; typedef simple_array state_array; state_array NFA_states; re_character_class character_class; #if !defined(SRELLDBG_NO_1STCHRCLS) #if !defined(SRELLDBG_NO_BITSET) bitset firstchar_class_bs; #else range_pairs firstchar_class; #endif #endif #if !defined(SRELL_NO_LIMIT_COUNTER) public: std::size_t limit_counter; protected: #endif typedef typename traits::utf_traits utf_traits; uint_l32 number_of_brackets; uint_l32 number_of_counters; uint_l32 number_of_repeats; regex_constants::syntax_option_type soflags; #if !defined(SRELL_NO_NAMEDCAPTURE) groupname_mapper namedcaptures; typedef typename groupname_mapper::gname_string gname_string; #endif #if !defined(SRELLDBG_NO_BMH) re_bmh *bmdata; #endif #if !defined(SRELL_NO_LIMIT_COUNTER) private: static const std::size_t lcounter_defnum_ = 16777216; #endif protected: re_object_core() #if !defined(SRELL_NO_LIMIT_COUNTER) : limit_counter(lcounter_defnum_) #if !defined(SRELLDBG_NO_BMH) , bmdata(NULL) #endif #elif !defined(SRELLDBG_NO_BMH) : bmdata(NULL) #endif { } re_object_core(const re_object_core &right) #if !defined(SRELLDBG_NO_BMH) : bmdata(NULL) #endif { operator=(right); } #if defined(SRELL_CPP11_MOVE_ENABLED) re_object_core(re_object_core &&right) SRELL_NOEXCEPT #if !defined(SRELLDBG_NO_BMH) : bmdata(NULL) #endif { operator=(std::move(right)); } #endif #if !defined(SRELLDBG_NO_BMH) ~re_object_core() { if (bmdata) delete bmdata; } #endif void reset(const regex_constants::syntax_option_type flags) { NFA_states.clear(); character_class.clear(); #if !defined(SRELLDBG_NO_1STCHRCLS) #if !defined(SRELLDBG_NO_BITSET) firstchar_class_bs.reset(); #else firstchar_class.clear(); #endif #endif #if !defined(SRELL_NO_LIMIT_COUNTER) limit_counter = lcounter_defnum_; #endif number_of_brackets = 1; number_of_counters = 0; number_of_repeats = 0; soflags = flags; // regex_constants::ECMAScript; #if !defined(SRELL_NO_NAMEDCAPTURE) 
// Rest of reset(): clears namedcaptures and deletes bmdata.
// Copy assignment: member-wise copy. bmdata is deep-copied (assigned in place when both
// sides have one, newly allocated when only `that` has one, deleted when only `this`
// has one). The copied states still hold next_state pointers into that.NFA_states, so
// repair_nextstates() rebases them onto this->NFA_states.
// Move assignment: moves the containers, copies the scalar members, deletes our bmdata
// and takes over that.bmdata, leaving that.bmdata NULL.
// The members that follow the move assignment (on the next physical line):
//   swap(right)     - exchanges every member with `right`; scalars and the bmdata
//                     pointer go through a temporary.
//   throw_error(e)  - clears NFA_states, deletes bmdata, then throws regex_error(e);
//                     it never returns normally.
//   repair_nextstates(oldbase) - see the copy assignment above.
namedcaptures.clear(); #endif #if !defined(SRELLDBG_NO_BMH) // bmdata->clear(); if (bmdata) delete bmdata; bmdata = NULL; #endif } re_object_core &operator=(const re_object_core &that) { if (this != &that) { this->NFA_states = that.NFA_states; this->character_class = that.character_class; #if !defined(SRELLDBG_NO_1STCHRCLS) #if !defined(SRELLDBG_NO_BITSET) this->firstchar_class_bs = that.firstchar_class_bs; #else this->firstchar_class = that.firstchar_class; #endif #endif #if !defined(SRELL_NO_LIMIT_COUNTER) this->limit_counter = that.limit_counter; #endif // this->utf_traits_inst = that.utf_traits_inst; this->number_of_brackets = that.number_of_brackets; this->number_of_counters = that.number_of_counters; this->number_of_repeats = that.number_of_repeats; this->soflags = that.soflags; #if !defined(SRELL_NO_NAMEDCAPTURE) this->namedcaptures = that.namedcaptures; #endif #if !defined(SRELLDBG_NO_BMH) if (that.bmdata) { if (this->bmdata) *this->bmdata = *that.bmdata; else this->bmdata = new re_bmh(*that.bmdata); } else if (this->bmdata) { delete this->bmdata; this->bmdata = NULL; } #endif if (that.NFA_states.size()) repair_nextstates(&that.NFA_states[0]); } return *this; } #if defined(SRELL_CPP11_MOVE_ENABLED) re_object_core &operator=(re_object_core &&that) SRELL_NOEXCEPT { if (this != &that) { this->NFA_states = std::move(that.NFA_states); this->character_class = std::move(that.character_class); #if !defined(SRELLDBG_NO_1STCHRCLS) #if !defined(SRELLDBG_NO_BITSET) this->firstchar_class_bs = std::move(that.firstchar_class_bs); #else this->firstchar_class = std::move(that.firstchar_class); #endif #endif #if !defined(SRELL_NO_LIMIT_COUNTER) this->limit_counter = that.limit_counter; #endif this->number_of_brackets = that.number_of_brackets; this->number_of_counters = that.number_of_counters; this->number_of_repeats = that.number_of_repeats; this->soflags = that.soflags; #if !defined(SRELL_NO_NAMEDCAPTURE) this->namedcaptures = std::move(that.namedcaptures); #endif #if 
!defined(SRELLDBG_NO_BMH) if (this->bmdata) delete this->bmdata; this->bmdata = that.bmdata; that.bmdata = NULL; #endif } return *this; } #endif // defined(SRELL_CPP11_MOVE_ENABLED) void swap(re_object_core &right) { if (this != &right) { this->NFA_states.swap(right.NFA_states); this->character_class.swap(right.character_class); #if !defined(SRELLDBG_NO_1STCHRCLS) #if !defined(SRELLDBG_NO_BITSET) this->firstchar_class_bs.swap(right.firstchar_class_bs); #else this->firstchar_class.swap(right.firstchar_class); #endif #endif #if !defined(SRELL_NO_LIMIT_COUNTER) { const std::size_t tmp_limit_counter = this->limit_counter; this->limit_counter = right.limit_counter; right.limit_counter = tmp_limit_counter; } #endif // this->utf_traits_inst.swap(right.utf_traits_inst); { const uint_l32 tmp_numof_brackets = this->number_of_brackets; this->number_of_brackets = right.number_of_brackets; right.number_of_brackets = tmp_numof_brackets; } { const uint_l32 tmp_numof_counters = this->number_of_counters; this->number_of_counters = right.number_of_counters; right.number_of_counters = tmp_numof_counters; } { const uint_l32 tmp_numof_repeats = this->number_of_repeats; this->number_of_repeats = right.number_of_repeats; right.number_of_repeats = tmp_numof_repeats; } { const regex_constants::syntax_option_type tmp_soflags = this->soflags; this->soflags = right.soflags; right.soflags = tmp_soflags; } #if !defined(SRELL_NO_NAMEDCAPTURE) this->namedcaptures.swap(right.namedcaptures); #endif #if !defined(SRELLDBG_NO_BMH) { re_bmh *const tmp_bmdata = this->bmdata; this->bmdata = right.bmdata; right.bmdata = tmp_bmdata; } #endif } } void throw_error(const regex_constants::error_type &e) { // reset(); NFA_states.clear(); #if !defined(SRELLDBG_NO_BMH) if (bmdata) delete bmdata; bmdata = NULL; #endif throw regex_error(e); } private: void repair_nextstates(const state_type *const oldbase) { state_type *const newbase = &this->NFA_states[0]; for (typename state_array::size_type i = 0; i < 
// Rest of repair_nextstates(): every non-null next_state1 / next_state2 is shifted by
// (newbase - oldbase).
// re_compiler: turns a pattern into NFA states stored in the re_object_core base.
//   compile(begin, end, flags) - decodes the input with utf_traits::codepoint_inc into a
//       uchar32 buffer (error_utf8 when a value exceeds unicode_max_codepoint) and hands
//       the buffer to compile_core().
//   is_icase() / is_multiline() / is_dotall() / is_vmode() / is_optimize() - test the
//       corresponding bit of soflags (is_icase and is_vmode return false when the
//       feature is compiled out).
//   is_ricase() - reads NFA_states[0].icase; the size check is commented out, so
//       NFA_states must be non-empty when this is called.
this->NFA_states.size(); ++i) { state_type &state = this->NFA_states[i]; if (state.next_state1) state.next_state1 = state.next_state1 - oldbase + newbase; if (state.next_state2) state.next_state2 = state.next_state2 - oldbase + newbase; } } }; // re_object_core template class re_compiler : public re_object_core { protected: template bool compile(ForwardIterator begin, const ForwardIterator end, const regex_constants::syntax_option_type flags /* = regex_constants::ECMAScript */) { simple_array u32; while (begin != end) { const uchar32 u32c = utf_traits::codepoint_inc(begin, end); if (u32c > constants::unicode_max_codepoint) this->throw_error(regex_constants::error_utf8); u32.push_backncr(u32c); } return compile_core(u32.data(), u32.data() + u32.size(), flags); } bool is_icase() const { #if !defined(SRELL_NO_ICASE) if (this->soflags & regex_constants::icase) return true; #endif return false; } bool is_ricase() const { #if !defined(SRELL_NO_ICASE) return /* this->NFA_states.size() && */ this->NFA_states[0].icase == true; #else return false; #endif } bool is_multiline() const { if (this->soflags & regex_constants::multiline) return true; return false; } bool is_dotall() const { return (this->soflags & regex_constants::dotall) ? true : false; } bool is_vmode() const { #if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY) return (this->soflags & regex_constants::unicodesets) ? true : false; #else return false; #endif } bool is_optimize() const { return (this->soflags & regex_constants::optimize) ? 
// compile_core(): resets the object, pushes an initial st_epsilon state, parses the whole
// pattern through make_nfa_states(), rejects leftover input (error_paren) and failed
// check_backreferences() (error_backref), stores the result of
// check_if_really_needs_icase_search() in NFA_states[0].icase when icase is set, calls
// setup_bmhdata(), appends the st_success state, then runs optimise() and
// relativejump_to_absolutejump().
// make_nfa_states() (begins on this line): parses '|'-separated branches into `piece`.
true : false; } private: typedef re_object_core base_type; typedef typename base_type::utf_traits utf_traits; typedef typename base_type::state_type state_type; typedef typename base_type::state_array state_array; #if !defined(SRELL_NO_NAMEDCAPTURE) typedef typename base_type::gname_string gname_string; #endif #if !defined(SRELL_NO_UNICODE_PROPERTY) typedef typename re_character_class::pstring pstring; #endif typedef typename state_array::size_type state_size_type; bool compile_core(const uchar32 *begin, const uchar32 *const end, const regex_constants::syntax_option_type flags) { re_quantifier piecesize; re_compiler_state cstate; state_type atom; this->reset(flags); // this->soflags = flags; cstate.reset(flags); atom.reset(); atom.type = st_epsilon; atom.next2 = 1; this->NFA_states.push_back(atom); if (!make_nfa_states(this->NFA_states, piecesize, begin, end, cstate)) { return false; } if (begin != end) this->throw_error(regex_constants::error_paren); // ')'s are too many. if (!check_backreferences(cstate)) this->throw_error(regex_constants::error_backref); #if !defined(SRELL_NO_ICASE) if (this->is_icase()) this->NFA_states[0].icase = check_if_really_needs_icase_search(); #endif #if !defined(SRELLDBG_NO_BMH) setup_bmhdata(); #endif atom.type = st_success; atom.next1 = 0; atom.next2 = 0; this->NFA_states.push_back(atom); optimise(); relativejump_to_absolutejump(); return true; } bool make_nfa_states(state_array &piece, re_quantifier &piecesize, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state &cstate) { typename state_array::size_type prevbranch_end = 0; state_type atom; state_array branch; re_quantifier branchsize; piecesize.reset(0); for (;;) { branch.clear(); if (!make_branch(branch, branchsize, curpos, end, cstate)) return false; // For piecesize.atleast, 0 as the initial value and 0 as an // actual value must be distinguished. 
// Rest of make_nfa_states(): piecesize ends up with the smallest `atleast` and largest
// `atmost` over the branches. A branch followed by '|' is prefixed with an st_epsilon
// whose next2 skips past it; the st_epsilon appended after a branch has its next1
// patched on the next iteration to jump over the following branch. The loop stops at
// end of input or ')'.
// make_branch(): concatenates quantified atoms until end of input, '|' or ')'. A piece
// consisting of exactly two non-capturing-group states ("(?:)") contributes no states.
if (piecesize.atmost == 0 || piecesize.atleast > branchsize.atleast) piecesize.atleast = branchsize.atleast; if (piecesize.atmost < branchsize.atmost) piecesize.atmost = branchsize.atmost; if (curpos != end && *curpos == meta_char::mc_bar) { atom.reset(); atom.character = meta_char::mc_bar; atom.type = st_epsilon; atom.next2 = static_cast(branch.size()) + 2; branch.insert(0, atom); } if (prevbranch_end) piece[prevbranch_end].next1 = static_cast(branch.size()) + 1; piece += branch; // end or ')' if (curpos == end || *curpos == meta_char::mc_rbracl) break; // *curpos == '|' prevbranch_end = piece.size(); atom.reset(); atom.type = st_epsilon; piece.push_back(atom); ++curpos; } return true; } bool make_branch(state_array &branch, re_quantifier &branchsize, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state &cstate) { state_array piece; state_array piece_with_quantifier; re_quantifier quantifier; branchsize.reset(0); for (;;) { re_quantifier piecesize; if (curpos == end) return true; piece.clear(); piece_with_quantifier.clear(); switch (*curpos) { // case char_ctrl::cc_nul: // '\0': case meta_char::mc_bar: // '|': case meta_char::mc_rbracl: // ')': return true; default: if (!get_atom(piece, piecesize, curpos, end, cstate)) return false; } if (piece.size()) { const state_type &firstatom = piece[0]; quantifier.reset(); // quantifier.atleast = quantifier.atmost = 1; if (firstatom.has_quantifier()) { if (curpos != end && !get_quantifier(quantifier, curpos, end)) return false; } if (piece.size() == 2 && firstatom.is_noncapturinggroup() && piece[1].is_noncapturinggroup()) { // (?:) alone or followed by a quantifier. // piece_with_quantifier += piece; ; // Do nothing. 
// Rest of make_branch(): piecesize is multiplied by the quantifier and added to
// branchsize. Unless SRELL_FIXEDWIDTHLOOKBEHIND is defined, the quantified piece is
// inserted at the front of the branch instead of appended while cstate.back is set.
// get_atom(): consumes one pattern character and dispatches on it: '(' and (in v-mode)
// '[' return through their own parsers, '[' otherwise goes to register_character_class(),
// '\\' to translate_atom_escape(); a bare '*', '+', '?' or '{' raises error_badrepeat.
} else combine_piece_with_quantifier(piece_with_quantifier, piece, quantifier, piecesize); #if 01 piecesize.multiply(quantifier); branchsize.add(piecesize); #else branchsize.atleast += piecesize.atleast * quantifier.atleast; if (!branchsize.is_infinity()) { if (piecesize.is_infinity() || quantifier.is_infinity()) branchsize.set_infinity(); else branchsize.atmost += piecesize.atmost * quantifier.atmost; } #endif #if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) if (!cstate.back) branch += piece_with_quantifier; else branch.insert(0, piece_with_quantifier); #else branch += piece_with_quantifier; #endif } } } bool get_atom(state_array &piece, re_quantifier &piecesize, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state &cstate) { state_type atom; atom.reset(); atom.character = *curpos++; switch (atom.character) { case meta_char::mc_rbraop: // '(': return get_piece_in_roundbrackets(piece, piecesize, curpos, end, cstate); case meta_char::mc_sbraop: // '[': #if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY) if (this->is_vmode()) // vmode. 
// Rest of get_atom(): '.' becomes the dotall class, or a newly registered negation of
// the newline class; '^' / '$' become st_bol / st_eol with a zero quantifier and carry
// the multiline flag; an st_character atom is case-folded under icase. The atom is
// pushed onto `piece` and its quantifier is returned through piecesize.
// get_piece_in_roundbrackets() (begins on this line): parses what follows '('; the
// compiler flags are saved on entry, and end of input raises error_paren.
return parse_charclass_v(piece, piecesize, curpos, end, cstate); #endif if (!register_character_class(atom, curpos, end, cstate)) return false; break; case meta_char::mc_escape: // '\\': if (!translate_atom_escape(atom, piece, piecesize, curpos, end, cstate)) return false; if (piece.size()) return true; break; case meta_char::mc_period: // '.': atom.type = st_character_class; #if !defined(SRELL_NO_SINGLELINE) if (this->is_dotall()) { atom.number = static_cast(re_character_class::dotall); } else #endif { // atom.number = static_cast(re_character_class::newline); range_pairs nlclass = this->character_class[static_cast(re_character_class::newline)]; nlclass.negation(); atom.number = this->character_class.register_newclass(nlclass); } break; case meta_char::mc_caret: // '^': atom.type = st_bol; atom.quantifier.reset(0); // if (current_flags.m) if (is_multiline()) atom.multiline = true; break; case meta_char::mc_dollar: // '$': atom.type = st_eol; atom.quantifier.reset(0); // if (current_flags.m) if (is_multiline()) atom.multiline = true; break; case meta_char::mc_astrsk: // '*': case meta_char::mc_plus: // '+': case meta_char::mc_query: // '?': case meta_char::mc_cbraop: // '{' this->throw_error(regex_constants::error_badrepeat); default:; } if (atom.type == st_character) { if (this->is_icase()) atom.character = unicode_case_folding::do_casefolding(atom.character); } piece.push_back(atom); piecesize = atom.quantifier; return true; } // '('. bool get_piece_in_roundbrackets(state_array &piece, re_quantifier &piecesize, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state &cstate) { const re_flags originalflags(cstate); state_type atom; if (curpos == end) this->throw_error(regex_constants::error_paren); atom.reset(); atom.type = st_roundbracket_open; if (*curpos == meta_char::mc_query) // '?' 
// Rest of get_piece_in_roundbrackets(): "(?" is delegated to extended_roundbrackets(); a
// group still typed st_roundbracket_open gets its open / pop states from
// push_bracket_open(). The body is parsed by make_nfa_states(), a missing ')' raises
// error_paren, and the saved flags are restored. Closing depends on atom.type:
//   st_epsilon         - if the piece holds exactly two states the opening one is erased
//                        and nothing is appended; otherwise piece[0].quantifier.atmost is
//                        set to number_of_brackets - 1 and the atom is appended;
//   st_lookaround_open - piecesize is reset to 0, the first state's next1 is pointed
//                        past the piece and an st_lookaround_close is appended;
//   otherwise          - set_bracket_close() prepares the closing state.
// extended_roundbrackets() (begins on this line): parses what follows "(?". After '<',
// a character other than '=' or '!' leads to parse_groupname().
{ if (!extended_roundbrackets(piece, atom, ++curpos, end, cstate)) return false; } if (atom.type == st_roundbracket_open) { push_bracket_open(piece, atom); } // if (curpos == end) // this->throw_error(regex_constants::error_paren); if (!make_nfa_states(piece, piecesize, curpos, end, cstate)) return false; // end or ')'? if (curpos == end) this->throw_error(regex_constants::error_paren); ++curpos; cstate.restore_from(originalflags); switch (atom.type) { case st_epsilon: // if (piece.size() <= 2) // ':' or ':' + one. if (piece.size() == 2) // ':' + something. { piece.erase(0); return true; } piece[0].quantifier.atmost = this->number_of_brackets - 1; break; // case st_lookaround_pop: case st_lookaround_open: { state_type &firstatom = piece[0]; #if defined(SRELL_FIXEDWIDTHLOOKBEHIND) // if (firstatom.reverse) if (firstatom.quantifier.atleast) // > 0 means lookbehind. { if (!piecesize.is_same() || piecesize.is_infinity()) this->throw_error(regex_constants::error_lookbehind); firstatom.quantifier = piecesize; } #endif #if defined(SRELL_ENABLE_GT) if (firstatom.character != meta_char::mc_gt) #endif piecesize.reset(0); firstatom.next1 = static_cast(piece.size()) + 1; atom.type = st_lookaround_close; atom.next1 = 0; atom.next2 = 0; } break; default: set_bracket_close(piece, atom, piecesize, cstate); } piece.push_back(atom); return true; } bool extended_roundbrackets(state_array &piece, state_type &atom, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state &cstate) { #if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) bool lookbehind = false; #endif if (curpos == end) this->throw_error(regex_constants::error_paren); atom.character = *curpos; if (atom.character == meta_char::mc_lt) // '<' { #if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) lookbehind = true; #endif if (++curpos == end) this->throw_error(regex_constants::error_paren); atom.character = *curpos; if (atom.character != meta_char::mc_eq && atom.character != meta_char::mc_exclam) { #if !defined(SRELL_NO_NAMEDCAPTURE) 
// Rest of extended_roundbrackets(): a group name goes to parse_groupname() (error_paren
// when named captures are compiled out). ':' makes the atom st_epsilon with
// quantifier.atleast = number_of_brackets; '=' / '!' make it st_lookaround_open (is_not
// set for '!', and cstate.back set from `lookbehind` unless SRELL_FIXEDWIDTHLOOKBEHIND);
// anything else raises error_paren. The atom is pushed onto `piece`.
// push_bracket_open(): pushes the opening state numbered number_of_brackets, increments
// the count, then pushes an st_roundbracket_pop state.
// set_bracket_close(): turns `atom` into st_roundbracket_close, writes
// atleast = atom.number + 1 and atmost = number_of_brackets - 1 into the quantifiers of
// piece[0] and piece[1], and records piecesize.atleast in
// cstate.atleast_widths_of_brackets[atom.number - 1] (growing the array if needed).
return parse_groupname(curpos, end, cstate); #else this->throw_error(regex_constants::error_paren); #endif // !defined(SRELL_NO_NAMEDCAPTURE) } } else atom.quantifier.atleast = 0; // Sets atleast to 0 for other assertions than lookbehinds. The automaton // checks atleast to know whether lookbehinds or other assertions. switch (atom.character) { case meta_char::mc_colon: atom.type = st_epsilon; atom.quantifier.atleast = this->number_of_brackets; break; case meta_char::mc_exclam: // '!': atom.is_not = true; //@fallthrough@ case meta_char::mc_eq: // '=': #if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) cstate.back = lookbehind; #else // atom.reverse = lookbehind; #endif #if defined(SRELL_ENABLE_GT) case meta_char::mc_gt: #endif atom.type = st_lookaround_open; atom.next2 = 1; break; default: this->throw_error(regex_constants::error_paren); } ++curpos; piece.push_back(atom); return true; } void push_bracket_open(state_array &piece, state_type &atom) { atom.number = this->number_of_brackets; atom.next1 = 2; atom.next2 = 1; piece.push_back(atom); ++this->number_of_brackets; atom.type = st_roundbracket_pop; atom.next1 = 0; atom.next2 = 0; piece.push_back(atom); } void set_bracket_close(state_array &piece, state_type &atom, const re_quantifier &piecesize, re_compiler_state &cstate) { // uint_l32 max_bracketno = atom.number; atom.type = st_roundbracket_close; atom.next1 = 1; atom.next2 = 1; #if 0 for (typename state_array::size_type i = 0; i < piece.size(); ++i) { const state_type &state = piece[i]; if (state.type == st_roundbracket_open && max_bracketno < state.number) max_bracketno = state.number; } #endif re_quantifier &rb_open = piece[0].quantifier; re_quantifier &rb_pop = piece[1].quantifier; rb_open.atleast = rb_pop.atleast = atom.number + 1; rb_open.atmost = rb_pop.atmost = this->number_of_brackets - 1; // max_bracketno; if (cstate.atleast_widths_of_brackets.size() < atom.number) cstate.atleast_widths_of_brackets.resize(atom.number, 0); 
// Last statement of set_bracket_close(), then combine_piece_with_quantifier(): wraps
// `piece` in the states needed to apply `quantifier` and appends the result to
// piece_with_quantifier (note that `piece` itself is modified in the process).
//   atmost == 0 -> nothing is emitted;
//   atmost == 1 -> the piece itself, preceded by an st_epsilon branch when atleast == 0
//                  (next1 / next2 are swapped for a non-greedy quantifier);
//   atmost >= 2 -> one of the forms drawn in the inline diagrams below.
cstate.atleast_widths_of_brackets[atom.number - 1] = piecesize.atleast; } void combine_piece_with_quantifier(state_array &piece_with_quantifier, state_array &piece, const re_quantifier &quantifier, const re_quantifier &piecesize) { state_type &firstatom = piece[0]; // const bool firstpiece_is_roundbracket_open = (firstatom.type == st_roundbracket_open); const bool piece_has_0widthchecker = firstatom.has_0widthchecker(); const bool piece_is_noncapturinggroup_contaning_capturinggroup = firstatom.is_noncapturinggroup() && firstatom.quantifier.is_valid(); state_type atom; if (quantifier.atmost == 0) return; atom.reset(); atom.quantifier = quantifier; if (firstatom.is_character_or_class()) atom.character = meta_char::mc_astrsk; // For nextpos_optimisation1_3(). if (quantifier.atmost == 1) { if (quantifier.atleast == 0) { atom.type = st_epsilon; atom.next2 = static_cast(piece.size()) + 1; if (!quantifier.is_greedy) { atom.next1 = atom.next2; atom.next2 = 1; } if (atom.character == meta_char::mc_astrsk) firstatom.quantifier = quantifier; piece_with_quantifier.push_back(atom); // (push) } if (piece.size() >= 2 && firstatom.type == st_roundbracket_open && piece[1].type == st_roundbracket_pop) { firstatom.quantifier.atmost = 0u; piece[1].quantifier.atmost = 0u; } piece_with_quantifier += piece; return; } // atmost >= 2 #if !defined(SRELLDBG_NO_SIMPLEEQUIV) // The counter requires at least 6 states: save, restore, check, inc, dec, atom(s). // A character or charclass quantified by one of these has a simple equivalent representation: // a{0,2} 1.epsilon(2|5), 2.CHorCL(3), 3.epsilon(4|5), 4.CHorCL(5), [5]. // a{0,3} 1.epsilon(2|7), 2.CHorCL(3), 3.epsilon(4|7), 4.CHorCL(5), 5.epsilon(6|7), 6.CHorCL(7), [7]. // a{1,2} 1.CHorCL(2), 2.epsilon(3|4), 3.CHorCL(4), [4]. // a{1,3} 1.CHorCL(2), 2.epsilon(3|6), 3.CHorCL(4), 4.epsilon(5|6), 5.CHorCL(6), [6]. // a{2,3} 1.CHorCL(2), 2.CHorCL(3), 3.epsilon(4|5), 4.CHorCL(5), [5]. 
// combine_piece_with_quantifier(), continued: the unrolled form for a single character /
// class whose quantifier has_simple_equivalence() (atleast copies of the piece, then
// one epsilon + piece pair per optional repetition), followed by the {0,} and {1,} forms.
// a{2,4} 1.CHorCL(2), 2.CHorCL(3), 3.epsilon(4|7), 4.CHorCL(5), 5.epsilon(6|7), 6.CHorCL(7), [7]. if (piece.size() == 1 && firstatom.is_character_or_class() && quantifier.has_simple_equivalence()) { const typename state_array::size_type branchsize = piece.size() + 1; for (uint_l32 i = 0; i < quantifier.atleast; ++i) piece_with_quantifier += piece; if (atom.character == meta_char::mc_astrsk) firstatom.quantifier.set(0, 1, quantifier.is_greedy); atom.type = st_epsilon; atom.next2 = (quantifier.atmost - quantifier.atleast) * branchsize; if (!quantifier.is_greedy) { atom.next1 = atom.next2; atom.next2 = 1; } for (uint_l32 i = quantifier.atleast; i < quantifier.atmost; ++i) { piece_with_quantifier.push_back(atom); piece_with_quantifier += piece; quantifier.is_greedy ? (atom.next2 -= branchsize) : (atom.next1 -= branchsize); } return; } #endif // !defined(SRELLDBG_NO_SIMPLEEQUIV) atom.type = st_epsilon; if (quantifier.is_asterisk()) // {0,} { // greedy: 1.epsilon(2|4), 2.piece, 3.LAorC0WR(1|0), 4.OutOfLoop. // !greedy: 1.epsilon(4|2), 2.piece, 3.LAorC0WR(1|0), 4.OutOfLoop. // LAorC0WR: LastAtomOfPiece or Check0WidthRepeat. // atom.type points to 1. } else if (quantifier.is_plus()) // {1,} { #if !defined(SRELLDBG_NO_ASTERISK_OPT) if (piece.size() == 1 && firstatom.is_character_or_class()) { piece_with_quantifier += piece; --atom.quantifier.atleast; // /.+/ -> /..*/. } else #endif { atom.next1 = 2; atom.next2 = 0; piece_with_quantifier.push_back(atom); // greedy: 1.epsilon(3), 2.epsilon(3|5), 3.piece, 4.LAorC0WR(2|0), 5.OutOfLoop. // !greedy: 1.epsilon(3), 2.epsilon(5|3), 3.piece, 4.LAorC0WR(2|0), 5.OutOfLoop. // atom.type points to 2. 
// combine_piece_with_quantifier(), continued: the counter-based form for the remaining
// quantifiers (save/restore counter states are emitted, and epsilon + decrement states
// are inserted at the front of `piece`), then the loop wiring. When
// piece_is_noncapturinggroup_contaning_capturinggroup is false and the piece has a
// non-zero minimum width or a zero-width checker, the last state of the piece jumps
// straight back to the loop head; otherwise the repeat_in_push / repeat_in_pop /
// check_0_width_repeat form is built.
} } else { atom.number = this->number_of_counters; ++this->number_of_counters; atom.type = st_save_and_reset_counter; atom.next1 = 2; atom.next2 = 1; piece_with_quantifier.push_back(atom); atom.type = st_restore_counter; atom.next1 = 0; atom.next2 = 0; piece_with_quantifier.push_back(atom); // 1.save_and_reset_counter(3|2), 2.restore_counter(0|0), atom.next1 = 0; atom.next2 = 0; atom.type = st_decrement_counter; piece.insert(0, atom); atom.next1 = 2; // atom.next2 = piece[1].is_character_or_class() ? 0 : 1; // atom.next2 = 0; for (state_size_type i = 1; i < piece.size(); ++i) { const state_type &state = piece[i]; if (state.is_character_or_class() || (state.type == st_epsilon && state.next2 == 0)) ; else { atom.next2 = 1; break; } } atom.type = st_epsilon; // st_increment_counter; piece.insert(0, atom); piece[0].number = 0; atom.type = st_check_counter; // greedy: 3.check_counter(4|6), 4.piece, 5.LAorC0WR(3|0), 6.OutOfLoop. // !greedy: 3.check_counter(6|4), 4.piece, 5.LAorC0WR(3|0), 6.OutOfLoop. // 4.piece = { 4a.increment_counter(4c|4b), 4b.decrement_counter(0|0), 4c.OriginalPiece }. } // atom.type is epsilon or check_counter. // Its "next"s point to piece and OutOfLoop. if (!piece_is_noncapturinggroup_contaning_capturinggroup && (piecesize.atleast || piece_has_0widthchecker)) { const typename state_array::size_type piece_size = piece.size(); state_type &lastatom = piece[piece_size - 1]; lastatom.next1 = 0 - static_cast(piece_size); // Points to the one immediately before piece, which will be pushed last in this block. // atom.type has already been set. epsilon or check_counter. atom.next1 = 1; atom.next2 = static_cast(piece_size) + 1; if (!quantifier.is_greedy) { atom.next1 = atom.next2; atom.next2 = 1; } piece_with_quantifier.push_back(atom); } else { // atom.type has already been set. epsilon or check_counter. atom.next1 = 1; atom.next2 = static_cast(piece.size()) + 4; // To OutOfLoop. 
// End of combine_piece_with_quantifier(): repeat_in_push / repeat_in_pop states are
// inserted at org1stpos (2 when the loop head is a check_counter, else 0) and a
// check_0_width_repeat state that jumps back to the loop head is appended; finally
// `piece` is appended to piece_with_quantifier.
// parse_groupname(): reads a name via get_groupname() and registers it in namedcaptures
// for the current number_of_brackets; a rejected registration raises error_backref.
// The reason for +3 than above is that push, pop, and check_0_width are added below. if (!quantifier.is_greedy) { atom.next1 = atom.next2; atom.next2 = 1; } piece_with_quantifier.push_back(atom); // *1 atom.number = this->number_of_repeats; ++this->number_of_repeats; const state_size_type org1stpos = (atom.type == st_check_counter) ? 2 : 0; if (piece_is_noncapturinggroup_contaning_capturinggroup) atom.quantifier = piece[org1stpos].quantifier; else atom.quantifier.set(1, 0); atom.type = st_repeat_in_pop; atom.next1 = 0; atom.next2 = 0; piece.insert(org1stpos, atom); atom.type = st_repeat_in_push; atom.next1 = 2; atom.next2 = 1; piece.insert(org1stpos, atom); atom.type = st_check_0_width_repeat; atom.next1 = 0 - static_cast(piece.size()) - 1; // Points to *1. atom.next2 = 1; piece.push_back(atom); // greedy: 1.epsilon(2|6), // !greedy: 1.epsilon(6|2), // 2.repeat_in_push(4|3), 3.repeat_in_pop(0|0), 4.piece, // 5.check_0_width_repeat(1|6), 6.OutOfLoop. // or // greedy: 1.check_counter(2|8), // !greedy: 1.check_counter(8|2), // 2.increment_counter(4|3), 3.decrement_counter(0|0) // 4.repeat_in_push(6|5), 5.repeat_in_pop(0|0), 6.piece, // 7.check_0_width_repeat(1|8), 8.OutOfLoop. } piece_with_quantifier += piece; } #if !defined(SRELL_NO_NAMEDCAPTURE) bool parse_groupname(const uchar32 *&curpos, const uchar32 *const end, re_compiler_state &cstate) { const gname_string groupname = get_groupname(curpos, end, cstate); if (!this->namedcaptures.push_back(groupname, this->number_of_brackets)) this->throw_error(regex_constants::error_backref); return true; } #endif // !defined(SRELL_NO_NAMEDCAPTURE) // '['. 
bool register_character_class(state_type &atom, const uchar32 *&curpos, const uchar32 *const end, const re_compiler_state & /* cstate */) { range_pairs ranges; range_pair code_range; state_type classatom; range_pairs curranges; if (curpos == end) this->throw_error(regex_constants::error_brack); atom.type = st_character_class; if (*curpos == meta_char::mc_caret) // '^' { atom.is_not = true; ++curpos; } for (;;) { if (curpos == end) this->throw_error(regex_constants::error_brack); if (*curpos == meta_char::mc_sbracl) // ']' break; classatom.reset(); if (!get_character_in_class(curranges, classatom, curpos, end)) return false; if (classatom.type == st_character_class) { ranges.merge(curranges); continue; } code_range.first = code_range.second = classatom.character; if (curpos == end) this->throw_error(regex_constants::error_brack); if (*curpos == meta_char::mc_minus) // '-' { ++curpos; if (curpos == end) this->throw_error(regex_constants::error_brack); if (*curpos == meta_char::mc_sbracl) { PUSH_SEPARATELY: ranges.join(code_range); code_range.first = code_range.second = meta_char::mc_minus; } else { if (!get_character_in_class(curranges, classatom, curpos, end)) return false; if (classatom.type == st_character_class) { ranges.merge(curranges); goto PUSH_SEPARATELY; } code_range.second = classatom.character; if (!code_range.is_range_valid()) this->throw_error(regex_constants::error_range); } } ranges.join(code_range); } // *curpos == ']' ++curpos; if (this->is_icase()) ranges.make_caseunfoldedcharset(); if (atom.is_not) { ranges.negation(); atom.is_not = false; } // atom.character = this->is_icase() ? 
ranges.template consists_of_one_character() : ranges.template consists_of_one_character(); atom.character = ranges.consists_of_one_character(this->is_icase()); if (atom.character != constants::invalid_u32value) { atom.type = st_character; return true; } atom.number = this->character_class.register_newclass(ranges); return true; } bool get_character_in_class(range_pairs &rp, state_type &atom, const uchar32 *&curpos, const uchar32 *const end /* , const re_compiler_state &cstate */) { atom.character = *curpos++; if (atom.character != meta_char::mc_escape) // '\\' return true; rp.clear(); return translate_escseq(&rp, atom, curpos, end, true); } void add_predefclass_to_charclass(range_pairs &cls, const state_type &classatom) { range_pairs predefclass = this->character_class[classatom.number]; if (classatom.is_not) predefclass.negation(); cls.merge(predefclass); } #if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY) bool parse_charclass_v(state_array &piece, re_quantifier &piecesize, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state &cstate) { posdata_holder pos; parse_unicharset(pos, curpos, end, cstate); if (!pos.may_contain_strings()) { state_type atom; atom.reset(); atom.character = pos.ranges.consists_of_one_character(this->is_icase()); if (atom.character == constants::invalid_u32value) { atom.type = st_character_class; atom.number = this->character_class.register_newclass(pos.ranges); } piece.push_back(atom); piecesize = atom.quantifier; } else { transform_seqdata(piece, pos); piecesize.atleast = static_cast(pos.length.first); piecesize.atmost = static_cast(pos.length.second); } return true; } void parse_unicharset(posdata_holder &basepos, const uchar32 *&curpos, const uchar32 *const end, const re_compiler_state &cstate) { enum operation_type { op_init, op_firstcc, op_union, op_intersection, op_subtraction }; operation_type otype = op_init; posdata_holder newpos; range_pair code_range; state_type ccatom; bool invert; if (curpos == 
end) goto ERROR_NOT_CLOSED; if (*curpos == meta_char::mc_caret) // '^' { invert = true; ++curpos; } else invert = false; // ClassSetCharacter :: // \ CharacterEscape[+UnicodeMode] // \ ClassSetReservedPunctuator // \ b for (;;) { if (curpos == end) goto ERROR_NOT_CLOSED; if (*curpos == meta_char::mc_sbracl) // ']' break; const uchar32 next2chars = check_doublepunctuators(curpos, end); switch (otype) { case op_intersection: if (next2chars != char_other::co_amp) goto ERROR_DOUBLE_PUNCT; curpos += 2; break; case op_subtraction: if (next2chars != meta_char::mc_minus) goto ERROR_DOUBLE_PUNCT; curpos += 2; break; case op_firstcc: if (next2chars == char_other::co_amp) otype = op_intersection; else if (next2chars == meta_char::mc_minus) otype = op_subtraction; else if (next2chars == constants::invalid_u32value) break; else goto ERROR_DOUBLE_PUNCT; curpos += 2; break; // case op_union: // case op_init: default: if (next2chars != constants::invalid_u32value) goto ERROR_DOUBLE_PUNCT; } AFTER_OPERATOR: if (curpos == end) goto ERROR_NOT_CLOSED; ccatom.reset(); if (*curpos == meta_char::mc_sbraop) // '[' { ++curpos; parse_unicharset(newpos, curpos, end, cstate); } else get_character_in_class_vmode(newpos, ccatom, curpos, end, cstate, false); if (otype == op_init) otype = op_firstcc; else if (otype == op_firstcc) otype = op_union; if (curpos == end) goto ERROR_NOT_CLOSED; if (ccatom.type == st_character_class) { } else if (ccatom.type == st_character) { if (!newpos.has_data()) { code_range.set(ccatom.character); if (otype <= op_union) { if (*curpos == meta_char::mc_minus) // '-' { ++curpos; if (curpos == end) goto ERROR_BROKEN_RANGE; if (otype < op_union && *curpos == meta_char::mc_minus) // '-' { otype = op_subtraction; ++curpos; basepos.ranges.join(code_range); goto AFTER_OPERATOR; } get_character_in_class_vmode(newpos, ccatom, curpos, end, cstate, true); otype = op_union; code_range.second = ccatom.character; if (!code_range.is_range_valid()) goto ERROR_BROKEN_RANGE; } } 
// Continuation of parse_unicharset(); the beginning of that function lies above
// this span.
// NOTE(review): this region had been flattened onto very long lines; it is laid out
// one statement per line here with every code token left unchanged.
                    newpos.ranges.join(code_range);
                    if (this->is_icase())
                        newpos.ranges.make_caseunfoldedcharset();
                }
            }

            // Combine the operand just read (newpos) with the accumulated result
            // (basepos) according to the operator type currently held in otype.
            switch (otype)
            {
            case op_union:
                basepos.do_union(newpos);
                break;

            case op_intersection:
                basepos.do_and(newpos);
                break;

            case op_subtraction:
                basepos.do_subtract(newpos);
                break;

            default:
            // case op_firstcc:
                // Reached for op_firstcc: the two holders exchange contents instead
                // of being combined.
                basepos.swap(newpos);
            }
        }

        // *curpos == ']'
        ++curpos;
        if (this->is_icase())
            basepos.ranges.make_caseunfoldedcharset();

        if (invert)
        {
            // A set for which may_contain_strings() is true is rejected with
            // error_complement instead of being negated.
            if (basepos.may_contain_strings())
                goto ERROR_NOT_INVERTIBLE;

            basepos.ranges.negation();
        }

        return;

        // NOTE(review): each label below runs straight into the next one, so this
        // relies on throw_error() never returning - confirm against its definition.
        ERROR_NOT_CLOSED:
        this->throw_error(regex_constants::error_brack);

        ERROR_BROKEN_RANGE:
        this->throw_error(regex_constants::error_range);

        ERROR_NOT_INVERTIBLE:
        this->throw_error(regex_constants::error_complement);

        ERROR_DOUBLE_PUNCT:
        this->throw_error(regex_constants::error_operator);
    }

    // Tests whether the two characters starting at curpos form a doubled punctuator.
    //
    // curpos is taken by value, so the caller's position is not moved.
    // Precondition: curpos != end (the first character is read unconditionally).
    //
    // Returns the repeated character when both characters are equal and the
    // character is one of the case labels below (the ClassSetReservedDoublePunctuator
    // characters, plus '-'); returns constants::invalid_u32value otherwise.
    uchar32 check_doublepunctuators(const uchar32 *curpos, const uchar32 *const end) const
    {
        const uchar32 firstchar = *curpos++;

        // Either there is no second character or it differs from the first one.
        if (curpos == end || *curpos != firstchar)
            return constants::invalid_u32value;

        switch (firstchar)
        {
        // ClassSetReservedDoublePunctuator :: one of
        //   && !! ## $$ %% ** ++ ,, .. :: ;; << == >> ?? @@ ^^ `` ~~
        case char_other::co_amp: // '&'
        case meta_char::mc_exclam: // '!'
        case meta_char::mc_sharp: // '#'
        case meta_char::mc_dollar: // '$'
        case char_other::co_perc: // '%'
        case meta_char::mc_astrsk: // '*'
        case meta_char::mc_plus: // '+'
        case meta_char::mc_comma: // ','
        case meta_char::mc_period: // '.'
        case meta_char::mc_colon: // ':'
        case char_other::co_smcln: // ';'
        case meta_char::mc_lt: // '<'
        case meta_char::mc_eq: // '='
        case meta_char::mc_gt: // '>'
        case meta_char::mc_query: // '?'
        case char_other::co_atmrk: // '@'
        case meta_char::mc_caret: // '^'
        case char_other::co_grav: // '`'
        case char_other::co_tilde: // '~'
        // "--" is not in the list above; parse_unicharset() compares the result
        // against meta_char::mc_minus to recognise the subtraction operator.
        case meta_char::mc_minus: // '-'
            return firstchar;

        default:
            return constants::invalid_u32value;
        }
    }

    // Reads one character (possibly escaped) of a v-mode character class.
    //
    //   pos      - cleared on entry; filled by the \p / \P / \q handlers or, for other
    //              escapes, its ranges member is passed to translate_escseq_nocheck().
    //   ccatom   - receives the character in ccatom.character; the \p / \P handler
    //              may also set its type.
    //   curpos   - advanced past what was consumed.
    //              Precondition: curpos != end (read unconditionally on entry).
    //   no_ccesc - when true, \p, \P and \q get no special treatment here and the
    //              flag is forwarded to translate_escseq_nocheck().
    //
    // Returns false for an ordinary or escaped single character; for \p, \P and \q
    // returns whatever parse_escape_p_vmode() / parse_escape_q_vmode() return.
    // Throws error_noescape for an unescaped ClassSetSyntaxCharacter and error_escape
    // when the pattern ends right after a backslash.
    bool get_character_in_class_vmode(
        posdata_holder &pos,
        state_type &ccatom,
        const uchar32 *&curpos,
        const uchar32 *const end,
        const re_compiler_state &cstate,
        const bool no_ccesc
    )
    {
        pos.clear();

        ccatom.character = *curpos++;

        switch (ccatom.character)
        {
        // ClassSetSyntaxCharacter :: one of
        //   ( ) [ ] { } / - \ |
        case meta_char::mc_rbraop: // '('
        case meta_char::mc_rbracl: // ')'
        case meta_char::mc_sbraop: // '['
        case meta_char::mc_sbracl: // ']'
        case meta_char::mc_cbraop: // '{'
        case meta_char::mc_cbracl: // '}'
        case char_other::co_slash: // '/'
        case meta_char::mc_minus: // '-'
        case meta_char::mc_bar: // '|'
            this->throw_error(regex_constants::error_noescape);
            //@fallthrough@

        case meta_char::mc_escape: // '\\'
            break;

        default:
            // Any other character stands for itself.
            return false;
        }

        if (curpos == end)
            this->throw_error(regex_constants::error_escape);

        // The character that follows the backslash.
        ccatom.character = *curpos++;

        if (!no_ccesc)
        {
            // OR-ing with asc_icase makes the comparison match both 'p' and 'P'.
            if (((ccatom.character | constants::asc_icase) == char_alnum::ch_p))
            {
                return parse_escape_p_vmode(pos, ccatom, curpos, end, cstate);
            }
            else if (ccatom.character == char_alnum::ch_q)
            {
                return parse_escape_q_vmode(pos, curpos, end, cstate);
            }
        }

        switch (ccatom.character)
        {
        // ClassSetReservedPunctuator :: one of
        //   & - ! # % , : ; < = > @ ` ~
        case char_other::co_amp: // '&'
        case meta_char::mc_exclam: // '!'
// Continuation of get_character_in_class_vmode(): the remaining labels of the
// escaped ClassSetReservedPunctuator switch, then the end of that function.
// NOTE(review): this region had been flattened onto very long lines; it is laid out
// one statement per line here with every code token left unchanged.
        case meta_char::mc_sharp: // '#'
        case char_other::co_perc: // '%'
        case meta_char::mc_comma: // ','
        case meta_char::mc_colon: // ':'
        case char_other::co_smcln: // ';'
        case meta_char::mc_lt: // '<'
        case meta_char::mc_eq: // '='
        case meta_char::mc_gt: // '>'
        case char_other::co_atmrk: // '@'
        case char_other::co_grav: // '`'
        case char_other::co_tilde: // '~'
            // An escaped reserved punctuator stands for itself; ccatom.character
            // already holds it.
            return false;

        default:;
        }

        // Every other escape goes through translate_escseq_nocheck() with
        // insidecharclass == true; its return value is not used here.
        translate_escseq_nocheck(&pos.ranges, ccatom, curpos, end, true, no_ccesc);

        return false;
    }

    // Parses the braces part of a \q{...} escape; curpos must be at '{' on entry.
    //
    // The content is a list of alternatives separated by '|' and closed by '}'. The
    // characters of each alternative are read with get_character_in_class_vmode()
    // (no_ccesc == true) and collected in curseq. Each finished alternative is
    // appended to seqs as a length-prefixed record:
    //   - zero or one character: 2, then the character (constants::ccstr_empty
    //     when the alternative is empty);
    //   - two or more characters: seqlen + 1, then the characters themselves.
    // seqs is finally handed to pos.split_seqs_and_ranges().
    //
    // Throws error_escape when '{' is missing or the pattern ends before '}'.
    // Always returns true; curpos is left just past '}'.
    //
    // NOTE(review): the element type of the simple_array locals and the target type
    // of static_cast are absent in this text (template arguments appear to have been
    // lost); restore them from the upstream source before compiling.
    bool parse_escape_q_vmode(posdata_holder &pos, const uchar32 *&curpos, const uchar32 *const end, const re_compiler_state &cstate)
    {
        if (curpos == end || *curpos != meta_char::mc_cbraop) // '{'
            this->throw_error(regex_constants::error_escape);

        simple_array seqs;
        simple_array curseq;
        posdata_holder dummypos;
        state_type qatom;

        ++curpos;

        for (;;)
        {
            if (curpos == end)
                this->throw_error(regex_constants::error_escape);

            if (*curpos == meta_char::mc_bar || *curpos == meta_char::mc_cbracl) // '|' or '}'.
            {
                // End of one alternative: flush curseq into seqs.
                const uint_l32 seqlen = static_cast(curseq.size());

                if (seqlen <= 1)
                {
                    seqs.push_backncr(2);
                    seqs.push_backncr(seqlen != 0 ? curseq[0] : constants::ccstr_empty);
                }
                else // >= 2
                {
                    seqs.push_backncr(seqlen + 1);
                    seqs.append(curseq);
                }

                if (*curpos == meta_char::mc_cbracl) // '}'
                    break;

                curseq.clear();
                ++curpos;
            }
            else
            {
                // One more character of the current alternative. dummypos is only a
                // required argument; its contents are not used afterwards.
                qatom.reset();
                get_character_in_class_vmode(dummypos, qatom, curpos, end, cstate, true);

                curseq.push_backncr(qatom.character);
            }
        }

        // Step over '}'.
        ++curpos;
        pos.split_seqs_and_ranges(seqs, this->is_icase(), cstate.back);

        return true;
    }
#endif // !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)

    // Reads the character that follows a backslash into atom.character and passes
    // control to translate_escseq_nocheck() with no_ccesc == false.
    // Throws error_escape when the pattern ends right after the backslash.
    bool translate_escseq(range_pairs *const rp, state_type &atom, const uchar32 *&curpos, const uchar32 *const end, const bool insidecharclass)
    {
        if (curpos == end)
            this->throw_error(regex_constants::error_escape);

        atom.character = *curpos++;

        return translate_escseq_nocheck(rp, atom, curpos, end, insidecharclass, false);
    }

    // Handles the character class escapes \d \D \s \S \w \W and, unless
    // SRELL_NO_UNICODE_PROPERTY is defined, \p{...} \P{...}. The escape letter is
    // expected in cceatom.character.
    //
    //   rp - when non-null, the ranges of the class are added to *rp; when null the
    //        class is identified through cceatom.number instead.
    //   curpos / end / insidecharclass - only used by the \p / \P branch, which is why
    //        the parameters are unnamed when that branch is compiled out.
    //
    // Returns true (with cceatom.type set to st_character_class and cceatom.is_not
    // cleared) when the letter was one of the above, false otherwise.
    //
    // NOTE(review): the target type of the static_cast expressions is absent in this
    // text (template arguments appear to have been lost); restore before compiling.
    bool translate_character_class_escape(range_pairs *const rp, state_type &cceatom
#if !defined(SRELL_NO_UNICODE_PROPERTY)
        , const uchar32 *&curpos
        , const uchar32 *const end
        , const bool insidecharclass
#else
        , const uchar32 *&
        , const uchar32 *const
        , const bool
#endif
    )
    {
        // Predefined classes.
        switch (cceatom.character)
        {
        case char_alnum::ch_D: // 'D':
            cceatom.is_not = true;
            //@fallthrough@

        case char_alnum::ch_d: // 'd':
            cceatom.number = static_cast(re_character_class::digit); // \d, \D.
            break;

        case char_alnum::ch_S: // 'S':
            cceatom.is_not = true;
            //@fallthrough@

        case char_alnum::ch_s: // 's':
            cceatom.number = static_cast(re_character_class::space); // \s, \S.
            break;

        case char_alnum::ch_W: // 'W':
            cceatom.is_not = true;
            //@fallthrough@

        case char_alnum::ch_w: // 'w':
            // Under icase a separate word class is used, prepared on demand by
            // setup_icase_word().
            if (this->is_icase())
            {
                this->character_class.setup_icase_word();
                cceatom.number = static_cast(re_character_class::icase_word);
            }
            else
                cceatom.number = static_cast(re_character_class::word); // \w, \W.
            break;

#if !defined(SRELL_NO_UNICODE_PROPERTY)
        // Prepared for Unicode properties and script names.
// Continuation of translate_character_class_escape(): the \p / \P branch and the
// end of that function.
// NOTE(review): this region had been flattened onto very long lines; it is laid out
// one statement per line here with every code token left unchanged.
        case char_alnum::ch_P: // \P{...}
            cceatom.is_not = true;
            //@fallthrough@

        case char_alnum::ch_p: // \p{...}
            {
                // Load into the caller's set when one was given, otherwise into a
                // local set that is registered as a new class below.
                range_pairs lranges;
                range_pairs *const pranges = (rp != NULL) ? rp : &lranges;

                get_property_ranges(*pranges, curpos, end);

                // NOTE(review): with rp != NULL the negation is applied to *rp as a
                // whole - presumably callers pass a set that holds nothing else at
                // this point; verify against the call sites.
                if (cceatom.is_not)
                {
                    pranges->negation();
                    cceatom.is_not = false;
                }

                if (!insidecharclass && this->is_icase())
                    pranges->make_caseunfoldedcharset();

                if (rp == NULL)
                    cceatom.number = this->character_class.register_newclass(*pranges);
            }
            cceatom.type = st_character_class;
            return true;
#endif // !defined(SRELL_NO_UNICODE_PROPERTY)

        default:
            return false;
        }

        // Only \d \D \s \S \w \W reach this point.
        if (rp != NULL)
            add_predefclass_to_charclass(*rp, cceatom);
        else
        {
            // A negated predefined class is materialised as a newly registered class;
            // the non-negated one keeps the number assigned in the switch above.
            if (cceatom.is_not)
            {
                range_pairs lranges;

                add_predefclass_to_charclass(lranges, cceatom);
                cceatom.number = this->character_class.register_newclass(lranges);
            }
        }

        cceatom.is_not = false;
        cceatom.type = st_character_class;

        return true;
    }

    // Translates an escape whose letter is already stored in atom.character.
    //
    //   rp              - forwarded to translate_character_class_escape().
    //   insidecharclass - forwarded as well; additionally makes "\-" acceptable.
    //   no_ccesc        - when true, character class escapes are not attempted.
    //
    // Returns true, either because translate_character_class_escape() recognised a
    // class escape or after atom.character has been replaced by the escaped
    // character. Throws error_escape for anything not handled below.
    //
    // NOTE(review): the target type of the static_cast expressions is absent in this
    // text (template arguments appear to have been lost); restore before compiling.
    bool translate_escseq_nocheck(range_pairs *const rp, state_type &atom, const uchar32 *&curpos, const uchar32 *const end, const bool insidecharclass, const bool no_ccesc)
    {
        if (!no_ccesc && translate_character_class_escape(rp, atom, curpos, end, insidecharclass))
            return true;

        switch (atom.character)
        {
        case char_alnum::ch_b:
            atom.character = char_ctrl::cc_bs; // '\b' 0x08:BS
            break;

        case char_alnum::ch_t:
            atom.character = char_ctrl::cc_htab; // '\t' 0x09:HT
            break;

        case char_alnum::ch_n:
            atom.character = char_ctrl::cc_nl; // '\n' 0x0a:LF
            break;

        case char_alnum::ch_v:
            atom.character = char_ctrl::cc_vtab; // '\v' 0x0b:VT
            break;

        case char_alnum::ch_f:
            atom.character = char_ctrl::cc_ff; // '\f' 0x0c:FF
            break;

        case char_alnum::ch_r:
            atom.character = char_ctrl::cc_cr; // '\r' 0x0d:CR
            break;

        case char_alnum::ch_c: // \cX
            // NOTE(review): when "\c" is the very last thing in the pattern nothing
            // is consumed and atom.character stays ch_c, so no error is raised here -
            // confirm that this is intended given the "Strict." branch below.
            if (curpos != end)
            {
                // atom.character = static_cast(utf_traits().codepoint_inc(curpos, end) & 0x1f); // *curpos++
                // Lower-case the next character for the range test only; the value
                // actually stored is the original character masked with 0x1f.
                atom.character = static_cast(*curpos | constants::asc_icase);

                if (atom.character >= char_alnum::ch_a && atom.character <= char_alnum::ch_z)
                    atom.character = static_cast(*curpos++ & 0x1f);
                else
                {
                    this->throw_error(regex_constants::error_escape); // Strict.
                    // atom.character = char_alnum::ch_c; // Loose.
                }
            }
            break;

        case char_alnum::ch_0:
            atom.character = char_ctrl::cc_nul; // '\0' 0x00:NUL
            break;

        case char_alnum::ch_x: // \xhh
            // Exactly two hexadecimal digits; otherwise invalid_u32value comes back
            // and is turned into error_escape after the switch.
            atom.character = translate_numbers(curpos, end, 16, 2, 2, 0xff);
            break;

        case char_alnum::ch_u: // \uhhhh, \u{h~hhhhhh}
            atom.character = parse_escape_u(curpos, end);
            break;

        // SyntaxCharacter, '/', and '-'.
        case meta_char::mc_caret: // '^'
        case meta_char::mc_dollar: // '$'
        case meta_char::mc_escape: // '\\'
        case meta_char::mc_period: // '.'
        case meta_char::mc_astrsk: // '*'
        case meta_char::mc_plus: // '+'
        case meta_char::mc_query: // '?'
        case meta_char::mc_rbraop: // '('
        case meta_char::mc_rbracl: // ')'
        case meta_char::mc_sbraop: // '['
        case meta_char::mc_sbracl: // ']'
        case meta_char::mc_cbraop: // '{'
        case meta_char::mc_cbracl: // '}'
        case meta_char::mc_bar: // '|'
        case char_other::co_slash: // '/'
            break;

        case meta_char::mc_minus: // '-' allowed only in charclass.
            if (insidecharclass)
                break;
            //@fallthrough@

        default:
            atom.character = constants::invalid_u32value;
        }

        if (atom.character == constants::invalid_u32value)
            this->throw_error(regex_constants::error_escape);

        return true;
    }

    // Parses what follows "\u": either {h...} (one or more hexadecimal digits, value
    // at most constants::unicode_max_codepoint, closing '}' required) or exactly four
    // hexadecimal digits. In the four-digit form, a value in 0xd800-0xdbff that is
    // immediately followed by another \uhhhh in 0xdc00-0xdfff is combined with it
    // into a single code point.
    //
    // Returns the code point, or constants::invalid_u32value when the input is
    // exhausted, the digits are malformed or the closing brace is missing. On
    // success curpos is advanced past everything that was consumed.
    uchar32 parse_escape_u(const uchar32 *&curpos, const uchar32 *const end) const
    {
        uchar32 ucp;

        if (curpos == end)
            return constants::invalid_u32value;

        if (*curpos == meta_char::mc_cbraop)
        {
            // ucp = translate_numbers(++curpos, end, 16, 1, 6, constants::unicode_max_codepoint, true);
            ucp = translate_numbers(++curpos, end, 16, 1, 0, constants::unicode_max_codepoint);

            if (curpos == end || *curpos != meta_char::mc_cbracl)
                return constants::invalid_u32value;

            ++curpos;
        }
        else
        {
            ucp = translate_numbers(curpos, end, 16, 4, 4, 0xffff);

            if (ucp >= 0xd800 && ucp <= 0xdbff)
            {
                // Look ahead on a copy of the position; curpos is only committed
                // when a matching second half is really there.
                const uchar32 * prefetch = curpos;

                if (prefetch != end && *prefetch == meta_char::mc_escape && ++prefetch != end && *prefetch == char_alnum::ch_u)
                {
                    const uchar32 nextucp = translate_numbers(++prefetch, end, 16, 4, 4, 0xffff);

                    if (nextucp >= 0xdc00 && nextucp <= 0xdfff)
                    {
                        curpos = prefetch;
                        // Low ten bits of each half, plus the 0x10000 offset.
                        ucp = (((ucp << 10) & 0xffc00) | (nextucp & 0x3ff)) + 0x10000;
                    }
                }
            }
        }
        return ucp;
    }

#if !defined(SRELL_NO_UNICODE_PROPERTY)

    // Parses the {...} part of \p / \P at curpos and loads the ranges of the named
    // property into pranges. Throws error_property when the lookup yields
    // up_constants::error_property or when character_class.is_pos() is true for the
    // property number.
    void get_property_ranges(range_pairs &pranges, const uchar32 *&curpos, const uchar32 *const end)
    {
        const uint_l32 pnumber = lookup_propertynumber(curpos, end);

        if (pnumber == up_constants::error_property || this->character_class.is_pos(pnumber))
            this->throw_error(regex_constants::error_property);

        this->character_class.load_upranges(pranges, pnumber);
    }

    // Parses "{name=value}" / "{value}" at curpos and returns the number that
    // character_class.get_propertynumber() assigns to that name/value pair.
    uint_l32 lookup_propertynumber(const uchar32 *&curpos, const uchar32 *const end)
    {
        pstring pname;
        pstring pvalue;

        get_property_name_and_value(pname, pvalue, curpos, end);

        return this->character_class.get_propertynumber(pname, pvalue);
    }

    // Splits "{name=value}" or "{value}" into pname and pvalue; curpos must be at
    // '{' on entry and is left just past '}'. With a single token, pname is left as
    // passed in and the token ends up in pvalue. Throws error_escape on any
    // syntax problem.
    //
    // get_property_name_or_value() marks a token that contains a digit by appending
    // a space to it; such a first token is not considered as a name, and the marker
    // is removed from pvalue before returning.
    void get_property_name_and_value(pstring &pname, pstring &pvalue, const uchar32 *&curpos, const uchar32 *const end)
    {
        if (curpos == end || *curpos != meta_char::mc_cbraop) // '{'
            // (Continuation of get_property_name_and_value(); this statement is the body
            // of the "missing '{'" check that closes the preceding line. The region had
            // been flattened onto very long lines; code tokens are unchanged.)
            this->throw_error(regex_constants::error_escape);

        // The first token is stored in pvalue; it is moved to pname below if an '='
        // follows.
        get_property_name_or_value(pvalue, ++curpos, end);

        if (!pvalue.size())
            this->throw_error(regex_constants::error_escape);

        // NOTE(review): the target type of static_cast is absent in this text
        // (template arguments appear to have been lost); restore before compiling.
        if (static_cast(pvalue[pvalue.size() - 1]) != char_other::co_sp) // ' ', not a value.
        {
            if (curpos == end)
                this->throw_error(regex_constants::error_escape);

            if (*curpos == meta_char::mc_eq) // '='
            {
                pname = pvalue;
                get_property_name_or_value(pvalue, ++curpos, end);
                if (!pvalue.size())
                    this->throw_error(regex_constants::error_escape);
            }
        }

        if (curpos == end || *curpos != meta_char::mc_cbracl) // '}'
            this->throw_error(regex_constants::error_escape);

        // Drop the trailing marker that get_property_name_or_value() appends to a
        // token containing a digit. pvalue is known to be non-empty here.
        if (static_cast(pvalue[pvalue.size() - 1]) == char_other::co_sp) // ' ', value.
            pvalue.resize(pvalue.size() - 1);

        ++curpos;
    }

    // Collects the longest run of [A-Za-z0-9_] starting at curpos into name_or_value
    // (cleared first) and leaves curpos on the first character that is not part of
    // the run, or at end. If at least one digit was seen, a single space is appended
    // to the result as a marker.
    void get_property_name_or_value(pstring &name_or_value, const uchar32 *&curpos, const uchar32 *const end) const
    {
        bool number_found = false;

        name_or_value.clear();

        for (;; ++curpos)
        {
            if (curpos == end)
                break;

            const uchar32 curchar = *curpos;

            // The empty statements accept the character as is; only a digit has
            // the side effect of setting number_found.
            if (curchar >= char_alnum::ch_A && curchar <= char_alnum::ch_Z)
                ;
            else if (curchar >= char_alnum::ch_a && curchar <= char_alnum::ch_z)
                ;
            else if (curchar == char_other::co_ll) // '_'
                ;
            else if (curchar >= char_alnum::ch_0 && curchar <= char_alnum::ch_9)
                number_found = true;
            else
                break;

            name_or_value.append(1, static_cast(curchar));
        }
        if (number_found)
            name_or_value.append(1, char_other::co_sp); // ' '
    }

#endif // !defined(SRELL_NO_UNICODE_PROPERTY)

    // Translates an escape that appears outside a character class. On entry curpos
    // points at the character following the backslash.
    //
    //   escatom          - receives the resulting state (type, number/character, ...).
    //   piece, piecesize - only used by the v-mode \p / \P branch, which is why they
    //                      are unnamed when that branch is compiled out.
    //
    // Dispatches to: parse_escape_p_vmode() for \p / \P in v-mode; the \b / \B
    // boundary set-up; parse_backreference_name() for \k (unless
    // SRELL_NO_NAMEDCAPTURE); parse_backreference_number() for \1-\9; and
    // translate_escseq() for everything else.
    // Throws error_escape when the pattern ends right after the backslash.
    bool translate_atom_escape(state_type &escatom
#if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)
        , state_array &piece
        , re_quantifier &piecesize
#else
        , state_array &
        , re_quantifier &
#endif
        , const uchar32 *&curpos, const uchar32 *const end, /* const */ re_compiler_state &cstate)
    {
        if (curpos == end)
            this->throw_error(regex_constants::error_escape);

        // Peek only: curpos is advanced by whichever branch handles the escape.
        escatom.character = *curpos;

#if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)

        // OR-ing with asc_icase makes the comparison match both 'p' and 'P'.
        if (this->is_vmode() && ((escatom.character | constants::asc_icase) ==
            // (Continuation of translate_atom_escape(): right-hand side of the v-mode
            // \p / \P test that starts on the preceding line. The region had been
            // flattened onto very long lines; code tokens are unchanged.)
            char_alnum::ch_p))
        {
            posdata_holder pos;

            parse_escape_p_vmode(pos, escatom, ++curpos, end, cstate);

            // parse_escape_p_vmode() sets st_character_class for a property made of
            // ranges; otherwise the string data in pos is expanded into piece.
            if (escatom.type == st_character_class)
                escatom.number = this->character_class.register_newclass(pos.ranges);
            else
                transform_seqdata(piece, pos);

            piecesize.set(escatom.quantifier.atleast, escatom.quantifier.atmost);

            return true;
        }
#endif // !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)

        // NOTE(review): the target type of the static_cast expressions below is absent
        // in this text (template arguments appear to have been lost); restore them
        // before compiling.
        switch (escatom.character)
        {
        case char_alnum::ch_B: // 'B':
            escatom.is_not = true;
            //@fallthrough@

        case char_alnum::ch_b: // 'b':
            escatom.type = st_boundary; // \b, \B.
            escatom.quantifier.reset(0);
            // atom.number = 0;
            // The boundary test refers to the same word class as \w (the icase
            // variant when icase is set).
            if (this->is_icase())
            {
                this->character_class.setup_icase_word();
                escatom.number = static_cast(re_character_class::icase_word);
            }
            else
                escatom.number = static_cast(re_character_class::word); // \w, \W.
            break;

        // case char_alnum::ch_A: // 'A':
        //     atom.type = st_bol; // '\A'
        // case char_alnum::ch_Z: // 'Z':
        //     atom.type = st_eol; // '\Z'
        // case char_alnum::ch_z: // 'z':
        //     atom.type = st_eol; // '\z'
        // case char_alnum::ch_R: // 'R':
        // (?>\r\n?|[\x0A-\x0C\x85\u{2028}\u{2029}])

        // Backreferences.

#if !defined(SRELL_NO_NAMEDCAPTURE)
        // Prepared for named captures.
        case char_alnum::ch_k: // 'k':
            return parse_backreference_name(escatom, curpos, end, cstate); // \k.
#endif
        default:

            if (escatom.character >= char_alnum::ch_1 && escatom.character <= char_alnum::ch_9) // \1, \9.
                return parse_backreference_number(escatom, curpos, end, cstate);

            // translate_escseq() re-reads the escape letter from curpos itself.
            return translate_escseq(NULL, escatom, curpos, end, false);
        }

        // Only the \b / \B branch arrives here; step over the letter.
        ++curpos;

        return true;
    }

    // Parses the decimal number of a numeric backreference starting at curpos and
    // turns atom into a backreference state carrying that number.
    // Returns the result of backreference_postprocess() (true).
    bool parse_backreference_number(state_type &atom, const uchar32 *&curpos, const uchar32 *const end, const re_compiler_state &cstate)
    {
        const uchar32 backrefno = translate_numbers(curpos, end, 10, 0, 0, 0xfffffffe);
            // 22.2.1.1 Static Semantics: Early Errors:
            // It is a Syntax Error if NcapturingParens >= 2^32 - 1.

        if (backrefno == constants::invalid_u32value)
            this->throw_error(regex_constants::error_escape);

        // NOTE(review): the target type of static_cast is absent in this text.
        atom.number = static_cast(backrefno);
        atom.backrefnumber_unresolved = false;

        return backreference_postprocess(atom, cstate);
    }

    // Common final step for numeric and named backreferences: marks atom as
    // st_backreference with next2 == 1. Always returns true.
    bool backreference_postprocess(state_type &atom, const re_compiler_state & /* cstate */) const
    {
        atom.next2 = 1;
        atom.type = st_backreference;

        // atom.quantifier.atleast = cstate.atleast_widths_of_brackets[atom.number - 1];
            // Moved to check_backreferences().

        return true;
    }

#if !defined(SRELL_NO_NAMEDCAPTURE)

    // Parses "\k<name>"; on entry curpos points at the 'k'.
    //
    // If the name is already known to this->namedcaptures, its number is stored in
    // atom.number. Otherwise the backreference is flagged as unresolved, the name is
    // appended to cstate.unresolved_gnames, and atom.number holds the size that list
    // had before the append; check_backreferences() resolves such entries.
    // Throws error_escape when 'k' is not followed by '<'.
    bool parse_backreference_name(state_type &atom, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state &cstate)
    {
        if (++curpos == end || *curpos != meta_char::mc_lt)
            this->throw_error(regex_constants::error_escape);

        const gname_string groupname = get_groupname(++curpos, end, cstate);

        atom.number = this->namedcaptures[groupname];

        if (atom.number != groupname_mapper::notfound)
            atom.backrefnumber_unresolved = false;
        else
        {
            atom.backrefnumber_unresolved = true;
            // NOTE(review): the target type of static_cast is absent in this text.
            atom.number = static_cast(cstate.unresolved_gnames.size());
            cstate.unresolved_gnames.push_back(groupname, atom.number);
        }

        return backreference_postprocess(atom, cstate);
    }

    // Reads a group name terminated by '>' starting at curpos and returns it encoded
    // as code units of charT; curpos is left just past '>'.
    //
    // "\u" escapes inside the name are decoded through parse_escape_u(). Each
    // character is validated with cstate.idchecker.is_identifier() (second argument:
    // whether this is not the first character); when SRELL_NO_UNICODE_PROPERTY is
    // defined the only check is that the character is not a backslash.
    // Throws error_escape for an invalid character, an empty name, or a pattern that
    // ends before '>'.
    //
    // NOTE(review): "utf_traits::..." is written without template arguments in this
    // text (they appear to have been lost); restore before compiling.
#if !defined(SRELL_NO_UNICODE_PROPERTY)
    gname_string get_groupname(const uchar32 *&curpos, const uchar32 *const end, re_compiler_state &cstate)
#else
    gname_string get_groupname(const uchar32 *&curpos, const uchar32 *const end, re_compiler_state &)
#endif
    {
        charT mbstr[utf_traits::maxseqlen];
        gname_string groupname;

#if !defined(SRELL_NO_UNICODE_PROPERTY)
        cstate.idchecker.setup();
#endif
        for (;;)
        {
            if (curpos == end)
                this->throw_error(regex_constants::error_escape);

            uchar32 curchar = *curpos++;

            if (curchar == meta_char::mc_gt) // '>'
                break;

            if (curchar == meta_char::mc_escape && curpos != end && *curpos == char_alnum::ch_u) // '\\', 'u'.
                // (Continuation of get_groupname(): body of the "\u" test that closes the
                // preceding line. The region had been flattened onto very long lines;
                // code tokens are unchanged.)
                curchar = parse_escape_u(++curpos, end);

#if defined(SRELL_NO_UNICODE_PROPERTY)
            if (curchar != meta_char::mc_escape)
#else
            if (cstate.idchecker.is_identifier(curchar, groupname.size() != 0))
#endif
                ; // OK.
            else
                curchar = constants::invalid_u32value;

            // Also reached when parse_escape_u() itself reported a failure.
            if (curchar == constants::invalid_u32value)
                this->throw_error(regex_constants::error_escape);

            // Append the character as code units of charT.
            const uchar32 seqlen = utf_traits::to_codeunits(mbstr, curchar);
            for (uchar32 i = 0; i < seqlen; ++i)
                groupname.append(1, mbstr[i]);
        }
        if (!groupname.size())
            this->throw_error(regex_constants::error_escape);

        return groupname;
    }
#endif // !defined(SRELL_NO_NAMEDCAPTURE)

#if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)

    // Handles \p{...} / \P{...} in v-mode; patom.character holds 'p' or 'P' and
    // curpos points at what follows that letter.
    //
    // When character_class.is_pos() is false for the property, its ranges are loaded
    // into pos.ranges (negated for \P) and patom becomes an st_character_class state
    // with quantifier reset(1). When is_pos() is true, the raw sequence data is
    // split into pos via split_seqs_and_ranges(), patom.quantifier receives
    // pos.length, and \P is rejected with error_complement.
    //
    // Throws error_escape at end of input and error_property for an unknown
    // property. Always returns true.
    //
    // NOTE(review): the target type of static_cast and the element type of
    // simple_array are absent in this text (template arguments appear to have been
    // lost); restore them before compiling.
    bool parse_escape_p_vmode(posdata_holder &pos, state_type &patom, const uchar32 *&curpos, const uchar32 *const end, const re_compiler_state &cstate)
    {
        if (curpos == end)
            this->throw_error(regex_constants::error_escape);

        // patom.is_not = (patom.character & constants::asc_icase) ? false : true;
        if (patom.character == char_alnum::ch_P) // \P{...}
            patom.is_not = true;

        patom.number = lookup_propertynumber(curpos, end);

        if (patom.number == up_constants::error_property)
            this->throw_error(regex_constants::error_property);

        if (!this->character_class.is_pos(patom.number))
        {
            pos.clear();

            this->character_class.load_upranges(pos.ranges, patom.number);

            // Case-unfolding is skipped for property numbers below
            // number_of_predefcls.
            if (this->is_icase() && patom.number >= static_cast(re_character_class::number_of_predefcls))
                pos.ranges.make_caseunfoldedcharset();

            if (patom.is_not)
            {
                pos.ranges.negation();
                patom.is_not = false;
            }

            patom.type = st_character_class;
            patom.quantifier.reset(1);
        }
        else
        {
            simple_array sequences;

            this->character_class.get_prawdata(sequences, patom.number);
            pos.split_seqs_and_ranges(sequences, this->is_icase(), cstate.back);

            patom.quantifier.set(pos.length.first, pos.length.second);

            if (patom.is_not)
                this->throw_error(regex_constants::error_complement);
        }
        return true;
    }

    // Expands the string data held in pos into a sequence of states appended to
    // piece, and returns the number of the class registered for pos.ranges.
    //
    // When pos.indices is empty nothing is appended to piece. Otherwise, for each
    // sequence length from pos.indices.size() - 1 down to 2, every sequence of that
    // length found in pos.seqs is emitted as one branch state ('|') followed by one
    // state per character and a trailing epsilon state; the class state for the
    // single characters comes last. The whole piece is finally wrapped between two
    // ':' epsilon states and passed to optimise_pos().
    // NOTE(review): presumably pos.indices[n] .. pos.indices[n - 1] delimits the
    // sequences of length n inside pos.seqs - confirm against posdata_holder.
    uint_l32 transform_seqdata(state_array &piece, const posdata_holder &pos)
    {
        uchar32 seqlen = static_cast(pos.indices.size());
        state_type ccatom;

        ccatom.reset();
        ccatom.type = st_character_class;
        ccatom.number = this->character_class.register_newclass(pos.ranges);

        if (seqlen > 0)
        {
            const bool has_empty = pos.has_empty();
            // Index in piece of the most recently emitted branch state.
            state_size_type prevbranch_end = 0;
            state_type branchatom;
            state_type jumpatom;
            // Scratch buffer: the character states of one sequence plus jumpatom.
            state_array branch;

            branch.resize(seqlen);
            for (uchar32 i = 0; i < seqlen; ++i)
                branch[i].reset();

            branchatom.reset();
            branchatom.type = st_epsilon;
            branchatom.character = meta_char::mc_bar; // '|'

            jumpatom.reset();
            jumpatom.type = st_epsilon;

            --seqlen;

            for (; seqlen >= 2; --seqlen)
            {
                uchar32 offset = pos.indices[seqlen];
                const uchar32 seqend = pos.indices[seqlen - 1];

                if (offset != seqend)
                {
                    branch.resize(seqlen + 1);
                    branch[seqlen] = jumpatom;

                    // The branch state records the length of its sequence in
                    // quantifier.atleast; optimise_pos() reads it back.
                    branchatom.quantifier.atleast = seqlen;

                    for (uchar32 count = 0; offset < seqend; ++offset)
                    {
                        branch[count++].character = pos.seqs[offset];

                        if (count == seqlen)
                        {
                            // One complete sequence collected. Before appending it,
                            // link the previously emitted branch to this new one.
                            if (piece.size())
                            {
                                state_type &lastatom = piece[piece.size() - 1];

                                lastatom.next1 = seqlen + 2;
                                piece[prevbranch_end].next2 = static_cast(piece.size() - prevbranch_end);
                            }
                            prevbranch_end = piece.size();
                            piece.push_back(branchatom);
                            piece.append(branch);
                            count = 0;
                        }
                    }
                }
            }

            // Link the last emitted branch to what follows (an extra state is
            // inserted below when the data contains the empty string).
            if (piece.size())
            {
                state_type &lastatom = piece[piece.size() - 1];

                lastatom.next1 = has_empty ? 3 : 2;
                piece[prevbranch_end].next2 = static_cast(piece.size() - prevbranch_end);
            }

            branchatom.quantifier.atleast = 1;
            if (has_empty)
            {
                branchatom.next2 = 2;
                piece.push_back(branchatom);
            }

            piece.push_back(ccatom);

            // Wrap the piece between two ':' epsilon states.
            branchatom.character = meta_char::mc_colon; // ':'
            branchatom.next1 = 1;
            branchatom.next2 = 0;
            branchatom.quantifier.set(1, 0);
            piece.insert(0, branchatom);
            ++prevbranch_end;

            branchatom.quantifier.atmost = 1;
            piece.push_back(branchatom);

            optimise_pos(piece, branchatom, has_empty);
        }
        return ccatom.number;
    }

    // Post-processes the state sequence built by transform_seqdata(): walks the
    // branch states from index 1 and compares the character states of each branch
    // with those of the branches that follow it, rewriting next1 / next2 links
    // (and reusing quantifier.is_greedy / quantifier.atmost of character states as
    // bookkeeping) along the way.
    // The whole body is compiled out when SRELLDBG_NO_ASTERISK_OPT or
    // SRELLDBG_NO_POS_OPT is defined.
    //
    //   branchatom - used as scratch storage (number, next1, next2) by this function.
    //
    // NOTE(review): ins_bt is filled here but not consumed anywhere in this
    // function as visible; presumably intended for insert_btbranch() - confirm.
    // NOTE(review): the element type of simple_array and the target type of
    // static_cast are absent in this text; restore them before compiling.
    void optimise_pos(state_array &piece, state_type &branchatom, const bool has_empty) const
    {
#if !defined(SRELLDBG_NO_ASTERISK_OPT) && !defined(SRELLDBG_NO_POS_OPT)

        simple_array ins_bt;

        // number counts the branches processed; next1 remembers the first one.
        branchatom.number = 0u;
        branchatom.next1 = 0;

        for (state_size_type srcbase = 1; srcbase < piece.size();)
        {
            state_type &srcbaseatom = piece[srcbase];

            if (srcbaseatom.is_branch())
            {
                // A recorded length of 1 ends the scan.
                if (srcbaseatom.quantifier.atleast == 1)
                {
                    break;
                }

                const state_size_type dstbaseinit = srcbase + srcbaseatom.next2;
                const uint_l32 srcseqlen = srcbaseatom.quantifier.atleast;

                // Try successively shorter common prefixes (srcseqlen - 1 down to 0).
                for (uint_l32 complen = srcseqlen; complen;)
                {
                    state_size_type dstbase = dstbaseinit;
                    bool modified = false;

                    --complen;

                    for (;;)
                    {
                        state_type &dstbaseatom = piece[dstbase];
                        const uint_l32 dstseqlen = dstbaseatom.quantifier.atleast;
                        // Skip the branch state itself, if there is one.
                        state_size_type dstpos = dstbase + (dstbaseatom.type == st_epsilon ?
                            // (Continuation of optimise_pos(): remainder of the
                            // conditional expression that starts on the preceding line.
                            // The region had been flattened onto very long lines; code
                            // tokens are unchanged.)
                            1 : 0);

                        // Reached the final character class state: remember where it
                        // is and stop scanning for this prefix length.
                        if (piece[dstpos].type == st_character_class)
                        {
                            branchatom.next2 = static_cast(dstbase);
                            break;
                        }

                        if (dstseqlen >= complen)
                        {
                            state_size_type srcpos = srcbase + 1;

                            // Compare the first complen characters of both branches.
                            for (uint_l32 i = 0;; ++i)
                            {
                                state_type &srcref = piece[srcpos];
                                state_type &dstref = piece[dstpos];

                                if (i == complen)
                                {
                                    // The prefixes are equal; look at what follows
                                    // the prefix in the destination branch.
                                    if (dstref.type == st_epsilon)
                                    {
                                        if (complen)
                                        {
                                            uchar32 inspos = static_cast(ins_bt.size());

                                            // Follow the chain recorded in
                                            // quantifier.atmost by the branch below.
                                            for (; !piece[srcpos].quantifier.is_greedy;)
                                                srcpos = static_cast(piece[srcpos].quantifier.atmost);

                                            // Keep the (srcpos, dstpos) pairs ordered
                                            // by srcpos.
                                            for (; inspos >= 2 && (ins_bt[inspos - 2] > srcpos);)
                                            {
                                                inspos -= 2;
                                            }
                                            ins_bt.insert(inspos, dstpos);
                                            ins_bt.insert(inspos, srcpos);
                                        }
                                    }
                                    else if (dstref.type == st_character && srcref.character != dstref.character)
                                    {
                                        // First differing character: link the source
                                        // state to the destination one (only once,
                                        // tracked through is_greedy).
                                        if (dstref.quantifier.is_greedy)
                                        {
                                            piece[srcpos].next2 = static_cast(dstpos - srcpos);
                                            dstref.quantifier.is_greedy = false;
                                            dstref.quantifier.atmost = srcpos;
                                        }
                                        modified = true;
                                    }
                                    break;
                                }
                                else if (srcref.character == dstref.character && dstref.type == st_character)
                                {
                                    ++srcpos;
                                    ++dstpos;
                                }
                                else
                                    break;
                            }

                            if (modified)
                                break;

                            // Move on to the next destination branch, if any.
                            if (dstbaseatom.type == st_epsilon && dstbaseatom.next2)
                            {
                                dstbase += dstbaseatom.next2;
                            }
                            else
                            {
                                break;
                            }
                        }
                        else
                            break;
                    }
                }

                if (branchatom.number == 0)
                {
                    // Remember the first branch.
                    branchatom.next1 = static_cast(srcbase);
                }
                else
                {
                    // Later branch states lose their next2 link and are re-tagged
                    // with ch_s.
                    srcbaseatom.next2 = 0;
                    srcbaseatom.character = char_alnum::ch_s;
                }

                srcbase = dstbaseinit;
                ++branchatom.number;
            }
            else
                break;
        // ++srcbase;
        }

        if (branchatom.next2 == 0)
            ++branchatom.next2;

        // Make the first branch point at the position stored in branchatom.next2.
        if (branchatom.next1 != 0)
            piece[branchatom.next1].next2 = static_cast(branchatom.next2 - branchatom.next1);

        if (has_empty)
        {
            piece[branchatom.next2].next2 = 0;
            piece[branchatom.next2 + 1].next2 = 1;
        }

#endif // !defined(SRELLDBG_NO_ASTERISK_OPT) && !defined(SRELLDBG_NO_POS_OPT)
    }

    // Inserts one '|' epsilon branch state into piece for every (srcpos, dstpos)
    // pair stored consecutively in ins_bt, and adjusts the relative next1 / next2
    // offsets of the existing states so that they still reach the same targets.
    //
    // Two index maps are built first (sizes piece.size() + 1, so that a link to one
    // past the last state can be translated as well):
    //   reordering1[i] - new index of the position just before anything inserted
    //                    at old index i (used for link targets and insert points);
    //   reordering2[i] - new index of the state that was at old index i itself.
    // NOTE(review): the map construction advances through ins_bt monotonically, so
    // the srcpos entries are presumably sorted in ascending order - confirm.
    // NOTE(review): the element type of simple_array and the target type of
    // static_cast are absent in this text; restore them before compiling.
    void insert_btbranch(state_array &piece, const simple_array &ins_bt) const
    {
        state_type insstate;
        simple_array reordering1;
        simple_array reordering2;
        // Number of states inserted before the current index so far.
        uchar32 offset = 0;
        // Index into ins_bt of the next pair's srcpos.
        uchar32 chainindex = 0;

        insstate.reset();
        insstate.type = st_epsilon;
        insstate.character = meta_char::mc_bar;

        reordering1.resize(piece.size() + 1);
        reordering2.resize(piece.size() + 1);

        for (uchar32 indx = 0; indx <= piece.size(); ++indx)
        {
            reordering1[indx] = indx + offset;

            if (chainindex < ins_bt.size())
            {
                if (indx == ins_bt[chainindex])
                {
                    ++offset;
                    chainindex += 2;
                }
            }
            reordering2[indx] = indx + offset;
        }

        // Recompute the relative offsets; a value of 0 means "no link" and is kept.
        for (uchar32 indx = 0; indx < piece.size(); ++indx)
        {
            state_type &st = piece[indx];

            if (st.next1 != 0)
            {
                const uchar32 newn1 = reordering1[indx + st.next1];
                st.next1 = static_cast(newn1 - reordering2[indx]);
            }

            if (st.next2 != 0)
            {
                const uchar32 newn2 = reordering1[indx + st.next2];
                st.next2 = static_cast(newn2 - reordering2[indx]);
            }
        }

        // Perform the insertions; indx is advanced twice per iteration.
        for (uchar32 indx = 0; indx < ins_bt.size();)
        {
            const uchar32 srcpos = reordering1[ins_bt[indx++]];
            const uchar32 dstpos = reordering1[ins_bt[indx++]];

            piece.insert(srcpos, insstate);

            state_type &insst = piece[srcpos];

            insst.next2 = static_cast(dstpos - srcpos);
        }
    }

#endif // !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)

    // Applies a quantifier found at curpos ('*', '+', '?' or '{...}'), optionally
    // followed by '?' for non-greedy, to quantifier. For any other character
    // nothing is consumed and quantifier is left untouched.
    //
    // Precondition: curpos != end (the character is read unconditionally).
    // NOTE(review): '*' and '?' decrement quantifier.atleast, so the caller
    // presumably passes in a quantifier whose atleast is 1 - confirm.
    // Always returns true.
    bool get_quantifier(re_quantifier &quantifier, const uchar32 *&curpos, const uchar32 *const end)
    {
        switch (*curpos)
        {
        case meta_char::mc_astrsk: // '*':
            --quantifier.atleast;
            //@fallthrough@

        case meta_char::mc_plus: // '+':
            quantifier.set_infinity();
            break;

        case meta_char::mc_query: // '?':
            --quantifier.atleast;
            break;

        case meta_char::mc_cbraop: // '{':
            // Leaves curpos on the closing '}'.
            get_brace_with_quantifier(quantifier, curpos, end);
            break;

        default:
            return true;
        }

        // Step over the quantifier character and check for a lazy marker.
        if (++curpos != end && *curpos == meta_char::mc_query) // '?'
{ quantifier.is_greedy = false; ++curpos; } return true; } void get_brace_with_quantifier(re_quantifier &quantifier, const uchar32 *&curpos, const uchar32 *const end) { ++curpos; quantifier.atleast = static_cast(translate_numbers(curpos, end, 10, 1, 0, constants::max_u32value)); if (quantifier.atleast == static_cast(constants::invalid_u32value)) goto THROW_ERROR_BRACE; if (curpos == end) goto THROW_ERROR_BRACE; if (*curpos == meta_char::mc_comma) // ',' { ++curpos; quantifier.atmost = static_cast(translate_numbers(curpos, end, 10, 1, 0, constants::max_u32value)); if (quantifier.atmost == static_cast(constants::invalid_u32value)) quantifier.set_infinity(); if (!quantifier.is_valid()) this->throw_error(regex_constants::error_badbrace); } else quantifier.atmost = quantifier.atleast; if (curpos == end || *curpos != meta_char::mc_cbracl) // '}' { THROW_ERROR_BRACE: this->throw_error(regex_constants::error_brace); } // *curpos == '}' } uchar32 translate_numbers(const uchar32 *&curpos, const uchar32 *const end, const int radix, const std::size_t minsize, const std::size_t maxsize, const uchar32 maxvalue) const { std::size_t count = 0; uchar32 u32value = 0; int num; for (; maxsize == 0 || count < maxsize; ++curpos, ++count) { if (curpos == end || (num = tonumber(*curpos, radix)) == -1) break; const uchar32 nextvalue = u32value * radix + num; if ((/* maxvalue != 0 && */ nextvalue > maxvalue) || nextvalue < u32value) break; u32value = nextvalue; } if (count >= minsize) return u32value; return constants::invalid_u32value; } int tonumber(const uchar32 ch, const int radix) const { if ((ch >= char_alnum::ch_0 && ch <= char_alnum::ch_7) || (radix >= 10 && (ch == char_alnum::ch_8 || ch == char_alnum::ch_9))) return static_cast(ch - char_alnum::ch_0); if (radix == 16) { if (ch >= char_alnum::ch_a && ch <= char_alnum::ch_f) return static_cast(ch - char_alnum::ch_a + 10); if (ch >= char_alnum::ch_A && ch <= char_alnum::ch_F) return static_cast(ch - char_alnum::ch_A + 10); } return 
            // (Continuation of tonumber(): operand of the final return statement that
            // starts on the preceding line. The region had been flattened onto very long
            // lines; code tokens are unchanged.)
            -1;
    }

    // Validates and finalises every st_backreference state in this->NFA_states.
    //
    // For each backreference:
    //   - a name left unresolved by parse_backreference_name() is looked up in
    //     this->namedcaptures (unless SRELL_NO_NAMEDCAPTURE is defined);
    //   - the st_roundbracket_close state with the same number is searched for from
    //     the start of the state array;
    //   - when that state precedes the backreference, the minimum width recorded in
    //     cstate.atleast_widths_of_brackets is copied into the backreference and
    //     cstate.backref_used is set;
    //   - otherwise the backreference is turned into an st_epsilon state with
    //     next2 == 0.
    //
    // Returns false when a name cannot be resolved or no group with the referenced
    // number exists; true otherwise.
    bool check_backreferences(re_compiler_state &cstate)
    {
        for (typename state_array::size_type backrefpos = 0; backrefpos < this->NFA_states.size(); ++backrefpos)
        {
            state_type &brs = this->NFA_states[backrefpos];

            if (brs.type == st_backreference)
            {
                // A reference, not a copy: it reflects the assignment to brs.number
                // made in the name-resolution step below.
                const uint_l32 &backrefno = brs.number;

#if !defined(SRELL_NO_NAMEDCAPTURE)
                if (brs.backrefnumber_unresolved)
                {
                    // Here the number is still an index into unresolved_gnames.
                    if (backrefno >= cstate.unresolved_gnames.size())
                        return false; // Internal error.

                    brs.number = this->namedcaptures[cstate.unresolved_gnames[backrefno]];

                    if (backrefno == groupname_mapper::notfound)
                        return false;

                    brs.backrefnumber_unresolved = false;
                }
#endif

                for (typename state_array::size_type roundbracket_closepos = 0;; ++roundbracket_closepos)
                {
                    if (roundbracket_closepos < this->NFA_states.size())
                    {
                        const state_type &rbcs = this->NFA_states[roundbracket_closepos];

                        if (rbcs.type == st_roundbracket_close && rbcs.number == backrefno)
                        {
                            if (roundbracket_closepos < backrefpos)
                            {
                                // brs.quantifier.atleast = cstate.atleast_widths_of_brackets[backrefno - 1];
                                // 20210429: It was reported that clang-tidy was dissatisfied with this code.
                                // 20211006: Replaced with the following code:

                                const uint_l32 backrefnoindex = backrefno - 1;

                                // This can never be true. Added only for satisfying clang-tidy.
                                if (backrefnoindex >= cstate.atleast_widths_of_brackets.size())
                                    return false;

                                brs.quantifier.atleast = cstate.atleast_widths_of_brackets[backrefnoindex];

                                cstate.backref_used = true;
                            }
                            else
                            {
                                // The group is not closed before the backreference
                                // position: neutralise the state.
                                brs.type = st_epsilon;
                                brs.next2 = 0;
                            }
                            break;
                        }
                    }
                    else
                        // Scanned every state without finding the group.
                        return false;
                }
            }
        }
        return true;
    }

#if !defined(SRELLDBG_NO_1STCHRCLS)

    // Computes the set of characters collected by gather_nextchars() starting from
    // the state that NFA_states[0].next1 leads to. When gather_nextchars() reports
    // that the expression can match with zero length, the set is widened to every
    // code point.
    //
    // With bitset support (SRELLDBG_NO_BITSET not defined) the set is registered as
    // a new class whose number is stored in NFA_states[0].quantifier.atleast, and
    // set_bitset_table() is called; otherwise the set is written straight into
    // this->firstchar_class.
    //
    // NOTE(review): the target type of static_cast is absent in this text
    // (template arguments appear to have been lost); restore before compiling.
    void create_firstchar_class()
    {
#if !defined(SRELLDBG_NO_BITSET)
        range_pairs fcc;
#else
        range_pairs &fcc = this->firstchar_class;
#endif

        const bool canbe0length = gather_nextchars(fcc, static_cast(this->NFA_states[0].next1), 0u, false);

        if (canbe0length)
        {
            fcc.set_solerange(range_pair_helper(0, constants::unicode_max_codepoint));
            // Expressions would consist of assertions only, such as /^$/.
// We cannot but accept every codepoint. } #if !defined(SRELLDBG_NO_BITSET) this->NFA_states[0].quantifier.atleast = this->character_class.register_newclass(fcc); set_bitset_table(fcc); #endif } #if !defined(SRELLDBG_NO_BITSET) void set_bitset_table(const range_pairs &fcc) { for (typename range_pairs::size_type i = 0; i < fcc.size(); ++i) { const range_pair &range = fcc[i]; #if 0 uchar32 second = range.second <= constants::unicode_max_codepoint ? range.second : constants::unicode_max_codepoint; #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(push) #pragma warning(disable:4127) #endif if (utf_traits::utftype == 16) #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(pop) #endif { if (second >= 0x10000 && range.first < 0x10000) { this->firstchar_class_bs.set_range(utf_traits::firstcodeunit(0x10000) & utf_traits::bitsetmask, utf_traits::firstcodeunit(second) & utf_traits::bitsetmask); second = 0xffff; } } this->firstchar_class_bs.set_range(utf_traits::firstcodeunit(range.first) & utf_traits::bitsetmask, utf_traits::firstcodeunit(second) & utf_traits::bitsetmask); #else for (uchar32 ucp = range.first; ucp <= constants::unicode_max_codepoint; ++ucp) { this->firstchar_class_bs.set(utf_traits::firstcodeunit(ucp) & utf_traits::bitsetmask); if (ucp == range.second) break; } #endif } } #endif // !defined(SRELLDBG_NO_BITSET) #endif // !defined(SRELLDBG_NO_1STCHRCLS) bool gather_nextchars(range_pairs &nextcharclass, typename state_array::size_type pos, simple_array &checked, const uint_l32 bracket_number, const bool subsequent) const { bool canbe0length = false; for (;;) { const state_type &state = this->NFA_states[pos]; if (checked[pos]) break; checked[pos] = true; if (state.next2 && (state.type != st_check_counter || !state.quantifier.is_greedy || state.quantifier.atleast == 0) && (state.type != st_save_and_reset_counter) && (state.type != st_roundbracket_open) && (state.type != st_roundbracket_close || state.number != bracket_number) && (state.type != 
st_repeat_in_push) && (state.type != st_backreference || (state.quantifier.atleast == 0 && state.next1 != state.next2)) && (state.type != st_lookaround_open)) if (gather_nextchars(nextcharclass, pos + state.next2, checked, bracket_number, subsequent)) canbe0length = true; switch (state.type) { case st_character: if (!this->is_ricase()) { nextcharclass.join(range_pair_helper(state.character)); } else { uchar32 table[ucf_constants::rev_maxset] = {}; const uchar32 setnum = unicode_case_folding::casefoldedcharset(table, state.character); for (uchar32 j = 0; j < setnum; ++j) nextcharclass.join(range_pair_helper(table[j])); } return canbe0length; case st_character_class: nextcharclass.merge(this->character_class[state.number]); return canbe0length; case st_backreference: { const typename state_array::size_type nextpos = find_next1_of_bracketopen(state.number); const bool length0 = gather_nextchars(nextcharclass, nextpos, state.number, subsequent); if (!length0) return canbe0length; } break; case st_eol: case st_bol: if (!subsequent) break; //@fallthrough@ case st_boundary: if (subsequent) nextcharclass.set_solerange(range_pair_helper(0, constants::unicode_max_codepoint)); break; case st_lookaround_open: // if (!state.is_not && !state.reverse) if (!state.is_not && state.quantifier.atleast == 0) { gather_nextchars(nextcharclass, pos + 1, checked, 0u, subsequent); } else if (subsequent) nextcharclass.set_solerange(range_pair_helper(0, constants::unicode_max_codepoint)); break; case st_roundbracket_close: if (/* bracket_number == 0 || */ state.number != bracket_number) break; //@fallthrough@ case st_success: // == st_lookaround_close. 
return true; case st_check_counter: if (!state.quantifier.is_greedy && state.quantifier.atleast >= 1) return canbe0length; //@fallthrough@ default:; } if (state.next1) pos += state.next1; else break; } return canbe0length; } bool gather_nextchars(range_pairs &nextcharclass, const typename state_array::size_type pos, const uint_l32 bracket_number, const bool subsequent) const { simple_array checked; checked.resize(this->NFA_states.size(), false); return gather_nextchars(nextcharclass, pos, checked, bracket_number, subsequent); } typename state_array::size_type find_next1_of_bracketopen(const uint_l32 bracketno) const { for (typename state_array::size_type no = 0; no < this->NFA_states.size(); ++no) { const state_type &state = this->NFA_states[no]; if (state.type == st_roundbracket_open && state.number == bracketno) return no + state.next1; } return 0; } void relativejump_to_absolutejump() { for (typename state_array::size_type pos = 0; pos < this->NFA_states.size(); ++pos) { state_type &state = this->NFA_states[pos]; #if !defined(SRELLDBG_NO_ASTERISK_OPT) if (state.next1 || state.type == st_character || state.type == st_character_class) #else if (state.next1) #endif state.next_state1 = &this->NFA_states[pos + state.next1]; else state.next_state1 = NULL; if (state.next2) state.next_state2 = &this->NFA_states[pos + state.next2]; else state.next_state2 = NULL; } } void optimise() { #if !defined(SRELLDBG_NO_BRANCH_OPT2) && !defined(SRELLDBG_NO_ASTERISK_OPT) branch_optimisation2(); #endif #if !defined(SRELLDBG_NO_ASTERISK_OPT) asterisk_optimisation(); #endif #if !defined(SRELLDBG_NO_BRANCH_OPT) && !defined(SRELLDBG_NO_ASTERISK_OPT) branch_optimisation(); #endif #if !defined(SRELLDBG_NO_1STCHRCLS) create_firstchar_class(); #endif #if !defined(SRELLDBG_NO_SKIP_EPSILON) skip_epsilon(); #endif #if !defined(SRELLDBG_NO_CCPOS) set_charclass_posinfo(); #endif } #if !defined(SRELLDBG_NO_SKIP_EPSILON) void skip_epsilon() { for (typename state_array::size_type pos = 0; pos < 
this->NFA_states.size(); ++pos) { state_type &state = this->NFA_states[pos]; if (state.next1) state.next1 = static_cast(skip_nonbranch_epsilon(pos + state.next1) - pos); if (state.next2) state.next2 = static_cast(skip_nonbranch_epsilon(pos + state.next2) - pos); } } typename state_array::size_type skip_nonbranch_epsilon(typename state_array::size_type pos) const { for (;;) { const state_type &state = this->NFA_states[pos]; if (state.type == st_epsilon && state.next2 == 0) { pos += state.next1; continue; } break; } return pos; } #endif #if !defined(SRELLDBG_NO_ASTERISK_OPT) void asterisk_optimisation() { state_type *prevstate_is_astrskepsilon = NULL; const state_type *prevcharstate = NULL; state_size_type mnp_inspos = 0; bool inspos_updatable = true; #if !defined(SRELLDBG_NO_SPLITCC) bool inserted = false; #endif for (typename state_array::size_type cur = 1; cur < this->NFA_states.size(); ++cur) { state_type &curstate = this->NFA_states[cur]; switch (curstate.type) { case st_epsilon: if (curstate.character == meta_char::mc_astrsk) { prevstate_is_astrskepsilon = &curstate; } else { prevstate_is_astrskepsilon = NULL; inspos_updatable = false; } break; case st_character: case st_character_class: if (inspos_updatable) { if (prevcharstate) { if (prevcharstate->type != curstate.type || prevcharstate->number != curstate.number) inspos_updatable = false; } if (inspos_updatable) { if (prevstate_is_astrskepsilon) { inspos_updatable = false; if (prevstate_is_astrskepsilon->quantifier.is_asterisk_or_plus()) { mnp_inspos = cur + 1; } } } prevcharstate = &curstate; } if (prevstate_is_astrskepsilon) { const re_quantifier &eq = prevstate_is_astrskepsilon->quantifier; const state_size_type epsilonno = cur - 1; const state_size_type faroffset = eq.is_greedy ? 
prevstate_is_astrskepsilon->next2 : prevstate_is_astrskepsilon->next1; const state_size_type nextno = epsilonno + faroffset; #if !defined(SRELLDBG_NO_SPLITCC) const state_size_type origlen = this->NFA_states.size(); #endif if (is_exclusive_sequence(eq, cur, nextno)) { state_type &epsilonstate = this->NFA_states[epsilonno]; state_type &curstate2 = this->NFA_states[cur]; epsilonstate.next1 = 1; epsilonstate.next2 = 0; epsilonstate.number = 0; // curstate2.quantifier.is_greedy = true; if (epsilonstate.quantifier.is_infinity()) { curstate2.next1 = 0; curstate2.next2 = faroffset - 1; } else // ? or {0,1} { curstate2.next2 = faroffset - 1; } #if !defined(SRELLDBG_NO_SPLITCC) if (mnp_inspos == nextno && origlen != this->NFA_states.size()) inserted = true; #endif } prevstate_is_astrskepsilon = NULL; } break; default: prevstate_is_astrskepsilon = NULL; inspos_updatable = false; } } #if !defined(SRELLDBG_NO_NEXTPOS_OPT) if (mnp_inspos != 0) { state_size_type cur = mnp_inspos; if (this->NFA_states[cur].type != st_success) { const state_type &prevstate = this->NFA_states[cur - 1]; #if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) && !defined(SRELLDBG_NO_MPREWINDER) && !defined(SRELLDBG_NO_1STCHRCLS) && !defined(SRELLDBG_NO_BITSET) #if !defined(SRELLDBG_NO_SPLITCC) if (!inserted && prevstate.next1 == 0) #else if (prevstate.next1 == 0) #endif { range_pairs prevcc; range_pairs nextcc; // gather_if_char_or_cc_strict(prevcc, prevstate); if (prevstate.type == st_character) { prevcc.set_solerange(range_pair_helper(prevstate.character)); } else if (prevstate.type == st_character_class) { prevcc = this->character_class[prevstate.number]; } gather_nextchars(nextcc, cur, 0u, true); const uint_l32 cpnum_prevcc = prevcc.total_codepoints(); const uint_l32 cpnum_nextcc = nextcc.total_codepoints(); if (cpnum_nextcc != 0 && cpnum_nextcc < cpnum_prevcc) { state_array newNFAs; state_type atom; atom.reset(); atom.character = meta_char::mc_eq; // '=' atom.type = st_lookaround_open; atom.next1 = 
static_cast(cur - 1) * 2 + 2; atom.next2 = 1; atom.quantifier.atleast = 2; // Match point rewinder. newNFAs.append(1, atom); newNFAs.append(this->NFA_states, 1, cur - 1); atom.type = st_lookaround_close; atom.next1 = 0; atom.next2 = 0; newNFAs.append(1, atom); insert_at(1, newNFAs.size()); this->NFA_states.replace(1, newNFAs.size(), newNFAs); this->NFA_states[0].next2 = this->NFA_states[0].next1; this->NFA_states[0].next1 = 1; return; } } #endif // !defined(SRELL_FIXEDWIDTHLOOKBEHIND) && !defined(SRELLDBG_NO_MPREWINDER) && !defined(SRELLDBG_NO_1STCHRCLS) && !defined(SRELLDBG_NO_BITSET) insert_at(cur, 1); state_type &mnpstate = this->NFA_states[cur]; state_type &charstate = this->NFA_states[cur - 1]; mnpstate.type = st_move_nextpos; #if !defined(SRELLDBG_NO_SPLITCC) if (inserted) { charstate.next2 = 1; } else #endif if (charstate.next1 == 0) { mnpstate.next1 = charstate.next2 - 1; charstate.next2 = 1; } else { mnpstate.next1 = -2; charstate.next1 = 1; } } } #endif // !defined(SRELLDBG_NO_NEXTPOS_OPT) } bool is_exclusive_sequence(const re_quantifier &eq, const state_size_type curno, const state_size_type nextno) // const { const state_type &curstate = this->NFA_states[curno]; range_pairs curchar_class; range_pairs nextchar_class; if (curstate.type == st_character) { curchar_class.join(range_pair_helper(curstate.character)); } else if (curstate.type == st_character_class) { curchar_class = this->character_class[curstate.number]; if (curchar_class.size() == 0) // Means [], which always makes matching fail. return true; // For preventing the automaton from pushing bt data. 
} else { return false; } const bool canbe0length = gather_nextchars(nextchar_class, nextno, 0u, true); if (nextchar_class.size()) { if (!canbe0length || eq.is_greedy) { #if !defined(SRELLDBG_NO_SPLITCC) range_pairs kept; range_pairs removed; curchar_class.split_ranges(kept, removed, nextchar_class); if (removed.size() == 0) // !curchar_class.is_overlap(nextchar_class) return true; if (curstate.type == st_character_class && kept.size() && eq.is_infinity()) { { state_type &curstate2 = this->NFA_states[curno]; curstate2.character = kept.consists_of_one_character(this->is_icase()); if (curstate2.character != constants::invalid_u32value) curstate2.type = st_character; else curstate2.number = this->character_class.register_newclass(kept); } const re_quantifier backupeq(eq); insert_at(nextno, 2); state_type &n0 = this->NFA_states[nextno]; state_type &n1 = this->NFA_states[nextno + 1]; n0.reset(); n0.type = st_epsilon; n0.character = meta_char::mc_astrsk; n0.quantifier = backupeq; // n0.next2 = 1; n0.next2 = 2; if (!n0.quantifier.is_greedy) { n0.next1 = n0.next2; n0.next2 = 1; } n1.reset(); n1.type = st_character_class; n1.character = removed.consists_of_one_character(this->is_icase()); if (n1.character != constants::invalid_u32value) n1.type = st_character; else n1.number = this->character_class.register_newclass(removed); n1.next1 = -2; // n1.next2 = 0; return true; } #else // defined(SRELLDBG_NO_SPLITCC) if (!curchar_class.is_overlap(nextchar_class)) { return true; } #endif // !defined(SRELLDBG_NO_SPLITCC) } } else if (/* nextchar_class.size() == 0 && */ (!canbe0length || only_success_left(nextno))) { // (size() == 0 && !canbe0length) means []. 
return eq.is_greedy; } return false; } bool only_success_left(typename state_array::size_type pos) const { for (;;) { const state_type &state = this->NFA_states[pos]; switch (state.type) { case st_success: return true; case st_roundbracket_close: case st_backreference: if (state.next2 != 0 && state.next1 != state.next2) return false; break; case st_epsilon: if (state.next2 != 0 && !only_success_left(pos + state.next2)) return false; break; case st_roundbracket_open: break; // /a*()/ default: return false; } if (state.next1) pos += state.next1; else return false; } } #endif // !defined(SRELLDBG_NO_ASTERISK_OPT) void insert_at(const typename state_array::size_type pos, const std::ptrdiff_t len) { state_type newstate; for (typename state_array::size_type cur = 0; cur < pos; ++cur) { state_type &state = this->NFA_states[cur]; if (state.next1 && (cur + state.next1) >= pos) state.next1 += len; if (state.next2 && (cur + state.next2) >= pos) state.next2 += len; } for (typename state_array::size_type cur = pos; cur < this->NFA_states.size(); ++cur) { state_type &state = this->NFA_states[cur]; if ((cur + state.next1) < pos) state.next1 -= len; if ((cur + state.next2) < pos) state.next2 -= len; } newstate.reset(); newstate.type = st_epsilon; for (std::ptrdiff_t count = 0; count < len; ++count) this->NFA_states.insert(pos, newstate); } #if !defined(SRELLDBG_NO_NEXTPOS_OPT) #endif // !defined(SRELLDBG_NO_NEXTPOS_OPT) #if !defined(SRELLDBG_NO_BRANCH_OPT) || !defined(SRELLDBG_NO_BRANCH_OPT2) state_size_type gather_if_char_or_charclass(range_pairs &charclass, state_size_type pos, const bool strictly) const { for (;;) { const state_type &curstate = this->NFA_states[pos]; if (curstate.type == st_character && curstate.next2 == 0) { charclass.set_solerange(range_pair_helper(curstate.character)); return pos; } else if (curstate.type == st_character_class && curstate.next2 == 0) { charclass = this->character_class[curstate.number]; return pos; } else if (curstate.type == st_epsilon && 
curstate.next2 == 0 && !strictly) { } else break; pos += curstate.next1; } return 0; } #endif // !defined(SRELLDBG_NO_BRANCH_OPT) || !defined(SRELLDBG_NO_BRANCH_OPT2) #if !defined(SRELLDBG_NO_BRANCH_OPT) void branch_optimisation() { range_pairs nextcharclass1; for (typename state_array::size_type pos = 0; pos < this->NFA_states.size(); ++pos) { const state_type &state = this->NFA_states[pos]; if (state.is_branch()) { const typename state_array::size_type nextcharpos = gather_if_char_or_charclass(nextcharclass1, pos + state.next1, false); if (nextcharpos) { range_pairs nextcharclass2; const bool canbe0length = gather_nextchars(nextcharclass2, pos + state.next2, 0u /* bracket_number */, true); if (!canbe0length && !nextcharclass1.is_overlap(nextcharclass2)) { state_type &branch = this->NFA_states[pos]; state_type &next1 = this->NFA_states[nextcharpos]; next1.next2 = pos + branch.next2 - nextcharpos; branch.next2 = 0; } } } } } #endif // !defined(SRELLDBG_NO_BRANCH_OPT) #if !defined(SRELL_NO_ICASE) bool check_if_really_needs_icase_search() { uchar32 u32chars[ucf_constants::rev_maxset]; for (typename state_array::size_type i = 0; i < this->NFA_states.size(); ++i) { const state_type &state = this->NFA_states[i]; if (state.type == st_character) { if (unicode_case_folding::casefoldedcharset(u32chars, state.character) > 1) return true; } else if (state.type == st_backreference) return true; } // this->soflags &= ~regex_constants::icase; return false; } #endif // !defined(SRELL_NO_ICASE) #if !defined(SRELLDBG_NO_BMH) void setup_bmhdata() { simple_array u32s; for (typename state_array::size_type i = 1; i < this->NFA_states.size(); ++i) { const state_type &state = this->NFA_states[i]; if (state.type == st_character) u32s.push_backncr(state.character); else { u32s.clear(); break; } } if (u32s.size() > 1) // if ((u32s.size() > 1 && !this->is_ricase()) || (u32s.size() > 2 && this->is_ricase())) { if (this->bmdata) this->bmdata->clear(); else this->bmdata = new re_bmh; 
this->bmdata->setup(u32s, this->is_ricase()); return /* false */; } if (this->bmdata) delete this->bmdata; this->bmdata = NULL; // return true; } #endif // !defined(SRELLDBG_NO_BMH) #if !defined(SRELLDBG_NO_CCPOS) void set_charclass_posinfo() { this->character_class.finalise(); for (typename state_array::size_type i = 1; i < this->NFA_states.size(); ++i) { state_type &state = this->NFA_states[i]; if (state.type == st_character_class) { const range_pair &posinfo = this->character_class.charclasspos(state.number); state.quantifier.setccpos(posinfo.first, posinfo.second); } } } #endif // !defined(SRELLDBG_NO_CCPOS) #if !defined(SRELLDBG_NO_BRANCH_OPT2) void branch_optimisation2() { range_pairs basealt1stch; range_pairs nextalt1stch; for (state_size_type pos = 0; pos < this->NFA_states.size(); ++pos) { const state_type &curstate = this->NFA_states[pos]; if (curstate.is_branch()) { const state_size_type next1pos = pos + curstate.next1; state_size_type precharchainpos = pos; if (gather_if_char_or_charclass(basealt1stch, next1pos, true) != 0) { state_size_type next2pos = precharchainpos + curstate.next2; state_size_type postcharchainpos = 0; for (;;) { state_size_type next2next1pos = next2pos; state_type &nstate2 = this->NFA_states[next2pos]; state_size_type next2next2pos = 0; if (nstate2.is_branch()) { next2next2pos = next2pos + nstate2.next2; next2next1pos += nstate2.next1; } if (gather_if_char_or_charclass(nextalt1stch, next2next1pos, true) != 0) { const int relation = basealt1stch.relationship(nextalt1stch); if (relation == 0) { if (next2next2pos) // if (nstate2.is_branch()) { nstate2.reset(); nstate2.type = st_epsilon; } if (postcharchainpos == 0) { postcharchainpos = next1pos + 1; insert_at(postcharchainpos, 1); this->NFA_states[next1pos].next1 = 1; } else { const state_size_type prevbranchpos = postcharchainpos; postcharchainpos = prevbranchpos + this->NFA_states[prevbranchpos].next2; insert_at(postcharchainpos, 1); this->NFA_states[prevbranchpos].next2 = 
postcharchainpos - prevbranchpos; // Fix for bug210423. This line cannot be omitted, because // NFA_states[prevbranchpos].next2 has been incremented in insert_at(). } // if (next2next1pos >= postcharchainpos) ++next2next1pos; if (precharchainpos >= postcharchainpos) ++precharchainpos; state_type &prechainbranchpoint = this->NFA_states[precharchainpos]; if (next2next2pos) { // if (next2next2pos >= postcharchainpos) ++next2next2pos; prechainbranchpoint.next2 = next2next2pos - precharchainpos; } else { prechainbranchpoint.next2 = 0; } state_type &newbranchpoint = this->NFA_states[postcharchainpos]; newbranchpoint.character = meta_char::mc_bar; // newbranchpoint.next1 = 1; newbranchpoint.next2 = next2next1pos + this->NFA_states[next2next1pos].next1 - postcharchainpos; } else if (relation == 1) { break; } else precharchainpos = next2pos; } else { // Fix for bug210428. // Original: /mm2|m|mm/ // 1st step: /m(?:m2||m)/ <- No more optimisation can be performed. Must quit. // 2nd step: /mm(?:2||)/ <- BUG. break; } if (next2next2pos == 0) break; next2pos = next2next2pos; } } } } } #endif // !defined(SRELLDBG_NO_BRANCH_OPT2) public: // For debug. void print_NFA_states(const int) const; }; // re_compiler } // namespace regex_internal // ... "rei_compiler.hpp"] // ["regex_sub_match.hpp" ... // 28.9, class template sub_match: template class sub_match : public std::pair { public: typedef typename std::iterator_traits::value_type value_type; typedef typename std::iterator_traits::difference_type difference_type; typedef BidirectionalIterator iterator; typedef std::basic_string string_type; bool matched; // constexpr sub_match(); // C++11. sub_match() : matched(false) { } difference_type length() const { return matched ? std::distance(this->first, this->second) : 0; } operator string_type() const { return matched ? string_type(this->first, this->second) : string_type(); } string_type str() const { return matched ? 
string_type(this->first, this->second) : string_type(); } int compare(const sub_match &s) const { return str().compare(s.str()); } int compare(const string_type &s) const { return str().compare(s); } int compare(const value_type *const s) const { return str().compare(s); } }; // 28.9.2, sub_match non-member operators: // [7.9.2] sub_match non-member operators // Compares sub_match & with sub_match &. template bool operator==(const sub_match &lhs, const sub_match &rhs) { return lhs.compare(rhs) == 0; // 1 } template bool operator!=(const sub_match &lhs, const sub_match &rhs) { return lhs.compare(rhs) != 0; // 2 } template bool operator<(const sub_match &lhs, const sub_match &rhs) { return lhs.compare(rhs) < 0; // 3 } template bool operator<=(const sub_match &lhs, const sub_match &rhs) { return lhs.compare(rhs) <= 0; // 4 } template bool operator>=(const sub_match &lhs, const sub_match &rhs) { return lhs.compare(rhs) >= 0; // 5 } template bool operator>(const sub_match &lhs, const sub_match &rhs) { return lhs.compare(rhs) > 0; // 6 } // Compares basic_string & with sub_match &. template bool operator==( const std::basic_string::value_type, ST, SA> &lhs, const sub_match &rhs ) { return rhs.compare(lhs.c_str()) == 0; // 7 } template bool operator!=( const std::basic_string::value_type, ST, SA> &lhs, const sub_match &rhs ) { return !(lhs == rhs); // 8 } template bool operator<( const std::basic_string::value_type, ST, SA> &lhs, const sub_match &rhs ) { return rhs.compare(lhs.c_str()) > 0; // 9 } template bool operator>( const std::basic_string::value_type, ST, SA> &lhs, const sub_match &rhs ) { return rhs < lhs; // 10 } template bool operator>=( const std::basic_string::value_type, ST, SA> &lhs, const sub_match &rhs ) { return !(lhs < rhs); // 11 } template bool operator<=( const std::basic_string::value_type, ST, SA> &lhs, const sub_match &rhs ) { return !(rhs < lhs); // 12 } // Compares sub_match & with basic_string &. 
template bool operator==( const sub_match &lhs, const std::basic_string::value_type, ST, SA> &rhs ) { return lhs.compare(rhs.c_str()) == 0; // 13 } template bool operator!=( const sub_match &lhs, const std::basic_string::value_type, ST, SA> &rhs ) { return !(lhs == rhs); // 14 } template bool operator<( const sub_match &lhs, const std::basic_string::value_type, ST, SA> &rhs ) { return lhs.compare(rhs.c_str()) < 0; // 15 } template bool operator>( const sub_match &lhs, const std::basic_string::value_type, ST, SA> &rhs ) { return rhs < lhs; // 16 } template bool operator>=( const sub_match &lhs, const std::basic_string::value_type, ST, SA> &rhs ) { return !(lhs < rhs); // 17 } template bool operator<=( const sub_match &lhs, const std::basic_string::value_type, ST, SA> &rhs ) { return !(rhs < lhs); // 18 } // Compares iterator_traits::value_type * with sub_match &. template bool operator==( typename std::iterator_traits::value_type const *lhs, const sub_match &rhs ) { return rhs.compare(lhs) == 0; // 19 } template bool operator!=( typename std::iterator_traits::value_type const *lhs, const sub_match &rhs ) { return !(lhs == rhs); // 20 } template bool operator<( typename std::iterator_traits::value_type const *lhs, const sub_match &rhs ) { return rhs.compare(lhs) > 0; // 21 } template bool operator>( typename std::iterator_traits::value_type const *lhs, const sub_match &rhs ) { return rhs < lhs; // 22 } template bool operator>=( typename std::iterator_traits::value_type const *lhs, const sub_match &rhs ) { return !(lhs < rhs); // 23 } template bool operator<=( typename std::iterator_traits::value_type const *lhs, const sub_match &rhs ) { return !(rhs < lhs); // 24 } // Compares sub_match & with iterator_traits::value_type *. 
template bool operator==( const sub_match &lhs, typename std::iterator_traits::value_type const *rhs ) { return lhs.compare(rhs) == 0; // 25 } template bool operator!=( const sub_match &lhs, typename std::iterator_traits::value_type const *rhs ) { return !(lhs == rhs); // 26 } template bool operator<( const sub_match &lhs, typename std::iterator_traits::value_type const *rhs ) { return lhs.compare(rhs) < 0; // 27 } template bool operator>( const sub_match &lhs, typename std::iterator_traits::value_type const *rhs ) { return rhs < lhs; // 28 } template bool operator>=( const sub_match &lhs, typename std::iterator_traits::value_type const *rhs ) { return !(lhs < rhs); // 29 } template bool operator<=( const sub_match &lhs, typename std::iterator_traits::value_type const *rhs ) { return !(rhs < lhs); // 30 } // Compares iterator_traits::value_type & with sub_match &. template bool operator==( typename std::iterator_traits::value_type const &lhs, const sub_match &rhs ) { return rhs.compare(typename sub_match::string_type(1, lhs)) == 0; // 31 } template bool operator!=( typename std::iterator_traits::value_type const &lhs, const sub_match &rhs ) { return !(lhs == rhs); // 32 } template bool operator<( typename std::iterator_traits::value_type const &lhs, const sub_match &rhs ) { return rhs.compare(typename sub_match::string_type(1, lhs)) > 0; // 33 } template bool operator>( typename std::iterator_traits::value_type const &lhs, const sub_match &rhs ) { return rhs < lhs; // 34 } template bool operator>=( typename std::iterator_traits::value_type const &lhs, const sub_match &rhs ) { return !(lhs < rhs); // 35 } template bool operator<=( typename std::iterator_traits::value_type const &lhs, const sub_match &rhs ) { return !(rhs < lhs); // 36 } // Compares sub_match & with iterator_traits::value_type &. 
template bool operator==( const sub_match &lhs, typename std::iterator_traits::value_type const &rhs ) { return lhs.compare(typename sub_match::string_type(1, rhs)) == 0; // 37 } template bool operator!=( const sub_match &lhs, typename std::iterator_traits::value_type const &rhs ) { return !(lhs == rhs); // 38 } template bool operator<( const sub_match &lhs, typename std::iterator_traits::value_type const &rhs ) { return lhs.compare(typename sub_match::string_type(1, rhs)) < 0; // 39 } template bool operator>( const sub_match &lhs, typename std::iterator_traits::value_type const &rhs ) { return rhs < lhs; // 40 } template bool operator>=( const sub_match &lhs, typename std::iterator_traits::value_type const &rhs ) { return !(lhs < rhs); // 41 } template bool operator<=( const sub_match &lhs, typename std::iterator_traits::value_type const &rhs ) { return !(rhs < lhs); // 42 } template std::basic_ostream &operator<<(std::basic_ostream &os, const sub_match &m) { return (os << m.str()); } typedef sub_match csub_match; typedef sub_match wcsub_match; typedef sub_match ssub_match; typedef sub_match wssub_match; #if defined(SRELL_CPP11_CHAR1632_ENABLED) typedef sub_match u16csub_match; typedef sub_match u32csub_match; typedef sub_match u16ssub_match; typedef sub_match u32ssub_match; #endif #if defined(SRELL_CPP20_CHAR8_ENABLED) typedef sub_match u8csub_match; #endif #if defined(SRELL_CPP20_CHAR8_ENABLED) && SRELL_CPP20_CHAR8_ENABLED >= 2 typedef sub_match u8ssub_match; #endif typedef csub_match u8ccsub_match; typedef ssub_match u8cssub_match; #if !defined(SRELL_CPP20_CHAR8_ENABLED) typedef u8ccsub_match u8csub_match; #endif #if !defined(SRELL_CPP20_CHAR8_ENABLED) || SRELL_CPP20_CHAR8_ENABLED < 2 typedef u8cssub_match u8ssub_match; #endif #if defined(WCHAR_MAX) #if WCHAR_MAX >= 0x10ffff typedef wcsub_match u32wcsub_match; typedef wssub_match u32wssub_match; typedef u32wcsub_match u1632wcsub_match; typedef u32wssub_match u1632wssub_match; #elif WCHAR_MAX >= 0xffff typedef 
wcsub_match u16wcsub_match; typedef wssub_match u16wssub_match; typedef u16wcsub_match u1632wcsub_match; typedef u16wssub_match u1632wssub_match; #endif #endif // ... "regex_sub_match.hpp"] // ["regex_match_results.hpp" ... // 28.10, class template match_results: template > > class match_results { public: typedef sub_match value_type; typedef const value_type & const_reference; typedef const_reference reference; // typedef implementation defined const_iterator; typedef typename std::vector::const_iterator const_iterator; typedef const_iterator iterator; typedef typename std::iterator_traits::difference_type difference_type; #if defined(__cplusplus) && __cplusplus >= 201103L typedef typename std::allocator_traits::size_type size_type; #else typedef typename Allocator::size_type size_type; // TR1. #endif typedef Allocator allocator_type; typedef typename std::iterator_traits::value_type char_type; typedef std::basic_string string_type; public: // 28.10.1, construct/copy/destroy: // [7.10.1] construct/copy/destroy explicit match_results(const Allocator &a = Allocator()) : ready_(false), sub_matches_(a) { } match_results(const match_results &m) { operator=(m); } #if defined(SRELL_CPP11_MOVE_ENABLED) match_results(match_results &&m) SRELL_NOEXCEPT { operator=(std::move(m)); } #endif match_results &operator=(const match_results &m) { if (this != &m) { // this->sstate_ = m.sstate_; this->ready_ = m.ready_; this->sub_matches_ = m.sub_matches_; this->prefix_ = m.prefix_; this->suffix_ = m.suffix_; this->base_ = m.base_; #if !defined(SRELL_NO_NAMEDCAPTURE) this->gnames_ = m.gnames_; #endif } return *this; } #if defined(SRELL_CPP11_MOVE_ENABLED) match_results &operator=(match_results &&m) SRELL_NOEXCEPT { if (this != &m) { // this->sstate_ = std::move(m.sstate_); this->ready_ = m.ready_; this->sub_matches_ = std::move(m.sub_matches_); this->prefix_ = std::move(m.prefix_); this->suffix_ = std::move(m.suffix_); this->base_ = m.base_; #if !defined(SRELL_NO_NAMEDCAPTURE) 
this->gnames_ = std::move(m.gnames_); #endif } return *this; } #endif // ~match_results(); // 28.10.2, state: bool ready() const { return ready_; } // 28.10.3, size: // [7.10.2] size size_type size() const { return sub_matches_.size(); } size_type max_size() const { return sub_matches_.max_size(); // return static_cast(~0) / sizeof (value_type); } bool empty() const { return size() == 0; } // 28.10.4, element access: // [7.10.3] element access difference_type length(const size_type sub = 0) const { return (*this)[sub].length(); } difference_type position(const size_type sub = 0) const { const_reference ref = (*this)[sub]; return std::distance(base_, ref.first); } string_type str(const size_type sub = 0) const { return string_type((*this)[sub]); } const_reference operator[](const size_type n) const { #if defined(SRELL_STRICT_IMPL) return n < sub_matches_.size() ? sub_matches_[n] : unmatched_; #else return sub_matches_[n]; #endif } #if !defined(SRELL_NO_NAMEDCAPTURE) // Helpers for overload resolution of the integer literal 0 of signed types. 
template difference_type length(const IntegerType zero) const { return length(static_cast(zero)); } template difference_type position(const IntegerType zero) const { return position(static_cast(zero)); } template string_type str(const IntegerType zero) const { return str(static_cast(zero)); } template const_reference operator[](const IntegerType zero) const { return operator[](static_cast(zero)); } difference_type length(const string_type &sub) const { return (*this)[sub].length(); } difference_type position(const string_type &sub) const { const_reference ref = (*this)[sub]; return std::distance(base_, ref.first); } string_type str(const string_type &sub) const { return string_type((*this)[sub]); } const_reference operator[](const string_type &sub) const { return sub_matches_[lookup_and_check_backref_number(sub.c_str(), sub.c_str() + sub.size())]; } difference_type length(const char_type *sub) const { return (*this)[sub].length(); } difference_type position(const char_type *sub) const { const_reference ref = (*this)[sub]; return std::distance(base_, ref.first); } string_type str(const char_type *sub) const { return string_type((*this)[sub]); } const_reference operator[](const char_type *sub) const { return sub_matches_[lookup_and_check_backref_number(sub, sub + std::char_traits::length(sub))]; } #endif // !defined(SRELL_NO_NAMEDCAPTURE) const_reference prefix() const { return prefix_; } const_reference suffix() const { return suffix_; } const_iterator begin() const { return sub_matches_.begin(); } const_iterator end() const { return sub_matches_.end(); } const_iterator cbegin() const { return sub_matches_.begin(); } const_iterator cend() const { return sub_matches_.end(); } // 28.10.5, format: // [7.10.4] format template OutputIter format( OutputIter out, const char_type *fmt_first, const char_type *const fmt_last, regex_constants::match_flag_type /* flags */ = regex_constants::format_default ) const { if (this->ready() && !this->empty()) { #if 
!defined(SRELL_NO_NAMEDCAPTURE) const bool no_groupnames = gnames_.size() == 0; #endif const value_type &m0 = (*this)[0]; while (fmt_first != fmt_last) { if (*fmt_first != static_cast(regex_internal::meta_char::mc_dollar)) // '$' { *out++ = *fmt_first++; } else { ++fmt_first; if (fmt_first == fmt_last) { *out++ = regex_internal::meta_char::mc_dollar; // '$'; } else if (*fmt_first == static_cast(regex_internal::char_other::co_amp)) // '&', $& { out = std::copy(m0.first, m0.second, out); ++fmt_first; } else if (*fmt_first == static_cast(regex_internal::char_other::co_grav)) // '`', $`, prefix. { out = std::copy(this->prefix().first, this->prefix().second, out); ++fmt_first; } else if (*fmt_first == static_cast(regex_internal::char_other::co_apos)) // '\'', $', suffix. { out = std::copy(this->suffix().first, this->suffix().second, out); ++fmt_first; } #if !defined(SRELL_NO_NAMEDCAPTURE) else if (*fmt_first == static_cast(regex_internal::meta_char::mc_lt) && !no_groupnames) // '<', $< { const char_type *const current_backup = fmt_first; bool replaced = false; if (++fmt_first == fmt_last) ; // Do nothing. 
else { const char_type *const name_begin = fmt_first; for (;; ++fmt_first) { if (*fmt_first == static_cast(regex_internal::meta_char::mc_gt)) { const regex_internal::uint_l32 backref_number = lookup_backref_number(name_begin, fmt_first); if (backref_number != regex_internal::groupname_mapper::notfound) { const value_type &mn = (*this)[backref_number]; if (mn.matched) out = std::copy(mn.first, mn.second, out); // replaced = true; } replaced = true; ++fmt_first; break; } if (fmt_first == fmt_last) break; } } if (!replaced) { fmt_first = current_backup; *out++ = regex_internal::meta_char::mc_dollar; // '$'; } } #endif // !defined(SRELL_NO_NAMEDCAPTURE) else { const char_type *const backup_pos = fmt_first; size_type backref_number = 0; if (fmt_first != fmt_last && *fmt_first >= static_cast(regex_internal::char_alnum::ch_0) && *fmt_first <= static_cast(regex_internal::char_alnum::ch_9)) // '0'-'9' { backref_number += *fmt_first - regex_internal::char_alnum::ch_0; // '0'; if (++fmt_first != fmt_last && *fmt_first >= static_cast(regex_internal::char_alnum::ch_0) && *fmt_first <= static_cast(regex_internal::char_alnum::ch_9)) // '0'-'9' { backref_number *= 10; backref_number += *fmt_first - regex_internal::char_alnum::ch_0; // '0'; ++fmt_first; } } if (backref_number && backref_number < this->size()) { const value_type &mn = (*this)[backref_number]; if (mn.matched) out = std::copy(mn.first, mn.second, out); } else { *out++ = regex_internal::meta_char::mc_dollar; // '$'; fmt_first = backup_pos; if (*fmt_first == static_cast(regex_internal::meta_char::mc_dollar)) ++fmt_first; } } } } } return out; } template OutputIter format( OutputIter out, const std::basic_string &fmt, regex_constants::match_flag_type flags = regex_constants::format_default ) const { return format(out, fmt.data(), fmt.data() + fmt.size(), flags); } template std::basic_string format( const string_type &fmt, regex_constants::match_flag_type flags = regex_constants::format_default ) const { std::basic_string 
result; // format(std::back_insert_iterator(result), fmt, flags); format(std::back_inserter(result), fmt, flags); return result; } string_type format(const char_type *fmt, regex_constants::match_flag_type flags = regex_constants::format_default) const { string_type result; format(std::back_inserter(result), fmt, fmt + std::char_traits::length(fmt), flags); return result; } // 28.10.6, allocator: // [7.10.5] allocator allocator_type get_allocator() const { return allocator_type(); } // 28.10.7, swap: // [7.10.6] swap void swap(match_results &that) { const match_results tmp(that); that = *this; *this = tmp; } public: // For internal. typedef match_results match_results_type; typedef typename match_results_type::size_type match_results_size_type; typedef typename regex_internal::re_search_state search_state_type; search_state_type sstate_; void clear_() { ready_ = false; sub_matches_.clear(); // prefix_.matched = false; // suffix_.matched = false; #if !defined(SRELL_NO_NAMEDCAPTURE) gnames_.clear(); #endif } // template #if !defined(SRELL_NO_NAMEDCAPTURE) bool set_match_results_(const regex_internal::groupname_mapper &gnames) #else bool set_match_results_() #endif { sub_matches_.resize(sstate_.bracket.size()); // value_type &m0 = sub_matches_[0]; sub_matches_[0].matched = true; for (regex_internal::uint_l32 i = 1; i < static_cast(sstate_.bracket.size()); ++i) { const typename search_state_type::submatch_type &br = sstate_.bracket[i]; value_type &sm = sub_matches_[i]; sm.first = br.core.open_at; sm.second = br.core.close_at; sm.matched = br.counter != 0; } base_ = sstate_.lblim; prefix_.first = sstate_.srchbegin; prefix_.second = sub_matches_[0].first = sstate_.bracket[0].core.open_at; suffix_.first = sub_matches_[0].second = sstate_.nth.in_string; suffix_.second = sstate_.srchend; prefix_.matched = prefix_.first != prefix_.second; // The spec says prefix().first != prefix().second suffix_.matched = suffix_.first != suffix_.second; // The spec says suffix().first != 
suffix().second #if !defined(SRELL_NO_NAMEDCAPTURE) gnames_ = gnames; #endif ready_ = true; return true; } bool set_match_results_bmh_() { sub_matches_.resize(1); // value_type &m0 = sub_matches_[0]; sub_matches_[0].matched = true; base_ = sstate_.lblim; prefix_.first = sstate_.srchbegin; prefix_.second = sub_matches_[0].first = sstate_.nth.in_string; suffix_.first = sub_matches_[0].second = sstate_.nextpos; suffix_.second = sstate_.srchend; prefix_.matched = prefix_.first != prefix_.second; suffix_.matched = suffix_.first != suffix_.second; ready_ = true; return true; } void set_prefix_first_(const BidirectionalIterator pf) { prefix_.first = pf; } bool mark_as_failed_() { ready_ = true; // 30.11.2 and 3: Postconditions: m.ready() == true in all cases. return false; } private: #if !defined(SRELL_NO_NAMEDCAPTURE) regex_internal::uint_l32 lookup_backref_number(const char_type *begin, const char_type *const end) const { typename regex_internal::groupname_mapper::gname_string key(end - begin); for (std::size_t i = 0; begin != end; ++begin, ++i) key[i] = *begin; return gnames_[key]; } regex_internal::uint_l32 lookup_and_check_backref_number(const char_type *begin, const char_type *const end) const { const regex_internal::uint_l32 backrefno = lookup_backref_number(begin, end); if (backrefno == regex_internal::groupname_mapper::notfound) throw regex_error(regex_constants::error_backref); return backrefno; } #endif // !defined(SRELL_NO_NAMEDCAPTURE) public: // For debug. 
template void print_sub_matches(const BasicRegexT &, const int) const; void print_addresses(const value_type &, const char *const) const; private: typedef std::vector sub_match_array; bool ready_; sub_match_array sub_matches_; value_type prefix_; value_type suffix_; BidirectionalIterator base_; #if !defined(SRELL_NO_NAMEDCAPTURE) regex_internal::groupname_mapper gnames_; #endif #if defined(SRELL_STRICT_IMPL) value_type unmatched_; #endif }; // 28.10.7, match_results swap: // [7.10.6] match_results swap template void swap( match_results &m1, match_results &m2 ) { m1.swap(m2); } // 28.10.8, match_results comparisons template bool operator==( const match_results &m1, const match_results &m2 ) { if (!m1.ready() && !m2.ready()) return true; if (m1.ready() && m2.ready()) { if (m1.empty() && m2.empty()) return true; if (!m1.empty() && !m2.empty()) { return m1.prefix() == m2.prefix() && m1.size() == m2.size() && std::equal(m1.begin(), m1.end(), m2.begin()) && m1.suffix() == m2.suffix(); } } return false; } template bool operator!=( const match_results &m1, const match_results &m2 ) { return !(m1 == m2); } typedef match_results cmatch; typedef match_results wcmatch; typedef match_results smatch; typedef match_results wsmatch; #if defined(SRELL_CPP11_CHAR1632_ENABLED) typedef match_results u16cmatch; typedef match_results u32cmatch; typedef match_results u16smatch; typedef match_results u32smatch; #endif #if defined(SRELL_CPP20_CHAR8_ENABLED) typedef match_results u8cmatch; #endif #if defined(SRELL_CPP20_CHAR8_ENABLED) && SRELL_CPP20_CHAR8_ENABLED >= 2 typedef match_results u8smatch; #endif typedef cmatch u8ccmatch; typedef smatch u8csmatch; #if !defined(SRELL_CPP20_CHAR8_ENABLED) typedef u8ccmatch u8cmatch; #endif #if !defined(SRELL_CPP20_CHAR8_ENABLED) || SRELL_CPP20_CHAR8_ENABLED < 2 typedef u8csmatch u8smatch; #endif #if defined(WCHAR_MAX) #if WCHAR_MAX >= 0x10ffff typedef wcmatch u32wcmatch; typedef wsmatch u32wsmatch; typedef u32wcmatch u1632wcmatch; typedef u32wsmatch 
u1632wsmatch; #elif WCHAR_MAX >= 0xffff typedef wcmatch u16wcmatch; typedef wsmatch u16wsmatch; typedef u16wcmatch u1632wcmatch; typedef u16wsmatch u1632wsmatch; #endif #endif // ... "regex_match_results.hpp"] // ["rei_algorithm.hpp" ... namespace regex_internal { template class regex_object : public re_compiler { public: template bool search ( const BidirectionalIterator begin, const BidirectionalIterator end, const BidirectionalIterator lookbehind_limit, match_results &results, const regex_constants::match_flag_type flags /* = regex_constants::match_default */ ) const { results.clear_(); // results.sstate_.template init(begin, end, lookbehind_limit, flags); results.sstate_.init(begin, end, lookbehind_limit, flags); if (results.sstate_.match_continuous_flag()) { if (this->NFA_states.size()) { results.sstate_.set_entrypoint(this->NFA_states[0].next_state2); goto DO_SEARCH; } } else #if !defined(SRELLDBG_NO_BMH) if (this->bmdata) { #if !defined(SRELL_NO_ICASE) if (!this->is_ricase() ? this->bmdata->do_casesensitivesearch(results.sstate_, typename std::iterator_traits::iterator_category()) : this->bmdata->do_icasesearch(results.sstate_, typename std::iterator_traits::iterator_category())) #else if (this->bmdata->do_casesensitivesearch(results.sstate_, typename std::iterator_traits::iterator_category())) #endif return results.set_match_results_bmh_(); } else #endif if (this->NFA_states.size()) { results.sstate_.set_entrypoint(this->NFA_states[0].next_state1); DO_SEARCH: results.sstate_.init_for_automaton(this->number_of_brackets, this->number_of_counters, this->number_of_repeats); #if !defined(SRELL_NO_ICASE) if (!this->is_ricase() ? 
do_search(results) : do_search(results)) #else if (do_search(results)) #endif { #if !defined(SRELL_NO_NAMEDCAPTURE) return results.set_match_results_(this->namedcaptures); #else return results.set_match_results_(); #endif } } return results.mark_as_failed_(); } private: typedef typename traits::utf_traits utf_traits; template bool do_search ( match_results &results ) const { re_search_state &sstate = results.sstate_; const BidirectionalIterator searchend = sstate.nth.in_string; for (;;) { const bool final = sstate.nextpos == searchend; sstate.nth.in_string = sstate.nextpos; if (!final) { #ifdef SRELLDBG_NO_1STCHRCLS utf_traits::codepoint_inc(sstate.nextpos, sstate.srchend); #else { #if !defined(SRELLDBG_NO_BITSET) if (!this->firstchar_class_bs.test((*sstate.nextpos++) & utf_traits::bitsetmask)) #else const uchar32 firstchar = utf_traits::codepoint_inc(sstate.nextpos, sstate.srchend); if (!this->firstchar_class.is_included(firstchar)) #endif continue; } #endif } // Even when final == true, we have to try for such expressions // as "" =~ /^$/ or "..." =~ /$/. 
#if defined(SRELL_NO_LIMIT_COUNTER) sstate.reset(/* first */); #else sstate.reset(/* first, */ this->limit_counter); #endif if (run_automaton(sstate /* , false */)) return true; if (final) break; } return false; } template struct casehelper { static T canonicalise(const T t) { return t; } }; template struct casehelper { static T canonicalise(const T t) { return unicode_case_folding::do_casefolding(t); } }; template bool run_automaton ( // match_results &results, re_search_state &sstate // , const bool is_recursive /* = false */ ) const { typedef casehelper casehelper_type; typedef typename re_object_core::state_type state_type; typedef re_search_state ss_type; // typedef typename ss_type::search_core_state scstate_type; typedef typename ss_type::submatch_type submatch_type; typedef typename ss_type::submatchcore_type submatchcore_type; typedef typename ss_type::counter_type counter_type; typedef typename ss_type::position_type position_type; bool is_matched; goto START; JUDGE: if (is_matched) { MATCHED: sstate.nth.in_NFA_states = sstate.nth.in_NFA_states->next_state1; } else { NOT_MATCHED: #if !defined(SRELL_NO_LIMIT_COUNTER) if (--sstate.failure_counter) { #endif if (sstate.bt_stack.size() > sstate.btstack_size) { sstate.nth = sstate.bt_stack.back(); sstate.bt_stack.pop_back(); sstate.nth.in_NFA_states = sstate.nth.in_NFA_states->next_state2; // continue; } else { return false; } #if !defined(SRELL_NO_LIMIT_COUNTER) } else throw regex_error(regex_constants::error_complexity); #endif } // START: for (;;) { START: const state_type ¤t_NFA = *sstate.nth.in_NFA_states; switch (current_NFA.type) { case st_character: #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(push) #pragma warning(disable:4127) #endif if (!reverse) #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(pop) #endif { if (!sstate.is_at_srchend()) { #if !defined(SRELLDBG_NO_ASTERISK_OPT) const BidirectionalIterator prevpos = sstate.nth.in_string; #endif const uchar32 uchar = 
casehelper_type::canonicalise(utf_traits::codepoint_inc(sstate.nth.in_string, sstate.srchend)); RETRY_CF: const state_type ¤t_NFA2 = *sstate.nth.in_NFA_states; if (current_NFA2.character == uchar) goto MATCHED; #if !defined(SRELLDBG_NO_ASTERISK_OPT) if (current_NFA2.next_state2) { sstate.nth.in_NFA_states = current_NFA2.next_state2; if (sstate.nth.in_NFA_states->type == st_character) goto RETRY_CF; sstate.nth.in_string = prevpos; continue; } #endif } #if !defined(SRELLDBG_NO_ASTERISK_OPT) else if (current_NFA.next_state2) { sstate.nth.in_NFA_states = current_NFA.next_state2; continue; } #endif } else // reverse == true. { if (!sstate.is_at_lookbehindlimit()) { #if !defined(SRELLDBG_NO_ASTERISK_OPT) const BidirectionalIterator prevpos = sstate.nth.in_string; #endif const uchar32 uchar = casehelper_type::canonicalise(utf_traits::dec_codepoint(sstate.nth.in_string, sstate.lblim)); RETRY_CB: const state_type ¤t_NFA2 = *sstate.nth.in_NFA_states; if (current_NFA2.character == uchar) goto MATCHED; #if !defined(SRELLDBG_NO_ASTERISK_OPT) if (current_NFA2.next_state2) { sstate.nth.in_NFA_states = current_NFA2.next_state2; if (sstate.nth.in_NFA_states->type == st_character) goto RETRY_CB; sstate.nth.in_string = prevpos; continue; } #endif } #if !defined(SRELLDBG_NO_ASTERISK_OPT) else if (current_NFA.next_state2) { sstate.nth.in_NFA_states = current_NFA.next_state2; continue; } #endif } goto NOT_MATCHED; case st_character_class: #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(push) #pragma warning(disable:4127) #endif if (!reverse) #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(pop) #endif { if (!sstate.is_at_srchend()) { #if !defined(SRELLDBG_NO_ASTERISK_OPT) const BidirectionalIterator prevpos = sstate.nth.in_string; #endif const uchar32 uchar = utf_traits::codepoint_inc(sstate.nth.in_string, sstate.srchend); // RETRY_CCF: const state_type ¤t_NFA2 = *sstate.nth.in_NFA_states; #if !defined(SRELLDBG_NO_CCPOS) if 
(this->character_class.is_included(current_NFA2.quantifier.offset, current_NFA2.quantifier.length, uchar)) #else if (this->character_class.is_included(current_NFA2.number, uchar)) #endif goto MATCHED; #if !defined(SRELLDBG_NO_ASTERISK_OPT) if (current_NFA2.next_state2) { sstate.nth.in_NFA_states = current_NFA2.next_state2; // if (sstate.nth.in_NFA_states->type == st_character_class) // goto RETRY_CCF; sstate.nth.in_string = prevpos; continue; } #endif } #if !defined(SRELLDBG_NO_ASTERISK_OPT) else if (current_NFA.next_state2) { sstate.nth.in_NFA_states = current_NFA.next_state2; continue; } #endif } else // reverse == true. { if (!sstate.is_at_lookbehindlimit()) { #if !defined(SRELLDBG_NO_ASTERISK_OPT) const BidirectionalIterator prevpos = sstate.nth.in_string; #endif const uchar32 uchar = utf_traits::dec_codepoint(sstate.nth.in_string, sstate.lblim); // RETRY_CCB: const state_type ¤t_NFA2 = *sstate.nth.in_NFA_states; #if !defined(SRELLDBG_NO_CCPOS) if (this->character_class.is_included(current_NFA2.quantifier.offset, current_NFA2.quantifier.length, uchar)) #else if (this->character_class.is_included(current_NFA2.number, uchar)) #endif goto MATCHED; #if !defined(SRELLDBG_NO_ASTERISK_OPT) if (current_NFA2.next_state2) { sstate.nth.in_NFA_states = current_NFA2.next_state2; // if (sstate.nth.in_NFA_states->type == st_character_class) // goto RETRY_CCB; sstate.nth.in_string = prevpos; continue; } #endif } #if !defined(SRELLDBG_NO_ASTERISK_OPT) else if (current_NFA.next_state2) { sstate.nth.in_NFA_states = current_NFA.next_state2; continue; } #endif } goto NOT_MATCHED; case st_epsilon: #if defined(SRELLDBG_NO_SKIP_EPSILON) if (current_NFA.next_state2) #endif { sstate.bt_stack.push_back(sstate.nth); // sstate.push(); } sstate.nth.in_NFA_states = current_NFA.next_state1; continue; default: switch (current_NFA.type) { case st_check_counter: { const uint_l32 counter = sstate.counter[current_NFA.number]; if (counter < current_NFA.quantifier.atmost) { 
++sstate.counter[current_NFA.number]; LOOP_WITHOUT_INCREMENT: if (counter >= current_NFA.quantifier.atleast) { sstate.bt_stack.push_back(sstate.nth); sstate.nth.in_NFA_states = current_NFA.next_state1; } else { sstate.nth.in_NFA_states = current_NFA.quantifier.is_greedy ? current_NFA.next_state1 : current_NFA.next_state2; } } else { if (current_NFA.quantifier.is_infinity()) goto LOOP_WITHOUT_INCREMENT; sstate.nth.in_NFA_states = current_NFA.quantifier.is_greedy ? current_NFA.next_state2 : current_NFA.next_state1; } } continue; case st_decrement_counter: --sstate.counter[current_NFA.number]; goto NOT_MATCHED; case st_save_and_reset_counter: { counter_type &c = sstate.counter[current_NFA.number]; sstate.counter_stack.push_back(c); sstate.bt_stack.push_back(sstate.nth); c = 0; } goto MATCHED; case st_restore_counter: sstate.counter[current_NFA.number] = sstate.counter_stack.back(); sstate.counter_stack.pop_back(); goto NOT_MATCHED; case st_roundbracket_open: // '(': { submatch_type &bracket = sstate.bracket[current_NFA.number]; sstate.capture_stack.push_back(bracket.core); #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(push) #pragma warning(disable:4127) #endif if (!reverse) #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(pop) #endif { bracket.core.open_at = sstate.nth.in_string; } else bracket.core.close_at = sstate.nth.in_string; ++bracket.counter; for (uint_l32 brno = current_NFA.quantifier.atleast; brno <= current_NFA.quantifier.atmost; ++brno) { submatch_type &inner_bracket = sstate.bracket[brno]; sstate.capture_stack.push_back(inner_bracket.core); sstate.counter_stack.push_back(inner_bracket.counter); inner_bracket.core.open_at = inner_bracket.core.close_at = sstate.srchend; inner_bracket.counter = 0; // ECMAScript spec (3-5.1) 15.10.2.5, NOTE 3. // ECMAScript 2018 (ES9) 21.2.2.5.1, Note 3. 
} sstate.bt_stack.push_back(sstate.nth); } goto MATCHED; case st_roundbracket_pop: // '/': { for (uint_l32 brno = current_NFA.quantifier.atmost; brno >= current_NFA.quantifier.atleast; --brno) { submatch_type &inner_bracket = sstate.bracket[brno]; inner_bracket.counter = sstate.counter_stack.back(); inner_bracket.core = sstate.capture_stack.back(); sstate.counter_stack.pop_back(); sstate.capture_stack.pop_back(); } submatch_type &bracket = sstate.bracket[current_NFA.number]; bracket.core = sstate.capture_stack.back(); sstate.capture_stack.pop_back(); --bracket.counter; } goto NOT_MATCHED; case st_roundbracket_close: // ')': { submatch_type &bracket = sstate.bracket[current_NFA.number]; submatchcore_type &brc = bracket.core; if ((!reverse ? brc.open_at : brc.close_at) != sstate.nth.in_string) { sstate.nth.in_NFA_states = current_NFA.next_state1; } else // 0 width match, breaks from the loop. { if (current_NFA.next_state1->type != st_check_counter) { if (bracket.counter > 1) goto NOT_MATCHED; // ECMAScript spec 15.10.2.5, note 4. sstate.nth.in_NFA_states = current_NFA.next_state2; // Accepts 0 width match and exits. } else { // A pair with check_counter. const counter_type counter = sstate.counter[current_NFA.next_state1->number]; if (counter > current_NFA.next_state1->quantifier.atleast) goto NOT_MATCHED; // Takes a captured string in the previous loop. sstate.nth.in_NFA_states = current_NFA.next_state1; // Accepts 0 width match and continues. } } #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(push) #pragma warning(disable:4127) #endif if (!reverse) #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(pop) #endif { brc.close_at = sstate.nth.in_string; } else // reverse == true. 
{ brc.open_at = sstate.nth.in_string; } } continue; case st_repeat_in_push: { position_type &r = sstate.repeat[current_NFA.number]; sstate.repeat_stack.push_back(r); r = sstate.nth.in_string; for (uint_l32 brno = current_NFA.quantifier.atleast; brno <= current_NFA.quantifier.atmost; ++brno) { submatch_type &inner_bracket = sstate.bracket[brno]; sstate.capture_stack.push_back(inner_bracket.core); sstate.counter_stack.push_back(inner_bracket.counter); inner_bracket.core.open_at = inner_bracket.core.close_at = sstate.srchend; inner_bracket.counter = 0; // ECMAScript 2019 (ES10) 21.2.2.5.1, Note 3. } sstate.bt_stack.push_back(sstate.nth); } goto MATCHED; case st_repeat_in_pop: for (uint_l32 brno = current_NFA.quantifier.atmost; brno >= current_NFA.quantifier.atleast; --brno) { submatch_type &inner_bracket = sstate.bracket[brno]; inner_bracket.counter = sstate.counter_stack.back(); inner_bracket.core = sstate.capture_stack.back(); sstate.counter_stack.pop_back(); sstate.capture_stack.pop_back(); } sstate.repeat[current_NFA.number] = sstate.repeat_stack.back(); sstate.repeat_stack.pop_back(); goto NOT_MATCHED; case st_check_0_width_repeat: if (sstate.nth.in_string != sstate.repeat[current_NFA.number]) goto MATCHED; sstate.nth.in_NFA_states = current_NFA.next_state2; continue; case st_backreference: // '\\': { const submatch_type &bracket = sstate.bracket[current_NFA.number]; if (bracket.counter == 0) // Undefined. 
{ ESCAPE_FROM_ZERO_WIDTH_MATCH: sstate.nth.in_NFA_states = current_NFA.next_state2; continue; } else { const submatchcore_type &brc = bracket.core; if (brc.open_at == brc.close_at) { goto ESCAPE_FROM_ZERO_WIDTH_MATCH; } else { #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(push) #pragma warning(disable:4127) #endif if (!reverse) #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(pop) #endif { for (BidirectionalIterator backrefpos = brc.open_at; backrefpos != brc.close_at;) { if (!sstate.is_at_srchend()) { const uchar32 uchartxt = utf_traits::codepoint_inc(sstate.nth.in_string, sstate.srchend); const uchar32 ucharref = utf_traits::codepoint_inc(backrefpos, brc.close_at); if (casehelper_type::canonicalise(uchartxt) == casehelper_type::canonicalise(ucharref)) continue; } goto NOT_MATCHED; } } else // reverse == true. { for (BidirectionalIterator backrefpos = brc.close_at; backrefpos != brc.open_at;) { if (!sstate.is_at_lookbehindlimit()) { const uchar32 uchartxt = utf_traits::dec_codepoint(sstate.nth.in_string, sstate.lblim); const uchar32 ucharref = utf_traits::dec_codepoint(backrefpos, brc.open_at); if (casehelper_type::canonicalise(uchartxt) == casehelper_type::canonicalise(ucharref)) continue; } goto NOT_MATCHED; } } } } } goto MATCHED; case st_lookaround_open: { for (uint_l32 i = 1; i < this->number_of_brackets; ++i) { const submatch_type &sm = sstate.bracket[i]; sstate.capture_stack.push_back(sm.core); sstate.counter_stack.push_back(sm.counter); } for (uint_l32 i = 0; i < this->number_of_counters; ++i) sstate.counter_stack.push_back(sstate.counter[i]); for (uint_l32 i = 0; i < this->number_of_repeats; ++i) sstate.repeat_stack.push_back(sstate.repeat[i]); const typename ss_type::bottom_state backup_bottom(sstate.btstack_size, sstate.capture_stack.size(), sstate.counter_stack.size(), sstate.repeat_stack.size()); const BidirectionalIterator orgpos = sstate.nth.in_string; sstate.btstack_size = sstate.bt_stack.size(); #if 
!defined(SRELL_FIXEDWIDTHLOOKBEHIND) && !defined(SRELLDBG_NO_MPREWINDER) if (current_NFA.quantifier.atleast == 2) { sstate.repeat_stack.push_back(sstate.lblim); sstate.lblim = sstate.srchbegin; } #endif #if defined(SRELL_FIXEDWIDTHLOOKBEHIND) // if (current_NFA.reverse) { for (uint_l32 i = 0; i < current_NFA.quantifier.atleast; ++i) { if (!sstate.is_at_lookbehindlimit()) { utf_traits::dec_codepoint(sstate.nth.in_string, sstate.lblim); continue; } is_matched = false; goto AFTER_LOOKAROUND; } } #endif sstate.nth.in_NFA_states = current_NFA.next_state2; #if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) is_matched = current_NFA.quantifier.atleast == 0 ? run_automaton(sstate /* , true */) : run_automaton(sstate /* , true */); #else is_matched = run_automaton(sstate /* , true */); #endif #if defined(SRELL_FIXEDWIDTHLOOKBEHIND) AFTER_LOOKAROUND: #endif { #if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) && !defined(SRELLDBG_NO_MPREWINDER) if (current_NFA.quantifier.atleast == 2) { sstate.lblim = sstate.repeat_stack[backup_bottom.repeatstack_size]; if (is_matched) sstate.bracket[0].core.open_at = sstate.nth.in_string; } #endif #if defined(SRELL_ENABLE_GT) if (current_NFA.character != meta_char::mc_gt) // '>' #endif { sstate.nth.in_string = orgpos; } sstate.bt_stack.resize(sstate.btstack_size); sstate.btstack_size = backup_bottom.btstack_size; sstate.capture_stack.resize(backup_bottom.capturestack_size); sstate.counter_stack.resize(backup_bottom.counterstack_size); sstate.repeat_stack.resize(backup_bottom.repeatstack_size); is_matched ^= current_NFA.is_not; } } if (is_matched) { sstate.nth.in_NFA_states = current_NFA.next_state1; continue; } // case st_lookaround_pop: for (uint_l32 i = this->number_of_repeats; i;) { sstate.repeat[--i] = sstate.repeat_stack.back(); sstate.repeat_stack.pop_back(); } for (uint_l32 i = this->number_of_counters; i;) { sstate.counter[--i] = sstate.counter_stack.back(); sstate.counter_stack.pop_back(); } for (uint_l32 i = this->number_of_brackets; i > 1;) { 
submatch_type &sm = sstate.bracket[--i]; sm.counter = sstate.counter_stack.back(); sm.core = sstate.capture_stack.back(); sstate.counter_stack.pop_back(); sstate.capture_stack.pop_back(); } goto NOT_MATCHED; case st_bol: // '^': if (sstate.is_at_lookbehindlimit() && !sstate.match_prev_avail_flag()) { if (!sstate.match_not_bol_flag()) goto MATCHED; } // !sstate.is_at_lookbehindlimit() || sstate.match_prev_avail_flag() else if (current_NFA.multiline) { const uchar32 prevchar = utf_traits::prevcodepoint(sstate.nth.in_string, sstate.lblim); if (this->character_class.is_included(re_character_class::newline, prevchar)) goto MATCHED; } goto NOT_MATCHED; case st_eol: // '$': if (sstate.is_at_srchend()) { if (!sstate.match_not_eol_flag()) goto MATCHED; } else if (current_NFA.multiline) { const uchar32 nextchar = utf_traits::codepoint(sstate.nth.in_string, sstate.srchend); if (this->character_class.is_included(re_character_class::newline, nextchar)) goto MATCHED; } goto NOT_MATCHED; case st_boundary: // '\b' '\B' is_matched = current_NFA.is_not; // is_matched = current_NFA.character == char_alnum::ch_B; // First, suppose the previous character is not \w but \W. if (sstate.is_at_srchend()) { if (sstate.match_not_eow_flag()) is_matched = !is_matched; } else if (this->character_class.is_included(current_NFA.number, utf_traits::codepoint(sstate.nth.in_string, sstate.srchend))) { is_matched = !is_matched; } // \W/last \w // \b false true // \B true false // Second, if the actual previous character is \w, flip is_matched. 
if (sstate.is_at_lookbehindlimit() && !sstate.match_prev_avail_flag()) { if (sstate.match_not_bow_flag()) is_matched = !is_matched; } // !sstate.is_at_lookbehindlimit() || sstate.match_prev_avail_flag() else if (this->character_class.is_included(current_NFA.number, utf_traits::prevcodepoint(sstate.nth.in_string, sstate.lblim))) { is_matched = !is_matched; } // \b \B // pre cur \W/last \w pre cur \W/last \w // \W/base false true \W/base true false // \w true false \w false true goto JUDGE; case st_success: // == lookaround_close. // if (is_recursive) if (sstate.btstack_size) return true; if ( (!sstate.match_not_null_flag() || !sstate.is_null()) && (!sstate.match_match_flag() || sstate.is_at_srchend()) ) return true; goto NOT_MATCHED; #if !defined(SRELLDBG_NO_NEXTPOS_OPT) case st_move_nextpos: #if !defined(SRELLDBG_NO_1STCHRCLS) && !defined(SRELLDBG_NO_BITSET) sstate.nextpos = sstate.nth.in_string; if (!sstate.is_at_srchend()) ++sstate.nextpos; #else // defined(SRELLDBG_NO_1STCHRCLS) || defined(SRELLDBG_NO_BITSET) if (sstate.nth.in_string != sstate.bracket[0].core.open_at) { sstate.nextpos = sstate.nth.in_string; if (!sstate.is_at_srchend()) utf_traits::codepoint_inc(sstate.nextpos, sstate.srchend); } #endif goto MATCHED; #endif default: // Reaching here means that this->NFA_states is corrupted. throw regex_error(regex_constants::error_internal); } } } } }; // regex_object } // namespace regex_internal // ... "rei_algorithm.hpp"] // ["basic_regex.hpp" ... 
// 28.8, class template basic_regex:
//
// Regular-expression object modelled on std::basic_regex. The class itself is
// a thin front end: every constructor and assignment operator forwards to one
// of the assign() overloads, and the iterator-range assign() hands the pattern
// to compile(), which is reached through this-> (presumably inherited from
// regex_internal::regex_object -- confirm against its definition).
//
// NOTE(review): in this copy the text between '<' and '>' appears to have been
// stripped (the header below reads "template > class basic_regex", and
// std::char_traits / std::basic_string / std::initializer_list have no
// argument lists). The template parameter and argument lists must be restored
// from the upstream source before this compiles.
template > class basic_regex : public regex_internal::regex_object
{
public:

	// Types:
	typedef charT value_type;
	typedef traits traits_type;
	typedef typename traits::string_type string_type;
	typedef regex_constants::syntax_option_type flag_type;
	typedef typename traits::locale_type locale_type;

	// 28.8.1, constants:
	// [7.8.1] constants
	// Class-scope aliases of the regex_constants syntax options, so callers
	// can write e.g. basic_regex::icase.
	static const regex_constants::syntax_option_type icase = regex_constants::icase;
	static const regex_constants::syntax_option_type nosubs = regex_constants::nosubs;
	static const regex_constants::syntax_option_type optimize = regex_constants::optimize;
	static const regex_constants::syntax_option_type collate = regex_constants::collate;
	static const regex_constants::syntax_option_type ECMAScript = regex_constants::ECMAScript;
	static const regex_constants::syntax_option_type basic = regex_constants::basic;
	static const regex_constants::syntax_option_type extended = regex_constants::extended;
	static const regex_constants::syntax_option_type awk = regex_constants::awk;
	static const regex_constants::syntax_option_type grep = regex_constants::grep;
	static const regex_constants::syntax_option_type egrep = regex_constants::egrep;
	static const regex_constants::syntax_option_type multiline = regex_constants::multiline;
	static const regex_constants::syntax_option_type dotall = regex_constants::dotall;
	static const regex_constants::syntax_option_type unicodesets = regex_constants::unicodesets;

	// 28.8.2, construct/copy/destroy:
	// [7.8.2] construct/copy/destroy

	// Constructs an object without compiling any pattern.
	basic_regex()
	{
	}

	// Compiles the null-terminated pattern p with syntax flags f.
	explicit basic_regex(const charT *const p, const flag_type f = regex_constants::ECMAScript)
	{
		assign(p, p + std::char_traits::length(p), f);
	}

	// Compiles the len characters starting at p (no terminator required).
	basic_regex(const charT *const p, const std::size_t len, const flag_type f = regex_constants::ECMAScript)
	{
		assign(p, p + len, f);
	}

	// Copy constructor; delegates to assign(const basic_regex &).
	basic_regex(const basic_regex &e)
	{
		assign(e);
	}

#if defined(SRELL_CPP11_MOVE_ENABLED)
	// Move constructor; delegates to assign(basic_regex &&).
	basic_regex(basic_regex &&e) SRELL_NOEXCEPT
	{
		assign(std::move(e));
	}
#endif

	// Compiles the pattern held in the string p.
	template
	explicit basic_regex(const std::basic_string &p, const flag_type f = regex_constants::ECMAScript)
	{
		assign(p, f);
	}

	// Compiles the pattern given as the iterator range [first, last).
	template
	basic_regex(ForwardIterator first, ForwardIterator last, const flag_type f = regex_constants::ECMAScript)
	{
		assign(first, last, f);
	}

#if defined(SRELL_CPP11_INITIALIZER_LIST_ENABLED)
	// Compiles the pattern given as an initializer list of characters.
	basic_regex(std::initializer_list il, const flag_type f = regex_constants::ECMAScript)
	{
		assign(il, f);
	}
#endif

//	~basic_regex();

	// Assignment operators. Those taking a pattern forward to assign() without
	// a flag argument, so the default flags (ECMAScript) apply.
	basic_regex &operator=(const basic_regex &right)
	{
		return assign(right);
	}

#if defined(SRELL_CPP11_MOVE_ENABLED)
	basic_regex &operator=(basic_regex &&e) SRELL_NOEXCEPT
	{
		return assign(std::move(e));
	}
#endif

	basic_regex &operator=(const charT *const ptr)
	{
		return assign(ptr);
	}

#if defined(SRELL_CPP11_INITIALIZER_LIST_ENABLED)
	basic_regex &operator=(std::initializer_list il)
	{
		return assign(il.begin(), il.end());
	}
#endif

	template
	basic_regex &operator=(const std::basic_string &p)
	{
		return assign(p);
	}

	// 28.8.3, assign:
	// [7.8.3] assign

	// Copies the state of right through re_object_core's assignment operator.
	basic_regex &assign(const basic_regex &right)
	{
		regex_internal::re_object_core::operator=(right);
		return *this;
	}

#if defined(SRELL_CPP11_MOVE_ENABLED)
	// Moves the state of right through re_object_core's assignment operator.
	basic_regex &assign(basic_regex &&right) SRELL_NOEXCEPT
	{
		regex_internal::re_object_core::operator=(std::move(right));
		return *this;
	}
#endif

	// Null-terminated pattern: the length is measured, then the range
	// overload is used.
	basic_regex &assign(const charT *const ptr, const flag_type f = regex_constants::ECMAScript)
	{
		return assign(ptr, ptr + std::char_traits::length(ptr), f);
	}

	// Pointer + length pattern.
	basic_regex &assign(const charT *const p, std::size_t len, const flag_type f = regex_constants::ECMAScript)
	{
		return assign(p, p + len, f);
	}

	// String pattern; passes the string's character buffer as a pointer range.
	template
	basic_regex &assign(const std::basic_string &s, const flag_type f = regex_constants::ECMAScript)
	{
		return assign(s.c_str(), s.c_str() + s.size(), f);
	}

	// Iterator-range pattern: the overload all other pattern-taking
	// assign()s end up in. Returns *this.
	template
	basic_regex &assign(InputIterator first, InputIterator last, const flag_type f = regex_constants::ECMAScript)
	{
#if defined(SRELL_STRICT_IMPL)
		// Compile into a temporary and swap it in, so *this is only replaced
		// after compile() has returned (presumably to leave *this intact if
		// compile() throws -- confirm how compile() reports errors).
		basic_regex tmp;
		tmp.compile(first, last, f);
		tmp.swap(*this);
#else
		// Compile directly into *this.
		this->compile(first, last, f);
#endif
		return *this;
	}

#if defined(SRELL_CPP11_INITIALIZER_LIST_ENABLED)
	basic_regex &assign(std::initializer_list il, const flag_type f = regex_constants::ECMAScript)
	{
		return assign(il.begin(), il.end(), f);
	}
#endif

	// 28.8.4, const operations:
	// [7.8.4] const operations

	// Returns number_of_brackets - 1 (presumably number_of_brackets also
	// counts the implicit whole-match group -- confirm in the base class).
	unsigned mark_count() const
	{
		return this->number_of_brackets - 1;
	}

	// Returns the stored syntax option flags (member soflags).
	flag_type flags() const
	{
		return this->soflags;
	}

	// 28.8.5, locale:
	// [7.8.5] locale

	// Both locale functions forward to the traits instance.
	locale_type imbue(locale_type loc)
	{
		return this->traits_inst.imbue(loc);
	}

	locale_type getloc() const
	{
		return this->traits_inst.getloc();
	}

	// 28.8.6, swap:
	// [7.8.6] swap

	// Exchanges state with e through re_object_core::swap.
	void swap(basic_regex &e)
	{
		regex_internal::re_object_core::swap(e);
	}
};

// Out-of-class definitions of the static constants declared in the class
// (required before C++17 when a constant is odr-used).
template
	const regex_constants::syntax_option_type basic_regex::icase;
template
	const regex_constants::syntax_option_type basic_regex::nosubs;
template
	const regex_constants::syntax_option_type basic_regex::optimize;
template
	const regex_constants::syntax_option_type basic_regex::collate;
template
	const regex_constants::syntax_option_type basic_regex::ECMAScript;
template
	const regex_constants::syntax_option_type basic_regex::basic;
template
	const regex_constants::syntax_option_type basic_regex::extended;
template
	const regex_constants::syntax_option_type basic_regex::awk;
template
	const regex_constants::syntax_option_type basic_regex::grep;
template
	const regex_constants::syntax_option_type basic_regex::egrep;
template
	const regex_constants::syntax_option_type basic_regex::multiline;
template
	const regex_constants::syntax_option_type basic_regex::dotall;
template
	const regex_constants::syntax_option_type basic_regex::unicodesets;

// 28.8.6, basic_regex swap:
// Non-member swap; forwards to the member swap.
template
void swap(basic_regex &lhs, basic_regex &rhs)
{
	lhs.swap(rhs);
}

// Convenience typedefs. NOTE(review): the template arguments that tell
// these apart were stripped from this copy (see the note at the top).
typedef basic_regex regex;
typedef basic_regex wregex;

// wchar_t aliases, chosen by the range of wchar_t: when WCHAR_MAX covers
// 0x10ffff wregex is reused as the "u32w" type, otherwise (>= 0xffff) a
// separate "u16w" type is declared.
#if defined(WCHAR_MAX)
	#if WCHAR_MAX >= 0x10ffff
		typedef wregex u32wregex;
		typedef u32wregex u1632wregex;
	#elif WCHAR_MAX >= 0xffff
		typedef basic_regex > u16wregex;
		typedef u16wregex u1632wregex;
	#endif
#endif
#if defined(SRELL_CPP20_CHAR8_ENABLED) typedef basic_regex u8regex; #endif typedef basic_regex > u8cregex; #if !defined(SRELL_CPP20_CHAR8_ENABLED) typedef u8cregex u8regex; #endif #if defined(SRELL_CPP11_CHAR1632_ENABLED) typedef basic_regex u16regex; typedef basic_regex u32regex; #endif // ... "basic_regex.hpp"] // ["regex_iterator.hpp" ... // 28.12.1, class template regex_iterator: template ::value_type, class traits = regex_traits > class regex_iterator { public: typedef basic_regex regex_type; typedef match_results value_type; typedef std::ptrdiff_t difference_type; typedef const value_type * pointer; typedef const value_type & reference; typedef std::forward_iterator_tag iterator_category; regex_iterator() { // 28.12.1.1: Constructs an end-of-sequence iterator. } regex_iterator( const BidirectionalIterator a, const BidirectionalIterator b, const regex_type &re, const regex_constants::match_flag_type m = regex_constants::match_default) : begin(a), end(b), pregex(&re), flags(m) { regex_search(begin, end, begin, match, *pregex, flags); // 28.12.1.1: If this call returns false the constructor // sets *this to the end-of-sequence iterator. } regex_iterator(const regex_iterator &that) { operator=(that); } regex_iterator &operator=(const regex_iterator &that) { if (this != &that) { this->begin = that.begin; this->end = that.end; this->pregex = that.pregex; this->flags = that.flags; this->match = that.match; } return *this; } bool operator==(const regex_iterator &right) const { // It is probably safe to assume that match.size() == 0 means // end-of-sequence, because it happens only when 1) never tried // regex_search, or 2) regex_search returned false. 
if (this->match.size() == 0 || right.match.size() == 0) return this->match.size() == right.match.size(); return this->begin == right.begin && this->end == right.end && this->pregex == right.pregex && this->flags == right.flags && this->match[0] == right.match[0]; } bool operator!=(const regex_iterator &right) const { return !(*this == right); } const value_type &operator*() const { return match; } const value_type *operator->() const { return &match; } regex_iterator &operator++() { if (this->match.size()) { BidirectionalIterator start = match[0].second; if (match[0].first == start) // The iterator holds a 0-length match. { if (start == end) { match.clear_(); // 28.12.1.4.2: If the iterator holds a zero-length match and // start == end the operator sets *this to the end-ofsequence // iterator and returns *this. } else { // 28.12.1.4.3: Otherwise, if the iterator holds a zero-length match // the operator calls regex_search(start, end, match, *pregex, flags // | regex_constants::match_not_null | regex_constants::match_continuous). // If the call returns true the operator returns *this. [Cont...] if (!regex_search(start, end, begin, match, *pregex, flags | regex_constants::match_not_null | regex_constants::match_continuous)) { const BidirectionalIterator prevend = start; // [...Cont] Otherwise the operator increments start and continues // as if the most recent match was not a zero-length match. // ++start; utf_traits::codepoint_inc(start, end); flags |= regex_constants::match_prev_avail; if (regex_search(start, end, begin, match, *pregex, flags)) { // 28.12.1.4.5-6: In all cases in which the call to regex_search // returns true, match.prefix().first shall be equal to the previous // value of match[0].second, ... match[i].position() shall return // distance(begin, match[i].first). 
// This means that match[i].position() gives the offset from the // beginning of the target sequence, which is often not the same as // the offset from the sequence passed in the call to regex_search. // // To satisfy this: match.set_prefix_first_(prevend); } } } } else { // 28.12.1.4.4: If the most recent match was not a zero-length match, // the operator sets flags to flags | regex_constants::match_prev_avail // and calls regex_search(start, end, match, *pregex, flags). [Cont...] flags |= regex_constants::match_prev_avail; regex_search(start, end, begin, match, *pregex, flags); // [...Cont] If the call returns false the iterator sets *this to // the end-of-sequence iterator. The iterator then returns *this. // // 28.12.1.4.5-6: In all cases in which the call to regex_search // returns true, match.prefix().first shall be equal to the previous // value of match[0].second, ... match[i].position() shall return // distance(begin, match[i].first). // This means that match[i].position() gives the offset from the // beginning of the target sequence, which is often not the same as // the offset from the sequence passed in the call to regex_search. // // These should already be done in regex_search. 
} } return *this; } regex_iterator operator++(int) { const regex_iterator tmp = *this; ++(*this); return tmp; } private: BidirectionalIterator begin; BidirectionalIterator end; const regex_type *pregex; regex_constants::match_flag_type flags; match_results match; typedef typename traits::utf_traits utf_traits; }; typedef regex_iterator cregex_iterator; typedef regex_iterator wcregex_iterator; typedef regex_iterator sregex_iterator; typedef regex_iterator wsregex_iterator; #if defined(SRELL_CPP11_CHAR1632_ENABLED) typedef regex_iterator u16cregex_iterator; typedef regex_iterator u32cregex_iterator; typedef regex_iterator u16sregex_iterator; typedef regex_iterator u32sregex_iterator; #endif #if defined(SRELL_CPP20_CHAR8_ENABLED) typedef regex_iterator u8cregex_iterator; #endif #if defined(SRELL_CPP20_CHAR8_ENABLED) && SRELL_CPP20_CHAR8_ENABLED >= 2 typedef regex_iterator u8sregex_iterator; #endif typedef regex_iterator::value_type, u8regex_traits::value_type> > u8ccregex_iterator; typedef regex_iterator::value_type, u8regex_traits::value_type> > u8csregex_iterator; #if !defined(SRELL_CPP20_CHAR8_ENABLED) typedef u8ccregex_iterator u8cregex_iterator; #endif #if !defined(SRELL_CPP20_CHAR8_ENABLED) || SRELL_CPP20_CHAR8_ENABLED < 2 typedef u8csregex_iterator u8sregex_iterator; #endif #if defined(WCHAR_MAX) #if WCHAR_MAX >= 0x10ffff typedef wcregex_iterator u32wcregex_iterator; typedef wsregex_iterator u32wsregex_iterator; typedef u32wcregex_iterator u1632wcregex_iterator; typedef u32wsregex_iterator u1632wsregex_iterator; #elif WCHAR_MAX >= 0xffff typedef regex_iterator::value_type, u16regex_traits::value_type> > u16wcregex_iterator; typedef regex_iterator::value_type, u16regex_traits::value_type> > u16wsregex_iterator; typedef u16wcregex_iterator u1632wcregex_iterator; typedef u16wsregex_iterator u1632wsregex_iterator; #endif #endif // ... "regex_iterator.hpp"] // ["regex_algorithm.hpp" ... 
// NOTE(review): in this copy the text between '<' and '>' appears to have been
// stripped: every function below starts with a bare "template" and names such
// as match_results, basic_regex, std::basic_string and std::char_traits have
// lost their argument lists. The template headers must be restored from the
// upstream source before this section compiles.

// 28.11.2, function template regex_match:
// [7.11.2] Function template regex_match

// Core overload: forwards to e.search() with `first` as the look-behind
// limit and with match_continuous | match_match_ OR-ed into flags, filling m.
// (match_match_ is presumably the internal flag that requires the match to
// cover the whole range -- confirm in regex_constants.)
template
bool regex_match(
	const BidirectionalIterator first,
	const BidirectionalIterator last,
	match_results &m,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return e.search(first, last, first, m, flags | regex_constants::match_continuous | regex_constants::match_match_);
}

// Same as above for callers that do not need the match results.
template
bool regex_match(
	const BidirectionalIterator first,
	const BidirectionalIterator last,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
//  4 Effects: Behaves "as if" by constructing an instance of
//  match_results what, and then returning the
//  result of regex_match(first, last, what, e, flags).

	match_results what;

	return regex_match(first, last, what, e, flags);
}

// Null-terminated string, with match results.
template
bool regex_match(
	const charT *const str,
	match_results &m,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return regex_match(str, str + std::char_traits::length(str), m, e, flags);
}

// std::basic_string, with match results. m refers into s, so s must stay
// alive and unmodified while m is in use.
template
bool regex_match(
	const std::basic_string &s,
	match_results::const_iterator, Allocator> &m,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return regex_match(s.begin(), s.end(), m, e, flags);
}

// Null-terminated string, result only.
template
bool regex_match(
	const charT *const str,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return regex_match(str, str + std::char_traits::length(str), e, flags);
}

// std::basic_string, result only.
template
bool regex_match(
	const std::basic_string &s,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return regex_match(s.begin(), s.end(), e, flags);
}

// regex_search taking an explicit look-behind limit: the search runs over
// [first, last) and lookbehind_limit is handed to e.search() as its third
// argument. This is the overload regex_iterator calls with the start of the
// whole target sequence.
template
bool regex_search(
	const BidirectionalIterator first,
	const BidirectionalIterator last,
	const BidirectionalIterator lookbehind_limit,
	match_results &m,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return e.search(first, last, lookbehind_limit, m, flags);
}

// Look-behind-limit form without match results.
template
bool regex_search(
	const BidirectionalIterator first,
	const BidirectionalIterator last,
	const BidirectionalIterator lookbehind_limit,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
//  6 Effects: Behaves "as if" by constructing an object what of type
//  match_results and then returning the result of
//  regex_search(first, last, what, e, flags).

	match_results what;
	return regex_search(first, last, lookbehind_limit, what, e, flags);
}

// 28.11.3, function template regex_search:
// 7.11.3 regex_search [tr.re.alg.search]

// Standard form: `first` doubles as the look-behind limit.
template
bool regex_search(
	const BidirectionalIterator first,
	const BidirectionalIterator last,
	match_results &m,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return e.search(first, last, first, m, flags);
}

// Iterator range, result only.
template
bool regex_search(
	const BidirectionalIterator first,
	const BidirectionalIterator last,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
//  6 Effects: Behaves "as if" by constructing an object what of type
//  match_results and then returning the result of
//  regex_search(first, last, what, e, flags).

	match_results what;
	return regex_search(first, last, what, e, flags);
}

// Null-terminated string, with match results.
template
bool regex_search(
	const charT *const str,
	match_results &m,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return regex_search(str, str + std::char_traits::length(str), m, e, flags);
}

// Null-terminated string, result only.
template
bool regex_search(
	const charT *const str,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return regex_search(str, str + std::char_traits::length(str), e, flags);
}

// std::basic_string, result only.
template
bool regex_search(
	const std::basic_string &s,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return regex_search(s.begin(), s.end(), e, flags);
}

// std::basic_string, with match results. m refers into s, so s must stay
// alive and unmodified while m is in use.
template
bool regex_search(
	const std::basic_string &s,
	match_results::const_iterator, Allocator> &m,
	const basic_regex &e,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	return regex_search(s.begin(), s.end(), m, e, flags);
}

// 28.11.4, function template regex_replace:
// [7.11.4] Function template regex_replace

// Core overload: walks the matches of e in [first, last) with a
// regex_iterator and writes to out
//   - the unmatched text before each match (unless format_no_copy is set),
//   - each match expanded through fmt by match_results::format(),
//   - the text after the last processed match (unless format_no_copy is set).
// With format_first_only only the first match is replaced. Returns the
// advanced output iterator.
template
OutputIterator regex_replace(
	OutputIterator out,
	const BidirectionalIterator first,
	const BidirectionalIterator last,
	const basic_regex &e,
	const std::basic_string &fmt,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	typedef regex_iterator iterator_type;

	const bool do_copy = !(flags & regex_constants::format_no_copy);
	const iterator_type eos;
	iterator_type i(first, last, e, flags);
	typename iterator_type::value_type::value_type last_m_suffix;

	// If nothing matches, the "suffix" is the whole input.
	last_m_suffix.first = first;
	last_m_suffix.second = last;

	for (; i != eos; ++i)
	{
		if (do_copy)
			out = std::copy(i->prefix().first, i->prefix().second, out);

		out = i->format(out, fmt, flags);
		// Remember the tail now: it is still needed after the loop ends.
		last_m_suffix = i->suffix();

		if (flags & regex_constants::format_first_only)
			break;
	}

	if (do_copy)
		out = std::copy(last_m_suffix.first, last_m_suffix.second, out);

	return out;
}

// Null-terminated format string: wraps fmt in a std::basic_string and
// delegates to the overload above.
template
OutputIterator regex_replace(
	OutputIterator out,
	const BidirectionalIterator first,
	const BidirectionalIterator last,
	const basic_regex &e,
	const charT *const fmt,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	//  Strictly speaking, this should be implemented as a version different
	//  from the above with changing the line i->format(out, fmt, flags) to
	//  i->format(out, fmt, fmt + char_traits::length(fmt), flags).

	const std::basic_string fs(fmt, fmt + std::char_traits::length(fmt));

	return regex_replace(out, first, last, e, fs, flags);
}

// The four overloads below return the result as a new string, built by
// running the iterator-based overloads into a std::back_inserter.

// String input, string format.
template
std::basic_string regex_replace(
	const std::basic_string &s,
	const basic_regex &e,
	const std::basic_string &fmt,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	std::basic_string result;

	regex_replace(std::back_inserter(result), s.begin(), s.end(), e, fmt, flags);
	return result;
}

// String input, null-terminated format.
template
std::basic_string regex_replace(
	const std::basic_string &s,
	const basic_regex &e,
	const charT *const fmt,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	std::basic_string result;

	regex_replace(std::back_inserter(result), s.begin(), s.end(), e, fmt, flags);
	return result;
}

// Null-terminated input, string format.
template
std::basic_string regex_replace(
	const charT *const s,
	const basic_regex &e,
	const std::basic_string &fmt,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	std::basic_string result;

	regex_replace(std::back_inserter(result), s, s + std::char_traits::length(s), e, fmt, flags);
	return result;
}

// Null-terminated input, null-terminated format.
template
std::basic_string regex_replace(
	const charT *const s,
	const basic_regex &e,
	const charT *const fmt,
	const regex_constants::match_flag_type flags = regex_constants::match_default
)
{
	std::basic_string result;

	regex_replace(std::back_inserter(result), s, s + std::char_traits::length(s), e, fmt, flags);
	return result;
}

//  ... "regex_algorithm.hpp"]
//  ["regex_token_iterator.hpp" ...
// 28.12.2, class template regex_token_iterator: template ::value_type, class traits = regex_traits > class regex_token_iterator { public: typedef basic_regex regex_type; typedef sub_match value_type; typedef std::ptrdiff_t difference_type; typedef const value_type * pointer; typedef const value_type & reference; typedef std::forward_iterator_tag iterator_category; regex_token_iterator() : result(NULL) { // Constructs the end-of-sequence iterator. } regex_token_iterator( const BidirectionalIterator a, const BidirectionalIterator b, const regex_type &re, int submatch = 0, regex_constants::match_flag_type m = regex_constants::match_default ) : position(a, b, re, m), result(NULL), subs(1, submatch) { post_constructor(a, b); } regex_token_iterator( const BidirectionalIterator a, const BidirectionalIterator b, const regex_type &re, const std::vector &submatches, regex_constants::match_flag_type m = regex_constants::match_default ) : position(a, b, re, m), result(NULL), subs(submatches) { post_constructor(a, b); } #if defined(SRELL_CPP11_INITIALIZER_LIST_ENABLED) regex_token_iterator( const BidirectionalIterator a, const BidirectionalIterator b, const regex_type &re, std::initializer_list submatches, regex_constants::match_flag_type m = regex_constants::match_default ) : position(a, b, re, m), result(NULL), subs(submatches) { post_constructor(a, b); } #endif template // Was R in TR1. 
regex_token_iterator( const BidirectionalIterator a, const BidirectionalIterator b, const regex_type &re, const int (&submatches)[N], regex_constants::match_flag_type m = regex_constants::match_default ) : position(a, b, re, m), result(NULL), subs(submatches, submatches + N) { post_constructor(a, b); } regex_token_iterator(const regex_token_iterator &that) { operator=(that); } regex_token_iterator &operator=(const regex_token_iterator &that) { if (this != &that) { this->position = that.position; this->result = that.result; this->suffix = that.suffix; this->N = that.N; this->subs = that.subs; } return *this; } bool operator==(const regex_token_iterator &right) { if (this->result == NULL || right.result == NULL) return this->result == right.result; if (this->result == &this->suffix || right.result == &right.suffix) return this->suffix == right.suffix; return this->position == right.position && this->N == right.N && this->subs == right.subs; } bool operator!=(const regex_token_iterator &right) { return !(*this == right); } const value_type &operator*() { return *result; } const value_type *operator->() { return result; } regex_token_iterator &operator++() { position_iterator prev(position); position_iterator eos_iterator; if (result != NULL) // To avoid inifinite loop. The specification does not require, though. { if (result == &suffix) { result = NULL; // end-of-sequence. } else { ++this->N; for (;;) { if (this->N < subs.size()) { result = subs[this->N] != -1 ? 
&((*position)[subs[this->N]]) : &((*position).prefix()); break; } this->N = 0; ++position; if (position == eos_iterator) { if (this->N < subs.size() && prev->suffix().length() && minus1_in_subs()) { suffix = prev->suffix(); result = &suffix; } else { result = NULL; } break; } } } } return *this; } regex_token_iterator operator++(int) { const regex_token_iterator tmp(*this); ++(*this); return tmp; } private: void post_constructor(const BidirectionalIterator a, const BidirectionalIterator b) { position_iterator eos_iterator; this->N = 0; if (position != eos_iterator && subs.size()) { result = subs[this->N] != -1 ? &((*position)[subs[this->N]]) : &((*position).prefix()); } else if (minus1_in_subs()) // end-of-sequence. { suffix.first = a; suffix.second = b; suffix.matched = a != b; // 28.1.2.7: In a suffix iterator the member result holds a pointer // to the data member suffix, the value of the member suffix.match is true, if (suffix.matched) result = &suffix; else result = NULL; // Means end-of-sequence. 
} } bool minus1_in_subs() const { for (std::size_t i = 0; i < subs.size(); ++i) if (subs[i] == -1) return true; return false; } private: typedef regex_iterator position_iterator; position_iterator position; const value_type *result; value_type suffix; std::size_t N; std::vector subs; }; typedef regex_token_iterator cregex_token_iterator; typedef regex_token_iterator wcregex_token_iterator; typedef regex_token_iterator sregex_token_iterator; typedef regex_token_iterator wsregex_token_iterator; #if defined(SRELL_CPP11_CHAR1632_ENABLED) typedef regex_token_iterator u16cregex_token_iterator; typedef regex_token_iterator u32cregex_token_iterator; typedef regex_token_iterator u16sregex_token_iterator; typedef regex_token_iterator u32sregex_token_iterator; #endif #if defined(SRELL_CPP20_CHAR8_ENABLED) typedef regex_token_iterator u8cregex_token_iterator; #endif #if defined(SRELL_CPP20_CHAR8_ENABLED) && SRELL_CPP20_CHAR8_ENABLED >= 2 typedef regex_token_iterator u8sregex_token_iterator; #endif typedef regex_token_iterator::value_type, u8regex_traits::value_type> > u8ccregex_token_iterator; typedef regex_token_iterator::value_type, u8regex_traits::value_type> > u8csregex_token_iterator; #if !defined(SRELL_CPP20_CHAR8_ENABLED) typedef u8ccregex_token_iterator u8cregex_token_iterator; #endif #if !defined(SRELL_CPP20_CHAR8_ENABLED) || SRELL_CPP20_CHAR8_ENABLED < 2 typedef u8csregex_token_iterator u8sregex_token_iterator; #endif #if defined(WCHAR_MAX) #if WCHAR_MAX >= 0x10ffff typedef wcregex_token_iterator u32wcregex_token_iterator; typedef wsregex_token_iterator u32wsregex_token_iterator; typedef u32wcregex_token_iterator u1632wcregex_token_iterator; typedef u32wsregex_token_iterator u1632wsregex_token_iterator; #elif WCHAR_MAX >= 0xffff typedef regex_token_iterator::value_type, u16regex_traits::value_type> > u16wcregex_token_iterator; typedef regex_token_iterator::value_type, u16regex_traits::value_type> > u16wsregex_token_iterator; typedef u16wcregex_token_iterator 
u1632wcregex_token_iterator; typedef u16wsregex_token_iterator u1632wsregex_token_iterator; #endif #endif // ... "regex_token_iterator.hpp"] } // namespace srell #ifdef SRELL_NOEXCEPT #undef SRELL_NOEXCEPT #endif #ifdef SRELL_CPP20_CHAR8_ENABLED #undef SRELL_CPP20_CHAR8_ENABLED #endif #ifdef SRELL_CPP11_CHAR1632_ENABLED #undef SRELL_CPP11_CHAR1632_ENABLED #endif #ifdef SRELL_CPP11_INITIALIZER_LIST_ENABLED #undef SRELL_CPP11_INITIALIZER_LIST_ENABLED #endif #ifdef SRELL_CPP11_MOVE_ENABLED #undef SRELL_CPP11_MOVE_ENABLED #endif #endif // SRELL_REGEX_TEMPLATE_LIBRARY