11238 lines
283 KiB
C++
11238 lines
283 KiB
C++
/*****************************************************************************
|
|
**
|
|
** SRELL (std::regex-like library) version 4.000
|
|
**
|
|
** Copyright (c) 2012-2022, Nozomu Katoo. All rights reserved.
|
|
**
|
|
** Redistribution and use in source and binary forms, with or without
|
|
** modification, are permitted provided that the following conditions are
|
|
** met:
|
|
**
|
|
** 1. Redistributions of source code must retain the above copyright notice,
|
|
** this list of conditions and the following disclaimer.
|
|
**
|
|
** 2. Redistributions in binary form must reproduce the above copyright
|
|
** notice, this list of conditions and the following disclaimer in the
|
|
** documentation and/or other materials provided with the distribution.
|
|
**
|
|
** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
|
|
** IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
|
** THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
** PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
|
** CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
** EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
** PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
** PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
** LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
** NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
** SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
**
|
|
******************************************************************************
|
|
**/
|
|
|
|
#ifndef SRELL_REGEX_TEMPLATE_LIBRARY
|
|
#define SRELL_REGEX_TEMPLATE_LIBRARY
|
|
|
|
#include <stdexcept>
|
|
#include <climits>
|
|
#include <cwchar>
|
|
#include <string>
|
|
#include <locale>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <cstddef>
|
|
#include <utility>
|
|
#include <vector>
|
|
#include <iterator>
|
|
#include <memory>
|
|
#include <algorithm>
|
|
|
|
// Compiler feature detection.
//
// Each SRELL_CPP*_ENABLED switch below is turned on when the corresponding
// standard feature-test macro is defined. A switch that is already defined
// (e.g. on the compiler command line) is left untouched.

// char16_t / char32_t.
#ifdef __cpp_unicode_characters
#ifndef SRELL_CPP11_CHAR1632_ENABLED
#define SRELL_CPP11_CHAR1632_ENABLED
#endif
#endif

// std::initializer_list.
#ifdef __cpp_initializer_lists
#include <initializer_list>
#ifndef SRELL_CPP11_INITIALIZER_LIST_ENABLED
#define SRELL_CPP11_INITIALIZER_LIST_ENABLED
#endif
#endif

// Rvalue references.
#ifdef __cpp_rvalue_references
#ifndef SRELL_CPP11_MOVE_ENABLED
#define SRELL_CPP11_MOVE_ENABLED
#endif
#endif

// SRELL_NOEXCEPT is provided only when SRELL_CPP11_MOVE_ENABLED is defined.
// It expands to nothing for _MSC_VER < 1900 and to noexcept otherwise.
#ifdef SRELL_CPP11_MOVE_ENABLED
#if defined(_MSC_VER) && _MSC_VER < 1900
#define SRELL_NOEXCEPT
#else
#define SRELL_NOEXCEPT noexcept
#endif
#endif

// char8_t: the value is 2 when __cpp_lib_char8_t is defined as well,
// 1 when only the language feature __cpp_char8_t is detected.
#ifdef __cpp_char8_t
#ifndef SRELL_CPP20_CHAR8_ENABLED
#ifdef __cpp_lib_char8_t
#define SRELL_CPP20_CHAR8_ENABLED 2
#else
#define SRELL_CPP20_CHAR8_ENABLED 1
#endif
#endif
#endif
|
|
|
|
// The following SRELL_NO_* macros would be useful when wanting to
// reduce the size of a binary by turning off some feature(s).
//
// Every macro implied by another one is defined only if it is not defined
// yet, so that a definition given on the command line (e.g. -DNAME, which
// has the value 1) is never redefined with a different replacement list.

#ifdef SRELL_NO_UNICODE_DATA

// Prevents Unicode data used for icase (case-insensitive) matching
// from being output into a resulting binary. In this case only the
// ASCII characters are case-folded when icase matching is performed
// (i.e., [A-Z] -> [a-z] only).
#ifndef SRELL_NO_UNICODE_ICASE
#define SRELL_NO_UNICODE_ICASE
#endif

// Disables the Unicode property (\p{...} and \P{...}) and prevents
// Unicode property data from being output into a resulting binary.
#ifndef SRELL_NO_UNICODE_PROPERTY
#define SRELL_NO_UNICODE_PROPERTY
#endif

#endif

// Prevents icase matching specific functions into a resulting binary.
// In this case the icase flag is ignored and icase matching becomes
// unavailable.
#ifdef SRELL_NO_ICASE
#ifndef SRELL_NO_UNICODE_ICASE
#define SRELL_NO_UNICODE_ICASE
#endif
#endif

// This macro might be removed in the future.
#ifdef SRELL_V1_COMPATIBLE
#ifndef SRELL_NO_UNICODE_PROPERTY
#define SRELL_NO_UNICODE_PROPERTY
#endif
#ifndef SRELL_NO_VMODE
#define SRELL_NO_VMODE
#endif
#ifndef SRELL_NO_NAMEDCAPTURE
#define SRELL_NO_NAMEDCAPTURE
#endif
#ifndef SRELL_NO_SINGLELINE
#define SRELL_NO_SINGLELINE
#endif
#ifndef SRELL_FIXEDWIDTHLOOKBEHIND
#define SRELL_FIXEDWIDTHLOOKBEHIND
#endif
#endif
|
|
|
|
namespace srell
|
|
{
|
|
// ["regex_constants.h" ...
|
|
|
|
namespace regex_constants
{
    // Bitmask type selecting how a pattern is compiled. Every enumerator
    // occupies its own bit so that values can be combined with the
    // operators defined below.
    enum syntax_option_type
    {
        icase = 1 << 0,
        nosubs = 1 << 1,
        optimize = 1 << 2,
        collate = 1 << 3,
        ECMAScript = 1 << 4,
        basic = 1 << 5,
        extended = 1 << 6,
        awk = 1 << 7,
        grep = 1 << 8,
        egrep = 1 << 9,
        multiline = 1 << 10,

        // SRELL's extension.
        dotall = 1 << 11, // singleline.
        unicodesets = 1 << 12
    };

    // Bitwise operators. Each one converts its operand(s) to int, applies
    // the built-in operator and converts the result back to the enum type.

    inline syntax_option_type operator&(const syntax_option_type left, const syntax_option_type right)
    {
        return static_cast<syntax_option_type>(static_cast<int>(left) & static_cast<int>(right));
    }
    inline syntax_option_type operator|(const syntax_option_type left, const syntax_option_type right)
    {
        return static_cast<syntax_option_type>(static_cast<int>(left) | static_cast<int>(right));
    }
    inline syntax_option_type operator^(const syntax_option_type left, const syntax_option_type right)
    {
        return static_cast<syntax_option_type>(static_cast<int>(left) ^ static_cast<int>(right));
    }
    inline syntax_option_type operator~(const syntax_option_type b)
    {
        return static_cast<syntax_option_type>(~static_cast<int>(b));
    }

    // Compound assignments: update left in place and return a reference to it.

    inline syntax_option_type &operator&=(syntax_option_type &left, const syntax_option_type right)
    {
        left = left & right;
        return left;
    }
    inline syntax_option_type &operator|=(syntax_option_type &left, const syntax_option_type right)
    {
        left = left | right;
        return left;
    }
    inline syntax_option_type &operator^=(syntax_option_type &left, const syntax_option_type right)
    {
        left = left ^ right;
        return left;
    }
}
// namespace regex_constants
|
|
|
|
namespace regex_constants
{
    // Bitmask type holding matching flags (match_*) and formatting flags
    // (format_*). match_default and format_default are both 0; every other
    // enumerator occupies its own bit.
    enum match_flag_type
    {
        match_default = 0,
        match_not_bol = 1 << 0,
        match_not_eol = 1 << 1,
        match_not_bow = 1 << 2,
        match_not_eow = 1 << 3,
        match_any = 1 << 4,
        match_not_null = 1 << 5,
        match_continuous = 1 << 6,
        match_prev_avail = 1 << 7,

        format_default = 0,
        format_sed = 1 << 8,
        format_no_copy = 1 << 9,
        format_first_only = 1 << 10,

        // For internal use.
        match_match_ = 1 << 11
    };

    // Bitwise operators. Each one converts its operand(s) to int, applies
    // the built-in operator and converts the result back to the enum type.

    inline match_flag_type operator&(const match_flag_type left, const match_flag_type right)
    {
        return static_cast<match_flag_type>(static_cast<int>(left) & static_cast<int>(right));
    }
    inline match_flag_type operator|(const match_flag_type left, const match_flag_type right)
    {
        return static_cast<match_flag_type>(static_cast<int>(left) | static_cast<int>(right));
    }
    inline match_flag_type operator^(const match_flag_type left, const match_flag_type right)
    {
        return static_cast<match_flag_type>(static_cast<int>(left) ^ static_cast<int>(right));
    }
    inline match_flag_type operator~(const match_flag_type b)
    {
        return static_cast<match_flag_type>(~static_cast<int>(b));
    }

    // Compound assignments: update left in place and return a reference to it.

    inline match_flag_type &operator&=(match_flag_type &left, const match_flag_type right)
    {
        left = left & right;
        return left;
    }
    inline match_flag_type &operator|=(match_flag_type &left, const match_flag_type right)
    {
        left = left | right;
        return left;
    }
    inline match_flag_type &operator^=(match_flag_type &left, const match_flag_type right)
    {
        left = left ^ right;
        return left;
    }
}
// namespace regex_constants
|
|
|
|
// 28.5, regex constants:
// Error codes reported through regex_error::code().
namespace regex_constants
{
    typedef int error_type;

    static const error_type error_collate = 100;
    static const error_type error_ctype = 101;
    static const error_type error_escape = 102;
    static const error_type error_backref = 103;
    static const error_type error_brack = 104;
    static const error_type error_paren = 105;
    static const error_type error_brace = 106;
    static const error_type error_badbrace = 107;
    static const error_type error_range = 108;
    static const error_type error_space = 109;
    static const error_type error_badrepeat = 110;
    static const error_type error_complexity = 111;
    static const error_type error_stack = 112;

    // SRELL's extensions.
    static const error_type error_utf8 = 113;
        // The expression contained an invalid UTF-8 sequence.

    static const error_type error_property = 114;
        // The expression contained an invalid Unicode property name or value.

    static const error_type error_noescape = 115;
        // (Only in v-mode) ( ) [ ] { } / - \ | need to be escaped in a character class.

    static const error_type error_operator = 116;
        // (Only in v-mode) A character class contained a reserved double punctuation
        // operator or different types of operators at the same level, such as [ab--cd].

    static const error_type error_complement = 117;
        // (Only in v-mode) \P or a negated character class contained a property of strings.

    static const error_type error_modifier = 118;
        // A specific flag modifier appears more than once.

#if defined(SRELL_FIXEDWIDTHLOOKBEHIND)
    static const error_type error_lookbehind = 200;
#endif
    static const error_type error_internal = 999;
}
// namespace regex_constants
|
|
|
|
// ... "regex_constants.h"]
|
|
// ["regex_error.hpp" ...
|
|
|
|
// 28.6, class regex_error:
|
|
class regex_error : public std::runtime_error
|
|
{
|
|
public:
|
|
|
|
explicit regex_error(const regex_constants::error_type ecode)
|
|
: std::runtime_error("regex_error") // added for error C2512.
|
|
, ecode_(ecode)
|
|
{
|
|
}
|
|
|
|
regex_constants::error_type code() const
|
|
{
|
|
return ecode_;
|
|
}
|
|
|
|
private:
|
|
|
|
regex_constants::error_type ecode_;
|
|
};
|
|
|
|
// ... "regex_error.hpp"]
|
|
// ["rei_type.h" ...
|
|
|
|
namespace regex_internal
{

// uchar32: unsigned type used for code point values.
// char32_t is used when SRELL_CPP11_CHAR1632_ENABLED is defined; otherwise
// the first of unsigned int / unsigned long whose maximum value is at least
// 0xFFFFFFFF is chosen, and compilation fails if neither qualifies.
#if defined(SRELL_CPP11_CHAR1632_ENABLED)

    typedef char32_t uchar32;

#elif defined(UINT_MAX) && UINT_MAX >= 0xFFFFFFFF

    typedef unsigned int uchar32;

#elif defined(ULONG_MAX) && ULONG_MAX >= 0xFFFFFFFF

    typedef unsigned long uchar32;

#else
#error could not find a suitable type for 32-bit Unicode integer values.
#endif  // defined(SRELL_CPP11_CHAR1632_ENABLED)

    typedef uchar32 uint_l32;   // uint_least32.

}   // regex_internal
|
|
|
|
// ... "rei_type.h"]
|
|
// ["rei_constants.h" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
// State kinds. The hexadecimal number in each trailing comment is the
// enumerator's value; the values are assigned implicitly, so the order of
// the enumerators must not be changed.
enum re_state_type
{
    st_character,               // 0x00
    st_character_class,         // 0x01

    st_epsilon,                 // 0x02

    st_check_counter,           // 0x03
//  st_increment_counter,       // (disabled; has no value)
    st_decrement_counter,       // 0x04
    st_save_and_reset_counter,  // 0x05
    st_restore_counter,         // 0x06

    st_roundbracket_open,       // 0x07
    st_roundbracket_pop,        // 0x08
    st_roundbracket_close,      // 0x09

    st_repeat_in_push,          // 0x0a
    st_repeat_in_pop,           // 0x0b
    st_check_0_width_repeat,    // 0x0c

    st_backreference,           // 0x0d

    st_lookaround_open,         // 0x0e

//  st_lookaround_pop,          // (disabled; has no value)

    st_bol,                     // 0x0f
    st_eol,                     // 0x10
    st_boundary,                // 0x11

    st_success,                 // 0x12

#if !defined(SRELLDBG_NO_NEXTPOS_OPT)
    st_move_nextpos,            // 0x13
#endif

    // Aliases sharing the value of an existing state.
    // No comma after the last enumerator: a trailing comma is not accepted
    // by C++03 compilers.
    st_lookaround_close = st_success,
    st_zero_width_boundary = st_lookaround_open
};
// re_state_type
|
|
|
|
namespace constants
|
|
{
|
|
static const uchar32 unicode_max_codepoint = 0x10ffff;
|
|
static const uchar32 invalid_u32value = static_cast<uchar32>(-1);
|
|
static const uchar32 max_u32value = static_cast<uchar32>(-2);
|
|
static const uchar32 asc_icase = 0x20;
|
|
static const uchar32 ccstr_empty = static_cast<uchar32>(-1);
|
|
static const uint_l32 infinity = static_cast<uint_l32>(~0);
|
|
}
|
|
// constants
|
|
|
|
namespace meta_char
|
|
{
|
|
static const uchar32 mc_exclam = 0x21; // '!'
|
|
static const uchar32 mc_sharp = 0x23; // '#'
|
|
static const uchar32 mc_dollar = 0x24; // '$'
|
|
static const uchar32 mc_rbraop = 0x28; // '('
|
|
static const uchar32 mc_rbracl = 0x29; // ')'
|
|
static const uchar32 mc_astrsk = 0x2a; // '*'
|
|
static const uchar32 mc_plus = 0x2b; // '+'
|
|
static const uchar32 mc_comma = 0x2c; // ','
|
|
static const uchar32 mc_minus = 0x2d; // '-'
|
|
static const uchar32 mc_period = 0x2e; // '.'
|
|
static const uchar32 mc_colon = 0x3a; // ':'
|
|
static const uchar32 mc_lt = 0x3c; // '<'
|
|
static const uchar32 mc_eq = 0x3d; // '='
|
|
static const uchar32 mc_gt = 0x3e; // '>'
|
|
static const uchar32 mc_query = 0x3f; // '?'
|
|
static const uchar32 mc_sbraop = 0x5b; // '['
|
|
static const uchar32 mc_escape = 0x5c; // '\\'
|
|
static const uchar32 mc_sbracl = 0x5d; // ']'
|
|
static const uchar32 mc_caret = 0x5e; // '^'
|
|
static const uchar32 mc_cbraop = 0x7b; // '{'
|
|
static const uchar32 mc_bar = 0x7c; // '|'
|
|
static const uchar32 mc_cbracl = 0x7d; // '}'
|
|
}
|
|
// meta_char
|
|
|
|
namespace char_ctrl
|
|
{
|
|
static const uchar32 cc_nul = 0x00; // '\0' //0x00:NUL
|
|
static const uchar32 cc_bs = 0x08; // '\b' //0x08:BS
|
|
static const uchar32 cc_htab = 0x09; // '\t' //0x09:HT
|
|
static const uchar32 cc_nl = 0x0a; // '\n' //0x0a:LF
|
|
static const uchar32 cc_vtab = 0x0b; // '\v' //0x0b:VT
|
|
static const uchar32 cc_ff = 0x0c; // '\f' //0x0c:FF
|
|
static const uchar32 cc_cr = 0x0d; // '\r' //0x0d:CR
|
|
}
|
|
// char_ctrl
|
|
|
|
namespace char_alnum
|
|
{
|
|
static const uchar32 ch_0 = 0x30; // '0'
|
|
static const uchar32 ch_1 = 0x31; // '1'
|
|
static const uchar32 ch_7 = 0x37; // '7'
|
|
static const uchar32 ch_8 = 0x38; // '8'
|
|
static const uchar32 ch_9 = 0x39; // '9'
|
|
static const uchar32 ch_A = 0x41; // 'A'
|
|
static const uchar32 ch_B = 0x42; // 'B'
|
|
static const uchar32 ch_D = 0x44; // 'D'
|
|
static const uchar32 ch_F = 0x46; // 'F'
|
|
static const uchar32 ch_P = 0x50; // 'P'
|
|
static const uchar32 ch_S = 0x53; // 'S'
|
|
static const uchar32 ch_W = 0x57; // 'W'
|
|
static const uchar32 ch_Z = 0x5a; // 'Z'
|
|
static const uchar32 ch_a = 0x61; // 'a'
|
|
static const uchar32 ch_b = 0x62; // 'b'
|
|
static const uchar32 ch_c = 0x63; // 'c'
|
|
static const uchar32 ch_d = 0x64; // 'd'
|
|
static const uchar32 ch_f = 0x66; // 'f'
|
|
static const uchar32 ch_k = 0x6b; // 'k'
|
|
static const uchar32 ch_n = 0x6e; // 'n'
|
|
static const uchar32 ch_p = 0x70; // 'p'
|
|
static const uchar32 ch_q = 0x71; // 'q'
|
|
static const uchar32 ch_r = 0x72; // 'r'
|
|
static const uchar32 ch_s = 0x73; // 's'
|
|
static const uchar32 ch_t = 0x74; // 't'
|
|
static const uchar32 ch_u = 0x75; // 'u'
|
|
static const uchar32 ch_v = 0x76; // 'v'
|
|
static const uchar32 ch_w = 0x77; // 'w'
|
|
static const uchar32 ch_x = 0x78; // 'x'
|
|
static const uchar32 ch_z = 0x7a; // 'z'
|
|
}
|
|
// char_alnum
|
|
|
|
namespace char_other
|
|
{
|
|
static const uchar32 co_sp = 0x20; // ' '
|
|
static const uchar32 co_perc = 0x25; // '%'
|
|
static const uchar32 co_amp = 0x26; // '&'
|
|
static const uchar32 co_apos = 0x27; // '\''
|
|
static const uchar32 co_slash = 0x2f; // '/'
|
|
static const uchar32 co_smcln = 0x3b; // ';'
|
|
static const uchar32 co_atmrk = 0x40; // '@'
|
|
static const uchar32 co_ll = 0x5f; // '_'
|
|
static const uchar32 co_grav = 0x60; // '`'
|
|
static const uchar32 co_tilde = 0x7e; // '~'
|
|
}
|
|
// char_other
|
|
}
|
|
// namespace regex_internal
|
|
|
|
// ... "rei_constants.h"]
|
|
// ["rei_utf_traits.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
template <typename charT>
|
|
struct utf_traits_core
|
|
{
|
|
public:
|
|
|
|
static const std::size_t maxseqlen = 1;
|
|
static const int utftype = 0;
|
|
|
|
static const std::size_t bitsetsize = 0x100;
|
|
static const uchar32 bitsetmask = 0xff;
|
|
static const uchar32 cumask = 0xff;
|
|
|
|
// *iter
|
|
template <typename ForwardIterator>
|
|
static uchar32 codepoint(ForwardIterator begin, const ForwardIterator /* end */)
|
|
{
|
|
return static_cast<uchar32>(*begin);
|
|
// Caller is responsible for begin != end.
|
|
}
|
|
|
|
// *iter++
|
|
template <typename ForwardIterator>
|
|
static uchar32 codepoint_inc(ForwardIterator &begin, const ForwardIterator /* end */)
|
|
{
|
|
return static_cast<uchar32>(*begin++);
|
|
// Caller is responsible for begin != end.
|
|
}
|
|
|
|
// iter2 = iter; return *--iter2;
|
|
template <typename BidirectionalIterator>
|
|
static uchar32 prevcodepoint(BidirectionalIterator cur, const BidirectionalIterator /* begin */)
|
|
{
|
|
return static_cast<uchar32>(*--cur);
|
|
}
|
|
|
|
// *--iter
|
|
template <typename BidirectionalIterator>
|
|
static uchar32 dec_codepoint(BidirectionalIterator &cur, const BidirectionalIterator /* begin */)
|
|
{
|
|
return static_cast<uchar32>(*--cur);
|
|
// Caller is responsible for cur != begin.
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
|
|
template <typename charT2>
|
|
static bool is_trailing(const charT2 /* cu */)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
#endif // !defined(SRELLDBG_NO_BMH)
|
|
|
|
static uchar32 to_codeunits(charT out[maxseqlen], uchar32 cp)
|
|
{
|
|
out[0] = static_cast<charT>(cp);
|
|
return 1;
|
|
}
|
|
|
|
static uchar32 firstcodeunit(const uchar32 cp)
|
|
{
|
|
return cp;
|
|
}
|
|
|
|
template <typename ForwardIterator>
|
|
static bool seek_charboundary(ForwardIterator &begin, const ForwardIterator end)
|
|
{
|
|
return begin != end;
|
|
}
|
|
};
|
|
template <typename charT>
|
|
const std::size_t utf_traits_core<charT>::maxseqlen;
|
|
template <typename charT>
|
|
const int utf_traits_core<charT>::utftype;
|
|
template <typename charT>
|
|
const std::size_t utf_traits_core<charT>::bitsetsize;
|
|
template <typename charT>
|
|
const uchar32 utf_traits_core<charT>::bitsetmask;
|
|
template <typename charT>
|
|
const uchar32 utf_traits_core<charT>::cumask;
|
|
// utf_traits_core
|
|
|
|
// common and utf-32.
|
|
template <typename charT>
|
|
struct utf_traits : public utf_traits_core<charT>
|
|
{
|
|
static const int utftype = 32;
|
|
|
|
static const std::size_t bitsetsize = 0x10000;
|
|
static const uchar32 bitsetmask = 0xffff;
|
|
static const uchar32 cumask = 0x1fffff;
|
|
};
|
|
template <typename charT>
|
|
const int utf_traits<charT>::utftype;
|
|
template <typename charT>
|
|
const std::size_t utf_traits<charT>::bitsetsize;
|
|
template <typename charT>
|
|
const uchar32 utf_traits<charT>::bitsetmask;
|
|
template <typename charT>
|
|
const uchar32 utf_traits<charT>::cumask;
|
|
// utf_traits
|
|
|
|
// utf-8 specific.
|
|
template <typename charT>
|
|
struct utf8_traits : public utf_traits_core<charT>
|
|
{
|
|
public:
|
|
|
|
// utf-8 specific.
|
|
static const std::size_t maxseqlen = 4;
|
|
static const int utftype = 8;
|
|
|
|
template <typename ForwardIterator>
|
|
static uchar32 codepoint(ForwardIterator begin, const ForwardIterator end)
|
|
{
|
|
// return codepoint_inc(begin, end);
|
|
|
|
uchar32 codepoint = static_cast<uchar32>(*begin & 0xff);
|
|
|
|
if ((codepoint & 0x80) == 0) // 1 octet.
|
|
return codepoint;
|
|
|
|
if (++begin != end && (codepoint >= 0xc0 && codepoint <= 0xf7) && (*begin & 0xc0) == 0x80)
|
|
{
|
|
codepoint = static_cast<uchar32>((codepoint << 6) | (*begin & 0x3f));
|
|
|
|
if ((codepoint & 0x800) == 0) // 2 octets.
|
|
return static_cast<uchar32>(codepoint & 0x7ff);
|
|
|
|
if (++begin != end && (*begin & 0xc0) == 0x80)
|
|
{
|
|
codepoint = static_cast<uchar32>((codepoint << 6) | (*begin & 0x3f));
|
|
|
|
if ((codepoint & 0x10000) == 0) // 3 octets.
|
|
return static_cast<uchar32>(codepoint & 0xffff);
|
|
|
|
if (++begin != end && (*begin & 0xc0) == 0x80) // 4 octets.
|
|
{
|
|
codepoint = static_cast<uchar32>((codepoint << 6) | (*begin & 0x3f));
|
|
|
|
return static_cast<uchar32>(codepoint & 0x1fffff);
|
|
}
|
|
}
|
|
}
|
|
// else // 80-bf, f8-ff: invalid.
|
|
|
|
return regex_internal::constants::invalid_u32value;
|
|
}
|
|
|
|
template <typename ForwardIterator>
|
|
static uchar32 codepoint_inc(ForwardIterator &begin, const ForwardIterator end)
|
|
{
|
|
uchar32 codepoint = static_cast<uchar32>(*begin++ & 0xff);
|
|
|
|
if ((codepoint & 0x80) == 0) // 1 octet.
|
|
return codepoint;
|
|
|
|
// Expects transformation to (codepoint - 0xc0) <= 0x37 by optimisation.
|
|
// 0xF7 instead of 0xF4 is for consistency with reverse iterators.
|
|
if (begin != end && (codepoint >= 0xc0 && codepoint <= 0xf7) && (*begin & 0xc0) == 0x80)
|
|
// if (begin != end && (0x7f00 & (1 << ((codepoint >> 3) & 0xf))) && (*begin & 0xc0) == 0x80) // c0, c8, d0, d8, e0, e8, f0.
|
|
{
|
|
codepoint = static_cast<uchar32>((codepoint << 6) | (*begin++ & 0x3f));
|
|
|
|
// 11 ?aaa aabb bbbb
|
|
if ((codepoint & 0x800) == 0) // 2 octets.
|
|
return static_cast<uchar32>(codepoint & 0x7ff);
|
|
// c080-c1bf: invalid. 00-7F.
|
|
// c280-dfbf: valid. 080-7FF.
|
|
|
|
// 11 1aaa aabb bbbb
|
|
if (begin != end && (*begin & 0xc0) == 0x80)
|
|
{
|
|
codepoint = static_cast<uchar32>((codepoint << 6) | (*begin++ & 0x3f));
|
|
|
|
// 111? aaaa bbbb bbcc cccc
|
|
if ((codepoint & 0x10000) == 0) // 3 octets.
|
|
return static_cast<uchar32>(codepoint & 0xffff);
|
|
// e08080-e09fbf: invalid. 000-7FF.
|
|
// e0a080-efbfbf: valid. 0800-FFFF.
|
|
|
|
// 1111 0aaa bbbb bbcc cccc
|
|
if (begin != end && (*begin & 0xc0) == 0x80) // 4 octets.
|
|
{
|
|
codepoint = static_cast<uchar32>((codepoint << 6) | (*begin++ & 0x3f));
|
|
// f0808080-f08fbfbf: invalid. 0000-FFFF.
|
|
// f0908080-f3bfbfbf: valid. 10000-FFFFF.
|
|
// f4808080-f48fbfbf: valid. 100000-10FFFF.
|
|
// f4908080-f4bfbfbf: invalid. 110000-13FFFF.
|
|
// f5808080-f7bfbfbf: invalid. 140000-1FFFFF.
|
|
|
|
// 11 110a aabb bbbb cccc ccdd dddd
|
|
return static_cast<uchar32>(codepoint & 0x1fffff);
|
|
}
|
|
}
|
|
}
|
|
// else // 80-bf, f8-ff: invalid.
|
|
|
|
return regex_internal::constants::invalid_u32value;
|
|
}
|
|
|
|
template <typename BidirectionalIterator>
|
|
static uchar32 prevcodepoint(BidirectionalIterator cur, const BidirectionalIterator begin)
|
|
{
|
|
uchar32 codepoint = static_cast<uchar32>(*--cur);
|
|
|
|
if ((codepoint & 0x80) == 0)
|
|
return static_cast<uchar32>(codepoint & 0xff);
|
|
|
|
if ((codepoint & 0x40) == 0 && cur != begin)
|
|
{
|
|
codepoint = static_cast<uchar32>((codepoint & 0x3f) | (*--cur << 6));
|
|
|
|
if ((codepoint & 0x3800) == 0x3000) // 2 octets.
|
|
return static_cast<uchar32>(codepoint & 0x7ff);
|
|
|
|
if ((codepoint & 0x3000) == 0x2000 && cur != begin)
|
|
{
|
|
codepoint = static_cast<uchar32>((codepoint & 0xfff) | (*--cur << 12));
|
|
|
|
if ((codepoint & 0xf0000) == 0xe0000) // 3 octets.
|
|
return static_cast<uchar32>(codepoint & 0xffff);
|
|
|
|
if ((codepoint & 0xc0000) == 0x80000 && cur != begin)
|
|
{
|
|
if ((*--cur & 0xf8) == 0xf0) // 4 octets.
|
|
return static_cast<uchar32>((codepoint & 0x3ffff) | ((*cur & 7) << 18));
|
|
}
|
|
}
|
|
}
|
|
return regex_internal::constants::invalid_u32value;
|
|
}
|
|
|
|
template <typename BidirectionalIterator>
|
|
static uchar32 dec_codepoint(BidirectionalIterator &cur, const BidirectionalIterator begin)
|
|
{
|
|
uchar32 codepoint = static_cast<uchar32>(*--cur);
|
|
|
|
if ((codepoint & 0x80) == 0)
|
|
return static_cast<uchar32>(codepoint & 0xff);
|
|
|
|
if ((codepoint & 0x40) == 0 && cur != begin)
|
|
{
|
|
codepoint = static_cast<uchar32>((codepoint & 0x3f) | (*--cur << 6));
|
|
|
|
// 11 0bbb bbaa aaaa?
|
|
if ((codepoint & 0x3800) == 0x3000) // 2 octets.
|
|
// if ((*cur & 0xe0) == 0xc0)
|
|
return static_cast<uchar32>(codepoint & 0x7ff);
|
|
|
|
// 10 bbbb bbaa aaaa?
|
|
if ((codepoint & 0x3000) == 0x2000 && cur != begin) // [\x80-\xbf]{2}.
|
|
// if ((*cur & 0xc0) == 0x80 && cur != begin)
|
|
{
|
|
codepoint = static_cast<uchar32>((codepoint & 0xfff) | (*--cur << 12));
|
|
|
|
// 1110 cccc bbbb bbaa aaaa?
|
|
if ((codepoint & 0xf0000) == 0xe0000) // 3 octets.
|
|
// if ((*cur & 0xf0) == 0xe0)
|
|
return static_cast<uchar32>(codepoint & 0xffff);
|
|
|
|
// 10cc cccc bbbb bbaa aaaa?
|
|
if ((codepoint & 0xc0000) == 0x80000 && cur != begin) // [\x80-\xbf]{3}.
|
|
// if ((*cur & 0xc0) == 0x80 && cur != begin)
|
|
{
|
|
if ((*--cur & 0xf8) == 0xf0) // 4 octets.
|
|
return static_cast<uchar32>((codepoint & 0x3ffff) | ((*cur & 7) << 18));
|
|
// d ddcc cccc bbbb bbaa aaaa
|
|
//else // [\0-\xef\xf8-\xff][\x80-\xbf]{3}.
|
|
|
|
// Sequences [\xc0-\xdf][\x80-\xbf] and [\xe0-\xef][\x80-\xbf]{2} are valid.
|
|
// To give a chance to them, rewinds cur.
|
|
++cur;
|
|
}
|
|
//else // [\0-\x7f\xc0-\xdf\xf0-\xff][\x80-\xbf]{2}.
|
|
++cur; // Sequence [\xc0-\xdf][\x80-\xbf] is valid. Rewinds to give a chance to it.
|
|
}
|
|
//else // [\0-\x7f\xe0-\xff][\x80-\xbf].
|
|
++cur; // Rewinds to give a chance to [\0-\x7f].
|
|
}
|
|
//else // [\xc0-\xff].
|
|
|
|
return regex_internal::constants::invalid_u32value;
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
|
|
template <typename charT2>
|
|
static bool is_trailing(const charT2 cu)
|
|
{
|
|
return (cu & 0xc0) == 0x80;
|
|
}
|
|
|
|
#endif // !defined(SRELLDBG_NO_BMH)
|
|
|
|
static uchar32 to_codeunits(charT out[maxseqlen], uchar32 cp)
|
|
{
|
|
if (cp < 0x80)
|
|
{
|
|
out[0] = static_cast<charT>(cp);
|
|
return 1;
|
|
}
|
|
else if (cp < 0x800)
|
|
{
|
|
out[0] = static_cast<charT>(((cp >> 6) & 0x1f) | 0xc0);
|
|
out[1] = static_cast<charT>((cp & 0x3f) | 0x80);
|
|
return 2;
|
|
}
|
|
else if (cp < 0x10000)
|
|
{
|
|
out[0] = static_cast<charT>(((cp >> 12) & 0x0f) | 0xe0);
|
|
out[1] = static_cast<charT>(((cp >> 6) & 0x3f) | 0x80);
|
|
out[2] = static_cast<charT>((cp & 0x3f) | 0x80);
|
|
return 3;
|
|
}
|
|
// else // if (cp < 0x110000)
|
|
{
|
|
out[0] = static_cast<charT>(((cp >> 18) & 0x07) | 0xf0);
|
|
out[1] = static_cast<charT>(((cp >> 12) & 0x3f) | 0x80);
|
|
out[2] = static_cast<charT>(((cp >> 6) & 0x3f) | 0x80);
|
|
out[3] = static_cast<charT>((cp & 0x3f) | 0x80);
|
|
return 4;
|
|
}
|
|
}
|
|
|
|
static uchar32 firstcodeunit(const uchar32 cp)
|
|
{
|
|
if (cp < 0x80)
|
|
return cp;
|
|
|
|
if (cp < 0x800)
|
|
return static_cast<uchar32>(((cp >> 6) & 0x1f) | 0xc0);
|
|
|
|
if (cp < 0x10000)
|
|
return static_cast<uchar32>(((cp >> 12) & 0x0f) | 0xe0);
|
|
|
|
return static_cast<uchar32>(((cp >> 18) & 0x07) | 0xf0);
|
|
}
|
|
|
|
template <typename ForwardIterator>
|
|
static bool seek_charboundary(ForwardIterator &begin, const ForwardIterator end)
|
|
{
|
|
for (; begin != end; ++begin)
|
|
{
|
|
// if ((*begin & 0xc0) != 0x80 && (*begin & 0xf8) != 0xf8) // 00-7f, c0-f7.
|
|
if ((*begin & 0xc0) != 0x80) // 00-7f, c0-ff.
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
};
|
|
template <typename charT>
|
|
const std::size_t utf8_traits<charT>::maxseqlen;
|
|
template <typename charT>
|
|
const int utf8_traits<charT>::utftype;
|
|
// utf8_traits
|
|
|
|
// utf-16 specific.
|
|
template <typename charT>
|
|
struct utf16_traits : public utf_traits_core<charT>
|
|
{
|
|
public:
|
|
|
|
// utf-16 specific.
|
|
static const std::size_t maxseqlen = 2;
|
|
static const int utftype = 16;
|
|
|
|
static const std::size_t bitsetsize = 0x10000;
|
|
static const uchar32 bitsetmask = 0xffff;
|
|
static const uchar32 cumask = 0xffff;
|
|
|
|
template <typename ForwardIterator>
|
|
static uchar32 codepoint(ForwardIterator begin, const ForwardIterator end)
|
|
{
|
|
const uchar32 codeunit = *begin;
|
|
|
|
if ((codeunit & 0xdc00) != 0xd800)
|
|
return static_cast<uchar32>(codeunit & 0xffff);
|
|
|
|
if (++begin != end && (*begin & 0xdc00) == 0xdc00)
|
|
return static_cast<uchar32>((((codeunit & 0x3ff) << 10) | (*begin & 0x3ff)) + 0x10000);
|
|
|
|
return static_cast<uchar32>(codeunit & 0xffff);
|
|
}
|
|
|
|
template <typename ForwardIterator>
|
|
static uchar32 codepoint_inc(ForwardIterator &begin, const ForwardIterator end)
|
|
{
|
|
const uchar32 codeunit = *begin++;
|
|
|
|
if ((codeunit & 0xdc00) != 0xd800)
|
|
return static_cast<uchar32>(codeunit & 0xffff);
|
|
|
|
if (begin != end && (*begin & 0xdc00) == 0xdc00)
|
|
return static_cast<uchar32>((((codeunit & 0x3ff) << 10) | (*begin++ & 0x3ff)) + 0x10000);
|
|
|
|
return static_cast<uchar32>(codeunit & 0xffff);
|
|
}
|
|
|
|
template <typename BidirectionalIterator>
|
|
static uchar32 prevcodepoint(BidirectionalIterator cur, const BidirectionalIterator begin)
|
|
{
|
|
const uchar32 codeunit = *--cur;
|
|
|
|
if ((codeunit & 0xdc00) != 0xdc00 || cur == begin)
|
|
return static_cast<uchar32>(codeunit & 0xffff);
|
|
|
|
if ((*--cur & 0xdc00) == 0xd800)
|
|
return static_cast<uchar32>((((*cur & 0x3ff) << 10) | (codeunit & 0x3ff)) + 0x10000);
|
|
|
|
return static_cast<uchar32>(codeunit & 0xffff);
|
|
}
|
|
|
|
template <typename BidirectionalIterator>
|
|
static uchar32 dec_codepoint(BidirectionalIterator &cur, const BidirectionalIterator begin)
|
|
{
|
|
const uchar32 codeunit = *--cur;
|
|
|
|
if ((codeunit & 0xdc00) != 0xdc00 || cur == begin)
|
|
return static_cast<uchar32>(codeunit & 0xffff);
|
|
|
|
if ((*--cur & 0xdc00) == 0xd800)
|
|
return static_cast<uchar32>((((*cur & 0x3ff) << 10) | (codeunit & 0x3ff)) + 0x10000);
|
|
//else // (codeunit & 0xdc00) == 0xdc00 && (*cur & 0xdc00) != 0xd800
|
|
|
|
++cur;
|
|
|
|
return static_cast<uchar32>(codeunit & 0xffff);
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
|
|
template <typename charT2>
|
|
static bool is_trailing(const charT2 cu)
|
|
{
|
|
return (cu & 0xdc00) == 0xdc00;
|
|
}
|
|
|
|
#endif // !defined(SRELLDBG_NO_BMH)
|
|
|
|
static uchar32 to_codeunits(charT out[maxseqlen], uchar32 cp)
|
|
{
|
|
if (cp < 0x10000)
|
|
{
|
|
out[0] = static_cast<charT>(cp);
|
|
return 1;
|
|
}
|
|
// else // if (cp < 0x110000)
|
|
{
|
|
cp -= 0x10000;
|
|
out[0] = static_cast<charT>(((cp >> 10) & 0x3ff) | 0xd800);
|
|
out[1] = static_cast<charT>((cp & 0x3ff) | 0xdc00);
|
|
return 2;
|
|
}
|
|
}
|
|
|
|
static uchar32 firstcodeunit(const uchar32 cp)
|
|
{
|
|
if (cp < 0x10000)
|
|
return cp;
|
|
|
|
return static_cast<uchar32>((cp >> 10) + 0xd7c0);
|
|
// aaaaa bbbbcccc ddddeeee -> AA AAbb bbcc/cc dddd eeee where AAAA = aaaaa - 1.
|
|
}
|
|
|
|
template <typename ForwardIterator>
|
|
static bool seek_charboundary(ForwardIterator &begin, const ForwardIterator end)
|
|
{
|
|
for (; begin != end; ++begin)
|
|
{
|
|
if ((*begin & 0xdc00) != 0xdc00)
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
};
|
|
template <typename charT>
|
|
const std::size_t utf16_traits<charT>::maxseqlen;
|
|
template <typename charT>
|
|
const int utf16_traits<charT>::utftype;
|
|
template <typename charT>
|
|
const std::size_t utf16_traits<charT>::bitsetsize;
|
|
template <typename charT>
|
|
const uchar32 utf16_traits<charT>::bitsetmask;
|
|
template <typename charT>
|
|
const uchar32 utf16_traits<charT>::cumask;
|
|
// utf16_traits
|
|
|
|
// specialisation for char.
|
|
template <>
|
|
struct utf_traits<char> : public utf_traits_core<char>
|
|
{
|
|
public:
|
|
|
|
template <typename ForwardIterator>
|
|
static uchar32 codepoint(ForwardIterator begin, const ForwardIterator /* end */)
|
|
{
|
|
return static_cast<uchar32>(static_cast<unsigned char>(*begin));
|
|
}
|
|
|
|
template <typename ForwardIterator>
|
|
static uchar32 codepoint_inc(ForwardIterator &begin, const ForwardIterator /* end */)
|
|
{
|
|
return static_cast<uchar32>(static_cast<unsigned char>(*begin++));
|
|
}
|
|
|
|
template <typename BidirectionalIterator>
|
|
static uchar32 prevcodepoint(BidirectionalIterator cur, const BidirectionalIterator /* begin */)
|
|
{
|
|
return static_cast<uchar32>(static_cast<unsigned char>(*--cur));
|
|
}
|
|
|
|
template <typename BidirectionalIterator>
|
|
static uchar32 dec_codepoint(BidirectionalIterator &cur, const BidirectionalIterator /* begin */)
|
|
{
|
|
return static_cast<uchar32>(static_cast<unsigned char>(*--cur));
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
#endif // !defined(SRELLDBG_NO_BMH)
|
|
}; // utf_traits<char>
|
|
|
|
// specialisation for signed char.
|
|
template <>
|
|
struct utf_traits<signed char> : public utf_traits<char>
|
|
{
|
|
};
|
|
|
|
// (signed) short, (signed) int, (signed) long, (signed) long long, ...
|
|
|
|
#if defined(SRELL_CPP11_CHAR1632_ENABLED)
|
|
template <>
|
|
struct utf_traits<char16_t> : public utf16_traits<char16_t>
|
|
{
|
|
};
|
|
#endif
|
|
|
|
#if defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
template <>
|
|
struct utf_traits<char8_t> : public utf8_traits<char8_t>
|
|
{
|
|
};
|
|
#endif
|
|
|
|
} // regex_internal
|
|
|
|
// ... "rei_utf_traits.hpp"]
|
|
// ["regex_traits.hpp" ...
|
|
|
|
// 28.7, class template regex_traits:
|
|
template <class charT>
|
|
struct regex_traits
|
|
{
|
|
public:
|
|
|
|
typedef charT char_type;
|
|
typedef std::basic_string<char_type> string_type;
|
|
typedef std::locale locale_type;
|
|
// typedef bitmask_type char_class_type;
|
|
typedef int char_class_type;
|
|
|
|
typedef regex_internal::utf_traits<charT> utf_traits;
|
|
|
|
public:
|
|
|
|
// regex_traits();
|
|
|
|
static std::size_t length(const char_type *p)
|
|
{
|
|
return std::char_traits<charT>::length(p);
|
|
}
|
|
|
|
charT translate(const charT c) const
|
|
{
|
|
return c;
|
|
}
|
|
|
|
charT translate_nocase(const charT c) const
|
|
{
|
|
return c;
|
|
}
|
|
|
|
template <class ForwardIterator>
|
|
string_type transform(ForwardIterator first, ForwardIterator last) const
|
|
{
|
|
return string_type(first, last);
|
|
}
|
|
|
|
template <class ForwardIterator>
|
|
string_type transform_primary(ForwardIterator first, ForwardIterator last) const
|
|
{
|
|
return string_type(first, last);
|
|
}
|
|
|
|
template <class ForwardIterator>
|
|
string_type lookup_collatename(ForwardIterator first, ForwardIterator last) const
|
|
{
|
|
return string_type(first, last);
|
|
}
|
|
|
|
template <class ForwardIterator>
|
|
char_class_type lookup_classname(ForwardIterator /* first */, ForwardIterator /* last */, bool /* icase */ = false) const
|
|
{
|
|
return static_cast<char_class_type>(0);
|
|
}
|
|
|
|
bool isctype(const charT /* c */, const char_class_type /* f */) const
|
|
{
|
|
return false;
|
|
}
|
|
|
|
int value(const charT /* ch */, const int /* radix */) const
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
locale_type imbue(const locale_type /* l */)
|
|
{
|
|
return locale_type();
|
|
}
|
|
|
|
locale_type getloc() const
|
|
{
|
|
return locale_type();
|
|
}
|
|
}; // regex_traits
|
|
|
|
template <class charT>
|
|
struct u8regex_traits : public regex_traits<charT>
|
|
{
|
|
typedef regex_internal::utf8_traits<charT> utf_traits;
|
|
};
|
|
|
|
template <class charT>
|
|
struct u16regex_traits : public regex_traits<charT>
|
|
{
|
|
typedef regex_internal::utf16_traits<charT> utf_traits;
|
|
};
|
|
|
|
// ... "regex_traits.hpp"]
|
|
// ["rei_memory.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
/*
|
|
* Similar to std::basic_string, except for:
|
|
* a. only allocates memory, does not initialise it.
|
|
* b. uses realloc() to avoid moving data as much as possible when
|
|
* resizing an allocated buffer.
|
|
*/
|
|
// Growable array on top of malloc()/realloc()/free().
//
// Storage is raw memory: growing does not initialise new elements, elements
// are shifted with memmove(), and no constructor or destructor of ElemT is
// ever run. Indexing and pos/len arguments are not bounds-checked unless a
// member says otherwise. Allocation failures are reported by throwing
// std::bad_alloc.
template <typename ElemT>
class simple_array
{
public:

    typedef ElemT value_type;
    typedef std::size_t size_type;
    typedef ElemT &reference;
    typedef const ElemT &const_reference;
    typedef ElemT *pointer;
    typedef const ElemT *const_pointer;

    // "Not found" result of find(); also used as "up to the end" length.
    static const size_type npos = static_cast<size_type>(-1);

public:

    // Creates an empty array without allocating.
    simple_array()
        : buffer_(NULL)
        , size_(0)
        , capacity_(0)
    {
    }

    // Creates an array of initsize elements with uninitialised contents.
    simple_array(const size_type initsize)
        : buffer_(NULL)
        , size_(0)
        , capacity_(0)
    {
        if (initsize)
        {
            buffer_ = static_cast<pointer>(std::malloc(initsize * sizeof (ElemT)));

            if (buffer_ != NULL)
                size_ = capacity_ = initsize;
            else
                throw std::bad_alloc();
        }
    }

    // Copies up to len elements of right starting at pos. Both pos and len
    // are clamped to the size of right.
    simple_array(const simple_array &right, size_type pos, size_type len = npos)
        : buffer_(NULL)
        , size_(0)
        , capacity_(0)
    {
        if (pos > right.size_)
            pos = right.size_;

        {
            const size_type len2 = right.size_ - pos;
            if (len > len2)
                len = len2;
        }

        if (len)
        {
            buffer_ = static_cast<pointer>(std::malloc(len * sizeof (ElemT)));

            if (buffer_ != NULL)
            {
                for (capacity_ = len; size_ < capacity_;)
                    buffer_[size_++] = right[pos++];
            }
            else
            {
                throw std::bad_alloc();
            }
        }
    }

    // Copy constructor; implemented through copy assignment.
    simple_array(const simple_array &right)
        : buffer_(NULL)
        , size_(0)
        , capacity_(0)
    {
        operator=(right);
    }

#if defined(SRELL_CPP11_MOVE_ENABLED)
    // Takes over the buffer of right and leaves right empty.
    simple_array(simple_array &&right) SRELL_NOEXCEPT
        : buffer_(right.buffer_)
        , size_(right.size_)
        , capacity_(right.capacity_)
    {
        right.size_ = 0;
        right.capacity_ = 0;
        right.buffer_ = NULL;
    }
#endif

    // Makes *this an element-wise copy of right.
    simple_array &operator=(const simple_array &right)
    {
        if (this != &right)
        {
            resize(right.size_);
            for (size_type i = 0; i < right.size_; ++i)
                buffer_[i] = right.buffer_[i];
        }
        return *this;
    }

#if defined(SRELL_CPP11_MOVE_ENABLED)
    // Frees the current buffer, takes over that of right and leaves right empty.
    simple_array &operator=(simple_array &&right) SRELL_NOEXCEPT
    {
        if (this != &right)
        {
            if (this->buffer_ != NULL)
                std::free(this->buffer_);

            this->size_ = right.size_;
            this->capacity_ = right.capacity_;
            this->buffer_ = right.buffer_;

            right.size_ = 0;
            right.capacity_ = 0;
            right.buffer_ = NULL;
        }
        return *this;
    }
#endif

    ~simple_array()
    {
        if (buffer_ != NULL)
            std::free(buffer_);
    }

    // Number of elements currently held.
    size_type size() const
    {
        return size_;
    }

    // Sets the size to zero; the buffer and its capacity are kept.
    void clear()
    {
        size_ = 0;
    }

    // Changes the size to newsize, growing the buffer when needed. Elements
    // gained by growing are left uninitialised.
    void resize(const size_type newsize)
    {
        if (newsize > capacity_)
            reserve(newsize);

        size_ = newsize;
    }

    // Like resize(newsize), but elements gained by growing are set to type.
    void resize(const size_type newsize, const ElemT &type)
    {
        size_type oldsize = size_;

        resize(newsize);
        for (; oldsize < size_; ++oldsize)
            buffer_[oldsize] = type;
    }

    reference operator[](const size_type pos)
    {
        return buffer_[pos];
    }

    const_reference operator[](const size_type pos) const
    {
        return buffer_[pos];
    }

    // Appends n. n is read after the buffer may have been reallocated, so
    // use push_backncr() when n could refer to an element of this array.
    void push_back(const_reference n)
    {
        const size_type oldsize = size_;

        if (++size_ > capacity_)
            reserve(size_);

        buffer_[oldsize] = n;
    }

    // push_back() taking its argument by value instead of by reference.
    void push_backncr(const ElemT e)
    {
        push_back(e);
    }

    // Last element; the array must not be empty.
    const_reference back() const
    {
        return buffer_[size_ - 1];
    }

    reference back()
    {
        return buffer_[size_ - 1];
    }

    // Drops the last element; the array must not be empty.
    void pop_back()
    {
        --size_;
    }

    simple_array &operator+=(const simple_array &right)
    {
        return append(right);
    }

    // Appends size copies of type.
    simple_array &append(const size_type size, const ElemT &type)
    {
        resize(size_ + size, type);
        return *this;
    }

    // Appends all elements of right.
    simple_array &append(const simple_array &right)
    {
        size_type oldsize = size_;

        resize(size_ + right.size_);
        for (size_type i = 0; i < right.size_; ++i, ++oldsize)
            buffer_[oldsize] = right.buffer_[i];

        return *this;
    }

    // Appends up to len elements of right starting at pos. len is clamped to
    // what is available; pos itself is not checked.
    simple_array &append(const simple_array &right, size_type pos, size_type len /* = npos */)
    {
        {
            const size_type len2 = right.size_ - pos;
            if (len > len2)
                len = len2;
        }

        size_type oldsize = size_;

        resize(size_ + len);
        len += pos; // end.
        for (; pos < len; ++oldsize, ++pos)
            buffer_[oldsize] = right.buffer_[pos];

        return *this;
    }

    // Removes the element at pos; does nothing when pos is out of range.
    void erase(const size_type pos)
    {
        if (pos < size_)
        {
            std::memmove(buffer_ + pos, buffer_ + pos + 1, (size_ - pos - 1) * sizeof (ElemT));
            --size_;
        }
    }

    // Removes len elements starting at pos, or everything from pos on when
    // fewer than len elements follow. Does nothing when pos is out of range.
    void erase(const size_type pos, const size_type len)
    {
        if (pos < size_)
        {
            size_type rmndr = size_ - pos;

            if (rmndr > len)
            {
                rmndr -= len;
                std::memmove(buffer_ + pos, buffer_ + pos + len, rmndr * sizeof (ElemT));
                size_ -= len;
            }
            else
                size_ = pos;
        }
    }

    // For rei_compiler class.
    // Inserts type before the element at pos.
    void insert(const size_type pos, const ElemT &type)
    {
        move_forward(pos, 1);
        buffer_[pos] = type;
    }

    // Inserts all elements of right before the element at pos.
    void insert(size_type pos, const simple_array &right)
    {
        move_forward(pos, right.size_);
        for (size_type i = 0; i < right.size_; ++i, ++pos)
            buffer_[pos] = right.buffer_[i];
    }

    // Inserts up to srclen elements of right, starting at srcpos, before the
    // element at destpos. srclen is clamped to what is available.
    void insert(size_type destpos, const simple_array &right, size_type srcpos, size_type srclen = npos)
    {
        {
            const size_type len2 = right.size_ - srcpos;
            if (srclen > len2)
                srclen = len2;
        }

        move_forward(destpos, srclen);
        srclen += srcpos;   // srcend.
        for (; srcpos < srclen; ++destpos, ++srcpos)
            buffer_[destpos] = right.buffer_[srcpos];
    }

    // Replaces the count elements starting at pos with the contents of right.
    simple_array &replace(size_type pos, size_type count, const simple_array &right)
    {
        if (count < right.size_)
            move_forward(pos + count, right.size_ - count);
        else if (count > right.size_)
        {
            const pointer base = buffer_ + pos;

            std::memmove(base + right.size_, base + count, (size_ - pos - count) * sizeof (ElemT));
            size_ -= count - right.size_;
        }

        for (size_type i = 0; i < right.size_; ++pos, ++i)
            buffer_[pos] = right[i];

        return *this;
    }

    // Returns the index of the first element equal to c at or after pos, or
    // npos when there is none.
    size_type find(const value_type c, size_type pos = 0) const
    {
        // Strictly less than size_: buffer_[size_] is past the last element
        // (and buffer_ is NULL while nothing has been allocated).
        for (; pos < size_; ++pos)
            if (buffer_[pos] == c)
                return pos;

        return npos;
    }

    // Pointer to the first element; NULL when nothing has been allocated.
    const_pointer data() const
    {
        return buffer_;
    }

    // Compares count1 elements starting at pos with the count2 elements at p.
    // Returns -1, 0 or 1; when one is a prefix of the other, the shorter one
    // orders first.
    int compare(size_type pos, const size_type count1, const_pointer p, const size_type count2) const
    {
        size_type count = count1 <= count2 ? count1 : count2;

        for (; count; ++pos, ++p, --count)
        {
            const value_type &v = buffer_[pos];
            if (v != *p)
                return v < *p ? -1 : 1;
        }
        return count1 == count2 ? 0 : (count1 < count2 ? -1 : 1);
    }

    // Largest number of elements this array accepts.
    size_type max_size() const
    {
        return maxsize_;
    }

    // Exchanges buffers, sizes and capacities with right.
    void swap(simple_array &right)
    {
        if (this != &right)
        {
            const pointer tmpbuffer = this->buffer_;
            const size_type tmpsize = this->size_;
            const size_type tmpcapacity = this->capacity_;

            this->buffer_ = right.buffer_;
            this->size_ = right.size_;
            this->capacity_ = right.capacity_;

            right.buffer_ = tmpbuffer;
            right.size_ = tmpsize;
            right.capacity_ = tmpcapacity;
        }
    }

private:

    // Reallocates the buffer so that it can hold at least newsize elements.
    // Callers only invoke this when newsize exceeds capacity_. Throws
    // std::bad_alloc when newsize is above maxsize_ or realloc() fails; in
    // the latter case the old buffer is released and the array becomes empty.
    void reserve(const size_type newsize)
    {
//      if (newsize > capacity_)
        {
            if (newsize <= maxsize_)
            {
//              capacity_ = newsize + (newsize >> 1);   // newsize * 1.5.
                capacity_ = ((newsize >> 8) + 1) << 8;  // Smallest multiple of 256 above newsize.

                if (capacity_ > maxsize_)
                    capacity_ = maxsize_;

                const size_type newsize_in_byte = capacity_ * sizeof (ElemT);
                const pointer oldbuffer = buffer_;

                buffer_ = static_cast<pointer>(std::realloc(buffer_, newsize_in_byte));
                if (buffer_ != NULL)
                    return;

                // Even if realloc() failed, already-existing buffer remains valid.
                std::free(oldbuffer);
//              buffer_ = NULL;
                size_ = capacity_ = 0;
            }
            throw std::bad_alloc();
        }
    }

    // Grows the array by count elements and shifts everything from pos on
    // towards the end by count, opening a gap of count elements at pos.
    void move_forward(const size_type pos, const size_type count)
    {
        const size_type oldsize = size_;

        resize(size_ + count);

        if (pos < oldsize)
        {
            const pointer base = buffer_ + pos;

            std::memmove(base + count, base, (oldsize - pos) * sizeof (ElemT));
        }
    }

private:

    pointer buffer_;
    size_type size_;
    size_type capacity_;

//  static const size_type maxsize_ = (npos - sizeof (simple_array)) / sizeof (ElemT);
    static const size_type maxsize_ = (npos - sizeof (pointer) - sizeof (size_type) * 2) / sizeof (ElemT) / 2;
};
template <typename ElemT>
const typename simple_array<ElemT>::size_type simple_array<ElemT>::npos;
|
|
// simple_array
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_memory.hpp"]
|
|
// ["rei_bitset.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
// Always uses a heap instead of the stack.
|
|
// Fixed-size set of Bits bits, stored in a malloc()'ed array of
// unsigned long. Bit positions are not range-checked. The constructors
// throw std::bad_alloc when the allocation fails.
template <const std::size_t Bits>
class bitset
{
private:

    typedef unsigned long array_type;

public:

    // Creates a set with every bit cleared.
    bitset()
        : buffer_(static_cast<array_type *>(std::malloc(size_in_byte_)))
    {
        if (buffer_ != NULL)
        {
            reset();
            return;
        }
        throw std::bad_alloc();
    }

    // Creates a copy of right in a freshly allocated buffer.
    bitset(const bitset &right)
        : buffer_(static_cast<array_type *>(std::malloc(size_in_byte_)))
    {
        if (buffer_ != NULL)
        {
            operator=(right);
            return;
        }
        throw std::bad_alloc();
    }

#if defined(SRELL_CPP11_MOVE_ENABLED)
    // Takes over the buffer of right; right is left without a buffer.
    bitset(bitset &&right) SRELL_NOEXCEPT
        : buffer_(right.buffer_)
    {
        right.buffer_ = NULL;
    }
#endif

    // Copies all bits of right into the existing buffer.
    bitset &operator=(const bitset &right)
    {
        if (this != &right)
        {
//          for (std::size_t i = 0; i < arraylength_; ++i)
//              buffer_[i] = right.buffer_[i];
            std::memcpy(buffer_, right.buffer_, size_in_byte_);
        }
        return *this;
    }

#if defined(SRELL_CPP11_MOVE_ENABLED)
    // Frees the current buffer and takes over that of right.
    bitset &operator=(bitset &&right) SRELL_NOEXCEPT
    {
        if (this != &right)
        {
            if (this->buffer_ != NULL)
                std::free(this->buffer_);

            this->buffer_ = right.buffer_;
            right.buffer_ = NULL;
        }
        return *this;
    }
#endif

    ~bitset()
    {
        if (buffer_ != NULL)
            std::free(buffer_);
    }

    // Clears every bit.
    bitset &reset()
    {
        std::memset(buffer_, 0, size_in_byte_);
        return *this;
    }

    // Clears one bit.
    bitset &reset(const std::size_t bit)
    {
        buffer_[bit / bits_per_elem_] &= ~single_bit(bit);
        return *this;
    }

    // Sets one bit.
    bitset &set(const std::size_t bit)
    {
        buffer_[bit / bits_per_elem_] |= single_bit(bit);
        return *this;
    }

#if 0
    void set_range(const std::size_t firstbit, const std::size_t lastbit)
    {
        const std::size_t lastelemidx = lastbit / bits_per_elem_;
        std::size_t firstelemidx = firstbit / bits_per_elem_;
        const array_type lastelemmask = ~(allbits1_ << ((lastbit & bitmask_) + 1));
        array_type ormask = allbits1_ << (firstbit & bitmask_);

        if (firstelemidx < lastelemidx)
        {
            buffer_[firstelemidx] |= ormask;
            ormask = allbits1_;

            for (++firstelemidx; firstelemidx < lastelemidx; ++firstelemidx)
                buffer_[firstelemidx] |= ormask;
        }
        ormask &= lastelemmask;
        buffer_[lastelemidx] |= ormask;

    }
#endif

    // Returns whether the given bit is set.
    bool test(const std::size_t bit) const
    {
        return (buffer_[bit / bits_per_elem_] & single_bit(bit)) != 0;
    }

    // Same as test().
    bool operator[](const std::size_t bit) const
    {
        return (buffer_[bit / bits_per_elem_] & single_bit(bit)) != 0;
    }

    // Inverts every element of the underlying array.
    bitset<Bits> &flip()
    {
        for (std::size_t i = 0; i < arraylength_; ++i)
            buffer_[i] = ~buffer_[i];
        return *this;
    }

    // Exchanges buffers with right.
    void swap(bitset &right)
    {
        if (this != &right)
        {
            array_type *const tmpbuffer = this->buffer_;
            this->buffer_ = right.buffer_;
            right.buffer_ = tmpbuffer;
        }
    }

private:

    // Mask selecting the position of bit inside its array element. The 1 is
    // widened to array_type before shifting: the offset can be as large as
    // bits_per_elem_ - 1, which an int-typed 1 cannot be shifted by.
    static array_type single_bit(const std::size_t bit)
    {
        return static_cast<array_type>(1) << (bit & bitmask_);
    }

#if defined(__cpp_constexpr)
    // Largest value reachable from p2 by repeated doubling that does not exceed n.
    static constexpr std::size_t pow2leN(const std::size_t n, const std::size_t p2)
    {
        return ((p2 << 1) == 0 || (p2 << 1) > n) ? p2 : pow2leN(n, p2 << 1);
    }
    static const std::size_t bits_per_elem_ = pow2leN(CHAR_BIT * sizeof (array_type), 8);
#else
    static const std::size_t bpe_tmp_ = CHAR_BIT * sizeof (array_type);
    static const std::size_t bits_per_elem_ = bpe_tmp_ >= 64 ? 64 : (bpe_tmp_ >= 32 ? 32 : (bpe_tmp_ >= 16 ? 16 : 8));
#endif
    static const std::size_t bitmask_ = bits_per_elem_ - 1;
    static const std::size_t arraylength_ = (Bits + bitmask_) / bits_per_elem_;
    static const std::size_t size_in_byte_ = arraylength_ * sizeof (array_type);
    static const array_type allbits1_ = ~static_cast<array_type>(0);

    array_type *buffer_;
};
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_bitset.hpp"]
|
|
// ["rei_ucf.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
#if !defined(SRELL_NO_UNICODE_ICASE)

// The contents of the case-folding data header are placed in ucf_constants.
namespace ucf_constants
{

#include "srell_ucfdata2.hpp"

}   // namespace ucf_constants

// Short name for the case-folding data instantiated with uchar32.
namespace ucf_internal
{

typedef ucf_constants::unicode_casefolding<uchar32, uchar32> ucfdata;

}   // namespace ucf_internal
#endif  // !defined(SRELL_NO_UNICODE_ICASE)

// rev_maxset sizes the output buffer of casefoldedcharset(). Without the
// Unicode tables it is 2, the most that function writes in that build.
namespace ucf_constants
{
#if !defined(SRELL_NO_UNICODE_ICASE)
    static const uchar32 rev_maxset = ucf_internal::ucfdata::rev_maxset;
#else
    static const uchar32 rev_maxset = 2;
#endif
}   // namespace ucf_constants
|
|
|
|
class unicode_case_folding
|
|
{
|
|
public:
|
|
|
|
static uchar32 do_casefolding(const uchar32 cp)
|
|
{
|
|
#if !defined(SRELL_NO_UNICODE_ICASE)
|
|
if (cp <= ucf_internal::ucfdata::ucf_maxcodepoint)
|
|
return cp + ucf_internal::ucfdata::ucf_deltatable[ucf_internal::ucfdata::ucf_segmenttable[cp >> 8] + (cp & 0xff)];
|
|
#else
|
|
if (cp >= char_alnum::ch_A && cp <= char_alnum::ch_Z) // 'A' && 'Z'
|
|
return static_cast<uchar32>(cp - char_alnum::ch_A + char_alnum::ch_a); // - 'A' + 'a'
|
|
#endif
|
|
return cp;
|
|
}
|
|
|
|
static uchar32 casefoldedcharset(uchar32 out[ucf_constants::rev_maxset], const uchar32 cp)
|
|
{
|
|
#if !defined(SRELL_NO_UNICODE_ICASE)
|
|
uchar32 count = 0;
|
|
|
|
if (cp <= ucf_internal::ucfdata::rev_maxcodepoint)
|
|
{
|
|
const uchar32 offset_of_charset = ucf_internal::ucfdata::rev_indextable[ucf_internal::ucfdata::rev_segmenttable[cp >> 8] + (cp & 0xff)];
|
|
const uchar32 *ptr = &ucf_internal::ucfdata::rev_charsettable[offset_of_charset];
|
|
|
|
for (; *ptr != cfcharset_eos_ && count < ucf_constants::rev_maxset; ++ptr, ++count)
|
|
out[count] = *ptr;
|
|
}
|
|
if (count == 0)
|
|
out[count++] = cp;
|
|
|
|
return count;
|
|
#else
|
|
// const uchar32 nocase = static_cast<uchar32>(cp & ~0x20);
|
|
const uchar32 nocase = static_cast<uchar32>(cp | constants::asc_icase);
|
|
|
|
out[0] = cp;
|
|
// if (nocase >= char_alnum::ch_A && nocase <= char_alnum::ch_Z)
|
|
if (nocase >= char_alnum::ch_a && nocase <= char_alnum::ch_z)
|
|
{
|
|
out[1] = static_cast<uchar32>(cp ^ constants::asc_icase);
|
|
return 2;
|
|
}
|
|
return 1;
|
|
#endif
|
|
}
|
|
|
|
unicode_case_folding &operator=(const unicode_case_folding &)
|
|
{
|
|
return *this;
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
|
|
unicode_case_folding &operator=(unicode_case_folding &&) SRELL_NOEXCEPT
|
|
{
|
|
return *this;
|
|
}
|
|
#endif
|
|
|
|
void swap(unicode_case_folding & /* right */)
|
|
{
|
|
}
|
|
|
|
private:
|
|
|
|
#if !defined(SRELL_NO_UNICODE_ICASE)
|
|
static const uchar32 cfcharset_eos_ = ucf_internal::ucfdata::eos;
|
|
#endif
|
|
|
|
public: // For debug.
|
|
|
|
void print_tables() const;
|
|
};
|
|
// unicode_case_folding
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_ucf.hpp"]
|
|
// ["rei_up.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
// The contents of the Unicode property data header are placed in
// up_constants, together with the value lookups return on failure.
namespace up_constants
{

#include "srell_updata2.hpp"

    // All bits set: reported by unicode_property::lookup_property() when
    // nothing matches.
    static const uint_l32 error_property = static_cast<uint_l32>(-1);
}   // namespace up_constants
|
|
|
|
namespace up_internal
|
|
{
|
|
typedef up_constants::up_type pname_type;
|
|
typedef const char *pname_string_type;
|
|
|
|
#if defined(SRELL_UPDATA_VERSION) && (SRELL_UPDATA_VERSION >= 200)
|
|
struct pvalue_type
|
|
{
|
|
pname_type pname;
|
|
uint_l32 pnumber;
|
|
pname_string_type csstrings;
|
|
};
|
|
#else
|
|
struct pvalue_type
|
|
{
|
|
pname_type pname;
|
|
pname_string_type csstrings;
|
|
uint_l32 pnumber;
|
|
};
|
|
#endif
|
|
|
|
struct offset_and_number
|
|
{
|
|
std::size_t offset;
|
|
std::size_t number_of_pairs;
|
|
};
|
|
|
|
typedef up_constants::unicode_property_data<
|
|
pname_string_type,
|
|
uchar32,
|
|
pvalue_type,
|
|
offset_and_number
|
|
>
|
|
updata;
|
|
|
|
} // namespace up_internal
|
|
|
|
//template <typename PairType>
|
|
class unicode_property
|
|
{
|
|
public:
|
|
|
|
typedef simple_array<char> pstring;
|
|
|
|
unicode_property()
|
|
{
|
|
}
|
|
|
|
unicode_property &operator=(const unicode_property &)
|
|
{
|
|
return *this;
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
|
|
unicode_property &operator=(unicode_property &&) SRELL_NOEXCEPT
|
|
{
|
|
return *this;
|
|
}
|
|
#endif
|
|
|
|
static uint_l32 lookup_property(const pstring &name, const pstring &value)
|
|
{
|
|
pname_type ptype = name.size() ? lookup_property_name(name) : up_constants::uptype_general_category;
|
|
// property_type property_number = lookup_property_value(ptype, value);
|
|
uint_l32 property_number = lookup_property_value(ptype, value);
|
|
|
|
if (property_number == static_cast<uint_l32>(up_constants::upid_unknown) && name.size() == 0)
|
|
{
|
|
ptype = up_constants::uptype_binary;
|
|
property_number = lookup_property_value(ptype, value);
|
|
}
|
|
|
|
return property_number != static_cast<uint_l32>(up_constants::upid_unknown) ? property_number : up_constants::error_property;
|
|
}
|
|
|
|
static std::size_t ranges_offset(const uint_l32 property_number)
|
|
{
|
|
#if defined(SRELL_UPDATA_VERSION)
|
|
return updata::positiontable[property_number].offset;
|
|
#else
|
|
const offset_and_number *const postable = updata::position_table();
|
|
return postable[property_number].offset;
|
|
#endif
|
|
}
|
|
|
|
static std::size_t number_of_ranges(const uint_l32 property_number)
|
|
{
|
|
#if defined(SRELL_UPDATA_VERSION)
|
|
return updata::positiontable[property_number].number_of_pairs;
|
|
#else
|
|
const offset_and_number *const postable = updata::position_table();
|
|
return postable[property_number].number_of_pairs;
|
|
#endif
|
|
}
|
|
|
|
static const uchar32 *ranges_address(const uint_l32 property_number)
|
|
{
|
|
#if defined(SRELL_UPDATA_VERSION)
|
|
return &updata::rangetable[ranges_offset(property_number) << 1];
|
|
#else
|
|
const uchar32 *const ranges = updata::ranges();
|
|
return &ranges[ranges_offset(property_number) << 1];
|
|
#endif
|
|
}
|
|
|
|
static bool is_valid_pno(const uint_l32 pno)
|
|
{
|
|
return pno != up_constants::error_property && pno <= max_property_number;
|
|
}
|
|
|
|
static bool is_pos(const uint_l32 pno)
|
|
{
|
|
return pno > max_property_number && pno <= max_pos_number;
|
|
}
|
|
|
|
private:
|
|
|
|
typedef up_internal::pname_type pname_type;
|
|
typedef up_internal::pname_string_type pname_string_type;
|
|
typedef up_internal::pvalue_type pvalue_type;
|
|
typedef up_internal::offset_and_number offset_and_number;
|
|
typedef up_internal::updata updata;
|
|
|
|
static pname_type lookup_property_name(const pstring &name)
|
|
{
|
|
#if defined(SRELL_UPDATA_VERSION)
|
|
for (std::size_t pno = 0; *updata::propertynametable[pno]; ++pno)
|
|
{
|
|
if (check_if_included(name, updata::propertynametable[pno]))
|
|
return static_cast<pname_type>(pno);
|
|
}
|
|
#else
|
|
const pname_string_type *const pname_table = updata::propertyname_table();
|
|
|
|
for (std::size_t pno = 0; *pname_table[pno]; ++pno)
|
|
{
|
|
if (check_if_included(name, pname_table[pno]))
|
|
return static_cast<pname_type>(pno);
|
|
}
|
|
#endif
|
|
return up_constants::uptype_unknown;
|
|
}
|
|
|
|
// Checks if value is included in colon-separated strings.
|
|
static bool check_if_included(const pstring &value, pname_string_type csstrings)
|
|
{
|
|
if (static_cast<uchar32>(*csstrings) != meta_char::mc_astrsk) // '*'
|
|
{
|
|
while (*csstrings)
|
|
{
|
|
const pname_string_type begin = csstrings;
|
|
|
|
for (; static_cast<uchar32>(*csstrings) != meta_char::mc_colon && static_cast<uchar32>(*csstrings) != char_ctrl::cc_nul; ++csstrings);
|
|
|
|
const std::size_t length = csstrings - begin;
|
|
|
|
if (static_cast<std::size_t>(value.size()) == length)
|
|
if (value.compare(0, value.size(), begin, length) == 0)
|
|
return true;
|
|
|
|
if (static_cast<uchar32>(*csstrings) == meta_char::mc_colon)
|
|
++csstrings;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static uint_l32 lookup_property_value(const pname_type ptype, const pstring &value)
|
|
{
|
|
#if defined(SRELL_UPDATA_VERSION)
|
|
for (std::size_t pno = 0; *updata::rangenumbertable[pno].csstrings; ++pno)
|
|
{
|
|
const pvalue_type &pvalue = updata::rangenumbertable[pno];
|
|
if (pvalue.pname == ptype && check_if_included(value, pvalue.csstrings))
|
|
return pvalue.pnumber;
|
|
}
|
|
#else
|
|
const pvalue_type *const pvalue_table = updata::rangenumber_table();
|
|
|
|
for (std::size_t pno = 0; *pvalue_table[pno].csstrings; ++pno)
|
|
{
|
|
const pvalue_type &pvalue = pvalue_table[pno];
|
|
if (pvalue.pname == ptype && check_if_included(value, pvalue.csstrings))
|
|
return pvalue.pnumber;
|
|
}
|
|
#endif
|
|
return static_cast<uint_l32>(up_constants::upid_unknown);
|
|
}
|
|
|
|
private:
|
|
|
|
static const std::size_t max_property_number = static_cast<std::size_t>(up_constants::upid_max_property_number);
|
|
static const std::size_t max_pos_number = static_cast<std::size_t>(up_constants::upid_max_pos_number);
|
|
};
|
|
// unicode_property
|
|
|
|
#endif // !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_up.hpp"]
|
|
// ["rei_range_pair.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
struct range_pair // , public std::pair<charT, charT>
|
|
{
|
|
uchar32 second;
|
|
uchar32 first;
|
|
|
|
void set(const uchar32 min, const uchar32 max)
|
|
{
|
|
this->first = min;
|
|
this->second = max;
|
|
}
|
|
|
|
void set(const uchar32 minmax)
|
|
{
|
|
this->first = minmax;
|
|
this->second = minmax;
|
|
}
|
|
|
|
bool is_range_valid() const
|
|
{
|
|
return first <= second;
|
|
}
|
|
|
|
bool operator==(const range_pair &right) const
|
|
{
|
|
return this->first == right.first && this->second == right.second;
|
|
}
|
|
|
|
bool operator<(const range_pair &right) const
|
|
{
|
|
return this->second < right.first; // This assumes that optimise() has been called.
|
|
}
|
|
|
|
void swap(range_pair &right)
|
|
{
|
|
const range_pair tmp = *this;
|
|
*this = right;
|
|
right = tmp;
|
|
}
|
|
|
|
bool unify_range(const range_pair &right)
|
|
{
|
|
range_pair &left = *this;
|
|
|
|
if (right.first <= left.second || left.second + 1 == right.first) // r1 <= l2 || l2+1 == r1
|
|
{
|
|
// l1 l2+1 < r1 r2 excluded.
|
|
|
|
if (left.first <= right.second || right.second + 1 == left.first) // l1 <= r2 || r2+1 == l1
|
|
{
|
|
// r1 r2+1 < l1 l2 excluded.
|
|
|
|
if (left.first > right.first)
|
|
left.first = right.first;
|
|
|
|
if (left.second < right.second)
|
|
left.second = right.second;
|
|
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
};
|
|
// range_pair
|
|
|
|
struct range_pair_helper : public range_pair
|
|
{
|
|
range_pair_helper(const uchar32 min, const uchar32 max)
|
|
{
|
|
this->first = min;
|
|
this->second = max;
|
|
}
|
|
|
|
range_pair_helper(const uchar32 minmax)
|
|
{
|
|
this->first = minmax;
|
|
this->second = minmax;
|
|
}
|
|
};
|
|
// range_pair_helper
|
|
|
|
struct range_pairs // : public simple_array<range_pair>
|
|
{
|
|
public:
|
|
|
|
typedef simple_array<range_pair> array_type;
|
|
typedef array_type::size_type size_type;
|
|
|
|
range_pairs()
|
|
{
|
|
}
|
|
|
|
// Copies every range of rp.
range_pairs(const range_pairs &rp)
    : rparray_(rp.rparray_)
{
}
|
|
|
|
// Replaces the contents with a copy of the ranges of rp.
range_pairs &operator=(const range_pairs &rp)
{
    rparray_ = rp.rparray_;
    return *this;
}
|
|
|
|
// Creates initsize ranges; the array constructor leaves their contents
// uninitialised.
range_pairs(const size_type initsize)
    : rparray_(initsize)
{
}
|
|
|
|
range_pairs(const range_pairs &right, size_type pos, size_type size)
|
|
: rparray_(right.rparray_, pos, size)
|
|
{
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
|
|
// Takes over the ranges of rp.
range_pairs(range_pairs &&rp) SRELL_NOEXCEPT
    : rparray_(std::move(rp.rparray_))
{
}
|
|
|
|
// Replaces the contents with the ranges taken over from rp.
range_pairs &operator=(range_pairs &&rp) SRELL_NOEXCEPT
{
    rparray_ = std::move(rp.rparray_);
    return *this;
}
|
|
#endif
|
|
|
|
void clear()
|
|
{
|
|
rparray_.clear();
|
|
}
|
|
|
|
// Number of ranges held.
size_type size() const
{
    const size_type count = rparray_.size();

    return count;
}
|
|
|
|
// Read-only access to the range at index pos (not bounds-checked).
const range_pair &operator[](const size_type pos) const
{
    const range_pair &entry = rparray_[pos];

    return entry;
}
|
|
// Mutable access to the range at index pos (not bounds-checked).
range_pair &operator[](const size_type pos)
{
    range_pair &entry = rparray_[pos];

    return entry;
}
|
|
|
|
void resize(const size_type size)
|
|
{
|
|
rparray_.resize(size);
|
|
}
|
|
|
|
// Exchanges the ranges with those of right.
void swap(range_pairs &right)
{
    rparray_.swap(right.rparray_);
}
|
|
|
|
void set_solerange(const range_pair &right)
|
|
{
|
|
rparray_.clear();
|
|
rparray_.push_back(right);
|
|
}
|
|
|
|
void append_newclass(const range_pairs &right)
|
|
{
|
|
rparray_.append(right.rparray_);
|
|
}
|
|
|
|
void append_newpair(const range_pair &right)
|
|
{
|
|
rparray_.push_back(right);
|
|
}
|
|
|
|
void join(const range_pair &right)
|
|
{
|
|
size_type pos = 0;
|
|
|
|
for (; pos < rparray_.size(); ++pos)
|
|
{
|
|
range_pair &curpair = rparray_[pos];
|
|
|
|
if (curpair.unify_range(right))
|
|
{
|
|
for (++pos; pos < rparray_.size();)
|
|
{
|
|
if (curpair.unify_range(rparray_[pos]))
|
|
rparray_.erase(pos);
|
|
else
|
|
break;
|
|
}
|
|
return;
|
|
}
|
|
if (right.second < curpair.first)
|
|
break;
|
|
}
|
|
rparray_.insert(pos, right);
|
|
}
|
|
|
|
void merge(const range_pairs &right)
|
|
{
|
|
for (size_type i = 0; i < right.size(); ++i)
|
|
join(right[i]);
|
|
}
|
|
|
|
// True when right holds exactly count ranges and they equal the count
// ranges of this collection that start at index pos.
bool same(uchar32 pos, const uchar32 count, const range_pairs &right) const
{
    if (count != right.size())
        return false;

    for (uchar32 i = 0; i < count; ++i, ++pos)
    {
        if (!(rparray_[pos] == right[i]))
            return false;
    }
    return true;
}
|
|
|
|
int relationship(const range_pairs &right) const
|
|
{
|
|
if (rparray_.size() == right.rparray_.size())
|
|
{
|
|
for (size_type i = 0; i < rparray_.size(); ++i)
|
|
{
|
|
if (!(this->rparray_[i] == right.rparray_[i]))
|
|
{
|
|
if (i == 0)
|
|
goto check_overlap;
|
|
|
|
return 1; // Overlapped.
|
|
}
|
|
}
|
|
return 0; // Same.
|
|
}
|
|
check_overlap:
|
|
return is_overlap(right) ? 1 : 2; // Overlapped or exclusive.
|
|
}
|
|
|
|
void negation()
|
|
{
|
|
uchar32 begin = 0;
|
|
range_pairs newpairs;
|
|
|
|
for (size_type i = 0; i < rparray_.size(); ++i)
|
|
{
|
|
const range_pair &range = rparray_[i];
|
|
|
|
if (begin < range.first)
|
|
newpairs.join(range_pair_helper(begin, range.first - 1));
|
|
|
|
begin = range.second + 1;
|
|
}
|
|
|
|
if (begin <= constants::unicode_max_codepoint)
|
|
newpairs.join(range_pair_helper(begin, constants::unicode_max_codepoint));
|
|
|
|
*this = newpairs;
|
|
}
|
|
|
|
bool is_overlap(const range_pairs &right) const
|
|
{
|
|
for (size_type i = 0; i < rparray_.size(); ++i)
|
|
{
|
|
const range_pair &leftrange = rparray_[i];
|
|
|
|
for (size_type j = 0; j < right.size(); ++j)
|
|
{
|
|
const range_pair &rightrange = right[j];
|
|
|
|
if (rightrange.first <= leftrange.second) // Excludes l1 l2 < r1 r2.
|
|
if (leftrange.first <= rightrange.second) // Excludes r1 r2 < l1 l2.
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void load_from_memory(const uchar32 *array, size_type number_of_pairs)
|
|
{
|
|
for (; number_of_pairs; --number_of_pairs, array += 2)
|
|
join(range_pair_helper(array[0], array[1]));
|
|
}
|
|
|
|
void make_caseunfoldedcharset()
|
|
{
|
|
uchar32 table[ucf_constants::rev_maxset] = {};
|
|
bitset<constants::unicode_max_codepoint + 1> bs;
|
|
|
|
for (size_type i = 0; i < rparray_.size(); ++i)
|
|
{
|
|
const range_pair &range = rparray_[i];
|
|
|
|
for (uchar32 ucp = range.first; ucp <= range.second; ++ucp)
|
|
{
|
|
const uchar32 setnum = unicode_case_folding::casefoldedcharset(table, ucp);
|
|
|
|
for (uchar32 j = 0; j < setnum; ++j)
|
|
bs.set(table[j]);
|
|
}
|
|
}
|
|
load_from_bitset(bs);
|
|
}
|
|
|
|
// For updataout.hpp.
|
|
void remove_range(const range_pair &right)
|
|
{
|
|
for (size_type pos = 0; pos < rparray_.size();)
|
|
{
|
|
range_pair &left = rparray_[pos];
|
|
|
|
if (right.first <= left.first && left.first <= right.second) // r1 <= l1 <= r2.
|
|
{
|
|
if (left.second > right.second) // r1 <= l1 <= r2 < l2.
|
|
{
|
|
left.first = right.second + 1; // carry doesn't happen.
|
|
++pos;
|
|
}
|
|
else // r1 <= l1 <= l2 <= r2.
|
|
rparray_.erase(pos);
|
|
}
|
|
else if (right.first <= left.second && left.second <= right.second) // r1 <= l2 <= r2.
|
|
{
|
|
if (left.first < right.first) // l1 < r1 <= l2 <= r2.
|
|
{
|
|
left.second = right.first - 1;
|
|
++pos;
|
|
}
|
|
else // r1 <= l1 <= l2 <= r2.
|
|
rparray_.erase(pos);
|
|
}
|
|
else if (left.first < right.first && right.second < left.second) // l1 < r1 && r2 < l2.
|
|
{
|
|
range_pair newrange(left);
|
|
|
|
left.second = right.first - 1;
|
|
newrange.first = right.second + 1;
|
|
rparray_.insert(++pos, newrange);
|
|
++pos;
|
|
}
|
|
else
|
|
++pos;
|
|
}
|
|
}
|
|
|
|
// template <typename ucf>
|
|
// Returns the single character this set represents, or
// constants::invalid_u32value when the set is empty or holds more than one
// distinct character. With icase true, members are compared after
// unicode_case_folding::do_casefolding(); otherwise they are compared as is.
uchar32 consists_of_one_character(const bool icase) const
{
    if (rparray_.size() == 0)
        return constants::invalid_u32value;

    uchar32 (*const fold)(const uchar32) = icase ? unicode_case_folding::do_casefolding : do_nothing;
    const uchar32 expected = fold(rparray_[0].first);

    for (size_type idx = 0; idx < rparray_.size(); ++idx)
    {
        const range_pair &cur = rparray_[idx];
        uchar32 cp = cur.first;

        // Inclusive walk over [first, second]; the end is tested before the
        // increment so the counter never steps past cur.second.
        for (;;)
        {
            if (fold(cp) != expected)
                return constants::invalid_u32value;

            if (cp == cur.second)
                break;

            ++cp;
        }
    }
    return expected;
}
|
|
|
|
// Splits this set against rightranges:
//   kept    receives the code points of this set that are not in rightranges
//           (subtraction set);
//   removed receives the code points present in both (intersection set).
// The inner loop stops at the first right range that begins after the
// current left range.
// NOTE(review): that early exit presumably relies on rightranges being
// sorted ascending - confirm.
//
// The current left range is tracked through a pointer that is re-fetched
// after kept.rparray_.insert(), so it is never used after the array may
// have relocated its storage.
void split_ranges(range_pairs &kept, range_pairs &removed, const range_pairs &rightranges) const
{
    range_pair newpair;

    kept.rparray_ = this->rparray_; // Subtraction set.
    removed.clear();    // Intersection set.

    size_type i = 0;

    while (i < kept.rparray_.size())
    {
        range_pair *left = &kept.rparray_[i];
        bool erased = false;    // True when kept[i] was removed; i then already names the next range.

        for (size_type j = 0; j < rightranges.rparray_.size(); ++j)
        {
            const range_pair &right = rightranges.rparray_[j];

            if (right.first > left->second) // l1 l2 < r1 r2: nothing further to cut from left.
                break;

            if (left->first > right.second) // r1 r2 < l1 l2: try the next right range.
                continue;

            if (left->first < right.first)
            {
                if (right.second < left->second)
                {
                    // right is strictly inside left: split left in two.
                    removed.join(range_pair_helper(right.first, right.second));

                    newpair.set(right.second + 1, left->second);
                    left->second = right.first - 1;
                    kept.rparray_.insert(i + 1, newpair);

                    // insert() may have moved the elements; refresh the pointer.
                    left = &kept.rparray_[i];
                }
                else
                {
                    // right covers the tail of left.
                    removed.join(range_pair_helper(right.first, left->second));
                    left->second = right.first - 1;
                }
            }
            else if (right.second < left->second)
            {
                // right covers the head of left.
                removed.join(range_pair_helper(left->first, right.second));
                left->first = right.second + 1;
            }
            else
            {
                // right covers left entirely.
                removed.join(range_pair_helper(left->first, left->second));
                kept.rparray_.erase(i);
                erased = true;
                break;
            }
        }

        if (!erased)
            ++i;
    }
}
|
|
|
|
#if defined(SRELLDBG_NO_BITSET)
// Linear membership test over the whole set; compiled only when
// SRELLDBG_NO_BITSET is defined.
// NOTE(review): the active branch answers from the first range whose upper
// bound is >= ch, which presumably relies on the ranges being sorted
// ascending - confirm.
bool is_included(const uchar32 ch) const
{
#if 01
    const range_pair *cur = rparray_.data();
    const range_pair *const end = cur + rparray_.size();

    while (cur != end)
    {
        if (ch <= cur->second)
            return ch >= cur->first;

        ++cur;
    }
#else
    for (size_type i = 0; i < rparray_.size(); ++i)
    {
        if (rparray_[i].is_included(ch))
            return true;
    }
#endif
    return false;
}
#endif  // defined(SRELLDBG_NO_BITSET)
|
|
|
|
// For multiple_range_pairs functions.
|
|
|
|
bool is_included_ls(const uchar32 pos, uchar32 count, const uchar32 c) const
|
|
{
|
|
const range_pair *cur = &rparray_[pos];
|
|
|
|
for (; count; ++cur, --count)
|
|
{
|
|
if (c <= cur->second)
|
|
return c >= cur->first;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool is_included(const uchar32 pos, uchar32 count, const uchar32 c) const
|
|
{
|
|
const range_pair *base = &rparray_[pos];
|
|
|
|
while (count)
|
|
{
|
|
uchar32 mid = count >> 1;
|
|
const range_pair &rp = base[mid];
|
|
|
|
if (c <= rp.second)
|
|
{
|
|
if (c >= rp.first)
|
|
return true;
|
|
|
|
count = mid;
|
|
}
|
|
else
|
|
{
|
|
++mid;
|
|
count -= mid;
|
|
base += mid;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void replace(const size_type pos, const size_type count, const range_pairs &right)
|
|
{
|
|
rparray_.replace(pos, count, right.rparray_);
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
|
|
// For Eytzinger layout functions.
|
|
|
|
// Searches for c in a block of len ranges stored in Eytzinger layout
// (children of node k are at 2k+1 and 2k+2) starting at index pos.
// pos is reused as the node index inside that block.
bool is_included_el(uchar32 pos, const uchar32 len, const uchar32 c) const
{
    const range_pair *const base = &rparray_[pos];

#if defined(__GNUC__)
    __builtin_prefetch(base);
#endif

    pos = 0;
    while (pos < len)
    {
        const range_pair &node = base[pos];

        if (c > node.second)
        {
            pos = (pos << 1) + 2;   // Descend to the right child.
            continue;
        }

        if (c >= node.first)
            return true;

        pos = (pos << 1) + 1;       // Descend to the left child.
    }
    return false;
}
|
|
|
|
// Appends srcsize ranges read from srcbase to this array, rearranged by
// set_eytzinger_layout(). Returns srcsize.
uchar32 create_el(const range_pair *srcbase, const uchar32 srcsize)
{
    const uchar32 oldsize = static_cast<uchar32>(rparray_.size());

    // Make room first; the new block starts where the array used to end.
    rparray_.resize(oldsize + srcsize);

    range_pair *const dest = &rparray_[oldsize];

    set_eytzinger_layout(0, srcbase, srcsize, dest, 0);

    return srcsize;
}
|
|
|
|
#endif // !defined(SRELLDBG_NO_CCPOS)
|
|
|
|
// Sums (second - first + 1) over every stored range.
uint_l32 total_codepoints() const
{
    uint_l32 total = 0;
    const size_type count = rparray_.size();

    for (size_type idx = 0; idx != count; ++idx)
    {
        const range_pair &cur = rparray_[idx];

        total += cur.second - cur.first + 1;
    }
    return total;
}
|
|
|
|
private:
|
|
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
|
|
// Recursive in-order fill: copies elements of srcbase, in source order,
// into the implicit binary tree rooted at destbase[0] (children of node k
// are at 2k+1 and 2k+2). srcpos is the next source index to consume; the
// updated value is returned.
uchar32 set_eytzinger_layout(uchar32 srcpos, const range_pair *const srcbase, const uchar32 srclen,
    range_pair *const destbase, const uchar32 destpos)
{
    if (destpos >= srclen)  // Outside the tree: nothing to place here.
        return srcpos;

    const uchar32 leftchild = (destpos << 1) + 1;

    // Left subtree first, then this node, then the right subtree.
    srcpos = set_eytzinger_layout(srcpos, srcbase, srclen, destbase, leftchild);
    destbase[destpos] = srcbase[srcpos];
    ++srcpos;

    return set_eytzinger_layout(srcpos, srcbase, srclen, destbase, leftchild + 1);
}
|
|
|
|
#endif // !defined(SRELLDBG_NO_CCPOS)
|
|
|
|
// Identity mapping. consists_of_one_character() uses it in place of the
// case-folding function when icase is false.
static uchar32 do_nothing(const uchar32 cp)
{
    return cp;
}
|
|
|
|
template <typename BitSetT>
|
|
void load_from_bitset(const BitSetT &bs)
|
|
{
|
|
uchar32 begin = constants::invalid_u32value;
|
|
range_pairs newranges;
|
|
|
|
for (uchar32 ucp = 0;; ++ucp)
|
|
{
|
|
if (ucp > constants::unicode_max_codepoint || !bs.test(ucp))
|
|
{
|
|
if (begin != constants::invalid_u32value)
|
|
{
|
|
newranges.join(range_pair_helper(begin, ucp - 1));
|
|
begin = constants::invalid_u32value;
|
|
}
|
|
if (ucp > constants::unicode_max_codepoint)
|
|
break;
|
|
}
|
|
else if (begin == constants::invalid_u32value && bs.test(ucp))
|
|
begin = ucp;
|
|
}
|
|
rparray_.swap(newranges.rparray_);
|
|
}
|
|
|
|
array_type rparray_;
|
|
|
|
public: // For debug.
|
|
|
|
void print_pairs(const int, const char *const = NULL, const char *const = NULL) const;
|
|
};
|
|
// range_pairs
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_range_pair.hpp"]
|
|
// ["rei_char_class.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
// For RegExpIdentifierStart and RegExpIdentifierPart
|
|
struct identifier_charclass
|
|
{
|
|
public:
|
|
|
|
void clear()
|
|
{
|
|
char_class_.clear();
|
|
char_class_pos_.clear();
|
|
}
|
|
|
|
void setup()
|
|
{
|
|
if (char_class_pos_.size() == 0)
|
|
{
|
|
static const uchar32 additions[] = {
|
|
// reg_exp_identifier_start, reg_exp_identifier_part.
|
|
0x24, 0x24, 0x5f, 0x5f, 0x200c, 0x200d // '$' '_' <ZWNJ>-<ZWJ>
|
|
};
|
|
range_pairs ranges;
|
|
|
|
// For reg_exp_identifier_start.
|
|
{
|
|
const uchar32 *const IDs_address = unicode_property::ranges_address(upid_bp_ID_Start);
|
|
const std::size_t IDs_number = unicode_property::number_of_ranges(upid_bp_ID_Start);
|
|
ranges.load_from_memory(IDs_address, IDs_number);
|
|
}
|
|
ranges.load_from_memory(&additions[0], 2);
|
|
append_charclass(ranges);
|
|
|
|
// For reg_exp_identifier_part.
|
|
ranges.clear();
|
|
{
|
|
const uchar32 *const IDc_address = unicode_property::ranges_address(upid_bp_ID_Continue);
|
|
const std::size_t IDc_number = unicode_property::number_of_ranges(upid_bp_ID_Continue);
|
|
ranges.load_from_memory(IDc_address, IDc_number);
|
|
}
|
|
ranges.load_from_memory(&additions[0], 3);
|
|
append_charclass(ranges);
|
|
}
|
|
}
|
|
|
|
bool is_identifier(const uchar32 ch, const bool part) const
|
|
{
|
|
const range_pair &rp = char_class_pos_[part ? 1 : 0];
|
|
|
|
return char_class_.is_included(rp.first, rp.second, ch);
|
|
}
|
|
|
|
private:
|
|
|
|
void append_charclass(const range_pairs &rps)
|
|
{
|
|
char_class_pos_.push_back(range_pair_helper(static_cast<uchar32>(char_class_.size()), static_cast<uchar32>(rps.size())));
|
|
char_class_.append_newclass(rps);
|
|
}
|
|
|
|
range_pairs char_class_;
|
|
range_pairs::array_type char_class_pos_;
|
|
|
|
// UnicodeIDStart::
|
|
// any Unicode code point with the Unicode property "ID_Start"
|
|
// UnicodeIDContinue::
|
|
// any Unicode code point with the Unicode property "ID_Continue"
|
|
static const uint_l32 upid_bp_ID_Start = static_cast<uint_l32>(up_constants::bp_ID_Start);
|
|
static const uint_l32 upid_bp_ID_Continue = static_cast<uint_l32>(up_constants::bp_ID_Continue);
|
|
};
|
|
// identifier_charclass
|
|
#endif // !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
class re_character_class
|
|
{
|
|
public:
|
|
|
|
enum
|
|
{ // 0 1 2 3 4 5
|
|
newline, dotall, space, digit, word, icase_word,
|
|
// 6
|
|
number_of_predefcls
|
|
};
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
typedef unicode_property::pstring pstring;
|
|
#endif
|
|
|
|
re_character_class()
|
|
{
|
|
setup_predefinedclass();
|
|
}
|
|
|
|
re_character_class &operator=(const re_character_class &that)
|
|
{
|
|
if (this != &that)
|
|
{
|
|
this->char_class_ = that.char_class_;
|
|
this->char_class_pos_ = that.char_class_pos_;
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
this->char_class_el_ = that.char_class_el_;
|
|
this->char_class_pos_el_ = that.char_class_pos_el_;
|
|
#endif
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
|
|
re_character_class &operator=(re_character_class &&that) SRELL_NOEXCEPT
|
|
{
|
|
if (this != &that)
|
|
{
|
|
this->char_class_ = std::move(that.char_class_);
|
|
this->char_class_pos_ = std::move(that.char_class_pos_);
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
this->char_class_el_ = std::move(that.char_class_el_);
|
|
this->char_class_pos_el_ = std::move(that.char_class_pos_el_);
|
|
#endif
|
|
}
|
|
return *this;
|
|
}
|
|
#endif
|
|
|
|
bool is_included(const uint_l32 class_number, const uchar32 c) const
|
|
{
|
|
// return char_class_.is_included(char_class_pos_[class_number], c);
|
|
const range_pair &rp = char_class_pos_[class_number];
|
|
|
|
return char_class_.is_included(rp.first, rp.second, c);
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
// bool is_included(const uint_l32 pos, const uint_l32 len, const uchar32 &c) const
|
|
bool is_included(const uchar32 pos, const uchar32 len, const uchar32 c) const
|
|
{
|
|
return char_class_el_.is_included_el(pos, len, c);
|
|
}
|
|
#endif
|
|
|
|
void setup_icase_word()
|
|
{
|
|
range_pair &icase_pos = char_class_pos_[icase_word];
|
|
|
|
if (icase_pos.second == char_class_pos_[word].second)
|
|
{
|
|
range_pairs icasewordclass(char_class_, icase_pos.first, icase_pos.second);
|
|
|
|
icasewordclass.make_caseunfoldedcharset();
|
|
// Includes 017f and 212a so that they and their case-folded
|
|
// characters 's' and 'k' will be excluded from the character
|
|
// set that /[\W]/i matches.
|
|
|
|
char_class_.replace(icase_pos.first, icase_pos.second, icasewordclass);
|
|
|
|
if (icase_pos.second < static_cast<uchar32>(icasewordclass.size()))
|
|
{
|
|
const uchar32 delta = static_cast<uchar32>(icasewordclass.size() - icase_pos.second);
|
|
|
|
for (int i = number_of_predefcls; i < static_cast<int>(char_class_pos_.size()); ++i)
|
|
char_class_pos_[i].first += delta;
|
|
}
|
|
icase_pos.second = static_cast<uchar32>(icasewordclass.size());
|
|
}
|
|
}
|
|
|
|
void clear()
|
|
{
|
|
char_class_pos_.resize(number_of_predefcls);
|
|
|
|
uchar32 basesize = 0;
|
|
for (int i = 0; i < number_of_predefcls; ++i)
|
|
basesize += char_class_pos_[i].second;
|
|
|
|
char_class_.resize(basesize);
|
|
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
char_class_el_.clear();
|
|
char_class_pos_el_.clear();
|
|
#endif
|
|
}
|
|
|
|
uint_l32 register_newclass(const range_pairs &rps)
|
|
{
|
|
for (range_pairs::size_type no = 0; no < char_class_pos_.size(); ++no)
|
|
{
|
|
const range_pair &rp = char_class_pos_[no];
|
|
|
|
if (char_class_.same(rp.first, rp.second, rps))
|
|
return static_cast<uint_l32>(no);
|
|
}
|
|
|
|
append_charclass(rps);
|
|
return static_cast<uint_l32>(char_class_pos_.size() - 1);
|
|
}
|
|
|
|
range_pairs operator[](const uint_l32 no) const
|
|
{
|
|
const range_pair &ccpos = char_class_pos_[no];
|
|
range_pairs rp(ccpos.second);
|
|
|
|
for (uchar32 i = 0; i < ccpos.second; ++i)
|
|
rp[i] = char_class_[ccpos.first + i];
|
|
|
|
return rp;
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
const range_pair &charclasspos(const uint_l32 no) // const
|
|
{
|
|
const range_pair &pos = char_class_pos_el_[no];
|
|
|
|
if (pos.second == 0)
|
|
finalise(no);
|
|
return pos;
|
|
}
|
|
|
|
void finalise()
|
|
{
|
|
char_class_el_.clear();
|
|
char_class_pos_el_.resize(char_class_pos_.size());
|
|
std::memset(&char_class_pos_el_[0], 0, char_class_pos_el_.size() * sizeof (range_pairs::array_type::value_type));
|
|
}
|
|
|
|
void finalise(const uint_l32 no)
|
|
{
|
|
const range_pair &posinfo = char_class_pos_[no];
|
|
range_pair &outpair = char_class_pos_el_[no];
|
|
|
|
outpair.first = static_cast<uchar32>(char_class_el_.size());
|
|
outpair.second = char_class_el_.create_el(&char_class_[posinfo.first], posinfo.second); //arraysize;
|
|
|
|
}
|
|
|
|
#endif // #if !defined(SRELLDBG_NO_CCPOS)
|
|
|
|
void optimise()
|
|
{
|
|
}
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
uint_l32 get_propertynumber(const pstring &pname, const pstring &pvalue) const
|
|
{
|
|
const uint_l32 pno = static_cast<uint_l32>(unicode_property::lookup_property(pname, pvalue));
|
|
|
|
return (pno != up_constants::error_property) ? pno : up_constants::error_property;
|
|
}
|
|
|
|
bool load_upranges(range_pairs &newranges, const uint_l32 property_number) const
|
|
{
|
|
newranges.clear();
|
|
|
|
if (unicode_property::is_valid_pno(property_number))
|
|
{
|
|
if (property_number == upid_bp_Assigned)
|
|
{
|
|
load_updata(newranges, upid_gc_Cn);
|
|
newranges.negation();
|
|
}
|
|
else
|
|
load_updata(newranges, property_number);
|
|
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Properties of strings.
|
|
bool is_pos(const uint_l32 pno) const
|
|
{
|
|
return unicode_property::is_pos(pno);
|
|
}
|
|
|
|
bool get_prawdata(simple_array<uchar32> &seq, uint_l32 property_number)
|
|
{
|
|
if (property_number != up_constants::error_property)
|
|
{
|
|
if (property_number == upid_bp_Assigned)
|
|
property_number = upid_gc_Cn;
|
|
|
|
const uchar32 *const address = unicode_property::ranges_address(property_number);
|
|
// const std::size_t offset = unicode_property::ranges_offset(property_number);
|
|
const std::size_t number = unicode_property::number_of_ranges(property_number) * 2;
|
|
|
|
seq.resize(number);
|
|
for (uchar32 i = 0; i < number; ++i)
|
|
seq[i] = address[i];
|
|
|
|
return true;
|
|
}
|
|
seq.clear();
|
|
return false;
|
|
}
|
|
|
|
#endif // !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
void swap(re_character_class &right)
|
|
{
|
|
if (this != &right)
|
|
{
|
|
this->char_class_.swap(right.char_class_);
|
|
this->char_class_pos_.swap(right.char_class_pos_);
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
this->char_class_el_.swap(right.char_class_el_);
|
|
this->char_class_pos_el_.swap(right.char_class_pos_el_);
|
|
#endif
|
|
}
|
|
}
|
|
|
|
private:
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
void load_updata(range_pairs &newranges, const uint_l32 property_number) const
|
|
{
|
|
const uchar32 *const address = unicode_property::ranges_address(property_number);
|
|
// const std::size_t offset = unicode_property::ranges_offset(property_number);
|
|
const std::size_t number = unicode_property::number_of_ranges(property_number);
|
|
|
|
newranges.load_from_memory(address, number);
|
|
}
|
|
|
|
#endif // !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
void append_charclass(const range_pairs &rps)
|
|
{
|
|
char_class_pos_.push_back(range_pair_helper(static_cast<uchar32>(char_class_.size()), static_cast<uchar32>(rps.size())));
|
|
char_class_.append_newclass(rps);
|
|
}
|
|
|
|
// The production CharacterClassEscape::s evaluates as follows:
|
|
// Return the set of characters containing the characters that are on the right-hand side of the WhiteSpace or LineTerminator productions.
|
|
// WhiteSpace::<TAB> <VT> <FF> <SP> <NBSP> <ZWNBSP> <USP>
|
|
// 0009 000B 000C 0020 00A0 FEFF Zs
|
|
// LineTerminator::<LF> <CR> <LS> <PS>
|
|
// 000A 000D 2028 2029
|
|
|
|
void setup_predefinedclass()
|
|
{
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
const uchar32 *const Zs_address = unicode_property::ranges_address(upid_gc_Zs);
|
|
// const std::size_t Zs_offset = unicode_property::ranges_offset(upid_gc_Zs);
|
|
const std::size_t Zs_number = unicode_property::number_of_ranges(upid_gc_Zs);
|
|
#else
|
|
static const uchar32 Zs[] = {
|
|
0x1680, 0x1680, 0x2000, 0x200a, // 0x2028, 0x2029,
|
|
0x202f, 0x202f, 0x205f, 0x205f, 0x3000, 0x3000
|
|
};
|
|
#endif // defined(SRELL_NO_UNICODE_PROPERTY)
|
|
static const uchar32 allranges[] = {
|
|
// dotall.
|
|
0x0000, 0x10ffff,
|
|
// newline.
|
|
0x0a, 0x0a, 0x0d, 0x0d, // \n \r
|
|
// newline, space.
|
|
0x2028, 0x2029,
|
|
// space.
|
|
0x09, 0x0d, // \t \n \v \f \r
|
|
0x20, 0x20, // ' '
|
|
0xa0, 0xa0, // <NBSP>
|
|
0xfeff, 0xfeff, // <BOM>
|
|
// digit, word.
|
|
0x30, 0x39, // '0'-'9'
|
|
0x41, 0x5a, 0x5f, 0x5f, 0x61, 0x7a // 'A'-'Z' '_' 'a'-'z'
|
|
};
|
|
range_pairs ranges;
|
|
|
|
// newline.
|
|
ranges.load_from_memory(&allranges[2], 3);
|
|
append_charclass(ranges);
|
|
|
|
// dotall.
|
|
ranges.clear();
|
|
ranges.load_from_memory(&allranges[0], 1);
|
|
append_charclass(ranges);
|
|
|
|
// space.
|
|
ranges.clear();
|
|
ranges.load_from_memory(&allranges[6], 5);
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
ranges.load_from_memory(Zs_address, Zs_number);
|
|
#else
|
|
ranges.load_from_memory(Zs, 5);
|
|
#endif
|
|
append_charclass(ranges);
|
|
|
|
// digit.
|
|
ranges.clear();
|
|
ranges.load_from_memory(&allranges[16], 1);
|
|
append_charclass(ranges);
|
|
|
|
// word.
|
|
ranges.clear();
|
|
ranges.load_from_memory(&allranges[16], 4);
|
|
append_charclass(ranges);
|
|
|
|
// Reservation for icase_word.
|
|
append_charclass(ranges);
|
|
}
|
|
|
|
private:
|
|
|
|
range_pairs char_class_;
|
|
range_pairs::array_type char_class_pos_;
|
|
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
range_pairs char_class_el_;
|
|
range_pairs::array_type char_class_pos_el_;
|
|
|
|
#endif
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
static const uint_l32 upid_gc_Zs = static_cast<uint_l32>(up_constants::gc_Space_Separator);
|
|
static const uint_l32 upid_gc_Cn = static_cast<uint_l32>(up_constants::gc_Unassigned);
|
|
static const uint_l32 upid_bp_Assigned = static_cast<uint_l32>(up_constants::bp_Assigned);
|
|
|
|
#endif
|
|
|
|
public: // For debug.
|
|
|
|
void print_classes(const int) const;
|
|
};
|
|
// re_character_class
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_char_class.hpp"]
|
|
// ["rei_groupname_mapper.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
#if !defined(SRELL_NO_NAMEDCAPTURE)
|
|
|
|
template <typename charT>
|
|
class groupname_mapper
|
|
{
|
|
public:
|
|
|
|
typedef simple_array<charT> gname_string;
|
|
typedef typename gname_string::size_type size_type;
|
|
static const uint_l32 notfound = static_cast<uint_l32>(-1);
|
|
|
|
groupname_mapper()
|
|
{
|
|
}
|
|
|
|
groupname_mapper(const groupname_mapper &right)
|
|
: names_(right.names_), keysize_classno_(right.keysize_classno_)
|
|
{
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
|
|
groupname_mapper(groupname_mapper &&right) SRELL_NOEXCEPT
|
|
: names_(std::move(right.names_)), keysize_classno_(std::move(right.keysize_classno_))
|
|
{
|
|
}
|
|
#endif
|
|
|
|
groupname_mapper &operator=(const groupname_mapper &right)
|
|
{
|
|
if (this != &right)
|
|
{
|
|
names_ = right.names_;
|
|
keysize_classno_ = right.keysize_classno_;
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
|
|
groupname_mapper &operator=(groupname_mapper &&right) SRELL_NOEXCEPT
|
|
{
|
|
if (this != &right)
|
|
{
|
|
names_ = std::move(right.names_);
|
|
keysize_classno_ = std::move(right.keysize_classno_);
|
|
}
|
|
return *this;
|
|
}
|
|
#endif
|
|
|
|
void clear()
|
|
{
|
|
names_.clear();
|
|
keysize_classno_.clear();
|
|
}
|
|
|
|
uint_l32 operator[](const gname_string &gname) const
|
|
{
|
|
uint_l32 pos = 0;
|
|
for (std::size_t i = 0; i < static_cast<std::size_t>(keysize_classno_.size()); i += 2)
|
|
{
|
|
const uint_l32 keysize = keysize_classno_[i];
|
|
|
|
if (keysize == static_cast<uint_l32>(gname.size()) && sameseq(pos, gname))
|
|
return keysize_classno_[++i];
|
|
|
|
pos += keysize;
|
|
}
|
|
return notfound;
|
|
}
|
|
|
|
gname_string operator[](const uint_l32 indexno) const
|
|
{
|
|
uint_l32 pos = 0;
|
|
for (std::size_t i = 0; i < static_cast<std::size_t>(keysize_classno_.size()); ++i)
|
|
{
|
|
const uint_l32 keysize = keysize_classno_[i];
|
|
const uint_l32 classno = keysize_classno_[++i];
|
|
|
|
if (classno == indexno)
|
|
return gname_string(names_, pos, keysize);
|
|
|
|
pos += keysize;
|
|
}
|
|
return gname_string();
|
|
}
|
|
|
|
size_type size() const
|
|
{
|
|
return static_cast<size_type>(keysize_classno_.size() >> 1);
|
|
}
|
|
|
|
bool push_back(const gname_string &gname, const uint_l32 class_number)
|
|
{
|
|
const uint_l32 num = operator[](gname);
|
|
|
|
if (num == notfound)
|
|
{
|
|
names_.append(gname);
|
|
keysize_classno_.append(1, static_cast<uint_l32>(gname.size()));
|
|
keysize_classno_.append(1, class_number);
|
|
return true;
|
|
}
|
|
return false; // Already exists.
|
|
}
|
|
|
|
void swap(groupname_mapper &right)
|
|
{
|
|
this->names_.swap(right.names_);
|
|
keysize_classno_.swap(right.keysize_classno_);
|
|
}
|
|
|
|
private:
|
|
|
|
bool sameseq(size_type pos, const gname_string &gname) const
|
|
{
|
|
for (size_type i = 0; i < gname.size(); ++i, ++pos)
|
|
if (pos >= names_.size() || names_[pos] != gname[i])
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
gname_string names_;
|
|
simple_array<uint_l32> keysize_classno_;
|
|
|
|
public: // For debug.
|
|
|
|
void print_mappings(const int) const;
|
|
};
|
|
template <typename charT>
|
|
const uint_l32 groupname_mapper<charT>::notfound;
|
|
// groupname_mapper
|
|
|
|
#endif // !defined(SRELL_NO_NAMEDCAPTURE)
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_groupname_mapper.hpp"]
|
|
// ["rei_state.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
struct re_quantifier
|
|
{
|
|
// atleast and atmost: for check_counter.
|
|
// offset and length: for charcter_class.
|
|
// (Special case 1) in roundbracket_open and roundbracket_pop atleast and atmost represent
|
|
// the minimum and maximum bracket numbers respectively inside the brackets itself.
|
|
// (Special case 2) in repeat_in_push and repeat_in_pop atleast and atmost represent the
|
|
// minimum and maximum bracket numbers respectively inside the repetition.
|
|
union
|
|
{
|
|
uint_l32 atleast;
|
|
// (Special case 3: v1) in lookaround_open represents the number of characters to be rewound.
|
|
// (Special case 3: v2) in lookaround_open represents: 0=lookaheads, 1=lookbehinds,
|
|
// 2=matchpointrewinder.
|
|
// (Special case 4) in NFA_states[0] represents the class number of the first character class.
|
|
uchar32 offset;
|
|
};
|
|
union
|
|
{
|
|
uint_l32 atmost;
|
|
uchar32 length;
|
|
};
|
|
|
|
union
|
|
{
|
|
bool is_greedy;
|
|
uint_l32 padding_;
|
|
};
|
|
|
|
void reset(const uint_l32 len = 1)
|
|
{
|
|
atleast = atmost = len;
|
|
is_greedy = true;
|
|
}
|
|
|
|
void set(const uint_l32 min, const uint_l32 max)
|
|
{
|
|
atleast = min;
|
|
atmost = max;
|
|
}
|
|
|
|
void set(const uint_l32 min, const uint_l32 max, const bool greedy)
|
|
{
|
|
atleast = min;
|
|
atmost = max;
|
|
is_greedy = greedy;
|
|
}
|
|
|
|
void setccpos(const uchar32 o, const uchar32 l)
|
|
{
|
|
offset = o;
|
|
length = l;
|
|
}
|
|
|
|
bool is_valid() const
|
|
{
|
|
return atleast <= atmost && atmost > 0;
|
|
}
|
|
|
|
void set_infinity()
|
|
{
|
|
atmost = constants::infinity;
|
|
}
|
|
|
|
bool is_infinity() const
|
|
{
|
|
return atmost == constants::infinity;
|
|
}
|
|
|
|
bool is_same() const
|
|
{
|
|
return atleast == atmost;
|
|
}
|
|
|
|
bool is_default() const
|
|
{
|
|
return atleast == 1 && atmost == 1;
|
|
}
|
|
|
|
bool is_asterisk() const
|
|
{
|
|
return atleast == 0 && atmost == constants::infinity;
|
|
}
|
|
bool is_plus() const
|
|
{
|
|
return atleast == 1 && atmost == constants::infinity;
|
|
}
|
|
bool is_asterisk_or_plus() const
|
|
{
|
|
return atleast <= 1 && atmost == constants::infinity;
|
|
}
|
|
bool is_question_or_asterisk() const
|
|
{
|
|
return atleast == 0 && (atmost == 1 || atmost == constants::infinity);
|
|
}
|
|
|
|
bool has_simple_equivalence() const
|
|
{
|
|
return (atleast <= 1 && atmost <= 3) || (atleast == 2 && atmost <= 4) || (atleast == atmost && atmost <= 6);
|
|
}
|
|
|
|
void multiply(const re_quantifier &q)
|
|
{
|
|
if (atleast != constants::infinity)
|
|
{
|
|
if (q.atleast != constants::infinity)
|
|
atleast *= q.atleast;
|
|
else
|
|
atleast = constants::infinity;
|
|
}
|
|
|
|
if (atmost != constants::infinity)
|
|
{
|
|
if (q.atmost != constants::infinity)
|
|
atmost *= q.atmost;
|
|
else
|
|
atmost = constants::infinity;
|
|
}
|
|
}
|
|
|
|
void add(const re_quantifier &q)
|
|
{
|
|
if (atleast != constants::infinity)
|
|
{
|
|
if (q.atleast != constants::infinity && (atleast + q.atleast) >= atleast)
|
|
atleast += q.atleast;
|
|
else
|
|
atleast = constants::infinity;
|
|
}
|
|
|
|
if (atmost != constants::infinity)
|
|
{
|
|
if (q.atmost != constants::infinity && (atmost + q.atmost) >= atmost)
|
|
atmost += q.atmost;
|
|
else
|
|
atmost = constants::infinity;
|
|
}
|
|
}
|
|
};
|
|
// re_quantifier
|
|
|
|
struct re_state
|
|
{
|
|
union
|
|
{
|
|
uchar32 character; // For character.
|
|
uint_l32 number; // For character_class, brackets, counter, repeat, backreference.
|
|
};
|
|
|
|
re_state_type type;
|
|
|
|
union
|
|
{
|
|
std::ptrdiff_t next1;
|
|
re_state *next_state1;
|
|
// Points to the next state.
|
|
// (Special case 1) in lookaround_open points to the next of lookaround_close.
|
|
};
|
|
union
|
|
{
|
|
std::ptrdiff_t next2;
|
|
re_state *next_state2;
|
|
// character and character_class: points to another possibility, non-backtracking.
|
|
// epsilon: points to another possibility, backtracking.
|
|
// save_and_reset_counter, roundbracket_open, and repeat_in_push: points to a
|
|
// restore state, backtracking.
|
|
// check_counter: complementary to next1 based on quantifier.is_greedy.
|
|
// (Special case 1) roundbracket_close, check_0_width_repeat, and backreference:
|
|
// points to the next state as an exit after 0 width match.
|
|
// (Special case 2) in NFA_states[0] holds the entry point for match_continuous/regex_match.
|
|
// (Special case 3) in lookaround_open points to the contents of brackets.
|
|
};
|
|
|
|
re_quantifier quantifier; // For check_counter, roundbrackets, repeasts, (?<=...) and (?<!...),
|
|
// and character_class.
|
|
|
|
union
|
|
{
|
|
bool is_not; // For \B, (?!...) and (?<!...).
|
|
bool dont_push; // For check_counter.
|
|
bool backrefnumber_unresolved; // For backreference (used only in compiler).
|
|
bool icase; // For [0] only.
|
|
bool multiline; // For bol, eol.
|
|
uint_l32 padding_;
|
|
};
|
|
|
|
// st_character, // 0x00
|
|
// char/number: character
|
|
// next1: gen.
|
|
// next2: +1 (exit. used only when '*' or '?')
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_character_class, // 0x01
|
|
// char/number: character class number
|
|
// next1: gen.
|
|
// next2: +1 (exit. used only when '*' or '?')
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_epsilon, // 0x02
|
|
// char/number: -
|
|
// next1: gen.
|
|
// next2: alt.
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_check_counter, // 0x03
|
|
// char/number: counter number
|
|
// next1: greedy: epsilon that may push backtracking data to decrement_counter.
|
|
// not-greedy: out-of-loop
|
|
// next2: complementary to next1
|
|
// q.atleast: gen.
|
|
// q.atmost: gen.
|
|
// q.greedy: gen.
|
|
// is_not/dont_push: - (was dont_push)
|
|
|
|
// st_decrement_counter, // 0x04
|
|
// char/number: counter number
|
|
// next1: 0 (always treated as "not matched")
|
|
// next2: 0
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_save_and_reset_counter, // 0x05
|
|
// char/number: counter number
|
|
// next1: +2 (check_counter)
|
|
// next2: +1 (restore_counter)
|
|
// quantifiers: -
|
|
// is_not/dont_push: - (was dont_push)
|
|
|
|
// st_restore_counter, // 0x06
|
|
// char/number: counter number
|
|
// next1: 0 (always treated as "not matched")
|
|
// next2: 0
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_roundbracket_open, // 0x07
|
|
// char/number: bracket number
|
|
// next1: +2 (next of roundbracket_pop, atom)
|
|
// next2: +1 (roundbracket_pop)
|
|
// q.atleast: min bracket number inside this bracket (except myself's number)
|
|
// q.atmost: max bracket number inside this bracket
|
|
// q.greedy: -
|
|
// is_not/dont_push: - (was dont_push)
|
|
|
|
// st_roundbracket_pop, // 0x08
|
|
// char/number: bracket number
|
|
// next1: 0 (always treated as "not matched")
|
|
// next2: 0
|
|
// q.atleast: min bracket number inside this bracket (i.except myself's number)
|
|
// q.atmost: max bracket number inside this bracket
|
|
// q.greedy: -
|
|
// is_not/dont_push: - (was dont_push)
|
|
|
|
// st_roundbracket_close, // 0x09
|
|
// char/number: bracket number
|
|
// next1: gen.
|
|
// next2: +1 (exit for 0 width loop)
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_repeat_in_push, // 0x0a
|
|
// char/number: repeat counter
|
|
// next1: +2 (next of repeat_in_pop, atom)
|
|
// next2: +1 (repeat_in_pop)
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_repeat_in_pop, // 0x0b
|
|
// char/number: repeat counter
|
|
// next1: 0 (always treated as "not matched")
|
|
// next2: 0
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_check_0_width_repeat, // 0x0c
|
|
// char/number: repeat counter
|
|
// next1: gen. (epsilon or check_counter)
|
|
// next2: +1 (exit for 0 width loop)
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_backreference, // 0x0d
|
|
// char/number: bracket number
|
|
// next1: gen.
|
|
// next2: +1 (exit for 0 width match)
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_lookaround_open, // 0x0e
|
|
// char/number: -
|
|
// next1: next of lookaround_close (to where jumps after lookaround assertion)
|
|
// next2: +2 (the contents of brackets)
|
|
// q.atleast: <fixed-width> number of chars to be rewound (for (?<=...) (?<!...))
|
|
// <variable-width> 0: lookahead, 1: lookbehind, 2: mprewinder.
|
|
// q.atmost: -
|
|
// q.greedy: -
|
|
// is_not/dont_push: not
|
|
|
|
// st_bol, // 0x0f
|
|
// char/number: -
|
|
// next1/next2: -
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_eol, // 0x10
|
|
// char/number: -
|
|
// next1/next2: -
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_boundary, // 0x11
|
|
// char/number: -
|
|
// next1/next2: -
|
|
// quantifiers: -
|
|
// is_not/dont_push: not
|
|
|
|
// st_success, // 0x12
|
|
// char/number: -
|
|
// next1/next2: -
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
// st_move_nextpos, // 0x13
|
|
// char/number: -
|
|
// next1/next2: -
|
|
// quantifiers: -
|
|
// is_not/dont_push: -
|
|
|
|
void reset()
|
|
{
|
|
number = 0;
|
|
type = st_character;
|
|
next1 = 1;
|
|
next2 = 0;
|
|
is_not = false;
|
|
quantifier.reset();
|
|
}
|
|
|
|
bool is_character_or_class() const
|
|
{
|
|
return type == st_character || type == st_character_class;
|
|
}
|
|
|
|
bool has_quantifier() const
|
|
{
|
|
// 1. character: size == 1 && type == character,
|
|
// 2. [...]: size == 1 && type == character_class,
|
|
// 3. (...): size == ? && type == roundbracket_open,
|
|
// 4. (?:...): size == ? && type == epsilon && character == ':',
|
|
// 5. backref: size == ? && type == backreference,
|
|
// -- assertions boundary --
|
|
// 6. lookaround: size == ? && type == lookaround,
|
|
// 7. assertion: size == 0 && type == one of assertions (^, $, \b and \B).
|
|
#if !defined(SRELL_ENABLE_GT)
|
|
return type < st_zero_width_boundary;
|
|
#else
|
|
// 5.5. independent: size == ? && type == lookaround && character == '>',
|
|
return type < st_zero_width_boundary || (type == st_lookaround_open && character == meta_char::mc_gt);
|
|
#endif
|
|
}
|
|
|
|
bool is_noncapturinggroup() const
|
|
{
|
|
return type == st_epsilon && character == meta_char::mc_colon;
|
|
}
|
|
|
|
bool has_0widthchecker() const
|
|
{
|
|
return type == st_roundbracket_open || type == st_backreference;
|
|
}
|
|
|
|
bool is_negcharclass() const
|
|
{
|
|
return type == st_character_class && is_not;
|
|
}
|
|
|
|
bool is_branch() const
|
|
{
|
|
return type == st_epsilon && next2 != 0 && character == meta_char::mc_bar; // '|'
|
|
}
|
|
};
|
|
// re_state
|
|
|
|
struct re_flags
|
|
{
|
|
// bool i;
|
|
// bool m;
|
|
// bool s;
|
|
|
|
#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND)
|
|
bool back;
|
|
#endif
|
|
|
|
void reset(const regex_constants::syntax_option_type /* flags */)
|
|
{
|
|
// i = (flags & regex_constants::icase) != 0; // Case-insensitive.
|
|
// m = (flags & regex_constants::multiline) != 0;
|
|
// s = (flags & regex_constants::dotall) != 0;
|
|
|
|
#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND)
|
|
back = false;
|
|
#endif
|
|
}
|
|
|
|
void restore_from(const re_flags &backup)
|
|
{
|
|
#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND)
|
|
back = backup.back;
|
|
#endif
|
|
}
|
|
};
|
|
// re_flags
|
|
|
|
template <typename charT>
|
|
struct re_compiler_state : public re_flags
|
|
{
|
|
bool backref_used;
|
|
|
|
simple_array<uint_l32> atleast_widths_of_brackets;
|
|
#if !defined(SRELL_NO_NAMEDCAPTURE)
|
|
groupname_mapper<charT> unresolved_gnames;
|
|
#endif
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
identifier_charclass idchecker;
|
|
#endif
|
|
|
|
void reset(const regex_constants::syntax_option_type flags)
|
|
{
|
|
re_flags::reset(flags);
|
|
|
|
backref_used = false;
|
|
atleast_widths_of_brackets.clear();
|
|
|
|
#if !defined(SRELL_NO_NAMEDCAPTURE)
|
|
unresolved_gnames.clear();
|
|
#endif
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
// idchecker.clear(); // Keeps data once created.
|
|
#endif
|
|
}
|
|
};
|
|
// re_compiler_state
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_state.hpp"]
|
|
// ["rei_search_state.hpp" ...
|
|
|
|
template <typename BidirectionalIterator>
|
|
class sub_match /* : std::pair<BidirectionalIterator, BidirectionalIterator> */;
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
//template <typename charT>
|
|
struct re_state;
|
|
|
|
template </* typename charT, */typename BidirectionalIterator>
|
|
struct re_search_state_core
|
|
{
|
|
const re_state/* <charT> */ *in_NFA_states;
|
|
BidirectionalIterator in_string;
|
|
};
|
|
|
|
// A pair of iterators named open_at and close_at.
template <typename BidirectionalIterator>
struct re_submatch_core
{
	BidirectionalIterator open_at;
	BidirectionalIterator close_at;
};
|
|
|
|
template <typename BidirectionalIterator>
|
|
struct re_submatch_type
|
|
{
|
|
re_submatch_core<BidirectionalIterator> core;
|
|
uint_l32 counter;
|
|
};
|
|
|
|
template </*typename charT, */typename BidirectionalIterator>
|
|
struct re_search_state_types
|
|
{
|
|
typedef re_submatch_core<BidirectionalIterator> submatch_core;
|
|
typedef re_submatch_type<BidirectionalIterator> submatch_type;
|
|
typedef uint_l32 counter_type;
|
|
typedef BidirectionalIterator position_type;
|
|
|
|
typedef std::vector<submatch_type> submatch_array;
|
|
|
|
typedef re_search_state_core</*charT, */BidirectionalIterator> search_core_state;
|
|
|
|
typedef std::vector<search_core_state> backtracking_array;
|
|
typedef std::vector<submatch_core> capture_array;
|
|
typedef simple_array<counter_type> counter_array;
|
|
typedef std::vector<position_type> repeat_array;
|
|
};
|
|
|
|
template </*typename charT1, */typename charT2>
|
|
struct re_search_state_types</*charT1, */const charT2 *>
|
|
{
|
|
typedef re_submatch_core<const charT2 *> submatch_core;
|
|
typedef re_submatch_type<const charT2 *> submatch_type;
|
|
typedef uint_l32 counter_type;
|
|
typedef const charT2 *position_type;
|
|
|
|
typedef simple_array<submatch_type> submatch_array;
|
|
|
|
typedef re_search_state_core</*charT1, */const charT2 *> search_core_state;
|
|
|
|
typedef simple_array<search_core_state> backtracking_array;
|
|
typedef simple_array<submatch_core> capture_array;
|
|
typedef simple_array<position_type> repeat_array;
|
|
typedef simple_array<counter_type> counter_array;
|
|
};
|
|
// re_search_state_types
|
|
|
|
template </*typename charT, */typename BidirectionalIterator>
|
|
class re_search_state : public re_search_state_types</*charT, */BidirectionalIterator>
|
|
{
|
|
private:
|
|
|
|
typedef re_search_state_types</*charT, */BidirectionalIterator> base_type;
|
|
|
|
public:
|
|
|
|
typedef typename base_type::submatch_core submatchcore_type;
|
|
typedef typename base_type::submatch_type submatch_type;
|
|
typedef typename base_type::counter_type counter_type;
|
|
typedef typename base_type::position_type position_type;
|
|
|
|
typedef typename base_type::submatch_array submatch_array;
|
|
|
|
typedef typename base_type::search_core_state search_core_state;
|
|
|
|
typedef typename base_type::backtracking_array backtracking_array;
|
|
typedef typename base_type::capture_array capture_array;
|
|
typedef typename base_type::counter_array counter_array;
|
|
typedef typename base_type::repeat_array repeat_array;
|
|
|
|
typedef typename backtracking_array::size_type btstack_size_type;
|
|
|
|
public:
|
|
|
|
struct bottom_state
|
|
{
|
|
btstack_size_type btstack_size;
|
|
typename capture_array::size_type capturestack_size;
|
|
typename counter_array::size_type counterstack_size;
|
|
typename repeat_array::size_type repeatstack_size;
|
|
|
|
bottom_state(
|
|
const btstack_size_type bt,
|
|
const typename capture_array::size_type h,
|
|
const typename counter_array::size_type c,
|
|
const typename repeat_array::size_type r)
|
|
: btstack_size(bt)
|
|
, capturestack_size(h)
|
|
, counterstack_size(c)
|
|
, repeatstack_size(r)
|
|
{
|
|
}
|
|
};
|
|
|
|
public:
|
|
|
|
search_core_state nth;
|
|
|
|
#if !defined(SRELL_NO_LIMIT_COUNTER)
|
|
std::size_t failure_counter;
|
|
#endif
|
|
|
|
BidirectionalIterator srchend;
|
|
BidirectionalIterator lblim;
|
|
|
|
BidirectionalIterator nextpos;
|
|
|
|
backtracking_array bt_stack;
|
|
|
|
capture_array capture_stack;
|
|
counter_array counter_stack;
|
|
repeat_array repeat_stack;
|
|
|
|
submatch_array bracket;
|
|
counter_array counter;
|
|
repeat_array repeat;
|
|
|
|
btstack_size_type btstack_size;
|
|
|
|
BidirectionalIterator srchbegin;
|
|
|
|
public:
|
|
|
|
void init
|
|
(
|
|
const BidirectionalIterator begin,
|
|
const BidirectionalIterator end,
|
|
const BidirectionalIterator lookbehindlimit,
|
|
const regex_constants::match_flag_type flags
|
|
)
|
|
{
|
|
lblim = lookbehindlimit;
|
|
nextpos = srchbegin = begin;
|
|
srchend = end;
|
|
flags_ = flags;
|
|
}
|
|
|
|
void set_entrypoint(const re_state *const entry)
|
|
{
|
|
entry_state_ = entry;
|
|
}
|
|
|
|
void init_for_automaton
|
|
(
|
|
uint_l32 num_of_submatches,
|
|
const uint_l32 num_of_counters,
|
|
const uint_l32 num_of_repeats
|
|
)
|
|
{
|
|
|
|
bracket.resize(num_of_submatches);
|
|
counter.resize(num_of_counters);
|
|
repeat.resize(num_of_repeats);
|
|
|
|
nth.in_string = (flags_ & regex_constants::match_continuous) ? srchbegin : srchend;
|
|
|
|
while (num_of_submatches > 1)
|
|
{
|
|
submatch_type &br = bracket[--num_of_submatches];
|
|
|
|
br.core.open_at = br.core.close_at = this->srchend;
|
|
br.counter = 0;
|
|
// 15.10.2.9; AtomEscape:
|
|
// If the regular expression has n or more capturing parentheses
|
|
// but the nth one is undefined because it hasn't captured anything,
|
|
// then the backreference always succeeds.
|
|
|
|
// C.f., table 27 and 28 on TR1, table 142 and 143 on C++11.
|
|
}
|
|
|
|
clear_stacks();
|
|
}
|
|
|
|
#if defined(SRELL_NO_LIMIT_COUNTER)
|
|
void reset(/* const BidirectionalIterator start */)
|
|
#else
|
|
void reset(/* const BidirectionalIterator start, */ const std::size_t limit)
|
|
#endif
|
|
{
|
|
nth.in_NFA_states = this->entry_state_;
|
|
|
|
bracket[0].core.open_at = nth.in_string;
|
|
|
|
#if !defined(SRELL_NO_LIMIT_COUNTER)
|
|
failure_counter = limit;
|
|
#endif
|
|
}
|
|
|
|
bool is_at_lookbehindlimit() const
|
|
{
|
|
return nth.in_string == this->lblim;
|
|
}
|
|
|
|
bool is_at_srchend() const
|
|
{
|
|
return nth.in_string == this->srchend;
|
|
}
|
|
|
|
bool is_null() const
|
|
{
|
|
return nth.in_string == bracket[0].core.open_at;
|
|
}
|
|
|
|
// regex_constants::match_flag_type flags() const
|
|
// {
|
|
// return this->flags_;
|
|
// }
|
|
|
|
bool match_not_bol_flag() const
|
|
{
|
|
if (this->flags_ & regex_constants::match_not_bol)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
bool match_not_eol_flag() const
|
|
{
|
|
if (this->flags_ & regex_constants::match_not_eol)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
bool match_not_bow_flag() const
|
|
{
|
|
if (this->flags_ & regex_constants::match_not_bow)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
bool match_not_eow_flag() const
|
|
{
|
|
if (this->flags_ & regex_constants::match_not_eow)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
bool match_prev_avail_flag() const
|
|
{
|
|
if (this->flags_ & regex_constants::match_prev_avail)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
bool match_not_null_flag() const
|
|
{
|
|
if (this->flags_ & regex_constants::match_not_null)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
bool match_continuous_flag() const
|
|
{
|
|
if (this->flags_ & regex_constants::match_continuous)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
bool match_match_flag() const
|
|
{
|
|
if (this->flags_ & regex_constants::match_match_)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
bool set_bracket0(const BidirectionalIterator begin, const BidirectionalIterator end)
|
|
{
|
|
nth.in_string = begin;
|
|
nextpos = end;
|
|
return true;
|
|
}
|
|
|
|
void clear_stacks()
|
|
{
|
|
btstack_size = 0;
|
|
bt_stack.clear();
|
|
capture_stack.clear();
|
|
repeat_stack.clear();
|
|
counter_stack.clear();
|
|
}
|
|
|
|
btstack_size_type size() const // For debug.
|
|
{
|
|
return bt_stack.size();
|
|
}
|
|
|
|
bool is_empty() const // For debug.
|
|
{
|
|
if (btstack_size == 0
|
|
&& bt_stack.size() == 0
|
|
&& capture_stack.size() == 0
|
|
&& repeat_stack.size() == 0
|
|
&& counter_stack.size() == 0)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
private:
|
|
|
|
/* const */regex_constants::match_flag_type flags_;
|
|
const re_state/* <charT> */ * /* const */entry_state_;
|
|
};
|
|
// re_search_state
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_search_state.hpp"]
|
|
// ["rei_bmh.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
|
|
template <typename charT, typename utf_traits>
|
|
class re_bmh
|
|
{
|
|
public:
|
|
|
|
re_bmh()
|
|
{
|
|
}
|
|
|
|
re_bmh(const re_bmh &right)
|
|
{
|
|
operator=(right);
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
|
|
re_bmh(re_bmh &&right) SRELL_NOEXCEPT
|
|
{
|
|
operator=(std::move(right));
|
|
}
|
|
#endif
|
|
|
|
// Copies the pattern, the shift table and the code unit sequence of
// 'that'. Self-assignment is a no-op.
re_bmh &operator=(const re_bmh &that)
{
	if (this == &that)
		return *this;

	this->u32string_ = that.u32string_;
	this->bmtable_ = that.bmtable_;
	this->repseq_ = that.repseq_;

	return *this;
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
// Moves the pattern, the shift table and the code unit sequence out of
// 'that'. Self-assignment is a no-op.
re_bmh &operator=(re_bmh &&that) SRELL_NOEXCEPT
{
	if (this == &that)
		return *this;

	this->u32string_ = std::move(that.u32string_);
	this->bmtable_ = std::move(that.bmtable_);
	this->repseq_ = std::move(that.repseq_);

	return *this;
}
#endif
|
|
|
|
void clear()
|
|
{
|
|
u32string_.clear();
|
|
|
|
bmtable_.clear();
|
|
repseq_.clear();
|
|
}
|
|
|
|
void setup(const simple_array<uchar32> &u32s, const bool icase)
|
|
{
|
|
u32string_ = u32s;
|
|
setup_();
|
|
|
|
if (!icase)
|
|
setup_for_casesensitive();
|
|
else
|
|
setup_for_icase();
|
|
}
|
|
|
|
template <typename RandomAccessIterator>
|
|
bool do_casesensitivesearch(re_search_state<RandomAccessIterator> &sstate, const std::random_access_iterator_tag) const
|
|
{
|
|
RandomAccessIterator begin = sstate.srchbegin;
|
|
const RandomAccessIterator end = sstate.srchend;
|
|
std::size_t offset = static_cast<std::size_t>(repseq_.size() - 1);
|
|
const charT *const relastchar = &repseq_[offset];
|
|
|
|
for (; static_cast<std::size_t>(end - begin) > offset;)
|
|
{
|
|
begin += offset;
|
|
|
|
if (*begin == *relastchar)
|
|
{
|
|
const charT *re = relastchar;
|
|
RandomAccessIterator tail = begin;
|
|
|
|
for (; *--re == *--tail;)
|
|
{
|
|
if (re == repseq_.data())
|
|
return sstate.set_bracket0(tail, ++begin);
|
|
}
|
|
}
|
|
offset = bmtable_[*begin & 0xff];
|
|
}
|
|
return false;
|
|
}
|
|
|
|
template <typename BidirectionalIterator>
|
|
bool do_casesensitivesearch(re_search_state<BidirectionalIterator> &sstate, const std::bidirectional_iterator_tag) const
|
|
{
|
|
BidirectionalIterator begin = sstate.srchbegin;
|
|
const BidirectionalIterator end = sstate.srchend;
|
|
std::size_t offset = static_cast<std::size_t>(repseq_.size() - 1);
|
|
const charT *const relastchar = &repseq_[offset];
|
|
|
|
for (;;)
|
|
{
|
|
for (; offset; --offset, ++begin)
|
|
if (begin == end)
|
|
return false;
|
|
|
|
if (*begin == *relastchar)
|
|
{
|
|
const charT *re = relastchar;
|
|
BidirectionalIterator tail = begin;
|
|
|
|
for (; *--re == *--tail;)
|
|
{
|
|
if (re == repseq_.data())
|
|
return sstate.set_bracket0(tail, ++begin);
|
|
}
|
|
}
|
|
offset = bmtable_[*begin & 0xff];
|
|
}
|
|
}
|
|
|
|
template <typename RandomAccessIterator>
|
|
bool do_icasesearch(re_search_state<RandomAccessIterator> &sstate, const std::random_access_iterator_tag) const
|
|
{
|
|
const RandomAccessIterator begin = sstate.srchbegin;
|
|
const RandomAccessIterator end = sstate.srchend;
|
|
std::size_t offset = bmtable_[256];
|
|
const uchar32 entrychar = u32string_[u32string_.size() - 1];
|
|
const uchar32 *const re2ndlastchar = &u32string_[u32string_.size() - 2];
|
|
RandomAccessIterator curpos = begin;
|
|
|
|
for (; static_cast<std::size_t>(end - curpos) > offset;)
|
|
{
|
|
curpos += offset;
|
|
|
|
for (; utf_traits::is_trailing(*curpos);)
|
|
if (++curpos == end)
|
|
return false;
|
|
|
|
const uchar32 txtlastchar = utf_traits::codepoint(curpos, end);
|
|
|
|
if (txtlastchar == entrychar || unicode_case_folding::do_casefolding(txtlastchar) == entrychar)
|
|
{
|
|
const uchar32 *re = re2ndlastchar;
|
|
RandomAccessIterator tail = curpos;
|
|
|
|
// for (; *--re == unicode_case_folding::do_casefolding(utf_traits::dec_codepoint(tail, begin));)
|
|
for (; *re == unicode_case_folding::do_casefolding(utf_traits::dec_codepoint(tail, begin)); --re)
|
|
{
|
|
if (re == u32string_.data())
|
|
{
|
|
utf_traits::codepoint_inc(curpos, end);
|
|
return sstate.set_bracket0(tail, curpos);
|
|
}
|
|
if (tail == begin)
|
|
break;
|
|
}
|
|
}
|
|
offset = bmtable_[txtlastchar & 0xff];
|
|
}
|
|
return false;
|
|
}
|
|
|
|
template <typename BidirectionalIterator>
|
|
bool do_icasesearch(re_search_state<BidirectionalIterator> &sstate, const std::bidirectional_iterator_tag) const
|
|
{
|
|
const BidirectionalIterator begin = sstate.srchbegin;
|
|
const BidirectionalIterator end = sstate.srchend;
|
|
|
|
if (begin != end)
|
|
{
|
|
std::size_t offset = bmtable_[256]; //static_cast<std::size_t>(u32string_.size() - 1);
|
|
const uchar32 entrychar = u32string_[offset];
|
|
const uchar32 *const re2ndlastchar = &u32string_[offset - 1];
|
|
BidirectionalIterator curpos = begin;
|
|
|
|
for (;;)
|
|
{
|
|
for (;;)
|
|
{
|
|
if (++curpos == end)
|
|
return false;
|
|
if (!utf_traits::is_trailing(*curpos))
|
|
if (--offset == 0)
|
|
break;
|
|
}
|
|
// const uchar32 txtlastchar = unicode_case_folding::do_casefolding(utf_traits::codepoint(curpos, end));
|
|
const uchar32 txtlastchar = utf_traits::codepoint(curpos, end);
|
|
|
|
// if (txtlastchar == *re2ndlastchar)
|
|
// if (txtlastchar == *re2ndlastchar || unicode_case_folding::do_casefolding(txtlastchar) == *re2ndlastchar)
|
|
if (txtlastchar == entrychar || unicode_case_folding::do_casefolding(txtlastchar) == entrychar)
|
|
{
|
|
const uchar32 *re = re2ndlastchar;
|
|
BidirectionalIterator tail = curpos;
|
|
|
|
for (; *re == unicode_case_folding::do_casefolding(utf_traits::dec_codepoint(tail, begin)); --re)
|
|
{
|
|
if (re == u32string_.data())
|
|
{
|
|
utf_traits::codepoint_inc(curpos, end);
|
|
return sstate.set_bracket0(tail, curpos);
|
|
}
|
|
if (tail == begin)
|
|
break;
|
|
}
|
|
}
|
|
offset = bmtable_[txtlastchar & 0xff];
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
private:
|
|
|
|
void setup_()
|
|
{
|
|
bmtable_.resize(257);
|
|
}
|
|
|
|
void setup_for_casesensitive()
|
|
{
|
|
charT mbstr[utf_traits::maxseqlen];
|
|
const std::size_t u32str_lastcharpos_ = static_cast<std::size_t>(u32string_.size() - 1);
|
|
|
|
repseq_.clear();
|
|
|
|
for (std::size_t i = 0; i <= u32str_lastcharpos_; ++i)
|
|
{
|
|
const uchar32 seqlen = utf_traits::to_codeunits(mbstr, u32string_[i]);
|
|
|
|
for (uchar32 j = 0; j < seqlen; ++j)
|
|
repseq_.push_back(mbstr[j]);
|
|
}
|
|
|
|
for (std::size_t i = 0; i < 256; ++i)
|
|
bmtable_[i] = static_cast<std::size_t>(repseq_.size());
|
|
|
|
const std::size_t repseq_lastcharpos_ = static_cast<std::size_t>(repseq_.size() - 1);
|
|
|
|
for (std::size_t i = 0; i < repseq_lastcharpos_; ++i)
|
|
bmtable_[repseq_[i] & 0xff] = repseq_lastcharpos_ - i;
|
|
}
|
|
|
|
void setup_for_icase()
|
|
{
|
|
charT mbstr[utf_traits::maxseqlen];
|
|
uchar32 u32table[ucf_constants::rev_maxset];
|
|
const std::size_t u32str_lastcharpos = static_cast<std::size_t>(u32string_.size() - 1);
|
|
simple_array<std::size_t> minlen(u32string_.size());
|
|
std::size_t cu_repseq_lastcharpos = 0;
|
|
|
|
for (std::size_t i = 0; i <= u32str_lastcharpos; ++i)
|
|
{
|
|
const uchar32 setnum = unicode_case_folding::casefoldedcharset(u32table, u32string_[i]);
|
|
uchar32 u32c = u32table[0];
|
|
|
|
for (uchar32 j = 1; j < setnum; ++j)
|
|
if (u32c > u32table[j])
|
|
u32c = u32table[j];
|
|
|
|
if (i < u32str_lastcharpos)
|
|
cu_repseq_lastcharpos += minlen[i] = utf_traits::to_codeunits(mbstr, u32c);
|
|
}
|
|
|
|
++cu_repseq_lastcharpos;
|
|
|
|
for (std::size_t i = 0; i < 256; ++i)
|
|
bmtable_[i] = cu_repseq_lastcharpos;
|
|
|
|
bmtable_[256] = --cu_repseq_lastcharpos;
|
|
|
|
for (std::size_t i = 0; i < u32str_lastcharpos; ++i)
|
|
{
|
|
const uchar32 setnum = unicode_case_folding::casefoldedcharset(u32table, u32string_[i]);
|
|
|
|
for (uchar32 j = 0; j < setnum; ++j)
|
|
bmtable_[u32table[j] & 0xff] = cu_repseq_lastcharpos;
|
|
|
|
cu_repseq_lastcharpos -= minlen[i];
|
|
}
|
|
}
|
|
|
|
public: // For debug.
|
|
|
|
void print_table() const;
|
|
void print_seq() const;
|
|
|
|
private:
|
|
|
|
simple_array<uchar32> u32string_;
|
|
// std::size_t bmtable_[256];
|
|
simple_array<std::size_t> bmtable_;
|
|
simple_array<charT> repseq_;
|
|
};
|
|
// re_bmh
|
|
|
|
#endif // !defined(SRELLDBG_NO_BMH)
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_bmh.hpp"]
|
|
// ["rei_upos.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
struct posdata_holder
|
|
{
|
|
simple_array<uchar32> indices;
|
|
simple_array<uchar32> seqs;
|
|
range_pairs ranges;
|
|
range_pair length;
|
|
|
|
void clear()
|
|
{
|
|
indices.clear();
|
|
seqs.clear();
|
|
ranges.clear();
|
|
length.set(1);
|
|
}
|
|
|
|
bool has_empty() const
|
|
{
|
|
return (indices.size() >= 2 && indices[0] != indices[1]) ? true : false;
|
|
}
|
|
|
|
bool has_data() const
|
|
{
|
|
return ranges.size() > 0 || indices.size() > 0;
|
|
}
|
|
|
|
bool may_contain_strings() const
|
|
{
|
|
return indices.size() > 0; // >= 2;
|
|
}
|
|
|
|
// Exchanges all four members with those of 'right'.
void swap(posdata_holder &right)
{
	length.swap(right.length);
	ranges.swap(right.ranges);
	seqs.swap(right.seqs);
	indices.swap(right.indices);
}
|
|
|
|
void do_union(const posdata_holder &right)
|
|
{
|
|
simple_array<uchar32> curseq;
|
|
|
|
ranges.merge(right.ranges);
|
|
|
|
if (right.has_empty() && !has_empty())
|
|
register_emptystring();
|
|
|
|
for (uchar32 seqlen = 2; seqlen < static_cast<uchar32>(right.indices.size()); ++seqlen)
|
|
{
|
|
const uchar32 end = right.indices[seqlen - 1];
|
|
uchar32 begin = right.indices[seqlen];
|
|
|
|
if (begin != end)
|
|
{
|
|
const std::size_t complen = seqlen * sizeof (uchar32);
|
|
|
|
ensure_length(seqlen);
|
|
curseq.resize(seqlen);
|
|
|
|
for (; begin < end;)
|
|
{
|
|
const uchar32 inspos = find_seq(&right.seqs[begin], seqlen, complen);
|
|
|
|
if (inspos == indices[seqlen - 1])
|
|
{
|
|
for (uchar32 i = 0; i < seqlen; ++i, ++begin)
|
|
curseq[i] = right.seqs[begin];
|
|
|
|
seqs.insert(inspos, curseq);
|
|
for (uchar32 i = 0; i < seqlen; ++i)
|
|
indices[i] += seqlen;
|
|
}
|
|
else
|
|
begin += seqlen;
|
|
}
|
|
}
|
|
}
|
|
check_lengths();
|
|
}
|
|
|
|
void do_subtract(const posdata_holder &right)
|
|
{
|
|
const uchar32 maxlen = static_cast<uchar32>(indices.size() <= right.indices.size() ? indices.size() : right.indices.size());
|
|
|
|
{
|
|
range_pairs kept;
|
|
range_pairs removed;
|
|
|
|
ranges.split_ranges(kept, removed, right.ranges);
|
|
ranges.swap(kept);
|
|
}
|
|
|
|
if (right.has_empty() && has_empty())
|
|
unregister_emptystring();
|
|
|
|
for (uchar32 seqlen = 2; seqlen < maxlen; ++seqlen)
|
|
{
|
|
const uchar32 end = right.indices[seqlen - 1];
|
|
uchar32 begin = right.indices[seqlen];
|
|
|
|
if (begin != end)
|
|
{
|
|
const std::size_t complen = seqlen * sizeof (uchar32);
|
|
|
|
for (; begin < end;)
|
|
{
|
|
const uchar32 delpos = find_seq(&right.seqs[begin], seqlen, complen);
|
|
|
|
if (delpos < indices[seqlen - 1])
|
|
{
|
|
seqs.erase(delpos, seqlen);
|
|
|
|
for (uchar32 i = 0; i < seqlen; ++i)
|
|
indices[i] -= seqlen;
|
|
}
|
|
else
|
|
begin += seqlen;
|
|
}
|
|
}
|
|
}
|
|
check_lengths();
|
|
}
|
|
|
|
void do_and(const posdata_holder &right)
|
|
{
|
|
const uchar32 maxlen = static_cast<uchar32>(indices.size() <= right.indices.size() ? indices.size() : right.indices.size());
|
|
posdata_holder newpos;
|
|
simple_array<uchar32> curseq;
|
|
|
|
{
|
|
range_pairs kept;
|
|
|
|
ranges.split_ranges(kept, newpos.ranges, right.ranges);
|
|
ranges.swap(newpos.ranges);
|
|
}
|
|
|
|
if (has_empty() && right.has_empty())
|
|
newpos.register_emptystring();
|
|
else if (may_contain_strings() || right.may_contain_strings())
|
|
ensure_length(1);
|
|
|
|
for (uchar32 seqlen = 2; seqlen < maxlen; ++seqlen)
|
|
{
|
|
const uchar32 end = right.indices[seqlen - 1];
|
|
uchar32 begin = right.indices[seqlen];
|
|
|
|
if (begin != end)
|
|
{
|
|
const std::size_t complen = seqlen * sizeof (uchar32);
|
|
const uchar32 myend = indices[seqlen - 1];
|
|
|
|
curseq.resize(seqlen);
|
|
|
|
for (; begin < end; begin += seqlen)
|
|
{
|
|
const uchar32 srcpos = find_seq(&right.seqs[begin], seqlen, complen);
|
|
|
|
if (srcpos < myend)
|
|
{
|
|
newpos.ensure_length(seqlen);
|
|
|
|
const uchar32 inspos = newpos.find_seq(&right.seqs[begin], seqlen, complen);
|
|
|
|
if (inspos == newpos.indices[seqlen - 1])
|
|
{
|
|
for (uchar32 i = 0; i < seqlen; ++i)
|
|
curseq[i] = right.seqs[begin + i];
|
|
|
|
newpos.seqs.insert(inspos, curseq);
|
|
for (uchar32 i = 0; i < seqlen; ++i)
|
|
newpos.indices[i] += seqlen;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
this->indices.swap(newpos.indices);
|
|
this->seqs.swap(newpos.seqs);
|
|
check_lengths();
|
|
}
|
|
|
|
void split_seqs_and_ranges(const simple_array<uchar32> &inseqs, const bool icase, const bool back)
|
|
{
|
|
const uchar32 max = static_cast<uchar32>(inseqs.size());
|
|
simple_array<uchar32> curseq;
|
|
|
|
clear();
|
|
|
|
for (uchar32 indx = 0; indx < max;)
|
|
{
|
|
const uchar32 elen = inseqs[indx++];
|
|
|
|
if (elen == 1) // Range.
|
|
{
|
|
ranges.join(range_pair_helper(inseqs[indx], inseqs[indx + 1]));
|
|
indx += 2;
|
|
}
|
|
else if (elen == 2)
|
|
{
|
|
const uchar32 ucpval = inseqs[indx++];
|
|
|
|
if (ucpval != constants::ccstr_empty)
|
|
ranges.join(range_pair_helper(ucpval));
|
|
else
|
|
register_emptystring();
|
|
}
|
|
else if (elen >= 3)
|
|
{
|
|
const uchar32 seqlen = elen - 1;
|
|
|
|
ensure_length(seqlen);
|
|
|
|
const uchar32 inspos = indices[seqlen - 1];
|
|
|
|
curseq.resize(seqlen);
|
|
if (!back)
|
|
{
|
|
for (uchar32 j = 0; j < seqlen; ++j, ++indx)
|
|
curseq[j] = inseqs[indx];
|
|
}
|
|
else
|
|
{
|
|
for (uchar32 j = seqlen; j; ++indx)
|
|
curseq[--j] = inseqs[indx];
|
|
}
|
|
|
|
if (icase)
|
|
{
|
|
for (simple_array<uchar32>::size_type i = 0; i < curseq.size(); ++i)
|
|
curseq[i] = unicode_case_folding::do_casefolding(curseq[i]);
|
|
}
|
|
|
|
const std::size_t complen = seqlen * sizeof (uchar32);
|
|
|
|
for (uchar32 i = indices[seqlen];; i += seqlen)
|
|
{
|
|
if (i == inspos)
|
|
{
|
|
seqs.insert(inspos, curseq);
|
|
for (uchar32 i = 0; i < seqlen; ++i)
|
|
indices[i] += seqlen;
|
|
break;
|
|
}
|
|
|
|
if (std::memcmp(&seqs[i], curseq.data(), complen) == 0)
|
|
break;
|
|
}
|
|
|
|
}
|
|
//elen == 0: Padding.
|
|
}
|
|
|
|
// if (this->is_icase())
|
|
if (icase)
|
|
ranges.make_caseunfoldedcharset();
|
|
|
|
check_lengths();
|
|
|
|
}
|
|
|
|
private:
|
|
|
|
void register_emptystring()
|
|
{
|
|
if (indices.size() < 2)
|
|
{
|
|
indices.resize(2);
|
|
indices[1] = 0;
|
|
indices[0] = 1;
|
|
}
|
|
else if (indices[0] == indices[1])
|
|
{
|
|
++indices[0];
|
|
}
|
|
length.first = 0;
|
|
}
|
|
|
|
void unregister_emptystring()
|
|
{
|
|
if (indices.size() >= 2 && indices[0] != indices[1])
|
|
indices[0] = indices[1];
|
|
}
|
|
|
|
void ensure_length(const uchar32 seqlen)
|
|
{
|
|
uchar32 curlen = static_cast<uchar32>(indices.size());
|
|
|
|
if (seqlen >= curlen)
|
|
{
|
|
indices.resize(seqlen + 1);
|
|
for (; curlen <= seqlen; ++curlen)
|
|
indices[curlen] = 0;
|
|
}
|
|
}
|
|
|
|
// Looks for the sequence starting at 'seqbegin' among the stored
// sequences of length 'seqlen' ('complen' is its size in bytes).
// Returns the position in seqs where it was found, or
// indices[seqlen - 1], the end of that region, when it is absent.
uchar32 find_seq(const uchar32 *const seqbegin, const uchar32 seqlen, const std::size_t complen) const
{
	const uchar32 regionend = indices[seqlen - 1];
	uchar32 pos = indices[seqlen];

	while (pos < regionend)
	{
		if (std::memcmp(seqbegin, &seqs[pos], complen) == 0)
			return pos;

		pos += seqlen;
	}
	return regionend;
}
|
|
|
|
void check_lengths()
|
|
{
|
|
length.set(constants::max_u32value, 0);
|
|
|
|
for (uchar32 i = 2; i < static_cast<uchar32>(indices.size()); ++i)
|
|
{
|
|
if (indices[i] != indices[i - 1])
|
|
{
|
|
if (length.first > i)
|
|
length.first = i;
|
|
if (length.second < i)
|
|
length.second = i;
|
|
}
|
|
}
|
|
|
|
if (ranges.size())
|
|
{
|
|
if (length.first > 1)
|
|
length.first = 1;
|
|
if (length.second < 1)
|
|
length.second = 1;
|
|
}
|
|
|
|
if (has_empty())
|
|
length.first = 0;
|
|
|
|
if (length.second == 0)
|
|
length.first = 0;
|
|
}
|
|
};
|
|
// posdata_holder
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_upos.hpp"]
|
|
// ["rei_compiler.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
template <typename charT, typename traits>
|
|
struct re_object_core
|
|
{
|
|
protected:
|
|
|
|
typedef re_state/*<charT>*/ state_type;
|
|
typedef simple_array<state_type> state_array;
|
|
|
|
state_array NFA_states;
|
|
re_character_class character_class;
|
|
|
|
#if !defined(SRELLDBG_NO_1STCHRCLS)
|
|
#if !defined(SRELLDBG_NO_BITSET)
|
|
bitset<traits::utf_traits::bitsetsize> firstchar_class_bs;
|
|
#else
|
|
range_pairs firstchar_class;
|
|
#endif
|
|
#endif
|
|
|
|
#if !defined(SRELL_NO_LIMIT_COUNTER)
|
|
public:
|
|
|
|
std::size_t limit_counter;
|
|
|
|
protected:
|
|
#endif
|
|
|
|
typedef typename traits::utf_traits utf_traits;
|
|
|
|
uint_l32 number_of_brackets;
|
|
uint_l32 number_of_counters;
|
|
uint_l32 number_of_repeats;
|
|
regex_constants::syntax_option_type soflags;
|
|
|
|
#if !defined(SRELL_NO_NAMEDCAPTURE)
|
|
groupname_mapper<charT> namedcaptures;
|
|
typedef typename groupname_mapper<charT>::gname_string gname_string;
|
|
#endif
|
|
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
re_bmh<charT, utf_traits> *bmdata;
|
|
#endif
|
|
|
|
#if !defined(SRELL_NO_LIMIT_COUNTER)
|
|
private:
|
|
|
|
static const std::size_t lcounter_defnum_ = 16777216;
|
|
|
|
#endif
|
|
|
|
protected:
|
|
|
|
re_object_core()
|
|
#if !defined(SRELL_NO_LIMIT_COUNTER)
|
|
: limit_counter(lcounter_defnum_)
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
, bmdata(NULL)
|
|
#endif
|
|
#elif !defined(SRELLDBG_NO_BMH)
|
|
: bmdata(NULL)
|
|
#endif
|
|
{
|
|
}
|
|
|
|
re_object_core(const re_object_core &right)
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
: bmdata(NULL)
|
|
#endif
|
|
{
|
|
operator=(right);
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
// Move constructor. Starts without BMH data and then delegates to move
// assignment.
re_object_core(re_object_core &&right) SRELL_NOEXCEPT
#if !defined(SRELLDBG_NO_BMH)
	: bmdata(NULL)
#endif
{
	*this = std::move(right);
}
#endif
|
|
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
~re_object_core()
|
|
{
|
|
if (bmdata)
|
|
delete bmdata;
|
|
}
|
|
#endif
|
|
|
|
// Returns the object to an empty state and stores 'flags' as the syntax
// options: states, character classes, first-character data and named
// captures are cleared; the failure limit goes back to its default; the
// bracket count restarts at 1 and the counter/repeat counts at 0; any
// BMH data is released.
void reset(const regex_constants::syntax_option_type flags)
{
	soflags = flags;	//  regex_constants::ECMAScript;

	number_of_brackets = 1;
	number_of_counters = 0;
	number_of_repeats = 0;

	NFA_states.clear();
	character_class.clear();

#if !defined(SRELLDBG_NO_1STCHRCLS)
#if !defined(SRELLDBG_NO_BITSET)
	firstchar_class_bs.reset();
#else
	firstchar_class.clear();
#endif
#endif

#if !defined(SRELL_NO_LIMIT_COUNTER)
	limit_counter = lcounter_defnum_;
#endif

#if !defined(SRELL_NO_NAMEDCAPTURE)
	namedcaptures.clear();
#endif

#if !defined(SRELLDBG_NO_BMH)
//	bmdata->clear();
	delete bmdata;	//  No effect when already NULL.
	bmdata = NULL;
#endif
}
|
|
|
|
// Copy assignment. Copies every member of 'that'; the BMH data is deep
// copied (or released when 'that' has none). Finally, when 'that' has
// any NFA states, repair_nextstates() is called with the address of the
// first state of 'that'. Self-assignment is a no-op.
re_object_core &operator=(const re_object_core &that)
{
	if (this == &that)
		return *this;

	this->NFA_states = that.NFA_states;
	this->character_class = that.character_class;

#if !defined(SRELLDBG_NO_1STCHRCLS)
#if !defined(SRELLDBG_NO_BITSET)
	this->firstchar_class_bs = that.firstchar_class_bs;
#else
	this->firstchar_class = that.firstchar_class;
#endif
#endif

#if !defined(SRELL_NO_LIMIT_COUNTER)
	this->limit_counter = that.limit_counter;
#endif

//	this->utf_traits_inst = that.utf_traits_inst;

	this->number_of_brackets = that.number_of_brackets;
	this->number_of_counters = that.number_of_counters;
	this->number_of_repeats = that.number_of_repeats;
	this->soflags = that.soflags;

#if !defined(SRELL_NO_NAMEDCAPTURE)
	this->namedcaptures = that.namedcaptures;
#endif

#if !defined(SRELLDBG_NO_BMH)
	if (!that.bmdata)
	{
		// Nothing to copy: drop our own data, if any.
		delete this->bmdata;
		this->bmdata = NULL;
	}
	else if (this->bmdata)
	{
		*this->bmdata = *that.bmdata;
	}
	else
	{
		this->bmdata = new re_bmh<charT, utf_traits>(*that.bmdata);
	}
#endif

	if (that.NFA_states.size())
		repair_nextstates(&that.NFA_states[0]);

	return *this;
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
// Move assignment. Container members are moved out of "that", scalar
// members are copied, and ownership of that.bmdata is transferred to *this.
// Self-assignment is a no-op. Returns *this.
// NOTE(review): unlike copy assignment, repair_nextstates() is not called
// here; presumably the moved state array keeps its storage -- confirm.
re_object_core &operator=(re_object_core &&that) SRELL_NOEXCEPT
{
	if (this == &that)
		return *this;

	this->NFA_states = std::move(that.NFA_states);
	this->character_class = std::move(that.character_class);

#if !defined(SRELLDBG_NO_1STCHRCLS)
#if !defined(SRELLDBG_NO_BITSET)
	this->firstchar_class_bs = std::move(that.firstchar_class_bs);
#else
	this->firstchar_class = std::move(that.firstchar_class);
#endif
#endif

#if !defined(SRELL_NO_LIMIT_COUNTER)
	this->limit_counter = that.limit_counter;
#endif

	this->number_of_brackets = that.number_of_brackets;
	this->number_of_counters = that.number_of_counters;
	this->number_of_repeats = that.number_of_repeats;
	this->soflags = that.soflags;

#if !defined(SRELL_NO_NAMEDCAPTURE)
	this->namedcaptures = std::move(that.namedcaptures);
#endif

#if !defined(SRELLDBG_NO_BMH)
	// Destroy our own search data (delete on null is a no-op), then take
	// over the source's pointer and leave the source without one.
	delete this->bmdata;
	this->bmdata = that.bmdata;
	that.bmdata = NULL;
#endif

	return *this;
}
#endif	// defined(SRELL_CPP11_MOVE_ENABLED)
|
|
|
|
// Exchanges the complete contents of *this and "right".
// Swapping an object with itself does nothing.
void swap(re_object_core &right)
{
	if (this == &right)
		return;

	this->NFA_states.swap(right.NFA_states);
	this->character_class.swap(right.character_class);

#if !defined(SRELLDBG_NO_1STCHRCLS)
#if !defined(SRELLDBG_NO_BITSET)
	this->firstchar_class_bs.swap(right.firstchar_class_bs);
#else
	this->firstchar_class.swap(right.firstchar_class);
#endif
#endif

	// The scalar members are exchanged by hand through a temporary.

#if !defined(SRELL_NO_LIMIT_COUNTER)
	{
		const std::size_t saved = this->limit_counter;

		this->limit_counter = right.limit_counter;
		right.limit_counter = saved;
	}
#endif
//	this->utf_traits_inst.swap(right.utf_traits_inst);

	{
		const uint_l32 saved = this->number_of_brackets;

		this->number_of_brackets = right.number_of_brackets;
		right.number_of_brackets = saved;
	}
	{
		const uint_l32 saved = this->number_of_counters;

		this->number_of_counters = right.number_of_counters;
		right.number_of_counters = saved;
	}
	{
		const uint_l32 saved = this->number_of_repeats;

		this->number_of_repeats = right.number_of_repeats;
		right.number_of_repeats = saved;
	}
	{
		const regex_constants::syntax_option_type saved = this->soflags;

		this->soflags = right.soflags;
		right.soflags = saved;
	}

#if !defined(SRELL_NO_NAMEDCAPTURE)
	this->namedcaptures.swap(right.namedcaptures);
#endif

#if !defined(SRELLDBG_NO_BMH)
	{
		// Only the pointers change hands; the pointed-to objects stay put.
		re_bmh<charT, utf_traits> *const saved = this->bmdata;

		this->bmdata = right.bmdata;
		right.bmdata = saved;
	}
#endif
}
|
|
|
|
// Throws regex_error(e). Before throwing, the NFA state array is emptied
// and the search data pointed to by bmdata (if any) is destroyed.
// The remaining members are left as they are (a full reset() is not done).
void throw_error(const regex_constants::error_type &e)
{
//	reset();
	NFA_states.clear();

#if !defined(SRELLDBG_NO_BMH)
	delete bmdata;	// Harmless when bmdata is already null.
	bmdata = NULL;
#endif

	throw regex_error(e);
}
|
|
|
|
private:
|
|
|
|
void repair_nextstates(const state_type *const oldbase)
|
|
{
|
|
state_type *const newbase = &this->NFA_states[0];
|
|
|
|
for (typename state_array::size_type i = 0; i < this->NFA_states.size(); ++i)
|
|
{
|
|
state_type &state = this->NFA_states[i];
|
|
|
|
if (state.next_state1)
|
|
state.next_state1 = state.next_state1 - oldbase + newbase;
|
|
|
|
if (state.next_state2)
|
|
state.next_state2 = state.next_state2 - oldbase + newbase;
|
|
}
|
|
}
|
|
};
|
|
// re_object_core
|
|
|
|
template <typename charT, typename traits>
|
|
class re_compiler : public re_object_core<charT, traits>
|
|
{
|
|
protected:
|
|
|
|
template <typename ForwardIterator>
|
|
bool compile(ForwardIterator begin, const ForwardIterator end, const regex_constants::syntax_option_type flags /* = regex_constants::ECMAScript */)
|
|
{
|
|
simple_array<uchar32> u32;
|
|
|
|
while (begin != end)
|
|
{
|
|
const uchar32 u32c = utf_traits::codepoint_inc(begin, end);
|
|
if (u32c > constants::unicode_max_codepoint)
|
|
this->throw_error(regex_constants::error_utf8);
|
|
u32.push_backncr(u32c);
|
|
}
|
|
|
|
return compile_core(u32.data(), u32.data() + u32.size(), flags);
|
|
}
|
|
|
|
bool is_icase() const
|
|
{
|
|
#if !defined(SRELL_NO_ICASE)
|
|
if (this->soflags & regex_constants::icase)
|
|
return true;
|
|
#endif
|
|
return false;
|
|
}
|
|
bool is_ricase() const
|
|
{
|
|
#if !defined(SRELL_NO_ICASE)
|
|
return /* this->NFA_states.size() && */ this->NFA_states[0].icase == true;
|
|
#else
|
|
return false;
|
|
#endif
|
|
}
|
|
|
|
bool is_multiline() const
|
|
{
|
|
if (this->soflags & regex_constants::multiline)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
bool is_dotall() const
|
|
{
|
|
return (this->soflags & regex_constants::dotall) ? true : false;
|
|
}
|
|
|
|
bool is_vmode() const
|
|
{
|
|
#if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
return (this->soflags & regex_constants::unicodesets) ? true : false;
|
|
#else
|
|
return false;
|
|
#endif
|
|
|
|
}
|
|
|
|
bool is_optimize() const
|
|
{
|
|
return (this->soflags & regex_constants::optimize) ? true : false;
|
|
}
|
|
|
|
private:
|
|
|
|
// Local shorthands. base_type names the re_object_core base class; the
// following typedefs re-export its nested types so that they can be used
// unqualified inside this class template.
typedef re_object_core<charT, traits> base_type;
|
|
typedef typename base_type::utf_traits utf_traits;
|
|
typedef typename base_type::state_type state_type;
|
|
typedef typename base_type::state_array state_array;
|
|
// Group-name string type; only present when named captures are compiled in.
#if !defined(SRELL_NO_NAMEDCAPTURE)
|
|
typedef typename base_type::gname_string gname_string;
|
|
#endif
|
|
// String type taken from re_character_class; only present when Unicode
// property support is compiled in.
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
typedef typename re_character_class::pstring pstring;
|
|
#endif
|
|
// Index/size type of the state array.
typedef typename state_array::size_type state_size_type;
|
|
|
|
// Compiles the code point sequence [begin, end) into this->NFA_states.
//   flags: syntax options; passed to both this->reset() and the
//          per-compilation state.
// Returns false when make_nfa_states() reports failure, true otherwise.
// Error conditions detected here are reported through throw_error():
//   error_paren   - input remains after the top-level parse,
//   error_backref - check_backreferences() fails.
bool compile_core(const uchar32 *begin, const uchar32 *const end, const regex_constants::syntax_option_type flags)
{
	re_quantifier wholesize;
	re_compiler_state<charT> cstate;
	state_type state;

	this->reset(flags);
//	this->soflags = flags;
	cstate.reset(flags);

	// The array starts with an epsilon state.
	state.reset();
	state.type = st_epsilon;
	state.next2 = 1;
	this->NFA_states.push_back(state);

	if (!make_nfa_states(this->NFA_states, wholesize, begin, end, cstate))
		return false;

	// make_nfa_states() also stops at an unmatched ')'.
	if (begin != end)
		this->throw_error(regex_constants::error_paren);	// ')'s are too many.

	if (!check_backreferences(cstate))
		this->throw_error(regex_constants::error_backref);

#if !defined(SRELL_NO_ICASE)
	if (this->is_icase())
		this->NFA_states[0].icase = check_if_really_needs_icase_search();
#endif

#if !defined(SRELLDBG_NO_BMH)
	setup_bmhdata();
#endif

	// Append the success state. "state" is not reset first, so the fields
	// not assigned here keep the values they had in the entry state.
	state.type = st_success;
	state.next1 = 0;
	state.next2 = 0;
	this->NFA_states.push_back(state);

	optimise();
	relativejump_to_absolutejump();

	return true;
}
|
|
|
|
// Parses a sequence of branches separated by '|' and appends the resulting
// states to "piece". Parsing stops at the end of input or at a ')', which
// is left unconsumed.
//   piece:     receives the states (appended to existing content).
//   piecesize: receives the smallest "atleast" and the largest "atmost"
//              among the branches.
//   curpos:    current read position; advanced past what was parsed.
// Returns false when make_branch() fails, true otherwise.
bool make_nfa_states(state_array &piece, re_quantifier &piecesize, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state<charT> &cstate)
{
	// Index in "piece" of the epsilon that closes the previous branch;
	// 0 means there is no previous branch yet.
	typename state_array::size_type pending_jump = 0;
	state_type state;
	state_array alt;
	re_quantifier altsize;

	piecesize.reset(0);

	for (;;)
	{
		alt.clear();

		if (!make_branch(alt, altsize, curpos, end, cstate))
			return false;

		// For piecesize.atleast, 0 as the initial value and 0 as an
		// actual value must be distinguished.
		if (piecesize.atmost == 0 || piecesize.atleast > altsize.atleast)
			piecesize.atleast = altsize.atleast;

		if (piecesize.atmost < altsize.atmost)
			piecesize.atmost = altsize.atmost;

		const bool followed_by_bar = (curpos != end && *curpos == meta_char::mc_bar);

		if (followed_by_bar)
		{
			// Another alternative follows: prefix this branch with an
			// epsilon whose next2 skips the branch and its closing epsilon.
			state.reset();
			state.character = meta_char::mc_bar;
			state.type = st_epsilon;
			state.next2 = static_cast<std::ptrdiff_t>(alt.size()) + 2;
			alt.insert(0, state);
		}

		// Let the epsilon that closed the previous branch jump over this one.
		if (pending_jump)
			piece[pending_jump].next1 = static_cast<std::ptrdiff_t>(alt.size()) + 1;

		piece += alt;

		// end or ')'
		if (curpos == end || *curpos == meta_char::mc_rbracl)
			break;

		// *curpos == '|'

		pending_jump = piece.size();
		state.reset();
		state.type = st_epsilon;
		piece.push_back(state);

		++curpos;
	}
	return true;
}
|
|
|
|
// Parses one branch: a run of atoms, each optionally followed by a
// quantifier. Parsing stops at the end of input, at '|' or at ')'; the
// stopping character is not consumed.
//   branch:     receives the states of the branch.
//   branchsize: receives the accumulated width of the branch.
//   curpos:     current read position; advanced past what was parsed.
// Returns false when get_atom() or get_quantifier() fails, true otherwise.
bool make_branch(state_array &branch, re_quantifier &branchsize, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state<charT> &cstate)
{
	state_array piece;
	state_array piece_with_quantifier;
	re_quantifier quantifier;

	branchsize.reset(0);

	for (;;)
	{
		re_quantifier piecesize;

		if (curpos == end)
			return true;

		piece.clear();
		piece_with_quantifier.clear();

		// '|' and ')' terminate the branch.
		if (*curpos == meta_char::mc_bar || *curpos == meta_char::mc_rbracl)
			return true;

		if (!get_atom(piece, piecesize, curpos, end, cstate))
			return false;

		// Nothing was produced for this atom; go on with the next one.
		if (!piece.size())
			continue;

		const state_type &firstatom = piece[0];

		quantifier.reset();	// quantifier.atleast = quantifier.atmost = 1;

		if (firstatom.has_quantifier() && curpos != end && !get_quantifier(quantifier, curpos, end))
			return false;

		// (?:) alone or followed by a quantifier contributes no states.
		const bool empty_noncapturing_group = piece.size() == 2 && firstatom.is_noncapturinggroup() && piece[1].is_noncapturinggroup();

		if (!empty_noncapturing_group)
			combine_piece_with_quantifier(piece_with_quantifier, piece, quantifier, piecesize);

		piecesize.multiply(quantifier);
		branchsize.add(piecesize);

#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND)
		// While cstate.back is set, pieces are placed in front of what
		// has been collected so far instead of behind it.
		if (!cstate.back)
			branch += piece_with_quantifier;
		else
			branch.insert(0, piece_with_quantifier);
#else
		branch += piece_with_quantifier;
#endif
	}
}
|
|
|
|
// Parses a single atom starting at *curpos (the caller guarantees that
// curpos != end) and stores the resulting state(s) in "piece".
//   piecesize: receives the width of the atom.
//   curpos:    advanced past the atom.
// Returns false when a sub-parser fails. A quantifier character in atom
// position is reported through throw_error(error_badrepeat).
bool get_atom(state_array &piece, re_quantifier &piecesize, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state<charT> &cstate)
{
	state_type atom;

	atom.reset();
	atom.character = *curpos++;

	switch (atom.character)
	{
	case meta_char::mc_rbraop:	// '(':
		// Groups and lookarounds fill "piece" and "piecesize" themselves.
		return get_piece_in_roundbrackets(piece, piecesize, curpos, end, cstate);

	case meta_char::mc_sbraop:	// '[':
#if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)
		if (this->is_vmode())	// vmode.
			return parse_charclass_v(piece, piecesize, curpos, end, cstate);
#endif
		if (!register_character_class(atom, curpos, end, cstate))
			return false;
		break;

	case meta_char::mc_escape:	// '\\':
		if (!translate_atom_escape(atom, piece, piecesize, curpos, end, cstate))
			return false;

		// The escape handler may already have emitted states on its own.
		if (piece.size())
			return true;
		break;

	case meta_char::mc_period:	// '.':
		atom.type = st_character_class;
#if !defined(SRELL_NO_SINGLELINE)
		if (this->is_dotall())
		{
			atom.number = static_cast<uint_l32>(re_character_class::dotall);
		}
		else
#endif
		{
			// Register the complement of the newline class.
//			atom.number = static_cast<uint_l32>(re_character_class::newline);
			range_pairs not_newline = this->character_class[static_cast<uint_l32>(re_character_class::newline)];

			not_newline.negation();
			atom.number = this->character_class.register_newclass(not_newline);
		}
		break;

	case meta_char::mc_caret:	// '^':
		atom.type = st_bol;
		atom.quantifier.reset(0);
//		if (current_flags.m)
		if (is_multiline())
			atom.multiline = true;
		break;

	case meta_char::mc_dollar:	// '$':
		atom.type = st_eol;
		atom.quantifier.reset(0);
//		if (current_flags.m)
		if (is_multiline())
			atom.multiline = true;
		break;

	case meta_char::mc_astrsk:	// '*':
	case meta_char::mc_plus:	// '+':
	case meta_char::mc_query:	// '?':
	case meta_char::mc_cbraop:	// '{'
		// A quantifier with nothing in front of it. throw_error() throws.
		this->throw_error(regex_constants::error_badrepeat);

	default:;
	}

	// Under icase, a literal character is stored in case-folded form.
	if (atom.type == st_character && this->is_icase())
		atom.character = unicode_case_folding::do_casefolding(atom.character);

	piece.push_back(atom);
	piecesize = atom.quantifier;

	return true;
}
|
|
|
|
// '('.
|
|
|
|
// Parses what follows a '(' up to and including the matching ')'.
// Handles capturing groups, (?:...) groups and lookaround assertions.
//   piece:     receives the states of the whole bracketed expression.
//   piecesize: receives its width (reset to 0 for lookarounds, except in
//              the SRELL_ENABLE_GT case noted below).
//   curpos:    on entry points just behind '('; on return just behind ')'.
// Returns false when a sub-parser fails. A missing ')' is reported through
// throw_error(error_paren).
bool get_piece_in_roundbrackets(state_array &piece, re_quantifier &piecesize, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state<charT> &cstate)
{
	// Snapshot of the flags in cstate, restored after the closing ')'.
	const re_flags originalflags(cstate);
	state_type atom;

	if (curpos == end)
		this->throw_error(regex_constants::error_paren);

	atom.reset();
	atom.type = st_roundbracket_open;

	// "(?" introduces one of the extended forms.
	if (*curpos == meta_char::mc_query && !extended_roundbrackets(piece, atom, ++curpos, end, cstate))
		return false;

	// Still a capturing group (plain or named).
	if (atom.type == st_roundbracket_open)
		push_bracket_open(piece, atom);

//	if (curpos == end)
//		this->throw_error(regex_constants::error_paren);

	if (!make_nfa_states(piece, piecesize, curpos, end, cstate))
		return false;

	// end or ')'?
	if (curpos == end)
		this->throw_error(regex_constants::error_paren);

	++curpos;

	cstate.restore_from(originalflags);

	if (atom.type == st_epsilon)
	{
		// Non-capturing group.
//		if (piece.size() <= 2)	// ':' or ':' + one.
		if (piece.size() == 2)	// ':' + something.
		{
			// Only one state follows the group marker: drop the marker.
			piece.erase(0);
			return true;
		}

		piece[0].quantifier.atmost = this->number_of_brackets - 1;
	}
	else if (atom.type == st_lookaround_open)	// or st_lookaround_pop
	{
		state_type &firstatom = piece[0];

#if defined(SRELL_FIXEDWIDTHLOOKBEHIND)
//		if (firstatom.reverse)
		if (firstatom.quantifier.atleast)	// > 0 means lookbehind.
		{
			// A fixed-width lookbehind needs a finite width with
			// atleast equal to atmost.
			if (!piecesize.is_same() || piecesize.is_infinity())
				this->throw_error(regex_constants::error_lookbehind);

			firstatom.quantifier = piecesize;
		}
#endif

#if defined(SRELL_ENABLE_GT)
		if (firstatom.character != meta_char::mc_gt)
#endif
			piecesize.reset(0);

		firstatom.next1 = static_cast<std::ptrdiff_t>(piece.size()) + 1;

		atom.type = st_lookaround_close;
		atom.next1 = 0;
		atom.next2 = 0;
	}
	else
	{
		set_bracket_close(piece, atom, piecesize, cstate);
	}

	// Append the closing state prepared above.
	piece.push_back(atom);
	return true;
}
|
|
|
|
// Parses the part of a group that follows "(?": ':' (non-capturing group),
// '=' / '!' (lookahead), "<=" / "<!" (lookbehind), "<name>" (named capture;
// unless built with SRELL_NO_NAMEDCAPTURE), and '>' when SRELL_ENABLE_GT is
// defined.
//   piece:  receives the opening state (not for named captures).
//   atom:   in/out; its type tells the caller which form was found.
//   curpos: on entry points just behind '?'; advanced past the introducer.
// Any other character, or a premature end of input, is reported through
// throw_error(error_paren).
bool extended_roundbrackets(state_array &piece, state_type &atom, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state<charT> &cstate)
{
#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND)
	bool lookbehind = false;
#endif

	if (curpos == end)
		this->throw_error(regex_constants::error_paren);

	atom.character = *curpos;

	if (atom.character != meta_char::mc_lt)
	{
		// Sets atleast to 0 for other assertions than lookbehinds. The automaton
		// checks atleast to know whether lookbehinds or other assertions.
		atom.quantifier.atleast = 0;
	}
	else	// '<'
	{
#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND)
		lookbehind = true;
#endif
		if (++curpos == end)
			this->throw_error(regex_constants::error_paren);

		atom.character = *curpos;

		// "(?<" not followed by '=' or '!' starts a group name.
		if (atom.character != meta_char::mc_eq && atom.character != meta_char::mc_exclam)
		{
#if !defined(SRELL_NO_NAMEDCAPTURE)
			return parse_groupname(curpos, end, cstate);
#else
			this->throw_error(regex_constants::error_paren);
#endif	// !defined(SRELL_NO_NAMEDCAPTURE)
		}
	}

	switch (atom.character)
	{
	case meta_char::mc_colon:
		atom.type = st_epsilon;
		atom.quantifier.atleast = this->number_of_brackets;
		break;

	case meta_char::mc_exclam:	// '!':
		atom.is_not = true;
		//@fallthrough@

	case meta_char::mc_eq:	// '=':
#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND)
		cstate.back = lookbehind;
#else
//		atom.reverse = lookbehind;
#endif

#if defined(SRELL_ENABLE_GT)
	case meta_char::mc_gt:
#endif
		// Common to all lookaround forms.
		atom.type = st_lookaround_open;
		atom.next2 = 1;
		break;

	default:
		this->throw_error(regex_constants::error_paren);
	}

	++curpos;
	piece.push_back(atom);
	return true;
}
|
|
|
|
// Appends the two states that open a capturing group to "piece": first
// "atom" itself (the caller has set its type), then a copy whose type is
// st_roundbracket_pop. Both carry the current value of number_of_brackets
// as their number; the counter is incremented afterwards.
// On return "atom" holds the contents of the second (pop) state.
void push_bracket_open(state_array &piece, state_type &atom)
{
	const uint_l32 bracketno = this->number_of_brackets;

	++this->number_of_brackets;

	atom.number = bracketno;
	atom.next1 = 2;
	atom.next2 = 1;
	piece.push_back(atom);

	atom.type = st_roundbracket_pop;
	atom.next1 = 0;
	atom.next2 = 0;
	piece.push_back(atom);
}
|
|
|
|
// Finishes a capturing group.
//   piece:     states of the group; piece[0] and piece[1] are the open and
//              pop states. Their quantifier members are overwritten with
//              the range [atom.number + 1, number_of_brackets - 1].
//   atom:      turned into the st_roundbracket_close state (the caller
//              appends it to "piece").
//   piecesize: width of the group body; its "atleast" is recorded in
//              cstate.atleast_widths_of_brackets at index atom.number - 1.
void set_bracket_close(state_array &piece, state_type &atom, const re_quantifier &piecesize, re_compiler_state<charT> &cstate)
{
	atom.type = st_roundbracket_close;
	atom.next1 = 1;
	atom.next2 = 1;

	re_quantifier &rb_open = piece[0].quantifier;
	re_quantifier &rb_pop = piece[1].quantifier;

	rb_pop.atleast = atom.number + 1;
	rb_open.atleast = rb_pop.atleast;

	rb_pop.atmost = this->number_of_brackets - 1;
	rb_open.atmost = rb_pop.atmost;

	// Grow the width table (filling with 0) so that it has a slot for
	// this bracket.
	if (cstate.atleast_widths_of_brackets.size() < atom.number)
		cstate.atleast_widths_of_brackets.resize(atom.number, 0);

	cstate.atleast_widths_of_brackets[atom.number - 1] = piecesize.atleast;
}
|
|
|
|
// Builds the states for "piece" repeated according to "quantifier" and
// appends them to piece_with_quantifier.
//   piece_with_quantifier: output; appended to.
//   piece:      states of the atom being repeated (non-empty). It is
//               modified in the process (extra states are inserted).
//   quantifier: repetition bounds and greediness. atmost == 0 produces
//               no states at all.
//   piecesize:  width of one repetition of the atom; its "atleast" decides
//               whether a zero-width-repeat check has to be emitted.
// The numbered diagrams in the comments show the emitted layout; numbers
// in parentheses are the targets of (next1|next2).
void combine_piece_with_quantifier(state_array &piece_with_quantifier, state_array &piece, const re_quantifier &quantifier, const re_quantifier &piecesize)
{
	// NOTE: "head" refers to piece[0] and must not be used after anything
	// has been inserted into "piece".
	state_type &head = piece[0];
//	const bool firstpiece_is_roundbracket_open = (head.type == st_roundbracket_open);
	const bool has_0width_checker = head.has_0widthchecker();
	const bool is_ncgroup_containing_cgroup = head.is_noncapturinggroup() && head.quantifier.is_valid();
	state_type atom;

	if (quantifier.atmost == 0)
		return;

	atom.reset();
	atom.quantifier = quantifier;
	if (head.is_character_or_class())
		atom.character = meta_char::mc_astrsk;	// For nextpos_optimisation1_3().

	if (quantifier.atmost == 1)
	{
		// {1,1} or {0,1}.
		if (quantifier.atleast == 0)
		{
			// Optional piece: an epsilon that may skip it.
			atom.type = st_epsilon;
			atom.next2 = static_cast<std::ptrdiff_t>(piece.size()) + 1;

			if (!quantifier.is_greedy)
			{
				atom.next1 = atom.next2;
				atom.next2 = 1;
			}

			if (atom.character == meta_char::mc_astrsk)
				head.quantifier = quantifier;

			piece_with_quantifier.push_back(atom);
			// (push)
		}

		// The group is not repeated, so the bracket range stored in its
		// open/pop states is cleared.
		if (piece.size() >= 2 && head.type == st_roundbracket_open && piece[1].type == st_roundbracket_pop)
		{
			head.quantifier.atmost = 0u;
			piece[1].quantifier.atmost = 0u;
		}

		piece_with_quantifier += piece;
		return;
	}

	// atmost >= 2

#if !defined(SRELLDBG_NO_SIMPLEEQUIV)
	// The counter requires at least 6 states: save, restore, check, inc, dec, atom(s).
	// A character or charclass quantified by one of these has a simple equivalent representation:
	// a{0,2} 1.epsilon(2|5), 2.CHorCL(3), 3.epsilon(4|5), 4.CHorCL(5), [5].
	// a{0,3} 1.epsilon(2|7), 2.CHorCL(3), 3.epsilon(4|7), 4.CHorCL(5), 5.epsilon(6|7), 6.CHorCL(7), [7].
	// a{1,2} 1.CHorCL(2), 2.epsilon(3|4), 3.CHorCL(4), [4].
	// a{1,3} 1.CHorCL(2), 2.epsilon(3|6), 3.CHorCL(4), 4.epsilon(5|6), 5.CHorCL(6), [6].
	// a{2,3} 1.CHorCL(2), 2.CHorCL(3), 3.epsilon(4|5), 4.CHorCL(5), [5].
	// a{2,4} 1.CHorCL(2), 2.CHorCL(3), 3.epsilon(4|7), 4.CHorCL(5), 5.epsilon(6|7), 6.CHorCL(7), [7].
	if (piece.size() == 1 && head.is_character_or_class() && quantifier.has_simple_equivalence())
	{
		// Number of states per optional repetition (epsilon + atom).
		const typename state_array::size_type unitsize = piece.size() + 1;

		// Mandatory repetitions first.
		for (uint_l32 done = 0; done < quantifier.atleast; ++done)
			piece_with_quantifier += piece;

		// The copies appended from here on are each optional.
		if (atom.character == meta_char::mc_astrsk)
			head.quantifier.set(0, 1, quantifier.is_greedy);

		atom.type = st_epsilon;
		atom.next2 = (quantifier.atmost - quantifier.atleast) * unitsize;
		if (!quantifier.is_greedy)
		{
			atom.next1 = atom.next2;
			atom.next2 = 1;
		}

		for (uint_l32 done = quantifier.atleast; done < quantifier.atmost; ++done)
		{
			piece_with_quantifier.push_back(atom);
			piece_with_quantifier += piece;

			// Every later epsilon is one unit closer to the exit.
			if (quantifier.is_greedy)
				atom.next2 -= unitsize;
			else
				atom.next1 -= unitsize;
		}
		return;
	}
#endif	// !defined(SRELLDBG_NO_SIMPLEEQUIV)

	atom.type = st_epsilon;

	if (quantifier.is_asterisk())	// {0,}
	{
		// greedy:  1.epsilon(2|4), 2.piece, 3.LAorC0WR(1|0), 4.OutOfLoop.
		// !greedy: 1.epsilon(4|2), 2.piece, 3.LAorC0WR(1|0), 4.OutOfLoop.
		// LAorC0WR: LastAtomOfPiece or Check0WidthRepeat.
		// atom.type points to 1.
	}
	else if (quantifier.is_plus())	// {1,}
	{
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
		if (piece.size() == 1 && head.is_character_or_class())
		{
			piece_with_quantifier += piece;
			--atom.quantifier.atleast;	// /.+/ -> /..*/.
		}
		else
#endif
		{
			atom.next1 = 2;
			atom.next2 = 0;
			piece_with_quantifier.push_back(atom);
			// greedy:  1.epsilon(3), 2.epsilon(3|5), 3.piece, 4.LAorC0WR(2|0), 5.OutOfLoop.
			// !greedy: 1.epsilon(3), 2.epsilon(5|3), 3.piece, 4.LAorC0WR(2|0), 5.OutOfLoop.
			// atom.type points to 2.
		}
	}
	else
	{
		// Any other {n,m}: a counter-driven loop.
		atom.number = this->number_of_counters;
		++this->number_of_counters;

		atom.type = st_save_and_reset_counter;
		atom.next1 = 2;
		atom.next2 = 1;
		piece_with_quantifier.push_back(atom);

		atom.type = st_restore_counter;
		atom.next1 = 0;
		atom.next2 = 0;
		piece_with_quantifier.push_back(atom);
		// 1.save_and_reset_counter(3|2), 2.restore_counter(0|0),

		atom.next1 = 0;
		atom.next2 = 0;
		atom.type = st_decrement_counter;
		piece.insert(0, atom);

		atom.next1 = 2;
//		atom.next2 = piece[1].is_character_or_class() ? 0 : 1;
//		atom.next2 = 0;
		// next2 stays 0 only when every original state of the piece is a
		// character/class or an epsilon whose next2 is 0.
		for (state_size_type pos = 1; pos < piece.size(); ++pos)
		{
			const state_type &cur = piece[pos];

			if (!cur.is_character_or_class() && !(cur.type == st_epsilon && cur.next2 == 0))
			{
				atom.next2 = 1;
				break;
			}
		}
		atom.type = st_epsilon;	// st_increment_counter;
		piece.insert(0, atom);
		piece[0].number = 0;

		atom.type = st_check_counter;
		// greedy:  3.check_counter(4|6), 4.piece, 5.LAorC0WR(3|0), 6.OutOfLoop.
		// !greedy: 3.check_counter(6|4), 4.piece, 5.LAorC0WR(3|0), 6.OutOfLoop.
		// 4.piece = { 4a.increment_counter(4c|4b), 4b.decrement_counter(0|0), 4c.OriginalPiece }.
	}

	// atom.type is epsilon or check_counter.
	// Its "next"s point to piece and OutOfLoop.

	if (!is_ncgroup_containing_cgroup && (piecesize.atleast || has_0width_checker))
	{
		// No separate zero-width-repeat check is emitted: the last state
		// of the piece jumps back to the loop head directly.
		const typename state_array::size_type piece_size = piece.size();
		state_type &lastatom = piece[piece_size - 1];

		lastatom.next1 = 0 - static_cast<std::ptrdiff_t>(piece_size);
			// Points to the one immediately before piece, which will be pushed last in this block.

		// atom.type has already been set. epsilon or check_counter.
		atom.next1 = 1;
		atom.next2 = static_cast<std::ptrdiff_t>(piece_size) + 1;
		if (!quantifier.is_greedy)
		{
			atom.next1 = atom.next2;
			atom.next2 = 1;
		}
		piece_with_quantifier.push_back(atom);
	}
	else
	{
		// atom.type has already been set. epsilon or check_counter.
		atom.next1 = 1;
		atom.next2 = static_cast<std::ptrdiff_t>(piece.size()) + 4;	// To OutOfLoop.
			// Three more than in the other branch, because repeat_in_push,
			// repeat_in_pop and check_0_width_repeat are added below.
		if (!quantifier.is_greedy)
		{
			atom.next1 = atom.next2;
			atom.next2 = 1;
		}
		piece_with_quantifier.push_back(atom);	// *1

		atom.number = this->number_of_repeats;
		++this->number_of_repeats;

		// Position of the original first state inside "piece": behind the
		// two counter states when a counter loop is being built.
		const state_size_type org1stpos = (atom.type == st_check_counter) ? 2 : 0;

		if (is_ncgroup_containing_cgroup)
			atom.quantifier = piece[org1stpos].quantifier;
		else
			atom.quantifier.set(1, 0);

		atom.type = st_repeat_in_pop;
		atom.next1 = 0;
		atom.next2 = 0;
		piece.insert(org1stpos, atom);

		atom.type = st_repeat_in_push;
		atom.next1 = 2;
		atom.next2 = 1;
		piece.insert(org1stpos, atom);

		atom.type = st_check_0_width_repeat;
		atom.next1 = 0 - static_cast<std::ptrdiff_t>(piece.size()) - 1;	// Points to *1.
		atom.next2 = 1;
		piece.push_back(atom);
			// greedy:  1.epsilon(2|6),
			// !greedy: 1.epsilon(6|2),
			//   2.repeat_in_push(4|3), 3.repeat_in_pop(0|0), 4.piece,
			//   5.check_0_width_repeat(1|6), 6.OutOfLoop.
			// or
			// greedy:  1.check_counter(2|8),
			// !greedy: 1.check_counter(8|2),
			//   2.increment_counter(4|3), 3.decrement_counter(0|0)
			//   4.repeat_in_push(6|5), 5.repeat_in_pop(0|0), 6.piece,
			//   7.check_0_width_repeat(1|8), 8.OutOfLoop.
	}
	piece_with_quantifier += piece;
}
|
|
|
|
#if !defined(SRELL_NO_NAMEDCAPTURE)
|
|
bool parse_groupname(const uchar32 *&curpos, const uchar32 *const end, re_compiler_state<charT> &cstate)
|
|
{
|
|
const gname_string groupname = get_groupname(curpos, end, cstate);
|
|
|
|
if (!this->namedcaptures.push_back(groupname, this->number_of_brackets))
|
|
this->throw_error(regex_constants::error_backref);
|
|
|
|
return true;
|
|
}
|
|
#endif // !defined(SRELL_NO_NAMEDCAPTURE)
|
|
|
|
// '['.
|
|
|
|
// Parses a bracket expression; curpos points just behind '[' on entry and
// just behind the closing ']' on return.
//   atom: receives the result. When the final set is reported by
//         consists_of_one_character() as a single character, atom becomes
//         an st_character state for it; otherwise the set is registered
//         and atom becomes an st_character_class state with its number.
// Returns false when get_character_in_class() fails. A missing ']' is
// reported through throw_error(error_brack), an invalid range through
// throw_error(error_range).
bool register_character_class(state_type &atom, const uchar32 *&curpos, const uchar32 *const end, const re_compiler_state<charT> & /* cstate */)
{
	range_pairs ranges;
	range_pair code_range;
	state_type classatom;
	range_pairs curranges;

	if (curpos == end)
		this->throw_error(regex_constants::error_brack);

	atom.type = st_character_class;

	if (*curpos == meta_char::mc_caret)	// '^'
	{
		atom.is_not = true;
		++curpos;
	}

	for (;;)
	{
		if (curpos == end)
			this->throw_error(regex_constants::error_brack);

		if (*curpos == meta_char::mc_sbracl)	// ']'
			break;

		classatom.reset();

		if (!get_character_in_class(curranges, classatom, curpos, end))
			return false;

		// An escape that denotes a whole class is merged as it is.
		if (classatom.type == st_character_class)
		{
			ranges.merge(curranges);
			continue;
		}

		code_range.first = code_range.second = classatom.character;

		if (curpos == end)
			this->throw_error(regex_constants::error_brack);

		if (*curpos == meta_char::mc_minus)	// '-'
		{
			++curpos;

			if (curpos == end)
				this->throw_error(regex_constants::error_brack);

			// The '-' is a literal when it is directly followed by ']'
			// or by an escape that denotes a class.
			bool minus_is_literal = (*curpos == meta_char::mc_sbracl);

			if (!minus_is_literal)
			{
				if (!get_character_in_class(curranges, classatom, curpos, end))
					return false;

				if (classatom.type == st_character_class)
				{
					ranges.merge(curranges);
					minus_is_literal = true;
				}
				else
				{
					code_range.second = classatom.character;

					if (!code_range.is_range_valid())
						this->throw_error(regex_constants::error_range);
				}
			}

			if (minus_is_literal)
			{
				// Add the first character on its own; the '-' itself is
				// added by the join() below.
				ranges.join(code_range);
				code_range.first = code_range.second = meta_char::mc_minus;
			}
		}
		ranges.join(code_range);
	}

	// *curpos == ']'
	++curpos;
	if (this->is_icase())
		ranges.make_caseunfoldedcharset();

	// A negated class is stored as its complement.
	if (atom.is_not)
	{
		ranges.negation();
		atom.is_not = false;
	}

//	atom.character = this->is_icase() ? ranges.template consists_of_one_character<unicode_case_folding>() : ranges.template consists_of_one_character<nocase_faketraits>();
	atom.character = ranges.consists_of_one_character(this->is_icase());

	if (atom.character != constants::invalid_u32value)
		atom.type = st_character;
	else
		atom.number = this->character_class.register_newclass(ranges);

	return true;
}
|
|
|
|
// Reads one element of a bracket expression at *curpos (the caller
// guarantees that curpos != end) and advances curpos.
//   atom: atom.character receives the character read. For a backslash the
//         result is produced by translate_escseq().
//   rp:   cleared and handed to translate_escseq() in the backslash case;
//         left untouched otherwise.
// Returns true for a plain character, else the result of translate_escseq().
bool get_character_in_class(range_pairs &rp, state_type &atom, const uchar32 *&curpos, const uchar32 *const end /* , const re_compiler_state &cstate */)
{
	atom.character = *curpos++;

	if (atom.character == meta_char::mc_escape)	// '\\'
	{
		rp.clear();
		return translate_escseq(&rp, atom, curpos, end, true);
	}

	return true;
}
|
|
|
|
// Merges the registered class numbered classatom.number into "cls".
// When classatom.is_not is set, the complement of that class is merged.
// The registered class itself is not modified (a copy is used).
void add_predefclass_to_charclass(range_pairs &cls, const state_type &classatom)
{
	range_pairs selected = this->character_class[classatom.number];

	if (classatom.is_not)
		selected.negation();

	cls.merge(selected);
}
|
|
|
|
#if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
// Parses a bracket expression with parse_unicharset() and stores the
// resulting state(s) in "piece".
//   piecesize: receives the width of the result.
//   curpos:    on entry points just behind '['; advanced by the parser.
// Always returns true when it returns.
bool parse_charclass_v(state_array &piece, re_quantifier &piecesize, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state<charT> &cstate)
{
	posdata_holder pos;

	parse_unicharset(pos, curpos, end, cstate);

	if (pos.may_contain_strings())
	{
		// The set may contain strings: let transform_seqdata() build the
		// states and take the width bounds from pos.length.
		transform_seqdata(piece, pos);

		piecesize.atleast = static_cast<uint_l32>(pos.length.first);
		piecesize.atmost = static_cast<uint_l32>(pos.length.second);
		return true;
	}

	// Code points only: a single state is sufficient.
	state_type atom;

	atom.reset();
	atom.character = pos.ranges.consists_of_one_character(this->is_icase());

	// Not reducible to a single character: register it as a class.
	if (atom.character == constants::invalid_u32value)
	{
		atom.type = st_character_class;
		atom.number = this->character_class.register_newclass(pos.ranges);
	}

	piece.push_back(atom);
	piecesize = atom.quantifier;
	return true;
}
|
|
|
|
//  Parses the body of a v-mode character class and accumulates the result in
//  basepos. The recursive call below is made with curpos just past a '[';
//  on normal return curpos has been advanced past the matching ']'.
//
//  The class body is a sequence of operands (nested classes, single
//  characters, ranges "a-z", escapes) optionally combined with the set
//  operators "&&" and "--". otype tracks which combination is in effect:
//    op_init          nothing parsed yet,
//    op_firstcc       exactly one operand parsed, operator still undecided,
//    op_union         implicit union of operands,
//    op_intersection  operands must be separated by "&&",
//    op_subtraction   operands must be separated by "--".
//  The numeric order of the enumerators matters: "otype <= op_union" and
//  "otype < op_union" are used below.
//
//  Errors are reported through throw_error():
//    error_brack       the pattern ends before ']',
//    error_range       malformed or reversed range,
//    error_complement  '^' applied to a class for which may_contain_strings(),
//    error_operator    unexpected or mixed double punctuators.
void parse_unicharset(posdata_holder &basepos, const uchar32 *&curpos, const uchar32 *const end, const re_compiler_state<charT> &cstate)
{
	enum operation_type
	{
		op_init, op_firstcc, op_union, op_intersection, op_subtraction
	};
	operation_type otype = op_init;
	posdata_holder newpos;
	range_pair code_range;
	state_type ccatom;
	bool invert = false;

	if (curpos == end)
		goto ERROR_NOT_CLOSED;

	if (*curpos == meta_char::mc_caret)	//  '^'
	{
		invert = true;
		++curpos;
	}

	//  ClassSetCharacter ::
	//    \ CharacterEscape[+UnicodeMode]
	//    \ ClassSetReservedPunctuator
	//    \ b

	for (;;)
	{
		if (curpos == end)
			goto ERROR_NOT_CLOSED;

		if (*curpos == meta_char::mc_sbracl)	//  ']'
			break;

		//  Either a doubled punctuator at curpos, or invalid_u32value.
		const uchar32 next2chars = check_doublepunctuators(curpos, end);

		switch (otype)
		{
		case op_firstcc:
			//  After the first operand the operator (if any) fixes the mode.
			if (next2chars == constants::invalid_u32value)
				break;

			if (next2chars == char_other::co_amp)
				otype = op_intersection;
			else if (next2chars == meta_char::mc_minus)
				otype = op_subtraction;
			else
				goto ERROR_DOUBLE_PUNCT;

			curpos += 2;
			break;

		case op_intersection:
			if (next2chars != char_other::co_amp)
				goto ERROR_DOUBLE_PUNCT;
			curpos += 2;
			break;

		case op_subtraction:
			if (next2chars != meta_char::mc_minus)
				goto ERROR_DOUBLE_PUNCT;
			curpos += 2;
			break;

		default:
			//  op_union, op_init: no double punctuator is allowed here.
			if (next2chars != constants::invalid_u32value)
				goto ERROR_DOUBLE_PUNCT;
		}

		AFTER_OPERATOR:

		if (curpos == end)
			goto ERROR_NOT_CLOSED;

		ccatom.reset();

		if (*curpos == meta_char::mc_sbraop)	//  '[': nested class.
		{
			++curpos;
			parse_unicharset(newpos, curpos, end, cstate);
		}
		else
		{
			get_character_in_class_vmode(newpos, ccatom, curpos, end, cstate, false);
		}

		if (otype == op_init)
			otype = op_firstcc;
		else if (otype == op_firstcc)
			otype = op_union;

		if (curpos == end)
			goto ERROR_NOT_CLOSED;

		//  A plain character that produced no set data of its own: turn it
		//  into a one-element range, or into "first-second" if a '-' follows.
		if (ccatom.type == st_character && !newpos.has_data())
		{
			code_range.set(ccatom.character);

			if (otype <= op_union && *curpos == meta_char::mc_minus)	//  '-'
			{
				++curpos;

				if (curpos == end)
					goto ERROR_BROKEN_RANGE;

				if (otype < op_union && *curpos == meta_char::mc_minus)	//  "--" right after the first operand.
				{
					otype = op_subtraction;
					++curpos;
					basepos.ranges.join(code_range);
					goto AFTER_OPERATOR;
				}

				//  Range end; class escapes are not accepted here (no_ccesc == true).
				get_character_in_class_vmode(newpos, ccatom, curpos, end, cstate, true);

				otype = op_union;
				code_range.second = ccatom.character;
				if (!code_range.is_range_valid())
					goto ERROR_BROKEN_RANGE;
			}

			newpos.ranges.join(code_range);
			if (this->is_icase())
				newpos.ranges.make_caseunfoldedcharset();
		}

		//  Fold the operand just parsed into the accumulated result.
		switch (otype)
		{
		case op_union:
			basepos.do_union(newpos);
			break;

		case op_intersection:
			basepos.do_and(newpos);
			break;

		case op_subtraction:
			basepos.do_subtract(newpos);
			break;

		default:
			//  op_firstcc: the first operand simply becomes the result.
			basepos.swap(newpos);
		}
	}

	//  *curpos == ']'
	++curpos;

	if (this->is_icase())
		basepos.ranges.make_caseunfoldedcharset();

	if (invert)
	{
		if (basepos.may_contain_strings())
			goto ERROR_NOT_INVERTIBLE;

		basepos.ranges.negation();
	}

	return;

	ERROR_NOT_CLOSED:
	this->throw_error(regex_constants::error_brack);

	ERROR_BROKEN_RANGE:
	this->throw_error(regex_constants::error_range);

	ERROR_NOT_INVERTIBLE:
	this->throw_error(regex_constants::error_complement);

	ERROR_DOUBLE_PUNCT:
	this->throw_error(regex_constants::error_operator);
}
|
|
|
|
//  Looks at the two code points starting at curpos (which is taken by value,
//  so the caller's position is not moved). If they are the same character
//  and that character is one of the doubled punctuators listed below,
//  returns it; otherwise returns constants::invalid_u32value.
//  The first code point is read unconditionally, so the caller must ensure
//  curpos != end.
uchar32 check_doublepunctuators(const uchar32 *curpos, const uchar32 *const end) const
{
	const uchar32 lead = *curpos;
	const uchar32 *const follower = curpos + 1;

	if (follower != end && *follower == lead)
	{
		switch (lead)
		{
		//  ClassSetReservedDoublePunctuator :: one of
		//    && !! ## $$ %% ** ++ ,, .. :: ;; << == >> ?? @@ ^^ `` ~~
		//  plus "--", which is also recognised here.
		case char_other::co_amp:	//  '&'
		case meta_char::mc_exclam:	//  '!'
		case meta_char::mc_sharp:	//  '#'
		case meta_char::mc_dollar:	//  '$'
		case char_other::co_perc:	//  '%'
		case meta_char::mc_astrsk:	//  '*'
		case meta_char::mc_plus:	//  '+'
		case meta_char::mc_comma:	//  ','
		case meta_char::mc_period:	//  '.'
		case meta_char::mc_colon:	//  ':'
		case char_other::co_smcln:	//  ';'
		case meta_char::mc_lt:	//  '<'
		case meta_char::mc_eq:	//  '='
		case meta_char::mc_gt:	//  '>'
		case meta_char::mc_query:	//  '?'
		case char_other::co_atmrk:	//  '@'
		case meta_char::mc_caret:	//  '^'
		case char_other::co_grav:	//  '`'
		case char_other::co_tilde:	//  '~'
		case meta_char::mc_minus:	//  '-'
			return lead;

		default:
			break;
		}
	}

	return constants::invalid_u32value;
}
|
|
|
|
//  Reads one class-set character (possibly an escape sequence) at curpos.
//
//  pos       cleared first; receives set data when the escape is \p, \P or
//            \q, and its ranges are handed to translate_escseq_nocheck().
//  ccatom    receives the character read (and whatever the escape
//            translators store in it).
//  no_ccesc  when true, \p / \P / \q are not given special treatment and
//            translate_escseq_nocheck() is told not to accept class escapes.
//
//  Returns the result of parse_escape_p_vmode() / parse_escape_q_vmode()
//  for those escapes, and false in every other case.
//  The first code point is read unconditionally: the caller must ensure
//  curpos != end. An unescaped ClassSetSyntaxCharacter is reported as
//  error_noescape, a trailing backslash as error_escape.
bool get_character_in_class_vmode(
	posdata_holder &pos,
	state_type &ccatom,
	const uchar32 *&curpos,
	const uchar32 *const end,
	const re_compiler_state<charT> &cstate,
	const bool no_ccesc
)
{
	pos.clear();

	const uchar32 first = *curpos;

	++curpos;
	ccatom.character = first;

	if (first != meta_char::mc_escape)	//  not '\\'
	{
		switch (first)
		{
		//  ClassSetSyntaxCharacter :: one of
		//    ( ) [ ] { } / - \ |
		case meta_char::mc_rbraop:	//  '('
		case meta_char::mc_rbracl:	//  ')'
		case meta_char::mc_sbraop:	//  '['
		case meta_char::mc_sbracl:	//  ']'
		case meta_char::mc_cbraop:	//  '{'
		case meta_char::mc_cbracl:	//  '}'
		case char_other::co_slash:	//  '/'
		case meta_char::mc_minus:	//  '-'
		case meta_char::mc_bar:	//  '|'
			this->throw_error(regex_constants::error_noescape);
			break;

		default:
			//  Ordinary literal character.
			return false;
		}
	}

	//  From here on we are behind a backslash.
	if (curpos == end)
		this->throw_error(regex_constants::error_escape);

	ccatom.character = *curpos++;

	if (!no_ccesc)
	{
		if ((ccatom.character | constants::asc_icase) == char_alnum::ch_p)	//  \p or \P
			return parse_escape_p_vmode(pos, ccatom, curpos, end, cstate);

		if (ccatom.character == char_alnum::ch_q)	//  \q
			return parse_escape_q_vmode(pos, curpos, end, cstate);
	}

	switch (ccatom.character)
	{
	//  ClassSetReservedPunctuator :: one of
	//    & - ! # % , : ; < = > @ ` ~
	//  An escaped one of these (except '-', handled by the translator below)
	//  stands for itself.
	case char_other::co_amp:	//  '&'
	case meta_char::mc_exclam:	//  '!'
	case meta_char::mc_sharp:	//  '#'
	case char_other::co_perc:	//  '%'
	case meta_char::mc_comma:	//  ','
	case meta_char::mc_colon:	//  ':'
	case char_other::co_smcln:	//  ';'
	case meta_char::mc_lt:	//  '<'
	case meta_char::mc_eq:	//  '='
	case meta_char::mc_gt:	//  '>'
	case char_other::co_atmrk:	//  '@'
	case char_other::co_grav:	//  '`'
	case char_other::co_tilde:	//  '~'
		return false;

	default:
		break;
	}

	translate_escseq_nocheck(&pos.ranges, ccatom, curpos, end, true, no_ccesc);
	return false;
}
|
|
|
|
//  Parses the "{alt1|alt2|...}" part of a \q escape; curpos must point at
//  the '{'. Each alternative is appended to a flat array as a length prefix
//  followed by its code points:
//    - zero or one code point:  2, then the code point (or
//      constants::ccstr_empty for an empty alternative);
//    - two or more code points: (count + 1), then the code points.
//  The array is then handed to pos.split_seqs_and_ranges(). On return
//  curpos is just past the closing '}'. Always returns true; malformed
//  input is reported as error_escape.
bool parse_escape_q_vmode(posdata_holder &pos, const uchar32 *&curpos, const uchar32 *const end, const re_compiler_state<charT> &cstate)
{
	if (curpos == end || *curpos != meta_char::mc_cbraop)	//  '{'
		this->throw_error(regex_constants::error_escape);

	simple_array<uchar32> seqs;
	simple_array<uchar32> curseq;
	posdata_holder dummypos;
	state_type qatom;

	++curpos;

	for (;;)
	{
		if (curpos == end)
			this->throw_error(regex_constants::error_escape);

		const bool at_bar = (*curpos == meta_char::mc_bar);	//  '|'
		const bool at_close = (*curpos == meta_char::mc_cbracl);	//  '}'

		if (!at_bar && !at_close)
		{
			//  One more character of the current alternative.
			qatom.reset();
			get_character_in_class_vmode(dummypos, qatom, curpos, end, cstate, true);

			curseq.push_backncr(qatom.character);
			continue;
		}

		//  End of an alternative: flush it into seqs.
		const uint_l32 seqlen = static_cast<uint_l32>(curseq.size());

		if (seqlen >= 2)
		{
			seqs.push_backncr(seqlen + 1);
			seqs.append(curseq);
		}
		else	//  0 or 1.
		{
			seqs.push_backncr(2);
			seqs.push_backncr(seqlen != 0 ? curseq[0] : constants::ccstr_empty);
		}

		if (at_close)
			break;

		curseq.clear();
		++curpos;
	}

	++curpos;	//  Skip '}'.
	pos.split_seqs_and_ranges(seqs, this->is_icase(), cstate.back);

	return true;
}
|
|
|
|
#endif // !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
//  Reads the character following a backslash into atom.character, advances
//  curpos past it and forwards to translate_escseq_nocheck() with class
//  escapes enabled (no_ccesc == false). A backslash at the very end of the
//  pattern is reported as error_escape.
bool translate_escseq(range_pairs *const rp, state_type &atom, const uchar32 *&curpos, const uchar32 *const end, const bool insidecharclass)
{
	if (curpos == end)
		this->throw_error(regex_constants::error_escape);

	const uchar32 escaped = *curpos;

	++curpos;
	atom.character = escaped;

	return translate_escseq_nocheck(rp, atom, curpos, end, insidecharclass, false);
}
|
|
|
|
//  Handles the character class escapes \d \D \s \S \w \W and, unless
//  SRELL_NO_UNICODE_PROPERTY is defined, \p{...} \P{...}. The escape letter
//  is expected in cceatom.character.
//
//  rp  when non-NULL, the class's ranges are added to *rp;
//      when NULL, cceatom.number is set to a class number instead (a newly
//      registered class for negated escapes and for properties).
//
//  Returns true and sets cceatom.type = st_character_class (with is_not
//  cleared) if the letter was one of the above, false otherwise. In the
//  false case cceatom is left untouched.
bool translate_character_class_escape(range_pairs *const rp, state_type &cceatom
#if !defined(SRELL_NO_UNICODE_PROPERTY)
	, const uchar32 *&curpos
	, const uchar32 *const end
	, const bool insidecharclass
#else
	, const uchar32 *&
	, const uchar32 *const
	, const bool
#endif
	)
{
	//  Predefined classes. An upper-case letter selects the complement.
	switch (cceatom.character)
	{
	case char_alnum::ch_D:	//  'D'
	case char_alnum::ch_d:	//  'd'
		if (cceatom.character == char_alnum::ch_D)
			cceatom.is_not = true;

		cceatom.number = static_cast<uint_l32>(re_character_class::digit);	//  \d, \D.
		break;

	case char_alnum::ch_S:	//  'S'
	case char_alnum::ch_s:	//  's'
		if (cceatom.character == char_alnum::ch_S)
			cceatom.is_not = true;

		cceatom.number = static_cast<uint_l32>(re_character_class::space);	//  \s, \S.
		break;

	case char_alnum::ch_W:	//  'W'
	case char_alnum::ch_w:	//  'w'
		if (cceatom.character == char_alnum::ch_W)
			cceatom.is_not = true;

		if (this->is_icase())
		{
			this->character_class.setup_icase_word();
			cceatom.number = static_cast<uint_l32>(re_character_class::icase_word);
		}
		else
		{
			cceatom.number = static_cast<uint_l32>(re_character_class::word);	//  \w, \W.
		}
		break;

#if !defined(SRELL_NO_UNICODE_PROPERTY)
	//  Unicode properties and script names.
	case char_alnum::ch_P:	//  \P{...}
	case char_alnum::ch_p:	//  \p{...}
		{
			if (cceatom.character == char_alnum::ch_P)
				cceatom.is_not = true;

			range_pairs lranges;
			//  Work directly in the caller's set if there is one.
			range_pairs &target = (rp != NULL) ? *rp : lranges;

			get_property_ranges(target, curpos, end);

			if (cceatom.is_not)
			{
				target.negation();
				cceatom.is_not = false;
			}

			if (!insidecharclass && this->is_icase())
				target.make_caseunfoldedcharset();

			if (rp == NULL)
				cceatom.number = this->character_class.register_newclass(target);
		}
		cceatom.type = st_character_class;
		return true;
#endif	//  !defined(SRELL_NO_UNICODE_PROPERTY)

	default:
		return false;
	}

	if (rp != NULL)
	{
		add_predefclass_to_charclass(*rp, cceatom);
	}
	else if (cceatom.is_not)
	{
		//  Standalone negated class: materialise it as a new class.
		range_pairs lranges;

		add_predefclass_to_charclass(lranges, cceatom);
		cceatom.number = this->character_class.register_newclass(lranges);
	}

	cceatom.is_not = false;
	cceatom.type = st_character_class;
	return true;
}
|
|
|
|
//  Translates the escape whose letter is already stored in atom.character
//  (curpos points just past that letter).
//
//  Unless no_ccesc is set, character class escapes are tried first via
//  translate_character_class_escape(). Otherwise the escape must denote a
//  single code point, which replaces atom.character:
//    \b \t \n \v \f \r   control characters,
//    \cX                 X & 0x1f, X being an ASCII letter,
//    \0                  NUL,
//    \xhh, \uhhhh, \u{h...}
//    \^ \$ \\ \. \* \+ \? \( \) \[ \] \{ \} \| \/   the character itself,
//    \-                  only when insidecharclass is true.
//  Anything else is reported as error_escape. Returns true.
//
//  Fix: "\c" at the very end of the pattern used to be accepted silently as
//  the literal 'c', although "\c" followed by a non-letter is rejected
//  ("Strict"). Both cases now report error_escape.
bool translate_escseq_nocheck(range_pairs *const rp, state_type &atom, const uchar32 *&curpos, const uchar32 *const end, const bool insidecharclass, const bool no_ccesc)
{
	if (!no_ccesc && translate_character_class_escape(rp, atom, curpos, end, insidecharclass))
		return true;

	switch (atom.character)
	{
	case char_alnum::ch_b:
		atom.character = char_ctrl::cc_bs;	//  '\b' 0x08:BS
		break;

	case char_alnum::ch_t:
		atom.character = char_ctrl::cc_htab;	//  '\t' 0x09:HT
		break;

	case char_alnum::ch_n:
		atom.character = char_ctrl::cc_nl;	//  '\n' 0x0a:LF
		break;

	case char_alnum::ch_v:
		atom.character = char_ctrl::cc_vtab;	//  '\v' 0x0b:VT
		break;

	case char_alnum::ch_f:
		atom.character = char_ctrl::cc_ff;	//  '\f' 0x0c:FF
		break;

	case char_alnum::ch_r:
		atom.character = char_ctrl::cc_cr;	//  '\r' 0x0d:CR
		break;

	case char_alnum::ch_c:	//  \cX
		//  Strict: X must be present and must be an ASCII letter. Folding
		//  with asc_icase maps 'A'-'Z' onto 'a'-'z' for the range test.
		//  Any failure is turned into invalid_u32value so that the common
		//  check at the bottom reports error_escape.
		if (curpos != end
			&& static_cast<uchar32>(*curpos | constants::asc_icase) >= char_alnum::ch_a
			&& static_cast<uchar32>(*curpos | constants::asc_icase) <= char_alnum::ch_z)
		{
			atom.character = static_cast<uchar32>(*curpos++ & 0x1f);
		}
		else
		{
			atom.character = constants::invalid_u32value;
		}
		break;

	case char_alnum::ch_0:
		atom.character = char_ctrl::cc_nul;	//  '\0' 0x00:NUL
		break;

	case char_alnum::ch_x:	//  \xhh: exactly two hex digits.
		atom.character = translate_numbers(curpos, end, 16, 2, 2, 0xff);
		break;

	case char_alnum::ch_u:	//  \uhhhh, \u{h~hhhhhh}
		atom.character = parse_escape_u(curpos, end);
		break;

	//  SyntaxCharacter, '/', and '-'.
	case meta_char::mc_caret:	//  '^'
	case meta_char::mc_dollar:	//  '$'
	case meta_char::mc_escape:	//  '\\'
	case meta_char::mc_period:	//  '.'
	case meta_char::mc_astrsk:	//  '*'
	case meta_char::mc_plus:	//  '+'
	case meta_char::mc_query:	//  '?'
	case meta_char::mc_rbraop:	//  '('
	case meta_char::mc_rbracl:	//  ')'
	case meta_char::mc_sbraop:	//  '['
	case meta_char::mc_sbracl:	//  ']'
	case meta_char::mc_cbraop:	//  '{'
	case meta_char::mc_cbracl:	//  '}'
	case meta_char::mc_bar:	//  '|'
	case char_other::co_slash:	//  '/'
		break;

	case meta_char::mc_minus:	//  '-' allowed only in charclass.
		if (insidecharclass)
			break;
		//@fallthrough@

	default:
		atom.character = constants::invalid_u32value;
	}

	//  Covers unknown escapes as well as failed \c, \x and \u translations.
	if (atom.character == constants::invalid_u32value)
		this->throw_error(regex_constants::error_escape);

	return true;
}
|
|
|
|
//  Parses what follows "\u" (curpos points just past the 'u') and returns
//  the code point, or constants::invalid_u32value on malformed input.
//    \u{h...}      one or more hex digits, value <= unicode_max_codepoint;
//    \uhhhh        exactly four hex digits;
//    \uD8xx\uDCxx  a leading surrogate immediately followed by an escaped
//                  trailing surrogate is combined into one code point.
//  curpos is advanced past everything that was consumed.
uchar32 parse_escape_u(const uchar32 *&curpos, const uchar32 *const end) const
{
	if (curpos == end)
		return constants::invalid_u32value;

	if (*curpos == meta_char::mc_cbraop)	//  '{'
	{
		const uchar32 braced = translate_numbers(++curpos, end, 16, 1, 0, constants::unicode_max_codepoint);

		if (curpos == end || *curpos != meta_char::mc_cbracl)	//  '}' required.
			return constants::invalid_u32value;

		++curpos;
		return braced;
	}

	uchar32 ucp = translate_numbers(curpos, end, 16, 4, 4, 0xffff);

	if (ucp < 0xd800 || ucp > 0xdbff)
		return ucp;	//  Not a leading surrogate (or already invalid).

	//  Look ahead, without committing, for "\uXXXX" holding a trailing
	//  surrogate.
	const uchar32 *prefetch = curpos;

	if (prefetch == end || *prefetch != meta_char::mc_escape)
		return ucp;

	++prefetch;

	if (prefetch == end || *prefetch != char_alnum::ch_u)
		return ucp;

	const uchar32 nextucp = translate_numbers(++prefetch, end, 16, 4, 4, 0xffff);

	if (nextucp >= 0xdc00 && nextucp <= 0xdfff)
	{
		curpos = prefetch;
		ucp = (((ucp << 10) & 0xffc00) | (nextucp & 0x3ff)) + 0x10000;
	}

	return ucp;
}
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
//  Parses "{name}" / "{name=value}" at curpos and loads the code point
//  ranges of that property into pranges. Unknown properties, and properties
//  for which character_class.is_pos() is true, are reported as
//  error_property.
void get_property_ranges(range_pairs &pranges, const uchar32 *&curpos, const uchar32 *const end)
{
	const uint_l32 pnumber = lookup_propertynumber(curpos, end);
	const bool usable = pnumber != up_constants::error_property && !this->character_class.is_pos(pnumber);

	if (!usable)
		this->throw_error(regex_constants::error_property);

	this->character_class.load_upranges(pranges, pnumber);
}
|
|
|
|
//  Parses "{name}" or "{name=value}" at curpos and returns whatever
//  character_class.get_propertynumber() yields for that name/value pair.
//  Callers in this file compare the result against
//  up_constants::error_property.
uint_l32 lookup_propertynumber(const uchar32 *&curpos, const uchar32 *const end)
{
	pstring propname;
	pstring propvalue;

	get_property_name_and_value(propname, propvalue, curpos, end);

	const uint_l32 pnumber = this->character_class.get_propertynumber(propname, propvalue);

	return pnumber;
}
|
|
|
|
//  Parses "{value}" or "{name=value}" starting at the '{' and stores the
//  parts in pname / pvalue (pname is left untouched for the short form).
//  On return curpos is just past the '}'. Any syntax problem is reported as
//  error_escape.
//
//  get_property_name_or_value() appends a trailing space to a token that
//  contains a digit. Such a token is taken as a value: it is never followed
//  by "=value", and the marker is stripped again before returning.
void get_property_name_and_value(pstring &pname, pstring &pvalue, const uchar32 *&curpos, const uchar32 *const end)
{
	if (curpos == end || *curpos != meta_char::mc_cbraop)	//  '{'
		this->throw_error(regex_constants::error_escape);

	++curpos;
	get_property_name_or_value(pvalue, curpos, end);

	if (pvalue.size() == 0)
		this->throw_error(regex_constants::error_escape);

	const bool first_has_digit = static_cast<uchar32>(pvalue[pvalue.size() - 1]) == char_other::co_sp;	//  ' '

	if (!first_has_digit)	//  May be a name.
	{
		if (curpos == end)
			this->throw_error(regex_constants::error_escape);

		if (*curpos == meta_char::mc_eq)	//  '='
		{
			pname = pvalue;

			++curpos;
			get_property_name_or_value(pvalue, curpos, end);

			if (pvalue.size() == 0)
				this->throw_error(regex_constants::error_escape);
		}
	}

	if (curpos == end || *curpos != meta_char::mc_cbracl)	//  '}'
		this->throw_error(regex_constants::error_escape);

	//  Drop the "contains a digit" marker from the final value.
	const typename pstring::size_type lastpos = pvalue.size() - 1;

	if (static_cast<uchar32>(pvalue[lastpos]) == char_other::co_sp)	//  ' '
		pvalue.resize(lastpos);

	++curpos;
}
|
|
|
|
//  Collects the longest run of [A-Za-z0-9_] at curpos into name_or_value
//  (which is cleared first) and leaves curpos on the first character that
//  does not belong to the run, or at end.
//  If the run contained at least one digit, a single space is appended as a
//  marker; get_property_name_and_value() checks for and strips it.
void get_property_name_or_value(pstring &name_or_value, const uchar32 *&curpos, const uchar32 *const end) const
{
	bool number_found = false;

	name_or_value.clear();

	for (; curpos != end; ++curpos)
	{
		const uchar32 curchar = *curpos;

		const bool is_upper = curchar >= char_alnum::ch_A && curchar <= char_alnum::ch_Z;
		const bool is_lower = curchar >= char_alnum::ch_a && curchar <= char_alnum::ch_z;
		const bool is_lowline = curchar == char_other::co_ll;	//  '_'
		const bool is_digit = curchar >= char_alnum::ch_0 && curchar <= char_alnum::ch_9;

		if (!(is_upper || is_lower || is_lowline || is_digit))
			break;

		if (is_digit)
			number_found = true;

		name_or_value.append(1, static_cast<typename pstring::value_type>(curchar));
	}

	if (number_found)
		name_or_value.append(1, char_other::co_sp);	//  ' '
}
|
|
|
|
#endif // !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
//  Translates an escape sequence that appears outside a character class.
//  curpos points at the character following the backslash.
//
//    \p \P (v-mode only)  handled by parse_escape_p_vmode(); the result is
//                         either a registered character class or a state
//                         sequence appended to piece, and piecesize receives
//                         the atom's quantifier bounds;
//    \b \B                word boundary (st_boundary);
//    \k<name>             named backreference (unless SRELL_NO_NAMEDCAPTURE);
//    \1 ... \9            numbered backreference;
//    anything else        forwarded to translate_escseq().
//
//  A backslash at the end of the pattern is reported as error_escape.
bool translate_atom_escape(state_type &escatom
#if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)
	, state_array &piece
	, re_quantifier &piecesize
#else
	, state_array &
	, re_quantifier &
#endif
	, const uchar32 *&curpos, const uchar32 *const end, /* const */ re_compiler_state<charT> &cstate)
{
	if (curpos == end)
		this->throw_error(regex_constants::error_escape);

	escatom.character = *curpos;

#if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)
	if (this->is_vmode() && ((escatom.character | constants::asc_icase) == char_alnum::ch_p))
	{
		posdata_holder pos;

		parse_escape_p_vmode(pos, escatom, ++curpos, end, cstate);

		if (escatom.type == st_character_class)
			escatom.number = this->character_class.register_newclass(pos.ranges);
		else
			transform_seqdata(piece, pos);

		piecesize.set(escatom.quantifier.atleast, escatom.quantifier.atmost);
		return true;
	}
#endif	//  !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)

	const uchar32 escchar = escatom.character;

	if (escchar == char_alnum::ch_B || escchar == char_alnum::ch_b)	//  \B, \b.
	{
		if (escchar == char_alnum::ch_B)
			escatom.is_not = true;

		escatom.type = st_boundary;
		escatom.quantifier.reset(0);

		//  The boundary test refers to the same class as \w.
		if (this->is_icase())
		{
			this->character_class.setup_icase_word();
			escatom.number = static_cast<uint_l32>(re_character_class::icase_word);
		}
		else
		{
			escatom.number = static_cast<uint_l32>(re_character_class::word);
		}

		++curpos;
		return true;
	}

	//  Not supported (kept from the original as a note):
	//    \A, \Z, \z, and
	//    \R  (?>\r\n?|[\x0A-\x0C\x85\u{2028}\u{2029}])

	//  Backreferences.

#if !defined(SRELL_NO_NAMEDCAPTURE)
	if (escchar == char_alnum::ch_k)	//  \k<name>
		return parse_backreference_name(escatom, curpos, end, cstate);
#endif

	if (escchar >= char_alnum::ch_1 && escchar <= char_alnum::ch_9)	//  \1, \9.
		return parse_backreference_number(escatom, curpos, end, cstate);

	return translate_escseq(NULL, escatom, curpos, end, false);
}
|
|
|
|
//  Parses the decimal group number of a numbered backreference (curpos
//  points at its first digit), stores it in atom.number and turns atom into
//  a resolved st_backreference state via backreference_postprocess().
//  A result equal to constants::invalid_u32value is reported as
//  error_escape.
bool parse_backreference_number(state_type &atom, const uchar32 *&curpos, const uchar32 *const end, const re_compiler_state<charT> &cstate)
{
	//  22.2.1.1 Static Semantics: Early Errors:
	//    It is a Syntax Error if NcapturingParens >= 2^32 - 1.
	//  Hence the upper bound of 0xfffffffe.
	const uchar32 groupno = translate_numbers(curpos, end, 10, 0, 0, 0xfffffffe);

	if (groupno == constants::invalid_u32value)
		this->throw_error(regex_constants::error_escape);

	atom.backrefnumber_unresolved = false;
	atom.number = static_cast<uint_l32>(groupno);

	return backreference_postprocess(atom, cstate);
}
|
|
|
|
//  Final step shared by numbered and named backreferences: marks atom as
//  st_backreference and sets next2 to 1. Always returns true. The compiler
//  state parameter is unused.
bool backreference_postprocess(state_type &atom, const re_compiler_state<charT> & /* cstate */) const
{
	atom.type = st_backreference;
	atom.next2 = 1;

	//  atom.quantifier.atleast is no longer set here; per the original
	//  note, that was moved to check_backreferences().

	return true;
}
|
|
|
|
#if !defined(SRELL_NO_NAMEDCAPTURE)
|
|
//  Parses "\k<name>"; curpos points at the 'k'. If the name is already in
//  namedcaptures, atom.number becomes its group number. Otherwise the name
//  is appended to cstate.unresolved_gnames, atom.number holds its index in
//  that list and atom.backrefnumber_unresolved is set.
//  A missing '<' is reported as error_escape.
bool parse_backreference_name(state_type &atom, const uchar32 *&curpos, const uchar32 *const end, re_compiler_state<charT> &cstate)
{
	++curpos;	//  Skip 'k'.

	if (curpos == end || *curpos != meta_char::mc_lt)	//  '<'
		this->throw_error(regex_constants::error_escape);

	const gname_string groupname = get_groupname(++curpos, end, cstate);

	atom.number = this->namedcaptures[groupname];
	atom.backrefnumber_unresolved = (atom.number == groupname_mapper<charT>::notfound);

	if (atom.backrefnumber_unresolved)
	{
		//  Group not defined (yet): remember the name for later resolution.
		atom.number = static_cast<uint_l32>(cstate.unresolved_gnames.size());
		cstate.unresolved_gnames.push_back(groupname, atom.number);
	}

	return backreference_postprocess(atom, cstate);
}
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
gname_string get_groupname(const uchar32 *&curpos, const uchar32 *const end, re_compiler_state<charT> &cstate)
|
|
#else
|
|
gname_string get_groupname(const uchar32 *&curpos, const uchar32 *const end, re_compiler_state<charT> &)
|
|
#endif
|
|
{
|
|
charT mbstr[utf_traits::maxseqlen];
|
|
gname_string groupname;
|
|
|
|
#if !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
cstate.idchecker.setup();
|
|
#endif
|
|
for (;;)
|
|
{
|
|
if (curpos == end)
|
|
this->throw_error(regex_constants::error_escape);
|
|
|
|
uchar32 curchar = *curpos++;
|
|
|
|
if (curchar == meta_char::mc_gt) // '>'
|
|
break;
|
|
|
|
if (curchar == meta_char::mc_escape && curpos != end && *curpos == char_alnum::ch_u) // '\\', 'u'.
|
|
curchar = parse_escape_u(++curpos, end);
|
|
|
|
#if defined(SRELL_NO_UNICODE_PROPERTY)
|
|
if (curchar != meta_char::mc_escape)
|
|
#else
|
|
if (cstate.idchecker.is_identifier(curchar, groupname.size() != 0))
|
|
#endif
|
|
; // OK.
|
|
else
|
|
curchar = constants::invalid_u32value;
|
|
|
|
if (curchar == constants::invalid_u32value)
|
|
this->throw_error(regex_constants::error_escape);
|
|
|
|
const uchar32 seqlen = utf_traits::to_codeunits(mbstr, curchar);
|
|
for (uchar32 i = 0; i < seqlen; ++i)
|
|
groupname.append(1, mbstr[i]);
|
|
}
|
|
if (!groupname.size())
|
|
this->throw_error(regex_constants::error_escape);
|
|
|
|
return groupname;
|
|
}
|
|
#endif // !defined(SRELL_NO_NAMEDCAPTURE)
|
|
|
|
#if !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
//  v-mode handling of \p{...} / \P{...}. patom.character holds 'p' or 'P'
//  and curpos points at the '{'.
//
//  For a property for which character_class.is_pos() is false, pos.ranges
//  receives the (possibly case-unfolded, possibly negated) code point set,
//  patom.type becomes st_character_class and its quantifier is reset to 1.
//  For a property for which is_pos() is true, its raw sequence data is
//  split into pos and patom.quantifier is set from pos.length; \P on such a
//  property is reported as error_complement.
//  Unknown properties are reported as error_property. Always returns true.
bool parse_escape_p_vmode(posdata_holder &pos, state_type &patom, const uchar32 *&curpos, const uchar32 *const end, const re_compiler_state<charT> &cstate)
{
	if (curpos == end)
		this->throw_error(regex_constants::error_escape);

	if (patom.character == char_alnum::ch_P)	//  \P{...}
		patom.is_not = true;

	patom.number = lookup_propertynumber(curpos, end);

	if (patom.number == up_constants::error_property)
		this->throw_error(regex_constants::error_property);

	if (this->character_class.is_pos(patom.number))
	{
		simple_array<uchar32> sequences;

		this->character_class.get_prawdata(sequences, patom.number);
		pos.split_seqs_and_ranges(sequences, this->is_icase(), cstate.back);

		patom.quantifier.set(pos.length.first, pos.length.second);

		if (patom.is_not)
			this->throw_error(regex_constants::error_complement);

		return true;
	}

	pos.clear();

	this->character_class.load_upranges(pos.ranges, patom.number);

	//  Case unfolding is applied only to class numbers at or beyond
	//  number_of_predefcls.
	const bool beyond_predefined = patom.number >= static_cast<uint_l32>(re_character_class::number_of_predefcls);

	if (this->is_icase() && beyond_predefined)
		pos.ranges.make_caseunfoldedcharset();

	if (patom.is_not)
	{
		pos.ranges.negation();
		patom.is_not = false;
	}

	patom.type = st_character_class;
	patom.quantifier.reset(1);

	return true;
}
|
|
|
|
//  Converts the data collected in pos into NFA states appended to piece.
//  pos.ranges is always registered as a new character class, whose number
//  is returned. If pos holds no sequence index data, nothing is appended.
//
//  Otherwise piece is built as a chain of alternatives: for every sequence
//  length L from pos.indices.size() - 1 down to 2, each L-character
//  sequence becomes
//      branch('|', atleast = L), L character states, jump
//  and the chain ends with an optional branch for the empty string and the
//  character class state. The whole is wrapped between two ':' epsilon
//  states and handed to optimise_pos().
uint_l32 transform_seqdata(state_array &piece, const posdata_holder &pos)
{
	uchar32 len = static_cast<uchar32>(pos.indices.size());
	state_type ccatom;

	ccatom.reset();
	ccatom.type = st_character_class;
	ccatom.number = this->character_class.register_newclass(pos.ranges);

	if (len == 0)
		return ccatom.number;

	const bool has_empty = pos.has_empty();
	state_size_type prevbranch_end = 0;	//  Index in piece of the most recent branch state.
	state_type branchatom;
	state_type jumpatom;
	state_array branch;

	branch.resize(len);
	for (uchar32 i = 0; i < len; ++i)
		branch[i].reset();

	branchatom.reset();
	branchatom.type = st_epsilon;
	branchatom.character = meta_char::mc_bar;	//  '|'

	jumpatom.reset();
	jumpatom.type = st_epsilon;

	--len;

	while (len >= 2)
	{
		//  Sequences of length len occupy pos.seqs[cursor .. stop).
		uchar32 cursor = pos.indices[len];
		const uchar32 stop = pos.indices[len - 1];

		if (cursor != stop)
		{
			//  Template for one alternative: len characters plus a jump.
			branch.resize(len + 1);
			branch[len] = jumpatom;

			branchatom.quantifier.atleast = len;

			for (uchar32 filled = 0; cursor < stop; ++cursor)
			{
				branch[filled++].character = pos.seqs[cursor];

				if (filled != len)
					continue;

				//  One complete sequence collected: link the previous
				//  alternative to it and emit it.
				if (piece.size())
				{
					state_type &lastatom = piece[piece.size() - 1];

					lastatom.next1 = len + 2;
					piece[prevbranch_end].next2 = static_cast<std::ptrdiff_t>(piece.size() - prevbranch_end);
				}
				prevbranch_end = piece.size();
				piece.push_back(branchatom);
				piece.append(branch);
				filled = 0;
			}
		}

		--len;
	}

	if (piece.size())
	{
		state_type &lastatom = piece[piece.size() - 1];

		//  Jump over the tail: [empty branch,] class state.
		lastatom.next1 = has_empty ? 3 : 2;

		piece[prevbranch_end].next2 = static_cast<std::ptrdiff_t>(piece.size() - prevbranch_end);
	}

	branchatom.quantifier.atleast = 1;

	if (has_empty)
	{
		branchatom.next2 = 2;
		piece.push_back(branchatom);
	}

	piece.push_back(ccatom);

	//  Wrap everything between a leading and a trailing ':' epsilon.
	branchatom.character = meta_char::mc_colon;	//  ':'
	branchatom.next1 = 1;
	branchatom.next2 = 0;
	branchatom.quantifier.set(1, 0);
	piece.insert(0, branchatom);
	++prevbranch_end;

	branchatom.quantifier.atmost = 1;
	piece.push_back(branchatom);

	optimise_pos(piece, branchatom, has_empty);

	return ccatom.number;
}
|
|
|
|
//  Post-processes the alternative chain built by transform_seqdata().
//  Compiled out entirely when SRELLDBG_NO_ASTERISK_OPT or
//  SRELLDBG_NO_POS_OPT is defined.
//
//  Starting at piece[1], the branch states of the chain are visited in
//  order. For each branch ("source") its character sequence is compared,
//  for decreasing prefix lengths, against the alternatives that follow it.
//  Where a later alternative shares the prefix but differs at the next
//  character, the source state at that position gets a next2 link to the
//  differing state, which is marked by clearing its is_greedy flag and
//  storing the source position in its quantifier.atmost.
//
//  branchatom is used as scratch storage:
//    number  count of branches processed so far,
//    next1   index of the first branch processed,
//    next2   index where the character class alternative begins.
//  Branches after the first get next2 = 0 and character = 's'.
//
//  NOTE(review): ins_bt is filled with (source, destination) pairs but is
//  not consumed anywhere inside this function.
void optimise_pos(state_array &piece, state_type &branchatom, const bool has_empty) const
{
#if !defined(SRELLDBG_NO_ASTERISK_OPT) && !defined(SRELLDBG_NO_POS_OPT)

	simple_array<uchar32> ins_bt;

	branchatom.number = 0u;
	branchatom.next1 = 0;

	state_size_type srcbase = 1;

	while (srcbase < piece.size())
	{
		state_type &srcbaseatom = piece[srcbase];

		if (!srcbaseatom.is_branch())
			break;

		//  A branch with atleast == 1 ends the scan.
		if (srcbaseatom.quantifier.atleast == 1)
			break;

		const state_size_type dstbaseinit = srcbase + srcbaseatom.next2;
		uint_l32 complen = srcbaseatom.quantifier.atleast;

		while (complen != 0)
		{
			state_size_type dstbase = dstbaseinit;
			bool modified = false;

			--complen;	//  Length of the prefix compared in this round.

			for (;;)
			{
				state_type &dstbaseatom = piece[dstbase];
				const uint_l32 dstseqlen = dstbaseatom.quantifier.atleast;
				state_size_type dstpos = dstbase + (dstbaseatom.type == st_epsilon ? 1 : 0);

				if (piece[dstpos].type == st_character_class)
				{
					//  Reached the character class alternative.
					branchatom.next2 = static_cast<std::ptrdiff_t>(dstbase);
					break;
				}

				if (dstseqlen < complen)
					break;

				state_size_type srcpos = srcbase + 1;

				for (uint_l32 i = 0;; ++i)
				{
					state_type &srcref = piece[srcpos];
					state_type &dstref = piece[dstpos];

					if (i == complen)
					{
						//  The first complen characters are identical.
						if (dstref.type == st_epsilon)
						{
							if (complen)
							{
								uchar32 inspos = static_cast<uchar32>(ins_bt.size());

								//  Follow the links stored in atmost until
								//  a state with is_greedy set is reached.
								while (!piece[srcpos].quantifier.is_greedy)
									srcpos = static_cast<uchar32>(piece[srcpos].quantifier.atmost);

								//  Keep the pairs ordered by source position.
								while (inspos >= 2 && (ins_bt[inspos - 2] > srcpos))
									inspos -= 2;

								ins_bt.insert(inspos, dstpos);
								ins_bt.insert(inspos, srcpos);
							}
						}
						else if (dstref.type == st_character && srcref.character != dstref.character)
						{
							if (dstref.quantifier.is_greedy)
							{
								piece[srcpos].next2 = static_cast<std::ptrdiff_t>(dstpos - srcpos);
								dstref.quantifier.is_greedy = false;
								dstref.quantifier.atmost = srcpos;
							}
							modified = true;
						}

						break;
					}

					if (srcref.character == dstref.character && dstref.type == st_character)
					{
						++srcpos;
						++dstpos;
					}
					else
					{
						break;
					}
				}

				if (modified)
					break;

				//  Advance to the next alternative, if there is one.
				if (!(dstbaseatom.type == st_epsilon && dstbaseatom.next2 != 0))
					break;

				dstbase += dstbaseatom.next2;
			}
		}

		if (branchatom.number == 0)
		{
			branchatom.next1 = static_cast<std::ptrdiff_t>(srcbase);
		}
		else
		{
			srcbaseatom.next2 = 0;
			srcbaseatom.character = char_alnum::ch_s;
		}

		srcbase = dstbaseinit;

		++branchatom.number;
	}

	if (branchatom.next2 == 0)
		++branchatom.next2;

	if (branchatom.next1 != 0)
		piece[branchatom.next1].next2 = static_cast<std::ptrdiff_t>(branchatom.next2 - branchatom.next1);

	if (has_empty)
	{
		piece[branchatom.next2].next2 = 0;
		piece[branchatom.next2 + 1].next2 = 1;
	}

#endif	//  !defined(SRELLDBG_NO_ASTERISK_OPT) && !defined(SRELLDBG_NO_POS_OPT)
}
|
|
|
|
//  Inserts one '|' epsilon state into piece for every (source, destination)
//  index pair stored consecutively in ins_bt, and re-bases the relative
//  next1 / next2 offsets of the existing states so that they still reach
//  the same targets after the insertions.
//
//  For an original index i,
//    reordering1[i]  is the new index of whatever is inserted at i (or of
//                    state i itself if nothing is inserted there);
//    reordering2[i]  is the new index of the original state i.
//  Link targets are mapped through reordering1, link origins through
//  reordering2. The first loop consumes the source entries of ins_bt in
//  order, so they are expected to be sorted ascending.
void insert_btbranch(state_array &piece, const simple_array<uchar32> &ins_bt) const
{
	state_type insstate;
	simple_array<uchar32> reordering1;
	simple_array<uchar32> reordering2;
	uchar32 shift = 0;	//  Number of insertions at or before the current index.
	uchar32 pairindex = 0;	//  Position in ins_bt of the next source entry.

	insstate.reset();
	insstate.type = st_epsilon;
	insstate.character = meta_char::mc_bar;

	reordering1.resize(piece.size() + 1);
	reordering2.resize(piece.size() + 1);

	//  Build both index maps; one slot past the end is included so that
	//  links pointing just behind the last state can be mapped too.
	for (uchar32 indx = 0; indx <= piece.size(); ++indx)
	{
		reordering1[indx] = indx + shift;

		if (pairindex < ins_bt.size() && indx == ins_bt[pairindex])
		{
			++shift;
			pairindex += 2;
		}

		reordering2[indx] = indx + shift;
	}

	//  Re-base the relative links of the existing states.
	for (uchar32 indx = 0; indx < piece.size(); ++indx)
	{
		state_type &st = piece[indx];
		const uchar32 newself = reordering2[indx];

		if (st.next1 != 0)
		{
			const uchar32 newn1 = reordering1[indx + st.next1];

			st.next1 = static_cast<std::ptrdiff_t>(newn1 - newself);
		}

		if (st.next2 != 0)
		{
			const uchar32 newn2 = reordering1[indx + st.next2];

			st.next2 = static_cast<std::ptrdiff_t>(newn2 - newself);
		}
	}

	//  Finally insert the branch states themselves.
	for (uchar32 indx = 0; indx < ins_bt.size(); indx += 2)
	{
		const uchar32 srcpos = reordering1[ins_bt[indx]];
		const uchar32 dstpos = reordering1[ins_bt[indx + 1]];

		piece.insert(srcpos, insstate);
		piece[srcpos].next2 = static_cast<std::ptrdiff_t>(dstpos - srcpos);
	}
}
|
|
|
|
#endif // !defined(SRELL_NO_VMODE) && !defined(SRELL_NO_UNICODE_PROPERTY)
|
|
|
|
//  Parses an optional quantifier at curpos and applies it to quantifier:
//    *       atleast is decremented, atmost becomes infinity;
//    +       atmost becomes infinity;
//    ?       atleast is decremented;
//    {n} {n,} {n,m}   via get_brace_with_quantifier().
//  A '?' directly after the quantifier clears is_greedy. If curpos is not
//  at a quantifier, nothing is changed and curpos is not moved.
//  Always returns true. *curpos is read unconditionally, so the caller must
//  ensure curpos != end.
bool get_quantifier(re_quantifier &quantifier, const uchar32 *&curpos, const uchar32 *const end)
{
	switch (*curpos)
	{
	case meta_char::mc_astrsk:	//  '*'
		--quantifier.atleast;
		quantifier.set_infinity();
		break;

	case meta_char::mc_plus:	//  '+'
		quantifier.set_infinity();
		break;

	case meta_char::mc_query:	//  '?'
		--quantifier.atleast;
		break;

	case meta_char::mc_cbraop:	//  '{'
		//  Leaves curpos on the closing '}'.
		get_brace_with_quantifier(quantifier, curpos, end);
		break;

	default:
		return true;
	}

	++curpos;	//  Step over the last character of the quantifier.

	if (curpos != end && *curpos == meta_char::mc_query)	//  '?': non-greedy.
	{
		quantifier.is_greedy = false;
		++curpos;
	}

	return true;
}
|
|
|
|
//  Parses "{n}", "{n,}" or "{n,m}"; curpos points at the '{'. On success
//  quantifier.atleast / atmost are set and curpos is left ON the closing
//  '}' (the caller steps over it).
//  A missing number, a missing '}' or a premature end of pattern is
//  reported as error_brace; bounds for which quantifier.is_valid() is false
//  are reported as error_badbrace.
void get_brace_with_quantifier(re_quantifier &quantifier, const uchar32 *&curpos, const uchar32 *const end)
{
	const uint_l32 invalid = static_cast<uint_l32>(constants::invalid_u32value);

	++curpos;	//  Skip '{'.

	quantifier.atleast = static_cast<uint_l32>(translate_numbers(curpos, end, 10, 1, 0, constants::max_u32value));

	if (quantifier.atleast == invalid || curpos == end)
	{
		this->throw_error(regex_constants::error_brace);
		return;
	}

	if (*curpos != meta_char::mc_comma)	//  "{n}"
	{
		quantifier.atmost = quantifier.atleast;
	}
	else	//  "{n," ...
	{
		++curpos;

		quantifier.atmost = static_cast<uint_l32>(translate_numbers(curpos, end, 10, 1, 0, constants::max_u32value));

		//  No upper bound given: "{n,}".
		if (quantifier.atmost == invalid)
			quantifier.set_infinity();

		if (!quantifier.is_valid())
			this->throw_error(regex_constants::error_badbrace);
	}

	if (curpos == end || *curpos != meta_char::mc_cbracl)	//  '}'
		this->throw_error(regex_constants::error_brace);

	//  *curpos == '}'
}
|
|
|
|
// Converts a run of digits starting at *curpos into a number.
//   curpos:   advanced past every digit that was accepted.
//   end:      one past the last character of the pattern.
//   radix:    numeric base handed to tonumber(); must be positive.
//   minsize:  minimum number of digits that must be accepted.
//   maxsize:  maximum number of digits to accept; 0 means no limit.
//   maxvalue: largest value that may be produced.
// Scanning stops, without consuming the offending character, at the first
// non-digit, at end, once maxsize digits were taken, or when accepting one
// more digit would push the value above maxvalue.
// Returns the accumulated value, or constants::invalid_u32value if fewer than
// minsize digits were accepted.
uchar32 translate_numbers(const uchar32 *&curpos, const uchar32 *const end, const int radix, const std::size_t minsize, const std::size_t maxsize, const uchar32 maxvalue) const
{
    const uchar32 base = static_cast<uchar32>(radix);
    std::size_t count = 0;
    uchar32 u32value = 0;

    for (; maxsize == 0 || count < maxsize; ++curpos, ++count)
    {
        if (curpos == end)
            break;

        const int num = tonumber(*curpos, radix);

        if (num == -1)
            break;

        const uchar32 digit = static_cast<uchar32>(num);

        // Bound check done BEFORE multiplying. Testing the product afterwards
        // ("nextvalue < u32value") misses wrap-arounds whose truncated result
        // still exceeds the previous value, e.g. 536870912 * 10 in 32 bits.
        // u32value * base + digit > maxvalue
        //   <=>  u32value > (maxvalue - digit) / base   (integer division).
        if (digit > maxvalue || u32value > (maxvalue - digit) / base)
            break;

        u32value = u32value * base + digit;
    }

    if (count >= minsize)
        return u32value;

    return constants::invalid_u32value;
}
|
|
|
|
int tonumber(const uchar32 ch, const int radix) const
|
|
{
|
|
if ((ch >= char_alnum::ch_0 && ch <= char_alnum::ch_7) || (radix >= 10 && (ch == char_alnum::ch_8 || ch == char_alnum::ch_9)))
|
|
return static_cast<int>(ch - char_alnum::ch_0);
|
|
|
|
if (radix == 16)
|
|
{
|
|
if (ch >= char_alnum::ch_a && ch <= char_alnum::ch_f)
|
|
return static_cast<int>(ch - char_alnum::ch_a + 10);
|
|
|
|
if (ch >= char_alnum::ch_A && ch <= char_alnum::ch_F)
|
|
return static_cast<int>(ch - char_alnum::ch_A + 10);
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
// Post-processes every st_backreference state in NFA_states.
//   * With named captures enabled, a still unresolved reference has its
//     number replaced by the group number looked up through
//     cstate.unresolved_gnames and this->namedcaptures.
//   * The first st_roundbracket_close state carrying the same number is then
//     located. If it lies before the backreference, the reference's
//     quantifier.atleast is taken from cstate.atleast_widths_of_brackets and
//     cstate.backref_used is set. Otherwise the reference is turned into a
//     plain st_epsilon with next2 == 0.
// Returns false when a name cannot be resolved, when no closing bracket with
// that number exists, or when an index falls outside its table; true
// otherwise.
bool check_backreferences(re_compiler_state<charT> &cstate)
{
    for (typename state_array::size_type refpos = 0; refpos < this->NFA_states.size(); ++refpos)
    {
        state_type &refstate = this->NFA_states[refpos];

        if (refstate.type != st_backreference)
            continue;

#if !defined(SRELL_NO_NAMEDCAPTURE)
        if (refstate.backrefnumber_unresolved)
        {
            // Until resolved, number is an index into unresolved_gnames.
            if (refstate.number >= cstate.unresolved_gnames.size())
                return false; // Internal error.

            refstate.number = this->namedcaptures[cstate.unresolved_gnames[refstate.number]];

            if (refstate.number == groupname_mapper<charT>::notfound)
                return false;

            refstate.backrefnumber_unresolved = false;
        }
#endif

        // Find the first closing bracket that carries the referenced number.
        typename state_array::size_type closepos = 0;

        for (; closepos < this->NFA_states.size(); ++closepos)
        {
            const state_type &candidate = this->NFA_states[closepos];

            if (candidate.type == st_roundbracket_close && candidate.number == refstate.number)
                break;
        }

        if (closepos >= this->NFA_states.size())
            return false; // No such group.

        if (closepos < refpos)
        {
            // 20210429: It was reported that clang-tidy was dissatisfied with
            // indexing by (number - 1) directly.
            // 20211006: Replaced with the bounds-checked form below.
            const uint_l32 widthindex = refstate.number - 1;

            // This can never be true. Added only for satisfying clang-tidy.
            if (widthindex >= cstate.atleast_widths_of_brackets.size())
                return false;

            refstate.quantifier.atleast = cstate.atleast_widths_of_brackets[widthindex];
            cstate.backref_used = true;
        }
        else
        {
            // The group closes at or after the reference: neutralise it.
            refstate.type = st_epsilon;
            refstate.next2 = 0;
        }
    }
    return true;
}
|
|
|
|
#if !defined(SRELLDBG_NO_1STCHRCLS)
|
|
|
|
void create_firstchar_class()
|
|
{
|
|
#if !defined(SRELLDBG_NO_BITSET)
|
|
range_pairs fcc;
|
|
#else
|
|
range_pairs &fcc = this->firstchar_class;
|
|
#endif
|
|
|
|
const bool canbe0length = gather_nextchars(fcc, static_cast<typename state_array::size_type>(this->NFA_states[0].next1), 0u, false);
|
|
|
|
if (canbe0length)
|
|
{
|
|
fcc.set_solerange(range_pair_helper(0, constants::unicode_max_codepoint));
|
|
// Expressions would consist of assertions only, such as /^$/.
|
|
// We cannot but accept every codepoint.
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_BITSET)
|
|
this->NFA_states[0].quantifier.atleast = this->character_class.register_newclass(fcc);
|
|
|
|
set_bitset_table(fcc);
|
|
#endif
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_BITSET)
|
|
void set_bitset_table(const range_pairs &fcc)
|
|
{
|
|
for (typename range_pairs::size_type i = 0; i < fcc.size(); ++i)
|
|
{
|
|
const range_pair &range = fcc[i];
|
|
|
|
#if 0
|
|
uchar32 second = range.second <= constants::unicode_max_codepoint ? range.second : constants::unicode_max_codepoint;
|
|
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(push)
|
|
#pragma warning(disable:4127)
|
|
#endif
|
|
if (utf_traits::utftype == 16)
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(pop)
|
|
#endif
|
|
{
|
|
if (second >= 0x10000 && range.first < 0x10000)
|
|
{
|
|
this->firstchar_class_bs.set_range(utf_traits::firstcodeunit(0x10000) & utf_traits::bitsetmask, utf_traits::firstcodeunit(second) & utf_traits::bitsetmask);
|
|
second = 0xffff;
|
|
}
|
|
}
|
|
this->firstchar_class_bs.set_range(utf_traits::firstcodeunit(range.first) & utf_traits::bitsetmask, utf_traits::firstcodeunit(second) & utf_traits::bitsetmask);
|
|
|
|
#else
|
|
for (uchar32 ucp = range.first; ucp <= constants::unicode_max_codepoint; ++ucp)
|
|
{
|
|
this->firstchar_class_bs.set(utf_traits::firstcodeunit(ucp) & utf_traits::bitsetmask);
|
|
|
|
if (ucp == range.second)
|
|
break;
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
#endif // !defined(SRELLDBG_NO_BITSET)
|
|
#endif // !defined(SRELLDBG_NO_1STCHRCLS)
|
|
|
|
// Walks the automaton from pos and accumulates into nextcharclass the code
// points that may be consumed next.
//   nextcharclass:  receives the collected characters (joined/merged into
//                   whatever it already holds, or replaced by the full range
//                   where noted below).
//   pos:            index of the state to start from.
//   checked:        one flag per state; states already flagged are not
//                   visited again, and every visited state gets flagged.
//   bracket_number: an st_roundbracket_close with this number ends the walk
//                   like st_success does.
//   subsequent:     when true, st_bol/st_eol/st_boundary and lookarounds that
//                   are not followed into widen the result to every code
//                   point.
// Returns true when an explored path reached st_success or the closing
// bracket numbered bracket_number; the result of a followed next2 branch is
// carried along and returned when the walk ends at a character state.
bool gather_nextchars(range_pairs &nextcharclass, typename state_array::size_type pos, simple_array<bool> &checked, const uint_l32 bracket_number, const bool subsequent) const
{
    bool reached_end = false;

    while (!checked[pos])
    {
        const state_type &state = this->NFA_states[pos];

        checked[pos] = true;

        // next2 is followed recursively unless the state type excludes it.
        const bool explore_next2 = state.next2
            && (state.type != st_check_counter || !state.quantifier.is_greedy || state.quantifier.atleast == 0)
            && (state.type != st_save_and_reset_counter)
            && (state.type != st_roundbracket_open)
            && (state.type != st_roundbracket_close || state.number != bracket_number)
            && (state.type != st_repeat_in_push)
            && (state.type != st_backreference || (state.quantifier.atleast == 0 && state.next1 != state.next2))
            && (state.type != st_lookaround_open);

        if (explore_next2 && gather_nextchars(nextcharclass, pos + state.next2, checked, bracket_number, subsequent))
            reached_end = true;

        switch (state.type)
        {
        case st_character:
            if (this->is_ricase())
            {
                // Add every member of the character's case-folding set.
                uchar32 table[ucf_constants::rev_maxset] = {};
                const uchar32 setnum = unicode_case_folding::casefoldedcharset(table, state.character);

                for (uchar32 j = 0; j < setnum; ++j)
                    nextcharclass.join(range_pair_helper(table[j]));
            }
            else
            {
                nextcharclass.join(range_pair_helper(state.character));
            }
            return reached_end;

        case st_character_class:
            nextcharclass.merge(this->character_class[state.number]);
            return reached_end;

        case st_backreference:
            {
                // Collect from the referenced group's body, using a fresh
                // "checked" table (four-argument overload).
                const typename state_array::size_type nextpos = find_next1_of_bracketopen(state.number);
                const bool length0 = gather_nextchars(nextcharclass, nextpos, state.number, subsequent);

                if (!length0)
                    return reached_end;
            }
            break;

        case st_eol:
        case st_bol:
        case st_boundary:
            if (subsequent)
                nextcharclass.set_solerange(range_pair_helper(0, constants::unicode_max_codepoint));
            break;

        case st_lookaround_open:
            // if (!state.is_not && !state.reverse)
            if (!state.is_not && state.quantifier.atleast == 0)
            {
                gather_nextchars(nextcharclass, pos + 1, checked, 0u, subsequent);
            }
            else if (subsequent)
            {
                nextcharclass.set_solerange(range_pair_helper(0, constants::unicode_max_codepoint));
            }
            break;

        case st_roundbracket_close:
            if (/* bracket_number == 0 || */ state.number != bracket_number)
                break;
            //@fallthrough@

        case st_success: // == st_lookaround_close.
            return true;

        case st_check_counter:
            if (!state.quantifier.is_greedy && state.quantifier.atleast >= 1)
                return reached_end;
            //@fallthrough@

        default:;
        }

        if (state.next1 == 0)
            break;

        pos += state.next1;
    }
    return reached_end;
}
|
|
|
|
// Convenience overload: runs the five-argument gather_nextchars() with a
// freshly created "checked" table holding one cleared flag per NFA state.
// Parameters and return value are those of the five-argument overload.
bool gather_nextchars(range_pairs &nextcharclass, const typename state_array::size_type pos, const uint_l32 bracket_number, const bool subsequent) const
{
    simple_array<bool> visited;

    visited.resize(this->NFA_states.size(), false);

    const bool reached_end = gather_nextchars(nextcharclass, pos, visited, bracket_number, subsequent);

    return reached_end;
}
|
|
|
|
typename state_array::size_type find_next1_of_bracketopen(const uint_l32 bracketno) const
|
|
{
|
|
for (typename state_array::size_type no = 0; no < this->NFA_states.size(); ++no)
|
|
{
|
|
const state_type &state = this->NFA_states[no];
|
|
|
|
if (state.type == st_roundbracket_open && state.number == bracketno)
|
|
return no + state.next1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void relativejump_to_absolutejump()
|
|
{
|
|
for (typename state_array::size_type pos = 0; pos < this->NFA_states.size(); ++pos)
|
|
{
|
|
state_type &state = this->NFA_states[pos];
|
|
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
if (state.next1 || state.type == st_character || state.type == st_character_class)
|
|
#else
|
|
if (state.next1)
|
|
#endif
|
|
state.next_state1 = &this->NFA_states[pos + state.next1];
|
|
else
|
|
state.next_state1 = NULL;
|
|
|
|
if (state.next2)
|
|
state.next_state2 = &this->NFA_states[pos + state.next2];
|
|
else
|
|
state.next_state2 = NULL;
|
|
}
|
|
}
|
|
|
|
void optimise()
|
|
{
|
|
#if !defined(SRELLDBG_NO_BRANCH_OPT2) && !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
branch_optimisation2();
|
|
#endif
|
|
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
asterisk_optimisation();
|
|
#endif
|
|
|
|
#if !defined(SRELLDBG_NO_BRANCH_OPT) && !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
branch_optimisation();
|
|
#endif
|
|
|
|
#if !defined(SRELLDBG_NO_1STCHRCLS)
|
|
create_firstchar_class();
|
|
#endif
|
|
|
|
#if !defined(SRELLDBG_NO_SKIP_EPSILON)
|
|
skip_epsilon();
|
|
#endif
|
|
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
set_charclass_posinfo();
|
|
#endif
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_SKIP_EPSILON)
|
|
|
|
void skip_epsilon()
|
|
{
|
|
for (typename state_array::size_type pos = 0; pos < this->NFA_states.size(); ++pos)
|
|
{
|
|
state_type &state = this->NFA_states[pos];
|
|
|
|
if (state.next1)
|
|
state.next1 = static_cast<std::ptrdiff_t>(skip_nonbranch_epsilon(pos + state.next1) - pos);
|
|
|
|
if (state.next2)
|
|
state.next2 = static_cast<std::ptrdiff_t>(skip_nonbranch_epsilon(pos + state.next2) - pos);
|
|
}
|
|
}
|
|
|
|
// Starting at pos, follows next1 for as long as the current state is an
// st_epsilon without a second branch (next2 == 0).
// Returns the index of the first state that is not such an epsilon.
typename state_array::size_type skip_nonbranch_epsilon(typename state_array::size_type pos) const
{
    while (this->NFA_states[pos].type == st_epsilon && this->NFA_states[pos].next2 == 0)
        pos += this->NFA_states[pos].next1;

    return pos;
}
|
|
|
|
#endif
|
|
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
|
|
void asterisk_optimisation()
|
|
{
|
|
state_type *prevstate_is_astrskepsilon = NULL;
|
|
const state_type *prevcharstate = NULL;
|
|
state_size_type mnp_inspos = 0;
|
|
bool inspos_updatable = true;
|
|
#if !defined(SRELLDBG_NO_SPLITCC)
|
|
bool inserted = false;
|
|
#endif
|
|
|
|
for (typename state_array::size_type cur = 1; cur < this->NFA_states.size(); ++cur)
|
|
{
|
|
state_type &curstate = this->NFA_states[cur];
|
|
|
|
switch (curstate.type)
|
|
{
|
|
case st_epsilon:
|
|
if (curstate.character == meta_char::mc_astrsk)
|
|
{
|
|
prevstate_is_astrskepsilon = &curstate;
|
|
}
|
|
else
|
|
{
|
|
prevstate_is_astrskepsilon = NULL;
|
|
inspos_updatable = false;
|
|
}
|
|
break;
|
|
|
|
case st_character:
|
|
case st_character_class:
|
|
if (inspos_updatable)
|
|
{
|
|
if (prevcharstate)
|
|
{
|
|
if (prevcharstate->type != curstate.type || prevcharstate->number != curstate.number)
|
|
inspos_updatable = false;
|
|
}
|
|
if (inspos_updatable)
|
|
{
|
|
if (prevstate_is_astrskepsilon)
|
|
{
|
|
inspos_updatable = false;
|
|
if (prevstate_is_astrskepsilon->quantifier.is_asterisk_or_plus())
|
|
{
|
|
mnp_inspos = cur + 1;
|
|
}
|
|
}
|
|
}
|
|
prevcharstate = &curstate;
|
|
}
|
|
if (prevstate_is_astrskepsilon)
|
|
{
|
|
const re_quantifier &eq = prevstate_is_astrskepsilon->quantifier;
|
|
const state_size_type epsilonno = cur - 1;
|
|
const state_size_type faroffset = eq.is_greedy ? prevstate_is_astrskepsilon->next2 : prevstate_is_astrskepsilon->next1;
|
|
const state_size_type nextno = epsilonno + faroffset;
|
|
#if !defined(SRELLDBG_NO_SPLITCC)
|
|
const state_size_type origlen = this->NFA_states.size();
|
|
#endif
|
|
|
|
if (is_exclusive_sequence(eq, cur, nextno))
|
|
{
|
|
state_type &epsilonstate = this->NFA_states[epsilonno];
|
|
state_type &curstate2 = this->NFA_states[cur];
|
|
|
|
epsilonstate.next1 = 1;
|
|
epsilonstate.next2 = 0;
|
|
epsilonstate.number = 0;
|
|
// curstate2.quantifier.is_greedy = true;
|
|
if (epsilonstate.quantifier.is_infinity())
|
|
{
|
|
curstate2.next1 = 0;
|
|
curstate2.next2 = faroffset - 1;
|
|
}
|
|
else // ? or {0,1}
|
|
{
|
|
curstate2.next2 = faroffset - 1;
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_SPLITCC)
|
|
if (mnp_inspos == nextno && origlen != this->NFA_states.size())
|
|
inserted = true;
|
|
#endif
|
|
}
|
|
prevstate_is_astrskepsilon = NULL;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
prevstate_is_astrskepsilon = NULL;
|
|
inspos_updatable = false;
|
|
}
|
|
}
|
|
|
|
#if !defined(SRELLDBG_NO_NEXTPOS_OPT)
|
|
|
|
if (mnp_inspos != 0)
|
|
{
|
|
state_size_type cur = mnp_inspos;
|
|
|
|
if (this->NFA_states[cur].type != st_success)
|
|
{
|
|
const state_type &prevstate = this->NFA_states[cur - 1];
|
|
|
|
#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) && !defined(SRELLDBG_NO_MPREWINDER) && !defined(SRELLDBG_NO_1STCHRCLS) && !defined(SRELLDBG_NO_BITSET)
|
|
|
|
#if !defined(SRELLDBG_NO_SPLITCC)
|
|
if (!inserted && prevstate.next1 == 0)
|
|
#else
|
|
if (prevstate.next1 == 0)
|
|
#endif
|
|
{
|
|
range_pairs prevcc;
|
|
range_pairs nextcc;
|
|
|
|
// gather_if_char_or_cc_strict(prevcc, prevstate);
|
|
if (prevstate.type == st_character)
|
|
{
|
|
prevcc.set_solerange(range_pair_helper(prevstate.character));
|
|
}
|
|
else if (prevstate.type == st_character_class)
|
|
{
|
|
prevcc = this->character_class[prevstate.number];
|
|
}
|
|
|
|
gather_nextchars(nextcc, cur, 0u, true);
|
|
|
|
const uint_l32 cpnum_prevcc = prevcc.total_codepoints();
|
|
const uint_l32 cpnum_nextcc = nextcc.total_codepoints();
|
|
|
|
if (cpnum_nextcc != 0 && cpnum_nextcc < cpnum_prevcc)
|
|
{
|
|
state_array newNFAs;
|
|
state_type atom;
|
|
|
|
atom.reset();
|
|
atom.character = meta_char::mc_eq; // '='
|
|
atom.type = st_lookaround_open;
|
|
atom.next1 = static_cast<std::ptrdiff_t>(cur - 1) * 2 + 2;
|
|
atom.next2 = 1;
|
|
atom.quantifier.atleast = 2; // Match point rewinder.
|
|
newNFAs.append(1, atom);
|
|
|
|
newNFAs.append(this->NFA_states, 1, cur - 1);
|
|
|
|
atom.type = st_lookaround_close;
|
|
atom.next1 = 0;
|
|
atom.next2 = 0;
|
|
newNFAs.append(1, atom);
|
|
|
|
insert_at(1, newNFAs.size());
|
|
this->NFA_states.replace(1, newNFAs.size(), newNFAs);
|
|
this->NFA_states[0].next2 = this->NFA_states[0].next1;
|
|
this->NFA_states[0].next1 = 1;
|
|
|
|
return;
|
|
}
|
|
}
|
|
#endif // !defined(SRELL_FIXEDWIDTHLOOKBEHIND) && !defined(SRELLDBG_NO_MPREWINDER) && !defined(SRELLDBG_NO_1STCHRCLS) && !defined(SRELLDBG_NO_BITSET)
|
|
|
|
insert_at(cur, 1);
|
|
state_type &mnpstate = this->NFA_states[cur];
|
|
state_type &charstate = this->NFA_states[cur - 1];
|
|
|
|
mnpstate.type = st_move_nextpos;
|
|
|
|
#if !defined(SRELLDBG_NO_SPLITCC)
|
|
|
|
if (inserted)
|
|
{
|
|
charstate.next2 = 1;
|
|
}
|
|
else
|
|
#endif
|
|
if (charstate.next1 == 0)
|
|
{
|
|
mnpstate.next1 = charstate.next2 - 1;
|
|
charstate.next2 = 1;
|
|
}
|
|
else
|
|
{
|
|
mnpstate.next1 = -2;
|
|
charstate.next1 = 1;
|
|
}
|
|
}
|
|
}
|
|
#endif // !defined(SRELLDBG_NO_NEXTPOS_OPT)
|
|
}
|
|
|
|
bool is_exclusive_sequence(const re_quantifier &eq, const state_size_type curno, const state_size_type nextno) // const
|
|
{
|
|
const state_type &curstate = this->NFA_states[curno];
|
|
range_pairs curchar_class;
|
|
range_pairs nextchar_class;
|
|
|
|
if (curstate.type == st_character)
|
|
{
|
|
curchar_class.join(range_pair_helper(curstate.character));
|
|
}
|
|
else if (curstate.type == st_character_class)
|
|
{
|
|
curchar_class = this->character_class[curstate.number];
|
|
if (curchar_class.size() == 0) // Means [], which always makes matching fail.
|
|
return true; // For preventing the automaton from pushing bt data.
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
|
|
const bool canbe0length = gather_nextchars(nextchar_class, nextno, 0u, true);
|
|
|
|
if (nextchar_class.size())
|
|
{
|
|
if (!canbe0length || eq.is_greedy)
|
|
{
|
|
#if !defined(SRELLDBG_NO_SPLITCC)
|
|
|
|
range_pairs kept;
|
|
range_pairs removed;
|
|
|
|
curchar_class.split_ranges(kept, removed, nextchar_class);
|
|
|
|
if (removed.size() == 0) // !curchar_class.is_overlap(nextchar_class)
|
|
return true;
|
|
|
|
if (curstate.type == st_character_class && kept.size() && eq.is_infinity())
|
|
{
|
|
{
|
|
state_type &curstate2 = this->NFA_states[curno];
|
|
|
|
curstate2.character = kept.consists_of_one_character(this->is_icase());
|
|
if (curstate2.character != constants::invalid_u32value)
|
|
curstate2.type = st_character;
|
|
else
|
|
curstate2.number = this->character_class.register_newclass(kept);
|
|
}
|
|
const re_quantifier backupeq(eq);
|
|
|
|
insert_at(nextno, 2);
|
|
state_type &n0 = this->NFA_states[nextno];
|
|
state_type &n1 = this->NFA_states[nextno + 1];
|
|
|
|
n0.reset();
|
|
n0.type = st_epsilon;
|
|
n0.character = meta_char::mc_astrsk;
|
|
n0.quantifier = backupeq;
|
|
// n0.next2 = 1;
|
|
n0.next2 = 2;
|
|
if (!n0.quantifier.is_greedy)
|
|
{
|
|
n0.next1 = n0.next2;
|
|
n0.next2 = 1;
|
|
}
|
|
|
|
n1.reset();
|
|
n1.type = st_character_class;
|
|
|
|
n1.character = removed.consists_of_one_character(this->is_icase());
|
|
if (n1.character != constants::invalid_u32value)
|
|
n1.type = st_character;
|
|
else
|
|
n1.number = this->character_class.register_newclass(removed);
|
|
|
|
n1.next1 = -2;
|
|
// n1.next2 = 0;
|
|
return true;
|
|
}
|
|
|
|
#else // defined(SRELLDBG_NO_SPLITCC)
|
|
|
|
if (!curchar_class.is_overlap(nextchar_class))
|
|
{
|
|
return true;
|
|
}
|
|
|
|
#endif // !defined(SRELLDBG_NO_SPLITCC)
|
|
}
|
|
}
|
|
else if (/* nextchar_class.size() == 0 && */ (!canbe0length || only_success_left(nextno)))
|
|
{
|
|
// (size() == 0 && !canbe0length) means [].
|
|
return eq.is_greedy;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Tells whether, from pos onwards, st_success is reached through
// st_roundbracket_open/close, st_backreference and st_epsilon states only.
//   * st_roundbracket_close / st_backreference: rejected when next2 is set
//     and differs from next1.
//   * st_epsilon with a second branch: that branch is checked recursively.
//   * Any other state type, or a zero next1 before success: rejected.
bool only_success_left(typename state_array::size_type pos) const
{
    for (;;)
    {
        const state_type &node = this->NFA_states[pos];

        if (node.type == st_success)
            return true;

        if (node.type == st_roundbracket_close || node.type == st_backreference)
        {
            if (node.next2 != 0 && node.next1 != node.next2)
                return false;
        }
        else if (node.type == st_epsilon)
        {
            if (node.next2 != 0 && !only_success_left(pos + node.next2))
                return false;
        }
        else if (node.type != st_roundbracket_open) // /a*()/ is fine.
        {
            return false;
        }

        if (node.next1 == 0)
            return false;

        pos += node.next1;
    }
}
|
|
#endif // !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
|
|
// Inserts len fresh st_epsilon states at index pos and keeps the relative
// links of the existing states consistent.
//   * States before pos: a non-zero offset whose target is at or after pos
//     grows by len.
//   * States at or after pos: an offset whose target lies before pos
//     shrinks by len (i.e. a backward jump becomes longer).
// The links are adjusted first; the states are inserted afterwards.
void insert_at(const typename state_array::size_type pos, const std::ptrdiff_t len)
{
    // Forward links that jump over the insertion point.
    for (typename state_array::size_type idx = 0; idx < pos; ++idx)
    {
        state_type &before = this->NFA_states[idx];

        if (before.next1 && (idx + before.next1) >= pos)
            before.next1 += len;

        if (before.next2 && (idx + before.next2) >= pos)
            before.next2 += len;
    }

    // Backward links that jump over the insertion point.
    for (typename state_array::size_type idx = pos; idx < this->NFA_states.size(); ++idx)
    {
        state_type &after = this->NFA_states[idx];

        if ((idx + after.next1) < pos)
            after.next1 -= len;

        if ((idx + after.next2) < pos)
            after.next2 -= len;
    }

    state_type filler;

    filler.reset();
    filler.type = st_epsilon;

    std::ptrdiff_t remaining = len;

    while (remaining > 0)
    {
        this->NFA_states.insert(pos, filler);
        --remaining;
    }
}
|
|
|
|
#if !defined(SRELLDBG_NO_NEXTPOS_OPT)
|
|
#endif // !defined(SRELLDBG_NO_NEXTPOS_OPT)
|
|
|
|
#if !defined(SRELLDBG_NO_BRANCH_OPT) || !defined(SRELLDBG_NO_BRANCH_OPT2)
|
|
|
|
// Looks for a character or character-class state without a second branch,
// starting at pos.
//   charclass: overwritten with the single character or with the class's
//              ranges when such a state is found.
//   pos:       index to start from.
//   strictly:  when false, st_epsilon states with next2 == 0 are stepped
//              over via next1; when true, the state at pos itself must
//              qualify.
// Returns the index of the state found, or 0 when none qualifies.
state_size_type gather_if_char_or_charclass(range_pairs &charclass, state_size_type pos, const bool strictly) const
{
    for (;;)
    {
        const state_type &node = this->NFA_states[pos];

        // Every accepted case requires a state without a second branch.
        if (node.next2 != 0)
            break;

        if (node.type == st_character)
        {
            charclass.set_solerange(range_pair_helper(node.character));
            return pos;
        }

        if (node.type == st_character_class)
        {
            charclass = this->character_class[node.number];
            return pos;
        }

        if (node.type != st_epsilon || strictly)
            break;

        pos += node.next1;
    }
    return 0;
}
|
|
#endif // !defined(SRELLDBG_NO_BRANCH_OPT) || !defined(SRELLDBG_NO_BRANCH_OPT2)
|
|
|
|
#if !defined(SRELLDBG_NO_BRANCH_OPT)
|
|
void branch_optimisation()
|
|
{
|
|
range_pairs nextcharclass1;
|
|
|
|
for (typename state_array::size_type pos = 0; pos < this->NFA_states.size(); ++pos)
|
|
{
|
|
const state_type &state = this->NFA_states[pos];
|
|
|
|
if (state.is_branch())
|
|
{
|
|
const typename state_array::size_type nextcharpos = gather_if_char_or_charclass(nextcharclass1, pos + state.next1, false);
|
|
|
|
if (nextcharpos)
|
|
{
|
|
range_pairs nextcharclass2;
|
|
const bool canbe0length = gather_nextchars(nextcharclass2, pos + state.next2, 0u /* bracket_number */, true);
|
|
|
|
if (!canbe0length && !nextcharclass1.is_overlap(nextcharclass2))
|
|
{
|
|
state_type &branch = this->NFA_states[pos];
|
|
state_type &next1 = this->NFA_states[nextcharpos];
|
|
|
|
next1.next2 = pos + branch.next2 - nextcharpos;
|
|
branch.next2 = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
#endif // !defined(SRELLDBG_NO_BRANCH_OPT)
|
|
|
|
#if !defined(SRELL_NO_ICASE)
|
|
bool check_if_really_needs_icase_search()
|
|
{
|
|
uchar32 u32chars[ucf_constants::rev_maxset];
|
|
|
|
for (typename state_array::size_type i = 0; i < this->NFA_states.size(); ++i)
|
|
{
|
|
const state_type &state = this->NFA_states[i];
|
|
|
|
if (state.type == st_character)
|
|
{
|
|
if (unicode_case_folding::casefoldedcharset(u32chars, state.character) > 1)
|
|
return true;
|
|
}
|
|
else if (state.type == st_backreference)
|
|
return true;
|
|
}
|
|
// this->soflags &= ~regex_constants::icase;
|
|
return false;
|
|
}
|
|
#endif // !defined(SRELL_NO_ICASE)
|
|
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
void setup_bmhdata()
|
|
{
|
|
simple_array<uchar32> u32s;
|
|
|
|
for (typename state_array::size_type i = 1; i < this->NFA_states.size(); ++i)
|
|
{
|
|
const state_type &state = this->NFA_states[i];
|
|
|
|
if (state.type == st_character)
|
|
u32s.push_backncr(state.character);
|
|
else
|
|
{
|
|
u32s.clear();
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (u32s.size() > 1)
|
|
// if ((u32s.size() > 1 && !this->is_ricase()) || (u32s.size() > 2 && this->is_ricase()))
|
|
{
|
|
if (this->bmdata)
|
|
this->bmdata->clear();
|
|
else
|
|
this->bmdata = new re_bmh<charT, utf_traits>;
|
|
|
|
this->bmdata->setup(u32s, this->is_ricase());
|
|
return /* false */;
|
|
}
|
|
|
|
if (this->bmdata)
|
|
delete this->bmdata;
|
|
this->bmdata = NULL;
|
|
// return true;
|
|
}
|
|
#endif // !defined(SRELLDBG_NO_BMH)
|
|
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
void set_charclass_posinfo()
|
|
{
|
|
this->character_class.finalise();
|
|
for (typename state_array::size_type i = 1; i < this->NFA_states.size(); ++i)
|
|
{
|
|
state_type &state = this->NFA_states[i];
|
|
|
|
if (state.type == st_character_class)
|
|
{
|
|
const range_pair &posinfo = this->character_class.charclasspos(state.number);
|
|
state.quantifier.setccpos(posinfo.first, posinfo.second);
|
|
}
|
|
}
|
|
}
|
|
#endif // !defined(SRELLDBG_NO_CCPOS)
|
|
|
|
#if !defined(SRELLDBG_NO_BRANCH_OPT2)
|
|
|
|
void branch_optimisation2()
|
|
{
|
|
range_pairs basealt1stch;
|
|
range_pairs nextalt1stch;
|
|
|
|
for (state_size_type pos = 0; pos < this->NFA_states.size(); ++pos)
|
|
{
|
|
const state_type &curstate = this->NFA_states[pos];
|
|
|
|
if (curstate.is_branch())
|
|
{
|
|
const state_size_type next1pos = pos + curstate.next1;
|
|
state_size_type precharchainpos = pos;
|
|
|
|
if (gather_if_char_or_charclass(basealt1stch, next1pos, true) != 0)
|
|
{
|
|
state_size_type next2pos = precharchainpos + curstate.next2;
|
|
state_size_type postcharchainpos = 0;
|
|
|
|
for (;;)
|
|
{
|
|
state_size_type next2next1pos = next2pos;
|
|
state_type &nstate2 = this->NFA_states[next2pos];
|
|
state_size_type next2next2pos = 0;
|
|
|
|
if (nstate2.is_branch())
|
|
{
|
|
next2next2pos = next2pos + nstate2.next2;
|
|
next2next1pos += nstate2.next1;
|
|
}
|
|
|
|
if (gather_if_char_or_charclass(nextalt1stch, next2next1pos, true) != 0)
|
|
{
|
|
const int relation = basealt1stch.relationship(nextalt1stch);
|
|
|
|
if (relation == 0)
|
|
{
|
|
if (next2next2pos) // if (nstate2.is_branch())
|
|
{
|
|
nstate2.reset();
|
|
nstate2.type = st_epsilon;
|
|
}
|
|
|
|
if (postcharchainpos == 0)
|
|
{
|
|
postcharchainpos = next1pos + 1;
|
|
insert_at(postcharchainpos, 1);
|
|
this->NFA_states[next1pos].next1 = 1;
|
|
}
|
|
else
|
|
{
|
|
const state_size_type prevbranchpos = postcharchainpos;
|
|
|
|
postcharchainpos = prevbranchpos + this->NFA_states[prevbranchpos].next2;
|
|
insert_at(postcharchainpos, 1);
|
|
this->NFA_states[prevbranchpos].next2 = postcharchainpos - prevbranchpos;
|
|
// Fix for bug210423. This line cannot be omitted, because
|
|
// NFA_states[prevbranchpos].next2 has been incremented in insert_at().
|
|
}
|
|
|
|
// if (next2next1pos >= postcharchainpos)
|
|
++next2next1pos;
|
|
|
|
if (precharchainpos >= postcharchainpos)
|
|
++precharchainpos;
|
|
|
|
state_type &prechainbranchpoint = this->NFA_states[precharchainpos];
|
|
if (next2next2pos)
|
|
{
|
|
// if (next2next2pos >= postcharchainpos)
|
|
++next2next2pos;
|
|
prechainbranchpoint.next2 = next2next2pos - precharchainpos;
|
|
}
|
|
else
|
|
{
|
|
prechainbranchpoint.next2 = 0;
|
|
}
|
|
|
|
state_type &newbranchpoint = this->NFA_states[postcharchainpos];
|
|
newbranchpoint.character = meta_char::mc_bar;
|
|
// newbranchpoint.next1 = 1;
|
|
newbranchpoint.next2 = next2next1pos + this->NFA_states[next2next1pos].next1 - postcharchainpos;
|
|
}
|
|
else if (relation == 1)
|
|
{
|
|
break;
|
|
}
|
|
else
|
|
precharchainpos = next2pos;
|
|
}
|
|
else
|
|
{
|
|
// Fix for bug210428.
|
|
// Original: /mm2|m|mm/
|
|
// 1st step: /m(?:m2||m)/ <- No more optimisation can be performed. Must quit.
|
|
// 2nd step: /mm(?:2||)/ <- BUG.
|
|
break;
|
|
}
|
|
|
|
if (next2next2pos == 0)
|
|
break;
|
|
|
|
next2pos = next2next2pos;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
#endif // !defined(SRELLDBG_NO_BRANCH_OPT2)
|
|
|
|
public: // For debug.
|
|
|
|
// Debugging aid (see the "For debug" access section). Only declared here;
// the definition is not part of this section of the file.
void print_NFA_states(const int) const;
|
|
};
|
|
// re_compiler
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_compiler.hpp"]
|
|
// ["regex_sub_match.hpp" ...
|
|
|
|
// 28.9, class template sub_match:
|
|
// Denotes the character sequence matched by one sub-expression: the pair
// [first, second) plus a flag telling whether the sub-expression took part
// in the match at all.
template <class BidirectionalIterator>
class sub_match : public std::pair<BidirectionalIterator, BidirectionalIterator>
{
public:

    typedef typename std::iterator_traits<BidirectionalIterator>::value_type value_type;
    typedef typename std::iterator_traits<BidirectionalIterator>::difference_type difference_type;
    typedef BidirectionalIterator iterator;
    typedef std::basic_string<value_type> string_type;

    // True when [first, second) denotes a matched sequence.
    bool matched;

    // constexpr sub_match(); // C++11.

    // Creates an unmatched sub_match.
    sub_match() : matched(false)
    {
    }

    // Number of elements in the matched sequence; 0 when unmatched.
    difference_type length() const
    {
        if (!matched)
            return 0;

        return std::distance(this->first, this->second);
    }

    // Implicit conversion to a string; same result as str().
    operator string_type() const
    {
        if (!matched)
            return string_type();

        return string_type(this->first, this->second);
    }

    // Copy of the matched sequence; empty when unmatched.
    string_type str() const
    {
        if (!matched)
            return string_type();

        return string_type(this->first, this->second);
    }

    // Three-way comparison of str() with another sub_match's str().
    int compare(const sub_match &s) const
    {
        const string_type mine(str());

        return mine.compare(s.str());
    }

    // Three-way comparison of str() with a string.
    int compare(const string_type &s) const
    {
        const string_type mine(str());

        return mine.compare(s);
    }

    // Three-way comparison of str() with a null-terminated sequence.
    int compare(const value_type *const s) const
    {
        const string_type mine(str());

        return mine.compare(s);
    }
};
|
|
|
|
// 28.9.2, sub_match non-member operators:
|
|
// [7.9.2] sub_match non-member operators
|
|
|
|
// Compares sub_match & with sub_match &.
|
|
template <class BiIter>
|
|
bool operator==(const sub_match<BiIter> &lhs, const sub_match<BiIter> &rhs)
|
|
{
|
|
return lhs.compare(rhs) == 0; // 1
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator!=(const sub_match<BiIter> &lhs, const sub_match<BiIter> &rhs)
|
|
{
|
|
return lhs.compare(rhs) != 0; // 2
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator<(const sub_match<BiIter> &lhs, const sub_match<BiIter> &rhs)
|
|
{
|
|
return lhs.compare(rhs) < 0; // 3
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator<=(const sub_match<BiIter> &lhs, const sub_match<BiIter> &rhs)
|
|
{
|
|
return lhs.compare(rhs) <= 0; // 4
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator>=(const sub_match<BiIter> &lhs, const sub_match<BiIter> &rhs)
|
|
{
|
|
return lhs.compare(rhs) >= 0; // 5
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator>(const sub_match<BiIter> &lhs, const sub_match<BiIter> &rhs)
|
|
{
|
|
return lhs.compare(rhs) > 0; // 6
|
|
}
|
|
|
|
// Compares basic_string & with sub_match &.
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator==(
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return rhs.compare(lhs.c_str()) == 0; // 7
|
|
}
|
|
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator!=(
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return !(lhs == rhs); // 8
|
|
}
|
|
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator<(
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return rhs.compare(lhs.c_str()) > 0; // 9
|
|
}
|
|
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator>(
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return rhs < lhs; // 10
|
|
}
|
|
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator>=(
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return !(lhs < rhs); // 11
|
|
}
|
|
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator<=(
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return !(rhs < lhs); // 12
|
|
}
|
|
|
|
// Compares sub_match & with basic_string &.
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator==(
|
|
const sub_match<BiIter> &lhs,
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &rhs
|
|
)
|
|
{
|
|
return lhs.compare(rhs.c_str()) == 0; // 13
|
|
}
|
|
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator!=(
|
|
const sub_match<BiIter> &lhs,
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &rhs
|
|
)
|
|
{
|
|
return !(lhs == rhs); // 14
|
|
}
|
|
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator<(
|
|
const sub_match<BiIter> &lhs,
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &rhs
|
|
)
|
|
{
|
|
return lhs.compare(rhs.c_str()) < 0; // 15
|
|
}
|
|
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator>(
|
|
const sub_match<BiIter> &lhs,
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &rhs
|
|
)
|
|
{
|
|
return rhs < lhs; // 16
|
|
}
|
|
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator>=(
|
|
const sub_match<BiIter> &lhs,
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &rhs
|
|
)
|
|
{
|
|
return !(lhs < rhs); // 17
|
|
}
|
|
|
|
template <class BiIter, class ST, class SA>
|
|
bool operator<=(
|
|
const sub_match<BiIter> &lhs,
|
|
const std::basic_string<typename std::iterator_traits<BiIter>::value_type, ST, SA> &rhs
|
|
)
|
|
{
|
|
return !(rhs < lhs); // 18
|
|
}
|
|
|
|
// Compares iterator_traits::value_type * with sub_match &.
|
|
template <class BiIter>
|
|
bool operator==(
|
|
typename std::iterator_traits<BiIter>::value_type const *lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return rhs.compare(lhs) == 0; // 19
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator!=(
|
|
typename std::iterator_traits<BiIter>::value_type const *lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return !(lhs == rhs); // 20
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator<(
|
|
typename std::iterator_traits<BiIter>::value_type const *lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return rhs.compare(lhs) > 0; // 21
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator>(
|
|
typename std::iterator_traits<BiIter>::value_type const *lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return rhs < lhs; // 22
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator>=(
|
|
typename std::iterator_traits<BiIter>::value_type const *lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return !(lhs < rhs); // 23
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator<=(
|
|
typename std::iterator_traits<BiIter>::value_type const *lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return !(rhs < lhs); // 24
|
|
}
|
|
|
|
// Compares sub_match & with iterator_traits::value_type *.
|
|
template <class BiIter>
|
|
bool operator==(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const *rhs
|
|
)
|
|
{
|
|
return lhs.compare(rhs) == 0; // 25
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator!=(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const *rhs
|
|
)
|
|
{
|
|
return !(lhs == rhs); // 26
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator<(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const *rhs
|
|
)
|
|
{
|
|
return lhs.compare(rhs) < 0; // 27
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator>(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const *rhs
|
|
)
|
|
{
|
|
return rhs < lhs; // 28
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator>=(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const *rhs
|
|
)
|
|
{
|
|
return !(lhs < rhs); // 29
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator<=(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const *rhs
|
|
)
|
|
{
|
|
return !(rhs < lhs); // 30
|
|
}
|
|
|
|
// Compares iterator_traits::value_type & with sub_match &.
|
|
template <class BiIter>
|
|
bool operator==(
|
|
typename std::iterator_traits<BiIter>::value_type const &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return rhs.compare(typename sub_match<BiIter>::string_type(1, lhs)) == 0; // 31
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator!=(
|
|
typename std::iterator_traits<BiIter>::value_type const &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return !(lhs == rhs); // 32
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator<(
|
|
typename std::iterator_traits<BiIter>::value_type const &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return rhs.compare(typename sub_match<BiIter>::string_type(1, lhs)) > 0; // 33
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator>(
|
|
typename std::iterator_traits<BiIter>::value_type const &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return rhs < lhs; // 34
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator>=(
|
|
typename std::iterator_traits<BiIter>::value_type const &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return !(lhs < rhs); // 35
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator<=(
|
|
typename std::iterator_traits<BiIter>::value_type const &lhs,
|
|
const sub_match<BiIter> &rhs
|
|
)
|
|
{
|
|
return !(rhs < lhs); // 36
|
|
}
|
|
|
|
// Compares sub_match & with iterator_traits::value_type &.
|
|
template <class BiIter>
|
|
bool operator==(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const &rhs
|
|
)
|
|
{
|
|
return lhs.compare(typename sub_match<BiIter>::string_type(1, rhs)) == 0; // 37
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator!=(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const &rhs
|
|
)
|
|
{
|
|
return !(lhs == rhs); // 38
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator<(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const &rhs
|
|
)
|
|
{
|
|
return lhs.compare(typename sub_match<BiIter>::string_type(1, rhs)) < 0; // 39
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator>(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const &rhs
|
|
)
|
|
{
|
|
return rhs < lhs; // 40
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator>=(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const &rhs
|
|
)
|
|
{
|
|
return !(lhs < rhs); // 41
|
|
}
|
|
|
|
template <class BiIter>
|
|
bool operator<=(
|
|
const sub_match<BiIter> &lhs,
|
|
typename std::iterator_traits<BiIter>::value_type const &rhs
|
|
)
|
|
{
|
|
return !(rhs < lhs); // 42
|
|
}
|
|
|
|
template <class charT, class ST, class BiIter>
|
|
std::basic_ostream<charT, ST> &operator<<(std::basic_ostream<charT, ST> &os, const sub_match<BiIter> &m)
|
|
{
|
|
return (os << m.str());
|
|
}
|
|
|
|
typedef sub_match<const char *> csub_match;
|
|
typedef sub_match<const wchar_t *> wcsub_match;
|
|
typedef sub_match<std::string::const_iterator> ssub_match;
|
|
typedef sub_match<std::wstring::const_iterator> wssub_match;
|
|
|
|
#if defined(SRELL_CPP11_CHAR1632_ENABLED)
|
|
typedef sub_match<const char16_t *> u16csub_match;
|
|
typedef sub_match<const char32_t *> u32csub_match;
|
|
typedef sub_match<std::u16string::const_iterator> u16ssub_match;
|
|
typedef sub_match<std::u32string::const_iterator> u32ssub_match;
|
|
#endif
|
|
|
|
#if defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
typedef sub_match<const char8_t *> u8csub_match;
|
|
#endif
|
|
#if defined(SRELL_CPP20_CHAR8_ENABLED) && SRELL_CPP20_CHAR8_ENABLED >= 2
|
|
typedef sub_match<std::u8string::const_iterator> u8ssub_match;
|
|
#endif
|
|
|
|
typedef csub_match u8ccsub_match;
|
|
typedef ssub_match u8cssub_match;
|
|
#if !defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
typedef u8ccsub_match u8csub_match;
|
|
#endif
|
|
#if !defined(SRELL_CPP20_CHAR8_ENABLED) || SRELL_CPP20_CHAR8_ENABLED < 2
|
|
typedef u8cssub_match u8ssub_match;
|
|
#endif
|
|
|
|
#if defined(WCHAR_MAX)
|
|
#if WCHAR_MAX >= 0x10ffff
|
|
typedef wcsub_match u32wcsub_match;
|
|
typedef wssub_match u32wssub_match;
|
|
typedef u32wcsub_match u1632wcsub_match;
|
|
typedef u32wssub_match u1632wssub_match;
|
|
#elif WCHAR_MAX >= 0xffff
|
|
typedef wcsub_match u16wcsub_match;
|
|
typedef wssub_match u16wssub_match;
|
|
typedef u16wcsub_match u1632wcsub_match;
|
|
typedef u16wssub_match u1632wssub_match;
|
|
#endif
|
|
#endif
|
|
|
|
// ... "regex_sub_match.hpp"]
|
|
// ["regex_match_results.hpp" ...
|
|
|
|
// 28.10, class template match_results:
|
|
// Holds the outcome of a regex search: one sub_match per bracket (element 0
// is the whole match), the prefix and suffix around the match, and, unless
// SRELL_NO_NAMEDCAPTURE is defined, the group-name table used to resolve
// named captures.
template <class BidirectionalIterator, class Allocator = std::allocator<sub_match<BidirectionalIterator> > >
class match_results
{
public:

	typedef sub_match<BidirectionalIterator> value_type;
	typedef const value_type & const_reference;
	typedef const_reference reference;
//	typedef implementation defined const_iterator;
	typedef typename std::vector<value_type, Allocator>::const_iterator const_iterator;
	typedef const_iterator iterator;
	typedef typename std::iterator_traits<BidirectionalIterator>::difference_type difference_type;

#if defined(__cplusplus) && __cplusplus >= 201103L
	typedef typename std::allocator_traits<Allocator>::size_type size_type;
#else
	typedef typename Allocator::size_type size_type;	//  TR1.
#endif

	typedef Allocator allocator_type;
	typedef typename std::iterator_traits<BidirectionalIterator>::value_type char_type;
	typedef std::basic_string<char_type> string_type;

public:

	//  28.10.1, construct/copy/destroy:
	//  [7.10.1] construct/copy/destroy

	// base_ is value-initialised so that copying a freshly constructed
	// object never reads an indeterminate iterator.
	explicit match_results(const Allocator &a = Allocator()) : ready_(false), sub_matches_(a), base_()
	{
	}

	match_results(const match_results &m)
	{
		operator=(m);
	}

#if defined(SRELL_CPP11_MOVE_ENABLED)
	match_results(match_results &&m) SRELL_NOEXCEPT
	{
		operator=(std::move(m));
	}
#endif

	// Copies every result member; sstate_ is deliberately left untouched.
	match_results &operator=(const match_results &m)
	{
		if (this != &m)
		{
//			this->sstate_ = m.sstate_;
			this->ready_ = m.ready_;
			this->sub_matches_ = m.sub_matches_;
			this->prefix_ = m.prefix_;
			this->suffix_ = m.suffix_;
			this->base_ = m.base_;
#if !defined(SRELL_NO_NAMEDCAPTURE)
			this->gnames_ = m.gnames_;
#endif
		}
		return *this;
	}

#if defined(SRELL_CPP11_MOVE_ENABLED)
	// Move counterpart of the copy assignment; sstate_ is left untouched.
	match_results &operator=(match_results &&m) SRELL_NOEXCEPT
	{
		if (this != &m)
		{
//			this->sstate_ = std::move(m.sstate_);
			this->ready_ = m.ready_;
			this->sub_matches_ = std::move(m.sub_matches_);
			this->prefix_ = std::move(m.prefix_);
			this->suffix_ = std::move(m.suffix_);
			this->base_ = m.base_;
#if !defined(SRELL_NO_NAMEDCAPTURE)
			this->gnames_ = std::move(m.gnames_);
#endif
		}
		return *this;
	}
#endif

//	~match_results();

	//  28.10.2, state:
	bool ready() const
	{
		return ready_;
	}

	//  28.10.3, size:
	//  [7.10.2] size
	size_type size() const
	{
		return sub_matches_.size();
	}

	size_type max_size() const
	{
		return sub_matches_.max_size();
//		return static_cast<size_type>(~0) / sizeof (value_type);
	}

	bool empty() const
	{
		return size() == 0;
	}

	//  28.10.4, element access:
	//  [7.10.3] element access
	difference_type length(const size_type sub = 0) const
	{
		return (*this)[sub].length();
	}

	// Distance from base_ to the start of sub-match number sub.
	difference_type position(const size_type sub = 0) const
	{
		const_reference ref = (*this)[sub];

		return std::distance(base_, ref.first);
	}

	string_type str(const size_type sub = 0) const
	{
		return string_type((*this)[sub]);
	}

	// With SRELL_STRICT_IMPL an out-of-range n yields unmatched_; otherwise
	// n is passed to the vector unchecked.
	const_reference operator[](const size_type n) const
	{
#if defined(SRELL_STRICT_IMPL)
		return n < sub_matches_.size() ? sub_matches_[n] : unmatched_;
#else
		return sub_matches_[n];
#endif
	}

#if !defined(SRELL_NO_NAMEDCAPTURE)

	//  Helpers for overload resolution of the integer literal 0 of signed types.
	template <typename IntegerType>
	difference_type length(const IntegerType zero) const
	{
		return length(static_cast<size_type>(zero));
	}
	template <typename IntegerType>
	difference_type position(const IntegerType zero) const
	{
		return position(static_cast<size_type>(zero));
	}
	template <typename IntegerType>
	string_type str(const IntegerType zero) const
	{
		return str(static_cast<size_type>(zero));
	}
	template <typename IntegerType>
	const_reference operator[](const IntegerType zero) const
	{
		return operator[](static_cast<size_type>(zero));
	}

	// Access by group name given as a string_type. An unknown name makes
	// operator[] throw regex_error(error_backref).
	difference_type length(const string_type &sub) const
	{
		return (*this)[sub].length();
	}

	difference_type position(const string_type &sub) const
	{
		const_reference ref = (*this)[sub];

		return std::distance(base_, ref.first);
	}

	string_type str(const string_type &sub) const
	{
		return string_type((*this)[sub]);
	}

	const_reference operator[](const string_type &sub) const
	{
		return sub_matches_[lookup_and_check_backref_number(sub.c_str(), sub.c_str() + sub.size())];
	}

	// Access by group name given as a NUL-terminated character string.
	difference_type length(const char_type *sub) const
	{
		return (*this)[sub].length();
	}

	difference_type position(const char_type *sub) const
	{
		const_reference ref = (*this)[sub];

		return std::distance(base_, ref.first);
	}

	string_type str(const char_type *sub) const
	{
		return string_type((*this)[sub]);
	}

	const_reference operator[](const char_type *sub) const
	{
		return sub_matches_[lookup_and_check_backref_number(sub, sub + std::char_traits<char_type>::length(sub))];
	}

#endif	//  !defined(SRELL_NO_NAMEDCAPTURE)

	const_reference prefix() const
	{
		return prefix_;
	}

	const_reference suffix() const
	{
		return suffix_;
	}

	const_iterator begin() const
	{
		return sub_matches_.begin();
	}

	const_iterator end() const
	{
		return sub_matches_.end();
	}

	const_iterator cbegin() const
	{
		return sub_matches_.begin();
	}

	const_iterator cend() const
	{
		return sub_matches_.end();
	}

	//  28.10.5, format:
	//  [7.10.4] format

	// Expands the format string [fmt_first, fmt_last) into out.
	// Recognised sequences: $& (whole match), $` (prefix), $' (suffix),
	// $<name> (named group, only when group names exist), $n / $nn
	// (numbered group) and $$ (a literal '$'). Anything else is copied
	// through unchanged. Nothing is written unless the object is ready and
	// non-empty. The flags argument is ignored.
	template <class OutputIter>
	OutputIter format(
		OutputIter out,
		const char_type *fmt_first,
		const char_type *const fmt_last,
		regex_constants::match_flag_type /* flags */ = regex_constants::format_default
	) const
	{
		if (this->ready() && !this->empty())
		{
#if !defined(SRELL_NO_NAMEDCAPTURE)
			const bool no_groupnames = gnames_.size() == 0;
#endif
			const value_type &m0 = (*this)[0];

			while (fmt_first != fmt_last)
			{
				if (*fmt_first != static_cast<char_type>(regex_internal::meta_char::mc_dollar))	//  '$'
				{
					*out++ = *fmt_first++;
				}
				else
				{
					++fmt_first;
					if (fmt_first == fmt_last)
					{
						*out++ = regex_internal::meta_char::mc_dollar;	//  '$';
					}
					else if (*fmt_first == static_cast<char_type>(regex_internal::char_other::co_amp))	//  '&', $&
					{
						out = std::copy(m0.first, m0.second, out);
						++fmt_first;
					}
					else if (*fmt_first == static_cast<char_type>(regex_internal::char_other::co_grav))	//  '`', $`, prefix.
					{
						out = std::copy(this->prefix().first, this->prefix().second, out);
						++fmt_first;
					}
					else if (*fmt_first == static_cast<char_type>(regex_internal::char_other::co_apos))	//  '\'', $', suffix.
					{
						out = std::copy(this->suffix().first, this->suffix().second, out);
						++fmt_first;
					}
#if !defined(SRELL_NO_NAMEDCAPTURE)
					else if (*fmt_first == static_cast<char_type>(regex_internal::meta_char::mc_lt) && !no_groupnames)	//  '<', $<
					{
						const char_type *const current_backup = fmt_first;
						bool replaced = false;

						++fmt_first;

						const char_type *const name_begin = fmt_first;

						// Look for the closing '>' without ever dereferencing
						// fmt_last: an unterminated "$<name" is left as
						// literal text.
						for (; fmt_first != fmt_last; ++fmt_first)
						{
							if (*fmt_first == static_cast<char_type>(regex_internal::meta_char::mc_gt))
							{
								const regex_internal::uint_l32 backref_number = lookup_backref_number(name_begin, fmt_first);

								if (backref_number != regex_internal::groupname_mapper<char_type>::notfound)
								{
									const value_type &mn = (*this)[backref_number];

									if (mn.matched)
										out = std::copy(mn.first, mn.second, out);
//									replaced = true;
								}
								// An unknown name is still consumed; it
								// expands to nothing.
								replaced = true;
								++fmt_first;
								break;
							}
						}
						if (!replaced)
						{
							fmt_first = current_backup;
							*out++ = regex_internal::meta_char::mc_dollar;	//  '$';
						}
					}
#endif	//  !defined(SRELL_NO_NAMEDCAPTURE)
					else
					{
						const char_type *const backup_pos = fmt_first;
						size_type backref_number = 0;

						if (fmt_first != fmt_last && *fmt_first >= static_cast<char_type>(regex_internal::char_alnum::ch_0) && *fmt_first <= static_cast<char_type>(regex_internal::char_alnum::ch_9))	//  '0'-'9'
						{
							backref_number += *fmt_first - regex_internal::char_alnum::ch_0;	//  '0';

							if (++fmt_first != fmt_last && *fmt_first >= static_cast<char_type>(regex_internal::char_alnum::ch_0) && *fmt_first <= static_cast<char_type>(regex_internal::char_alnum::ch_9))	//  '0'-'9'
							{
								backref_number *= 10;
								backref_number += *fmt_first - regex_internal::char_alnum::ch_0;	//  '0';
								++fmt_first;
							}
						}

						if (backref_number && backref_number < this->size())
						{
							const value_type &mn = (*this)[backref_number];

							if (mn.matched)
								out = std::copy(mn.first, mn.second, out);
						}
						else
						{
							// Not a usable group number: emit the '$' and
							// rescan from the character after it. A second
							// '$' ("$$") is swallowed.
							*out++ = regex_internal::meta_char::mc_dollar;	//  '$';

							fmt_first = backup_pos;
							if (*fmt_first == static_cast<char_type>(regex_internal::meta_char::mc_dollar))
								++fmt_first;
						}
					}
				}
			}
		}
		return out;
	}

	// Same as above, taking the format text from a basic_string.
	template <class OutputIter, class ST, class SA>
	OutputIter format(
		OutputIter out,
		const std::basic_string<char_type, ST, SA> &fmt,
		regex_constants::match_flag_type flags = regex_constants::format_default
	) const
	{
		return format(out, fmt.data(), fmt.data() + fmt.size(), flags);
	}

	// Returns the expansion of fmt as a new string of the same type as fmt.
	// The parameter is spelled with ST and SA so that both can be deduced
	// from the argument; with a plain string_type parameter they could not
	// be, and the overload was unusable without explicit template arguments.
	template <class ST, class SA>
	std::basic_string<char_type, ST, SA> format(
		const std::basic_string<char_type, ST, SA> &fmt,
		regex_constants::match_flag_type flags = regex_constants::format_default
	) const
	{
		std::basic_string<char_type, ST, SA> result;

//		format(std::back_insert_iterator<string_type>(result), fmt, flags);
		format(std::back_inserter(result), fmt, flags);
		return result;
	}

	// Returns the expansion of the NUL-terminated format text fmt.
	string_type format(const char_type *fmt, regex_constants::match_flag_type flags = regex_constants::format_default) const
	{
		string_type result;

		format(std::back_inserter(result), fmt, fmt + std::char_traits<char_type>::length(fmt), flags);
		return result;
	}

	//  28.10.6, allocator:
	//  [7.10.5] allocator

	// Returns a copy of the allocator held by the sub-match container, not a
	// default-constructed one.
	allocator_type get_allocator() const
	{
		return sub_matches_.get_allocator();
	}

	//  28.10.7, swap:
	//  [7.10.6] swap

	// Exchanges the result members of *this and that through a temporary
	// copy. sstate_ is not exchanged because operator= does not copy it.
	void swap(match_results &that)
	{
		const match_results tmp(that);
		that = *this;
		*this = tmp;
	}

public:	//  For internal.

	typedef match_results<BidirectionalIterator> match_results_type;
	typedef typename match_results_type::size_type match_results_size_type;
	typedef typename regex_internal::re_search_state</*charT, */BidirectionalIterator> search_state_type;

	// Working state of the search that fills this object.
	search_state_type sstate_;

	// Returns the object to the not-ready state with no sub-matches.
	void clear_()
	{
		ready_ = false;
		sub_matches_.clear();
//		prefix_.matched = false;
//		suffix_.matched = false;
#if !defined(SRELL_NO_NAMEDCAPTURE)
		gnames_.clear();
#endif
	}

	// Copies the bracket positions recorded in sstate_ into the public
	// members after a successful automaton search. Always returns true.
//	template <typename charT>
#if !defined(SRELL_NO_NAMEDCAPTURE)
	bool set_match_results_(const regex_internal::groupname_mapper<char_type> &gnames)
#else
	bool set_match_results_()
#endif
	{
		sub_matches_.resize(sstate_.bracket.size());
//		value_type &m0 = sub_matches_[0];

		sub_matches_[0].matched = true;

		for (regex_internal::uint_l32 i = 1; i < static_cast<regex_internal::uint_l32>(sstate_.bracket.size()); ++i)
		{
			const typename search_state_type::submatch_type &br = sstate_.bracket[i];
			value_type &sm = sub_matches_[i];

			sm.first  = br.core.open_at;
			sm.second = br.core.close_at;
			sm.matched = br.counter != 0;
		}

		base_ = sstate_.lblim;
		prefix_.first = sstate_.srchbegin;
		prefix_.second = sub_matches_[0].first = sstate_.bracket[0].core.open_at;
		suffix_.first = sub_matches_[0].second = sstate_.nth.in_string;
		suffix_.second = sstate_.srchend;

		prefix_.matched = prefix_.first != prefix_.second;	//  The spec says prefix().first != prefix().second
		suffix_.matched = suffix_.first != suffix_.second;	//  The spec says suffix().first != suffix().second

#if !defined(SRELL_NO_NAMEDCAPTURE)
		gnames_ = gnames;
#endif
		ready_ = true;
		return true;
	}

	// Variant used when the match was found by the bmdata search path: only
	// sub-match 0 exists, spanning [sstate_.nth.in_string, sstate_.nextpos).
	// Always returns true.
	bool set_match_results_bmh_()
	{
		sub_matches_.resize(1);
//		value_type &m0 = sub_matches_[0];

		sub_matches_[0].matched = true;

		base_ = sstate_.lblim;
		prefix_.first = sstate_.srchbegin;
		prefix_.second = sub_matches_[0].first = sstate_.nth.in_string;
		suffix_.first = sub_matches_[0].second = sstate_.nextpos;
		suffix_.second = sstate_.srchend;

		prefix_.matched = prefix_.first != prefix_.second;
		suffix_.matched = suffix_.first != suffix_.second;

		ready_ = true;
		return true;
	}

	void set_prefix_first_(const BidirectionalIterator pf)
	{
		prefix_.first = pf;
	}

	// Marks the object ready after an unsuccessful search. Always returns false.
	bool mark_as_failed_()
	{
		ready_ = true;	//  30.11.2 and 3: Postconditions: m.ready() == true in all cases.
		return false;
	}

private:

#if !defined(SRELL_NO_NAMEDCAPTURE)

	// Looks the name [begin, end) up in gnames_ and returns whatever the
	// mapper yields (groupname_mapper::notfound is tested for by callers).
	regex_internal::uint_l32 lookup_backref_number(const char_type *begin, const char_type *const end) const
	{
		typename regex_internal::groupname_mapper<char_type>::gname_string key(end - begin);

		for (std::size_t i = 0; begin != end; ++begin, ++i)
			key[i] = *begin;

		return gnames_[key];
	}

	// As lookup_backref_number(), but throws regex_error(error_backref) when
	// the name is not found.
	regex_internal::uint_l32 lookup_and_check_backref_number(const char_type *begin, const char_type *const end) const
	{
		const regex_internal::uint_l32 backrefno = lookup_backref_number(begin, end);

		if (backrefno == regex_internal::groupname_mapper<char_type>::notfound)
			throw regex_error(regex_constants::error_backref);

		return backrefno;
	}

#endif	//  !defined(SRELL_NO_NAMEDCAPTURE)

public:	//  For debug.

	template <typename BasicRegexT>
	void print_sub_matches(const BasicRegexT &, const int) const;
	void print_addresses(const value_type &, const char *const) const;

private:

	typedef std::vector<value_type, Allocator> sub_match_array;

	bool ready_;
	sub_match_array sub_matches_;
	value_type prefix_;
	value_type suffix_;
	BidirectionalIterator base_;

#if !defined(SRELL_NO_NAMEDCAPTURE)
	regex_internal::groupname_mapper<char_type> gnames_;
#endif
#if defined(SRELL_STRICT_IMPL)
	value_type unmatched_;
#endif
};
|
|
|
|
// 28.10.7, match_results swap:
|
|
// [7.10.6] match_results swap
|
|
template <class BidirectionalIterator, class Allocator>
|
|
void swap(
|
|
match_results<BidirectionalIterator, Allocator> &m1,
|
|
match_results<BidirectionalIterator, Allocator> &m2
|
|
)
|
|
{
|
|
m1.swap(m2);
|
|
}
|
|
|
|
// 28.10.8, match_results comparisons
|
|
template <class BidirectionalIterator, class Allocator>
|
|
bool operator==(
|
|
const match_results<BidirectionalIterator, Allocator> &m1,
|
|
const match_results<BidirectionalIterator, Allocator> &m2
|
|
)
|
|
{
|
|
if (!m1.ready() && !m2.ready())
|
|
return true;
|
|
|
|
if (m1.ready() && m2.ready())
|
|
{
|
|
if (m1.empty() && m2.empty())
|
|
return true;
|
|
|
|
if (!m1.empty() && !m2.empty())
|
|
{
|
|
return m1.prefix() == m2.prefix() && m1.size() == m2.size() && std::equal(m1.begin(), m1.end(), m2.begin()) && m1.suffix() == m2.suffix();
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
template <class BidirectionalIterator, class Allocator>
|
|
bool operator!=(
|
|
const match_results<BidirectionalIterator, Allocator> &m1,
|
|
const match_results<BidirectionalIterator, Allocator> &m2
|
|
)
|
|
{
|
|
return !(m1 == m2);
|
|
}
|
|
|
|
typedef match_results<const char *> cmatch;
|
|
typedef match_results<const wchar_t *> wcmatch;
|
|
typedef match_results<std::string::const_iterator> smatch;
|
|
typedef match_results<std::wstring::const_iterator> wsmatch;
|
|
|
|
#if defined(SRELL_CPP11_CHAR1632_ENABLED)
|
|
typedef match_results<const char16_t *> u16cmatch;
|
|
typedef match_results<const char32_t *> u32cmatch;
|
|
typedef match_results<std::u16string::const_iterator> u16smatch;
|
|
typedef match_results<std::u32string::const_iterator> u32smatch;
|
|
#endif
|
|
|
|
#if defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
typedef match_results<const char8_t *> u8cmatch;
|
|
#endif
|
|
#if defined(SRELL_CPP20_CHAR8_ENABLED) && SRELL_CPP20_CHAR8_ENABLED >= 2
|
|
typedef match_results<std::u8string::const_iterator> u8smatch;
|
|
#endif
|
|
|
|
typedef cmatch u8ccmatch;
|
|
typedef smatch u8csmatch;
|
|
#if !defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
typedef u8ccmatch u8cmatch;
|
|
#endif
|
|
#if !defined(SRELL_CPP20_CHAR8_ENABLED) || SRELL_CPP20_CHAR8_ENABLED < 2
|
|
typedef u8csmatch u8smatch;
|
|
#endif
|
|
|
|
#if defined(WCHAR_MAX)
|
|
#if WCHAR_MAX >= 0x10ffff
|
|
typedef wcmatch u32wcmatch;
|
|
typedef wsmatch u32wsmatch;
|
|
typedef u32wcmatch u1632wcmatch;
|
|
typedef u32wsmatch u1632wsmatch;
|
|
#elif WCHAR_MAX >= 0xffff
|
|
typedef wcmatch u16wcmatch;
|
|
typedef wsmatch u16wsmatch;
|
|
typedef u16wcmatch u1632wcmatch;
|
|
typedef u16wsmatch u1632wsmatch;
|
|
#endif
|
|
#endif
|
|
|
|
// ... "regex_match_results.hpp"]
|
|
// ["rei_algorithm.hpp" ...
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
template <typename charT, typename traits>
|
|
class regex_object : public re_compiler<charT, traits>
|
|
{
|
|
public:
|
|
|
|
template <typename BidirectionalIterator>
|
|
bool search
|
|
(
|
|
const BidirectionalIterator begin,
|
|
const BidirectionalIterator end,
|
|
const BidirectionalIterator lookbehind_limit,
|
|
match_results<BidirectionalIterator> &results,
|
|
const regex_constants::match_flag_type flags /* = regex_constants::match_default */
|
|
) const
|
|
{
|
|
results.clear_();
|
|
|
|
// results.sstate_.template init<utf_traits>(begin, end, lookbehind_limit, flags);
|
|
results.sstate_.init(begin, end, lookbehind_limit, flags);
|
|
|
|
if (results.sstate_.match_continuous_flag())
|
|
{
|
|
if (this->NFA_states.size())
|
|
{
|
|
results.sstate_.set_entrypoint(this->NFA_states[0].next_state2);
|
|
goto DO_SEARCH;
|
|
}
|
|
}
|
|
else
|
|
#if !defined(SRELLDBG_NO_BMH)
|
|
if (this->bmdata)
|
|
{
|
|
#if !defined(SRELL_NO_ICASE)
|
|
if (!this->is_ricase() ? this->bmdata->do_casesensitivesearch(results.sstate_, typename std::iterator_traits<BidirectionalIterator>::iterator_category()) : this->bmdata->do_icasesearch(results.sstate_, typename std::iterator_traits<BidirectionalIterator>::iterator_category()))
|
|
#else
|
|
if (this->bmdata->do_casesensitivesearch(results.sstate_, typename std::iterator_traits<BidirectionalIterator>::iterator_category()))
|
|
#endif
|
|
return results.set_match_results_bmh_();
|
|
}
|
|
else
|
|
#endif
|
|
if (this->NFA_states.size())
|
|
{
|
|
results.sstate_.set_entrypoint(this->NFA_states[0].next_state1);
|
|
|
|
DO_SEARCH:
|
|
results.sstate_.init_for_automaton(this->number_of_brackets, this->number_of_counters, this->number_of_repeats);
|
|
|
|
#if !defined(SRELL_NO_ICASE)
|
|
if (!this->is_ricase() ? do_search<false>(results) : do_search<true>(results))
|
|
#else
|
|
if (do_search<false>(results))
|
|
#endif
|
|
{
|
|
#if !defined(SRELL_NO_NAMEDCAPTURE)
|
|
return results.set_match_results_(this->namedcaptures);
|
|
#else
|
|
return results.set_match_results_();
|
|
#endif
|
|
}
|
|
}
|
|
return results.mark_as_failed_();
|
|
}
|
|
|
|
private:
|
|
|
|
typedef typename traits::utf_traits utf_traits;
|
|
|
|
template <const bool icase, typename BidirectionalIterator>
|
|
bool do_search
|
|
(
|
|
match_results<BidirectionalIterator> &results
|
|
) const
|
|
{
|
|
re_search_state</*charT, */BidirectionalIterator> &sstate = results.sstate_;
|
|
const BidirectionalIterator searchend = sstate.nth.in_string;
|
|
|
|
for (;;)
|
|
{
|
|
const bool final = sstate.nextpos == searchend;
|
|
|
|
sstate.nth.in_string = sstate.nextpos;
|
|
|
|
if (!final)
|
|
{
|
|
|
|
#ifdef SRELLDBG_NO_1STCHRCLS
|
|
utf_traits::codepoint_inc(sstate.nextpos, sstate.srchend);
|
|
#else
|
|
{
|
|
#if !defined(SRELLDBG_NO_BITSET)
|
|
if (!this->firstchar_class_bs.test((*sstate.nextpos++) & utf_traits::bitsetmask))
|
|
#else
|
|
const uchar32 firstchar = utf_traits::codepoint_inc(sstate.nextpos, sstate.srchend);
|
|
|
|
if (!this->firstchar_class.is_included(firstchar))
|
|
#endif
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
// Even when final == true, we have to try for such expressions
|
|
// as "" =~ /^$/ or "..." =~ /$/.
|
|
|
|
#if defined(SRELL_NO_LIMIT_COUNTER)
|
|
sstate.reset(/* first */);
|
|
#else
|
|
sstate.reset(/* first, */ this->limit_counter);
|
|
#endif
|
|
if (run_automaton<icase, false>(sstate /* , false */))
|
|
return true;
|
|
|
|
if (final)
|
|
break;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Primary template: canonicalise() hands its argument back unchanged.
template <typename T, const bool>
struct casehelper
{
	static T canonicalise(const T t)
	{
		const T unchanged = t;

		return unchanged;
	}
};
|
|
|
|
template <typename T>
|
|
struct casehelper<T, true>
|
|
{
|
|
static T canonicalise(const T t)
|
|
{
|
|
return unicode_case_folding::do_casefolding(t);
|
|
}
|
|
};
|
|
|
|
template <const bool icase, const bool reverse, typename BidirectionalIterator>
|
|
bool run_automaton
|
|
(
|
|
// match_results<BidirectionalIterator> &results,
|
|
re_search_state</*charT, */BidirectionalIterator> &sstate
|
|
// , const bool is_recursive /* = false */
|
|
) const
|
|
{
|
|
typedef casehelper<uchar32, icase> casehelper_type;
|
|
typedef typename re_object_core<charT, traits>::state_type state_type;
|
|
typedef re_search_state</*charT, */BidirectionalIterator> ss_type;
|
|
// typedef typename ss_type::search_core_state scstate_type;
|
|
typedef typename ss_type::submatch_type submatch_type;
|
|
typedef typename ss_type::submatchcore_type submatchcore_type;
|
|
typedef typename ss_type::counter_type counter_type;
|
|
typedef typename ss_type::position_type position_type;
|
|
bool is_matched;
|
|
|
|
goto START;
|
|
|
|
JUDGE:
|
|
if (is_matched)
|
|
{
|
|
MATCHED:
|
|
sstate.nth.in_NFA_states = sstate.nth.in_NFA_states->next_state1;
|
|
}
|
|
else
|
|
{
|
|
NOT_MATCHED:
|
|
|
|
#if !defined(SRELL_NO_LIMIT_COUNTER)
|
|
if (--sstate.failure_counter)
|
|
{
|
|
#endif
|
|
if (sstate.bt_stack.size() > sstate.btstack_size)
|
|
{
|
|
sstate.nth = sstate.bt_stack.back();
|
|
sstate.bt_stack.pop_back();
|
|
|
|
sstate.nth.in_NFA_states = sstate.nth.in_NFA_states->next_state2;
|
|
// continue;
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
#if !defined(SRELL_NO_LIMIT_COUNTER)
|
|
}
|
|
else
|
|
throw regex_error(regex_constants::error_complexity);
|
|
#endif
|
|
}
|
|
|
|
// START:
|
|
for (;;)
|
|
{
|
|
START:
|
|
const state_type ¤t_NFA = *sstate.nth.in_NFA_states;
|
|
|
|
switch (current_NFA.type)
|
|
{
|
|
case st_character:
|
|
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(push)
|
|
#pragma warning(disable:4127)
|
|
#endif
|
|
if (!reverse)
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(pop)
|
|
#endif
|
|
{
|
|
if (!sstate.is_at_srchend())
|
|
{
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
const BidirectionalIterator prevpos = sstate.nth.in_string;
|
|
#endif
|
|
const uchar32 uchar = casehelper_type::canonicalise(utf_traits::codepoint_inc(sstate.nth.in_string, sstate.srchend));
|
|
RETRY_CF:
|
|
const state_type ¤t_NFA2 = *sstate.nth.in_NFA_states;
|
|
|
|
if (current_NFA2.character == uchar)
|
|
goto MATCHED;
|
|
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
if (current_NFA2.next_state2)
|
|
{
|
|
sstate.nth.in_NFA_states = current_NFA2.next_state2;
|
|
|
|
if (sstate.nth.in_NFA_states->type == st_character)
|
|
goto RETRY_CF;
|
|
|
|
sstate.nth.in_string = prevpos;
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
else if (current_NFA.next_state2)
|
|
{
|
|
sstate.nth.in_NFA_states = current_NFA.next_state2;
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
else // reverse == true.
|
|
{
|
|
if (!sstate.is_at_lookbehindlimit())
|
|
{
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
const BidirectionalIterator prevpos = sstate.nth.in_string;
|
|
#endif
|
|
const uchar32 uchar = casehelper_type::canonicalise(utf_traits::dec_codepoint(sstate.nth.in_string, sstate.lblim));
|
|
RETRY_CB:
|
|
const state_type ¤t_NFA2 = *sstate.nth.in_NFA_states;
|
|
|
|
if (current_NFA2.character == uchar)
|
|
goto MATCHED;
|
|
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
if (current_NFA2.next_state2)
|
|
{
|
|
sstate.nth.in_NFA_states = current_NFA2.next_state2;
|
|
|
|
if (sstate.nth.in_NFA_states->type == st_character)
|
|
goto RETRY_CB;
|
|
|
|
sstate.nth.in_string = prevpos;
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
else if (current_NFA.next_state2)
|
|
{
|
|
sstate.nth.in_NFA_states = current_NFA.next_state2;
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
goto NOT_MATCHED;
|
|
|
|
case st_character_class:
|
|
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(push)
|
|
#pragma warning(disable:4127)
|
|
#endif
|
|
if (!reverse)
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(pop)
|
|
#endif
|
|
{
|
|
if (!sstate.is_at_srchend())
|
|
{
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
const BidirectionalIterator prevpos = sstate.nth.in_string;
|
|
#endif
|
|
const uchar32 uchar = utf_traits::codepoint_inc(sstate.nth.in_string, sstate.srchend);
|
|
// RETRY_CCF:
|
|
const state_type ¤t_NFA2 = *sstate.nth.in_NFA_states;
|
|
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
if (this->character_class.is_included(current_NFA2.quantifier.offset, current_NFA2.quantifier.length, uchar))
|
|
#else
|
|
if (this->character_class.is_included(current_NFA2.number, uchar))
|
|
#endif
|
|
goto MATCHED;
|
|
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
if (current_NFA2.next_state2)
|
|
{
|
|
sstate.nth.in_NFA_states = current_NFA2.next_state2;
|
|
|
|
// if (sstate.nth.in_NFA_states->type == st_character_class)
|
|
// goto RETRY_CCF;
|
|
|
|
sstate.nth.in_string = prevpos;
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
else if (current_NFA.next_state2)
|
|
{
|
|
sstate.nth.in_NFA_states = current_NFA.next_state2;
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
else // reverse == true.
|
|
{
|
|
if (!sstate.is_at_lookbehindlimit())
|
|
{
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
const BidirectionalIterator prevpos = sstate.nth.in_string;
|
|
#endif
|
|
const uchar32 uchar = utf_traits::dec_codepoint(sstate.nth.in_string, sstate.lblim);
|
|
// RETRY_CCB:
|
|
const state_type ¤t_NFA2 = *sstate.nth.in_NFA_states;
|
|
|
|
#if !defined(SRELLDBG_NO_CCPOS)
|
|
if (this->character_class.is_included(current_NFA2.quantifier.offset, current_NFA2.quantifier.length, uchar))
|
|
#else
|
|
if (this->character_class.is_included(current_NFA2.number, uchar))
|
|
#endif
|
|
goto MATCHED;
|
|
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
if (current_NFA2.next_state2)
|
|
{
|
|
sstate.nth.in_NFA_states = current_NFA2.next_state2;
|
|
|
|
// if (sstate.nth.in_NFA_states->type == st_character_class)
|
|
// goto RETRY_CCB;
|
|
|
|
sstate.nth.in_string = prevpos;
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
#if !defined(SRELLDBG_NO_ASTERISK_OPT)
|
|
else if (current_NFA.next_state2)
|
|
{
|
|
sstate.nth.in_NFA_states = current_NFA.next_state2;
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
goto NOT_MATCHED;
|
|
|
|
case st_epsilon:
|
|
|
|
#if defined(SRELLDBG_NO_SKIP_EPSILON)
|
|
if (current_NFA.next_state2)
|
|
#endif
|
|
{
|
|
sstate.bt_stack.push_back(sstate.nth); // sstate.push();
|
|
}
|
|
|
|
sstate.nth.in_NFA_states = current_NFA.next_state1;
|
|
continue;
|
|
|
|
default:
|
|
switch (current_NFA.type)
|
|
{
|
|
|
|
case st_check_counter:
|
|
{
|
|
const uint_l32 counter = sstate.counter[current_NFA.number];
|
|
|
|
if (counter < current_NFA.quantifier.atmost)
|
|
{
|
|
++sstate.counter[current_NFA.number];
|
|
|
|
LOOP_WITHOUT_INCREMENT:
|
|
|
|
if (counter >= current_NFA.quantifier.atleast)
|
|
{
|
|
sstate.bt_stack.push_back(sstate.nth);
|
|
sstate.nth.in_NFA_states = current_NFA.next_state1;
|
|
}
|
|
else
|
|
{
|
|
sstate.nth.in_NFA_states
|
|
= current_NFA.quantifier.is_greedy
|
|
? current_NFA.next_state1
|
|
: current_NFA.next_state2;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (current_NFA.quantifier.is_infinity())
|
|
goto LOOP_WITHOUT_INCREMENT;
|
|
|
|
sstate.nth.in_NFA_states
|
|
= current_NFA.quantifier.is_greedy
|
|
? current_NFA.next_state2
|
|
: current_NFA.next_state1;
|
|
}
|
|
}
|
|
continue;
|
|
|
|
case st_decrement_counter:
|
|
--sstate.counter[current_NFA.number];
|
|
goto NOT_MATCHED;
|
|
|
|
case st_save_and_reset_counter:
|
|
{
|
|
counter_type &c = sstate.counter[current_NFA.number];
|
|
|
|
sstate.counter_stack.push_back(c);
|
|
sstate.bt_stack.push_back(sstate.nth);
|
|
c = 0;
|
|
}
|
|
goto MATCHED;
|
|
|
|
case st_restore_counter:
|
|
sstate.counter[current_NFA.number] = sstate.counter_stack.back();
|
|
sstate.counter_stack.pop_back();
|
|
goto NOT_MATCHED;
|
|
|
|
case st_roundbracket_open: // '(':
|
|
{
|
|
submatch_type &bracket = sstate.bracket[current_NFA.number];
|
|
|
|
sstate.capture_stack.push_back(bracket.core);
|
|
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(push)
|
|
#pragma warning(disable:4127)
|
|
#endif
|
|
if (!reverse)
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(pop)
|
|
#endif
|
|
{
|
|
bracket.core.open_at = sstate.nth.in_string;
|
|
}
|
|
else
|
|
bracket.core.close_at = sstate.nth.in_string;
|
|
|
|
++bracket.counter;
|
|
|
|
for (uint_l32 brno = current_NFA.quantifier.atleast; brno <= current_NFA.quantifier.atmost; ++brno)
|
|
{
|
|
submatch_type &inner_bracket = sstate.bracket[brno];
|
|
|
|
sstate.capture_stack.push_back(inner_bracket.core);
|
|
sstate.counter_stack.push_back(inner_bracket.counter);
|
|
inner_bracket.core.open_at = inner_bracket.core.close_at = sstate.srchend;
|
|
inner_bracket.counter = 0;
|
|
// ECMAScript spec (3-5.1) 15.10.2.5, NOTE 3.
|
|
// ECMAScript 2018 (ES9) 21.2.2.5.1, Note 3.
|
|
}
|
|
|
|
sstate.bt_stack.push_back(sstate.nth);
|
|
}
|
|
goto MATCHED;
|
|
|
|
case st_roundbracket_pop: // '/':
|
|
{
|
|
for (uint_l32 brno = current_NFA.quantifier.atmost; brno >= current_NFA.quantifier.atleast; --brno)
|
|
{
|
|
submatch_type &inner_bracket = sstate.bracket[brno];
|
|
|
|
inner_bracket.counter = sstate.counter_stack.back();
|
|
inner_bracket.core = sstate.capture_stack.back();
|
|
sstate.counter_stack.pop_back();
|
|
sstate.capture_stack.pop_back();
|
|
}
|
|
|
|
submatch_type &bracket = sstate.bracket[current_NFA.number];
|
|
|
|
bracket.core = sstate.capture_stack.back();
|
|
sstate.capture_stack.pop_back();
|
|
--bracket.counter;
|
|
}
|
|
goto NOT_MATCHED;
|
|
|
|
case st_roundbracket_close: // ')':
|
|
{
|
|
submatch_type &bracket = sstate.bracket[current_NFA.number];
|
|
submatchcore_type &brc = bracket.core;
|
|
|
|
if ((!reverse ? brc.open_at : brc.close_at) != sstate.nth.in_string)
|
|
{
|
|
sstate.nth.in_NFA_states = current_NFA.next_state1;
|
|
}
|
|
else // 0 width match, breaks from the loop.
|
|
{
|
|
if (current_NFA.next_state1->type != st_check_counter)
|
|
{
|
|
if (bracket.counter > 1)
|
|
goto NOT_MATCHED; // ECMAScript spec 15.10.2.5, note 4.
|
|
|
|
sstate.nth.in_NFA_states = current_NFA.next_state2;
|
|
// Accepts 0 width match and exits.
|
|
}
|
|
else
|
|
{
|
|
// A pair with check_counter.
|
|
const counter_type counter = sstate.counter[current_NFA.next_state1->number];
|
|
|
|
if (counter > current_NFA.next_state1->quantifier.atleast)
|
|
goto NOT_MATCHED; // Takes a captured string in the previous loop.
|
|
|
|
sstate.nth.in_NFA_states = current_NFA.next_state1;
|
|
// Accepts 0 width match and continues.
|
|
}
|
|
}
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(push)
|
|
#pragma warning(disable:4127)
|
|
#endif
|
|
if (!reverse)
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(pop)
|
|
#endif
|
|
{
|
|
brc.close_at = sstate.nth.in_string;
|
|
}
|
|
else // reverse == true.
|
|
{
|
|
brc.open_at = sstate.nth.in_string;
|
|
}
|
|
}
|
|
continue;
|
|
|
|
case st_repeat_in_push:
|
|
{
|
|
position_type &r = sstate.repeat[current_NFA.number];
|
|
|
|
sstate.repeat_stack.push_back(r);
|
|
r = sstate.nth.in_string;
|
|
|
|
for (uint_l32 brno = current_NFA.quantifier.atleast; brno <= current_NFA.quantifier.atmost; ++brno)
|
|
{
|
|
submatch_type &inner_bracket = sstate.bracket[brno];
|
|
|
|
sstate.capture_stack.push_back(inner_bracket.core);
|
|
sstate.counter_stack.push_back(inner_bracket.counter);
|
|
inner_bracket.core.open_at = inner_bracket.core.close_at = sstate.srchend;
|
|
inner_bracket.counter = 0;
|
|
// ECMAScript 2019 (ES10) 21.2.2.5.1, Note 3.
|
|
}
|
|
sstate.bt_stack.push_back(sstate.nth);
|
|
}
|
|
goto MATCHED;
|
|
|
|
case st_repeat_in_pop:
|
|
for (uint_l32 brno = current_NFA.quantifier.atmost; brno >= current_NFA.quantifier.atleast; --brno)
|
|
{
|
|
submatch_type &inner_bracket = sstate.bracket[brno];
|
|
|
|
inner_bracket.counter = sstate.counter_stack.back();
|
|
inner_bracket.core = sstate.capture_stack.back();
|
|
sstate.counter_stack.pop_back();
|
|
sstate.capture_stack.pop_back();
|
|
}
|
|
|
|
sstate.repeat[current_NFA.number] = sstate.repeat_stack.back();
|
|
sstate.repeat_stack.pop_back();
|
|
goto NOT_MATCHED;
|
|
|
|
case st_check_0_width_repeat:
|
|
if (sstate.nth.in_string != sstate.repeat[current_NFA.number])
|
|
goto MATCHED;
|
|
|
|
sstate.nth.in_NFA_states = current_NFA.next_state2;
|
|
continue;
|
|
|
|
case st_backreference: // '\\':
|
|
{
|
|
const submatch_type &bracket = sstate.bracket[current_NFA.number];
|
|
|
|
if (bracket.counter == 0) // Undefined.
|
|
{
|
|
ESCAPE_FROM_ZERO_WIDTH_MATCH:
|
|
sstate.nth.in_NFA_states = current_NFA.next_state2;
|
|
continue;
|
|
}
|
|
else
|
|
{
|
|
const submatchcore_type &brc = bracket.core;
|
|
|
|
if (brc.open_at == brc.close_at)
|
|
{
|
|
goto ESCAPE_FROM_ZERO_WIDTH_MATCH;
|
|
}
|
|
else
|
|
{
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(push)
|
|
#pragma warning(disable:4127)
|
|
#endif
|
|
if (!reverse)
|
|
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
|
#pragma warning(pop)
|
|
#endif
|
|
{
|
|
for (BidirectionalIterator backrefpos = brc.open_at; backrefpos != brc.close_at;)
|
|
{
|
|
if (!sstate.is_at_srchend())
|
|
{
|
|
const uchar32 uchartxt = utf_traits::codepoint_inc(sstate.nth.in_string, sstate.srchend);
|
|
const uchar32 ucharref = utf_traits::codepoint_inc(backrefpos, brc.close_at);
|
|
|
|
if (casehelper_type::canonicalise(uchartxt) == casehelper_type::canonicalise(ucharref))
|
|
continue;
|
|
}
|
|
goto NOT_MATCHED;
|
|
}
|
|
}
|
|
else // reverse == true.
|
|
{
|
|
for (BidirectionalIterator backrefpos = brc.close_at; backrefpos != brc.open_at;)
|
|
{
|
|
if (!sstate.is_at_lookbehindlimit())
|
|
{
|
|
const uchar32 uchartxt = utf_traits::dec_codepoint(sstate.nth.in_string, sstate.lblim);
|
|
const uchar32 ucharref = utf_traits::dec_codepoint(backrefpos, brc.open_at);
|
|
|
|
if (casehelper_type::canonicalise(uchartxt) == casehelper_type::canonicalise(ucharref))
|
|
continue;
|
|
}
|
|
goto NOT_MATCHED;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
goto MATCHED;
|
|
|
|
case st_lookaround_open:
|
|
{
|
|
for (uint_l32 i = 1; i < this->number_of_brackets; ++i)
|
|
{
|
|
const submatch_type &sm = sstate.bracket[i];
|
|
sstate.capture_stack.push_back(sm.core);
|
|
sstate.counter_stack.push_back(sm.counter);
|
|
}
|
|
|
|
for (uint_l32 i = 0; i < this->number_of_counters; ++i)
|
|
sstate.counter_stack.push_back(sstate.counter[i]);
|
|
|
|
for (uint_l32 i = 0; i < this->number_of_repeats; ++i)
|
|
sstate.repeat_stack.push_back(sstate.repeat[i]);
|
|
|
|
const typename ss_type::bottom_state backup_bottom(sstate.btstack_size, sstate.capture_stack.size(), sstate.counter_stack.size(), sstate.repeat_stack.size());
|
|
const BidirectionalIterator orgpos = sstate.nth.in_string;
|
|
|
|
sstate.btstack_size = sstate.bt_stack.size();
|
|
|
|
#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) && !defined(SRELLDBG_NO_MPREWINDER)
|
|
if (current_NFA.quantifier.atleast == 2)
|
|
{
|
|
sstate.repeat_stack.push_back(sstate.lblim);
|
|
sstate.lblim = sstate.srchbegin;
|
|
}
|
|
#endif
|
|
|
|
#if defined(SRELL_FIXEDWIDTHLOOKBEHIND)
|
|
|
|
// if (current_NFA.reverse)
|
|
{
|
|
for (uint_l32 i = 0; i < current_NFA.quantifier.atleast; ++i)
|
|
{
|
|
if (!sstate.is_at_lookbehindlimit())
|
|
{
|
|
utf_traits::dec_codepoint(sstate.nth.in_string, sstate.lblim);
|
|
continue;
|
|
}
|
|
is_matched = false;
|
|
goto AFTER_LOOKAROUND;
|
|
}
|
|
}
|
|
#endif
|
|
sstate.nth.in_NFA_states = current_NFA.next_state2;
|
|
|
|
#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND)
|
|
is_matched = current_NFA.quantifier.atleast == 0 ? run_automaton<icase, false>(sstate /* , true */) : run_automaton<icase, true>(sstate /* , true */);
|
|
#else
|
|
is_matched = run_automaton<icase, false>(sstate /* , true */);
|
|
#endif
|
|
|
|
#if defined(SRELL_FIXEDWIDTHLOOKBEHIND)
|
|
AFTER_LOOKAROUND:
|
|
#endif
|
|
{
|
|
|
|
#if !defined(SRELL_FIXEDWIDTHLOOKBEHIND) && !defined(SRELLDBG_NO_MPREWINDER)
|
|
if (current_NFA.quantifier.atleast == 2)
|
|
{
|
|
sstate.lblim = sstate.repeat_stack[backup_bottom.repeatstack_size];
|
|
if (is_matched)
|
|
sstate.bracket[0].core.open_at = sstate.nth.in_string;
|
|
}
|
|
#endif
|
|
|
|
#if defined(SRELL_ENABLE_GT)
|
|
if (current_NFA.character != meta_char::mc_gt) // '>'
|
|
#endif
|
|
{
|
|
sstate.nth.in_string = orgpos;
|
|
}
|
|
sstate.bt_stack.resize(sstate.btstack_size);
|
|
|
|
sstate.btstack_size = backup_bottom.btstack_size;
|
|
sstate.capture_stack.resize(backup_bottom.capturestack_size);
|
|
sstate.counter_stack.resize(backup_bottom.counterstack_size);
|
|
sstate.repeat_stack.resize(backup_bottom.repeatstack_size);
|
|
|
|
is_matched ^= current_NFA.is_not;
|
|
}
|
|
}
|
|
if (is_matched)
|
|
{
|
|
sstate.nth.in_NFA_states = current_NFA.next_state1;
|
|
continue;
|
|
}
|
|
|
|
// case st_lookaround_pop:
|
|
for (uint_l32 i = this->number_of_repeats; i;)
|
|
{
|
|
sstate.repeat[--i] = sstate.repeat_stack.back();
|
|
sstate.repeat_stack.pop_back();
|
|
}
|
|
|
|
for (uint_l32 i = this->number_of_counters; i;)
|
|
{
|
|
sstate.counter[--i] = sstate.counter_stack.back();
|
|
sstate.counter_stack.pop_back();
|
|
}
|
|
|
|
for (uint_l32 i = this->number_of_brackets; i > 1;)
|
|
{
|
|
submatch_type &sm = sstate.bracket[--i];
|
|
|
|
sm.counter = sstate.counter_stack.back();
|
|
sm.core = sstate.capture_stack.back();
|
|
sstate.counter_stack.pop_back();
|
|
sstate.capture_stack.pop_back();
|
|
}
|
|
goto NOT_MATCHED;
|
|
|
|
case st_bol: // '^':
|
|
if (sstate.is_at_lookbehindlimit() && !sstate.match_prev_avail_flag())
|
|
{
|
|
if (!sstate.match_not_bol_flag())
|
|
goto MATCHED;
|
|
}
|
|
// !sstate.is_at_lookbehindlimit() || sstate.match_prev_avail_flag()
|
|
else if (current_NFA.multiline)
|
|
{
|
|
const uchar32 prevchar = utf_traits::prevcodepoint(sstate.nth.in_string, sstate.lblim);
|
|
|
|
if (this->character_class.is_included(re_character_class::newline, prevchar))
|
|
goto MATCHED;
|
|
}
|
|
goto NOT_MATCHED;
|
|
|
|
case st_eol: // '$':
|
|
if (sstate.is_at_srchend())
|
|
{
|
|
if (!sstate.match_not_eol_flag())
|
|
goto MATCHED;
|
|
}
|
|
else if (current_NFA.multiline)
|
|
{
|
|
const uchar32 nextchar = utf_traits::codepoint(sstate.nth.in_string, sstate.srchend);
|
|
|
|
if (this->character_class.is_included(re_character_class::newline, nextchar))
|
|
goto MATCHED;
|
|
}
|
|
goto NOT_MATCHED;
|
|
|
|
case st_boundary: // '\b' '\B'
|
|
is_matched = current_NFA.is_not;
|
|
// is_matched = current_NFA.character == char_alnum::ch_B;
|
|
|
|
// First, suppose the previous character is not \w but \W.
|
|
|
|
if (sstate.is_at_srchend())
|
|
{
|
|
if (sstate.match_not_eow_flag())
|
|
is_matched = !is_matched;
|
|
}
|
|
else if (this->character_class.is_included(current_NFA.number, utf_traits::codepoint(sstate.nth.in_string, sstate.srchend)))
|
|
{
|
|
is_matched = !is_matched;
|
|
}
|
|
// \W/last \w
|
|
// \b false true
|
|
// \B true false
|
|
|
|
// Second, if the actual previous character is \w, flip is_matched.
|
|
|
|
if (sstate.is_at_lookbehindlimit() && !sstate.match_prev_avail_flag())
|
|
{
|
|
if (sstate.match_not_bow_flag())
|
|
is_matched = !is_matched;
|
|
}
|
|
// !sstate.is_at_lookbehindlimit() || sstate.match_prev_avail_flag()
|
|
else if (this->character_class.is_included(current_NFA.number, utf_traits::prevcodepoint(sstate.nth.in_string, sstate.lblim)))
|
|
{
|
|
is_matched = !is_matched;
|
|
}
|
|
// \b \B
|
|
// pre cur \W/last \w pre cur \W/last \w
|
|
// \W/base false true \W/base true false
|
|
// \w true false \w false true
|
|
|
|
goto JUDGE;
|
|
|
|
case st_success: // == lookaround_close.
|
|
// if (is_recursive)
|
|
if (sstate.btstack_size)
|
|
return true;
|
|
|
|
if
|
|
(
|
|
(!sstate.match_not_null_flag() || !sstate.is_null())
|
|
&&
|
|
(!sstate.match_match_flag() || sstate.is_at_srchend())
|
|
)
|
|
return true;
|
|
|
|
goto NOT_MATCHED;
|
|
|
|
#if !defined(SRELLDBG_NO_NEXTPOS_OPT)
|
|
case st_move_nextpos:
|
|
#if !defined(SRELLDBG_NO_1STCHRCLS) && !defined(SRELLDBG_NO_BITSET)
|
|
sstate.nextpos = sstate.nth.in_string;
|
|
if (!sstate.is_at_srchend())
|
|
++sstate.nextpos;
|
|
#else // defined(SRELLDBG_NO_1STCHRCLS) || defined(SRELLDBG_NO_BITSET)
|
|
if (sstate.nth.in_string != sstate.bracket[0].core.open_at)
|
|
{
|
|
sstate.nextpos = sstate.nth.in_string;
|
|
if (!sstate.is_at_srchend())
|
|
utf_traits::codepoint_inc(sstate.nextpos, sstate.srchend);
|
|
}
|
|
#endif
|
|
goto MATCHED;
|
|
#endif
|
|
|
|
default:
|
|
// Reaching here means that this->NFA_states is corrupted.
|
|
throw regex_error(regex_constants::error_internal);
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
};
|
|
// regex_object
|
|
|
|
} // namespace regex_internal
|
|
|
|
// ... "rei_algorithm.hpp"]
|
|
// ["basic_regex.hpp" ...
|
|
|
|
// 28.8, class template basic_regex:
|
|
template <class charT, class traits = regex_traits<charT> >
|
|
class basic_regex : public regex_internal::regex_object<charT, traits>
|
|
{
|
|
public:
|
|
|
|
// Types:
|
|
typedef charT value_type;
|
|
typedef traits traits_type;
|
|
typedef typename traits::string_type string_type;
|
|
typedef regex_constants::syntax_option_type flag_type;
|
|
typedef typename traits::locale_type locale_type;
|
|
|
|
// 28.8.1, constants:
|
|
// [7.8.1] constants
|
|
static const regex_constants::syntax_option_type icase = regex_constants::icase;
|
|
static const regex_constants::syntax_option_type nosubs = regex_constants::nosubs;
|
|
static const regex_constants::syntax_option_type optimize = regex_constants::optimize;
|
|
static const regex_constants::syntax_option_type collate = regex_constants::collate;
|
|
static const regex_constants::syntax_option_type ECMAScript = regex_constants::ECMAScript;
|
|
static const regex_constants::syntax_option_type basic = regex_constants::basic;
|
|
static const regex_constants::syntax_option_type extended = regex_constants::extended;
|
|
static const regex_constants::syntax_option_type awk = regex_constants::awk;
|
|
static const regex_constants::syntax_option_type grep = regex_constants::grep;
|
|
static const regex_constants::syntax_option_type egrep = regex_constants::egrep;
|
|
static const regex_constants::syntax_option_type multiline = regex_constants::multiline;
|
|
|
|
static const regex_constants::syntax_option_type dotall = regex_constants::dotall;
|
|
static const regex_constants::syntax_option_type unicodesets = regex_constants::unicodesets;
|
|
|
|
// 28.8.2, construct/copy/destroy:
|
|
// [7.8.2] construct/copy/destroy
|
|
basic_regex()
|
|
{
|
|
}
|
|
|
|
explicit basic_regex(const charT *const p, const flag_type f = regex_constants::ECMAScript)
|
|
{
|
|
assign(p, p + std::char_traits<charT>::length(p), f);
|
|
}
|
|
|
|
basic_regex(const charT *const p, const std::size_t len, const flag_type f = regex_constants::ECMAScript)
|
|
{
|
|
assign(p, p + len, f);
|
|
}
|
|
|
|
basic_regex(const basic_regex &e)
|
|
{
|
|
assign(e);
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
|
|
basic_regex(basic_regex &&e) SRELL_NOEXCEPT
|
|
{
|
|
assign(std::move(e));
|
|
}
|
|
#endif
|
|
|
|
template <class ST, class SA>
|
|
explicit basic_regex(const std::basic_string<charT, ST, SA> &p, const flag_type f = regex_constants::ECMAScript)
|
|
{
|
|
assign(p, f);
|
|
}
|
|
|
|
template <class ForwardIterator>
|
|
basic_regex(ForwardIterator first, ForwardIterator last, const flag_type f = regex_constants::ECMAScript)
|
|
{
|
|
assign(first, last, f);
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_INITIALIZER_LIST_ENABLED)
|
|
basic_regex(std::initializer_list<charT> il, const flag_type f = regex_constants::ECMAScript)
|
|
{
|
|
assign(il, f);
|
|
}
|
|
#endif
|
|
|
|
// ~basic_regex();
|
|
|
|
basic_regex &operator=(const basic_regex &right)
|
|
{
|
|
return assign(right);
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
|
|
basic_regex &operator=(basic_regex &&e) SRELL_NOEXCEPT
|
|
{
|
|
return assign(std::move(e));
|
|
}
|
|
#endif
|
|
|
|
basic_regex &operator=(const charT *const ptr)
|
|
{
|
|
return assign(ptr);
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_INITIALIZER_LIST_ENABLED)
|
|
basic_regex &operator=(std::initializer_list<charT> il)
|
|
{
|
|
return assign(il.begin(), il.end());
|
|
}
|
|
#endif
|
|
|
|
template <class ST, class SA>
|
|
basic_regex &operator=(const std::basic_string<charT, ST, SA> &p)
|
|
{
|
|
return assign(p);
|
|
}
|
|
|
|
// 28.8.3, assign:
|
|
// [7.8.3] assign
|
|
basic_regex &assign(const basic_regex &right)
|
|
{
|
|
regex_internal::re_object_core<charT, traits>::operator=(right);
|
|
return *this;
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_MOVE_ENABLED)
|
|
basic_regex &assign(basic_regex &&right) SRELL_NOEXCEPT
|
|
{
|
|
regex_internal::re_object_core<charT, traits>::operator=(std::move(right));
|
|
return *this;
|
|
}
|
|
#endif
|
|
|
|
basic_regex &assign(const charT *const ptr, const flag_type f = regex_constants::ECMAScript)
|
|
{
|
|
return assign(ptr, ptr + std::char_traits<charT>::length(ptr), f);
|
|
}
|
|
|
|
basic_regex &assign(const charT *const p, std::size_t len, const flag_type f = regex_constants::ECMAScript)
|
|
{
|
|
return assign(p, p + len, f);
|
|
}
|
|
|
|
template <class string_traits, class A>
|
|
basic_regex &assign(const std::basic_string<charT, string_traits, A> &s, const flag_type f = regex_constants::ECMAScript)
|
|
{
|
|
return assign(s.c_str(), s.c_str() + s.size(), f);
|
|
}
|
|
|
|
template <class InputIterator>
|
|
basic_regex &assign(InputIterator first, InputIterator last, const flag_type f = regex_constants::ECMAScript)
|
|
{
|
|
#if defined(SRELL_STRICT_IMPL)
|
|
basic_regex tmp;
|
|
tmp.compile(first, last, f);
|
|
tmp.swap(*this);
|
|
#else
|
|
this->compile(first, last, f);
|
|
#endif
|
|
return *this;
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_INITIALIZER_LIST_ENABLED)
|
|
basic_regex &assign(std::initializer_list<charT> il, const flag_type f = regex_constants::ECMAScript)
|
|
{
|
|
return assign(il.begin(), il.end(), f);
|
|
}
|
|
#endif
|
|
|
|
// 28.8.4, const operations:
|
|
// [7.8.4] const operations
|
|
unsigned mark_count() const
|
|
{
|
|
return this->number_of_brackets - 1;
|
|
}
|
|
|
|
flag_type flags() const
|
|
{
|
|
return this->soflags;
|
|
}
|
|
|
|
// 28.8.5, locale:
|
|
// [7.8.5] locale
|
|
locale_type imbue(locale_type loc)
|
|
{
|
|
return this->traits_inst.imbue(loc);
|
|
}
|
|
|
|
locale_type getloc() const
|
|
{
|
|
return this->traits_inst.getloc();
|
|
}
|
|
|
|
// 28.8.6, swap:
|
|
// [7.8.6] swap
|
|
void swap(basic_regex &e)
|
|
{
|
|
regex_internal::re_object_core<charT, traits>::swap(e);
|
|
}
|
|
};
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::icase;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::nosubs;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::optimize;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::collate;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::ECMAScript;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::basic;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::extended;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::awk;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::grep;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::egrep;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::multiline;
|
|
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::dotall;
|
|
template <class charT, class traits>
|
|
const regex_constants::syntax_option_type basic_regex<charT, traits>::unicodesets;
|
|
|
|
// 28.8.6, basic_regex swap:
|
|
template <class charT, class traits>
|
|
void swap(basic_regex<charT, traits> &lhs, basic_regex<charT, traits> &rhs)
|
|
{
|
|
lhs.swap(rhs);
|
|
}
|
|
|
|
typedef basic_regex<char> regex;
|
|
typedef basic_regex<wchar_t> wregex;
|
|
|
|
#if defined(WCHAR_MAX)
|
|
#if WCHAR_MAX >= 0x10ffff
|
|
typedef wregex u32wregex;
|
|
typedef u32wregex u1632wregex;
|
|
#elif WCHAR_MAX >= 0xffff
|
|
typedef basic_regex<wchar_t, u16regex_traits<wchar_t> > u16wregex;
|
|
typedef u16wregex u1632wregex;
|
|
#endif
|
|
#endif
|
|
|
|
#if defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
typedef basic_regex<char8_t> u8regex;
|
|
#endif
|
|
|
|
typedef basic_regex<char, u8regex_traits<char> > u8cregex;
|
|
#if !defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
typedef u8cregex u8regex;
|
|
#endif
|
|
|
|
#if defined(SRELL_CPP11_CHAR1632_ENABLED)
|
|
typedef basic_regex<char16_t> u16regex;
|
|
typedef basic_regex<char32_t> u32regex;
|
|
#endif
|
|
|
|
// ... "basic_regex.hpp"]
|
|
// ["regex_iterator.hpp" ...
|
|
|
|
// 28.12.1, class template regex_iterator:
|
|
template <class BidirectionalIterator, class charT = typename std::iterator_traits<BidirectionalIterator>::value_type, class traits = regex_traits<charT> >
|
|
class regex_iterator
|
|
{
|
|
public:
|
|
|
|
typedef basic_regex<charT, traits> regex_type;
|
|
typedef match_results<BidirectionalIterator> value_type;
|
|
typedef std::ptrdiff_t difference_type;
|
|
typedef const value_type * pointer;
|
|
typedef const value_type & reference;
|
|
typedef std::forward_iterator_tag iterator_category;
|
|
|
|
regex_iterator()
|
|
{
|
|
// 28.12.1.1: Constructs an end-of-sequence iterator.
|
|
}
|
|
|
|
regex_iterator(
|
|
const BidirectionalIterator a,
|
|
const BidirectionalIterator b,
|
|
const regex_type &re,
|
|
const regex_constants::match_flag_type m = regex_constants::match_default)
|
|
: begin(a), end(b), pregex(&re), flags(m)
|
|
{
|
|
regex_search(begin, end, begin, match, *pregex, flags);
|
|
// 28.12.1.1: If this call returns false the constructor
|
|
// sets *this to the end-of-sequence iterator.
|
|
}
|
|
|
|
regex_iterator(const regex_iterator &that)
|
|
{
|
|
operator=(that);
|
|
}
|
|
|
|
regex_iterator &operator=(const regex_iterator &that)
|
|
{
|
|
if (this != &that)
|
|
{
|
|
this->begin = that.begin;
|
|
this->end = that.end;
|
|
this->pregex = that.pregex;
|
|
this->flags = that.flags;
|
|
this->match = that.match;
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
bool operator==(const regex_iterator &right) const
|
|
{
|
|
// It is probably safe to assume that match.size() == 0 means
|
|
// end-of-sequence, because it happens only when 1) never tried
|
|
// regex_search, or 2) regex_search returned false.
|
|
|
|
if (this->match.size() == 0 || right.match.size() == 0)
|
|
return this->match.size() == right.match.size();
|
|
|
|
return
|
|
this->begin == right.begin
|
|
&&
|
|
this->end == right.end
|
|
&&
|
|
this->pregex == right.pregex
|
|
&&
|
|
this->flags == right.flags
|
|
&&
|
|
this->match[0] == right.match[0];
|
|
}
|
|
|
|
bool operator!=(const regex_iterator &right) const
|
|
{
|
|
return !(*this == right);
|
|
}
|
|
|
|
const value_type &operator*() const
|
|
{
|
|
return match;
|
|
}
|
|
|
|
const value_type *operator->() const
|
|
{
|
|
return &match;
|
|
}
|
|
|
|
regex_iterator &operator++()
|
|
{
|
|
if (this->match.size())
|
|
{
|
|
BidirectionalIterator start = match[0].second;
|
|
|
|
if (match[0].first == start) // The iterator holds a 0-length match.
|
|
{
|
|
if (start == end)
|
|
{
|
|
match.clear_();
|
|
// 28.12.1.4.2: If the iterator holds a zero-length match and
|
|
// start == end the operator sets *this to the end-ofsequence
|
|
// iterator and returns *this.
|
|
}
|
|
else
|
|
{
|
|
// 28.12.1.4.3: Otherwise, if the iterator holds a zero-length match
|
|
// the operator calls regex_search(start, end, match, *pregex, flags
|
|
// | regex_constants::match_not_null | regex_constants::match_continuous).
|
|
// If the call returns true the operator returns *this. [Cont...]
|
|
|
|
if (!regex_search(start, end, begin, match, *pregex, flags | regex_constants::match_not_null | regex_constants::match_continuous))
|
|
{
|
|
const BidirectionalIterator prevend = start;
|
|
|
|
// [...Cont] Otherwise the operator increments start and continues
|
|
// as if the most recent match was not a zero-length match.
|
|
// ++start;
|
|
utf_traits::codepoint_inc(start, end);
|
|
|
|
flags |= regex_constants::match_prev_avail;
|
|
|
|
if (regex_search(start, end, begin, match, *pregex, flags))
|
|
{
|
|
// 28.12.1.4.5-6: In all cases in which the call to regex_search
|
|
// returns true, match.prefix().first shall be equal to the previous
|
|
// value of match[0].second, ... match[i].position() shall return
|
|
// distance(begin, match[i].first).
|
|
// This means that match[i].position() gives the offset from the
|
|
// beginning of the target sequence, which is often not the same as
|
|
// the offset from the sequence passed in the call to regex_search.
|
|
//
|
|
// To satisfy this:
|
|
match.set_prefix_first_(prevend);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// 28.12.1.4.4: If the most recent match was not a zero-length match,
|
|
// the operator sets flags to flags | regex_constants::match_prev_avail
|
|
// and calls regex_search(start, end, match, *pregex, flags). [Cont...]
|
|
flags |= regex_constants::match_prev_avail;
|
|
|
|
regex_search(start, end, begin, match, *pregex, flags);
|
|
// [...Cont] If the call returns false the iterator sets *this to
|
|
// the end-of-sequence iterator. The iterator then returns *this.
|
|
//
|
|
// 28.12.1.4.5-6: In all cases in which the call to regex_search
|
|
// returns true, match.prefix().first shall be equal to the previous
|
|
// value of match[0].second, ... match[i].position() shall return
|
|
// distance(begin, match[i].first).
|
|
// This means that match[i].position() gives the offset from the
|
|
// beginning of the target sequence, which is often not the same as
|
|
// the offset from the sequence passed in the call to regex_search.
|
|
//
|
|
// These should already be done in regex_search.
|
|
}
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
regex_iterator operator++(int)
|
|
{
|
|
const regex_iterator tmp = *this;
|
|
++(*this);
|
|
return tmp;
|
|
}
|
|
|
|
private:
|
|
|
|
BidirectionalIterator begin;
|
|
BidirectionalIterator end;
|
|
const regex_type *pregex;
|
|
regex_constants::match_flag_type flags;
|
|
match_results<BidirectionalIterator> match;
|
|
|
|
typedef typename traits::utf_traits utf_traits;
|
|
};
|
|
|
|
typedef regex_iterator<const char *> cregex_iterator;
|
|
typedef regex_iterator<const wchar_t *> wcregex_iterator;
|
|
typedef regex_iterator<std::string::const_iterator> sregex_iterator;
|
|
typedef regex_iterator<std::wstring::const_iterator> wsregex_iterator;
|
|
|
|
#if defined(SRELL_CPP11_CHAR1632_ENABLED)
|
|
typedef regex_iterator<const char16_t *> u16cregex_iterator;
|
|
typedef regex_iterator<const char32_t *> u32cregex_iterator;
|
|
typedef regex_iterator<std::u16string::const_iterator> u16sregex_iterator;
|
|
typedef regex_iterator<std::u32string::const_iterator> u32sregex_iterator;
|
|
#endif
|
|
|
|
#if defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
typedef regex_iterator<const char8_t *> u8cregex_iterator;
|
|
#endif
|
|
#if defined(SRELL_CPP20_CHAR8_ENABLED) && SRELL_CPP20_CHAR8_ENABLED >= 2
|
|
typedef regex_iterator<std::u8string::const_iterator> u8sregex_iterator;
|
|
#endif
|
|
|
|
typedef regex_iterator<const char *, std::iterator_traits<const char *>::value_type, u8regex_traits<std::iterator_traits<const char *>::value_type> > u8ccregex_iterator;
|
|
typedef regex_iterator<std::string::const_iterator, std::iterator_traits<std::string::const_iterator>::value_type, u8regex_traits<std::iterator_traits<std::string::const_iterator>::value_type> > u8csregex_iterator;
|
|
#if !defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
typedef u8ccregex_iterator u8cregex_iterator;
|
|
#endif
|
|
#if !defined(SRELL_CPP20_CHAR8_ENABLED) || SRELL_CPP20_CHAR8_ENABLED < 2
|
|
typedef u8csregex_iterator u8sregex_iterator;
|
|
#endif
|
|
|
|
#if defined(WCHAR_MAX)
|
|
#if WCHAR_MAX >= 0x10ffff
|
|
typedef wcregex_iterator u32wcregex_iterator;
|
|
typedef wsregex_iterator u32wsregex_iterator;
|
|
typedef u32wcregex_iterator u1632wcregex_iterator;
|
|
typedef u32wsregex_iterator u1632wsregex_iterator;
|
|
#elif WCHAR_MAX >= 0xffff
|
|
typedef regex_iterator<const wchar_t *, std::iterator_traits<const wchar_t *>::value_type, u16regex_traits<std::iterator_traits<const wchar_t *>::value_type> > u16wcregex_iterator;
|
|
typedef regex_iterator<std::wstring::const_iterator, std::iterator_traits<std::wstring::const_iterator>::value_type, u16regex_traits<std::iterator_traits<std::wstring::const_iterator>::value_type> > u16wsregex_iterator;
|
|
typedef u16wcregex_iterator u1632wcregex_iterator;
|
|
typedef u16wsregex_iterator u1632wsregex_iterator;
|
|
#endif
|
|
#endif
|
|
|
|
// ... "regex_iterator.hpp"]
|
|
// ["regex_algorithm.hpp" ...
|
|
|
|
// 28.11.2, function template regex_match:
|
|
// [7.11.2] Function template regex_match
|
|
template <class BidirectionalIterator, class Allocator, class charT, class traits>
|
|
bool regex_match(
|
|
const BidirectionalIterator first,
|
|
const BidirectionalIterator last,
|
|
match_results<BidirectionalIterator, Allocator> &m,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return e.search(first, last, first, m, flags | regex_constants::match_continuous | regex_constants::match_match_);
|
|
}
|
|
|
|
template <class BidirectionalIterator, class charT, class traits>
|
|
bool regex_match(
|
|
const BidirectionalIterator first,
|
|
const BidirectionalIterator last,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
// 4 Effects: Behaves "as if" by constructing an instance of
|
|
// match_results<BidirectionalIterator> what, and then returning the
|
|
// result of regex_match(first, last, what, e, flags).
|
|
|
|
match_results<BidirectionalIterator> what;
|
|
|
|
return regex_match(first, last, what, e, flags);
|
|
}
|
|
|
|
template <class charT, class Allocator, class traits>
|
|
bool regex_match(
|
|
const charT *const str,
|
|
match_results<const charT *, Allocator> &m,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return regex_match(str, str + std::char_traits<charT>::length(str), m, e, flags);
|
|
}
|
|
|
|
template <class ST, class SA, class Allocator, class charT, class traits>
|
|
bool regex_match(
|
|
const std::basic_string<charT, ST, SA> &s,
|
|
match_results<typename std::basic_string<charT, ST, SA>::const_iterator, Allocator> &m,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return regex_match(s.begin(), s.end(), m, e, flags);
|
|
}
|
|
|
|
template <class charT, class traits>
|
|
bool regex_match(
|
|
const charT *const str,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return regex_match(str, str + std::char_traits<charT>::length(str), e, flags);
|
|
}
|
|
|
|
template <class ST, class SA, class charT, class traits>
|
|
bool regex_match(
|
|
const std::basic_string<charT, ST, SA> &s,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return regex_match(s.begin(), s.end(), e, flags);
|
|
}
|
|
|
|
template <class BidirectionalIterator, class Allocator, class charT, class traits>
|
|
bool regex_search(
|
|
const BidirectionalIterator first,
|
|
const BidirectionalIterator last,
|
|
const BidirectionalIterator lookbehind_limit,
|
|
match_results<BidirectionalIterator, Allocator> &m,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return e.search(first, last, lookbehind_limit, m, flags);
|
|
}
|
|
|
|
template <class BidirectionalIterator, class charT, class traits>
|
|
bool regex_search(
|
|
const BidirectionalIterator first,
|
|
const BidirectionalIterator last,
|
|
const BidirectionalIterator lookbehind_limit,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
// 6 Effects: Behaves "as if" by constructing an object what of type
|
|
// match_results<iterator> and then returning the result of
|
|
// regex_search(first, last, what, e, flags).
|
|
|
|
match_results<BidirectionalIterator> what;
|
|
return regex_search(first, last, lookbehind_limit, what, e, flags);
|
|
}
|
|
|
|
// 28.11.3, function template regex_search:
|
|
// 7.11.3 regex_search [tr.re.alg.search]
|
|
template <class BidirectionalIterator, class Allocator, class charT, class traits>
|
|
bool regex_search(
|
|
const BidirectionalIterator first,
|
|
const BidirectionalIterator last,
|
|
match_results<BidirectionalIterator, Allocator> &m,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return e.search(first, last, first, m, flags);
|
|
}
|
|
|
|
template <class BidirectionalIterator, class charT, class traits>
|
|
bool regex_search(
|
|
const BidirectionalIterator first,
|
|
const BidirectionalIterator last,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
// 6 Effects: Behaves "as if" by constructing an object what of type
|
|
// match_results<iterator> and then returning the result of
|
|
// regex_search(first, last, what, e, flags).
|
|
|
|
match_results<BidirectionalIterator> what;
|
|
return regex_search(first, last, what, e, flags);
|
|
}
|
|
|
|
template <class charT, class Allocator, class traits>
|
|
bool regex_search(
|
|
const charT *const str,
|
|
match_results<const charT *, Allocator> &m,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return regex_search(str, str + std::char_traits<charT>::length(str), m, e, flags);
|
|
}
|
|
|
|
template <class charT, class traits>
|
|
bool regex_search(
|
|
const charT *const str,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return regex_search(str, str + std::char_traits<charT>::length(str), e, flags);
|
|
}
|
|
|
|
template <class ST, class SA, class charT, class traits>
|
|
bool regex_search(
|
|
const std::basic_string<charT, ST, SA> &s,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return regex_search(s.begin(), s.end(), e, flags);
|
|
}
|
|
|
|
template <class ST, class SA, class Allocator, class charT, class traits>
|
|
bool regex_search(
|
|
const std::basic_string<charT, ST, SA> &s,
|
|
match_results<typename std::basic_string<charT, ST, SA>::const_iterator, Allocator> &m,
|
|
const basic_regex<charT, traits> &e,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
return regex_search(s.begin(), s.end(), m, e, flags);
|
|
}
|
|
|
|
// 28.11.4, function template regex_replace:
|
|
// [7.11.4] Function template regex_replace
|
|
template <class OutputIterator, class BidirectionalIterator, class traits, class charT, class ST, class SA>
|
|
OutputIterator regex_replace(
|
|
OutputIterator out,
|
|
const BidirectionalIterator first,
|
|
const BidirectionalIterator last,
|
|
const basic_regex<charT, traits> &e,
|
|
const std::basic_string<charT, ST, SA> &fmt,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
typedef regex_iterator<BidirectionalIterator, charT, traits> iterator_type;
|
|
|
|
const bool do_copy = !(flags & regex_constants::format_no_copy);
|
|
const iterator_type eos;
|
|
iterator_type i(first, last, e, flags);
|
|
typename iterator_type::value_type::value_type last_m_suffix;
|
|
|
|
last_m_suffix.first = first;
|
|
last_m_suffix.second = last;
|
|
|
|
for (; i != eos; ++i)
|
|
{
|
|
if (do_copy)
|
|
out = std::copy(i->prefix().first, i->prefix().second, out);
|
|
|
|
out = i->format(out, fmt, flags);
|
|
last_m_suffix = i->suffix();
|
|
|
|
if (flags & regex_constants::format_first_only)
|
|
break;
|
|
}
|
|
|
|
if (do_copy)
|
|
out = std::copy(last_m_suffix.first, last_m_suffix.second, out);
|
|
|
|
return out;
|
|
}
|
|
|
|
template <class OutputIterator, class BidirectionalIterator, class traits, class charT>
|
|
OutputIterator regex_replace(
|
|
OutputIterator out,
|
|
const BidirectionalIterator first,
|
|
const BidirectionalIterator last,
|
|
const basic_regex<charT, traits> &e,
|
|
const charT *const fmt,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
// Strictly speaking, this should be implemented as a version different
|
|
// from the above with changing the line i->format(out, fmt, flags) to
|
|
// i->format(out, fmt, fmt + char_traits<charT>::length(fmt), flags).
|
|
|
|
const std::basic_string<charT> fs(fmt, fmt + std::char_traits<charT>::length(fmt));
|
|
|
|
return regex_replace(out, first, last, e, fs, flags);
|
|
}
|
|
|
|
template <class traits, class charT, class ST, class SA, class FST, class FSA>
|
|
std::basic_string<charT, ST, SA> regex_replace(
|
|
const std::basic_string<charT, ST, SA> &s,
|
|
const basic_regex<charT, traits> &e,
|
|
const std::basic_string<charT, FST, FSA> &fmt,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
std::basic_string<charT, ST, SA> result;
|
|
|
|
regex_replace(std::back_inserter(result), s.begin(), s.end(), e, fmt, flags);
|
|
return result;
|
|
}
|
|
|
|
template <class traits, class charT, class ST, class SA>
|
|
std::basic_string<charT, ST, SA> regex_replace(
|
|
const std::basic_string<charT, ST, SA> &s,
|
|
const basic_regex<charT, traits> &e,
|
|
const charT *const fmt,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
std::basic_string<charT, ST, SA> result;
|
|
|
|
regex_replace(std::back_inserter(result), s.begin(), s.end(), e, fmt, flags);
|
|
return result;
|
|
}
|
|
|
|
template <class traits, class charT, class ST, class SA>
|
|
std::basic_string<charT> regex_replace(
|
|
const charT *const s,
|
|
const basic_regex<charT, traits> &e,
|
|
const std::basic_string<charT, ST, SA> &fmt,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
std::basic_string<charT> result;
|
|
|
|
regex_replace(std::back_inserter(result), s, s + std::char_traits<charT>::length(s), e, fmt, flags);
|
|
return result;
|
|
}
|
|
|
|
template <class traits, class charT>
|
|
std::basic_string<charT> regex_replace(
|
|
const charT *const s,
|
|
const basic_regex<charT, traits> &e,
|
|
const charT *const fmt,
|
|
const regex_constants::match_flag_type flags = regex_constants::match_default
|
|
)
|
|
{
|
|
std::basic_string<charT> result;
|
|
|
|
regex_replace(std::back_inserter(result), s, s + std::char_traits<charT>::length(s), e, fmt, flags);
|
|
return result;
|
|
}
|
|
|
|
// ... "regex_algorithm.hpp"]
|
|
// ["regex_token_iterator.hpp" ...
|
|
|
|
// 28.12.2, class template regex_token_iterator:
|
|
template <class BidirectionalIterator, class charT = typename std::iterator_traits<BidirectionalIterator>::value_type, class traits = regex_traits<charT> >
|
|
class regex_token_iterator
|
|
{
|
|
public:
|
|
|
|
typedef basic_regex<charT, traits> regex_type;
|
|
typedef sub_match<BidirectionalIterator> value_type;
|
|
typedef std::ptrdiff_t difference_type;
|
|
typedef const value_type * pointer;
|
|
typedef const value_type & reference;
|
|
typedef std::forward_iterator_tag iterator_category;
|
|
|
|
regex_token_iterator() : result(NULL)
|
|
{
|
|
// Constructs the end-of-sequence iterator.
|
|
}
|
|
|
|
regex_token_iterator(
|
|
const BidirectionalIterator a,
|
|
const BidirectionalIterator b,
|
|
const regex_type &re,
|
|
int submatch = 0,
|
|
regex_constants::match_flag_type m = regex_constants::match_default
|
|
) : position(a, b, re, m), result(NULL), subs(1, submatch)
|
|
{
|
|
post_constructor(a, b);
|
|
}
|
|
|
|
regex_token_iterator(
|
|
const BidirectionalIterator a,
|
|
const BidirectionalIterator b,
|
|
const regex_type &re,
|
|
const std::vector<int> &submatches,
|
|
regex_constants::match_flag_type m = regex_constants::match_default
|
|
) : position(a, b, re, m), result(NULL), subs(submatches)
|
|
{
|
|
post_constructor(a, b);
|
|
}
|
|
|
|
#if defined(SRELL_CPP11_INITIALIZER_LIST_ENABLED)
|
|
regex_token_iterator(
|
|
const BidirectionalIterator a,
|
|
const BidirectionalIterator b,
|
|
const regex_type &re,
|
|
std::initializer_list<int> submatches,
|
|
regex_constants::match_flag_type m = regex_constants::match_default
|
|
) : position(a, b, re, m), result(NULL), subs(submatches)
|
|
{
|
|
post_constructor(a, b);
|
|
}
|
|
#endif
|
|
|
|
template <std::size_t N> // Was R in TR1.
|
|
regex_token_iterator(
|
|
const BidirectionalIterator a,
|
|
const BidirectionalIterator b,
|
|
const regex_type &re,
|
|
const int (&submatches)[N],
|
|
regex_constants::match_flag_type m = regex_constants::match_default
|
|
) : position(a, b, re, m), result(NULL), subs(submatches, submatches + N)
|
|
{
|
|
post_constructor(a, b);
|
|
}
|
|
|
|
regex_token_iterator(const regex_token_iterator &that)
|
|
{
|
|
operator=(that);
|
|
}
|
|
|
|
regex_token_iterator &operator=(const regex_token_iterator &that)
|
|
{
|
|
if (this != &that)
|
|
{
|
|
this->position = that.position;
|
|
this->result = that.result;
|
|
this->suffix = that.suffix;
|
|
this->N = that.N;
|
|
this->subs = that.subs;
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
bool operator==(const regex_token_iterator &right)
|
|
{
|
|
if (this->result == NULL || right.result == NULL)
|
|
return this->result == right.result;
|
|
|
|
if (this->result == &this->suffix || right.result == &right.suffix)
|
|
return this->suffix == right.suffix;
|
|
|
|
return
|
|
this->position == right.position
|
|
&&
|
|
this->N == right.N
|
|
&&
|
|
this->subs == right.subs;
|
|
}
|
|
|
|
bool operator!=(const regex_token_iterator &right)
|
|
{
|
|
return !(*this == right);
|
|
}
|
|
|
|
const value_type &operator*()
|
|
{
|
|
return *result;
|
|
}
|
|
|
|
const value_type *operator->()
|
|
{
|
|
return result;
|
|
}
|
|
|
|
regex_token_iterator &operator++()
|
|
{
|
|
position_iterator prev(position);
|
|
position_iterator eos_iterator;
|
|
|
|
if (result != NULL)
|
|
// To avoid inifinite loop. The specification does not require, though.
|
|
{
|
|
if (result == &suffix)
|
|
{
|
|
result = NULL; // end-of-sequence.
|
|
}
|
|
else
|
|
{
|
|
++this->N;
|
|
for (;;)
|
|
{
|
|
if (this->N < subs.size())
|
|
{
|
|
result = subs[this->N] != -1 ? &((*position)[subs[this->N]]) : &((*position).prefix());
|
|
break;
|
|
}
|
|
|
|
this->N = 0;
|
|
++position;
|
|
|
|
if (position == eos_iterator)
|
|
{
|
|
if (this->N < subs.size() && prev->suffix().length() && minus1_in_subs())
|
|
{
|
|
suffix = prev->suffix();
|
|
result = &suffix;
|
|
}
|
|
else
|
|
{
|
|
result = NULL;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
regex_token_iterator operator++(int)
|
|
{
|
|
const regex_token_iterator tmp(*this);
|
|
++(*this);
|
|
return tmp;
|
|
}
|
|
|
|
private:
|
|
|
|
void post_constructor(const BidirectionalIterator a, const BidirectionalIterator b)
|
|
{
|
|
position_iterator eos_iterator;
|
|
|
|
this->N = 0;
|
|
|
|
if (position != eos_iterator && subs.size())
|
|
{
|
|
result = subs[this->N] != -1 ? &((*position)[subs[this->N]]) : &((*position).prefix());
|
|
}
|
|
else if (minus1_in_subs()) // end-of-sequence.
|
|
{
|
|
suffix.first = a;
|
|
suffix.second = b;
|
|
suffix.matched = a != b;
|
|
// 28.1.2.7: In a suffix iterator the member result holds a pointer
|
|
// to the data member suffix, the value of the member suffix.match is true,
|
|
|
|
if (suffix.matched)
|
|
result = &suffix;
|
|
else
|
|
result = NULL; // Means end-of-sequence.
|
|
}
|
|
}
|
|
|
|
bool minus1_in_subs() const
|
|
{
|
|
for (std::size_t i = 0; i < subs.size(); ++i)
|
|
if (subs[i] == -1)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
private:
|
|
|
|
typedef regex_iterator<BidirectionalIterator, charT, traits> position_iterator;
|
|
position_iterator position;
|
|
const value_type *result;
|
|
value_type suffix;
|
|
std::size_t N;
|
|
std::vector<int> subs;
|
|
};
|
|
|
|
typedef regex_token_iterator<const char *> cregex_token_iterator;
|
|
typedef regex_token_iterator<const wchar_t *> wcregex_token_iterator;
|
|
typedef regex_token_iterator<std::string::const_iterator> sregex_token_iterator;
|
|
typedef regex_token_iterator<std::wstring::const_iterator> wsregex_token_iterator;
|
|
|
|
#if defined(SRELL_CPP11_CHAR1632_ENABLED)
|
|
typedef regex_token_iterator<const char16_t *> u16cregex_token_iterator;
|
|
typedef regex_token_iterator<const char32_t *> u32cregex_token_iterator;
|
|
typedef regex_token_iterator<std::u16string::const_iterator> u16sregex_token_iterator;
|
|
typedef regex_token_iterator<std::u32string::const_iterator> u32sregex_token_iterator;
|
|
#endif
|
|
|
|
#if defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
typedef regex_token_iterator<const char8_t *> u8cregex_token_iterator;
|
|
#endif
|
|
#if defined(SRELL_CPP20_CHAR8_ENABLED) && SRELL_CPP20_CHAR8_ENABLED >= 2
|
|
typedef regex_token_iterator<std::u8string::const_iterator> u8sregex_token_iterator;
|
|
#endif
|
|
|
|
typedef regex_token_iterator<const char *, std::iterator_traits<const char *>::value_type, u8regex_traits<std::iterator_traits<const char *>::value_type> > u8ccregex_token_iterator;
|
|
typedef regex_token_iterator<std::string::const_iterator, std::iterator_traits<std::string::const_iterator>::value_type, u8regex_traits<std::iterator_traits<std::string::const_iterator>::value_type> > u8csregex_token_iterator;
|
|
#if !defined(SRELL_CPP20_CHAR8_ENABLED)
|
|
typedef u8ccregex_token_iterator u8cregex_token_iterator;
|
|
#endif
|
|
#if !defined(SRELL_CPP20_CHAR8_ENABLED) || SRELL_CPP20_CHAR8_ENABLED < 2
|
|
typedef u8csregex_token_iterator u8sregex_token_iterator;
|
|
#endif
|
|
|
|
#if defined(WCHAR_MAX)
|
|
#if WCHAR_MAX >= 0x10ffff
|
|
typedef wcregex_token_iterator u32wcregex_token_iterator;
|
|
typedef wsregex_token_iterator u32wsregex_token_iterator;
|
|
typedef u32wcregex_token_iterator u1632wcregex_token_iterator;
|
|
typedef u32wsregex_token_iterator u1632wsregex_token_iterator;
|
|
#elif WCHAR_MAX >= 0xffff
|
|
typedef regex_token_iterator<const wchar_t *, std::iterator_traits<const wchar_t *>::value_type, u16regex_traits<std::iterator_traits<const wchar_t *>::value_type> > u16wcregex_token_iterator;
|
|
typedef regex_token_iterator<std::wstring::const_iterator, std::iterator_traits<std::wstring::const_iterator>::value_type, u16regex_traits<std::iterator_traits<std::wstring::const_iterator>::value_type> > u16wsregex_token_iterator;
|
|
typedef u16wcregex_token_iterator u1632wcregex_token_iterator;
|
|
typedef u16wsregex_token_iterator u1632wsregex_token_iterator;
|
|
#endif
|
|
#endif
|
|
|
|
// ... "regex_token_iterator.hpp"]
|
|
|
|
} // namespace srell
|
|
|
|
// Remove the library's internal configuration macros so they do not leak into
// code that includes this header. #undef is ignored for a name that is not
// currently defined, so no #ifdef guards are needed.
#undef SRELL_NOEXCEPT
#undef SRELL_CPP20_CHAR8_ENABLED
#undef SRELL_CPP11_CHAR1632_ENABLED
#undef SRELL_CPP11_INITIALIZER_LIST_ENABLED
#undef SRELL_CPP11_MOVE_ENABLED
|
|
|
|
#endif // SRELL_REGEX_TEMPLATE_LIBRARY
|