Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Dependencies: MaximInterface
rapidjson/internal/regex.h@0:33d4e66780c0, 2017-02-24 (annotated)
- Committer:
- IanBenzMaxim
- Date:
- Fri Feb 24 11:23:12 2017 -0600
- Revision:
- 0:33d4e66780c0
Initial commit.
Who changed what in which revision?
User | Revision | Line number | New contents of line |
---|---|---|---|
IanBenzMaxim | 0:33d4e66780c0 | 1 | // Tencent is pleased to support the open source community by making RapidJSON available. |
IanBenzMaxim | 0:33d4e66780c0 | 2 | // |
IanBenzMaxim | 0:33d4e66780c0 | 3 | // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. |
IanBenzMaxim | 0:33d4e66780c0 | 4 | // |
IanBenzMaxim | 0:33d4e66780c0 | 5 | // Licensed under the MIT License (the "License"); you may not use this file except |
IanBenzMaxim | 0:33d4e66780c0 | 6 | // in compliance with the License. You may obtain a copy of the License at |
IanBenzMaxim | 0:33d4e66780c0 | 7 | // |
IanBenzMaxim | 0:33d4e66780c0 | 8 | // http://opensource.org/licenses/MIT |
IanBenzMaxim | 0:33d4e66780c0 | 9 | // |
IanBenzMaxim | 0:33d4e66780c0 | 10 | // Unless required by applicable law or agreed to in writing, software distributed |
IanBenzMaxim | 0:33d4e66780c0 | 11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR |
IanBenzMaxim | 0:33d4e66780c0 | 12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the |
IanBenzMaxim | 0:33d4e66780c0 | 13 | // specific language governing permissions and limitations under the License. |
IanBenzMaxim | 0:33d4e66780c0 | 14 | |
IanBenzMaxim | 0:33d4e66780c0 | 15 | #ifndef RAPIDJSON_INTERNAL_REGEX_H_ |
IanBenzMaxim | 0:33d4e66780c0 | 16 | #define RAPIDJSON_INTERNAL_REGEX_H_ |
IanBenzMaxim | 0:33d4e66780c0 | 17 | |
IanBenzMaxim | 0:33d4e66780c0 | 18 | #include "../allocators.h" |
IanBenzMaxim | 0:33d4e66780c0 | 19 | #include "../stream.h" |
IanBenzMaxim | 0:33d4e66780c0 | 20 | #include "stack.h" |
IanBenzMaxim | 0:33d4e66780c0 | 21 | |
IanBenzMaxim | 0:33d4e66780c0 | 22 | #ifdef __clang__ |
IanBenzMaxim | 0:33d4e66780c0 | 23 | RAPIDJSON_DIAG_PUSH |
IanBenzMaxim | 0:33d4e66780c0 | 24 | RAPIDJSON_DIAG_OFF(padded) |
IanBenzMaxim | 0:33d4e66780c0 | 25 | RAPIDJSON_DIAG_OFF(switch-enum) |
IanBenzMaxim | 0:33d4e66780c0 | 26 | RAPIDJSON_DIAG_OFF(implicit-fallthrough) |
IanBenzMaxim | 0:33d4e66780c0 | 27 | #endif |
IanBenzMaxim | 0:33d4e66780c0 | 28 | |
IanBenzMaxim | 0:33d4e66780c0 | 29 | #ifdef __GNUC__ |
IanBenzMaxim | 0:33d4e66780c0 | 30 | RAPIDJSON_DIAG_PUSH |
IanBenzMaxim | 0:33d4e66780c0 | 31 | RAPIDJSON_DIAG_OFF(effc++) |
IanBenzMaxim | 0:33d4e66780c0 | 32 | #endif |
IanBenzMaxim | 0:33d4e66780c0 | 33 | |
IanBenzMaxim | 0:33d4e66780c0 | 34 | #ifdef _MSC_VER |
IanBenzMaxim | 0:33d4e66780c0 | 35 | RAPIDJSON_DIAG_PUSH |
IanBenzMaxim | 0:33d4e66780c0 | 36 | RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated |
IanBenzMaxim | 0:33d4e66780c0 | 37 | #endif |
IanBenzMaxim | 0:33d4e66780c0 | 38 | |
IanBenzMaxim | 0:33d4e66780c0 | 39 | #ifndef RAPIDJSON_REGEX_VERBOSE |
IanBenzMaxim | 0:33d4e66780c0 | 40 | #define RAPIDJSON_REGEX_VERBOSE 0 |
IanBenzMaxim | 0:33d4e66780c0 | 41 | #endif |
IanBenzMaxim | 0:33d4e66780c0 | 42 | |
IanBenzMaxim | 0:33d4e66780c0 | 43 | RAPIDJSON_NAMESPACE_BEGIN |
IanBenzMaxim | 0:33d4e66780c0 | 44 | namespace internal { |
IanBenzMaxim | 0:33d4e66780c0 | 45 | |
IanBenzMaxim | 0:33d4e66780c0 | 46 | /////////////////////////////////////////////////////////////////////////////// |
IanBenzMaxim | 0:33d4e66780c0 | 47 | // DecodedStream |
IanBenzMaxim | 0:33d4e66780c0 | 48 | |
IanBenzMaxim | 0:33d4e66780c0 | 49 | template <typename SourceStream, typename Encoding> |
IanBenzMaxim | 0:33d4e66780c0 | 50 | class DecodedStream { |
IanBenzMaxim | 0:33d4e66780c0 | 51 | public: |
IanBenzMaxim | 0:33d4e66780c0 | 52 | DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); } |
IanBenzMaxim | 0:33d4e66780c0 | 53 | unsigned Peek() { return codepoint_; } |
IanBenzMaxim | 0:33d4e66780c0 | 54 | unsigned Take() { |
IanBenzMaxim | 0:33d4e66780c0 | 55 | unsigned c = codepoint_; |
IanBenzMaxim | 0:33d4e66780c0 | 56 | if (c) // No further decoding when '\0' |
IanBenzMaxim | 0:33d4e66780c0 | 57 | Decode(); |
IanBenzMaxim | 0:33d4e66780c0 | 58 | return c; |
IanBenzMaxim | 0:33d4e66780c0 | 59 | } |
IanBenzMaxim | 0:33d4e66780c0 | 60 | |
IanBenzMaxim | 0:33d4e66780c0 | 61 | private: |
IanBenzMaxim | 0:33d4e66780c0 | 62 | void Decode() { |
IanBenzMaxim | 0:33d4e66780c0 | 63 | if (!Encoding::Decode(ss_, &codepoint_)) |
IanBenzMaxim | 0:33d4e66780c0 | 64 | codepoint_ = 0; |
IanBenzMaxim | 0:33d4e66780c0 | 65 | } |
IanBenzMaxim | 0:33d4e66780c0 | 66 | |
IanBenzMaxim | 0:33d4e66780c0 | 67 | SourceStream& ss_; |
IanBenzMaxim | 0:33d4e66780c0 | 68 | unsigned codepoint_; |
IanBenzMaxim | 0:33d4e66780c0 | 69 | }; |
IanBenzMaxim | 0:33d4e66780c0 | 70 | |
IanBenzMaxim | 0:33d4e66780c0 | 71 | /////////////////////////////////////////////////////////////////////////////// |
IanBenzMaxim | 0:33d4e66780c0 | 72 | // GenericRegex |
IanBenzMaxim | 0:33d4e66780c0 | 73 | |
IanBenzMaxim | 0:33d4e66780c0 | 74 | static const SizeType kRegexInvalidState = ~SizeType(0); //!< Represents an invalid index in GenericRegex::State::out, out1 |
IanBenzMaxim | 0:33d4e66780c0 | 75 | static const SizeType kRegexInvalidRange = ~SizeType(0); |
IanBenzMaxim | 0:33d4e66780c0 | 76 | |
IanBenzMaxim | 0:33d4e66780c0 | 77 | template <typename Encoding, typename Allocator> |
IanBenzMaxim | 0:33d4e66780c0 | 78 | class GenericRegexSearch; |
IanBenzMaxim | 0:33d4e66780c0 | 79 | |
IanBenzMaxim | 0:33d4e66780c0 | 80 | //! Regular expression engine with subset of ECMAscript grammar. |
IanBenzMaxim | 0:33d4e66780c0 | 81 | /*! |
IanBenzMaxim | 0:33d4e66780c0 | 82 | Supported regular expression syntax: |
IanBenzMaxim | 0:33d4e66780c0 | 83 | - \c ab Concatenation |
IanBenzMaxim | 0:33d4e66780c0 | 84 | - \c a|b Alternation |
IanBenzMaxim | 0:33d4e66780c0 | 85 | - \c a? Zero or one |
IanBenzMaxim | 0:33d4e66780c0 | 86 | - \c a* Zero or more |
IanBenzMaxim | 0:33d4e66780c0 | 87 | - \c a+ One or more |
IanBenzMaxim | 0:33d4e66780c0 | 88 | - \c a{3} Exactly 3 times |
IanBenzMaxim | 0:33d4e66780c0 | 89 | - \c a{3,} At least 3 times |
IanBenzMaxim | 0:33d4e66780c0 | 90 | - \c a{3,5} 3 to 5 times |
IanBenzMaxim | 0:33d4e66780c0 | 91 | - \c (ab) Grouping |
IanBenzMaxim | 0:33d4e66780c0 | 92 | - \c ^a At the beginning |
IanBenzMaxim | 0:33d4e66780c0 | 93 | - \c a$ At the end |
IanBenzMaxim | 0:33d4e66780c0 | 94 | - \c . Any character |
IanBenzMaxim | 0:33d4e66780c0 | 95 | - \c [abc] Character classes |
IanBenzMaxim | 0:33d4e66780c0 | 96 | - \c [a-c] Character class range |
IanBenzMaxim | 0:33d4e66780c0 | 97 | - \c [a-z0-9_] Character class combination |
IanBenzMaxim | 0:33d4e66780c0 | 98 | - \c [^abc] Negated character classes |
IanBenzMaxim | 0:33d4e66780c0 | 99 | - \c [^a-c] Negated character class range |
IanBenzMaxim | 0:33d4e66780c0 | 100 | - \c [\b] Backspace (U+0008) |
IanBenzMaxim | 0:33d4e66780c0 | 101 | - \c \\| \\\\ ... Escape characters |
IanBenzMaxim | 0:33d4e66780c0 | 102 | - \c \\f Form feed (U+000C) |
IanBenzMaxim | 0:33d4e66780c0 | 103 | - \c \\n Line feed (U+000A) |
IanBenzMaxim | 0:33d4e66780c0 | 104 | - \c \\r Carriage return (U+000D) |
IanBenzMaxim | 0:33d4e66780c0 | 105 | - \c \\t Tab (U+0009) |
IanBenzMaxim | 0:33d4e66780c0 | 106 | - \c \\v Vertical tab (U+000B) |
IanBenzMaxim | 0:33d4e66780c0 | 107 | |
IanBenzMaxim | 0:33d4e66780c0 | 108 | \note This is a Thompson NFA engine, implemented with reference to |
IanBenzMaxim | 0:33d4e66780c0 | 109 | Cox, Russ. "Regular Expression Matching Can Be Simple And Fast (but is slow in Java, Perl, PHP, Python, Ruby,...).", |
IanBenzMaxim | 0:33d4e66780c0 | 110 | https://swtch.com/~rsc/regexp/regexp1.html |
IanBenzMaxim | 0:33d4e66780c0 | 111 | */ |
IanBenzMaxim | 0:33d4e66780c0 | 112 | template <typename Encoding, typename Allocator = CrtAllocator> |
IanBenzMaxim | 0:33d4e66780c0 | 113 | class GenericRegex { |
IanBenzMaxim | 0:33d4e66780c0 | 114 | public: |
IanBenzMaxim | 0:33d4e66780c0 | 115 | typedef Encoding EncodingType; |
IanBenzMaxim | 0:33d4e66780c0 | 116 | typedef typename Encoding::Ch Ch; |
IanBenzMaxim | 0:33d4e66780c0 | 117 | template <typename, typename> friend class GenericRegexSearch; |
IanBenzMaxim | 0:33d4e66780c0 | 118 | |
IanBenzMaxim | 0:33d4e66780c0 | 119 | GenericRegex(const Ch* source, Allocator* allocator = 0) : |
IanBenzMaxim | 0:33d4e66780c0 | 120 | states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(), |
IanBenzMaxim | 0:33d4e66780c0 | 121 | anchorBegin_(), anchorEnd_() |
IanBenzMaxim | 0:33d4e66780c0 | 122 | { |
IanBenzMaxim | 0:33d4e66780c0 | 123 | GenericStringStream<Encoding> ss(source); |
IanBenzMaxim | 0:33d4e66780c0 | 124 | DecodedStream<GenericStringStream<Encoding>, Encoding> ds(ss); |
IanBenzMaxim | 0:33d4e66780c0 | 125 | Parse(ds); |
IanBenzMaxim | 0:33d4e66780c0 | 126 | } |
IanBenzMaxim | 0:33d4e66780c0 | 127 | |
IanBenzMaxim | 0:33d4e66780c0 | 128 | ~GenericRegex() {} |
IanBenzMaxim | 0:33d4e66780c0 | 129 | |
IanBenzMaxim | 0:33d4e66780c0 | 130 | bool IsValid() const { |
IanBenzMaxim | 0:33d4e66780c0 | 131 | return root_ != kRegexInvalidState; |
IanBenzMaxim | 0:33d4e66780c0 | 132 | } |
IanBenzMaxim | 0:33d4e66780c0 | 133 | |
IanBenzMaxim | 0:33d4e66780c0 | 134 | private: |
IanBenzMaxim | 0:33d4e66780c0 | 135 | enum Operator { |
IanBenzMaxim | 0:33d4e66780c0 | 136 | kZeroOrOne, |
IanBenzMaxim | 0:33d4e66780c0 | 137 | kZeroOrMore, |
IanBenzMaxim | 0:33d4e66780c0 | 138 | kOneOrMore, |
IanBenzMaxim | 0:33d4e66780c0 | 139 | kConcatenation, |
IanBenzMaxim | 0:33d4e66780c0 | 140 | kAlternation, |
IanBenzMaxim | 0:33d4e66780c0 | 141 | kLeftParenthesis |
IanBenzMaxim | 0:33d4e66780c0 | 142 | }; |
IanBenzMaxim | 0:33d4e66780c0 | 143 | |
IanBenzMaxim | 0:33d4e66780c0 | 144 | static const unsigned kAnyCharacterClass = 0xFFFFFFFF; //!< For '.' |
IanBenzMaxim | 0:33d4e66780c0 | 145 | static const unsigned kRangeCharacterClass = 0xFFFFFFFE; |
IanBenzMaxim | 0:33d4e66780c0 | 146 | static const unsigned kRangeNegationFlag = 0x80000000; |
IanBenzMaxim | 0:33d4e66780c0 | 147 | |
IanBenzMaxim | 0:33d4e66780c0 | 148 | struct Range { |
IanBenzMaxim | 0:33d4e66780c0 | 149 | unsigned start; // |
IanBenzMaxim | 0:33d4e66780c0 | 150 | unsigned end; |
IanBenzMaxim | 0:33d4e66780c0 | 151 | SizeType next; |
IanBenzMaxim | 0:33d4e66780c0 | 152 | }; |
IanBenzMaxim | 0:33d4e66780c0 | 153 | |
IanBenzMaxim | 0:33d4e66780c0 | 154 | struct State { |
IanBenzMaxim | 0:33d4e66780c0 | 155 | SizeType out; //!< Equals to kInvalid for matching state |
IanBenzMaxim | 0:33d4e66780c0 | 156 | SizeType out1; //!< Equals to non-kInvalid for split |
IanBenzMaxim | 0:33d4e66780c0 | 157 | SizeType rangeStart; |
IanBenzMaxim | 0:33d4e66780c0 | 158 | unsigned codepoint; |
IanBenzMaxim | 0:33d4e66780c0 | 159 | }; |
IanBenzMaxim | 0:33d4e66780c0 | 160 | |
IanBenzMaxim | 0:33d4e66780c0 | 161 | struct Frag { |
IanBenzMaxim | 0:33d4e66780c0 | 162 | Frag(SizeType s, SizeType o, SizeType m) : start(s), out(o), minIndex(m) {} |
IanBenzMaxim | 0:33d4e66780c0 | 163 | SizeType start; |
IanBenzMaxim | 0:33d4e66780c0 | 164 | SizeType out; //!< link-list of all output states |
IanBenzMaxim | 0:33d4e66780c0 | 165 | SizeType minIndex; |
IanBenzMaxim | 0:33d4e66780c0 | 166 | }; |
IanBenzMaxim | 0:33d4e66780c0 | 167 | |
IanBenzMaxim | 0:33d4e66780c0 | 168 | State& GetState(SizeType index) { |
IanBenzMaxim | 0:33d4e66780c0 | 169 | RAPIDJSON_ASSERT(index < stateCount_); |
IanBenzMaxim | 0:33d4e66780c0 | 170 | return states_.template Bottom<State>()[index]; |
IanBenzMaxim | 0:33d4e66780c0 | 171 | } |
IanBenzMaxim | 0:33d4e66780c0 | 172 | |
IanBenzMaxim | 0:33d4e66780c0 | 173 | const State& GetState(SizeType index) const { |
IanBenzMaxim | 0:33d4e66780c0 | 174 | RAPIDJSON_ASSERT(index < stateCount_); |
IanBenzMaxim | 0:33d4e66780c0 | 175 | return states_.template Bottom<State>()[index]; |
IanBenzMaxim | 0:33d4e66780c0 | 176 | } |
IanBenzMaxim | 0:33d4e66780c0 | 177 | |
IanBenzMaxim | 0:33d4e66780c0 | 178 | Range& GetRange(SizeType index) { |
IanBenzMaxim | 0:33d4e66780c0 | 179 | RAPIDJSON_ASSERT(index < rangeCount_); |
IanBenzMaxim | 0:33d4e66780c0 | 180 | return ranges_.template Bottom<Range>()[index]; |
IanBenzMaxim | 0:33d4e66780c0 | 181 | } |
IanBenzMaxim | 0:33d4e66780c0 | 182 | |
IanBenzMaxim | 0:33d4e66780c0 | 183 | const Range& GetRange(SizeType index) const { |
IanBenzMaxim | 0:33d4e66780c0 | 184 | RAPIDJSON_ASSERT(index < rangeCount_); |
IanBenzMaxim | 0:33d4e66780c0 | 185 | return ranges_.template Bottom<Range>()[index]; |
IanBenzMaxim | 0:33d4e66780c0 | 186 | } |
IanBenzMaxim | 0:33d4e66780c0 | 187 | |
IanBenzMaxim | 0:33d4e66780c0 | 188 | template <typename InputStream> |
IanBenzMaxim | 0:33d4e66780c0 | 189 | void Parse(DecodedStream<InputStream, Encoding>& ds) { |
IanBenzMaxim | 0:33d4e66780c0 | 190 | Allocator allocator; |
IanBenzMaxim | 0:33d4e66780c0 | 191 | Stack<Allocator> operandStack(&allocator, 256); // Frag |
IanBenzMaxim | 0:33d4e66780c0 | 192 | Stack<Allocator> operatorStack(&allocator, 256); // Operator |
IanBenzMaxim | 0:33d4e66780c0 | 193 | Stack<Allocator> atomCountStack(&allocator, 256); // unsigned (Atom per parenthesis) |
IanBenzMaxim | 0:33d4e66780c0 | 194 | |
IanBenzMaxim | 0:33d4e66780c0 | 195 | *atomCountStack.template Push<unsigned>() = 0; |
IanBenzMaxim | 0:33d4e66780c0 | 196 | |
IanBenzMaxim | 0:33d4e66780c0 | 197 | unsigned codepoint; |
IanBenzMaxim | 0:33d4e66780c0 | 198 | while (ds.Peek() != 0) { |
IanBenzMaxim | 0:33d4e66780c0 | 199 | switch (codepoint = ds.Take()) { |
IanBenzMaxim | 0:33d4e66780c0 | 200 | case '^': |
IanBenzMaxim | 0:33d4e66780c0 | 201 | anchorBegin_ = true; |
IanBenzMaxim | 0:33d4e66780c0 | 202 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 203 | |
IanBenzMaxim | 0:33d4e66780c0 | 204 | case '$': |
IanBenzMaxim | 0:33d4e66780c0 | 205 | anchorEnd_ = true; |
IanBenzMaxim | 0:33d4e66780c0 | 206 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 207 | |
IanBenzMaxim | 0:33d4e66780c0 | 208 | case '|': |
IanBenzMaxim | 0:33d4e66780c0 | 209 | while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation) |
IanBenzMaxim | 0:33d4e66780c0 | 210 | if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1))) |
IanBenzMaxim | 0:33d4e66780c0 | 211 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 212 | *operatorStack.template Push<Operator>() = kAlternation; |
IanBenzMaxim | 0:33d4e66780c0 | 213 | *atomCountStack.template Top<unsigned>() = 0; |
IanBenzMaxim | 0:33d4e66780c0 | 214 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 215 | |
IanBenzMaxim | 0:33d4e66780c0 | 216 | case '(': |
IanBenzMaxim | 0:33d4e66780c0 | 217 | *operatorStack.template Push<Operator>() = kLeftParenthesis; |
IanBenzMaxim | 0:33d4e66780c0 | 218 | *atomCountStack.template Push<unsigned>() = 0; |
IanBenzMaxim | 0:33d4e66780c0 | 219 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 220 | |
IanBenzMaxim | 0:33d4e66780c0 | 221 | case ')': |
IanBenzMaxim | 0:33d4e66780c0 | 222 | while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis) |
IanBenzMaxim | 0:33d4e66780c0 | 223 | if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1))) |
IanBenzMaxim | 0:33d4e66780c0 | 224 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 225 | if (operatorStack.Empty()) |
IanBenzMaxim | 0:33d4e66780c0 | 226 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 227 | operatorStack.template Pop<Operator>(1); |
IanBenzMaxim | 0:33d4e66780c0 | 228 | atomCountStack.template Pop<unsigned>(1); |
IanBenzMaxim | 0:33d4e66780c0 | 229 | ImplicitConcatenation(atomCountStack, operatorStack); |
IanBenzMaxim | 0:33d4e66780c0 | 230 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 231 | |
IanBenzMaxim | 0:33d4e66780c0 | 232 | case '?': |
IanBenzMaxim | 0:33d4e66780c0 | 233 | if (!Eval(operandStack, kZeroOrOne)) |
IanBenzMaxim | 0:33d4e66780c0 | 234 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 235 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 236 | |
IanBenzMaxim | 0:33d4e66780c0 | 237 | case '*': |
IanBenzMaxim | 0:33d4e66780c0 | 238 | if (!Eval(operandStack, kZeroOrMore)) |
IanBenzMaxim | 0:33d4e66780c0 | 239 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 240 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 241 | |
IanBenzMaxim | 0:33d4e66780c0 | 242 | case '+': |
IanBenzMaxim | 0:33d4e66780c0 | 243 | if (!Eval(operandStack, kOneOrMore)) |
IanBenzMaxim | 0:33d4e66780c0 | 244 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 245 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 246 | |
IanBenzMaxim | 0:33d4e66780c0 | 247 | case '{': |
IanBenzMaxim | 0:33d4e66780c0 | 248 | { |
IanBenzMaxim | 0:33d4e66780c0 | 249 | unsigned n, m; |
IanBenzMaxim | 0:33d4e66780c0 | 250 | if (!ParseUnsigned(ds, &n)) |
IanBenzMaxim | 0:33d4e66780c0 | 251 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 252 | |
IanBenzMaxim | 0:33d4e66780c0 | 253 | if (ds.Peek() == ',') { |
IanBenzMaxim | 0:33d4e66780c0 | 254 | ds.Take(); |
IanBenzMaxim | 0:33d4e66780c0 | 255 | if (ds.Peek() == '}') |
IanBenzMaxim | 0:33d4e66780c0 | 256 | m = kInfinityQuantifier; |
IanBenzMaxim | 0:33d4e66780c0 | 257 | else if (!ParseUnsigned(ds, &m) || m < n) |
IanBenzMaxim | 0:33d4e66780c0 | 258 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 259 | } |
IanBenzMaxim | 0:33d4e66780c0 | 260 | else |
IanBenzMaxim | 0:33d4e66780c0 | 261 | m = n; |
IanBenzMaxim | 0:33d4e66780c0 | 262 | |
IanBenzMaxim | 0:33d4e66780c0 | 263 | if (!EvalQuantifier(operandStack, n, m) || ds.Peek() != '}') |
IanBenzMaxim | 0:33d4e66780c0 | 264 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 265 | ds.Take(); |
IanBenzMaxim | 0:33d4e66780c0 | 266 | } |
IanBenzMaxim | 0:33d4e66780c0 | 267 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 268 | |
IanBenzMaxim | 0:33d4e66780c0 | 269 | case '.': |
IanBenzMaxim | 0:33d4e66780c0 | 270 | PushOperand(operandStack, kAnyCharacterClass); |
IanBenzMaxim | 0:33d4e66780c0 | 271 | ImplicitConcatenation(atomCountStack, operatorStack); |
IanBenzMaxim | 0:33d4e66780c0 | 272 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 273 | |
IanBenzMaxim | 0:33d4e66780c0 | 274 | case '[': |
IanBenzMaxim | 0:33d4e66780c0 | 275 | { |
IanBenzMaxim | 0:33d4e66780c0 | 276 | SizeType range; |
IanBenzMaxim | 0:33d4e66780c0 | 277 | if (!ParseRange(ds, &range)) |
IanBenzMaxim | 0:33d4e66780c0 | 278 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 279 | SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass); |
IanBenzMaxim | 0:33d4e66780c0 | 280 | GetState(s).rangeStart = range; |
IanBenzMaxim | 0:33d4e66780c0 | 281 | *operandStack.template Push<Frag>() = Frag(s, s, s); |
IanBenzMaxim | 0:33d4e66780c0 | 282 | } |
IanBenzMaxim | 0:33d4e66780c0 | 283 | ImplicitConcatenation(atomCountStack, operatorStack); |
IanBenzMaxim | 0:33d4e66780c0 | 284 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 285 | |
IanBenzMaxim | 0:33d4e66780c0 | 286 | case '\\': // Escape character |
IanBenzMaxim | 0:33d4e66780c0 | 287 | if (!CharacterEscape(ds, &codepoint)) |
IanBenzMaxim | 0:33d4e66780c0 | 288 | return; // Unsupported escape character |
IanBenzMaxim | 0:33d4e66780c0 | 289 | // fall through to default |
IanBenzMaxim | 0:33d4e66780c0 | 290 | |
IanBenzMaxim | 0:33d4e66780c0 | 291 | default: // Pattern character |
IanBenzMaxim | 0:33d4e66780c0 | 292 | PushOperand(operandStack, codepoint); |
IanBenzMaxim | 0:33d4e66780c0 | 293 | ImplicitConcatenation(atomCountStack, operatorStack); |
IanBenzMaxim | 0:33d4e66780c0 | 294 | } |
IanBenzMaxim | 0:33d4e66780c0 | 295 | } |
IanBenzMaxim | 0:33d4e66780c0 | 296 | |
IanBenzMaxim | 0:33d4e66780c0 | 297 | while (!operatorStack.Empty()) |
IanBenzMaxim | 0:33d4e66780c0 | 298 | if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1))) |
IanBenzMaxim | 0:33d4e66780c0 | 299 | return; |
IanBenzMaxim | 0:33d4e66780c0 | 300 | |
IanBenzMaxim | 0:33d4e66780c0 | 301 | // Link the operand to matching state. |
IanBenzMaxim | 0:33d4e66780c0 | 302 | if (operandStack.GetSize() == sizeof(Frag)) { |
IanBenzMaxim | 0:33d4e66780c0 | 303 | Frag* e = operandStack.template Pop<Frag>(1); |
IanBenzMaxim | 0:33d4e66780c0 | 304 | Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0)); |
IanBenzMaxim | 0:33d4e66780c0 | 305 | root_ = e->start; |
IanBenzMaxim | 0:33d4e66780c0 | 306 | |
IanBenzMaxim | 0:33d4e66780c0 | 307 | #if RAPIDJSON_REGEX_VERBOSE |
IanBenzMaxim | 0:33d4e66780c0 | 308 | printf("root: %d\n", root_); |
IanBenzMaxim | 0:33d4e66780c0 | 309 | for (SizeType i = 0; i < stateCount_ ; i++) { |
IanBenzMaxim | 0:33d4e66780c0 | 310 | State& s = GetState(i); |
IanBenzMaxim | 0:33d4e66780c0 | 311 | printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint); |
IanBenzMaxim | 0:33d4e66780c0 | 312 | } |
IanBenzMaxim | 0:33d4e66780c0 | 313 | printf("\n"); |
IanBenzMaxim | 0:33d4e66780c0 | 314 | #endif |
IanBenzMaxim | 0:33d4e66780c0 | 315 | } |
IanBenzMaxim | 0:33d4e66780c0 | 316 | } |
IanBenzMaxim | 0:33d4e66780c0 | 317 | |
IanBenzMaxim | 0:33d4e66780c0 | 318 | SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) { |
IanBenzMaxim | 0:33d4e66780c0 | 319 | State* s = states_.template Push<State>(); |
IanBenzMaxim | 0:33d4e66780c0 | 320 | s->out = out; |
IanBenzMaxim | 0:33d4e66780c0 | 321 | s->out1 = out1; |
IanBenzMaxim | 0:33d4e66780c0 | 322 | s->codepoint = codepoint; |
IanBenzMaxim | 0:33d4e66780c0 | 323 | s->rangeStart = kRegexInvalidRange; |
IanBenzMaxim | 0:33d4e66780c0 | 324 | return stateCount_++; |
IanBenzMaxim | 0:33d4e66780c0 | 325 | } |
IanBenzMaxim | 0:33d4e66780c0 | 326 | |
IanBenzMaxim | 0:33d4e66780c0 | 327 | void PushOperand(Stack<Allocator>& operandStack, unsigned codepoint) { |
IanBenzMaxim | 0:33d4e66780c0 | 328 | SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); |
IanBenzMaxim | 0:33d4e66780c0 | 329 | *operandStack.template Push<Frag>() = Frag(s, s, s); |
IanBenzMaxim | 0:33d4e66780c0 | 330 | } |
IanBenzMaxim | 0:33d4e66780c0 | 331 | |
IanBenzMaxim | 0:33d4e66780c0 | 332 | void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) { |
IanBenzMaxim | 0:33d4e66780c0 | 333 | if (*atomCountStack.template Top<unsigned>()) |
IanBenzMaxim | 0:33d4e66780c0 | 334 | *operatorStack.template Push<Operator>() = kConcatenation; |
IanBenzMaxim | 0:33d4e66780c0 | 335 | (*atomCountStack.template Top<unsigned>())++; |
IanBenzMaxim | 0:33d4e66780c0 | 336 | } |
IanBenzMaxim | 0:33d4e66780c0 | 337 | |
IanBenzMaxim | 0:33d4e66780c0 | 338 | SizeType Append(SizeType l1, SizeType l2) { |
IanBenzMaxim | 0:33d4e66780c0 | 339 | SizeType old = l1; |
IanBenzMaxim | 0:33d4e66780c0 | 340 | while (GetState(l1).out != kRegexInvalidState) |
IanBenzMaxim | 0:33d4e66780c0 | 341 | l1 = GetState(l1).out; |
IanBenzMaxim | 0:33d4e66780c0 | 342 | GetState(l1).out = l2; |
IanBenzMaxim | 0:33d4e66780c0 | 343 | return old; |
IanBenzMaxim | 0:33d4e66780c0 | 344 | } |
IanBenzMaxim | 0:33d4e66780c0 | 345 | |
IanBenzMaxim | 0:33d4e66780c0 | 346 | void Patch(SizeType l, SizeType s) { |
IanBenzMaxim | 0:33d4e66780c0 | 347 | for (SizeType next; l != kRegexInvalidState; l = next) { |
IanBenzMaxim | 0:33d4e66780c0 | 348 | next = GetState(l).out; |
IanBenzMaxim | 0:33d4e66780c0 | 349 | GetState(l).out = s; |
IanBenzMaxim | 0:33d4e66780c0 | 350 | } |
IanBenzMaxim | 0:33d4e66780c0 | 351 | } |
IanBenzMaxim | 0:33d4e66780c0 | 352 | |
IanBenzMaxim | 0:33d4e66780c0 | 353 | bool Eval(Stack<Allocator>& operandStack, Operator op) { |
IanBenzMaxim | 0:33d4e66780c0 | 354 | switch (op) { |
IanBenzMaxim | 0:33d4e66780c0 | 355 | case kConcatenation: |
IanBenzMaxim | 0:33d4e66780c0 | 356 | RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag) * 2); |
IanBenzMaxim | 0:33d4e66780c0 | 357 | { |
IanBenzMaxim | 0:33d4e66780c0 | 358 | Frag e2 = *operandStack.template Pop<Frag>(1); |
IanBenzMaxim | 0:33d4e66780c0 | 359 | Frag e1 = *operandStack.template Pop<Frag>(1); |
IanBenzMaxim | 0:33d4e66780c0 | 360 | Patch(e1.out, e2.start); |
IanBenzMaxim | 0:33d4e66780c0 | 361 | *operandStack.template Push<Frag>() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex)); |
IanBenzMaxim | 0:33d4e66780c0 | 362 | } |
IanBenzMaxim | 0:33d4e66780c0 | 363 | return true; |
IanBenzMaxim | 0:33d4e66780c0 | 364 | |
IanBenzMaxim | 0:33d4e66780c0 | 365 | case kAlternation: |
IanBenzMaxim | 0:33d4e66780c0 | 366 | if (operandStack.GetSize() >= sizeof(Frag) * 2) { |
IanBenzMaxim | 0:33d4e66780c0 | 367 | Frag e2 = *operandStack.template Pop<Frag>(1); |
IanBenzMaxim | 0:33d4e66780c0 | 368 | Frag e1 = *operandStack.template Pop<Frag>(1); |
IanBenzMaxim | 0:33d4e66780c0 | 369 | SizeType s = NewState(e1.start, e2.start, 0); |
IanBenzMaxim | 0:33d4e66780c0 | 370 | *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex)); |
IanBenzMaxim | 0:33d4e66780c0 | 371 | return true; |
IanBenzMaxim | 0:33d4e66780c0 | 372 | } |
IanBenzMaxim | 0:33d4e66780c0 | 373 | return false; |
IanBenzMaxim | 0:33d4e66780c0 | 374 | |
IanBenzMaxim | 0:33d4e66780c0 | 375 | case kZeroOrOne: |
IanBenzMaxim | 0:33d4e66780c0 | 376 | if (operandStack.GetSize() >= sizeof(Frag)) { |
IanBenzMaxim | 0:33d4e66780c0 | 377 | Frag e = *operandStack.template Pop<Frag>(1); |
IanBenzMaxim | 0:33d4e66780c0 | 378 | SizeType s = NewState(kRegexInvalidState, e.start, 0); |
IanBenzMaxim | 0:33d4e66780c0 | 379 | *operandStack.template Push<Frag>() = Frag(s, Append(e.out, s), e.minIndex); |
IanBenzMaxim | 0:33d4e66780c0 | 380 | return true; |
IanBenzMaxim | 0:33d4e66780c0 | 381 | } |
IanBenzMaxim | 0:33d4e66780c0 | 382 | return false; |
IanBenzMaxim | 0:33d4e66780c0 | 383 | |
IanBenzMaxim | 0:33d4e66780c0 | 384 | case kZeroOrMore: |
IanBenzMaxim | 0:33d4e66780c0 | 385 | if (operandStack.GetSize() >= sizeof(Frag)) { |
IanBenzMaxim | 0:33d4e66780c0 | 386 | Frag e = *operandStack.template Pop<Frag>(1); |
IanBenzMaxim | 0:33d4e66780c0 | 387 | SizeType s = NewState(kRegexInvalidState, e.start, 0); |
IanBenzMaxim | 0:33d4e66780c0 | 388 | Patch(e.out, s); |
IanBenzMaxim | 0:33d4e66780c0 | 389 | *operandStack.template Push<Frag>() = Frag(s, s, e.minIndex); |
IanBenzMaxim | 0:33d4e66780c0 | 390 | return true; |
IanBenzMaxim | 0:33d4e66780c0 | 391 | } |
IanBenzMaxim | 0:33d4e66780c0 | 392 | return false; |
IanBenzMaxim | 0:33d4e66780c0 | 393 | |
IanBenzMaxim | 0:33d4e66780c0 | 394 | default: |
IanBenzMaxim | 0:33d4e66780c0 | 395 | RAPIDJSON_ASSERT(op == kOneOrMore); |
IanBenzMaxim | 0:33d4e66780c0 | 396 | if (operandStack.GetSize() >= sizeof(Frag)) { |
IanBenzMaxim | 0:33d4e66780c0 | 397 | Frag e = *operandStack.template Pop<Frag>(1); |
IanBenzMaxim | 0:33d4e66780c0 | 398 | SizeType s = NewState(kRegexInvalidState, e.start, 0); |
IanBenzMaxim | 0:33d4e66780c0 | 399 | Patch(e.out, s); |
IanBenzMaxim | 0:33d4e66780c0 | 400 | *operandStack.template Push<Frag>() = Frag(e.start, s, e.minIndex); |
IanBenzMaxim | 0:33d4e66780c0 | 401 | return true; |
IanBenzMaxim | 0:33d4e66780c0 | 402 | } |
IanBenzMaxim | 0:33d4e66780c0 | 403 | return false; |
IanBenzMaxim | 0:33d4e66780c0 | 404 | } |
IanBenzMaxim | 0:33d4e66780c0 | 405 | } |
IanBenzMaxim | 0:33d4e66780c0 | 406 | |
IanBenzMaxim | 0:33d4e66780c0 | 407 | bool EvalQuantifier(Stack<Allocator>& operandStack, unsigned n, unsigned m) { |
IanBenzMaxim | 0:33d4e66780c0 | 408 | RAPIDJSON_ASSERT(n <= m); |
IanBenzMaxim | 0:33d4e66780c0 | 409 | RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag)); |
IanBenzMaxim | 0:33d4e66780c0 | 410 | |
IanBenzMaxim | 0:33d4e66780c0 | 411 | if (n == 0) { |
IanBenzMaxim | 0:33d4e66780c0 | 412 | if (m == 0) // a{0} not support |
IanBenzMaxim | 0:33d4e66780c0 | 413 | return false; |
IanBenzMaxim | 0:33d4e66780c0 | 414 | else if (m == kInfinityQuantifier) |
IanBenzMaxim | 0:33d4e66780c0 | 415 | Eval(operandStack, kZeroOrMore); // a{0,} -> a* |
IanBenzMaxim | 0:33d4e66780c0 | 416 | else { |
IanBenzMaxim | 0:33d4e66780c0 | 417 | Eval(operandStack, kZeroOrOne); // a{0,5} -> a? |
IanBenzMaxim | 0:33d4e66780c0 | 418 | for (unsigned i = 0; i < m - 1; i++) |
IanBenzMaxim | 0:33d4e66780c0 | 419 | CloneTopOperand(operandStack); // a{0,5} -> a? a? a? a? a? |
IanBenzMaxim | 0:33d4e66780c0 | 420 | for (unsigned i = 0; i < m - 1; i++) |
IanBenzMaxim | 0:33d4e66780c0 | 421 | Eval(operandStack, kConcatenation); // a{0,5} -> a?a?a?a?a? |
IanBenzMaxim | 0:33d4e66780c0 | 422 | } |
IanBenzMaxim | 0:33d4e66780c0 | 423 | return true; |
IanBenzMaxim | 0:33d4e66780c0 | 424 | } |
IanBenzMaxim | 0:33d4e66780c0 | 425 | |
IanBenzMaxim | 0:33d4e66780c0 | 426 | for (unsigned i = 0; i < n - 1; i++) // a{3} -> a a a |
IanBenzMaxim | 0:33d4e66780c0 | 427 | CloneTopOperand(operandStack); |
IanBenzMaxim | 0:33d4e66780c0 | 428 | |
IanBenzMaxim | 0:33d4e66780c0 | 429 | if (m == kInfinityQuantifier) |
IanBenzMaxim | 0:33d4e66780c0 | 430 | Eval(operandStack, kOneOrMore); // a{3,} -> a a a+ |
IanBenzMaxim | 0:33d4e66780c0 | 431 | else if (m > n) { |
IanBenzMaxim | 0:33d4e66780c0 | 432 | CloneTopOperand(operandStack); // a{3,5} -> a a a a |
IanBenzMaxim | 0:33d4e66780c0 | 433 | Eval(operandStack, kZeroOrOne); // a{3,5} -> a a a a? |
IanBenzMaxim | 0:33d4e66780c0 | 434 | for (unsigned i = n; i < m - 1; i++) |
IanBenzMaxim | 0:33d4e66780c0 | 435 | CloneTopOperand(operandStack); // a{3,5} -> a a a a? a? |
IanBenzMaxim | 0:33d4e66780c0 | 436 | for (unsigned i = n; i < m; i++) |
IanBenzMaxim | 0:33d4e66780c0 | 437 | Eval(operandStack, kConcatenation); // a{3,5} -> a a aa?a? |
IanBenzMaxim | 0:33d4e66780c0 | 438 | } |
IanBenzMaxim | 0:33d4e66780c0 | 439 | |
IanBenzMaxim | 0:33d4e66780c0 | 440 | for (unsigned i = 0; i < n - 1; i++) |
IanBenzMaxim | 0:33d4e66780c0 | 441 | Eval(operandStack, kConcatenation); // a{3} -> aaa, a{3,} -> aaa+, a{3.5} -> aaaa?a? |
IanBenzMaxim | 0:33d4e66780c0 | 442 | |
IanBenzMaxim | 0:33d4e66780c0 | 443 | return true; |
IanBenzMaxim | 0:33d4e66780c0 | 444 | } |
IanBenzMaxim | 0:33d4e66780c0 | 445 | |
IanBenzMaxim | 0:33d4e66780c0 | 446 | static SizeType Min(SizeType a, SizeType b) { return a < b ? a : b; } |
IanBenzMaxim | 0:33d4e66780c0 | 447 | |
IanBenzMaxim | 0:33d4e66780c0 | 448 | void CloneTopOperand(Stack<Allocator>& operandStack) { |
IanBenzMaxim | 0:33d4e66780c0 | 449 | const Frag src = *operandStack.template Top<Frag>(); // Copy constructor to prevent invalidation |
IanBenzMaxim | 0:33d4e66780c0 | 450 | SizeType count = stateCount_ - src.minIndex; // Assumes top operand contains states in [src->minIndex, stateCount_) |
IanBenzMaxim | 0:33d4e66780c0 | 451 | State* s = states_.template Push<State>(count); |
IanBenzMaxim | 0:33d4e66780c0 | 452 | memcpy(s, &GetState(src.minIndex), count * sizeof(State)); |
IanBenzMaxim | 0:33d4e66780c0 | 453 | for (SizeType j = 0; j < count; j++) { |
IanBenzMaxim | 0:33d4e66780c0 | 454 | if (s[j].out != kRegexInvalidState) |
IanBenzMaxim | 0:33d4e66780c0 | 455 | s[j].out += count; |
IanBenzMaxim | 0:33d4e66780c0 | 456 | if (s[j].out1 != kRegexInvalidState) |
IanBenzMaxim | 0:33d4e66780c0 | 457 | s[j].out1 += count; |
IanBenzMaxim | 0:33d4e66780c0 | 458 | } |
IanBenzMaxim | 0:33d4e66780c0 | 459 | *operandStack.template Push<Frag>() = Frag(src.start + count, src.out + count, src.minIndex + count); |
IanBenzMaxim | 0:33d4e66780c0 | 460 | stateCount_ += count; |
IanBenzMaxim | 0:33d4e66780c0 | 461 | } |
IanBenzMaxim | 0:33d4e66780c0 | 462 | |
IanBenzMaxim | 0:33d4e66780c0 | 463 | template <typename InputStream> |
IanBenzMaxim | 0:33d4e66780c0 | 464 | bool ParseUnsigned(DecodedStream<InputStream, Encoding>& ds, unsigned* u) { |
IanBenzMaxim | 0:33d4e66780c0 | 465 | unsigned r = 0; |
IanBenzMaxim | 0:33d4e66780c0 | 466 | if (ds.Peek() < '0' || ds.Peek() > '9') |
IanBenzMaxim | 0:33d4e66780c0 | 467 | return false; |
IanBenzMaxim | 0:33d4e66780c0 | 468 | while (ds.Peek() >= '0' && ds.Peek() <= '9') { |
IanBenzMaxim | 0:33d4e66780c0 | 469 | if (r >= 429496729 && ds.Peek() > '5') // 2^32 - 1 = 4294967295 |
IanBenzMaxim | 0:33d4e66780c0 | 470 | return false; // overflow |
IanBenzMaxim | 0:33d4e66780c0 | 471 | r = r * 10 + (ds.Take() - '0'); |
IanBenzMaxim | 0:33d4e66780c0 | 472 | } |
IanBenzMaxim | 0:33d4e66780c0 | 473 | *u = r; |
IanBenzMaxim | 0:33d4e66780c0 | 474 | return true; |
IanBenzMaxim | 0:33d4e66780c0 | 475 | } |
IanBenzMaxim | 0:33d4e66780c0 | 476 | |
IanBenzMaxim | 0:33d4e66780c0 | 477 | template <typename InputStream> |
IanBenzMaxim | 0:33d4e66780c0 | 478 | bool ParseRange(DecodedStream<InputStream, Encoding>& ds, SizeType* range) { |
IanBenzMaxim | 0:33d4e66780c0 | 479 | bool isBegin = true; |
IanBenzMaxim | 0:33d4e66780c0 | 480 | bool negate = false; |
IanBenzMaxim | 0:33d4e66780c0 | 481 | int step = 0; |
IanBenzMaxim | 0:33d4e66780c0 | 482 | SizeType start = kRegexInvalidRange; |
IanBenzMaxim | 0:33d4e66780c0 | 483 | SizeType current = kRegexInvalidRange; |
IanBenzMaxim | 0:33d4e66780c0 | 484 | unsigned codepoint; |
IanBenzMaxim | 0:33d4e66780c0 | 485 | while ((codepoint = ds.Take()) != 0) { |
IanBenzMaxim | 0:33d4e66780c0 | 486 | if (isBegin) { |
IanBenzMaxim | 0:33d4e66780c0 | 487 | isBegin = false; |
IanBenzMaxim | 0:33d4e66780c0 | 488 | if (codepoint == '^') { |
IanBenzMaxim | 0:33d4e66780c0 | 489 | negate = true; |
IanBenzMaxim | 0:33d4e66780c0 | 490 | continue; |
IanBenzMaxim | 0:33d4e66780c0 | 491 | } |
IanBenzMaxim | 0:33d4e66780c0 | 492 | } |
IanBenzMaxim | 0:33d4e66780c0 | 493 | |
IanBenzMaxim | 0:33d4e66780c0 | 494 | switch (codepoint) { |
IanBenzMaxim | 0:33d4e66780c0 | 495 | case ']': |
IanBenzMaxim | 0:33d4e66780c0 | 496 | if (start == kRegexInvalidRange) |
IanBenzMaxim | 0:33d4e66780c0 | 497 | return false; // Error: nothing inside [] |
IanBenzMaxim | 0:33d4e66780c0 | 498 | if (step == 2) { // Add trailing '-' |
IanBenzMaxim | 0:33d4e66780c0 | 499 | SizeType r = NewRange('-'); |
IanBenzMaxim | 0:33d4e66780c0 | 500 | RAPIDJSON_ASSERT(current != kRegexInvalidRange); |
IanBenzMaxim | 0:33d4e66780c0 | 501 | GetRange(current).next = r; |
IanBenzMaxim | 0:33d4e66780c0 | 502 | } |
IanBenzMaxim | 0:33d4e66780c0 | 503 | if (negate) |
IanBenzMaxim | 0:33d4e66780c0 | 504 | GetRange(start).start |= kRangeNegationFlag; |
IanBenzMaxim | 0:33d4e66780c0 | 505 | *range = start; |
IanBenzMaxim | 0:33d4e66780c0 | 506 | return true; |
IanBenzMaxim | 0:33d4e66780c0 | 507 | |
IanBenzMaxim | 0:33d4e66780c0 | 508 | case '\\': |
IanBenzMaxim | 0:33d4e66780c0 | 509 | if (ds.Peek() == 'b') { |
IanBenzMaxim | 0:33d4e66780c0 | 510 | ds.Take(); |
IanBenzMaxim | 0:33d4e66780c0 | 511 | codepoint = 0x0008; // Escape backspace character |
IanBenzMaxim | 0:33d4e66780c0 | 512 | } |
IanBenzMaxim | 0:33d4e66780c0 | 513 | else if (!CharacterEscape(ds, &codepoint)) |
IanBenzMaxim | 0:33d4e66780c0 | 514 | return false; |
IanBenzMaxim | 0:33d4e66780c0 | 515 | // fall through to default |
IanBenzMaxim | 0:33d4e66780c0 | 516 | |
IanBenzMaxim | 0:33d4e66780c0 | 517 | default: |
IanBenzMaxim | 0:33d4e66780c0 | 518 | switch (step) { |
IanBenzMaxim | 0:33d4e66780c0 | 519 | case 1: |
IanBenzMaxim | 0:33d4e66780c0 | 520 | if (codepoint == '-') { |
IanBenzMaxim | 0:33d4e66780c0 | 521 | step++; |
IanBenzMaxim | 0:33d4e66780c0 | 522 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 523 | } |
IanBenzMaxim | 0:33d4e66780c0 | 524 | // fall through to step 0 for other characters |
IanBenzMaxim | 0:33d4e66780c0 | 525 | |
IanBenzMaxim | 0:33d4e66780c0 | 526 | case 0: |
IanBenzMaxim | 0:33d4e66780c0 | 527 | { |
IanBenzMaxim | 0:33d4e66780c0 | 528 | SizeType r = NewRange(codepoint); |
IanBenzMaxim | 0:33d4e66780c0 | 529 | if (current != kRegexInvalidRange) |
IanBenzMaxim | 0:33d4e66780c0 | 530 | GetRange(current).next = r; |
IanBenzMaxim | 0:33d4e66780c0 | 531 | if (start == kRegexInvalidRange) |
IanBenzMaxim | 0:33d4e66780c0 | 532 | start = r; |
IanBenzMaxim | 0:33d4e66780c0 | 533 | current = r; |
IanBenzMaxim | 0:33d4e66780c0 | 534 | } |
IanBenzMaxim | 0:33d4e66780c0 | 535 | step = 1; |
IanBenzMaxim | 0:33d4e66780c0 | 536 | break; |
IanBenzMaxim | 0:33d4e66780c0 | 537 | |
IanBenzMaxim | 0:33d4e66780c0 | 538 | default: |
IanBenzMaxim | 0:33d4e66780c0 | 539 | RAPIDJSON_ASSERT(step == 2); |
IanBenzMaxim | 0:33d4e66780c0 | 540 | GetRange(current).end = codepoint; |
IanBenzMaxim | 0:33d4e66780c0 | 541 | step = 0; |
IanBenzMaxim | 0:33d4e66780c0 | 542 | } |
IanBenzMaxim | 0:33d4e66780c0 | 543 | } |
IanBenzMaxim | 0:33d4e66780c0 | 544 | } |
IanBenzMaxim | 0:33d4e66780c0 | 545 | return false; |
IanBenzMaxim | 0:33d4e66780c0 | 546 | } |
IanBenzMaxim | 0:33d4e66780c0 | 547 | |
IanBenzMaxim | 0:33d4e66780c0 | 548 | SizeType NewRange(unsigned codepoint) { |
IanBenzMaxim | 0:33d4e66780c0 | 549 | Range* r = ranges_.template Push<Range>(); |
IanBenzMaxim | 0:33d4e66780c0 | 550 | r->start = r->end = codepoint; |
IanBenzMaxim | 0:33d4e66780c0 | 551 | r->next = kRegexInvalidRange; |
IanBenzMaxim | 0:33d4e66780c0 | 552 | return rangeCount_++; |
IanBenzMaxim | 0:33d4e66780c0 | 553 | } |
IanBenzMaxim | 0:33d4e66780c0 | 554 | |
IanBenzMaxim | 0:33d4e66780c0 | 555 | template <typename InputStream> |
IanBenzMaxim | 0:33d4e66780c0 | 556 | bool CharacterEscape(DecodedStream<InputStream, Encoding>& ds, unsigned* escapedCodepoint) { |
IanBenzMaxim | 0:33d4e66780c0 | 557 | unsigned codepoint; |
IanBenzMaxim | 0:33d4e66780c0 | 558 | switch (codepoint = ds.Take()) { |
IanBenzMaxim | 0:33d4e66780c0 | 559 | case '^': |
IanBenzMaxim | 0:33d4e66780c0 | 560 | case '$': |
IanBenzMaxim | 0:33d4e66780c0 | 561 | case '|': |
IanBenzMaxim | 0:33d4e66780c0 | 562 | case '(': |
IanBenzMaxim | 0:33d4e66780c0 | 563 | case ')': |
IanBenzMaxim | 0:33d4e66780c0 | 564 | case '?': |
IanBenzMaxim | 0:33d4e66780c0 | 565 | case '*': |
IanBenzMaxim | 0:33d4e66780c0 | 566 | case '+': |
IanBenzMaxim | 0:33d4e66780c0 | 567 | case '.': |
IanBenzMaxim | 0:33d4e66780c0 | 568 | case '[': |
IanBenzMaxim | 0:33d4e66780c0 | 569 | case ']': |
IanBenzMaxim | 0:33d4e66780c0 | 570 | case '{': |
IanBenzMaxim | 0:33d4e66780c0 | 571 | case '}': |
IanBenzMaxim | 0:33d4e66780c0 | 572 | case '\\': |
IanBenzMaxim | 0:33d4e66780c0 | 573 | *escapedCodepoint = codepoint; return true; |
IanBenzMaxim | 0:33d4e66780c0 | 574 | case 'f': *escapedCodepoint = 0x000C; return true; |
IanBenzMaxim | 0:33d4e66780c0 | 575 | case 'n': *escapedCodepoint = 0x000A; return true; |
IanBenzMaxim | 0:33d4e66780c0 | 576 | case 'r': *escapedCodepoint = 0x000D; return true; |
IanBenzMaxim | 0:33d4e66780c0 | 577 | case 't': *escapedCodepoint = 0x0009; return true; |
IanBenzMaxim | 0:33d4e66780c0 | 578 | case 'v': *escapedCodepoint = 0x000B; return true; |
IanBenzMaxim | 0:33d4e66780c0 | 579 | default: |
IanBenzMaxim | 0:33d4e66780c0 | 580 | return false; // Unsupported escape character |
IanBenzMaxim | 0:33d4e66780c0 | 581 | } |
IanBenzMaxim | 0:33d4e66780c0 | 582 | } |
IanBenzMaxim | 0:33d4e66780c0 | 583 | |
IanBenzMaxim | 0:33d4e66780c0 | 584 | Stack<Allocator> states_; |
IanBenzMaxim | 0:33d4e66780c0 | 585 | Stack<Allocator> ranges_; |
IanBenzMaxim | 0:33d4e66780c0 | 586 | SizeType root_; |
IanBenzMaxim | 0:33d4e66780c0 | 587 | SizeType stateCount_; |
IanBenzMaxim | 0:33d4e66780c0 | 588 | SizeType rangeCount_; |
IanBenzMaxim | 0:33d4e66780c0 | 589 | |
IanBenzMaxim | 0:33d4e66780c0 | 590 | static const unsigned kInfinityQuantifier = ~0u; |
IanBenzMaxim | 0:33d4e66780c0 | 591 | |
IanBenzMaxim | 0:33d4e66780c0 | 592 | // For SearchWithAnchoring() |
IanBenzMaxim | 0:33d4e66780c0 | 593 | bool anchorBegin_; |
IanBenzMaxim | 0:33d4e66780c0 | 594 | bool anchorEnd_; |
IanBenzMaxim | 0:33d4e66780c0 | 595 | }; |
IanBenzMaxim | 0:33d4e66780c0 | 596 | |
IanBenzMaxim | 0:33d4e66780c0 | 597 | template <typename RegexType, typename Allocator = CrtAllocator> |
IanBenzMaxim | 0:33d4e66780c0 | 598 | class GenericRegexSearch { |
IanBenzMaxim | 0:33d4e66780c0 | 599 | public: |
IanBenzMaxim | 0:33d4e66780c0 | 600 | typedef typename RegexType::EncodingType Encoding; |
IanBenzMaxim | 0:33d4e66780c0 | 601 | typedef typename Encoding::Ch Ch; |
IanBenzMaxim | 0:33d4e66780c0 | 602 | |
IanBenzMaxim | 0:33d4e66780c0 | 603 | GenericRegexSearch(const RegexType& regex, Allocator* allocator = 0) : |
IanBenzMaxim | 0:33d4e66780c0 | 604 | regex_(regex), allocator_(allocator), ownAllocator_(0), |
IanBenzMaxim | 0:33d4e66780c0 | 605 | state0_(allocator, 0), state1_(allocator, 0), stateSet_() |
IanBenzMaxim | 0:33d4e66780c0 | 606 | { |
IanBenzMaxim | 0:33d4e66780c0 | 607 | RAPIDJSON_ASSERT(regex_.IsValid()); |
IanBenzMaxim | 0:33d4e66780c0 | 608 | if (!allocator_) |
IanBenzMaxim | 0:33d4e66780c0 | 609 | ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator()); |
IanBenzMaxim | 0:33d4e66780c0 | 610 | stateSet_ = static_cast<unsigned*>(allocator_->Malloc(GetStateSetSize())); |
IanBenzMaxim | 0:33d4e66780c0 | 611 | state0_.template Reserve<SizeType>(regex_.stateCount_); |
IanBenzMaxim | 0:33d4e66780c0 | 612 | state1_.template Reserve<SizeType>(regex_.stateCount_); |
IanBenzMaxim | 0:33d4e66780c0 | 613 | } |
IanBenzMaxim | 0:33d4e66780c0 | 614 | |
IanBenzMaxim | 0:33d4e66780c0 | 615 | ~GenericRegexSearch() { |
IanBenzMaxim | 0:33d4e66780c0 | 616 | Allocator::Free(stateSet_); |
IanBenzMaxim | 0:33d4e66780c0 | 617 | RAPIDJSON_DELETE(ownAllocator_); |
IanBenzMaxim | 0:33d4e66780c0 | 618 | } |
IanBenzMaxim | 0:33d4e66780c0 | 619 | |
IanBenzMaxim | 0:33d4e66780c0 | 620 | template <typename InputStream> |
IanBenzMaxim | 0:33d4e66780c0 | 621 | bool Match(InputStream& is) { |
IanBenzMaxim | 0:33d4e66780c0 | 622 | return SearchWithAnchoring(is, true, true); |
IanBenzMaxim | 0:33d4e66780c0 | 623 | } |
IanBenzMaxim | 0:33d4e66780c0 | 624 | |
IanBenzMaxim | 0:33d4e66780c0 | 625 | bool Match(const Ch* s) { |
IanBenzMaxim | 0:33d4e66780c0 | 626 | GenericStringStream<Encoding> is(s); |
IanBenzMaxim | 0:33d4e66780c0 | 627 | return Match(is); |
IanBenzMaxim | 0:33d4e66780c0 | 628 | } |
IanBenzMaxim | 0:33d4e66780c0 | 629 | |
IanBenzMaxim | 0:33d4e66780c0 | 630 | template <typename InputStream> |
IanBenzMaxim | 0:33d4e66780c0 | 631 | bool Search(InputStream& is) { |
IanBenzMaxim | 0:33d4e66780c0 | 632 | return SearchWithAnchoring(is, regex_.anchorBegin_, regex_.anchorEnd_); |
IanBenzMaxim | 0:33d4e66780c0 | 633 | } |
IanBenzMaxim | 0:33d4e66780c0 | 634 | |
IanBenzMaxim | 0:33d4e66780c0 | 635 | bool Search(const Ch* s) { |
IanBenzMaxim | 0:33d4e66780c0 | 636 | GenericStringStream<Encoding> is(s); |
IanBenzMaxim | 0:33d4e66780c0 | 637 | return Search(is); |
IanBenzMaxim | 0:33d4e66780c0 | 638 | } |
IanBenzMaxim | 0:33d4e66780c0 | 639 | |
IanBenzMaxim | 0:33d4e66780c0 | 640 | private: |
IanBenzMaxim | 0:33d4e66780c0 | 641 | typedef typename RegexType::State State; |
IanBenzMaxim | 0:33d4e66780c0 | 642 | typedef typename RegexType::Range Range; |
IanBenzMaxim | 0:33d4e66780c0 | 643 | |
IanBenzMaxim | 0:33d4e66780c0 | 644 | template <typename InputStream> |
IanBenzMaxim | 0:33d4e66780c0 | 645 | bool SearchWithAnchoring(InputStream& is, bool anchorBegin, bool anchorEnd) { |
IanBenzMaxim | 0:33d4e66780c0 | 646 | DecodedStream<InputStream, Encoding> ds(is); |
IanBenzMaxim | 0:33d4e66780c0 | 647 | |
IanBenzMaxim | 0:33d4e66780c0 | 648 | state0_.Clear(); |
IanBenzMaxim | 0:33d4e66780c0 | 649 | Stack<Allocator> *current = &state0_, *next = &state1_; |
IanBenzMaxim | 0:33d4e66780c0 | 650 | const size_t stateSetSize = GetStateSetSize(); |
IanBenzMaxim | 0:33d4e66780c0 | 651 | std::memset(stateSet_, 0, stateSetSize); |
IanBenzMaxim | 0:33d4e66780c0 | 652 | |
IanBenzMaxim | 0:33d4e66780c0 | 653 | bool matched = AddState(*current, regex_.root_); |
IanBenzMaxim | 0:33d4e66780c0 | 654 | unsigned codepoint; |
IanBenzMaxim | 0:33d4e66780c0 | 655 | while (!current->Empty() && (codepoint = ds.Take()) != 0) { |
IanBenzMaxim | 0:33d4e66780c0 | 656 | std::memset(stateSet_, 0, stateSetSize); |
IanBenzMaxim | 0:33d4e66780c0 | 657 | next->Clear(); |
IanBenzMaxim | 0:33d4e66780c0 | 658 | matched = false; |
IanBenzMaxim | 0:33d4e66780c0 | 659 | for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) { |
IanBenzMaxim | 0:33d4e66780c0 | 660 | const State& sr = regex_.GetState(*s); |
IanBenzMaxim | 0:33d4e66780c0 | 661 | if (sr.codepoint == codepoint || |
IanBenzMaxim | 0:33d4e66780c0 | 662 | sr.codepoint == RegexType::kAnyCharacterClass || |
IanBenzMaxim | 0:33d4e66780c0 | 663 | (sr.codepoint == RegexType::kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint))) |
IanBenzMaxim | 0:33d4e66780c0 | 664 | { |
IanBenzMaxim | 0:33d4e66780c0 | 665 | matched = AddState(*next, sr.out) || matched; |
IanBenzMaxim | 0:33d4e66780c0 | 666 | if (!anchorEnd && matched) |
IanBenzMaxim | 0:33d4e66780c0 | 667 | return true; |
IanBenzMaxim | 0:33d4e66780c0 | 668 | } |
IanBenzMaxim | 0:33d4e66780c0 | 669 | if (!anchorBegin) |
IanBenzMaxim | 0:33d4e66780c0 | 670 | AddState(*next, regex_.root_); |
IanBenzMaxim | 0:33d4e66780c0 | 671 | } |
IanBenzMaxim | 0:33d4e66780c0 | 672 | internal::Swap(current, next); |
IanBenzMaxim | 0:33d4e66780c0 | 673 | } |
IanBenzMaxim | 0:33d4e66780c0 | 674 | |
IanBenzMaxim | 0:33d4e66780c0 | 675 | return matched; |
IanBenzMaxim | 0:33d4e66780c0 | 676 | } |
IanBenzMaxim | 0:33d4e66780c0 | 677 | |
IanBenzMaxim | 0:33d4e66780c0 | 678 | size_t GetStateSetSize() const { |
IanBenzMaxim | 0:33d4e66780c0 | 679 | return (regex_.stateCount_ + 31) / 32 * 4; |
IanBenzMaxim | 0:33d4e66780c0 | 680 | } |
IanBenzMaxim | 0:33d4e66780c0 | 681 | |
IanBenzMaxim | 0:33d4e66780c0 | 682 | // Return whether the added states is a match state |
IanBenzMaxim | 0:33d4e66780c0 | 683 | bool AddState(Stack<Allocator>& l, SizeType index) { |
IanBenzMaxim | 0:33d4e66780c0 | 684 | RAPIDJSON_ASSERT(index != kRegexInvalidState); |
IanBenzMaxim | 0:33d4e66780c0 | 685 | |
IanBenzMaxim | 0:33d4e66780c0 | 686 | const State& s = regex_.GetState(index); |
IanBenzMaxim | 0:33d4e66780c0 | 687 | if (s.out1 != kRegexInvalidState) { // Split |
IanBenzMaxim | 0:33d4e66780c0 | 688 | bool matched = AddState(l, s.out); |
IanBenzMaxim | 0:33d4e66780c0 | 689 | return AddState(l, s.out1) || matched; |
IanBenzMaxim | 0:33d4e66780c0 | 690 | } |
IanBenzMaxim | 0:33d4e66780c0 | 691 | else if (!(stateSet_[index >> 5] & (1 << (index & 31)))) { |
IanBenzMaxim | 0:33d4e66780c0 | 692 | stateSet_[index >> 5] |= (1 << (index & 31)); |
IanBenzMaxim | 0:33d4e66780c0 | 693 | *l.template PushUnsafe<SizeType>() = index; |
IanBenzMaxim | 0:33d4e66780c0 | 694 | } |
IanBenzMaxim | 0:33d4e66780c0 | 695 | return s.out == kRegexInvalidState; // by using PushUnsafe() above, we can ensure s is not validated due to reallocation. |
IanBenzMaxim | 0:33d4e66780c0 | 696 | } |
IanBenzMaxim | 0:33d4e66780c0 | 697 | |
IanBenzMaxim | 0:33d4e66780c0 | 698 | bool MatchRange(SizeType rangeIndex, unsigned codepoint) const { |
IanBenzMaxim | 0:33d4e66780c0 | 699 | bool yes = (regex_.GetRange(rangeIndex).start & RegexType::kRangeNegationFlag) == 0; |
IanBenzMaxim | 0:33d4e66780c0 | 700 | while (rangeIndex != kRegexInvalidRange) { |
IanBenzMaxim | 0:33d4e66780c0 | 701 | const Range& r = regex_.GetRange(rangeIndex); |
IanBenzMaxim | 0:33d4e66780c0 | 702 | if (codepoint >= (r.start & ~RegexType::kRangeNegationFlag) && codepoint <= r.end) |
IanBenzMaxim | 0:33d4e66780c0 | 703 | return yes; |
IanBenzMaxim | 0:33d4e66780c0 | 704 | rangeIndex = r.next; |
IanBenzMaxim | 0:33d4e66780c0 | 705 | } |
IanBenzMaxim | 0:33d4e66780c0 | 706 | return !yes; |
IanBenzMaxim | 0:33d4e66780c0 | 707 | } |
IanBenzMaxim | 0:33d4e66780c0 | 708 | |
IanBenzMaxim | 0:33d4e66780c0 | 709 | const RegexType& regex_; |
IanBenzMaxim | 0:33d4e66780c0 | 710 | Allocator* allocator_; |
IanBenzMaxim | 0:33d4e66780c0 | 711 | Allocator* ownAllocator_; |
IanBenzMaxim | 0:33d4e66780c0 | 712 | Stack<Allocator> state0_; |
IanBenzMaxim | 0:33d4e66780c0 | 713 | Stack<Allocator> state1_; |
IanBenzMaxim | 0:33d4e66780c0 | 714 | uint32_t* stateSet_; |
IanBenzMaxim | 0:33d4e66780c0 | 715 | }; |
IanBenzMaxim | 0:33d4e66780c0 | 716 | |
IanBenzMaxim | 0:33d4e66780c0 | 717 | typedef GenericRegex<UTF8<> > Regex; |
IanBenzMaxim | 0:33d4e66780c0 | 718 | typedef GenericRegexSearch<Regex> RegexSearch; |
IanBenzMaxim | 0:33d4e66780c0 | 719 | |
IanBenzMaxim | 0:33d4e66780c0 | 720 | } // namespace internal |
IanBenzMaxim | 0:33d4e66780c0 | 721 | RAPIDJSON_NAMESPACE_END |
IanBenzMaxim | 0:33d4e66780c0 | 722 | |
IanBenzMaxim | 0:33d4e66780c0 | 723 | #ifdef __clang__ |
IanBenzMaxim | 0:33d4e66780c0 | 724 | RAPIDJSON_DIAG_POP |
IanBenzMaxim | 0:33d4e66780c0 | 725 | #endif |
IanBenzMaxim | 0:33d4e66780c0 | 726 | |
IanBenzMaxim | 0:33d4e66780c0 | 727 | #ifdef _MSC_VER |
IanBenzMaxim | 0:33d4e66780c0 | 728 | RAPIDJSON_DIAG_POP |
IanBenzMaxim | 0:33d4e66780c0 | 729 | #endif |
IanBenzMaxim | 0:33d4e66780c0 | 730 | |
IanBenzMaxim | 0:33d4e66780c0 | 731 | #endif // RAPIDJSON_INTERNAL_REGEX_H_ |