YoungSik Won
/
NUCLEO_F103_slre_test
SLRE(Super Light Regular Expression library) Unit Test Code.
slre/slre.c@0:3ca3835f816e, 2016-07-28 (annotated)
- Committer:
- monpetit
- Date:
- Thu Jul 28 08:10:30 2016 +0000
- Revision:
- 0:3ca3835f816e
initial commit: unit tests are passed.
Who changed what in which revision?
User | Revision | Line number | New contents of line |
---|---|---|---|
monpetit | 0:3ca3835f816e | 1 | /* |
monpetit | 0:3ca3835f816e | 2 | * Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com> |
monpetit | 0:3ca3835f816e | 3 | * Copyright (c) 2013 Cesanta Software Limited |
monpetit | 0:3ca3835f816e | 4 | * All rights reserved |
monpetit | 0:3ca3835f816e | 5 | * |
monpetit | 0:3ca3835f816e | 6 | * This library is dual-licensed: you can redistribute it and/or modify |
monpetit | 0:3ca3835f816e | 7 | * it under the terms of the GNU General Public License version 2 as |
monpetit | 0:3ca3835f816e | 8 | * published by the Free Software Foundation. For the terms of this |
monpetit | 0:3ca3835f816e | 9 | * license, see <http://www.gnu.org/licenses/>. |
monpetit | 0:3ca3835f816e | 10 | * |
monpetit | 0:3ca3835f816e | 11 | * You are free to use this library under the terms of the GNU General |
monpetit | 0:3ca3835f816e | 12 | * Public License, but WITHOUT ANY WARRANTY; without even the implied |
monpetit | 0:3ca3835f816e | 13 | * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
monpetit | 0:3ca3835f816e | 14 | * See the GNU General Public License for more details. |
monpetit | 0:3ca3835f816e | 15 | * |
monpetit | 0:3ca3835f816e | 16 | * Alternatively, you can license this library under a commercial |
monpetit | 0:3ca3835f816e | 17 | * license, as set out in <http://cesanta.com/products.html>. |
monpetit | 0:3ca3835f816e | 18 | */ |
monpetit | 0:3ca3835f816e | 19 | |
monpetit | 0:3ca3835f816e | 20 | #include <stdio.h> |
monpetit | 0:3ca3835f816e | 21 | #include <ctype.h> |
monpetit | 0:3ca3835f816e | 22 | #include <string.h> |
monpetit | 0:3ca3835f816e | 23 | |
monpetit | 0:3ca3835f816e | 24 | #include "slre.h" |
monpetit | 0:3ca3835f816e | 25 | |
monpetit | 0:3ca3835f816e | 26 | #define MAX_BRANCHES 100 |
monpetit | 0:3ca3835f816e | 27 | #define MAX_BRACKETS 100 |
monpetit | 0:3ca3835f816e | 28 | #define FAIL_IF(condition, error_code) if (condition) return (error_code) |
monpetit | 0:3ca3835f816e | 29 | |
monpetit | 0:3ca3835f816e | 30 | #ifndef ARRAY_SIZE |
monpetit | 0:3ca3835f816e | 31 | #define ARRAY_SIZE(ar) (sizeof(ar) / sizeof((ar)[0])) |
monpetit | 0:3ca3835f816e | 32 | #endif |
monpetit | 0:3ca3835f816e | 33 | |
monpetit | 0:3ca3835f816e | 34 | #ifdef SLRE_DEBUG |
monpetit | 0:3ca3835f816e | 35 | #define DBG(x) printf x |
monpetit | 0:3ca3835f816e | 36 | #else |
monpetit | 0:3ca3835f816e | 37 | #define DBG(x) |
monpetit | 0:3ca3835f816e | 38 | #endif |
monpetit | 0:3ca3835f816e | 39 | |
monpetit | 0:3ca3835f816e | 40 | struct bracket_pair { |
monpetit | 0:3ca3835f816e | 41 | const char *ptr; /* Points to the first char after '(' in regex */ |
monpetit | 0:3ca3835f816e | 42 | int len; /* Length of the text between '(' and ')' */ |
monpetit | 0:3ca3835f816e | 43 | int branches; /* Index in the branches array for this pair */ |
monpetit | 0:3ca3835f816e | 44 | int num_branches; /* Number of '|' in this bracket pair */ |
monpetit | 0:3ca3835f816e | 45 | }; |
monpetit | 0:3ca3835f816e | 46 | |
monpetit | 0:3ca3835f816e | 47 | struct branch { |
monpetit | 0:3ca3835f816e | 48 | int bracket_index; /* index for 'struct bracket_pair brackets' */ |
monpetit | 0:3ca3835f816e | 49 | /* array defined below */ |
monpetit | 0:3ca3835f816e | 50 | const char *schlong; /* points to the '|' character in the regex */ |
monpetit | 0:3ca3835f816e | 51 | }; |
monpetit | 0:3ca3835f816e | 52 | |
monpetit | 0:3ca3835f816e | 53 | struct regex_info { |
monpetit | 0:3ca3835f816e | 54 | /* |
monpetit | 0:3ca3835f816e | 55 | * Describes all bracket pairs in the regular expression. |
monpetit | 0:3ca3835f816e | 56 | * First entry is always present, and grabs the whole regex. |
monpetit | 0:3ca3835f816e | 57 | */ |
monpetit | 0:3ca3835f816e | 58 | struct bracket_pair brackets[MAX_BRACKETS]; |
monpetit | 0:3ca3835f816e | 59 | int num_brackets; |
monpetit | 0:3ca3835f816e | 60 | |
monpetit | 0:3ca3835f816e | 61 | /* |
monpetit | 0:3ca3835f816e | 62 | * Describes alternations ('|' operators) in the regular expression. |
monpetit | 0:3ca3835f816e | 63 | * Each branch falls into a specific branch pair. |
monpetit | 0:3ca3835f816e | 64 | */ |
monpetit | 0:3ca3835f816e | 65 | struct branch branches[MAX_BRANCHES]; |
monpetit | 0:3ca3835f816e | 66 | int num_branches; |
monpetit | 0:3ca3835f816e | 67 | |
monpetit | 0:3ca3835f816e | 68 | /* Array of captures provided by the user */ |
monpetit | 0:3ca3835f816e | 69 | struct slre_cap *caps; |
monpetit | 0:3ca3835f816e | 70 | int num_caps; |
monpetit | 0:3ca3835f816e | 71 | |
monpetit | 0:3ca3835f816e | 72 | /* E.g. SLRE_IGNORE_CASE. See enum below */ |
monpetit | 0:3ca3835f816e | 73 | int flags; |
monpetit | 0:3ca3835f816e | 74 | }; |
monpetit | 0:3ca3835f816e | 75 | |
monpetit | 0:3ca3835f816e | 76 | static int is_metacharacter(const unsigned char *s) { |
monpetit | 0:3ca3835f816e | 77 | static const char *metacharacters = "^$().[]*+?|\\Ssdbfnrtv"; |
monpetit | 0:3ca3835f816e | 78 | return strchr(metacharacters, *s) != NULL; |
monpetit | 0:3ca3835f816e | 79 | } |
monpetit | 0:3ca3835f816e | 80 | |
monpetit | 0:3ca3835f816e | 81 | static int op_len(const char *re) { |
monpetit | 0:3ca3835f816e | 82 | return re[0] == '\\' && re[1] == 'x' ? 4 : re[0] == '\\' ? 2 : 1; |
monpetit | 0:3ca3835f816e | 83 | } |
monpetit | 0:3ca3835f816e | 84 | |
monpetit | 0:3ca3835f816e | 85 | static int set_len(const char *re, int re_len) { |
monpetit | 0:3ca3835f816e | 86 | int len = 0; |
monpetit | 0:3ca3835f816e | 87 | |
monpetit | 0:3ca3835f816e | 88 | while (len < re_len && re[len] != ']') { |
monpetit | 0:3ca3835f816e | 89 | len += op_len(re + len); |
monpetit | 0:3ca3835f816e | 90 | } |
monpetit | 0:3ca3835f816e | 91 | |
monpetit | 0:3ca3835f816e | 92 | return len <= re_len ? len + 1 : -1; |
monpetit | 0:3ca3835f816e | 93 | } |
monpetit | 0:3ca3835f816e | 94 | |
monpetit | 0:3ca3835f816e | 95 | static int get_op_len(const char *re, int re_len) { |
monpetit | 0:3ca3835f816e | 96 | return re[0] == '[' ? set_len(re + 1, re_len - 1) + 1 : op_len(re); |
monpetit | 0:3ca3835f816e | 97 | } |
monpetit | 0:3ca3835f816e | 98 | |
monpetit | 0:3ca3835f816e | 99 | static int is_quantifier(const char *re) { |
monpetit | 0:3ca3835f816e | 100 | return re[0] == '*' || re[0] == '+' || re[0] == '?'; |
monpetit | 0:3ca3835f816e | 101 | } |
monpetit | 0:3ca3835f816e | 102 | |
monpetit | 0:3ca3835f816e | 103 | static int toi(int x) { |
monpetit | 0:3ca3835f816e | 104 | return isdigit(x) ? x - '0' : x - 'W'; |
monpetit | 0:3ca3835f816e | 105 | } |
monpetit | 0:3ca3835f816e | 106 | |
monpetit | 0:3ca3835f816e | 107 | static int hextoi(const unsigned char *s) { |
monpetit | 0:3ca3835f816e | 108 | return (toi(tolower(s[0])) << 4) | toi(tolower(s[1])); |
monpetit | 0:3ca3835f816e | 109 | } |
monpetit | 0:3ca3835f816e | 110 | |
monpetit | 0:3ca3835f816e | 111 | static int match_op(const unsigned char *re, const unsigned char *s, |
monpetit | 0:3ca3835f816e | 112 | struct regex_info *info) { |
monpetit | 0:3ca3835f816e | 113 | int result = 0; |
monpetit | 0:3ca3835f816e | 114 | switch (*re) { |
monpetit | 0:3ca3835f816e | 115 | case '\\': |
monpetit | 0:3ca3835f816e | 116 | /* Metacharacters */ |
monpetit | 0:3ca3835f816e | 117 | switch (re[1]) { |
monpetit | 0:3ca3835f816e | 118 | case 'S': FAIL_IF(isspace(*s), SLRE_NO_MATCH); result++; break; |
monpetit | 0:3ca3835f816e | 119 | case 's': FAIL_IF(!isspace(*s), SLRE_NO_MATCH); result++; break; |
monpetit | 0:3ca3835f816e | 120 | case 'd': FAIL_IF(!isdigit(*s), SLRE_NO_MATCH); result++; break; |
monpetit | 0:3ca3835f816e | 121 | case 'b': FAIL_IF(*s != '\b', SLRE_NO_MATCH); result++; break; |
monpetit | 0:3ca3835f816e | 122 | case 'f': FAIL_IF(*s != '\f', SLRE_NO_MATCH); result++; break; |
monpetit | 0:3ca3835f816e | 123 | case 'n': FAIL_IF(*s != '\n', SLRE_NO_MATCH); result++; break; |
monpetit | 0:3ca3835f816e | 124 | case 'r': FAIL_IF(*s != '\r', SLRE_NO_MATCH); result++; break; |
monpetit | 0:3ca3835f816e | 125 | case 't': FAIL_IF(*s != '\t', SLRE_NO_MATCH); result++; break; |
monpetit | 0:3ca3835f816e | 126 | case 'v': FAIL_IF(*s != '\v', SLRE_NO_MATCH); result++; break; |
monpetit | 0:3ca3835f816e | 127 | |
monpetit | 0:3ca3835f816e | 128 | case 'x': |
monpetit | 0:3ca3835f816e | 129 | /* Match byte, \xHH where HH is hexadecimal byte representaion */ |
monpetit | 0:3ca3835f816e | 130 | FAIL_IF(hextoi(re + 2) != *s, SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 131 | result++; |
monpetit | 0:3ca3835f816e | 132 | break; |
monpetit | 0:3ca3835f816e | 133 | |
monpetit | 0:3ca3835f816e | 134 | default: |
monpetit | 0:3ca3835f816e | 135 | /* Valid metacharacter check is done in bar() */ |
monpetit | 0:3ca3835f816e | 136 | FAIL_IF(re[1] != s[0], SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 137 | result++; |
monpetit | 0:3ca3835f816e | 138 | break; |
monpetit | 0:3ca3835f816e | 139 | } |
monpetit | 0:3ca3835f816e | 140 | break; |
monpetit | 0:3ca3835f816e | 141 | |
monpetit | 0:3ca3835f816e | 142 | case '|': FAIL_IF(1, SLRE_INTERNAL_ERROR); break; |
monpetit | 0:3ca3835f816e | 143 | case '$': FAIL_IF(1, SLRE_NO_MATCH); break; |
monpetit | 0:3ca3835f816e | 144 | case '.': result++; break; |
monpetit | 0:3ca3835f816e | 145 | |
monpetit | 0:3ca3835f816e | 146 | default: |
monpetit | 0:3ca3835f816e | 147 | if (info->flags & SLRE_IGNORE_CASE) { |
monpetit | 0:3ca3835f816e | 148 | FAIL_IF(tolower(*re) != tolower(*s), SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 149 | } else { |
monpetit | 0:3ca3835f816e | 150 | FAIL_IF(*re != *s, SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 151 | } |
monpetit | 0:3ca3835f816e | 152 | result++; |
monpetit | 0:3ca3835f816e | 153 | break; |
monpetit | 0:3ca3835f816e | 154 | } |
monpetit | 0:3ca3835f816e | 155 | |
monpetit | 0:3ca3835f816e | 156 | return result; |
monpetit | 0:3ca3835f816e | 157 | } |
monpetit | 0:3ca3835f816e | 158 | |
monpetit | 0:3ca3835f816e | 159 | static int match_set(const char *re, int re_len, const char *s, |
monpetit | 0:3ca3835f816e | 160 | struct regex_info *info) { |
monpetit | 0:3ca3835f816e | 161 | int len = 0, result = -1, invert = re[0] == '^'; |
monpetit | 0:3ca3835f816e | 162 | |
monpetit | 0:3ca3835f816e | 163 | if (invert) re++, re_len--; |
monpetit | 0:3ca3835f816e | 164 | |
monpetit | 0:3ca3835f816e | 165 | while (len <= re_len && re[len] != ']' && result <= 0) { |
monpetit | 0:3ca3835f816e | 166 | /* Support character range */ |
monpetit | 0:3ca3835f816e | 167 | if (re[len] != '-' && re[len + 1] == '-' && re[len + 2] != ']' && |
monpetit | 0:3ca3835f816e | 168 | re[len + 2] != '\0') { |
monpetit | 0:3ca3835f816e | 169 | result = info->flags & SLRE_IGNORE_CASE ? |
monpetit | 0:3ca3835f816e | 170 | tolower(*s) >= tolower(re[len]) && tolower(*s) <= tolower(re[len + 2]) : |
monpetit | 0:3ca3835f816e | 171 | *s >= re[len] && *s <= re[len + 2]; |
monpetit | 0:3ca3835f816e | 172 | len += 3; |
monpetit | 0:3ca3835f816e | 173 | } else { |
monpetit | 0:3ca3835f816e | 174 | result = match_op((unsigned char *) re + len, (unsigned char *) s, info); |
monpetit | 0:3ca3835f816e | 175 | len += op_len(re + len); |
monpetit | 0:3ca3835f816e | 176 | } |
monpetit | 0:3ca3835f816e | 177 | } |
monpetit | 0:3ca3835f816e | 178 | return (!invert && result > 0) || (invert && result <= 0) ? 1 : -1; |
monpetit | 0:3ca3835f816e | 179 | } |
monpetit | 0:3ca3835f816e | 180 | |
monpetit | 0:3ca3835f816e | 181 | static int doh(const char *s, int s_len, struct regex_info *info, int bi); |
monpetit | 0:3ca3835f816e | 182 | |
monpetit | 0:3ca3835f816e | 183 | static int bar(const char *re, int re_len, const char *s, int s_len, |
monpetit | 0:3ca3835f816e | 184 | struct regex_info *info, int bi) { |
monpetit | 0:3ca3835f816e | 185 | /* i is offset in re, j is offset in s, bi is brackets index */ |
monpetit | 0:3ca3835f816e | 186 | int i, j, n, step; |
monpetit | 0:3ca3835f816e | 187 | |
monpetit | 0:3ca3835f816e | 188 | for (i = j = 0; i < re_len && j <= s_len; i += step) { |
monpetit | 0:3ca3835f816e | 189 | |
monpetit | 0:3ca3835f816e | 190 | /* Handle quantifiers. Get the length of the chunk. */ |
monpetit | 0:3ca3835f816e | 191 | step = re[i] == '(' ? info->brackets[bi + 1].len + 2 : |
monpetit | 0:3ca3835f816e | 192 | get_op_len(re + i, re_len - i); |
monpetit | 0:3ca3835f816e | 193 | |
monpetit | 0:3ca3835f816e | 194 | DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__, |
monpetit | 0:3ca3835f816e | 195 | re_len - i, re + i, s_len - j, s + j, re_len, step, i, j)); |
monpetit | 0:3ca3835f816e | 196 | |
monpetit | 0:3ca3835f816e | 197 | FAIL_IF(is_quantifier(&re[i]), SLRE_UNEXPECTED_QUANTIFIER); |
monpetit | 0:3ca3835f816e | 198 | FAIL_IF(step <= 0, SLRE_INVALID_CHARACTER_SET); |
monpetit | 0:3ca3835f816e | 199 | |
monpetit | 0:3ca3835f816e | 200 | if (i + step < re_len && is_quantifier(re + i + step)) { |
monpetit | 0:3ca3835f816e | 201 | DBG(("QUANTIFIER: [%.*s]%c [%.*s]\n", step, re + i, |
monpetit | 0:3ca3835f816e | 202 | re[i + step], s_len - j, s + j)); |
monpetit | 0:3ca3835f816e | 203 | if (re[i + step] == '?') { |
monpetit | 0:3ca3835f816e | 204 | int result = bar(re + i, step, s + j, s_len - j, info, bi); |
monpetit | 0:3ca3835f816e | 205 | j += result > 0 ? result : 0; |
monpetit | 0:3ca3835f816e | 206 | i++; |
monpetit | 0:3ca3835f816e | 207 | } else if (re[i + step] == '+' || re[i + step] == '*') { |
monpetit | 0:3ca3835f816e | 208 | int j2 = j, nj = j, n1, n2 = -1, ni, non_greedy = 0; |
monpetit | 0:3ca3835f816e | 209 | |
monpetit | 0:3ca3835f816e | 210 | /* Points to the regexp code after the quantifier */ |
monpetit | 0:3ca3835f816e | 211 | ni = i + step + 1; |
monpetit | 0:3ca3835f816e | 212 | if (ni < re_len && re[ni] == '?') { |
monpetit | 0:3ca3835f816e | 213 | non_greedy = 1; |
monpetit | 0:3ca3835f816e | 214 | ni++; |
monpetit | 0:3ca3835f816e | 215 | } |
monpetit | 0:3ca3835f816e | 216 | |
monpetit | 0:3ca3835f816e | 217 | do { |
monpetit | 0:3ca3835f816e | 218 | if ((n1 = bar(re + i, step, s + j2, s_len - j2, info, bi)) > 0) { |
monpetit | 0:3ca3835f816e | 219 | j2 += n1; |
monpetit | 0:3ca3835f816e | 220 | } |
monpetit | 0:3ca3835f816e | 221 | if (re[i + step] == '+' && n1 < 0) break; |
monpetit | 0:3ca3835f816e | 222 | |
monpetit | 0:3ca3835f816e | 223 | if (ni >= re_len) { |
monpetit | 0:3ca3835f816e | 224 | /* After quantifier, there is nothing */ |
monpetit | 0:3ca3835f816e | 225 | nj = j2; |
monpetit | 0:3ca3835f816e | 226 | } else if ((n2 = bar(re + ni, re_len - ni, s + j2, |
monpetit | 0:3ca3835f816e | 227 | s_len - j2, info, bi)) >= 0) { |
monpetit | 0:3ca3835f816e | 228 | /* Regex after quantifier matched */ |
monpetit | 0:3ca3835f816e | 229 | nj = j2 + n2; |
monpetit | 0:3ca3835f816e | 230 | } |
monpetit | 0:3ca3835f816e | 231 | if (nj > j && non_greedy) break; |
monpetit | 0:3ca3835f816e | 232 | } while (n1 > 0); |
monpetit | 0:3ca3835f816e | 233 | |
monpetit | 0:3ca3835f816e | 234 | /* |
monpetit | 0:3ca3835f816e | 235 | * Even if we found one or more pattern, this branch will be executed, |
monpetit | 0:3ca3835f816e | 236 | * changing the next captures. |
monpetit | 0:3ca3835f816e | 237 | */ |
monpetit | 0:3ca3835f816e | 238 | if (n1 < 0 && n2 < 0 && re[i + step] == '*' && |
monpetit | 0:3ca3835f816e | 239 | (n2 = bar(re + ni, re_len - ni, s + j, s_len - j, info, bi)) > 0) { |
monpetit | 0:3ca3835f816e | 240 | nj = j + n2; |
monpetit | 0:3ca3835f816e | 241 | } |
monpetit | 0:3ca3835f816e | 242 | |
monpetit | 0:3ca3835f816e | 243 | DBG(("STAR/PLUS END: %d %d %d %d %d\n", j, nj, re_len - ni, n1, n2)); |
monpetit | 0:3ca3835f816e | 244 | FAIL_IF(re[i + step] == '+' && nj == j, SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 245 | |
monpetit | 0:3ca3835f816e | 246 | /* If while loop body above was not executed for the * quantifier, */ |
monpetit | 0:3ca3835f816e | 247 | /* make sure the rest of the regex matches */ |
monpetit | 0:3ca3835f816e | 248 | FAIL_IF(nj == j && ni < re_len && n2 < 0, SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 249 | |
monpetit | 0:3ca3835f816e | 250 | /* Returning here cause we've matched the rest of RE already */ |
monpetit | 0:3ca3835f816e | 251 | return nj; |
monpetit | 0:3ca3835f816e | 252 | } |
monpetit | 0:3ca3835f816e | 253 | continue; |
monpetit | 0:3ca3835f816e | 254 | } |
monpetit | 0:3ca3835f816e | 255 | |
monpetit | 0:3ca3835f816e | 256 | if (re[i] == '[') { |
monpetit | 0:3ca3835f816e | 257 | n = match_set(re + i + 1, re_len - (i + 2), s + j, info); |
monpetit | 0:3ca3835f816e | 258 | DBG(("SET %.*s [%.*s] -> %d\n", step, re + i, s_len - j, s + j, n)); |
monpetit | 0:3ca3835f816e | 259 | FAIL_IF(n <= 0, SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 260 | j += n; |
monpetit | 0:3ca3835f816e | 261 | } else if (re[i] == '(') { |
monpetit | 0:3ca3835f816e | 262 | n = SLRE_NO_MATCH; |
monpetit | 0:3ca3835f816e | 263 | bi++; |
monpetit | 0:3ca3835f816e | 264 | FAIL_IF(bi >= info->num_brackets, SLRE_INTERNAL_ERROR); |
monpetit | 0:3ca3835f816e | 265 | DBG(("CAPTURING [%.*s] [%.*s] [%s]\n", |
monpetit | 0:3ca3835f816e | 266 | step, re + i, s_len - j, s + j, re + i + step)); |
monpetit | 0:3ca3835f816e | 267 | |
monpetit | 0:3ca3835f816e | 268 | if (re_len - (i + step) <= 0) { |
monpetit | 0:3ca3835f816e | 269 | /* Nothing follows brackets */ |
monpetit | 0:3ca3835f816e | 270 | n = doh(s + j, s_len - j, info, bi); |
monpetit | 0:3ca3835f816e | 271 | } else { |
monpetit | 0:3ca3835f816e | 272 | int j2; |
monpetit | 0:3ca3835f816e | 273 | for (j2 = 0; j2 <= s_len - j; j2++) { |
monpetit | 0:3ca3835f816e | 274 | if ((n = doh(s + j, s_len - (j + j2), info, bi)) >= 0 && |
monpetit | 0:3ca3835f816e | 275 | bar(re + i + step, re_len - (i + step), |
monpetit | 0:3ca3835f816e | 276 | s + j + n, s_len - (j + n), info, bi) >= 0) break; |
monpetit | 0:3ca3835f816e | 277 | } |
monpetit | 0:3ca3835f816e | 278 | } |
monpetit | 0:3ca3835f816e | 279 | |
monpetit | 0:3ca3835f816e | 280 | DBG(("CAPTURED [%.*s] [%.*s]:%d\n", step, re + i, s_len - j, s + j, n)); |
monpetit | 0:3ca3835f816e | 281 | FAIL_IF(n < 0, n); |
monpetit | 0:3ca3835f816e | 282 | if (info->caps != NULL && n > 0) { |
monpetit | 0:3ca3835f816e | 283 | info->caps[bi - 1].ptr = s + j; |
monpetit | 0:3ca3835f816e | 284 | info->caps[bi - 1].len = n; |
monpetit | 0:3ca3835f816e | 285 | } |
monpetit | 0:3ca3835f816e | 286 | j += n; |
monpetit | 0:3ca3835f816e | 287 | } else if (re[i] == '^') { |
monpetit | 0:3ca3835f816e | 288 | FAIL_IF(j != 0, SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 289 | } else if (re[i] == '$') { |
monpetit | 0:3ca3835f816e | 290 | FAIL_IF(j != s_len, SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 291 | } else { |
monpetit | 0:3ca3835f816e | 292 | FAIL_IF(j >= s_len, SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 293 | n = match_op((unsigned char *) (re + i), (unsigned char *) (s + j), info); |
monpetit | 0:3ca3835f816e | 294 | FAIL_IF(n <= 0, n); |
monpetit | 0:3ca3835f816e | 295 | j += n; |
monpetit | 0:3ca3835f816e | 296 | } |
monpetit | 0:3ca3835f816e | 297 | } |
monpetit | 0:3ca3835f816e | 298 | |
monpetit | 0:3ca3835f816e | 299 | return j; |
monpetit | 0:3ca3835f816e | 300 | } |
monpetit | 0:3ca3835f816e | 301 | |
monpetit | 0:3ca3835f816e | 302 | /* Process branch points */ |
monpetit | 0:3ca3835f816e | 303 | static int doh(const char *s, int s_len, struct regex_info *info, int bi) { |
monpetit | 0:3ca3835f816e | 304 | const struct bracket_pair *b = &info->brackets[bi]; |
monpetit | 0:3ca3835f816e | 305 | int i = 0, len, result; |
monpetit | 0:3ca3835f816e | 306 | const char *p; |
monpetit | 0:3ca3835f816e | 307 | |
monpetit | 0:3ca3835f816e | 308 | do { |
monpetit | 0:3ca3835f816e | 309 | p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1; |
monpetit | 0:3ca3835f816e | 310 | len = b->num_branches == 0 ? b->len : |
monpetit | 0:3ca3835f816e | 311 | i == b->num_branches ? (int) (b->ptr + b->len - p) : |
monpetit | 0:3ca3835f816e | 312 | (int) (info->branches[b->branches + i].schlong - p); |
monpetit | 0:3ca3835f816e | 313 | DBG(("%s %d %d [%.*s] [%.*s]\n", __func__, bi, i, len, p, s_len, s)); |
monpetit | 0:3ca3835f816e | 314 | result = bar(p, len, s, s_len, info, bi); |
monpetit | 0:3ca3835f816e | 315 | DBG(("%s <- %d\n", __func__, result)); |
monpetit | 0:3ca3835f816e | 316 | } while (result <= 0 && i++ < b->num_branches); /* At least 1 iteration */ |
monpetit | 0:3ca3835f816e | 317 | |
monpetit | 0:3ca3835f816e | 318 | return result; |
monpetit | 0:3ca3835f816e | 319 | } |
monpetit | 0:3ca3835f816e | 320 | |
monpetit | 0:3ca3835f816e | 321 | static int baz(const char *s, int s_len, struct regex_info *info) { |
monpetit | 0:3ca3835f816e | 322 | int i, result = -1, is_anchored = info->brackets[0].ptr[0] == '^'; |
monpetit | 0:3ca3835f816e | 323 | |
monpetit | 0:3ca3835f816e | 324 | for (i = 0; i <= s_len; i++) { |
monpetit | 0:3ca3835f816e | 325 | result = doh(s + i, s_len - i, info, 0); |
monpetit | 0:3ca3835f816e | 326 | if (result >= 0) { |
monpetit | 0:3ca3835f816e | 327 | result += i; |
monpetit | 0:3ca3835f816e | 328 | break; |
monpetit | 0:3ca3835f816e | 329 | } |
monpetit | 0:3ca3835f816e | 330 | if (is_anchored) break; |
monpetit | 0:3ca3835f816e | 331 | } |
monpetit | 0:3ca3835f816e | 332 | |
monpetit | 0:3ca3835f816e | 333 | return result; |
monpetit | 0:3ca3835f816e | 334 | } |
monpetit | 0:3ca3835f816e | 335 | |
monpetit | 0:3ca3835f816e | 336 | static void setup_branch_points(struct regex_info *info) { |
monpetit | 0:3ca3835f816e | 337 | int i, j; |
monpetit | 0:3ca3835f816e | 338 | struct branch tmp; |
monpetit | 0:3ca3835f816e | 339 | |
monpetit | 0:3ca3835f816e | 340 | /* First, sort branches. Must be stable, no qsort. Use bubble algo. */ |
monpetit | 0:3ca3835f816e | 341 | for (i = 0; i < info->num_branches; i++) { |
monpetit | 0:3ca3835f816e | 342 | for (j = i + 1; j < info->num_branches; j++) { |
monpetit | 0:3ca3835f816e | 343 | if (info->branches[i].bracket_index > info->branches[j].bracket_index) { |
monpetit | 0:3ca3835f816e | 344 | tmp = info->branches[i]; |
monpetit | 0:3ca3835f816e | 345 | info->branches[i] = info->branches[j]; |
monpetit | 0:3ca3835f816e | 346 | info->branches[j] = tmp; |
monpetit | 0:3ca3835f816e | 347 | } |
monpetit | 0:3ca3835f816e | 348 | } |
monpetit | 0:3ca3835f816e | 349 | } |
monpetit | 0:3ca3835f816e | 350 | |
monpetit | 0:3ca3835f816e | 351 | /* |
monpetit | 0:3ca3835f816e | 352 | * For each bracket, set their branch points. This way, for every bracket |
monpetit | 0:3ca3835f816e | 353 | * (i.e. every chunk of regex) we know all branch points before matching. |
monpetit | 0:3ca3835f816e | 354 | */ |
monpetit | 0:3ca3835f816e | 355 | for (i = j = 0; i < info->num_brackets; i++) { |
monpetit | 0:3ca3835f816e | 356 | info->brackets[i].num_branches = 0; |
monpetit | 0:3ca3835f816e | 357 | info->brackets[i].branches = j; |
monpetit | 0:3ca3835f816e | 358 | while (j < info->num_branches && info->branches[j].bracket_index == i) { |
monpetit | 0:3ca3835f816e | 359 | info->brackets[i].num_branches++; |
monpetit | 0:3ca3835f816e | 360 | j++; |
monpetit | 0:3ca3835f816e | 361 | } |
monpetit | 0:3ca3835f816e | 362 | } |
monpetit | 0:3ca3835f816e | 363 | } |
monpetit | 0:3ca3835f816e | 364 | |
monpetit | 0:3ca3835f816e | 365 | static int foo(const char *re, int re_len, const char *s, int s_len, |
monpetit | 0:3ca3835f816e | 366 | struct regex_info *info) { |
monpetit | 0:3ca3835f816e | 367 | int i, step, depth = 0; |
monpetit | 0:3ca3835f816e | 368 | |
monpetit | 0:3ca3835f816e | 369 | /* First bracket captures everything */ |
monpetit | 0:3ca3835f816e | 370 | info->brackets[0].ptr = re; |
monpetit | 0:3ca3835f816e | 371 | info->brackets[0].len = re_len; |
monpetit | 0:3ca3835f816e | 372 | info->num_brackets = 1; |
monpetit | 0:3ca3835f816e | 373 | |
monpetit | 0:3ca3835f816e | 374 | /* Make a single pass over regex string, memorize brackets and branches */ |
monpetit | 0:3ca3835f816e | 375 | for (i = 0; i < re_len; i += step) { |
monpetit | 0:3ca3835f816e | 376 | step = get_op_len(re + i, re_len - i); |
monpetit | 0:3ca3835f816e | 377 | |
monpetit | 0:3ca3835f816e | 378 | if (re[i] == '|') { |
monpetit | 0:3ca3835f816e | 379 | FAIL_IF(info->num_branches >= (int) ARRAY_SIZE(info->branches), |
monpetit | 0:3ca3835f816e | 380 | SLRE_TOO_MANY_BRANCHES); |
monpetit | 0:3ca3835f816e | 381 | info->branches[info->num_branches].bracket_index = |
monpetit | 0:3ca3835f816e | 382 | info->brackets[info->num_brackets - 1].len == -1 ? |
monpetit | 0:3ca3835f816e | 383 | info->num_brackets - 1 : depth; |
monpetit | 0:3ca3835f816e | 384 | info->branches[info->num_branches].schlong = &re[i]; |
monpetit | 0:3ca3835f816e | 385 | info->num_branches++; |
monpetit | 0:3ca3835f816e | 386 | } else if (re[i] == '\\') { |
monpetit | 0:3ca3835f816e | 387 | FAIL_IF(i >= re_len - 1, SLRE_INVALID_METACHARACTER); |
monpetit | 0:3ca3835f816e | 388 | if (re[i + 1] == 'x') { |
monpetit | 0:3ca3835f816e | 389 | /* Hex digit specification must follow */ |
monpetit | 0:3ca3835f816e | 390 | FAIL_IF(re[i + 1] == 'x' && i >= re_len - 3, |
monpetit | 0:3ca3835f816e | 391 | SLRE_INVALID_METACHARACTER); |
monpetit | 0:3ca3835f816e | 392 | FAIL_IF(re[i + 1] == 'x' && !(isxdigit(re[i + 2]) && |
monpetit | 0:3ca3835f816e | 393 | isxdigit(re[i + 3])), SLRE_INVALID_METACHARACTER); |
monpetit | 0:3ca3835f816e | 394 | } else { |
monpetit | 0:3ca3835f816e | 395 | FAIL_IF(!is_metacharacter((unsigned char *) re + i + 1), |
monpetit | 0:3ca3835f816e | 396 | SLRE_INVALID_METACHARACTER); |
monpetit | 0:3ca3835f816e | 397 | } |
monpetit | 0:3ca3835f816e | 398 | } else if (re[i] == '(') { |
monpetit | 0:3ca3835f816e | 399 | FAIL_IF(info->num_brackets >= (int) ARRAY_SIZE(info->brackets), |
monpetit | 0:3ca3835f816e | 400 | SLRE_TOO_MANY_BRACKETS); |
monpetit | 0:3ca3835f816e | 401 | depth++; /* Order is important here. Depth increments first. */ |
monpetit | 0:3ca3835f816e | 402 | info->brackets[info->num_brackets].ptr = re + i + 1; |
monpetit | 0:3ca3835f816e | 403 | info->brackets[info->num_brackets].len = -1; |
monpetit | 0:3ca3835f816e | 404 | info->num_brackets++; |
monpetit | 0:3ca3835f816e | 405 | FAIL_IF(info->num_caps > 0 && info->num_brackets - 1 > info->num_caps, |
monpetit | 0:3ca3835f816e | 406 | SLRE_CAPS_ARRAY_TOO_SMALL); |
monpetit | 0:3ca3835f816e | 407 | } else if (re[i] == ')') { |
monpetit | 0:3ca3835f816e | 408 | int ind = info->brackets[info->num_brackets - 1].len == -1 ? |
monpetit | 0:3ca3835f816e | 409 | info->num_brackets - 1 : depth; |
monpetit | 0:3ca3835f816e | 410 | info->brackets[ind].len = (int) (&re[i] - info->brackets[ind].ptr); |
monpetit | 0:3ca3835f816e | 411 | DBG(("SETTING BRACKET %d [%.*s]\n", |
monpetit | 0:3ca3835f816e | 412 | ind, info->brackets[ind].len, info->brackets[ind].ptr)); |
monpetit | 0:3ca3835f816e | 413 | depth--; |
monpetit | 0:3ca3835f816e | 414 | FAIL_IF(depth < 0, SLRE_UNBALANCED_BRACKETS); |
monpetit | 0:3ca3835f816e | 415 | FAIL_IF(i > 0 && re[i - 1] == '(', SLRE_NO_MATCH); |
monpetit | 0:3ca3835f816e | 416 | } |
monpetit | 0:3ca3835f816e | 417 | } |
monpetit | 0:3ca3835f816e | 418 | |
monpetit | 0:3ca3835f816e | 419 | FAIL_IF(depth != 0, SLRE_UNBALANCED_BRACKETS); |
monpetit | 0:3ca3835f816e | 420 | setup_branch_points(info); |
monpetit | 0:3ca3835f816e | 421 | |
monpetit | 0:3ca3835f816e | 422 | return baz(s, s_len, info); |
monpetit | 0:3ca3835f816e | 423 | } |
monpetit | 0:3ca3835f816e | 424 | |
monpetit | 0:3ca3835f816e | 425 | int slre_match(const char *regexp, const char *s, int s_len, |
monpetit | 0:3ca3835f816e | 426 | struct slre_cap *caps, int num_caps, int flags) { |
monpetit | 0:3ca3835f816e | 427 | struct regex_info info; |
monpetit | 0:3ca3835f816e | 428 | |
monpetit | 0:3ca3835f816e | 429 | /* Initialize info structure */ |
monpetit | 0:3ca3835f816e | 430 | info.flags = flags; |
monpetit | 0:3ca3835f816e | 431 | info.num_brackets = info.num_branches = 0; |
monpetit | 0:3ca3835f816e | 432 | info.num_caps = num_caps; |
monpetit | 0:3ca3835f816e | 433 | info.caps = caps; |
monpetit | 0:3ca3835f816e | 434 | |
monpetit | 0:3ca3835f816e | 435 | DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s)); |
monpetit | 0:3ca3835f816e | 436 | return foo(regexp, (int) strlen(regexp), s, s_len, &info); |
monpetit | 0:3ca3835f816e | 437 | } |