SLRE(Super Light Regular Expression library) Unit Test Code.

Dependencies:   mbed

Committer:
monpetit
Date:
Thu Jul 28 08:10:30 2016 +0000
Revision:
0:3ca3835f816e
initial commit: unit tests are passed.

Who changed what in which revision?

UserRevisionLine numberNew contents of line
monpetit 0:3ca3835f816e 1 /*
monpetit 0:3ca3835f816e 2 * Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
monpetit 0:3ca3835f816e 3 * Copyright (c) 2013 Cesanta Software Limited
monpetit 0:3ca3835f816e 4 * All rights reserved
monpetit 0:3ca3835f816e 5 *
monpetit 0:3ca3835f816e 6 * This library is dual-licensed: you can redistribute it and/or modify
monpetit 0:3ca3835f816e 7 * it under the terms of the GNU General Public License version 2 as
monpetit 0:3ca3835f816e 8 * published by the Free Software Foundation. For the terms of this
monpetit 0:3ca3835f816e 9 * license, see <http://www.gnu.org/licenses/>.
monpetit 0:3ca3835f816e 10 *
monpetit 0:3ca3835f816e 11 * You are free to use this library under the terms of the GNU General
monpetit 0:3ca3835f816e 12 * Public License, but WITHOUT ANY WARRANTY; without even the implied
monpetit 0:3ca3835f816e 13 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
monpetit 0:3ca3835f816e 14 * See the GNU General Public License for more details.
monpetit 0:3ca3835f816e 15 *
monpetit 0:3ca3835f816e 16 * Alternatively, you can license this library under a commercial
monpetit 0:3ca3835f816e 17 * license, as set out in <http://cesanta.com/products.html>.
monpetit 0:3ca3835f816e 18 */
monpetit 0:3ca3835f816e 19
monpetit 0:3ca3835f816e 20 #include <stdio.h>
monpetit 0:3ca3835f816e 21 #include <ctype.h>
monpetit 0:3ca3835f816e 22 #include <string.h>
monpetit 0:3ca3835f816e 23
monpetit 0:3ca3835f816e 24 #include "slre.h"
monpetit 0:3ca3835f816e 25
monpetit 0:3ca3835f816e 26 #define MAX_BRANCHES 100
monpetit 0:3ca3835f816e 27 #define MAX_BRACKETS 100
monpetit 0:3ca3835f816e 28 #define FAIL_IF(condition, error_code) if (condition) return (error_code)
monpetit 0:3ca3835f816e 29
monpetit 0:3ca3835f816e 30 #ifndef ARRAY_SIZE
monpetit 0:3ca3835f816e 31 #define ARRAY_SIZE(ar) (sizeof(ar) / sizeof((ar)[0]))
monpetit 0:3ca3835f816e 32 #endif
monpetit 0:3ca3835f816e 33
monpetit 0:3ca3835f816e 34 #ifdef SLRE_DEBUG
monpetit 0:3ca3835f816e 35 #define DBG(x) printf x
monpetit 0:3ca3835f816e 36 #else
monpetit 0:3ca3835f816e 37 #define DBG(x)
monpetit 0:3ca3835f816e 38 #endif
monpetit 0:3ca3835f816e 39
monpetit 0:3ca3835f816e 40 struct bracket_pair {
monpetit 0:3ca3835f816e 41 const char *ptr; /* Points to the first char after '(' in regex */
monpetit 0:3ca3835f816e 42 int len; /* Length of the text between '(' and ')' */
monpetit 0:3ca3835f816e 43 int branches; /* Index in the branches array for this pair */
monpetit 0:3ca3835f816e 44 int num_branches; /* Number of '|' in this bracket pair */
monpetit 0:3ca3835f816e 45 };
monpetit 0:3ca3835f816e 46
monpetit 0:3ca3835f816e 47 struct branch {
monpetit 0:3ca3835f816e 48 int bracket_index; /* index for 'struct bracket_pair brackets' */
monpetit 0:3ca3835f816e 49 /* array defined below */
monpetit 0:3ca3835f816e 50 const char *schlong; /* points to the '|' character in the regex */
monpetit 0:3ca3835f816e 51 };
monpetit 0:3ca3835f816e 52
monpetit 0:3ca3835f816e 53 struct regex_info {
monpetit 0:3ca3835f816e 54 /*
monpetit 0:3ca3835f816e 55 * Describes all bracket pairs in the regular expression.
monpetit 0:3ca3835f816e 56 * First entry is always present, and grabs the whole regex.
monpetit 0:3ca3835f816e 57 */
monpetit 0:3ca3835f816e 58 struct bracket_pair brackets[MAX_BRACKETS];
monpetit 0:3ca3835f816e 59 int num_brackets;
monpetit 0:3ca3835f816e 60
monpetit 0:3ca3835f816e 61 /*
monpetit 0:3ca3835f816e 62 * Describes alternations ('|' operators) in the regular expression.
monpetit 0:3ca3835f816e 63 * Each branch falls into a specific branch pair.
monpetit 0:3ca3835f816e 64 */
monpetit 0:3ca3835f816e 65 struct branch branches[MAX_BRANCHES];
monpetit 0:3ca3835f816e 66 int num_branches;
monpetit 0:3ca3835f816e 67
monpetit 0:3ca3835f816e 68 /* Array of captures provided by the user */
monpetit 0:3ca3835f816e 69 struct slre_cap *caps;
monpetit 0:3ca3835f816e 70 int num_caps;
monpetit 0:3ca3835f816e 71
monpetit 0:3ca3835f816e 72 /* E.g. SLRE_IGNORE_CASE. See enum below */
monpetit 0:3ca3835f816e 73 int flags;
monpetit 0:3ca3835f816e 74 };
monpetit 0:3ca3835f816e 75
monpetit 0:3ca3835f816e 76 static int is_metacharacter(const unsigned char *s) {
monpetit 0:3ca3835f816e 77 static const char *metacharacters = "^$().[]*+?|\\Ssdbfnrtv";
monpetit 0:3ca3835f816e 78 return strchr(metacharacters, *s) != NULL;
monpetit 0:3ca3835f816e 79 }
monpetit 0:3ca3835f816e 80
monpetit 0:3ca3835f816e 81 static int op_len(const char *re) {
monpetit 0:3ca3835f816e 82 return re[0] == '\\' && re[1] == 'x' ? 4 : re[0] == '\\' ? 2 : 1;
monpetit 0:3ca3835f816e 83 }
monpetit 0:3ca3835f816e 84
monpetit 0:3ca3835f816e 85 static int set_len(const char *re, int re_len) {
monpetit 0:3ca3835f816e 86 int len = 0;
monpetit 0:3ca3835f816e 87
monpetit 0:3ca3835f816e 88 while (len < re_len && re[len] != ']') {
monpetit 0:3ca3835f816e 89 len += op_len(re + len);
monpetit 0:3ca3835f816e 90 }
monpetit 0:3ca3835f816e 91
monpetit 0:3ca3835f816e 92 return len <= re_len ? len + 1 : -1;
monpetit 0:3ca3835f816e 93 }
monpetit 0:3ca3835f816e 94
monpetit 0:3ca3835f816e 95 static int get_op_len(const char *re, int re_len) {
monpetit 0:3ca3835f816e 96 return re[0] == '[' ? set_len(re + 1, re_len - 1) + 1 : op_len(re);
monpetit 0:3ca3835f816e 97 }
monpetit 0:3ca3835f816e 98
monpetit 0:3ca3835f816e 99 static int is_quantifier(const char *re) {
monpetit 0:3ca3835f816e 100 return re[0] == '*' || re[0] == '+' || re[0] == '?';
monpetit 0:3ca3835f816e 101 }
monpetit 0:3ca3835f816e 102
monpetit 0:3ca3835f816e 103 static int toi(int x) {
monpetit 0:3ca3835f816e 104 return isdigit(x) ? x - '0' : x - 'W';
monpetit 0:3ca3835f816e 105 }
monpetit 0:3ca3835f816e 106
monpetit 0:3ca3835f816e 107 static int hextoi(const unsigned char *s) {
monpetit 0:3ca3835f816e 108 return (toi(tolower(s[0])) << 4) | toi(tolower(s[1]));
monpetit 0:3ca3835f816e 109 }
monpetit 0:3ca3835f816e 110
monpetit 0:3ca3835f816e 111 static int match_op(const unsigned char *re, const unsigned char *s,
monpetit 0:3ca3835f816e 112 struct regex_info *info) {
monpetit 0:3ca3835f816e 113 int result = 0;
monpetit 0:3ca3835f816e 114 switch (*re) {
monpetit 0:3ca3835f816e 115 case '\\':
monpetit 0:3ca3835f816e 116 /* Metacharacters */
monpetit 0:3ca3835f816e 117 switch (re[1]) {
monpetit 0:3ca3835f816e 118 case 'S': FAIL_IF(isspace(*s), SLRE_NO_MATCH); result++; break;
monpetit 0:3ca3835f816e 119 case 's': FAIL_IF(!isspace(*s), SLRE_NO_MATCH); result++; break;
monpetit 0:3ca3835f816e 120 case 'd': FAIL_IF(!isdigit(*s), SLRE_NO_MATCH); result++; break;
monpetit 0:3ca3835f816e 121 case 'b': FAIL_IF(*s != '\b', SLRE_NO_MATCH); result++; break;
monpetit 0:3ca3835f816e 122 case 'f': FAIL_IF(*s != '\f', SLRE_NO_MATCH); result++; break;
monpetit 0:3ca3835f816e 123 case 'n': FAIL_IF(*s != '\n', SLRE_NO_MATCH); result++; break;
monpetit 0:3ca3835f816e 124 case 'r': FAIL_IF(*s != '\r', SLRE_NO_MATCH); result++; break;
monpetit 0:3ca3835f816e 125 case 't': FAIL_IF(*s != '\t', SLRE_NO_MATCH); result++; break;
monpetit 0:3ca3835f816e 126 case 'v': FAIL_IF(*s != '\v', SLRE_NO_MATCH); result++; break;
monpetit 0:3ca3835f816e 127
monpetit 0:3ca3835f816e 128 case 'x':
monpetit 0:3ca3835f816e 129 /* Match byte, \xHH where HH is hexadecimal byte representaion */
monpetit 0:3ca3835f816e 130 FAIL_IF(hextoi(re + 2) != *s, SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 131 result++;
monpetit 0:3ca3835f816e 132 break;
monpetit 0:3ca3835f816e 133
monpetit 0:3ca3835f816e 134 default:
monpetit 0:3ca3835f816e 135 /* Valid metacharacter check is done in bar() */
monpetit 0:3ca3835f816e 136 FAIL_IF(re[1] != s[0], SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 137 result++;
monpetit 0:3ca3835f816e 138 break;
monpetit 0:3ca3835f816e 139 }
monpetit 0:3ca3835f816e 140 break;
monpetit 0:3ca3835f816e 141
monpetit 0:3ca3835f816e 142 case '|': FAIL_IF(1, SLRE_INTERNAL_ERROR); break;
monpetit 0:3ca3835f816e 143 case '$': FAIL_IF(1, SLRE_NO_MATCH); break;
monpetit 0:3ca3835f816e 144 case '.': result++; break;
monpetit 0:3ca3835f816e 145
monpetit 0:3ca3835f816e 146 default:
monpetit 0:3ca3835f816e 147 if (info->flags & SLRE_IGNORE_CASE) {
monpetit 0:3ca3835f816e 148 FAIL_IF(tolower(*re) != tolower(*s), SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 149 } else {
monpetit 0:3ca3835f816e 150 FAIL_IF(*re != *s, SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 151 }
monpetit 0:3ca3835f816e 152 result++;
monpetit 0:3ca3835f816e 153 break;
monpetit 0:3ca3835f816e 154 }
monpetit 0:3ca3835f816e 155
monpetit 0:3ca3835f816e 156 return result;
monpetit 0:3ca3835f816e 157 }
monpetit 0:3ca3835f816e 158
monpetit 0:3ca3835f816e 159 static int match_set(const char *re, int re_len, const char *s,
monpetit 0:3ca3835f816e 160 struct regex_info *info) {
monpetit 0:3ca3835f816e 161 int len = 0, result = -1, invert = re[0] == '^';
monpetit 0:3ca3835f816e 162
monpetit 0:3ca3835f816e 163 if (invert) re++, re_len--;
monpetit 0:3ca3835f816e 164
monpetit 0:3ca3835f816e 165 while (len <= re_len && re[len] != ']' && result <= 0) {
monpetit 0:3ca3835f816e 166 /* Support character range */
monpetit 0:3ca3835f816e 167 if (re[len] != '-' && re[len + 1] == '-' && re[len + 2] != ']' &&
monpetit 0:3ca3835f816e 168 re[len + 2] != '\0') {
monpetit 0:3ca3835f816e 169 result = info->flags & SLRE_IGNORE_CASE ?
monpetit 0:3ca3835f816e 170 tolower(*s) >= tolower(re[len]) && tolower(*s) <= tolower(re[len + 2]) :
monpetit 0:3ca3835f816e 171 *s >= re[len] && *s <= re[len + 2];
monpetit 0:3ca3835f816e 172 len += 3;
monpetit 0:3ca3835f816e 173 } else {
monpetit 0:3ca3835f816e 174 result = match_op((unsigned char *) re + len, (unsigned char *) s, info);
monpetit 0:3ca3835f816e 175 len += op_len(re + len);
monpetit 0:3ca3835f816e 176 }
monpetit 0:3ca3835f816e 177 }
monpetit 0:3ca3835f816e 178 return (!invert && result > 0) || (invert && result <= 0) ? 1 : -1;
monpetit 0:3ca3835f816e 179 }
monpetit 0:3ca3835f816e 180
monpetit 0:3ca3835f816e 181 static int doh(const char *s, int s_len, struct regex_info *info, int bi);
monpetit 0:3ca3835f816e 182
monpetit 0:3ca3835f816e 183 static int bar(const char *re, int re_len, const char *s, int s_len,
monpetit 0:3ca3835f816e 184 struct regex_info *info, int bi) {
monpetit 0:3ca3835f816e 185 /* i is offset in re, j is offset in s, bi is brackets index */
monpetit 0:3ca3835f816e 186 int i, j, n, step;
monpetit 0:3ca3835f816e 187
monpetit 0:3ca3835f816e 188 for (i = j = 0; i < re_len && j <= s_len; i += step) {
monpetit 0:3ca3835f816e 189
monpetit 0:3ca3835f816e 190 /* Handle quantifiers. Get the length of the chunk. */
monpetit 0:3ca3835f816e 191 step = re[i] == '(' ? info->brackets[bi + 1].len + 2 :
monpetit 0:3ca3835f816e 192 get_op_len(re + i, re_len - i);
monpetit 0:3ca3835f816e 193
monpetit 0:3ca3835f816e 194 DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__,
monpetit 0:3ca3835f816e 195 re_len - i, re + i, s_len - j, s + j, re_len, step, i, j));
monpetit 0:3ca3835f816e 196
monpetit 0:3ca3835f816e 197 FAIL_IF(is_quantifier(&re[i]), SLRE_UNEXPECTED_QUANTIFIER);
monpetit 0:3ca3835f816e 198 FAIL_IF(step <= 0, SLRE_INVALID_CHARACTER_SET);
monpetit 0:3ca3835f816e 199
monpetit 0:3ca3835f816e 200 if (i + step < re_len && is_quantifier(re + i + step)) {
monpetit 0:3ca3835f816e 201 DBG(("QUANTIFIER: [%.*s]%c [%.*s]\n", step, re + i,
monpetit 0:3ca3835f816e 202 re[i + step], s_len - j, s + j));
monpetit 0:3ca3835f816e 203 if (re[i + step] == '?') {
monpetit 0:3ca3835f816e 204 int result = bar(re + i, step, s + j, s_len - j, info, bi);
monpetit 0:3ca3835f816e 205 j += result > 0 ? result : 0;
monpetit 0:3ca3835f816e 206 i++;
monpetit 0:3ca3835f816e 207 } else if (re[i + step] == '+' || re[i + step] == '*') {
monpetit 0:3ca3835f816e 208 int j2 = j, nj = j, n1, n2 = -1, ni, non_greedy = 0;
monpetit 0:3ca3835f816e 209
monpetit 0:3ca3835f816e 210 /* Points to the regexp code after the quantifier */
monpetit 0:3ca3835f816e 211 ni = i + step + 1;
monpetit 0:3ca3835f816e 212 if (ni < re_len && re[ni] == '?') {
monpetit 0:3ca3835f816e 213 non_greedy = 1;
monpetit 0:3ca3835f816e 214 ni++;
monpetit 0:3ca3835f816e 215 }
monpetit 0:3ca3835f816e 216
monpetit 0:3ca3835f816e 217 do {
monpetit 0:3ca3835f816e 218 if ((n1 = bar(re + i, step, s + j2, s_len - j2, info, bi)) > 0) {
monpetit 0:3ca3835f816e 219 j2 += n1;
monpetit 0:3ca3835f816e 220 }
monpetit 0:3ca3835f816e 221 if (re[i + step] == '+' && n1 < 0) break;
monpetit 0:3ca3835f816e 222
monpetit 0:3ca3835f816e 223 if (ni >= re_len) {
monpetit 0:3ca3835f816e 224 /* After quantifier, there is nothing */
monpetit 0:3ca3835f816e 225 nj = j2;
monpetit 0:3ca3835f816e 226 } else if ((n2 = bar(re + ni, re_len - ni, s + j2,
monpetit 0:3ca3835f816e 227 s_len - j2, info, bi)) >= 0) {
monpetit 0:3ca3835f816e 228 /* Regex after quantifier matched */
monpetit 0:3ca3835f816e 229 nj = j2 + n2;
monpetit 0:3ca3835f816e 230 }
monpetit 0:3ca3835f816e 231 if (nj > j && non_greedy) break;
monpetit 0:3ca3835f816e 232 } while (n1 > 0);
monpetit 0:3ca3835f816e 233
monpetit 0:3ca3835f816e 234 /*
monpetit 0:3ca3835f816e 235 * Even if we found one or more pattern, this branch will be executed,
monpetit 0:3ca3835f816e 236 * changing the next captures.
monpetit 0:3ca3835f816e 237 */
monpetit 0:3ca3835f816e 238 if (n1 < 0 && n2 < 0 && re[i + step] == '*' &&
monpetit 0:3ca3835f816e 239 (n2 = bar(re + ni, re_len - ni, s + j, s_len - j, info, bi)) > 0) {
monpetit 0:3ca3835f816e 240 nj = j + n2;
monpetit 0:3ca3835f816e 241 }
monpetit 0:3ca3835f816e 242
monpetit 0:3ca3835f816e 243 DBG(("STAR/PLUS END: %d %d %d %d %d\n", j, nj, re_len - ni, n1, n2));
monpetit 0:3ca3835f816e 244 FAIL_IF(re[i + step] == '+' && nj == j, SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 245
monpetit 0:3ca3835f816e 246 /* If while loop body above was not executed for the * quantifier, */
monpetit 0:3ca3835f816e 247 /* make sure the rest of the regex matches */
monpetit 0:3ca3835f816e 248 FAIL_IF(nj == j && ni < re_len && n2 < 0, SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 249
monpetit 0:3ca3835f816e 250 /* Returning here cause we've matched the rest of RE already */
monpetit 0:3ca3835f816e 251 return nj;
monpetit 0:3ca3835f816e 252 }
monpetit 0:3ca3835f816e 253 continue;
monpetit 0:3ca3835f816e 254 }
monpetit 0:3ca3835f816e 255
monpetit 0:3ca3835f816e 256 if (re[i] == '[') {
monpetit 0:3ca3835f816e 257 n = match_set(re + i + 1, re_len - (i + 2), s + j, info);
monpetit 0:3ca3835f816e 258 DBG(("SET %.*s [%.*s] -> %d\n", step, re + i, s_len - j, s + j, n));
monpetit 0:3ca3835f816e 259 FAIL_IF(n <= 0, SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 260 j += n;
monpetit 0:3ca3835f816e 261 } else if (re[i] == '(') {
monpetit 0:3ca3835f816e 262 n = SLRE_NO_MATCH;
monpetit 0:3ca3835f816e 263 bi++;
monpetit 0:3ca3835f816e 264 FAIL_IF(bi >= info->num_brackets, SLRE_INTERNAL_ERROR);
monpetit 0:3ca3835f816e 265 DBG(("CAPTURING [%.*s] [%.*s] [%s]\n",
monpetit 0:3ca3835f816e 266 step, re + i, s_len - j, s + j, re + i + step));
monpetit 0:3ca3835f816e 267
monpetit 0:3ca3835f816e 268 if (re_len - (i + step) <= 0) {
monpetit 0:3ca3835f816e 269 /* Nothing follows brackets */
monpetit 0:3ca3835f816e 270 n = doh(s + j, s_len - j, info, bi);
monpetit 0:3ca3835f816e 271 } else {
monpetit 0:3ca3835f816e 272 int j2;
monpetit 0:3ca3835f816e 273 for (j2 = 0; j2 <= s_len - j; j2++) {
monpetit 0:3ca3835f816e 274 if ((n = doh(s + j, s_len - (j + j2), info, bi)) >= 0 &&
monpetit 0:3ca3835f816e 275 bar(re + i + step, re_len - (i + step),
monpetit 0:3ca3835f816e 276 s + j + n, s_len - (j + n), info, bi) >= 0) break;
monpetit 0:3ca3835f816e 277 }
monpetit 0:3ca3835f816e 278 }
monpetit 0:3ca3835f816e 279
monpetit 0:3ca3835f816e 280 DBG(("CAPTURED [%.*s] [%.*s]:%d\n", step, re + i, s_len - j, s + j, n));
monpetit 0:3ca3835f816e 281 FAIL_IF(n < 0, n);
monpetit 0:3ca3835f816e 282 if (info->caps != NULL && n > 0) {
monpetit 0:3ca3835f816e 283 info->caps[bi - 1].ptr = s + j;
monpetit 0:3ca3835f816e 284 info->caps[bi - 1].len = n;
monpetit 0:3ca3835f816e 285 }
monpetit 0:3ca3835f816e 286 j += n;
monpetit 0:3ca3835f816e 287 } else if (re[i] == '^') {
monpetit 0:3ca3835f816e 288 FAIL_IF(j != 0, SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 289 } else if (re[i] == '$') {
monpetit 0:3ca3835f816e 290 FAIL_IF(j != s_len, SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 291 } else {
monpetit 0:3ca3835f816e 292 FAIL_IF(j >= s_len, SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 293 n = match_op((unsigned char *) (re + i), (unsigned char *) (s + j), info);
monpetit 0:3ca3835f816e 294 FAIL_IF(n <= 0, n);
monpetit 0:3ca3835f816e 295 j += n;
monpetit 0:3ca3835f816e 296 }
monpetit 0:3ca3835f816e 297 }
monpetit 0:3ca3835f816e 298
monpetit 0:3ca3835f816e 299 return j;
monpetit 0:3ca3835f816e 300 }
monpetit 0:3ca3835f816e 301
monpetit 0:3ca3835f816e 302 /* Process branch points */
monpetit 0:3ca3835f816e 303 static int doh(const char *s, int s_len, struct regex_info *info, int bi) {
monpetit 0:3ca3835f816e 304 const struct bracket_pair *b = &info->brackets[bi];
monpetit 0:3ca3835f816e 305 int i = 0, len, result;
monpetit 0:3ca3835f816e 306 const char *p;
monpetit 0:3ca3835f816e 307
monpetit 0:3ca3835f816e 308 do {
monpetit 0:3ca3835f816e 309 p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1;
monpetit 0:3ca3835f816e 310 len = b->num_branches == 0 ? b->len :
monpetit 0:3ca3835f816e 311 i == b->num_branches ? (int) (b->ptr + b->len - p) :
monpetit 0:3ca3835f816e 312 (int) (info->branches[b->branches + i].schlong - p);
monpetit 0:3ca3835f816e 313 DBG(("%s %d %d [%.*s] [%.*s]\n", __func__, bi, i, len, p, s_len, s));
monpetit 0:3ca3835f816e 314 result = bar(p, len, s, s_len, info, bi);
monpetit 0:3ca3835f816e 315 DBG(("%s <- %d\n", __func__, result));
monpetit 0:3ca3835f816e 316 } while (result <= 0 && i++ < b->num_branches); /* At least 1 iteration */
monpetit 0:3ca3835f816e 317
monpetit 0:3ca3835f816e 318 return result;
monpetit 0:3ca3835f816e 319 }
monpetit 0:3ca3835f816e 320
monpetit 0:3ca3835f816e 321 static int baz(const char *s, int s_len, struct regex_info *info) {
monpetit 0:3ca3835f816e 322 int i, result = -1, is_anchored = info->brackets[0].ptr[0] == '^';
monpetit 0:3ca3835f816e 323
monpetit 0:3ca3835f816e 324 for (i = 0; i <= s_len; i++) {
monpetit 0:3ca3835f816e 325 result = doh(s + i, s_len - i, info, 0);
monpetit 0:3ca3835f816e 326 if (result >= 0) {
monpetit 0:3ca3835f816e 327 result += i;
monpetit 0:3ca3835f816e 328 break;
monpetit 0:3ca3835f816e 329 }
monpetit 0:3ca3835f816e 330 if (is_anchored) break;
monpetit 0:3ca3835f816e 331 }
monpetit 0:3ca3835f816e 332
monpetit 0:3ca3835f816e 333 return result;
monpetit 0:3ca3835f816e 334 }
monpetit 0:3ca3835f816e 335
monpetit 0:3ca3835f816e 336 static void setup_branch_points(struct regex_info *info) {
monpetit 0:3ca3835f816e 337 int i, j;
monpetit 0:3ca3835f816e 338 struct branch tmp;
monpetit 0:3ca3835f816e 339
monpetit 0:3ca3835f816e 340 /* First, sort branches. Must be stable, no qsort. Use bubble algo. */
monpetit 0:3ca3835f816e 341 for (i = 0; i < info->num_branches; i++) {
monpetit 0:3ca3835f816e 342 for (j = i + 1; j < info->num_branches; j++) {
monpetit 0:3ca3835f816e 343 if (info->branches[i].bracket_index > info->branches[j].bracket_index) {
monpetit 0:3ca3835f816e 344 tmp = info->branches[i];
monpetit 0:3ca3835f816e 345 info->branches[i] = info->branches[j];
monpetit 0:3ca3835f816e 346 info->branches[j] = tmp;
monpetit 0:3ca3835f816e 347 }
monpetit 0:3ca3835f816e 348 }
monpetit 0:3ca3835f816e 349 }
monpetit 0:3ca3835f816e 350
monpetit 0:3ca3835f816e 351 /*
monpetit 0:3ca3835f816e 352 * For each bracket, set their branch points. This way, for every bracket
monpetit 0:3ca3835f816e 353 * (i.e. every chunk of regex) we know all branch points before matching.
monpetit 0:3ca3835f816e 354 */
monpetit 0:3ca3835f816e 355 for (i = j = 0; i < info->num_brackets; i++) {
monpetit 0:3ca3835f816e 356 info->brackets[i].num_branches = 0;
monpetit 0:3ca3835f816e 357 info->brackets[i].branches = j;
monpetit 0:3ca3835f816e 358 while (j < info->num_branches && info->branches[j].bracket_index == i) {
monpetit 0:3ca3835f816e 359 info->brackets[i].num_branches++;
monpetit 0:3ca3835f816e 360 j++;
monpetit 0:3ca3835f816e 361 }
monpetit 0:3ca3835f816e 362 }
monpetit 0:3ca3835f816e 363 }
monpetit 0:3ca3835f816e 364
monpetit 0:3ca3835f816e 365 static int foo(const char *re, int re_len, const char *s, int s_len,
monpetit 0:3ca3835f816e 366 struct regex_info *info) {
monpetit 0:3ca3835f816e 367 int i, step, depth = 0;
monpetit 0:3ca3835f816e 368
monpetit 0:3ca3835f816e 369 /* First bracket captures everything */
monpetit 0:3ca3835f816e 370 info->brackets[0].ptr = re;
monpetit 0:3ca3835f816e 371 info->brackets[0].len = re_len;
monpetit 0:3ca3835f816e 372 info->num_brackets = 1;
monpetit 0:3ca3835f816e 373
monpetit 0:3ca3835f816e 374 /* Make a single pass over regex string, memorize brackets and branches */
monpetit 0:3ca3835f816e 375 for (i = 0; i < re_len; i += step) {
monpetit 0:3ca3835f816e 376 step = get_op_len(re + i, re_len - i);
monpetit 0:3ca3835f816e 377
monpetit 0:3ca3835f816e 378 if (re[i] == '|') {
monpetit 0:3ca3835f816e 379 FAIL_IF(info->num_branches >= (int) ARRAY_SIZE(info->branches),
monpetit 0:3ca3835f816e 380 SLRE_TOO_MANY_BRANCHES);
monpetit 0:3ca3835f816e 381 info->branches[info->num_branches].bracket_index =
monpetit 0:3ca3835f816e 382 info->brackets[info->num_brackets - 1].len == -1 ?
monpetit 0:3ca3835f816e 383 info->num_brackets - 1 : depth;
monpetit 0:3ca3835f816e 384 info->branches[info->num_branches].schlong = &re[i];
monpetit 0:3ca3835f816e 385 info->num_branches++;
monpetit 0:3ca3835f816e 386 } else if (re[i] == '\\') {
monpetit 0:3ca3835f816e 387 FAIL_IF(i >= re_len - 1, SLRE_INVALID_METACHARACTER);
monpetit 0:3ca3835f816e 388 if (re[i + 1] == 'x') {
monpetit 0:3ca3835f816e 389 /* Hex digit specification must follow */
monpetit 0:3ca3835f816e 390 FAIL_IF(re[i + 1] == 'x' && i >= re_len - 3,
monpetit 0:3ca3835f816e 391 SLRE_INVALID_METACHARACTER);
monpetit 0:3ca3835f816e 392 FAIL_IF(re[i + 1] == 'x' && !(isxdigit(re[i + 2]) &&
monpetit 0:3ca3835f816e 393 isxdigit(re[i + 3])), SLRE_INVALID_METACHARACTER);
monpetit 0:3ca3835f816e 394 } else {
monpetit 0:3ca3835f816e 395 FAIL_IF(!is_metacharacter((unsigned char *) re + i + 1),
monpetit 0:3ca3835f816e 396 SLRE_INVALID_METACHARACTER);
monpetit 0:3ca3835f816e 397 }
monpetit 0:3ca3835f816e 398 } else if (re[i] == '(') {
monpetit 0:3ca3835f816e 399 FAIL_IF(info->num_brackets >= (int) ARRAY_SIZE(info->brackets),
monpetit 0:3ca3835f816e 400 SLRE_TOO_MANY_BRACKETS);
monpetit 0:3ca3835f816e 401 depth++; /* Order is important here. Depth increments first. */
monpetit 0:3ca3835f816e 402 info->brackets[info->num_brackets].ptr = re + i + 1;
monpetit 0:3ca3835f816e 403 info->brackets[info->num_brackets].len = -1;
monpetit 0:3ca3835f816e 404 info->num_brackets++;
monpetit 0:3ca3835f816e 405 FAIL_IF(info->num_caps > 0 && info->num_brackets - 1 > info->num_caps,
monpetit 0:3ca3835f816e 406 SLRE_CAPS_ARRAY_TOO_SMALL);
monpetit 0:3ca3835f816e 407 } else if (re[i] == ')') {
monpetit 0:3ca3835f816e 408 int ind = info->brackets[info->num_brackets - 1].len == -1 ?
monpetit 0:3ca3835f816e 409 info->num_brackets - 1 : depth;
monpetit 0:3ca3835f816e 410 info->brackets[ind].len = (int) (&re[i] - info->brackets[ind].ptr);
monpetit 0:3ca3835f816e 411 DBG(("SETTING BRACKET %d [%.*s]\n",
monpetit 0:3ca3835f816e 412 ind, info->brackets[ind].len, info->brackets[ind].ptr));
monpetit 0:3ca3835f816e 413 depth--;
monpetit 0:3ca3835f816e 414 FAIL_IF(depth < 0, SLRE_UNBALANCED_BRACKETS);
monpetit 0:3ca3835f816e 415 FAIL_IF(i > 0 && re[i - 1] == '(', SLRE_NO_MATCH);
monpetit 0:3ca3835f816e 416 }
monpetit 0:3ca3835f816e 417 }
monpetit 0:3ca3835f816e 418
monpetit 0:3ca3835f816e 419 FAIL_IF(depth != 0, SLRE_UNBALANCED_BRACKETS);
monpetit 0:3ca3835f816e 420 setup_branch_points(info);
monpetit 0:3ca3835f816e 421
monpetit 0:3ca3835f816e 422 return baz(s, s_len, info);
monpetit 0:3ca3835f816e 423 }
monpetit 0:3ca3835f816e 424
monpetit 0:3ca3835f816e 425 int slre_match(const char *regexp, const char *s, int s_len,
monpetit 0:3ca3835f816e 426 struct slre_cap *caps, int num_caps, int flags) {
monpetit 0:3ca3835f816e 427 struct regex_info info;
monpetit 0:3ca3835f816e 428
monpetit 0:3ca3835f816e 429 /* Initialize info structure */
monpetit 0:3ca3835f816e 430 info.flags = flags;
monpetit 0:3ca3835f816e 431 info.num_brackets = info.num_branches = 0;
monpetit 0:3ca3835f816e 432 info.num_caps = num_caps;
monpetit 0:3ca3835f816e 433 info.caps = caps;
monpetit 0:3ca3835f816e 434
monpetit 0:3ca3835f816e 435 DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s));
monpetit 0:3ca3835f816e 436 return foo(regexp, (int) strlen(regexp), s, s_len, &info);
monpetit 0:3ca3835f816e 437 }