slre - Super Light Regular Expression library URL: http://slre.sourceforge.net/ Just ported to mbed.

Dependencies:   mbed

Committer:
rolf
Date:
Wed Nov 18 18:01:01 2009 +0000
Revision:
0:e0b85a04e7e5

        

Who changed what in which revision?

UserRevisionLine numberNew contents of line
rolf 0:e0b85a04e7e5 1 /*
rolf 0:e0b85a04e7e5 2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
rolf 0:e0b85a04e7e5 3 * All rights reserved
rolf 0:e0b85a04e7e5 4 *
rolf 0:e0b85a04e7e5 5 * "THE BEER-WARE LICENSE" (Revision 42):
rolf 0:e0b85a04e7e5 6 * Sergey Lyubka wrote this file. As long as you retain this notice you
rolf 0:e0b85a04e7e5 7 * can do whatever you want with this stuff. If we meet some day, and you think
rolf 0:e0b85a04e7e5 8 * this stuff is worth it, you can buy me a beer in return.
rolf 0:e0b85a04e7e5 9 */
rolf 0:e0b85a04e7e5 10
rolf 0:e0b85a04e7e5 11 #include <stdio.h>
rolf 0:e0b85a04e7e5 12 #include <assert.h>
rolf 0:e0b85a04e7e5 13 #include <ctype.h>
rolf 0:e0b85a04e7e5 14 #include <stdlib.h>
rolf 0:e0b85a04e7e5 15 #include <string.h>
rolf 0:e0b85a04e7e5 16 #include <errno.h>
rolf 0:e0b85a04e7e5 17
rolf 0:e0b85a04e7e5 18 #include "slre.h"
rolf 0:e0b85a04e7e5 19
rolf 0:e0b85a04e7e5 20 enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
rolf 0:e0b85a04e7e5 21 STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
rolf 0:e0b85a04e7e5 22
rolf 0:e0b85a04e7e5 23 static struct {
rolf 0:e0b85a04e7e5 24 const char *name;
rolf 0:e0b85a04e7e5 25 int narg;
rolf 0:e0b85a04e7e5 26 const char *flags;
rolf 0:e0b85a04e7e5 27 } opcodes[] = {
rolf 0:e0b85a04e7e5 28 {"END", 0, ""}, /* End of code block or program */
rolf 0:e0b85a04e7e5 29 {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */
rolf 0:e0b85a04e7e5 30 {"ANY", 0, ""}, /* Match any character, "." */
rolf 0:e0b85a04e7e5 31 {"EXACT", 2, "d"}, /* Match exact string */
rolf 0:e0b85a04e7e5 32 {"ANYOF", 2, "D"}, /* Match any from set, "[]" */
rolf 0:e0b85a04e7e5 33 {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/
rolf 0:e0b85a04e7e5 34 {"OPEN ", 1, "i"}, /* Capture start, "(" */
rolf 0:e0b85a04e7e5 35 {"CLOSE", 1, "i"}, /* Capture end, ")" */
rolf 0:e0b85a04e7e5 36 {"BOL", 0, ""}, /* Beginning of string, "^" */
rolf 0:e0b85a04e7e5 37 {"EOL", 0, ""}, /* End of string, "$" */
rolf 0:e0b85a04e7e5 38 {"STAR", 1, "o"}, /* Match zero or more times "*" */
rolf 0:e0b85a04e7e5 39 {"PLUS", 1, "o"}, /* Match one or more times, "+" */
rolf 0:e0b85a04e7e5 40 {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */
rolf 0:e0b85a04e7e5 41 {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */
rolf 0:e0b85a04e7e5 42 {"QUEST", 1, "o"}, /* Match zero or one time, "?" */
rolf 0:e0b85a04e7e5 43 {"SPACE", 0, ""}, /* Match whitespace, "\s" */
rolf 0:e0b85a04e7e5 44 {"NONSPACE", 0, ""}, /* Match non-space, "\S" */
rolf 0:e0b85a04e7e5 45 {"DIGIT", 0, ""} /* Match digit, "\d" */
rolf 0:e0b85a04e7e5 46 };
rolf 0:e0b85a04e7e5 47
rolf 0:e0b85a04e7e5 48 /*
rolf 0:e0b85a04e7e5 49 * Commands and operands are all unsigned char (1 byte long). All code offsets
rolf 0:e0b85a04e7e5 50 * are relative to current address, and positive (always point forward). Data
rolf 0:e0b85a04e7e5 51 * offsets are absolute. Commands with operands:
rolf 0:e0b85a04e7e5 52 *
rolf 0:e0b85a04e7e5 53 * BRANCH offset1 offset2
rolf 0:e0b85a04e7e5 54 * Try to match the code block that follows the BRANCH instruction
rolf 0:e0b85a04e7e5 55 * (code block ends with END). If no match, try to match code block that
rolf 0:e0b85a04e7e5 56 * starts at offset1. If either of these match, jump to offset2.
rolf 0:e0b85a04e7e5 57 *
rolf 0:e0b85a04e7e5 58 * EXACT data_offset data_length
rolf 0:e0b85a04e7e5 59 * Try to match exact string. String is recorded in data section from
rolf 0:e0b85a04e7e5 60 * data_offset, and has length data_length.
rolf 0:e0b85a04e7e5 61 *
rolf 0:e0b85a04e7e5 62 * OPEN capture_number, CLOSE capture_number
rolf 0:e0b85a04e7e5 63 * If the user have passed 'struct cap' array for captures, OPEN
rolf 0:e0b85a04e7e5 64 * records the beginning of the matched substring (cap->ptr), CLOSE
rolf 0:e0b85a04e7e5 65 * sets the length (cap->len) for respective capture_number.
rolf 0:e0b85a04e7e5 66 *
rolf 0:e0b85a04e7e5 67 * STAR code_offset, PLUS code_offset, QUEST code_offset
rolf 0:e0b85a04e7e5 68 * *, +, ?, respectively. Try to gobble as much as possible from the
rolf 0:e0b85a04e7e5 69 * matched buffer, until code block that follows these instructions
rolf 0:e0b85a04e7e5 70 * matches. When the longest possible string is matched,
rolf 0:e0b85a04e7e5 71 * jump to code_offset
rolf 0:e0b85a04e7e5 72 *
rolf 0:e0b85a04e7e5 73 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
rolf 0:e0b85a04e7e5 74 */
rolf 0:e0b85a04e7e5 75
rolf 0:e0b85a04e7e5 76 static const char *meta_chars = "|.^$*+?()[\\";
rolf 0:e0b85a04e7e5 77
rolf 0:e0b85a04e7e5 78 static void
rolf 0:e0b85a04e7e5 79 print_character_set(FILE *fp, const unsigned char *p, int len)
rolf 0:e0b85a04e7e5 80 {
rolf 0:e0b85a04e7e5 81 int i;
rolf 0:e0b85a04e7e5 82
rolf 0:e0b85a04e7e5 83 for (i = 0; i < len; i++) {
rolf 0:e0b85a04e7e5 84 if (i > 0)
rolf 0:e0b85a04e7e5 85 (void) fputc(',', fp);
rolf 0:e0b85a04e7e5 86 if (p[i] == 0) {
rolf 0:e0b85a04e7e5 87 i++;
rolf 0:e0b85a04e7e5 88 if (p[i] == 0)
rolf 0:e0b85a04e7e5 89 (void) fprintf(fp, "\\x%02x", p[i]);
rolf 0:e0b85a04e7e5 90 else
rolf 0:e0b85a04e7e5 91 (void) fprintf(fp, "%s", opcodes[p[i]].name);
rolf 0:e0b85a04e7e5 92 } else if (isprint(p[i])) {
rolf 0:e0b85a04e7e5 93 (void) fputc(p[i], fp);
rolf 0:e0b85a04e7e5 94 } else {
rolf 0:e0b85a04e7e5 95 (void) fprintf(fp,"\\x%02x", p[i]);
rolf 0:e0b85a04e7e5 96 }
rolf 0:e0b85a04e7e5 97 }
rolf 0:e0b85a04e7e5 98 }
rolf 0:e0b85a04e7e5 99
rolf 0:e0b85a04e7e5 100 void
rolf 0:e0b85a04e7e5 101 slre_dump(const struct slre *r, FILE *fp)
rolf 0:e0b85a04e7e5 102 {
rolf 0:e0b85a04e7e5 103 int i, j, ch, op, pc;
rolf 0:e0b85a04e7e5 104
rolf 0:e0b85a04e7e5 105 for (pc = 0; pc < r->code_size; pc++) {
rolf 0:e0b85a04e7e5 106
rolf 0:e0b85a04e7e5 107 op = r->code[pc];
rolf 0:e0b85a04e7e5 108 (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
rolf 0:e0b85a04e7e5 109
rolf 0:e0b85a04e7e5 110 for (i = 0; opcodes[op].flags[i] != '\0'; i++)
rolf 0:e0b85a04e7e5 111 switch (opcodes[op].flags[i]) {
rolf 0:e0b85a04e7e5 112 case 'i':
rolf 0:e0b85a04e7e5 113 (void) fprintf(fp, "%d ", r->code[pc + 1]);
rolf 0:e0b85a04e7e5 114 pc++;
rolf 0:e0b85a04e7e5 115 break;
rolf 0:e0b85a04e7e5 116 case 'o':
rolf 0:e0b85a04e7e5 117 (void) fprintf(fp, "%d ",
rolf 0:e0b85a04e7e5 118 pc + r->code[pc + 1] - i);
rolf 0:e0b85a04e7e5 119 pc++;
rolf 0:e0b85a04e7e5 120 break;
rolf 0:e0b85a04e7e5 121 case 'D':
rolf 0:e0b85a04e7e5 122 print_character_set(fp, r->data +
rolf 0:e0b85a04e7e5 123 r->code[pc + 1], r->code[pc + 2]);
rolf 0:e0b85a04e7e5 124 pc += 2;
rolf 0:e0b85a04e7e5 125 break;
rolf 0:e0b85a04e7e5 126 case 'd':
rolf 0:e0b85a04e7e5 127 (void) fputc('"', fp);
rolf 0:e0b85a04e7e5 128 for (j = 0; j < r->code[pc + 2]; j++) {
rolf 0:e0b85a04e7e5 129 ch = r->data[r->code[pc + 1] + j];
rolf 0:e0b85a04e7e5 130 if (isprint(ch))
rolf 0:e0b85a04e7e5 131 (void) fputc(ch, fp);
rolf 0:e0b85a04e7e5 132 else
rolf 0:e0b85a04e7e5 133 (void) fprintf(fp,"\\x%02x",ch);
rolf 0:e0b85a04e7e5 134 }
rolf 0:e0b85a04e7e5 135 (void) fputc('"', fp);
rolf 0:e0b85a04e7e5 136 pc += 2;
rolf 0:e0b85a04e7e5 137 break;
rolf 0:e0b85a04e7e5 138 }
rolf 0:e0b85a04e7e5 139
rolf 0:e0b85a04e7e5 140 (void) fputc('\n', fp);
rolf 0:e0b85a04e7e5 141 }
rolf 0:e0b85a04e7e5 142 }
rolf 0:e0b85a04e7e5 143
rolf 0:e0b85a04e7e5 144 static void
rolf 0:e0b85a04e7e5 145 set_jump_offset(struct slre *r, int pc, int offset)
rolf 0:e0b85a04e7e5 146 {
rolf 0:e0b85a04e7e5 147 assert(offset < r->code_size);
rolf 0:e0b85a04e7e5 148
rolf 0:e0b85a04e7e5 149 if (r->code_size - offset > 0xff) {
rolf 0:e0b85a04e7e5 150 r->err_str = "Jump offset is too big";
rolf 0:e0b85a04e7e5 151 } else {
rolf 0:e0b85a04e7e5 152 r->code[pc] = (unsigned char) (r->code_size - offset);
rolf 0:e0b85a04e7e5 153 }
rolf 0:e0b85a04e7e5 154 }
rolf 0:e0b85a04e7e5 155
rolf 0:e0b85a04e7e5 156 static void
rolf 0:e0b85a04e7e5 157 emit(struct slre *r, int code)
rolf 0:e0b85a04e7e5 158 {
rolf 0:e0b85a04e7e5 159 if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
rolf 0:e0b85a04e7e5 160 r->err_str = "RE is too long (code overflow)";
rolf 0:e0b85a04e7e5 161 else
rolf 0:e0b85a04e7e5 162 r->code[r->code_size++] = (unsigned char) code;
rolf 0:e0b85a04e7e5 163 }
rolf 0:e0b85a04e7e5 164
rolf 0:e0b85a04e7e5 165 static void
rolf 0:e0b85a04e7e5 166 store_char_in_data(struct slre *r, int ch)
rolf 0:e0b85a04e7e5 167 {
rolf 0:e0b85a04e7e5 168 if (r->data_size >= (int) sizeof(r->data))
rolf 0:e0b85a04e7e5 169 r->err_str = "RE is too long (data overflow)";
rolf 0:e0b85a04e7e5 170 else
rolf 0:e0b85a04e7e5 171 r->data[r->data_size++] = ch;
rolf 0:e0b85a04e7e5 172 }
rolf 0:e0b85a04e7e5 173
rolf 0:e0b85a04e7e5 174 static void
rolf 0:e0b85a04e7e5 175 exact(struct slre *r, const char **re)
rolf 0:e0b85a04e7e5 176 {
rolf 0:e0b85a04e7e5 177 int old_data_size = r->data_size;
rolf 0:e0b85a04e7e5 178
rolf 0:e0b85a04e7e5 179 while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
rolf 0:e0b85a04e7e5 180 store_char_in_data(r, *(*re)++);
rolf 0:e0b85a04e7e5 181
rolf 0:e0b85a04e7e5 182 emit(r, EXACT);
rolf 0:e0b85a04e7e5 183 emit(r, old_data_size);
rolf 0:e0b85a04e7e5 184 emit(r, r->data_size - old_data_size);
rolf 0:e0b85a04e7e5 185 }
rolf 0:e0b85a04e7e5 186
rolf 0:e0b85a04e7e5 187 static int
rolf 0:e0b85a04e7e5 188 get_escape_char(const char **re)
rolf 0:e0b85a04e7e5 189 {
rolf 0:e0b85a04e7e5 190 int res;
rolf 0:e0b85a04e7e5 191
rolf 0:e0b85a04e7e5 192 switch (*(*re)++) {
rolf 0:e0b85a04e7e5 193 case 'n': res = '\n'; break;
rolf 0:e0b85a04e7e5 194 case 'r': res = '\r'; break;
rolf 0:e0b85a04e7e5 195 case 't': res = '\t'; break;
rolf 0:e0b85a04e7e5 196 case '0': res = 0; break;
rolf 0:e0b85a04e7e5 197 case 'S': res = NONSPACE << 8; break;
rolf 0:e0b85a04e7e5 198 case 's': res = SPACE << 8; break;
rolf 0:e0b85a04e7e5 199 case 'd': res = DIGIT << 8; break;
rolf 0:e0b85a04e7e5 200 default: res = (*re)[-1]; break;
rolf 0:e0b85a04e7e5 201 }
rolf 0:e0b85a04e7e5 202
rolf 0:e0b85a04e7e5 203 return (res);
rolf 0:e0b85a04e7e5 204 }
rolf 0:e0b85a04e7e5 205
rolf 0:e0b85a04e7e5 206 static void
rolf 0:e0b85a04e7e5 207 anyof(struct slre *r, const char **re)
rolf 0:e0b85a04e7e5 208 {
rolf 0:e0b85a04e7e5 209 int esc, old_data_size = r->data_size, op = ANYOF;
rolf 0:e0b85a04e7e5 210
rolf 0:e0b85a04e7e5 211 if (**re == '^') {
rolf 0:e0b85a04e7e5 212 op = ANYBUT;
rolf 0:e0b85a04e7e5 213 (*re)++;
rolf 0:e0b85a04e7e5 214 }
rolf 0:e0b85a04e7e5 215
rolf 0:e0b85a04e7e5 216 while (**re != '\0')
rolf 0:e0b85a04e7e5 217
rolf 0:e0b85a04e7e5 218 switch (*(*re)++) {
rolf 0:e0b85a04e7e5 219 case ']':
rolf 0:e0b85a04e7e5 220 emit(r, op);
rolf 0:e0b85a04e7e5 221 emit(r, old_data_size);
rolf 0:e0b85a04e7e5 222 emit(r, r->data_size - old_data_size);
rolf 0:e0b85a04e7e5 223 return;
rolf 0:e0b85a04e7e5 224 /* NOTREACHED */
rolf 0:e0b85a04e7e5 225 break;
rolf 0:e0b85a04e7e5 226 case '\\':
rolf 0:e0b85a04e7e5 227 esc = get_escape_char(re);
rolf 0:e0b85a04e7e5 228 if ((esc & 0xff) == 0) {
rolf 0:e0b85a04e7e5 229 store_char_in_data(r, 0);
rolf 0:e0b85a04e7e5 230 store_char_in_data(r, esc >> 8);
rolf 0:e0b85a04e7e5 231 } else {
rolf 0:e0b85a04e7e5 232 store_char_in_data(r, esc);
rolf 0:e0b85a04e7e5 233 }
rolf 0:e0b85a04e7e5 234 break;
rolf 0:e0b85a04e7e5 235 default:
rolf 0:e0b85a04e7e5 236 store_char_in_data(r, (*re)[-1]);
rolf 0:e0b85a04e7e5 237 break;
rolf 0:e0b85a04e7e5 238 }
rolf 0:e0b85a04e7e5 239
rolf 0:e0b85a04e7e5 240 r->err_str = "No closing ']' bracket";
rolf 0:e0b85a04e7e5 241 }
rolf 0:e0b85a04e7e5 242
rolf 0:e0b85a04e7e5 243 static void
rolf 0:e0b85a04e7e5 244 relocate(struct slre *r, int begin, int shift)
rolf 0:e0b85a04e7e5 245 {
rolf 0:e0b85a04e7e5 246 emit(r, END);
rolf 0:e0b85a04e7e5 247 memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
rolf 0:e0b85a04e7e5 248 r->code_size += shift;
rolf 0:e0b85a04e7e5 249 }
rolf 0:e0b85a04e7e5 250
rolf 0:e0b85a04e7e5 251 static void
rolf 0:e0b85a04e7e5 252 quantifier(struct slre *r, int prev, int op)
rolf 0:e0b85a04e7e5 253 {
rolf 0:e0b85a04e7e5 254 if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
rolf 0:e0b85a04e7e5 255 r->code[prev + 2]--;
rolf 0:e0b85a04e7e5 256 emit(r, EXACT);
rolf 0:e0b85a04e7e5 257 emit(r, r->code[prev + 1] + r->code[prev + 2]);
rolf 0:e0b85a04e7e5 258 emit(r, 1);
rolf 0:e0b85a04e7e5 259 prev = r->code_size - 3;
rolf 0:e0b85a04e7e5 260 }
rolf 0:e0b85a04e7e5 261 relocate(r, prev, 2);
rolf 0:e0b85a04e7e5 262 r->code[prev] = op;
rolf 0:e0b85a04e7e5 263 set_jump_offset(r, prev + 1, prev);
rolf 0:e0b85a04e7e5 264 }
rolf 0:e0b85a04e7e5 265
rolf 0:e0b85a04e7e5 266 static void
rolf 0:e0b85a04e7e5 267 exact_one_char(struct slre *r, int ch)
rolf 0:e0b85a04e7e5 268 {
rolf 0:e0b85a04e7e5 269 emit(r, EXACT);
rolf 0:e0b85a04e7e5 270 emit(r, r->data_size);
rolf 0:e0b85a04e7e5 271 emit(r, 1);
rolf 0:e0b85a04e7e5 272 store_char_in_data(r, ch);
rolf 0:e0b85a04e7e5 273 }
rolf 0:e0b85a04e7e5 274
rolf 0:e0b85a04e7e5 275 static void
rolf 0:e0b85a04e7e5 276 fixup_branch(struct slre *r, int fixup)
rolf 0:e0b85a04e7e5 277 {
rolf 0:e0b85a04e7e5 278 if (fixup > 0) {
rolf 0:e0b85a04e7e5 279 emit(r, END);
rolf 0:e0b85a04e7e5 280 set_jump_offset(r, fixup, fixup - 2);
rolf 0:e0b85a04e7e5 281 }
rolf 0:e0b85a04e7e5 282 }
rolf 0:e0b85a04e7e5 283
rolf 0:e0b85a04e7e5 284 static void
rolf 0:e0b85a04e7e5 285 compile(struct slre *r, const char **re)
rolf 0:e0b85a04e7e5 286 {
rolf 0:e0b85a04e7e5 287 int op, esc, branch_start, last_op, fixup, cap_no, level;
rolf 0:e0b85a04e7e5 288
rolf 0:e0b85a04e7e5 289 fixup = 0;
rolf 0:e0b85a04e7e5 290 level = r->num_caps;
rolf 0:e0b85a04e7e5 291 branch_start = last_op = r->code_size;
rolf 0:e0b85a04e7e5 292
rolf 0:e0b85a04e7e5 293 for (;;)
rolf 0:e0b85a04e7e5 294 switch (*(*re)++) {
rolf 0:e0b85a04e7e5 295 case '\0':
rolf 0:e0b85a04e7e5 296 (*re)--;
rolf 0:e0b85a04e7e5 297 return;
rolf 0:e0b85a04e7e5 298 /* NOTREACHED */
rolf 0:e0b85a04e7e5 299 break;
rolf 0:e0b85a04e7e5 300 case '^':
rolf 0:e0b85a04e7e5 301 emit(r, BOL);
rolf 0:e0b85a04e7e5 302 break;
rolf 0:e0b85a04e7e5 303 case '$':
rolf 0:e0b85a04e7e5 304 emit(r, EOL);
rolf 0:e0b85a04e7e5 305 break;
rolf 0:e0b85a04e7e5 306 case '.':
rolf 0:e0b85a04e7e5 307 last_op = r->code_size;
rolf 0:e0b85a04e7e5 308 emit(r, ANY);
rolf 0:e0b85a04e7e5 309 break;
rolf 0:e0b85a04e7e5 310 case '[':
rolf 0:e0b85a04e7e5 311 anyof(r, re);
rolf 0:e0b85a04e7e5 312 break;
rolf 0:e0b85a04e7e5 313 case '\\':
rolf 0:e0b85a04e7e5 314 last_op = r->code_size;
rolf 0:e0b85a04e7e5 315 esc = get_escape_char(re);
rolf 0:e0b85a04e7e5 316 if (esc & 0xff00) {
rolf 0:e0b85a04e7e5 317 emit(r, esc >> 8);
rolf 0:e0b85a04e7e5 318 } else {
rolf 0:e0b85a04e7e5 319 exact_one_char(r, esc);
rolf 0:e0b85a04e7e5 320 }
rolf 0:e0b85a04e7e5 321 break;
rolf 0:e0b85a04e7e5 322 case '(':
rolf 0:e0b85a04e7e5 323 last_op = r->code_size;
rolf 0:e0b85a04e7e5 324 cap_no = ++r->num_caps;
rolf 0:e0b85a04e7e5 325 emit(r, OPEN);
rolf 0:e0b85a04e7e5 326 emit(r, cap_no);
rolf 0:e0b85a04e7e5 327
rolf 0:e0b85a04e7e5 328 compile(r, re);
rolf 0:e0b85a04e7e5 329 if (*(*re)++ != ')') {
rolf 0:e0b85a04e7e5 330 r->err_str = "No closing bracket";
rolf 0:e0b85a04e7e5 331 return;
rolf 0:e0b85a04e7e5 332 }
rolf 0:e0b85a04e7e5 333
rolf 0:e0b85a04e7e5 334 emit(r, CLOSE);
rolf 0:e0b85a04e7e5 335 emit(r, cap_no);
rolf 0:e0b85a04e7e5 336 break;
rolf 0:e0b85a04e7e5 337 case ')':
rolf 0:e0b85a04e7e5 338 (*re)--;
rolf 0:e0b85a04e7e5 339 fixup_branch(r, fixup);
rolf 0:e0b85a04e7e5 340 if (level == 0) {
rolf 0:e0b85a04e7e5 341 r->err_str = "Unbalanced brackets";
rolf 0:e0b85a04e7e5 342 return;
rolf 0:e0b85a04e7e5 343 }
rolf 0:e0b85a04e7e5 344 return;
rolf 0:e0b85a04e7e5 345 /* NOTREACHED */
rolf 0:e0b85a04e7e5 346 break;
rolf 0:e0b85a04e7e5 347 case '+':
rolf 0:e0b85a04e7e5 348 case '*':
rolf 0:e0b85a04e7e5 349 op = (*re)[-1] == '*' ? STAR: PLUS;
rolf 0:e0b85a04e7e5 350 if (**re == '?') {
rolf 0:e0b85a04e7e5 351 (*re)++;
rolf 0:e0b85a04e7e5 352 op = op == STAR ? STARQ : PLUSQ;
rolf 0:e0b85a04e7e5 353 }
rolf 0:e0b85a04e7e5 354 quantifier(r, last_op, op);
rolf 0:e0b85a04e7e5 355 break;
rolf 0:e0b85a04e7e5 356 case '?':
rolf 0:e0b85a04e7e5 357 quantifier(r, last_op, QUEST);
rolf 0:e0b85a04e7e5 358 break;
rolf 0:e0b85a04e7e5 359 case '|':
rolf 0:e0b85a04e7e5 360 fixup_branch(r, fixup);
rolf 0:e0b85a04e7e5 361 relocate(r, branch_start, 3);
rolf 0:e0b85a04e7e5 362 r->code[branch_start] = BRANCH;
rolf 0:e0b85a04e7e5 363 set_jump_offset(r, branch_start + 1, branch_start);
rolf 0:e0b85a04e7e5 364 fixup = branch_start + 2;
rolf 0:e0b85a04e7e5 365 r->code[fixup] = 0xff;
rolf 0:e0b85a04e7e5 366 break;
rolf 0:e0b85a04e7e5 367 default:
rolf 0:e0b85a04e7e5 368 (*re)--;
rolf 0:e0b85a04e7e5 369 last_op = r->code_size;
rolf 0:e0b85a04e7e5 370 exact(r, re);
rolf 0:e0b85a04e7e5 371 break;
rolf 0:e0b85a04e7e5 372 }
rolf 0:e0b85a04e7e5 373 }
rolf 0:e0b85a04e7e5 374
rolf 0:e0b85a04e7e5 375 int
rolf 0:e0b85a04e7e5 376 slre_compile(struct slre *r, const char *re)
rolf 0:e0b85a04e7e5 377 {
rolf 0:e0b85a04e7e5 378 r->err_str = NULL;
rolf 0:e0b85a04e7e5 379 r->code_size = r->data_size = r->num_caps = r->anchored = 0;
rolf 0:e0b85a04e7e5 380
rolf 0:e0b85a04e7e5 381 if (*re == '^')
rolf 0:e0b85a04e7e5 382 r->anchored++;
rolf 0:e0b85a04e7e5 383
rolf 0:e0b85a04e7e5 384 emit(r, OPEN); /* This will capture what matches full RE */
rolf 0:e0b85a04e7e5 385 emit(r, 0);
rolf 0:e0b85a04e7e5 386
rolf 0:e0b85a04e7e5 387 while (*re != '\0')
rolf 0:e0b85a04e7e5 388 compile(r, &re);
rolf 0:e0b85a04e7e5 389
rolf 0:e0b85a04e7e5 390 if (r->code[2] == BRANCH)
rolf 0:e0b85a04e7e5 391 fixup_branch(r, 4);
rolf 0:e0b85a04e7e5 392
rolf 0:e0b85a04e7e5 393 emit(r, CLOSE);
rolf 0:e0b85a04e7e5 394 emit(r, 0);
rolf 0:e0b85a04e7e5 395 emit(r, END);
rolf 0:e0b85a04e7e5 396
rolf 0:e0b85a04e7e5 397 return (r->err_str == NULL ? 1 : 0);
rolf 0:e0b85a04e7e5 398 }
rolf 0:e0b85a04e7e5 399
rolf 0:e0b85a04e7e5 400 static int match(const struct slre *, int,
rolf 0:e0b85a04e7e5 401 const char *, int, int *, struct cap *);
rolf 0:e0b85a04e7e5 402
rolf 0:e0b85a04e7e5 403 static void
rolf 0:e0b85a04e7e5 404 loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
rolf 0:e0b85a04e7e5 405 {
rolf 0:e0b85a04e7e5 406 int saved_offset, matched_offset;
rolf 0:e0b85a04e7e5 407
rolf 0:e0b85a04e7e5 408 saved_offset = matched_offset = *ofs;
rolf 0:e0b85a04e7e5 409
rolf 0:e0b85a04e7e5 410 while (match(r, pc + 2, s, len, ofs, NULL)) {
rolf 0:e0b85a04e7e5 411 saved_offset = *ofs;
rolf 0:e0b85a04e7e5 412 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
rolf 0:e0b85a04e7e5 413 matched_offset = saved_offset;
rolf 0:e0b85a04e7e5 414 *ofs = saved_offset;
rolf 0:e0b85a04e7e5 415 }
rolf 0:e0b85a04e7e5 416
rolf 0:e0b85a04e7e5 417 *ofs = matched_offset;
rolf 0:e0b85a04e7e5 418 }
rolf 0:e0b85a04e7e5 419
rolf 0:e0b85a04e7e5 420 static void
rolf 0:e0b85a04e7e5 421 loop_non_greedy(const struct slre *r, int pc, const char *s,int len, int *ofs)
rolf 0:e0b85a04e7e5 422 {
rolf 0:e0b85a04e7e5 423 int saved_offset = *ofs;
rolf 0:e0b85a04e7e5 424
rolf 0:e0b85a04e7e5 425 while (match(r, pc + 2, s, len, ofs, NULL)) {
rolf 0:e0b85a04e7e5 426 saved_offset = *ofs;
rolf 0:e0b85a04e7e5 427 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
rolf 0:e0b85a04e7e5 428 break;
rolf 0:e0b85a04e7e5 429 }
rolf 0:e0b85a04e7e5 430
rolf 0:e0b85a04e7e5 431 *ofs = saved_offset;
rolf 0:e0b85a04e7e5 432 }
rolf 0:e0b85a04e7e5 433
rolf 0:e0b85a04e7e5 434 static int
rolf 0:e0b85a04e7e5 435 is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
rolf 0:e0b85a04e7e5 436 {
rolf 0:e0b85a04e7e5 437 int i, ch;
rolf 0:e0b85a04e7e5 438
rolf 0:e0b85a04e7e5 439 ch = s[*ofs];
rolf 0:e0b85a04e7e5 440
rolf 0:e0b85a04e7e5 441 for (i = 0; i < len; i++)
rolf 0:e0b85a04e7e5 442 if (p[i] == ch) {
rolf 0:e0b85a04e7e5 443 (*ofs)++;
rolf 0:e0b85a04e7e5 444 return (1);
rolf 0:e0b85a04e7e5 445 }
rolf 0:e0b85a04e7e5 446
rolf 0:e0b85a04e7e5 447 return (0);
rolf 0:e0b85a04e7e5 448 }
rolf 0:e0b85a04e7e5 449
rolf 0:e0b85a04e7e5 450 static int
rolf 0:e0b85a04e7e5 451 is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
rolf 0:e0b85a04e7e5 452 {
rolf 0:e0b85a04e7e5 453 int i, ch;
rolf 0:e0b85a04e7e5 454
rolf 0:e0b85a04e7e5 455 ch = s[*ofs];
rolf 0:e0b85a04e7e5 456
rolf 0:e0b85a04e7e5 457 for (i = 0; i < len; i++)
rolf 0:e0b85a04e7e5 458 if (p[i] == ch)
rolf 0:e0b85a04e7e5 459 return (0);
rolf 0:e0b85a04e7e5 460
rolf 0:e0b85a04e7e5 461 (*ofs)++;
rolf 0:e0b85a04e7e5 462 return (1);
rolf 0:e0b85a04e7e5 463 }
rolf 0:e0b85a04e7e5 464
rolf 0:e0b85a04e7e5 465 static int
rolf 0:e0b85a04e7e5 466 match(const struct slre *r, int pc, const char *s, int len,
rolf 0:e0b85a04e7e5 467 int *ofs, struct cap *caps)
rolf 0:e0b85a04e7e5 468 {
rolf 0:e0b85a04e7e5 469 int n, saved_offset, res = 1;
rolf 0:e0b85a04e7e5 470
rolf 0:e0b85a04e7e5 471 while (res && r->code[pc] != END) {
rolf 0:e0b85a04e7e5 472
rolf 0:e0b85a04e7e5 473 assert(pc < r->code_size);
rolf 0:e0b85a04e7e5 474 assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
rolf 0:e0b85a04e7e5 475
rolf 0:e0b85a04e7e5 476 switch (r->code[pc]) {
rolf 0:e0b85a04e7e5 477 case BRANCH:
rolf 0:e0b85a04e7e5 478 saved_offset = *ofs;
rolf 0:e0b85a04e7e5 479 res = match(r, pc + 3, s, len, ofs, caps);
rolf 0:e0b85a04e7e5 480 if (res == 0) {
rolf 0:e0b85a04e7e5 481 *ofs = saved_offset;
rolf 0:e0b85a04e7e5 482 res = match(r, pc + r->code[pc + 1],
rolf 0:e0b85a04e7e5 483 s, len, ofs, caps);
rolf 0:e0b85a04e7e5 484 }
rolf 0:e0b85a04e7e5 485 pc += r->code[pc + 2];
rolf 0:e0b85a04e7e5 486 break;
rolf 0:e0b85a04e7e5 487 case EXACT:
rolf 0:e0b85a04e7e5 488 res = 0;
rolf 0:e0b85a04e7e5 489 n = r->code[pc + 2]; /* String length */
rolf 0:e0b85a04e7e5 490 if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
rolf 0:e0b85a04e7e5 491 r->code[pc + 1], n)) {
rolf 0:e0b85a04e7e5 492 (*ofs) += n;
rolf 0:e0b85a04e7e5 493 res = 1;
rolf 0:e0b85a04e7e5 494 }
rolf 0:e0b85a04e7e5 495 pc += 3;
rolf 0:e0b85a04e7e5 496 break;
rolf 0:e0b85a04e7e5 497 case QUEST:
rolf 0:e0b85a04e7e5 498 res = 1;
rolf 0:e0b85a04e7e5 499 saved_offset = *ofs;
rolf 0:e0b85a04e7e5 500 if (!match(r, pc + 2, s, len, ofs, caps))
rolf 0:e0b85a04e7e5 501 *ofs = saved_offset;
rolf 0:e0b85a04e7e5 502 pc += r->code[pc + 1];
rolf 0:e0b85a04e7e5 503 break;
rolf 0:e0b85a04e7e5 504 case STAR:
rolf 0:e0b85a04e7e5 505 res = 1;
rolf 0:e0b85a04e7e5 506 loop_greedy(r, pc, s, len, ofs);
rolf 0:e0b85a04e7e5 507 pc += r->code[pc + 1];
rolf 0:e0b85a04e7e5 508 break;
rolf 0:e0b85a04e7e5 509 case STARQ:
rolf 0:e0b85a04e7e5 510 res = 1;
rolf 0:e0b85a04e7e5 511 loop_non_greedy(r, pc, s, len, ofs);
rolf 0:e0b85a04e7e5 512 pc += r->code[pc + 1];
rolf 0:e0b85a04e7e5 513 break;
rolf 0:e0b85a04e7e5 514 case PLUS:
rolf 0:e0b85a04e7e5 515 if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0)
rolf 0:e0b85a04e7e5 516 break;
rolf 0:e0b85a04e7e5 517
rolf 0:e0b85a04e7e5 518 loop_greedy(r, pc, s, len, ofs);
rolf 0:e0b85a04e7e5 519 pc += r->code[pc + 1];
rolf 0:e0b85a04e7e5 520 break;
rolf 0:e0b85a04e7e5 521 case PLUSQ:
rolf 0:e0b85a04e7e5 522 if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0)
rolf 0:e0b85a04e7e5 523 break;
rolf 0:e0b85a04e7e5 524
rolf 0:e0b85a04e7e5 525 loop_non_greedy(r, pc, s, len, ofs);
rolf 0:e0b85a04e7e5 526 pc += r->code[pc + 1];
rolf 0:e0b85a04e7e5 527 break;
rolf 0:e0b85a04e7e5 528 case SPACE:
rolf 0:e0b85a04e7e5 529 res = 0;
rolf 0:e0b85a04e7e5 530 if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
rolf 0:e0b85a04e7e5 531 (*ofs)++;
rolf 0:e0b85a04e7e5 532 res = 1;
rolf 0:e0b85a04e7e5 533 }
rolf 0:e0b85a04e7e5 534 pc++;
rolf 0:e0b85a04e7e5 535 break;
rolf 0:e0b85a04e7e5 536 case NONSPACE:
rolf 0:e0b85a04e7e5 537 res = 0;
rolf 0:e0b85a04e7e5 538 if (*ofs <len && !isspace(((unsigned char *)s)[*ofs])) {
rolf 0:e0b85a04e7e5 539 (*ofs)++;
rolf 0:e0b85a04e7e5 540 res = 1;
rolf 0:e0b85a04e7e5 541 }
rolf 0:e0b85a04e7e5 542 pc++;
rolf 0:e0b85a04e7e5 543 break;
rolf 0:e0b85a04e7e5 544 case DIGIT:
rolf 0:e0b85a04e7e5 545 res = 0;
rolf 0:e0b85a04e7e5 546 if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
rolf 0:e0b85a04e7e5 547 (*ofs)++;
rolf 0:e0b85a04e7e5 548 res = 1;
rolf 0:e0b85a04e7e5 549 }
rolf 0:e0b85a04e7e5 550 pc++;
rolf 0:e0b85a04e7e5 551 break;
rolf 0:e0b85a04e7e5 552 case ANY:
rolf 0:e0b85a04e7e5 553 res = 0;
rolf 0:e0b85a04e7e5 554 if (*ofs < len) {
rolf 0:e0b85a04e7e5 555 (*ofs)++;
rolf 0:e0b85a04e7e5 556 res = 1;
rolf 0:e0b85a04e7e5 557 }
rolf 0:e0b85a04e7e5 558 pc++;
rolf 0:e0b85a04e7e5 559 break;
rolf 0:e0b85a04e7e5 560 case ANYOF:
rolf 0:e0b85a04e7e5 561 res = 0;
rolf 0:e0b85a04e7e5 562 if (*ofs < len)
rolf 0:e0b85a04e7e5 563 res = is_any_of(r->data + r->code[pc + 1],
rolf 0:e0b85a04e7e5 564 r->code[pc + 2], s, ofs);
rolf 0:e0b85a04e7e5 565 pc += 3;
rolf 0:e0b85a04e7e5 566 break;
rolf 0:e0b85a04e7e5 567 case ANYBUT:
rolf 0:e0b85a04e7e5 568 res = 0;
rolf 0:e0b85a04e7e5 569 if (*ofs < len)
rolf 0:e0b85a04e7e5 570 res = is_any_but(r->data + r->code[pc + 1],
rolf 0:e0b85a04e7e5 571 r->code[pc + 2], s, ofs);
rolf 0:e0b85a04e7e5 572 pc += 3;
rolf 0:e0b85a04e7e5 573 break;
rolf 0:e0b85a04e7e5 574 case BOL:
rolf 0:e0b85a04e7e5 575 res = *ofs == 0 ? 1 : 0;
rolf 0:e0b85a04e7e5 576 pc++;
rolf 0:e0b85a04e7e5 577 break;
rolf 0:e0b85a04e7e5 578 case EOL:
rolf 0:e0b85a04e7e5 579 res = *ofs == len ? 1 : 0;
rolf 0:e0b85a04e7e5 580 pc++;
rolf 0:e0b85a04e7e5 581 break;
rolf 0:e0b85a04e7e5 582 case OPEN:
rolf 0:e0b85a04e7e5 583 if (caps != NULL)
rolf 0:e0b85a04e7e5 584 caps[r->code[pc + 1]].ptr = s + *ofs;
rolf 0:e0b85a04e7e5 585 pc += 2;
rolf 0:e0b85a04e7e5 586 break;
rolf 0:e0b85a04e7e5 587 case CLOSE:
rolf 0:e0b85a04e7e5 588 if (caps != NULL)
rolf 0:e0b85a04e7e5 589 caps[r->code[pc + 1]].len = (s + *ofs) -
rolf 0:e0b85a04e7e5 590 caps[r->code[pc + 1]].ptr;
rolf 0:e0b85a04e7e5 591 pc += 2;
rolf 0:e0b85a04e7e5 592 break;
rolf 0:e0b85a04e7e5 593 case END:
rolf 0:e0b85a04e7e5 594 pc++;
rolf 0:e0b85a04e7e5 595 break;
rolf 0:e0b85a04e7e5 596 default:
rolf 0:e0b85a04e7e5 597 printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
rolf 0:e0b85a04e7e5 598 assert(0);
rolf 0:e0b85a04e7e5 599 break;
rolf 0:e0b85a04e7e5 600 }
rolf 0:e0b85a04e7e5 601 }
rolf 0:e0b85a04e7e5 602
rolf 0:e0b85a04e7e5 603 return (res);
rolf 0:e0b85a04e7e5 604 }
rolf 0:e0b85a04e7e5 605
rolf 0:e0b85a04e7e5 606 int
rolf 0:e0b85a04e7e5 607 slre_match(const struct slre *r, const char *buf, int len,
rolf 0:e0b85a04e7e5 608 struct cap *caps)
rolf 0:e0b85a04e7e5 609 {
rolf 0:e0b85a04e7e5 610 int i, ofs = 0, res = 0;
rolf 0:e0b85a04e7e5 611
rolf 0:e0b85a04e7e5 612 if (r->anchored) {
rolf 0:e0b85a04e7e5 613 res = match(r, 0, buf, len, &ofs, caps);
rolf 0:e0b85a04e7e5 614 } else {
rolf 0:e0b85a04e7e5 615 for (i = 0; i < len && res == 0; i++) {
rolf 0:e0b85a04e7e5 616 ofs = i;
rolf 0:e0b85a04e7e5 617 res = match(r, 0, buf, len, &ofs, caps);
rolf 0:e0b85a04e7e5 618 }
rolf 0:e0b85a04e7e5 619 }
rolf 0:e0b85a04e7e5 620
rolf 0:e0b85a04e7e5 621 return (res);
rolf 0:e0b85a04e7e5 622 }
rolf 0:e0b85a04e7e5 623