diff options
Diffstat (limited to 'usr/src/lib/libast/common/regex/regcomp.c')
-rw-r--r-- | usr/src/lib/libast/common/regex/regcomp.c | 3519 |
1 files changed, 0 insertions, 3519 deletions
diff --git a/usr/src/lib/libast/common/regex/regcomp.c b/usr/src/lib/libast/common/regex/regcomp.c deleted file mode 100644 index 4573cc217b..0000000000 --- a/usr/src/lib/libast/common/regex/regcomp.c +++ /dev/null @@ -1,3519 +0,0 @@ -/*********************************************************************** -* * -* This software is part of the ast package * -* Copyright (c) 1985-2010 AT&T Intellectual Property * -* and is licensed under the * -* Common Public License, Version 1.0 * -* by AT&T Intellectual Property * -* * -* A copy of the License is available at * -* http://www.opensource.org/licenses/cpl1.0.txt * -* (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * -* * -* Information and Software Systems Research * -* AT&T Research * -* Florham Park NJ * -* * -* Glenn Fowler <gsf@research.att.com> * -* David Korn <dgk@research.att.com> * -* Phong Vo <kpv@research.att.com> * -* * -***********************************************************************/ -#pragma prototyped - -/* - * posix regex compiler - */ - -#include "reglib.h" - -#if _PACKAGE_ast -#include "lclib.h" -#endif - -#define serialize re_serialize /* hp.ia64 <unistd.h>! */ - -#define C_ESC (-1) -#define C_MB (-2) - -#if _AST_REGEX_DEBUG - -#define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) -#define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) -#define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0) - -static unsigned long debug; -static unsigned long debug_flag; - -#else - -#define DEBUG_INIT() -#define DEBUG_TEST(f,y,n) (n) -#define DEBUG_CODE(f,y,n) do {n} while(0) - -#endif - -#if _PACKAGE_ast - -typedef struct Cchr_s -{ - Dtlink_t lnk; - unsigned char nam[2]; - Ckey_t key; -} Cchr_t; - -#endif - -#define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0) - -/* - * determine whether greedy matching will work, i.e. produce - * the best match first. such expressions are "easy", and - * need no backtracking once a complete match is found. - * if an expression has backreferences or alts it's hard - * else if it has only one closure it's easy - * else if all closures are simple (i.e. one-character) it's easy - * else it's hard. - */ - -typedef struct Stats_s -{ - unsigned long l; /* min length to left of x */ - unsigned long k; /* min length to left of y */ - unsigned long m; /* min length */ - unsigned long n; /* max length */ - unsigned short a; /* number of alternations */ - unsigned short b; /* number of backrefs */ - unsigned short c; /* number of closures */ - unsigned short e; /* $ */ - unsigned short i; /* number of negations */ - unsigned short p; /* number of named subexpressions */ - unsigned short s; /* number of simple closures */ - unsigned short t; /* number of tries */ - unsigned short u; /* number of unnamed subexpressions */ - Rex_t* x; /* max length REX_STRING */ - Rex_t* y; /* max length REX_TRIE */ -} Stats_t; - -typedef struct Token_s -{ - unsigned long min; - unsigned long max; - short lex; - short len; - short esc; - short att; - short push; -} Token_t; - -typedef struct Cenv_s -{ - int delimiter; /* pattern delimiter */ - int error; /* last error */ - int explicit; /* explicit match on this char */ - int mappeddot; /* inverse mapped '.' */ - int mappednewline; /* inverse mapped '\n' */ - int mappedslash; /* inverse mapped '/' */ - regflags_t flags; /* flags arg to regcomp */ - int type; /* BRE,ERE,ARE,SRE,KRE */ - unsigned char* cursor; /* curent point in re */ - unsigned char* pattern; /* the original pattern */ - unsigned char* literal; /* literal restart pattern */ - int parno; /* number of last open paren */ - int parnest; /* paren nest count */ - int posixkludge; /* to make * nonspecial */ - int regexp; /* <regexp.h> compatibility */ - Token_t token; /* token lookahead */ - Stats_t stats; /* RE statistics */ - int terminator; /* pattern terminator */ - Rex_t* paren[2*(BACK_REF_MAX+2)]; - /* paren[i]!=0 if \i defined */ - regex_t* regex; /* user handle */ - regdisc_t* disc; /* user discipline */ - unsigned char* map; /* external to native ccode map */ - unsigned char* MAP; /* fold and/or map */ -} Cenv_t; - -/* - * allocate a new Rex_t node - */ - -static Rex_t* -node(Cenv_t* env, int type, int lo, int hi, size_t extra) -{ - register Rex_t* e; - - if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra)) - { - memset(e, 0, sizeof(Rex_t) + extra); - e->type = type; - e->marked = 0; - e->lo = lo; - e->hi = hi; - e->flags = env->flags; - e->map = (e->flags & REG_ICASE) ? env->MAP : env->map; - e->explicit = env->explicit; - if (extra) - e->re.data = (char*)e + sizeof(Rex_t); - } - return e; -} - -/* - * free a Trie_node_t node - */ - -static void -triedrop(regdisc_t* disc, Trie_node_t* e) -{ - if (e) - { - triedrop(disc, e->son); - triedrop(disc, e->sib); - alloc(disc, e, 0); - } -} - -/* - * free a Rex_t node - */ - -void -drop(regdisc_t* disc, Rex_t* e) -{ - int i; - Rex_t* x; - - if (e && !(disc->re_flags & REG_NOFREE)) - do - { - switch (e->type) - { - case REX_ALT: - case REX_CONJ: - drop(disc, e->re.group.expr.binary.left); - drop(disc, e->re.group.expr.binary.right); - break; - case REX_GROUP: - case REX_GROUP_AHEAD: - case REX_GROUP_AHEAD_NOT: - case REX_GROUP_BEHIND: - case REX_GROUP_BEHIND_NOT: - case REX_GROUP_CUT: - case REX_NEG: - case REX_REP: - drop(disc, e->re.group.expr.rex); - break; - case REX_TRIE: - for (i = 0; i <= UCHAR_MAX; i++) - triedrop(disc, e->re.trie.root[i]); - break; - } - x = e->next; - alloc(disc, e, 0); - } while (e = x); -} - -/* - * mark e and descendants minimal - */ - -static void -mark(register Rex_t* e, int set) -{ - if (e && !e->marked) - do - { - e->marked = 1; - if (set) - e->flags |= REG_MINIMAL; - else - e->flags &= ~REG_MINIMAL; - switch (e->type) - { - case REX_ALT: - case REX_CONJ: - case REX_GROUP_COND: - if (e->re.group.expr.binary.left) - mark(e->re.group.expr.binary.left, set); - if (e->re.group.expr.binary.right) - mark(e->re.group.expr.binary.right, set); - break; - case REX_GROUP: - case REX_GROUP_AHEAD: - case REX_GROUP_AHEAD_NOT: - case REX_GROUP_BEHIND: - case REX_GROUP_BEHIND_NOT: - case REX_GROUP_CUT: - case REX_NEG: - case REX_REP: - case REX_TRIE: - mark(e->re.group.expr.rex, set); - break; - } - } while (e = e->next); -} - -/* - * assign subexpression numbers by a preorder tree walk - */ - -static int -serialize(Cenv_t* env, Rex_t* e, int n) -{ - do - { - e->serial = n++; - switch (e->type) - { - case REX_ALT: - case REX_GROUP_COND: - if (e->re.group.expr.binary.left) - n = serialize(env, e->re.group.expr.binary.left, n); - e->re.group.expr.binary.serial = n++; - if (e->re.group.expr.binary.right) - n = serialize(env, e->re.group.expr.binary.right, n); - break; - case REX_CONJ: - n = serialize(env, e->re.group.expr.binary.left, n); - n = serialize(env, e->re.group.expr.binary.right, n); - break; - case REX_GROUP: - case REX_GROUP_AHEAD: - case REX_GROUP_AHEAD_NOT: - case REX_GROUP_BEHIND: - case REX_GROUP_BEHIND_NOT: - case REX_GROUP_CUT: - case REX_NEG: - case REX_REP: - n = serialize(env, e->re.group.expr.rex, n); - break; - } - } while (e = e->next); - return n; -} - -/* - * catenate e and f into a sequence, collapsing them if possible - */ - -static Rex_t* -cat(Cenv_t* env, Rex_t* e, Rex_t* f) -{ - Rex_t* g; - - if (!f) - { - drop(env->disc, e); - return 0; - } - if (e->type == REX_NULL) - { - drop(env->disc, e); - return f; - } - if (f->type == REX_NULL) - { - g = f->next; - f->next = 0; - drop(env->disc, f); - f = g; - } - else if (e->type == REX_DOT && f->type == REX_DOT) - { - unsigned int m = e->lo + f->lo; - unsigned int n = e->hi + f->hi; - - if (m <= RE_DUP_MAX) - { - if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX) - { - n = RE_DUP_INF; - goto combine; - } - else if (n <= RE_DUP_MAX) - { - combine: - e->lo = m; - e->hi = n; - g = f->next; - f->next = 0; - drop(env->disc, f); - f = g; - } - } - } - e->next = f; - return e; -} - -/* - * collect re statistics - */ - -static int -stats(register Cenv_t* env, register Rex_t* e) -{ - register unsigned long n; - register unsigned long m; - unsigned long cm; - unsigned long nm; - unsigned long cn; - unsigned long nn; - unsigned long l; - unsigned long k; - unsigned long t; - Rex_t* q; - Rex_t* x; - Rex_t* y; - unsigned char c; - unsigned char b; - - do - { - switch (e->type) - { - case REX_ALT: - x = env->stats.x; - l = env->stats.l; - y = env->stats.y; - k = env->stats.k; - t = env->stats.t; - if (++env->stats.a <= 0) - return 1; - cm = env->stats.m; - env->stats.m = 0; - cn = env->stats.n; - env->stats.n = 0; - if (stats(env, e->re.group.expr.binary.left)) - return 1; - m = env->stats.m; - env->stats.m = 0; - n = env->stats.n; - env->stats.n = 0; - if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right)) - return 1; - if (env->stats.m > m) - env->stats.m = m; - else - m = env->stats.m; - if ((env->stats.m += cm) < m) - return 1; - if (env->stats.n < n) - env->stats.n = n; - else - n = env->stats.n; - if ((env->stats.n += cn) < n) - return 1; - env->stats.x = x; - env->stats.l = l; - env->stats.y = y; - env->stats.k = k; - env->stats.t = t; - break; - case REX_BACK: - if (++env->stats.b <= 0) - return 1; - break; - case REX_CLASS: - case REX_COLL_CLASS: - case REX_DOT: - case REX_ONECHAR: - n = env->stats.m; - if ((env->stats.m += e->lo) < n) - return 1; - if (e->hi != RE_DUP_INF) - { - n = env->stats.n; - if ((env->stats.n += e->hi) < n) - return 1; - } - if (e->lo != e->hi) - { - if (++env->stats.c <= 0) - return 1; - if (++env->stats.s <= 0) - return 1; - } - break; - case REX_CONJ: - cm = env->stats.m; - env->stats.m = 0; - cn = env->stats.n; - env->stats.n = 0; - if (stats(env, e->re.group.expr.binary.left)) - return 1; - nm = env->stats.m; - env->stats.m = 0; - nn = env->stats.n; - env->stats.n = 0; - if (stats(env, e->re.group.expr.binary.right)) - return 1; - if (env->stats.m < nm) - env->stats.m = nm; - else - nm = env->stats.m; - if ((env->stats.m += cm) < nm) - return 1; - if (env->stats.n < nn) - env->stats.n = nn; - else - nn = env->stats.n; - if ((env->stats.n += cn) < nn) - return 1; - break; - case REX_END: - env->stats.e = 1; - break; - case REX_GROUP: - if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0) - return 1; - if (stats(env, e->re.group.expr.rex)) - return 1; - break; - case REX_GROUP_AHEAD: - case REX_GROUP_AHEAD_NOT: - case REX_GROUP_BEHIND: - case REX_GROUP_BEHIND_NOT: - m = env->stats.m; - n = env->stats.n; - x = env->stats.x; - y = env->stats.y; - if (stats(env, e->re.group.expr.rex)) - return 1; - env->stats.m = m; - env->stats.n = n; - env->stats.x = x; - env->stats.y = y; - switch (e->type) - { - case REX_GROUP_AHEAD: - case REX_GROUP_BEHIND: - if (++env->stats.u <= 0) - return 1; - break; - } - break; - case REX_GROUP_COND: - if (++env->stats.u <= 0) - return 1; - m = env->stats.m; - n = env->stats.n; - x = env->stats.x; - y = env->stats.y; - if (e->re.group.size > 0 && ++env->stats.b <= 0) - return 1; - if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left)) - return 1; - if (q = e->re.group.expr.binary.right) - { - if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left)) - return 1; - if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right)) - return 1; - } - env->stats.m = m; - env->stats.n = n; - env->stats.x = x; - env->stats.y = y; - break; - case REX_GROUP_CUT: - if (++env->stats.u <= 0) - return 1; - m = env->stats.m; - n = env->stats.n; - x = env->stats.x; - y = env->stats.y; - if (stats(env, e->re.group.expr.rex)) - return 1; - env->stats.m = m; - env->stats.n = n; - env->stats.x = x; - env->stats.y = y; - break; - case REX_NEG: - env->stats.i++; - x = env->stats.x; - l = env->stats.l; - y = env->stats.y; - k = env->stats.k; - t = env->stats.t; - cm = env->stats.m; - env->stats.m = 0; - if (stats(env, e->re.group.expr.rex)) - return 1; - env->stats.m = !env->stats.m; - if ((env->stats.m += cm) < cm) - return 1; - env->stats.x = x; - env->stats.l = l; - env->stats.y = y; - env->stats.k = k; - env->stats.t = t; - break; - case REX_REP: - x = env->stats.x; - l = env->stats.l; - y = env->stats.y; - k = env->stats.k; - t = env->stats.t; - if (++env->stats.c <= 0) - return 1; - b = env->stats.b; - c = env->stats.c; - cm = env->stats.m; - env->stats.m = 0; - if (stats(env, e->re.group.expr.rex)) - return 1; - if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0) - return 1; - if (e->lo < 1) - { - env->stats.x = x; - env->stats.l = l; - env->stats.y = y; - env->stats.k = k; - env->stats.t = t; - env->stats.m = cm; - } - else - { - m = env->stats.m; - if ((env->stats.m *= e->lo) > 0 && env->stats.m < m) - return 1; - m = env->stats.m; - if ((env->stats.m += cm) < m) - return 1; - if (env->stats.x != x) - env->stats.l = cm; - if (env->stats.y != y) - env->stats.k = cm; - } - break; - case REX_STRING: - if (!e->map) - { - cm = env->stats.m; - if ((env->stats.m += e->re.string.size) < cm) - return 1; - cn = env->stats.n; - if ((env->stats.n += e->re.string.size) < cn) - return 1; - if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size) - { - env->stats.x = e; - env->stats.l = cm; - } - } - break; - case REX_TRIE: - if (++env->stats.s <= 0) - return 1; - cm = env->stats.m; - if ((env->stats.m += e->re.trie.min) < cm) - return 1; - cn = env->stats.n; - if ((env->stats.n += e->re.trie.max) < cn) - return 1; - env->stats.t++; - if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min) - { - env->stats.y = e; - env->stats.k = cm; - } - break; - } - } while (e = e->next); - return 0; -} - -static int token(Cenv_t*); - -static int -magic(register Cenv_t* env, register int c, int escaped) -{ - register char* sp; - register int n; - int o = c; - int e = env->error; - int l = env->token.len; - short* mp; - char* ep; - - if (mp = state.magic[c]) - { - c = mp[env->type+escaped]; - if (c >= T_META) - { - sp = (char*)env->cursor + env->token.len; - switch (c) - { - case T_LEFT: - n = 0; - ep = sp; - while (*sp >= '0' && *sp <= '9') - { - if (n > (INT_MAX / 10)) - { - env->error = REG_BADBR; - goto bad; - } - n = n * 10 + *sp++ - '0'; - } - if (sp == ep) - { - if (env->type < SRE || *sp != ',') - { - env->error = *sp ? REG_BADBR : REG_EBRACE; - goto bad; - } - } - else if (n > RE_DUP_MAX) - { - env->error = REG_BADBR; - goto bad; - } - env->token.min = n; - if (*sp == ',') - { - n = 0; - ep = ++sp; - while (*sp >= '0' && *sp <= '9') - { - if (n > (INT_MAX / 10)) - { - env->error = REG_BADBR; - goto bad; - } - n = n * 10 + *sp++ - '0'; - } - if (sp == ep) - n = RE_DUP_INF; - else if (n < env->token.min) - { - env->error = REG_BADBR; - goto bad; - } - } - env->token.max = n; - switch (*sp) - { - case 0: - env->error = REG_EBRACE; - goto bad; - case '\\': - if (!escaped) - { - env->error = REG_BADBR; - goto bad; - } - sp++; - break; - default: - if (escaped) - { - env->error = REG_BADBR; - goto bad; - } - break; - } - switch (*sp++) - { - case 0: - env->error = REG_EBRACE; - goto bad; - case '}': - break; - default: - env->error = REG_BADBR; - goto bad; - } - env->token.len = sp - (char*)env->cursor; - break; - case T_RIGHT: - env->error = REG_EBRACE; - goto bad; - case T_OPEN: - if (env->type < SRE && *sp == '?') - { - env->token.len++; - env->token.lex = 0; - goto group; - } - break; - case T_ESCAPE: - c = chresc(sp - 2, &ep); - if (ep < sp) - goto bad; - env->token.len += ep - sp; - if (c >= T_META) - { - env->token.lex = c; - c = C_ESC; - } - return c; - case T_BACK+0: - case T_BACK+1: - case T_BACK+2: - case T_BACK+3: - case T_BACK+4: - case T_BACK+5: - case T_BACK+6: - case T_BACK+7: - n = chresc(sp - 2, &ep); - if (ep > sp + 1) - { - env->token.len += ep - sp; - return n; - } - /*FALLTHROUGH*/ - case T_BACK+8: - case T_BACK+9: - if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT)) - { - env->error = REG_BADESC; - goto bad; - } - if ((env->flags & REG_MULTIREF) && isdigit(*sp)) - { - c = (c - T_BACK) * 10 + (*sp - '0'); - if (c > 0 && c <= env->parno && env->paren[c]) - c += T_BACK; - else - c = chresc(sp - 2, &ep); - env->token.len++; - } - if (c == T_BACK) - c = 0; - break; - case T_BAD: - if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META) - return c; - goto bad; - } - if (env->type >= SRE) - { - if (c == T_DOT) - c = '.'; - else if (c < T_OPEN) - { - if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(') - { - env->token.len++; - env->token.att = 1; - } - if (env->type == KRE && *(env->cursor + env->token.len) == '(') - { - env->token.len++; - switch (c) - { - case T_AT: - break; - case T_PERCENT: - env->token.lex = c; - goto group; - case T_TILDE: - env->token.lex = 0; - goto group; - default: - env->token.lex = c; - break; - } - c = T_OPEN; - } - else if (c == T_STAR) - c = T_DOTSTAR; - else if (c == T_QUES) - c = T_DOT; - else - { - c = o; - env->token.len = l; - } - } - else if (c > T_BACK) - { - c = (c - T_BACK) * 2 - 1; - c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c; - } - else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP)) - { - if (c == T_AND) - c = '&'; - else if (c == T_BAR) - c = '|'; - else if (c == T_OPEN) - c = '('; - } - } - } - } - else if (escaped == 2) - { - if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter)) - /*ok*/; - else - { - env->error = REG_BADESC; - goto bad; - } - } - else if (escaped && !(env->flags & REG_LENIENT) && c != ']') - { - env->error = REG_BADESC; - goto bad; - } - return c; - group: - sp = (char*)env->cursor + env->token.len; - switch (*sp++) - { - case ')': - break; - case '#': - for (;;) - { - switch (*sp++) - { - case 0: - env->error = REG_EPAREN; - return T_BAD; - case ')': - break; - default: - continue; - } - break; - } - break; - default: - return T_GROUP; - } - env->cursor = (unsigned char*)sp; - return token(env); - bad: - if (escaped == 2) - env->error = e; - else if (env->flags & REG_LENIENT) - return o; - else if (escaped == 1 && !env->error) - { - if (mp || o == ']') - return o; - env->error = REG_BADESC; - } - return T_BAD; -} - -static int -token(register Cenv_t* env) -{ - int c; - int posixkludge; - - if (env->token.push) - return env->token.lex; - env->token.att = env->token.esc = 0; - if ((env->token.len = MBSIZE(env->cursor)) > 1) - return env->token.lex = C_MB; - env->token.lex = 0; - for (;;) - { - c = *env->cursor; - if (c == 0 || c == env->delimiter || c == env->terminator) - return T_END; - if (!(env->flags & REG_COMMENT)) - break; - if (c == '#') - { - do - { - c = *++env->cursor; - if (c == 0 || c == env->delimiter) - return T_END; - } while (c != '\n'); - } - else if (!isspace(c)) - break; - env->cursor++; - } - if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter) - { - if (env->parnest) - { - env->error = REG_EPAREN; - return T_BAD; - } - env->parno = 0; - env->pattern = env->cursor + 1; - return T_BAR; - } - if (env->flags & REG_LITERAL) - return c; - if (posixkludge = env->posixkludge) - { - env->posixkludge = 0; - if (c == '*') - return c; - } - if (c == '\\') - { - if (env->flags & REG_SHELL_ESCAPED) - return c; - if (!(c = *(env->cursor + 1)) || c == env->terminator) - { - if (env->flags & REG_LENIENT) - { - if (c) - { - env->token.esc = env->token.len; - env->token.len += MBSIZE(env->cursor + 1); - return c; - } - return '\\'; - } - env->error = REG_EESCAPE; - return T_BAD; - } - env->token.esc = env->token.len; - env->token.len += MBSIZE(env->cursor + 1); - if (env->delimiter && c == 'n') - return '\n'; - else if (c == env->delimiter) - return magic(env, c, 0); - else if (c == '(' && env->type == BRE) - env->posixkludge = 1; - else if (c == ')' && env->type == BRE && env->parnest <= 0) - { - env->error = REG_EPAREN; - return T_BAD; - } - else if (isspace(c) && (env->flags & REG_COMMENT)) - return c; - return magic(env, c, 1); - } - else if (c == '$') - { - if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n') - return T_DOLL; - } - else if (c == '^') - { - if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1)) - { - env->posixkludge = 2; - return T_CFLX; - } - } - else if (c == ')') - { - if (env->type != BRE && env->parnest <= 0) - return c; - } - else if (c == '/' && env->explicit == env->mappedslash) - { - while (*(env->cursor + env->token.len) == c) - env->token.len++; - return T_SLASHPLUS; - } - return magic(env, c, 0); -} - -static Celt_t* -col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec) -{ - register char* s; - register unsigned char* k; - register unsigned char* e; - register int c; - register int cc; - int bt; - int et; - Ckey_t key; - - cc = 0; - for (;;) - { - k = key; - if (bw == 1) - { - c = bc; - if (ic) - { - if (isupper(c)) - { - c = tolower(c); - cc = -1; - } - else if (islower(c)) - { - c = toupper(c); - cc = -1; - } - } - *k++ = c; - } - else if (bw < COLL_KEY_MAX) - { - s = (char*)bp; - if (ic) - { - c = mbchar(s); - if (iswupper(c)) - { - c = towlower(c); - cc = 1; - } - else if (iswlower(c)) - { - c = towupper(c); - cc = 1; - } - } - if (cc > 0) - { - cc = -1; - k += mbconv((char*)k, c); - } - else - for (e = k + bw; k < e; *k++ = *s++); - } - *k = 0; - mbxfrm(ce->beg, key, COLL_KEY_MAX); - if (ep) - { - k = key; - c = mbchar(k); - if (iswupper(c)) - bt = COLL_range_uc; - else if (iswlower(c)) - bt = COLL_range_lc; - else - bt = COLL_range; - k = key; - if (ew == 1) - { - c = ec; - if (ic) - { - if (isupper(c)) - { - c = tolower(c); - cc = -1; - } - else if (islower(c)) - { - c = toupper(c); - cc = -1; - } - } - *k++ = c; - } - else if (ew < COLL_KEY_MAX) - { - s = (char*)ep; - if (ic) - { - c = mbchar(s); - if (iswupper(c)) - { - c = towlower(c); - cc = 1; - } - else if (iswlower(c)) - { - c = towupper(c); - cc = 1; - } - } - if (cc > 0) - { - cc = -1; - k += mbconv((char*)k, c); - } - else - for (e = k + ew; k < e; *k++ = *s++); - } - *k = 0; - mbxfrm(ce->end, key, COLL_KEY_MAX); - k = key; - c = mbchar(k); - if (iswupper(c)) - et = COLL_range_uc; - else if (iswlower(c)) - et = COLL_range_lc; - else - et = COLL_range; - ce->typ = bt == et ? bt : COLL_range; - } - else - ce->typ = COLL_char; - ce++; - if (!ic || !cc) - break; - ic = 0; - } - return ce; -} - -static Rex_t* -bra(Cenv_t* env) -{ - Rex_t* e; - int c; - int i; - int w; - int neg; - int last; - int inrange; - int complicated; - int collate; - int elements; - unsigned char* first; - unsigned char* start; - unsigned char* begin; - unsigned char* s; - regclass_t f; - unsigned char buf[4 * (COLL_KEY_MAX + 1)]; -#if _PACKAGE_ast - int ic; - char mbc[COLL_KEY_MAX + 1]; -#endif - - if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) - return 0; - collate = complicated = elements = 0; - if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!') - { - env->cursor++; - neg = 1; - } - else - neg = 0; - first = env->cursor; - start = first + MBSIZE(first); - if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter)) - goto error; - begin = env->cursor + MBSIZE(env->cursor); - - /* - * inrange: 0=no, 1=possibly, 2=definitely - */ - - inrange = 0; - for (;;) - { - if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE)) - goto error; - env->cursor += (w = MBSIZE(env->cursor)); - if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))) - { - if (*env->cursor) - { - if (*env->cursor == 'n') - { - env->cursor++; - c = '\n'; - } - else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) - { - env->token.len = 1; - w = magic(env, *env->cursor, 2); - if (env->token.len > 1 || w != T_BAD) - { - if (env->token.len == 1 && (f = classfun(w))) - { - if (inrange > 1) - { - if (env->type < SRE && !(env->flags & REG_LENIENT)) - goto erange; - inrange = 0; - } - env->cursor++; - for (c = 0; c <= UCHAR_MAX; c++) - if ((*f)(c)) - setadd(e->re.charclass, c); - complicated++; - elements++; - continue; - } - if (env->token.len > 1 || w >= 0 && w < T_META) - { - c = w; - if (c > UCHAR_MAX) - { - if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide()) - goto erange; - c = UCHAR_MAX; - } - env->cursor += env->token.len; - } - } - } - } - } - else if (c == ']') - { - if (env->cursor == begin) - { - last = c; - inrange = 1; - continue; - } - if (inrange != 0) - { - setadd(e->re.charclass, last); - elements++; - if (inrange == 2) - { - setadd(e->re.charclass, '-'); - elements++; - } - } - break; - } - else if (c == '-') - { - if (!inrange && env->cursor != begin && *env->cursor != ']') - { - if (env->type < SRE && !(env->flags & REG_LENIENT)) - goto erange; - continue; - } - else if (inrange == 1) - { - inrange = 2; - complicated++; - continue; - } - } - else if (c == '[') - { - switch (*env->cursor) - { - case 0: - goto error; - case ':': - if (env->regexp) - goto normal; - if (inrange == 1) - { - setadd(e->re.charclass, last); - elements++; - } - if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) - { - if (env->cursor == start && (c = *(env->cursor + 1))) - { - s = start = env->cursor + 1; - while (*++s && *s != ':'); - if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']') - { - if ((i = (s - start)) == 1) - { - switch (c) - { - case '<': - i = REX_WBEG; - break; - case '>': - i = REX_WEND; - break; - default: - i = 0; - break; - } - if (i) - { - env->cursor = s + 3; - drop(env->disc, e); - return node(env, i, 0, 0, 0); - } - } - } - } - env->error = REG_ECTYPE; - goto error; - } - for (c = 0; c <= UCHAR_MAX; c++) - if ((*f)(c)) - setadd(e->re.charclass, c); - inrange = 0; - complicated++; - elements++; - continue; - case '=': - if (env->regexp) - goto normal; - if (inrange == 2) - goto erange; - if (inrange == 1) - { - setadd(e->re.charclass, last); - elements++; - } - if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) - goto ecollate; - if (c > 1) - collate++; - else - setadd(e->re.charclass, buf[0]); - c = buf[0]; - inrange = 0; - complicated++; - elements++; - continue; - case '.': - if (env->regexp) - goto normal; - if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) - goto ecollate; - if (c > 1) - collate++; - c = buf[0]; - complicated++; - break; - default: - normal: - if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) - goto error; - break; - } - } - else if (w > 1) - complicated++; - if (inrange == 2) - { - if (last <= c) - { - for (i = last; i <= c; i++) - setadd(e->re.charclass, i); - inrange = env->type >= SRE || (env->flags & REG_LENIENT); - elements += 2; - } - else if (env->type >= SRE) - { - setadd(e->re.charclass, last); - setadd(e->re.charclass, c); - elements += 2; - inrange = 1; - } - else if (!(env->flags & REG_LENIENT)) - goto erange; - else - inrange = 0; - } - else if (inrange == 1) - { - setadd(e->re.charclass, last); - elements++; - } - else - inrange = 1; - last = c; - } -#if _PACKAGE_ast - if (complicated && mbcoll()) - { - Dt_t* dt; - Cchr_t* cc; - Cchr_t* tc; - Cchr_t* xc; - Celt_t* ce; - Cchr_t key; - int rw; - int rc; - int wc; - unsigned char* rp; - unsigned char* pp; - char* wp; - char cb[2][COLL_KEY_MAX+1]; - - static Dtdisc_t disc; - - static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - - if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data)) - { - disc.key = offsetof(Cchr_t, key); - if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree))) - { - for (i = 0; i < elementsof(primary) - 1; i++, cc++) - { - cc->nam[0] = primary[i]; - mbxfrm(cc->key, cc->nam, COLL_KEY_MAX); - dtinsert(dt, cc); - } - for (i = 0; i < elementsof(cc->key); i++) - cc->key[i] = ~0; - dtinsert(dt, cc); - LCINFO(AST_LC_COLLATE)->data = (void*)dt; - } - else - { - if (cc) - free(cc); - drop(env->disc, e); - return 0; - } - } - if (dt) - { - drop(env->disc, e); - if (ic = env->flags & REG_ICASE) - elements *= 2; - if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t)))) - return 0; - ce = (Celt_t*)e->re.data; - e->re.collate.invert = neg; - e->re.collate.elements = ce; - env->cursor = first; - inrange = 0; - for (;;) - { - if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) - goto error; - pp = env->cursor; - env->cursor += (w = MBSIZE(env->cursor)); - if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))) - { - if (*env->cursor) - { - if (*env->cursor == 'n') - { - pp = env->cursor++; - c = '\n'; - } - else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) - { - env->token.len = 1; - w = magic(env, *env->cursor, 2); - if (env->token.len > 1 || w != T_BAD) - { - if (env->token.len == 1 && (f = classfun(w))) - { - if (inrange > 1) - { - if (env->type < SRE && !(env->flags & REG_LENIENT)) - goto erange; - inrange = 0; - } - env->cursor++; - ce->fun = f; - ce->typ = COLL_call; - ce++; - continue; - } - if (env->token.len > 1 || w >= 0 && w < T_META) - { - c = w; - w = mbconv(mbc, c); - pp = (unsigned char*)mbc; - env->cursor += env->token.len; - } - } - } - } - } - else if (c == ']') - { - if (env->cursor == begin) - { - rp = pp; - rw = w; - inrange = 1; - continue; - } - if (inrange != 0) - { - ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); - if (inrange == 2) - ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0); - } - break; - } - else if (c == '-') - { - if (!inrange && env->cursor != begin && *env->cursor != ']') - { - if (env->type < SRE && !(env->flags & REG_LENIENT)) - goto erange; - continue; - } - else if (inrange == 1) - { - inrange = 2; - continue; - } - } - else if (c == '[') - { - switch (*env->cursor) - { - case 0: - goto error; - case ':': - if (env->regexp) - goto complicated_normal; - if (inrange == 1) - ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); - if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) - { - if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']') - { - switch (c) - { - case '<': - i = REX_WBEG; - break; - case '>': - i = REX_WEND; - break; - default: - i = 0; - break; - } - if (i) - { - env->cursor += 5; - drop(env->disc, e); - return node(env, i, 0, 0, 0); - } - } - env->error = REG_ECTYPE; - goto error; - } - ce->fun = f; - ce->typ = COLL_call; - ce++; - inrange = 0; - continue; - case '=': - if (env->regexp) - goto complicated_normal; - if (inrange == 2) - goto erange; - if (inrange == 1) - ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); - pp = (unsigned char*)cb[inrange]; - rp = env->cursor + 1; - if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) - goto ecollate; - wp = (char*)pp; - wc = mbchar(wp); - c = 0; - if (ic) - { - if (iswupper(wc)) - { - wc = towlower(wc); - rw = mbconv((char*)pp, wc); - c = 'u'; - } - else if (iswlower(wc)) - c = 'l'; - } - for (;;) - { - mbxfrm(key.key, (char*)pp, COLL_KEY_MAX); - if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key))) - goto ecollate; - xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc; - if (c == 'l' || c == 'L' && !(c = 0)) - ce->typ = COLL_range_lc; - else if (c == 'u' || c == 'U' && !(c = 0)) - ce->typ = COLL_range_uc; - else - ce->typ = COLL_range; - strcpy((char*)ce->beg, (char*)xc->key); - if (!(cc = (Cchr_t*)dtnext(dt, cc))) - goto ecollate; - if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc))) - cc = tc; - strcpy((char*)ce->end, (char*)cc->key); - ce->max = -1; - ce++; - if (!c) - break; - if (c == 'u') - { - wc = towlower(wc); - c = 'L'; - } - else - { - wc = towupper(wc); - c = 'U'; - } - rw = mbconv((char*)pp, wc); - } - inrange = 0; - c = *pp; - continue; - case '.': - if (env->regexp) - goto complicated_normal; - pp = (unsigned char*)cb[inrange]; - if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) - goto ecollate; - c = buf[0]; - break; - default: - complicated_normal: - if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) - goto error; - break; - } - } - if (inrange == 2) - { - ce = col(ce, ic, rp, rw, rc, pp, w, c); - if (strcmp((char*)ce->beg, (char*)ce->end) > 0) - { - if (env->type < SRE && !(env->flags & REG_LENIENT)) - goto erange; - (ce-1)->typ = COLL_char; - strcpy((char*)ce->beg, (char*)(ce-1)->end); - ce->typ = COLL_char; - ce++; - } - inrange = env->type >= SRE || (env->flags & REG_LENIENT); - } - else if (inrange == 1) - ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); - else - inrange = 1; - rp = pp; - rw = w; - rc = c; - } - ce->typ = COLL_end; - return e; - } - } -#endif - if (collate) - goto ecollate; - if (env->flags & REG_ICASE) - for (i = 0; i <= UCHAR_MAX; i++) - if (settst(e->re.charclass, i)) - { - if (isupper(i)) - c = tolower(i); - else if (islower(i)) - c = toupper(i); - else - continue; - setadd(e->re.charclass, c); - } - if (neg) - { - for (i = 0; i < elementsof(e->re.charclass->bits); i++) - e->re.charclass->bits[i] ^= ~0; - if (env->explicit >= 0) - setclr(e->re.charclass, env->explicit); - } - return e; - ecollate: - env->error = REG_ECOLLATE; - goto error; - erange: - env->error = REG_ERANGE; - error: - drop(env->disc, e); - if (!env->error) - env->error = REG_EBRACK; - return 0; -} - -static Rex_t* -ccl(Cenv_t* env, int type) -{ - int i; - Rex_t* e; - Celt_t* ce; - regclass_t f; - - if (!(f = classfun(type))) - { - env->error = REG_BADESC; - return 0; - } - if (!mbcoll()) - { - if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) - return 0; - for (i = 0; i <= UCHAR_MAX; i++) - if ((*f)(i)) - setadd(e->re.charclass, i); - if (env->explicit >= 0) - setclr(e->re.charclass, env->explicit); - } - else - { - if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t)))) - return 0; - ce = (Celt_t*)e->re.data; - e->re.collate.invert = 0; - e->re.collate.elements = ce; - ce->fun = f; - ce->typ = COLL_call; - ce++; - ce->typ = COLL_end; - } - return e; -} - -static Rex_t* -rep(Cenv_t* env, Rex_t* e, int number, int last) -{ - Rex_t* f; - unsigned long m = 0; - unsigned long n = RE_DUP_INF; - int minimal = -1; - - if (!e) - return 0; - switch (token(env)) - { - case T_BANG: - eat(env); - if (!(f = node(env, REX_NEG, m, n, 0))) - { - drop(env->disc, e); - return 0; - } - f->re.group.expr.rex = e; - return f; - case T_QUES: - eat(env); - n = 1; - break; - case T_STAR: - eat(env); - break; - case T_PLUS: - eat(env); - m = 1; - break; - case T_LEFT: - eat(env); - m = env->token.min; - n = env->token.max; - break; - default: - return e; - } - if (env->token.att) - minimal = 1; - else if (env->type < SRE) - switch (token(env)) - { - case T_QUES: - eat(env); - minimal = !(env->flags & REG_MINIMAL); - break; - case T_STAR: /*AST*/ - eat(env); - minimal = !!(env->flags & REG_MINIMAL); - break; - } - switch (e->type) - { - case REX_DOT: - case REX_CLASS: - case REX_COLL_CLASS: - case REX_ONECHAR: - e->lo = m; - e->hi = n; - if (minimal >= 0) - mark(e, minimal); - return e; -#if HUH_2002_08_07 - case REX_BEG: -#endif - case REX_BEG_STR: - case REX_END_STR: - case REX_FIN_STR: - case REX_WBEG: - case REX_WEND: - case REX_WORD: - case REX_WORD_NOT: - env->error = REG_BADRPT; - drop(env->disc, e); - return 0; - } - if (m == 1 && n == 1) - { - if (minimal >= 0) - mark(e, minimal); - return e; - } - if (!(f = node(env, REX_REP, m, n, 0))) - { - drop(env->disc, e); - return 0; - } - f->re.group.expr.rex = e; - f->re.group.number = number; - f->re.group.last = last; - if (minimal >= 0) - mark(f, minimal); - if (m <= n && n) - { - for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex); - if (e && e->type == REX_NEG) - f->type = REX_GROUP; - } - return f; -} - -static int -isstring(Rex_t* e) -{ - switch (e->type) - { - case REX_ONECHAR: - return e->lo == 1 && e->hi == 1; - case REX_STRING: - return 1; - } - return 0; -} - -static Trie_node_t* -trienode(Cenv_t* env, int c) -{ - Trie_node_t* t; - - if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t))) - { - memset(t, 0, sizeof(Trie_node_t)); - t->c = c; - } - return t; -} - -static int -insert(Cenv_t* env, Rex_t* f, Rex_t* g) -{ - unsigned char* s; - unsigned char* e; - Trie_node_t* t; - int len; - unsigned char tmp[2]; - - switch (f->type) - { - case REX_ONECHAR: - *(s = tmp) = f->re.onechar; - e = s + 1; - break; - case REX_STRING: - s = f->re.string.base; - e = s + f->re.string.size; - break; - default: - return 1; - } - if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s))) - return 1; - for (len = 1;;) - { - if (t->c == *s) - { - if (++s >= e) - break; - if (!t->son && !(t->son = trienode(env, *s))) - return 1; - t = t->son; - len++; - } - else - { - if (!t->sib && !(t->sib = trienode(env, *s))) - return 1; - t = t->sib; - } - } - if (g->re.trie.min > len) - g->re.trie.min = len; - if (g->re.trie.max < len) - g->re.trie.max = len; - t->end = 1; - return 0; -} - -/* - * trie() tries to combine nontrivial e and f into a REX_TRIE - * unless 0 is returned, e and f are deleted as far as possible - * also called by regcomb - */ - -static Rex_t* -trie(Cenv_t* env, Rex_t* e, Rex_t* f) -{ - Rex_t* g; - - if (e->next || f->next || !isstring(e) || e->flags != f->flags) - return 0; - if (isstring(f)) - { - if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*)))) - return 0; - g->re.trie.min = INT_MAX; - if (insert(env, f, g)) - goto nospace; - drop(env->disc, f); - } - else if (f->type != REX_TRIE) - return 0; - else - g = f; - if (insert(env, e, g)) - goto nospace; - drop(env->disc, e); - return g; - nospace: - if (g != f) - drop(env->disc, g); - return 0; -} - -static Rex_t* alt(Cenv_t*, int, int); - -static int -chr(register Cenv_t* env, int* escaped) -{ - unsigned char* p; - int c; - - *escaped = 0; - if (!(c = *env->cursor)) - return -1; - env->cursor++; - if (c == '\\') - { - if (env->flags & REG_SHELL_ESCAPED) - return c; - if (!(c = *(env->cursor + 1)) || c == env->terminator) - { - if (env->flags & REG_LENIENT) - return c ? c : '\\'; - env->error = REG_EESCAPE; - return -1; - } - p = env->cursor; - c = chresc((char*)env->cursor - 1, (char**)&env->cursor); - *escaped = env->cursor - p; - } - return c; -} - -/* - * open the perly gates - */ - -static Rex_t* -grp(Cenv_t* env, int parno) -{ - Rex_t* e; - Rex_t* f; - int c; - int i; - int n; - int x; - int esc; - int typ; - int beg; - unsigned char* p; - - beg = env->pattern == env->cursor - env->token.len; - if (!(c = env->token.lex) && (c = *env->cursor)) - env->cursor++; - env->token.len = 0; - env->parnest++; - typ = -1; - switch (c) - { - case '-': - case '+': - case 'a': - case 'g': - case 'i': - case 'l': - case 'm': - case 'p': - case 'r': - case 's': - case 'x': - case 'A': - case 'B': - case 'E': - case 'F': - case 'G': - case 'I': - case 'K': - case 'L': - case 'M': /* glob(3) */ - case 'N': /* glob(3) */ - case 'O': /* glob(3) */ - case 'P': - case 'R': /* pcre */ - case 'S': - case 'U': /* pcre */ - case 'X': /* pcre */ - x = REX_GROUP; - i = 1; - env->token.push = 1; - for (;;) - { - switch (c) - { - case ')': - if (!(env->flags & REG_LITERAL)) - { - env->error = REG_BADRPT; - return 0; - } - /*FALLTHROUGH*/ - case 0: - case T_CLOSE: - x = 0; - goto done; - case ':': - eat(env); - if (token(env) == T_CLOSE) - x = 0; - goto done; - case '-': - i = 0; - break; - case '+': - i = 1; - break; - case 'a': - if (i) - env->flags |= (REG_LEFT|REG_RIGHT); - else - env->flags &= ~(REG_LEFT|REG_RIGHT); - break; - case 'g': - if (i) - env->flags &= ~REG_MINIMAL; - else - env->flags |= REG_MINIMAL; - break; - case 'i': - if (i) - env->flags |= REG_ICASE; - else - env->flags &= ~REG_ICASE; - break; - case 'l': - if (i) - env->flags |= REG_LEFT; - else - env->flags &= ~REG_LEFT; - break; - case 'm': - if (i) - env->flags |= REG_NEWLINE; - else - env->flags &= ~REG_NEWLINE; - env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; - break; - case 'p': - if (i) - env->flags &= ~REG_LENIENT; - else - env->flags |= REG_LENIENT; - break; - case 'r': - if (i) - env->flags |= REG_RIGHT; - else - env->flags &= ~REG_RIGHT; - break; - case 's': - if (i) - env->flags |= REG_SPAN; - else - env->flags &= ~REG_SPAN; - env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; - break; - case 'x': - if (i) - env->flags |= REG_COMMENT; - else - env->flags &= ~REG_COMMENT; - break; - case 'A': - env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); - env->flags |= REG_AUGMENTED|REG_EXTENDED; - typ = ARE; - break; - case 'B': - case 'G': - env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); - typ = BRE; - break; - case 'E': - env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); - env->flags |= REG_EXTENDED; - typ = ERE; - break; - case 'F': - case 'L': - env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); - env->flags |= REG_LITERAL; - typ = ERE; - break; - case 'K': - env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); - env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT; - typ = KRE; - break; - case 'M': - /* used by caller to disable glob(3) GLOB_BRACE */ - break; - case 'N': - /* used by caller to disable glob(3) GLOB_NOCHECK */ - break; - case 'O': - /* used by caller to disable glob(3) GLOB_STARSTAR */ - break; - case 'P': - env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); - env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE; - typ = ERE; - break; - case 'S': - env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); - env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT; - typ = SRE; - break; - case 'U': /* PCRE_UNGREEDY */ - if (i) - env->flags |= REG_MINIMAL; - else - env->flags &= ~REG_MINIMAL; - break; - case 'X': /* PCRE_EXTRA */ - break; - default: - env->error = REG_BADRPT; - return 0; - } - eat(env); - c = token(env); - } - done: - break; - case ':': - x = REX_GROUP; - break; - case '=': - x = REX_GROUP_AHEAD; - break; - case '!': - x = REX_GROUP_AHEAD_NOT; - break; - case '<': - switch (token(env)) - { - case '=': - x = REX_GROUP_BEHIND; - break; - case '!': - case T_BANG: - x = REX_GROUP_BEHIND_NOT; - break; - default: - env->error = REG_BADRPT; - return 0; - } - eat(env); - break; - case '>': - x = REX_GROUP_CUT; - break; - case '%': - case T_PERCENT: - e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short)); - e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor; - n = 1; - for (;;) - { - switch (i = chr(env, &esc)) - { - case -1: - case 0: - invalid: - env->cursor -= esc + 1; - env->error = REG_EPAREN; - return 0; - case 'D': - x = REX_NEST_delimiter; - /*FALLTHROUGH*/ - delimiter: - if ((i = chr(env, &esc)) < 0) - goto invalid; - if (e->re.nest.type[i] & ~x) - goto invalid; - e->re.nest.type[i] = x; - continue; - case 'E': - x = REX_NEST_escape; - goto delimiter; - case 'L': - x = REX_NEST_literal; - goto quote; - case 'O': - switch (i = chr(env, &esc)) - { - case 'T': - e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator; - break; - default: - goto invalid; - } - continue; - case 'Q': - x = REX_NEST_quote; - /*FALLTHROUGH*/ - quote: - if ((i = chr(env, &esc)) < 0) - goto invalid; - if (e->re.nest.type[i] & ~x) - goto invalid; - e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT); - continue; - case 'S': - x = REX_NEST_separator; - goto delimiter; - case 'T': - x = REX_NEST_terminator; - goto delimiter; - case '|': - case '&': - if (!esc) - goto invalid; - goto nesting; - case '(': - if (!esc) - n++; - goto nesting; - case ')': - if (!esc && !--n) - break; - goto nesting; - default: - nesting: - if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) - goto invalid; - e->re.nest.type[i] = REX_NEST_open; - if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) - goto invalid; - if (!esc) - { - if (x == ')' && !--n) - goto invalid; - else if (x == '(') - n++; - } - e->re.nest.type[x] |= REX_NEST_close; - e->re.nest.type[i] |= x << REX_NEST_SHIFT; - continue; - } - break; - } - env->parnest--; - if (c == T_PERCENT) - for (n = 0; n < 2; n++) - { - parno = ++env->parno; - if (!(f = node(env, REX_GROUP, 0, 0, 0))) - { - drop(env->disc, e); - return 0; - } - if (parno < elementsof(env->paren)) - env->paren[parno] = f; - f->re.group.back = 0; - f->re.group.number = parno; - f->re.group.expr.rex = e; - e = f; - } - return e; - case '(': - c = 0; - if (isdigit(*env->cursor)) - { - f = 0; - do - { - if (c > (INT_MAX / 10)) - { - env->error = REG_BADRPT; - return 0; - } - c = c * 10 + (*env->cursor++ - '0'); - } while (isdigit(*env->cursor)); - if (*env->cursor++ != ')') - { - env->error = REG_BADRPT; - return 0; - } - if (c && env->type >= SRE) - c = c * 2 - 1; - if (!c || c > env->parno || !env->paren[c]) - { - if (!(env->flags & REG_LENIENT)) - { - env->error = REG_ESUBREG; - return 0; - } - if (c) - c = -1; - } - } - else - { - if (env->type < SRE && *env->cursor++ != '?') - { - env->error = REG_BADRPT; - return 0; - } - if (!(f = grp(env, parno + 1)) && env->error) - return 0; - } - if (!(e = node(env, REX_GROUP_COND, 0, 0, 0))) - { - drop(env->disc, f); - return 0; - } - e->re.group.size = c; - e->re.group.expr.binary.left = f; - if (!(e->re.group.expr.binary.right = alt(env, parno, 1))) - { - drop(env->disc, e); - return 0; - } - if (token(env) != T_CLOSE) - { - env->error = REG_EPAREN; - return 0; - } - eat(env); - env->parnest--; - return rep(env, e, parno, parno); - case '{': - p = env->cursor; - n = 1; - while (c = *env->cursor) - { - if (c == '\\' && *(env->cursor + 1)) - env->cursor++; - else if (c == '{') - n++; - else if (c == '}' && !--n) - break; - else if (c == env->delimiter || c == env->terminator) - break; - env->cursor++; - } - if (c != '}') - { - env->error = REG_EBRACE; - return 0; - } - if (*++env->cursor != ')') - { - env->error = REG_EPAREN; - return 0; - } - env->cursor++; - env->parnest--; - if (env->disc->re_version < REG_VERSION_EXEC) - { - env->error = REG_BADRPT; - return 0; - } - if (!env->disc->re_execf) - return 0; - if (!(e = node(env, REX_EXEC, 0, 0, 0))) - return 0; - e->re.exec.text = (const char*)p; - e->re.exec.size = env->cursor - p - 2; - if (!env->disc->re_compf) - e->re.exec.data = 0; - else - e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc); - return e; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - c -= '0'; - while (isdigit(*env->cursor)) - { - if (c > (INT_MAX / 10)) - { - env->error = REG_ESUBREG; - return 0; - } - c = c * 10 + *env->cursor++ - '0'; - } - if (*env->cursor == ')') - { - env->cursor++; - env->parnest--; - env->token.len = 1; - if (c > env->parno || !env->paren[c]) - { - env->error = REG_ESUBREG; - return 0; - } - env->paren[c]->re.group.back = 1; - return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); - } - /*FALLTHROUGH*/ - default: - env->error = REG_BADRPT; - return 0; - } - if (x && !(e = alt(env, parno, 0))) - return 0; - c = token(env); - env->parnest--; - if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')')) - { - env->error = REG_EPAREN; - return 0; - } - eat(env); - if (typ >= 0) - { - if (beg) - env->pattern = env->cursor; - env->type = typ; - } - if (!x) - return 0; - if (!(f = node(env, x, 0, 0, 0))) - { - drop(env->disc, e); - return 0; - } - f->re.group.expr.rex = e; - if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT) - { - if (stats(env, e)) - { - drop(env->disc, f); - if (!env->error) - env->error = REG_ECOUNT; - return 0; - } - f->re.group.size = env->stats.m; - memset(&env->stats, 0, sizeof(env->stats)); - } - switch (x) - { - case REX_GROUP: - case REX_GROUP_CUT: - f = rep(env, f, parno, env->parno); - break; - } - return f; -} - -static Rex_t* -seq(Cenv_t* env) -{ - Rex_t* e; - Rex_t* f; - Token_t tok; - int c; - int i; - int n; - int x; - int parno; - int type; - regflags_t flags; - unsigned char* s; - unsigned char* p; - unsigned char* t; - unsigned char* u; - unsigned char buf[256]; - - for (;;) - { - s = buf; - while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len]) - { - x = c; - p = env->cursor; - if (c >= 0) - { - n = 1; - *s++ = (env->flags & REG_ICASE) ? toupper(c) : c; - } - else if (c == C_ESC || (env->flags & REG_ICASE)) - { - c = (c == C_ESC) ? env->token.lex : mbchar(p); - if (env->flags & REG_ICASE) - c = towupper(c); - if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX) - break; - if ((n = mbconv((char*)s, c)) < 0) - *s++ = c; - else if (n) - s += n; - else - { - n = 1; - *s++ = 0; - } - } - else - { - n = env->token.len - env->token.esc; - for (t = p, u = s + n; s < u; *s++ = *t++); - } - eat(env); - } - if (c == T_BAD) - return 0; - if (s > buf) - switch (c) - { - case T_STAR: - case T_PLUS: - case T_LEFT: - case T_QUES: - case T_BANG: - if ((s -= n) == buf) - e = 0; - else - { - i = s - buf; - if (!(e = node(env, REX_STRING, 0, 0, i))) - return 0; - memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i); - e->re.string.size = i; - } - if (x >= 0) - { - if (!(f = node(env, REX_ONECHAR, 1, 1, 0))) - { - drop(env->disc, e); - return 0; - } - f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x; - } - else - { - if (!(f = node(env, REX_STRING, 0, 0, n))) - return 0; - memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n); - f->re.string.size = n; - } - if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env)))) - { - drop(env->disc, e); - return 0; - } - if (e) - f = cat(env, e, f); - return f; - default: - c = s - buf; - if (!(e = node(env, REX_STRING, 0, 0, c))) - return 0; - memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c); - e->re.string.size = c; - return cat(env, e, seq(env)); - } - else if (c > T_BACK) - { - eat(env); - c -= T_BACK; - if (c > env->parno || !env->paren[c]) - { - env->error = REG_ESUBREG; - return 0; - } - env->paren[c]->re.group.back = 1; - e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); - } - else - switch (c) - { - case T_AND: - case T_CLOSE: - case T_BAR: - case T_END: - return node(env, REX_NULL, 0, 0, 0); - case T_DOLL: - eat(env); - e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0); - break; - case T_CFLX: - eat(env); - if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED)) - e = rep(env, e, 0, 0); - break; - case T_OPEN: - tok = env->token; - eat(env); - flags = env->flags; - type = env->type; - if (env->token.att) - env->flags |= REG_MINIMAL; - env->parnest++; - if (env->type == KRE) - ++env->parno; - parno = ++env->parno; - if (!(e = alt(env, parno + 1, 0))) - break; - if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL)) - { - drop(env->disc, e); - env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL; - return 0; - } - if (token(env) != T_CLOSE) - { - drop(env->disc, e); - env->error = REG_EPAREN; - return 0; - } - env->parnest--; - eat(env); - if (!(f = node(env, REX_GROUP, 0, 0, 0))) - { - drop(env->disc, e); - return 0; - } - if (parno < elementsof(env->paren)) - env->paren[parno] = f; - f->re.group.back = 0; - f->re.group.number = parno; - f->re.group.expr.rex = e; - if (tok.lex) - { - tok.push = 1; - env->token = tok; - } - if (!(e = rep(env, f, parno, env->parno))) - return 0; - if (env->type == KRE) - { - if (!(f = node(env, REX_GROUP, 0, 0, 0))) - { - drop(env->disc, e); - return 0; - } - if (--parno < elementsof(env->paren)) - env->paren[parno] = f; - f->re.group.back = 0; - f->re.group.number = parno; - f->re.group.expr.rex = e; - e = f; - } - env->flags = flags; - env->type = type; - break; - case T_GROUP: - p = env->cursor; - eat(env); - flags = env->flags; - type = env->type; - if (!(e = grp(env, env->parno + 1))) - { - if (env->error) - return 0; - if (env->literal == env->pattern && env->literal == p) - env->literal = env->cursor; - continue; - } - env->flags = flags; - env->type = type; - break; - case T_BRA: - eat(env); - if (e = bra(env)) - e = rep(env, e, 0, 0); - break; - case T_ALNUM: - case T_ALNUM_NOT: - case T_DIGIT: - case T_DIGIT_NOT: - case T_SPACE: - case T_SPACE_NOT: - eat(env); - if (e = ccl(env, c)) - e = rep(env, e, 0, 0); - break; - case T_LT: - eat(env); - e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0); - break; - case T_GT: - eat(env); - e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0); - break; - case T_DOT: - eat(env); - e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); - break; - case T_DOTSTAR: - eat(env); - env->token.lex = T_STAR; - env->token.push = 1; - e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); - break; - case T_SLASHPLUS: - eat(env); - env->token.lex = T_PLUS; - env->token.push = 1; - if (e = node(env, REX_ONECHAR, 1, 1, 0)) - { - e->re.onechar = '/'; - e = rep(env, e, 0, 0); - } - break; - case T_WORD: - eat(env); - e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0); - break; - case T_WORD_NOT: - eat(env); - e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0); - break; - case T_BEG_STR: - eat(env); - e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0); - break; - case T_END_STR: - eat(env); - e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0); - break; - case T_FIN_STR: - eat(env); - e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0); - break; - default: - env->error = REG_BADRPT; - return 0; - } - if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator) - e = cat(env, e, seq(env)); - return e; - } -} - -static Rex_t* -con(Cenv_t* env) -{ - Rex_t* e; - Rex_t* f; - Rex_t* g; - - if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND) - return e; - eat(env); - if (!(f = con(env))) - { - drop(env->disc, e); - return 0; - } - if (!(g = node(env, REX_CONJ, 0, 0, 0))) - { - drop(env->disc, e); - drop(env->disc, f); - return 0; - } - g->re.group.expr.binary.left = e; - g->re.group.expr.binary.right = f; - return g; -} - -static Rex_t* -alt(Cenv_t* env, int number, int cond) -{ - Rex_t* e; - Rex_t* f; - Rex_t* g; - - if (!(e = con(env))) - return 0; - else if (token(env) != T_BAR) - { - if (!cond) - return e; - f = 0; - if (e->type == REX_NULL) - goto bad; - } - else - { - eat(env); - if (!(f = alt(env, number, 0))) - { - drop(env->disc, e); - return 0; - } - if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL)) - goto bad; - if (!cond && (g = trie(env, e, f))) - return g; - } - if (!(g = node(env, REX_ALT, 0, 0, 0))) - { - env->error = REG_ESPACE; - goto bad; - } - g->re.group.number = number; - g->re.group.last = env->parno; - g->re.group.expr.binary.left = e; - g->re.group.expr.binary.right = f; - return g; - bad: - drop(env->disc, e); - drop(env->disc, f); - if (!env->error) - env->error = REG_ENULL; - return 0; -} - -/* - * add v to REX_BM tables - */ - -static void -bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b) -{ - int c; - int m; - size_t z; - - for (m = 0; m < n; m++) - { - if (!(z = n - m - 1)) - z = HIT; - c = v[m]; - a->re.bm.mask[m][c] |= b; - if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) - a->re.bm.skip[c] = z; - if (a->flags & REG_ICASE) - { - if (isupper(c)) - c = tolower(c); - else if (islower(c)) - c = toupper(c); - else - continue; - a->re.bm.mask[m][c] |= b; - if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) - a->re.bm.skip[c] = z; - } - } -} - -/* - * set up BM table from trie - */ - -static int -bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b) -{ - do - { - v[m] = x->c; - if (m >= (n - 1)) - { - bmstr(env, a, v, n, b); - if (!(b <<= 1)) - { - b = 1; - a->re.bm.complete = 0; - } - else if (x->son) - a->re.bm.complete = 0; - } - else if (x->son) - b = bmtrie(env, a, v, x->son, n, m + 1, b); - } while (x = x->sib); - return b; -} - -/* - * rewrite the expression tree for some special cases - * 1. it is a null expression - illegal - * 2. max length fixed string found -- use BM algorithm - * 3. it begins with an unanchored string - use KMP algorithm - * 0 returned on success - */ - -static int -special(Cenv_t* env, regex_t* p) -{ - Rex_t* a; - Rex_t* e; - Rex_t* t; - Rex_t* x; - Rex_t* y; - unsigned char* s; - int* f; - int n; - int m; - int k; - - DEBUG_INIT(); - if (e = p->env->rex) - { - if ((x = env->stats.x) && x->re.string.size < 3) - x = 0; - if ((t = env->stats.y) && t->re.trie.min < 3) - t = 0; - if (x && t) - { - if (x->re.string.size >= t->re.trie.min) - t = 0; - else - x = 0; - } - if (x || t) - { - Bm_mask_t** mask; - Bm_mask_t* h; - unsigned char* v; - size_t* q; - unsigned long l; - int i; - int j; - - if (x) - { - y = x; - n = m = x->re.string.size; - l = env->stats.l; - } - else - { - y = t; - n = t->re.trie.min; - m = t->re.trie.max; - l = env->stats.k; - } - if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t)))) - return 1; - if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t)))) - { - alloc(env->disc, q, 0); - return 1; - } - a->flags = y->flags; - a->map = y->map; - a->re.bm.size = n; - a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1; - a->re.bm.left = l - 1; - a->re.bm.right = env->stats.m - l - n; - a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n; - h = (Bm_mask_t*)&a->re.bm.mask[n]; - a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1)); - a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1]; - for (m = 0; m <= UCHAR_MAX; m++) - a->re.bm.skip[m] = n; - a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left); - for (i = 1; i <= n; i++) - a->re.bm.fail[i] = 2 * n - i; - mask = a->re.bm.mask; - for (m = 0; m < n; m++) - { - mask[m] = h; - h += UCHAR_MAX + 1; - } - if (x) - bmstr(env, a, x->re.string.base, n, 1); - else - { - v = (unsigned char*)q; - memset(v, 0, n); - m = 1; - for (i = 0; i <= UCHAR_MAX; i++) - if (t->re.trie.root[i]) - m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m); - } - mask--; - memset(q, 0, n * sizeof(*q)); - for (k = (j = n) + 1; j > 0; j--, k--) - { - DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0)); - for (q[j] = k; k <= n; k = q[k]) - { - for (m = 0; m <= UCHAR_MAX; m++) - if (mask[k][m] == mask[j][m]) - { - DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0)); - goto cut; - } - DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0)); - if (a->re.bm.fail[k] > n - j) - a->re.bm.fail[k] = n - j; - } - cut: ; - } - for (i = 1; i <= n; i++) - if (a->re.bm.fail[i] > n + k - i) - { - DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0)); - a->re.bm.fail[i] = n + k - i; - } -#if _AST_REGEX_DEBUG - if (DEBUG_TEST(0x0020,(1),(0))) - { - sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0); - for (m = 0; m < n; m++) - for (i = 1; i <= UCHAR_MAX; i++) - if (a->re.bm.mask[m][i]) - sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]); - for (i = ' '; i <= UCHAR_MAX; i++) - if (a->re.bm.skip[i] >= HIT) - sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i); - else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n) - sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]); - for (j = 31; j >= 0; j--) - { - sfprintf(sfstderr, " "); - next: - for (m = 0; m < n; m++) - { - for (i = 0040; i < 0177; i++) - if (a->re.bm.mask[m][i] & (1 << j)) - { - sfprintf(sfstderr, " %c", i); - break; - } - if (i >= 0177) - { - if (j > 0) - { - j--; - goto next; - } - sfprintf(sfstderr, " ?"); - } - } - sfprintf(sfstderr, "\n"); - } - sfprintf(sfstderr, "FAIL: "); - for (m = 1; m <= n; m++) - sfprintf(sfstderr, "%3d", a->re.bm.fail[m]); - sfprintf(sfstderr, "\n"); - } -#endif - alloc(env->disc, q, 0); - a->next = e; - p->env->rex = a; - return 0; - } - switch (e->type) - { - case REX_BEG: - if (env->flags & REG_NEWLINE) - return 0; - break; - case REX_GROUP: - if (env->stats.b) - return 0; - e = e->re.group.expr.rex; - if (e->type != REX_DOT) - return 0; - /*FALLTHROUGH*/ - case REX_DOT: - if (e->lo == 0 && e->hi == RE_DUP_INF) - break; - return 0; - case REX_NULL: - if (env->flags & REG_NULL) - break; - env->error = REG_ENULL; - return 1; - case REX_STRING: - if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map) - return 0; - s = e->re.string.base; - n = e->re.string.size; - if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1)))) - return 1; - a->flags = e->flags; - a->map = e->map; - f = a->re.string.fail; - memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n); - s = a->re.string.base; - a->re.string.size = n; - f[0] = m = -1; - for (k = 1; k < n; k++) - { - while (m >= 0 && s[m+1] != s[k]) - m = f[m]; - if (s[m+1] == s[k]) - m++; - f[k] = m; - } - a->next = e->next; - p->env->rex = a; - e->next = 0; - drop(env->disc, e); - break; - default: - return 0; - } - } - p->env->once = 1; - return 0; -} - -int -regcomp(regex_t* p, const char* pattern, regflags_t flags) -{ - Rex_t* e; - Rex_t* f; - regdisc_t* disc; - unsigned char* fold; - int i; - Cenv_t env; - - if (!p) - return REG_BADPAT; - if (flags & REG_DISCIPLINE) - { - flags &= ~REG_DISCIPLINE; - disc = p->re_disc; - } - else - disc = &state.disc; - if (!disc->re_errorlevel) - disc->re_errorlevel = 2; - p->env = 0; - if (!pattern) - return fatal(disc, REG_BADPAT, pattern); - if (!state.initialized) - { - state.initialized = 1; - for (i = 0; i < elementsof(state.escape); i++) - state.magic[state.escape[i].key] = state.escape[i].val; - } - if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data)) - { - if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1))) - return fatal(disc, REG_ESPACE, pattern); - for (i = 0; i <= UCHAR_MAX; i++) - fold[i] = toupper(i); - LCINFO(AST_LC_CTYPE)->data = (void*)fold; - } - again: - if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t)))) - return fatal(disc, REG_ESPACE, pattern); - memset(p->env, 0, sizeof(*p->env)); - memset(&env, 0, sizeof(env)); - env.regex = p; - env.flags = flags; - env.disc = p->env->disc = disc; - if (env.flags & REG_AUGMENTED) - env.flags |= REG_EXTENDED; - env.mappeddot = '.'; - env.mappednewline = '\n'; - env.mappedslash = '/'; - if (disc->re_version >= REG_VERSION_MAP && disc->re_map) - { - env.map = disc->re_map; - env.MAP = p->env->fold; - for (i = 0; i <= UCHAR_MAX; i++) - { - env.MAP[i] = fold[env.map[i]]; - if (env.map[i] == '.') - env.mappeddot = i; - if (env.map[i] == '\n') - env.mappednewline = i; - if (env.map[i] == '/') - env.mappedslash = i; - } - } - else - env.MAP = fold; - env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE; - env.explicit = -1; - if (env.flags & REG_SHELL) - { - if (env.flags & REG_SHELL_PATH) - env.explicit = env.mappedslash; - env.flags |= REG_LENIENT|REG_NULL; - env.type = env.type == BRE ? SRE : KRE; - } - if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE) - env.explicit = env.mappednewline; - p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1; - env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL)); - env.regexp = !!(env.flags & REG_REGEXP); - env.token.lex = 0; - env.token.push = 0; - if (env.flags & REG_DELIMITED) - { - switch (env.delimiter = *pattern++) - { - case 0: - case '\\': - case '\n': - case '\r': - env.error = REG_EDELIM; - goto bad; - } - env.terminator = '\n'; - } - env.literal = env.pattern = env.cursor = (unsigned char*)pattern; - if (!(p->env->rex = alt(&env, 1, 0))) - goto bad; - if (env.parnest) - { - env.error = REG_EPAREN; - goto bad; - } - p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL); - if (env.flags & REG_LEFT) - { - if (p->env->rex->type != REX_BEG) - { - if (p->env->rex->type == REX_ALT) - env.flags &= ~REG_FIRST; - if (!(e = node(&env, REX_BEG, 0, 0, 0))) - { - regfree(p); - return fatal(disc, REG_ESPACE, pattern); - } - e->next = p->env->rex; - p->env->rex = e; - p->env->once = 1; - } - p->env->stats.re_flags |= REG_LEFT; - } - for (e = p->env->rex; e->next; e = e->next); - p->env->done.type = REX_DONE; - p->env->done.flags = e->flags; - if (env.flags & REG_RIGHT) - { - if (e->type != REX_END) - { - if (p->env->rex->type == REX_ALT) - env.flags &= ~REG_FIRST; - if (!(f = node(&env, REX_END, 0, 0, 0))) - { - regfree(p); - return fatal(disc, REG_ESPACE, pattern); - } - f->flags = e->flags; - f->map = e->map; - e->next = f; - } - p->env->stats.re_flags |= REG_RIGHT; - } - if (stats(&env, p->env->rex)) - { - if (!env.error) - env.error = REG_ECOUNT; - goto bad; - } - if (env.stats.b) - p->env->hard = p->env->separate = 1; - else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c))) - p->env->hard = 1; - if (p->env->hard || env.stats.c || env.stats.i) - p->env->stats.re_min = p->env->stats.re_max = -1; - else - { - if (!(p->env->stats.re_min = env.stats.m)) - p->env->stats.re_min = -1; - if (!(p->env->stats.re_max = env.stats.n)) - p->env->stats.re_max = -1; - } - if (special(&env, p)) - goto bad; - serialize(&env, p->env->rex, 1); - p->re_nsub = env.stats.p; - if (env.type == KRE) - p->re_nsub /= 2; - if (env.flags & REG_DELIMITED) - { - p->re_npat = env.cursor - env.pattern + 1; - if (*env.cursor == env.delimiter) - p->re_npat++; - else if (env.flags & REG_MUSTDELIM) - { - env.error = REG_EDELIM; - goto bad; - } - else - env.flags &= ~REG_DELIMITED; - } - p->env->explicit = env.explicit; - p->env->flags = env.flags & REG_COMP; - p->env->min = env.stats.m; - p->env->nsub = env.stats.p + env.stats.u; - p->env->refs = 1; - return 0; - bad: - regfree(p); - if (!env.error) - env.error = REG_ESPACE; - if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL)) - { - flags |= REG_LITERAL; - pattern = (const char*)env.literal; - goto again; - } - return fatal(disc, env.error, pattern); -} - -/* - * regcomp() on sized pattern - * the lazy way around adding and checking env.end - */ - -int -regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags) -{ - char* s; - int r; - - if (!(s = malloc(size + 1))) - return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern); - memcpy(s, pattern, size); - s[size] = 0; - r = regcomp(p, s, flags); - free(s); - return r; -} - -/* - * combine two compiled regular expressions if possible, - * replacing first with the combination and freeing second. - * return 0 on success. - * the only combinations handled are building a Trie - * from String|Kmp|Trie and String|Kmp - */ - -int -regcomb(regex_t* p, regex_t* q) -{ - Rex_t* e = p->env->rex; - Rex_t* f = q->env->rex; - Rex_t* g; - Rex_t* h; - Cenv_t env; - - if (!e || !f) - return fatal(p->env->disc, REG_BADPAT, NiL); - if (p->env->separate || q->env->separate) - return REG_ESUBREG; - memset(&env, 0, sizeof(env)); - env.disc = p->env->disc; - if (e->type == REX_BM) - { - p->env->rex = e->next; - e->next = 0; - drop(env.disc, e); - e = p->env->rex; - } - if (f->type == REX_BM) - { - q->env->rex = f->next; - f->next = 0; - drop(env.disc, f); - f = q->env->rex; - } - if (e->type == REX_BEG && f->type == REX_BEG) - { - p->env->flags |= REG_LEFT; - p->env->rex = e->next; - e->next = 0; - drop(env.disc, e); - e = p->env->rex; - q->env->rex = f->next; - f->next = 0; - drop(env.disc, f); - f = q->env->rex; - } - for (g = e; g->next; g = g->next); - for (h = f; h->next; h = h->next); - if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END) - { - p->env->flags |= REG_RIGHT; - drop(env.disc, g->next); - g->next = 0; - drop(env.disc, h->next); - h->next = 0; - } - if (!(g = trie(&env, f, e))) - return fatal(p->env->disc, REG_BADPAT, NiL); - p->env->rex = g; - if (!q->env->once) - p->env->once = 0; - q->env->rex = 0; - if (p->env->flags & REG_LEFT) - { - if (!(e = node(&env, REX_BEG, 0, 0, 0))) - { - regfree(p); - return fatal(p->env->disc, REG_ESPACE, NiL); - } - e->next = p->env->rex; - p->env->rex = e; - p->env->once = 1; - } - if (p->env->flags & REG_RIGHT) - { - for (f = p->env->rex; f->next; f = f->next); - if (f->type != REX_END) - { - if (!(e = node(&env, REX_END, 0, 0, 0))) - { - regfree(p); - return fatal(p->env->disc, REG_ESPACE, NiL); - } - f->next = e; - } - } - env.explicit = p->env->explicit; - env.flags = p->env->flags; - env.disc = p->env->disc; - if (stats(&env, p->env->rex)) - { - regfree(p); - return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL); - } - if (special(&env, p)) - { - regfree(p); - return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL); - } - p->env->min = g->re.trie.min; - return 0; -} - -/* - * copy a reference of p into q - * p and q may then have separate regsubcomp() instantiations - */ - -int -regdup(regex_t* p, regex_t* q) -{ - if (!p || !q) - return REG_BADPAT; - *q = *p; - p->env->refs++; - q->re_sub = 0; - return 0; -} |