diff options
author | Igor Pashev <pashev.igor@gmail.com> | 2012-06-24 22:28:35 +0000 |
---|---|---|
committer | Igor Pashev <pashev.igor@gmail.com> | 2012-06-24 22:28:35 +0000 |
commit | 3950ffe2a485479f6561c27364d3d7df5a21d124 (patch) | |
tree | 468c6e14449d1b1e279222ec32f676b0311917d2 /src/lib/libast/regex | |
download | ksh-upstream.tar.gz |
Imported Upstream version 93u+upstream
Diffstat (limited to 'src/lib/libast/regex')
-rw-r--r-- | src/lib/libast/regex/regalloc.c | 36 | ||||
-rw-r--r-- | src/lib/libast/regex/regcache.c | 198 | ||||
-rw-r--r-- | src/lib/libast/regex/regclass.c | 298 | ||||
-rw-r--r-- | src/lib/libast/regex/regcoll.c | 120 | ||||
-rw-r--r-- | src/lib/libast/regex/regcomp.c | 3544 | ||||
-rw-r--r-- | src/lib/libast/regex/regdecomp.c | 448 | ||||
-rw-r--r-- | src/lib/libast/regex/regerror.c | 95 | ||||
-rw-r--r-- | src/lib/libast/regex/regexec.c | 54 | ||||
-rw-r--r-- | src/lib/libast/regex/regfatal.c | 49 | ||||
-rw-r--r-- | src/lib/libast/regex/reginit.c | 412 | ||||
-rw-r--r-- | src/lib/libast/regex/reglib.h | 582 | ||||
-rw-r--r-- | src/lib/libast/regex/regnexec.c | 2045 | ||||
-rw-r--r-- | src/lib/libast/regex/regrecord.c | 34 | ||||
-rw-r--r-- | src/lib/libast/regex/regrexec.c | 145 | ||||
-rw-r--r-- | src/lib/libast/regex/regstat.c | 53 | ||||
-rw-r--r-- | src/lib/libast/regex/regsub.c | 269 | ||||
-rw-r--r-- | src/lib/libast/regex/regsubcomp.c | 377 | ||||
-rw-r--r-- | src/lib/libast/regex/regsubexec.c | 196 |
18 files changed, 8955 insertions, 0 deletions
diff --git a/src/lib/libast/regex/regalloc.c b/src/lib/libast/regex/regalloc.c new file mode 100644 index 0000000..03807a4 --- /dev/null +++ b/src/lib/libast/regex/regalloc.c @@ -0,0 +1,36 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex alloc control + */ + +#include "reglib.h" + +void +regalloc(void* handle, void*(*resize)(void*,void*,size_t), regflags_t flags) +{ + state.disc.re_flags = flags; + state.disc.re_resizef = resize; + state.disc.re_resizehandle = handle; +} diff --git a/src/lib/libast/regex/regcache.c b/src/lib/libast/regex/regcache.c new file mode 100644 index 0000000..a45f0e3 --- /dev/null +++ b/src/lib/libast/regex/regcache.c @@ -0,0 +1,198 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * regcomp() regex_t cache + * at&t research + */ + +#include <ast.h> +#include <regex.h> + +#define CACHE 8 /* default # cached re's */ +#define ROUND 64 /* pattern buffer size round */ + +typedef unsigned long Key_t; + +typedef struct Cache_s +{ + char* pattern; + regex_t re; + unsigned long serial; + regflags_t reflags; + int keep; + int size; +} Cache_t; + +typedef struct State_s +{ + unsigned int size; + unsigned long serial; + char* locale; + Cache_t** cache; +} State_t; + +static State_t matchstate; + +/* + * flush the cache + */ + +static void +flushcache(void) +{ + register int i; + + for (i = matchstate.size; i--;) + if (matchstate.cache[i] && matchstate.cache[i]->keep) + { + matchstate.cache[i]->keep = 0; + regfree(&matchstate.cache[i]->re); + } +} + +/* + * return regcomp() compiled re for pattern and reflags + */ + +regex_t* +regcache(const char* pattern, regflags_t reflags, int* status) +{ + register Cache_t* cp; + register int i; + char* s; + int empty; + int unused; + int old; + Key_t key; + + /* + * 0 pattern flushes the cache and reflags>0 extends cache + */ + + if (!pattern) + { + flushcache(); + i = 0; + if (reflags > matchstate.size) + { + if (matchstate.cache = newof(matchstate.cache, Cache_t*, reflags, 0)) + matchstate.size = reflags; + else + { + matchstate.size = 0; + i = 1; + } + } + if (status) + *status = i; + return 0; + } + if (!matchstate.cache) + { + if (!(matchstate.cache = newof(0, Cache_t*, CACHE, 0))) + return 0; + matchstate.size = CACHE; + } + + /* + * flush the cache if the locale changed + * the ast setlocale() intercept maintains + * persistent setlocale() return values + */ + + if ((s = setlocale(LC_CTYPE, NiL)) != matchstate.locale) + { + matchstate.locale = s; + flushcache(); + } + + /* + * check if the pattern is in the cache + */ + + for (i = 0; i < sizeof(key) && pattern[i]; i++) + ((char*)&key)[i] = pattern[i]; + for (; i < sizeof(key); i++) + ((char*)&key)[i] = 0; + empty = unused = -1; + old = 0; + for (i = matchstate.size; i--;) + if (!matchstate.cache[i]) + empty = i; + else if (!matchstate.cache[i]->keep) + unused = i; + else if (*(Key_t*)matchstate.cache[i]->pattern == key && !strcmp(matchstate.cache[i]->pattern, pattern) && matchstate.cache[i]->reflags == reflags) + break; + else if (!matchstate.cache[old] || matchstate.cache[old]->serial > matchstate.cache[i]->serial) + old = i; + if (i < 0) + { + if (unused < 0) + { + if (empty < 0) + unused = old; + else + unused = empty; + } + if (!(cp = matchstate.cache[unused]) && !(cp = matchstate.cache[unused] = newof(0, Cache_t, 1, 0))) + { + if (status) + *status = REG_ESPACE; + return 0; + } + if (cp->keep) + { + cp->keep = 0; + regfree(&cp->re); + } + if ((i = strlen(pattern) + 1) > cp->size) + { + cp->size = roundof(i, ROUND); + if (!(cp->pattern = newof(cp->pattern, char, cp->size, 0))) + { + if (status) + *status = REG_ESPACE; + return 0; + } + } + strcpy(cp->pattern, pattern); + while (++i < sizeof(Key_t)) + cp->pattern[i] = 0; + pattern = (const char*)cp->pattern; + if (i = regcomp(&cp->re, pattern, reflags)) + { + if (status) + *status = i; + return 0; + } + cp->keep = 1; + cp->reflags = reflags; + } + else + cp = matchstate.cache[i]; + cp->serial = ++matchstate.serial; + if (status) + *status = 0; + return &cp->re; +} diff --git a/src/lib/libast/regex/regclass.c b/src/lib/libast/regex/regclass.c new file mode 100644 index 0000000..87ada74 --- /dev/null +++ b/src/lib/libast/regex/regclass.c @@ -0,0 +1,298 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped +/* + * RE character class support + */ + +#include "reglib.h" + +struct Ctype_s; typedef struct Ctype_s Ctype_t; + +struct Ctype_s +{ + const char* name; + size_t size; + regclass_t ctype; + Ctype_t* next; +#if _lib_wctype + wctype_t wtype; +#endif +}; + +static Ctype_t* ctypes; + +/* + * this stuff gets around posix failure to define isblank, + * and the fact that ctype functions are macros + * and any local extensions that may not even have functions or macros + */ + +#if _need_iswblank + +int +_reg_iswblank(wint_t wc) +{ + static int initialized; + static wctype_t wt; + + if (!initialized) + { + initialized = 1; + wt = wctype("blank"); + } + return iswctype(wc, wt); +} + +#endif + +static int Isalnum(int c) { return iswalnum(c); } +static int Isalpha(int c) { return iswalpha(c); } +static int Isblank(int c) { return iswblank(c); } +static int Iscntrl(int c) { return iswcntrl(c); } +static int Isdigit(int c) { return iswdigit(c); } +static int Notdigit(int c) { return !iswdigit(c); } +static int Isgraph(int c) { return iswgraph(c); } +static int Islower(int c) { return iswlower(c); } +static int Isprint(int c) { return iswprint(c); } +static int Ispunct(int c) { return iswpunct(c); } +static int Isspace(int c) { return iswspace(c); } +static int Notspace(int c) { return !iswspace(c); } +static int Isupper(int c) { return iswupper(c); } +static int Isword(int c) { return iswalnum(c) || c == '_'; } +static int Notword(int c) { return !iswalnum(c) && c != '_'; } +static int Isxdigit(int c) { return iswxdigit(c);} + +#if _lib_wctype + +static int Is_wc_1(int); +static int Is_wc_2(int); +static int Is_wc_3(int); +static int Is_wc_4(int); +static int Is_wc_5(int); +static int Is_wc_6(int); +static int Is_wc_7(int); +static int Is_wc_8(int); +static int Is_wc_9(int); +static int Is_wc_10(int); +static int Is_wc_11(int); +static int Is_wc_12(int); +static int Is_wc_13(int); +static int Is_wc_14(int); +static int Is_wc_15(int); +static int Is_wc_16(int); + +#endif + +#define SZ(s) s,(sizeof(s)-1) + +static Ctype_t ctype[] = +{ + { SZ("alnum"), Isalnum }, + { SZ("alpha"), Isalpha }, + { SZ("blank"), Isblank }, + { SZ("cntrl"), Iscntrl }, + { SZ("digit"), Isdigit }, + { SZ("graph"), Isgraph }, + { SZ("lower"), Islower }, + { SZ("print"), Isprint }, + { SZ("punct"), Ispunct }, + { SZ("space"), Isspace }, + { SZ("upper"), Isupper }, + { SZ("word"), Isword }, + { SZ("xdigit"),Isxdigit}, + +#define CTYPES 13 + +#if _lib_wctype + { 0, 0, Is_wc_1 }, + { 0, 0, Is_wc_2 }, + { 0, 0, Is_wc_3 }, + { 0, 0, Is_wc_4 }, + { 0, 0, Is_wc_5 }, + { 0, 0, Is_wc_6 }, + { 0, 0, Is_wc_7 }, + { 0, 0, Is_wc_8 }, + { 0, 0, Is_wc_9 }, + { 0, 0, Is_wc_10 }, + { 0, 0, Is_wc_11 }, + { 0, 0, Is_wc_12 }, + { 0, 0, Is_wc_13 }, + { 0, 0, Is_wc_14 }, + { 0, 0, Is_wc_15 }, + { 0, 0, Is_wc_16 }, + +#define WTYPES 16 + +#else + +#define WTYPES 0 + +#endif +}; + +#if _lib_wctype + +static int Is_wc_1(int c) { return iswctype(c, ctype[CTYPES+0].wtype); } +static int Is_wc_2(int c) { return iswctype(c, ctype[CTYPES+1].wtype); } +static int Is_wc_3(int c) { return iswctype(c, ctype[CTYPES+2].wtype); } +static int Is_wc_4(int c) { return iswctype(c, ctype[CTYPES+3].wtype); } +static int Is_wc_5(int c) { return iswctype(c, ctype[CTYPES+4].wtype); } +static int Is_wc_6(int c) { return iswctype(c, ctype[CTYPES+5].wtype); } +static int Is_wc_7(int c) { return iswctype(c, ctype[CTYPES+6].wtype); } +static int Is_wc_8(int c) { return iswctype(c, ctype[CTYPES+7].wtype); } +static int Is_wc_9(int c) { return iswctype(c, ctype[CTYPES+8].wtype); } +static int Is_wc_10(int c) { return iswctype(c, ctype[CTYPES+9].wtype); } +static int Is_wc_11(int c) { return iswctype(c, ctype[CTYPES+10].wtype); } +static int Is_wc_12(int c) { return iswctype(c, ctype[CTYPES+11].wtype); } +static int Is_wc_13(int c) { return iswctype(c, ctype[CTYPES+12].wtype); } +static int Is_wc_14(int c) { return iswctype(c, ctype[CTYPES+13].wtype); } +static int Is_wc_15(int c) { return iswctype(c, ctype[CTYPES+14].wtype); } +static int Is_wc_16(int c) { return iswctype(c, ctype[CTYPES+15].wtype); } + +#endif + +/* + * return pointer to ctype function for :class:] in s + * s points to the first char after the initial [ + * dynamic wctype classes are locale-specific + * dynamic entry locale is punned in Ctype_t.next + * the search does a lazy (one entry at a time) flush on locale mismatch + * if e!=0 it points to next char in s + * 0 returned on error + */ + +regclass_t +regclass(const char* s, char** e) +{ + register Ctype_t* cp; + register int c; + register size_t n; + register const char* t; + Ctype_t* lc; + Ctype_t* xp; + Ctype_t* zp; + + if (!(c = *s++)) + return 0; + for (t = s; *t && (*t != c || *(t + 1) != ']'); t++); + if (*t != c || !(n = t - s)) + return 0; + for (cp = ctypes; cp; cp = cp->next) + if (n == cp->size && strneq(s, cp->name, n)) + goto found; + xp = zp = 0; + lc = (Ctype_t*)setlocale(LC_CTYPE, NiL); + for (cp = ctype; cp < &ctype[elementsof(ctype)]; cp++) + { +#if _lib_wctype + if (!zp) + { + if (!cp->size) + zp = cp; + else if (!xp && cp->next && cp->next != lc) + xp = cp; + } +#endif + if (n == cp->size && strneq(s, cp->name, n) && (!cp->next || cp->next == lc)) + goto found; + } +#if _lib_wctype + if (!(cp = zp)) + { + if (!(cp = xp)) + return 0; + cp->size = 0; + if (!streq(cp->name, s)) + { + free((char*)cp->name); + cp->name = 0; + } + } + if (!cp->name) + { + if (!(cp->name = (const char*)memdup(s, n + 1))) + return 0; + *((char*)cp->name + n) = 0; + } + /* mvs.390 needs the (char*) cast -- barf */ + if (!(cp->wtype = wctype((char*)cp->name))) + { + free((char*)cp->name); + cp->name = 0; + return 0; + } + cp->size = n; + cp->next = lc; +#endif + found: + if (e) + *e = (char*)t + 2; + return cp->ctype; +} + +/* + * associate the ctype function fun with name + */ + +int +regaddclass(const char* name, regclass_t fun) +{ + register Ctype_t* cp; + register Ctype_t* np; + register size_t n; + + n = strlen(name); + for (cp = ctypes; cp; cp = cp->next) + if (cp->size == n && strneq(name, cp->name, n)) + { + cp->ctype = fun; + return 0; + } + if (!(np = newof(0, Ctype_t, 1, n + 1))) + return REG_ESPACE; + np->size = n; + np->name = strcpy((char*)(np + 1), name); + np->ctype = fun; + np->next = ctypes; + ctypes = np; + return 0; +} + +/* + * return pointer to ctype function for token + */ + +regclass_t +classfun(int type) +{ + switch (type) + { + case T_ALNUM: return Isword; + case T_ALNUM_NOT: return Notword; + case T_DIGIT: return Isdigit; + case T_DIGIT_NOT: return Notdigit; + case T_SPACE: return Isspace; + case T_SPACE_NOT: return Notspace; + } + return 0; +} diff --git a/src/lib/libast/regex/regcoll.c b/src/lib/libast/regex/regcoll.c new file mode 100644 index 0000000..64dc7a8 --- /dev/null +++ b/src/lib/libast/regex/regcoll.c @@ -0,0 +1,120 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped +/* + * regex collation symbol support + */ + +#include "reglib.h" + +/* + * return the collating symbol delimited by [c c], where c is either '=' or '.' + * s points to the first char after the initial [ + * if e!=0 it is set to point to the next char in s on return + * + * the collating symbol is converted to multibyte in <buf,size> + * the return value is: + * -1 syntax error / invalid collating element + * >=0 size with 0-terminated mb character (*wc != 0) + * or collating element (*wc == 0) in buf + */ + +int +regcollate(register const char* s, char** e, char* buf, size_t size, wchar_t* wc) +{ + register int c; + register char* b; + register char* x; + const char* t; + int i; + int r; + int term; + wchar_t w; + char xfm[256]; + char tmp[sizeof(xfm)]; + + if (size < 2 || (term = *s) != '.' && term != '=' || !*++s || *s == term && *(s + 1) == ']') + goto nope; + t = s; + w = mbchar(s); + if ((r = (s - t)) > 1) + { + if (*s++ != term || *s++ != ']') + goto oops; + goto done; + } + if (*s == term && *(s + 1) == ']') + { + s += 2; + goto done; + } + b = buf; + x = buf + size - 2; + s = t; + for (;;) + { + if (!(c = *s++)) + goto oops; + if (c == term) + { + if (!(c = *s++)) + goto oops; + if (c != term) + { + if (c != ']') + goto oops; + break; + } + } + if (b < x) + *b++ = c; + } + r = s - t - 2; + w = 0; + if (b >= x) + goto done; + *b = 0; + for (i = 0; i < r && i < sizeof(tmp) - 1; i++) + tmp[i] = '0'; + tmp[i] = 0; + if (mbxfrm(xfm, buf, sizeof(xfm)) >= mbxfrm(xfm, tmp, sizeof(xfm))) + goto nope; + t = (const char*)buf; + done: + if (r <= size) + { + memcpy(buf, t, r); + if (r < size) + buf[r] = 0; + } + if (wc) + *wc = w; + if (e) + *e = (char*)s; + return r; + oops: + s--; + nope: + if (e) + *e = (char*)s; + return -1; +} diff --git a/src/lib/libast/regex/regcomp.c b/src/lib/libast/regex/regcomp.c new file mode 100644 index 0000000..416d453 --- /dev/null +++ b/src/lib/libast/regex/regcomp.c @@ -0,0 +1,3544 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex compiler + */ + +#include "reglib.h" + +#if _PACKAGE_ast +#include "lclib.h" +#endif + +#define serialize re_serialize /* hp.ia64 <unistd.h>! */ + +#define C_ESC (-1) +#define C_MB (-2) + +#if _AST_REGEX_DEBUG + +#define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) +#define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) +#define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0) + +static unsigned long debug; +static unsigned long debug_flag; + +#else + +#define DEBUG_INIT() +#define DEBUG_TEST(f,y,n) (n) +#define DEBUG_CODE(f,y,n) do {n} while(0) + +#endif + +#if _PACKAGE_ast + +typedef struct Cchr_s +{ + Dtlink_t lnk; + unsigned char nam[2]; + Ckey_t key; +} Cchr_t; + +#endif + +#define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0) + +/* + * determine whether greedy matching will work, i.e. produce + * the best match first. such expressions are "easy", and + * need no backtracking once a complete match is found. + * if an expression has backreferences or alts it's hard + * else if it has only one closure it's easy + * else if all closures are simple (i.e. one-character) it's easy + * else it's hard. + */ + +typedef struct Stats_s +{ + unsigned long l; /* min length to left of x */ + unsigned long k; /* min length to left of y */ + unsigned long m; /* min length */ + unsigned long n; /* max length */ + unsigned short a; /* number of alternations */ + unsigned short b; /* number of backrefs */ + unsigned short c; /* number of closures */ + unsigned short e; /* $ */ + unsigned short i; /* number of negations */ + unsigned short p; /* number of named subexpressions */ + unsigned short s; /* number of simple closures */ + unsigned short t; /* number of tries */ + unsigned short u; /* number of unnamed subexpressions */ + Rex_t* x; /* max length REX_STRING */ + Rex_t* y; /* max length REX_TRIE */ +} Stats_t; + +typedef struct Token_s +{ + unsigned long min; + unsigned long max; + short lex; + short len; + short esc; + short att; + short push; +} Token_t; + +typedef struct Cenv_s +{ + int delimiter; /* pattern delimiter */ + int error; /* last error */ + int explicit; /* explicit match on this char */ + int mappeddot; /* inverse mapped '.' */ + int mappednewline; /* inverse mapped '\n' */ + int mappedslash; /* inverse mapped '/' */ + regflags_t flags; /* flags arg to regcomp */ + int type; /* BRE,ERE,ARE,SRE,KRE */ + unsigned char* cursor; /* curent point in re */ + unsigned char* pattern; /* the original pattern */ + unsigned char* literal; /* literal restart pattern */ + int parno; /* number of last open paren */ + int parnest; /* paren nest count */ + int posixkludge; /* to make * nonspecial */ + int regexp; /* <regexp.h> compatibility */ + Token_t token; /* token lookahead */ + Stats_t stats; /* RE statistics */ + int terminator; /* pattern terminator */ + Rex_t* paren[2*(BACK_REF_MAX+2)]; + /* paren[i]!=0 if \i defined */ + regex_t* regex; /* user handle */ + regdisc_t* disc; /* user discipline */ + unsigned char* map; /* external to native ccode map */ + unsigned char* MAP; /* fold and/or map */ +} Cenv_t; + +/* + * allocate a new Rex_t node + */ + +#if _BLD_DEBUG +#define node(a,b,c,d,e) node(a,b,c,d,e, const char* file, unsigned int line) +#endif + +static Rex_t* +node(Cenv_t* env, int type, int lo, int hi, size_t extra) +{ + register Rex_t* e; + + DEBUG_TEST(0x0800,(sfprintf(sfstdout, "%s:%d node(%d,%d,%d,%u)\n", file, line, type, lo, hi, sizeof(Rex_t) + extra)),(0)); + if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra)) + { + memset(e, 0, sizeof(Rex_t) + extra); + e->type = type; + e->marked = 0; + e->lo = lo; + e->hi = hi; + e->flags = env->flags; + e->map = (e->flags & REG_ICASE) ? env->MAP : env->map; + e->explicit = env->explicit; + if (extra) + e->re.data = (char*)e + sizeof(Rex_t); + } + return e; +} + +#if _BLD_DEBUG +#undef node +#define node(a,b,c,d,e) node(a,b,c,d,e,__FILE__,__LINE__) +#endif + +/* + * free a Trie_node_t node + */ + +static void +triedrop(regdisc_t* disc, Trie_node_t* e) +{ + if (e) + { + triedrop(disc, e->son); + triedrop(disc, e->sib); + alloc(disc, e, 0); + } +} + +/* + * free a Rex_t node + */ + +void +drop(regdisc_t* disc, Rex_t* e) +{ + int i; + Rex_t* x; + + if (e && !(disc->re_flags & REG_NOFREE)) + do + { + switch (e->type) + { + case REX_ALT: + case REX_CONJ: + drop(disc, e->re.group.expr.binary.left); + drop(disc, e->re.group.expr.binary.right); + break; + case REX_GROUP: + case REX_GROUP_AHEAD: + case REX_GROUP_AHEAD_NOT: + case REX_GROUP_BEHIND: + case REX_GROUP_BEHIND_NOT: + case REX_GROUP_CUT: + case REX_NEG: + case REX_REP: + drop(disc, e->re.group.expr.rex); + break; + case REX_TRIE: + for (i = 0; i <= UCHAR_MAX; i++) + triedrop(disc, e->re.trie.root[i]); + break; + } + x = e->next; + alloc(disc, e, 0); + } while (e = x); +} + +/* + * mark e and descendants minimal + */ + +static void +mark(register Rex_t* e, int set) +{ + if (e && !e->marked) + do + { + e->marked = 1; + if (set) + e->flags |= REG_MINIMAL; + else + e->flags &= ~REG_MINIMAL; + switch (e->type) + { + case REX_ALT: + case REX_CONJ: + case REX_GROUP_COND: + if (e->re.group.expr.binary.left) + mark(e->re.group.expr.binary.left, set); + if (e->re.group.expr.binary.right) + mark(e->re.group.expr.binary.right, set); + break; + case REX_GROUP: + case REX_GROUP_AHEAD: + case REX_GROUP_AHEAD_NOT: + case REX_GROUP_BEHIND: + case REX_GROUP_BEHIND_NOT: + case REX_GROUP_CUT: + case REX_NEG: + case REX_REP: + case REX_TRIE: + mark(e->re.group.expr.rex, set); + break; + } + } while (e = e->next); +} + +/* + * assign subexpression numbers by a preorder tree walk + */ + +static int +serialize(Cenv_t* env, Rex_t* e, int n) +{ + do + { + e->serial = n++; + switch (e->type) + { + case REX_ALT: + case REX_GROUP_COND: + if (e->re.group.expr.binary.left) + n = serialize(env, e->re.group.expr.binary.left, n); + e->re.group.expr.binary.serial = n++; + if (e->re.group.expr.binary.right) + n = serialize(env, e->re.group.expr.binary.right, n); + break; + case REX_CONJ: + n = serialize(env, e->re.group.expr.binary.left, n); + n = serialize(env, e->re.group.expr.binary.right, n); + break; + case REX_GROUP: + case REX_GROUP_AHEAD: + case REX_GROUP_AHEAD_NOT: + case REX_GROUP_BEHIND: + case REX_GROUP_BEHIND_NOT: + case REX_GROUP_CUT: + case REX_NEG: + case REX_REP: + n = serialize(env, e->re.group.expr.rex, n); + break; + } + } while (e = e->next); + return n; +} + +/* + * catenate e and f into a sequence, collapsing them if possible + */ + +static Rex_t* +cat(Cenv_t* env, Rex_t* e, Rex_t* f) +{ + Rex_t* g; + + if (!f) + { + drop(env->disc, e); + return 0; + } + if (e->type == REX_NULL) + { + drop(env->disc, e); + return f; + } + if (f->type == REX_NULL) + { + g = f->next; + f->next = 0; + drop(env->disc, f); + f = g; + } + else if (e->type == REX_DOT && f->type == REX_DOT) + { + unsigned int m = e->lo + f->lo; + unsigned int n = e->hi + f->hi; + + if (m <= RE_DUP_MAX) + { + if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX) + { + n = RE_DUP_INF; + goto combine; + } + else if (n <= RE_DUP_MAX) + { + combine: + e->lo = m; + e->hi = n; + g = f->next; + f->next = 0; + drop(env->disc, f); + f = g; + } + } + } + e->next = f; + return e; +} + +/* + * collect re statistics + */ + +static int +stats(register Cenv_t* env, register Rex_t* e) +{ + register unsigned long n; + register unsigned long m; + unsigned long cm; + unsigned long nm; + unsigned long cn; + unsigned long nn; + unsigned long l; + unsigned long k; + unsigned long t; + Rex_t* q; + Rex_t* x; + Rex_t* y; + unsigned char c; + unsigned char b; + + do + { + switch (e->type) + { + case REX_ALT: + x = env->stats.x; + l = env->stats.l; + y = env->stats.y; + k = env->stats.k; + t = env->stats.t; + if (++env->stats.a <= 0) + return 1; + cm = env->stats.m; + env->stats.m = 0; + cn = env->stats.n; + env->stats.n = 0; + if (stats(env, e->re.group.expr.binary.left)) + return 1; + m = env->stats.m; + env->stats.m = 0; + n = env->stats.n; + env->stats.n = 0; + if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right)) + return 1; + if (env->stats.m > m) + env->stats.m = m; + else + m = env->stats.m; + if ((env->stats.m += cm) < m) + return 1; + if (env->stats.n < n) + env->stats.n = n; + else + n = env->stats.n; + if ((env->stats.n += cn) < n) + return 1; + env->stats.x = x; + env->stats.l = l; + env->stats.y = y; + env->stats.k = k; + env->stats.t = t; + break; + case REX_BACK: + if (++env->stats.b <= 0) + return 1; + break; + case REX_CLASS: + case REX_COLL_CLASS: + case REX_DOT: + case REX_ONECHAR: + n = env->stats.m; + if ((env->stats.m += e->lo) < n) + return 1; + if (e->hi != RE_DUP_INF) + { + n = env->stats.n; + if ((env->stats.n += e->hi) < n) + return 1; + } + if (e->lo != e->hi) + { + if (++env->stats.c <= 0) + return 1; + if (++env->stats.s <= 0) + return 1; + } + break; + case REX_CONJ: + cm = env->stats.m; + env->stats.m = 0; + cn = env->stats.n; + env->stats.n = 0; + if (stats(env, e->re.group.expr.binary.left)) + return 1; + nm = env->stats.m; + env->stats.m = 0; + nn = env->stats.n; + env->stats.n = 0; + if (stats(env, e->re.group.expr.binary.right)) + return 1; + if (env->stats.m < nm) + env->stats.m = nm; + else + nm = env->stats.m; + if ((env->stats.m += cm) < nm) + return 1; + if (env->stats.n < nn) + env->stats.n = nn; + else + nn = env->stats.n; + if ((env->stats.n += cn) < nn) + return 1; + break; + case REX_END: + env->stats.e = 1; + break; + case REX_GROUP: + if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0) + return 1; + if (stats(env, e->re.group.expr.rex)) + return 1; + break; + case REX_GROUP_AHEAD: + case REX_GROUP_AHEAD_NOT: + case REX_GROUP_BEHIND: + case REX_GROUP_BEHIND_NOT: + m = env->stats.m; + n = env->stats.n; + x = env->stats.x; + y = env->stats.y; + if (stats(env, e->re.group.expr.rex)) + return 1; + env->stats.m = m; + env->stats.n = n; + env->stats.x = x; + env->stats.y = y; + switch (e->type) + { + case REX_GROUP_AHEAD: + case REX_GROUP_BEHIND: + if (++env->stats.u <= 0) + return 1; + break; + } + break; + case REX_GROUP_COND: + if (++env->stats.u <= 0) + return 1; + m = env->stats.m; + n = env->stats.n; + x = env->stats.x; + y = env->stats.y; + if (e->re.group.size > 0 && ++env->stats.b <= 0) + return 1; + if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left)) + return 1; + if (q = e->re.group.expr.binary.right) + { + if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left)) + return 1; + if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right)) + return 1; + } + env->stats.m = m; + env->stats.n = n; + env->stats.x = x; + env->stats.y = y; + break; + case REX_GROUP_CUT: + if (++env->stats.u <= 0) + return 1; + m = env->stats.m; + n = env->stats.n; + x = env->stats.x; + y = env->stats.y; + if (stats(env, e->re.group.expr.rex)) + return 1; + env->stats.m = m; + env->stats.n = n; + env->stats.x = x; + env->stats.y = y; + break; + case REX_NEG: + env->stats.i++; + x = env->stats.x; + l = env->stats.l; + y = env->stats.y; + k = env->stats.k; + t = env->stats.t; + cm = env->stats.m; + env->stats.m = 0; + if (stats(env, e->re.group.expr.rex)) + return 1; + env->stats.m = !env->stats.m; + if ((env->stats.m += cm) < cm) + return 1; + env->stats.x = x; + env->stats.l = l; + env->stats.y = y; + env->stats.k = k; + env->stats.t = t; + break; + case REX_REP: + x = env->stats.x; + l = env->stats.l; + y = env->stats.y; + k = env->stats.k; + t = env->stats.t; + if (++env->stats.c <= 0) + return 1; + b = env->stats.b; + c = env->stats.c; + cm = env->stats.m; + env->stats.m = 0; + if (stats(env, e->re.group.expr.rex)) + return 1; + if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0) + return 1; + if (e->lo < 1) + { + env->stats.x = x; + env->stats.l = l; + env->stats.y = y; + env->stats.k = k; + env->stats.t = t; + env->stats.m = cm; + } + else + { + m = env->stats.m; + if ((env->stats.m *= e->lo) > 0 && env->stats.m < m) + return 1; + m = env->stats.m; + if ((env->stats.m += cm) < m) + return 1; + if (env->stats.x != x) + env->stats.l = cm; + if (env->stats.y != y) + env->stats.k = cm; + } + break; + case REX_STRING: + if (!e->map) + { + cm = env->stats.m; + if ((env->stats.m += e->re.string.size) < cm) + return 1; + cn = env->stats.n; + if ((env->stats.n += e->re.string.size) < cn) + return 1; + if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size) + { + env->stats.x = e; + env->stats.l = cm; + } + } + break; + case REX_TRIE: + if (++env->stats.s <= 0) + return 1; + cm = env->stats.m; + if ((env->stats.m += e->re.trie.min) < cm) + return 1; + cn = env->stats.n; + if ((env->stats.n += e->re.trie.max) < cn) + return 1; + env->stats.t++; + if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min) + { + env->stats.y = e; + env->stats.k = cm; + } + break; + } + } while (e = e->next); + return 0; +} + +static int token(Cenv_t*); + +static int +magic(register Cenv_t* env, register int c, int escaped) +{ + register char* sp; + register int n; + int o = c; + int e = env->error; + int l = env->token.len; + short* mp; + char* ep; + + if (mp = state.magic[c]) + { + c = mp[env->type+escaped]; + if (c >= T_META) + { + sp = (char*)env->cursor + env->token.len; + switch (c) + { + case T_LEFT: + n = 0; + ep = sp; + while (*sp >= '0' && *sp <= '9') + { + if (n > (INT_MAX / 10)) + { + env->error = REG_BADBR; + goto bad; + } + n = n * 10 + *sp++ - '0'; + } + if (sp == ep) + { + if (env->type < SRE || *sp != ',') + { + env->error = *sp ? REG_BADBR : REG_EBRACE; + goto bad; + } + } + else if (n > RE_DUP_MAX) + { + env->error = REG_BADBR; + goto bad; + } + env->token.min = n; + if (*sp == ',') + { + n = 0; + ep = ++sp; + while (*sp >= '0' && *sp <= '9') + { + if (n > (INT_MAX / 10)) + { + env->error = REG_BADBR; + goto bad; + } + n = n * 10 + *sp++ - '0'; + } + if (sp == ep) + n = RE_DUP_INF; + else if (n < env->token.min) + { + env->error = REG_BADBR; + goto bad; + } + } + env->token.max = n; + switch (*sp) + { + case 0: + env->error = REG_EBRACE; + goto bad; + case '\\': + if (!escaped) + { + env->error = REG_BADBR; + goto bad; + } + sp++; + break; + default: + if (escaped) + { + env->error = REG_BADBR; + goto bad; + } + break; + } + switch (*sp++) + { + case 0: + env->error = REG_EBRACE; + goto bad; + case '}': + break; + default: + env->error = REG_BADBR; + goto bad; + } + env->token.len = sp - (char*)env->cursor; + break; + case T_RIGHT: + env->error = REG_EBRACE; + goto bad; + case T_OPEN: + if (env->type < SRE && *sp == '?') + { + env->token.len++; + env->token.lex = 0; + goto group; + } + break; + case T_ESCAPE: + c = chresc(sp - 2, &ep); + if (ep < sp) + goto bad; + env->token.len += ep - sp; + if (c >= T_META) + { + env->token.lex = c; + c = C_ESC; + } + return c; + case T_BACK+0: + case T_BACK+1: + case T_BACK+2: + case T_BACK+3: + case T_BACK+4: + case T_BACK+5: + case T_BACK+6: + case T_BACK+7: + n = chresc(sp - 2, &ep); + if (ep > sp + 1) + { + env->token.len += ep - sp; + return n; + } + /*FALLTHROUGH*/ + case T_BACK+8: + case T_BACK+9: + if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT)) + { + env->error = REG_BADESC; + goto bad; + } + if ((env->flags & REG_MULTIREF) && isdigit(*sp)) + { + c = (c - T_BACK) * 10 + (*sp - '0'); + if (c > 0 && c <= env->parno && env->paren[c]) + c += T_BACK; + else + c = chresc(sp - 2, &ep); + env->token.len++; + } + if (c == T_BACK) + c = 0; + break; + case T_BAD: + if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META) + return c; + goto bad; + } + if (env->type >= SRE) + { + if (c == T_DOT) + c = '.'; + else if (c < T_OPEN) + { + if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(') + { + env->token.len++; + env->token.att = 1; + } + if (env->type == KRE && *(env->cursor + env->token.len) == '(') + { + env->token.len++; + switch (c) + { + case T_AT: + break; + case T_PERCENT: + env->token.lex = c; + goto group; + case T_TILDE: + env->token.lex = 0; + goto group; + default: + env->token.lex = c; + break; + } + c = T_OPEN; + } + else if (c == T_STAR) + c = T_DOTSTAR; + else if (c == T_QUES) + c = T_DOT; + else + { + c = o; + env->token.len = l; + } + } + else if (c > T_BACK) + { + c = (c - T_BACK) * 2 - 1; + c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c; + } + else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP)) + { + if (c == T_AND) + c = '&'; + else if (c == T_BAR) + c = '|'; + else if (c == T_OPEN) + c = '('; + } + } + } + } + else if (escaped == 2) + { + if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter)) + /*ok*/; + else + { + env->error = REG_BADESC; + goto bad; + } + } + else if (escaped && !(env->flags & REG_LENIENT) && c != ']') + { + env->error = REG_BADESC; + goto bad; + } + return c; + group: + sp = (char*)env->cursor + env->token.len; + switch (*sp++) + { + case ')': + break; + case '#': + for (;;) + { + switch (*sp++) + { + case 0: + env->error = REG_EPAREN; + return T_BAD; + case ')': + break; + default: + continue; + } + break; + } + break; + default: + return T_GROUP; + } + env->cursor = (unsigned char*)sp; + return token(env); + bad: + if (escaped == 2) + env->error = e; + else if (env->flags & REG_LENIENT) + return o; + else if (escaped == 1 && !env->error) + { + if (mp || o == ']') + return o; + env->error = REG_BADESC; + } + return T_BAD; +} + +static int +token(register Cenv_t* env) +{ + int c; + int posixkludge; + + if (env->token.push) + return env->token.lex; + env->token.att = env->token.esc = 0; + if ((env->token.len = MBSIZE(env->cursor)) > 1) + return env->token.lex = C_MB; + env->token.lex = 0; + for (;;) + { + c = *env->cursor; + if (c == 0 || c == env->delimiter || c == env->terminator) + return T_END; + if (!(env->flags & REG_COMMENT)) + break; + if (c == '#') + { + do + { + c = *++env->cursor; + if (c == 0 || c == env->delimiter) + return T_END; + } while (c != '\n'); + } + else if (!isspace(c)) + break; + env->cursor++; + } + if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter) + { + if (env->parnest) + { + env->error = REG_EPAREN; + return T_BAD; + } + env->parno = 0; + env->pattern = env->cursor + 1; + return T_BAR; + } + if (env->flags & REG_LITERAL) + return c; + if (posixkludge = env->posixkludge) + { + env->posixkludge = 0; + if (c == '*') + return c; + } + if (c == '\\') + { + if (env->flags & REG_SHELL_ESCAPED) + return c; + if (!(c = *(env->cursor + 1)) || c == env->terminator) + { + if (env->flags & REG_LENIENT) + { + if (c) + { + env->token.esc = env->token.len; + env->token.len += MBSIZE(env->cursor + 1); + return c; + } + return '\\'; + } + env->error = REG_EESCAPE; + return T_BAD; + } + env->token.esc = env->token.len; + env->token.len += MBSIZE(env->cursor + 1); + if (env->delimiter && c == 'n') + return '\n'; + else if (c == env->delimiter) + return magic(env, c, 0); + else if (c == '(' && env->type == BRE) + env->posixkludge = 1; + else if (c == ')' && env->type == BRE && env->parnest <= 0) + { + env->error = REG_EPAREN; + return T_BAD; + } + else if (isspace(c) && (env->flags & REG_COMMENT)) + return c; + return magic(env, c, 1); + } + else if (c == '$') + { + if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n') + return T_DOLL; + } + else if (c == '^') + { + if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1)) + { + env->posixkludge = 2; + return T_CFLX; + } + } + else if (c == ')') + { + if (env->type != BRE && env->parnest <= 0) + return c; + } + else if (c == '/' && env->explicit == env->mappedslash) + { + while (*(env->cursor + env->token.len) == c) + env->token.len++; + return T_SLASHPLUS; + } + return magic(env, c, 0); +} + +static Celt_t* +col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec) +{ + register char* s; + register unsigned char* k; + register unsigned char* e; + register int c; + register int cc; + int bt; + int et; + Ckey_t key; + + cc = 0; + for (;;) + { + k = key; + if (bw == 1) + { + c = bc; + if (ic) + { + if (isupper(c)) + { + c = tolower(c); + cc = -1; + } + else if (islower(c)) + { + c = toupper(c); + cc = -1; + } + } + *k++ = c; + } + else if (bw < COLL_KEY_MAX) + { + s = (char*)bp; + if (ic) + { + c = mbchar(s); + if (iswupper(c)) + { + c = towlower(c); + cc = 1; + } + else if (iswlower(c)) + { + c = towupper(c); + cc = 1; + } + } + if (cc > 0) + { + cc = -1; + k += mbconv((char*)k, c); + } + else + for (e = k + bw; k < e; *k++ = *s++); + } + *k = 0; + mbxfrm(ce->beg, key, COLL_KEY_MAX); + if (ep) + { + k = key; + c = mbchar(k); + if (iswupper(c)) + bt = COLL_range_uc; + else if (iswlower(c)) + bt = COLL_range_lc; + else + bt = COLL_range; + k = key; + if (ew == 1) + { + c = ec; + if (ic) + { + if (isupper(c)) + { + c = tolower(c); + cc = -1; + } + else if (islower(c)) + { + c = toupper(c); + cc = -1; + } + } + *k++ = c; + } + else if (ew < COLL_KEY_MAX) + { + s = (char*)ep; + if (ic) + { + c = mbchar(s); + if (iswupper(c)) + { + c = towlower(c); + cc = 1; + } + else if (iswlower(c)) + { + c = towupper(c); + cc = 1; + } + } + if (cc > 0) + { + cc = -1; + k += mbconv((char*)k, c); + } + else + for (e = k + ew; k < e; *k++ = *s++); + } + *k = 0; + mbxfrm(ce->end, key, COLL_KEY_MAX); + k = key; + c = mbchar(k); + if (iswupper(c)) + et = COLL_range_uc; + else if (iswlower(c)) + et = COLL_range_lc; + else + et = COLL_range; + ce->typ = bt == et ? bt : COLL_range; + } + else + ce->typ = COLL_char; + ce++; + if (!ic || !cc) + break; + ic = 0; + } + return ce; +} + +static Rex_t* +bra(Cenv_t* env) +{ + Rex_t* e; + int c; + int i; + int w; + int neg; + int last; + int inrange; + int complicated; + int collate; + int elements; + unsigned char* first; + unsigned char* start; + unsigned char* begin; + unsigned char* s; + regclass_t f; + unsigned char buf[4 * (COLL_KEY_MAX + 1)]; +#if _PACKAGE_ast + int ic; + char mbc[COLL_KEY_MAX + 1]; +#endif + + if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) + return 0; + collate = complicated = elements = 0; + if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!') + { + env->cursor++; + neg = 1; + } + else + neg = 0; + first = env->cursor; + start = first + MBSIZE(first); + if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter)) + goto error; + begin = env->cursor + MBSIZE(env->cursor); + + /* + * inrange: 0=no, 1=possibly, 2=definitely + */ + + inrange = 0; + for (;;) + { + if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE)) + goto error; + env->cursor += (w = MBSIZE(env->cursor)); + if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))) + { + if (*env->cursor) + { + if (*env->cursor == 'n') + { + env->cursor++; + c = '\n'; + } + else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) + { + env->token.len = 1; + w = magic(env, *env->cursor, 2); + if (env->token.len > 1 || w != T_BAD) + { + if (env->token.len == 1 && (f = classfun(w))) + { + if (inrange > 1) + { + if (env->type < SRE && !(env->flags & REG_LENIENT)) + goto erange; + inrange = 0; + } + env->cursor++; + for (c = 0; c <= UCHAR_MAX; c++) + if ((*f)(c)) + setadd(e->re.charclass, c); + complicated++; + elements++; + continue; + } + if (env->token.len > 1 || w >= 0 && w < T_META) + { + c = w; + if (c > UCHAR_MAX) + { + if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide()) + goto erange; + c = UCHAR_MAX; + } + env->cursor += env->token.len; + } + } + } + } + } + else if (c == ']') + { + if (env->cursor == begin) + { + last = c; + inrange = 1; + continue; + } + if (inrange != 0) + { + setadd(e->re.charclass, last); + elements++; + if (inrange == 2) + { + setadd(e->re.charclass, '-'); + elements++; + } + } + break; + } + else if (c == '-') + { + if (!inrange && env->cursor != begin && *env->cursor != ']') + { + if (env->type < SRE && !(env->flags & REG_LENIENT)) + goto erange; + continue; + } + else if (inrange == 1) + { + inrange = 2; + complicated++; + continue; + } + } + else if (c == '[') + { + switch (*env->cursor) + { + case 0: + goto error; + case ':': + if (env->regexp) + goto normal; + if (inrange == 1) + { + setadd(e->re.charclass, last); + elements++; + } + if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) + { + if (env->cursor == start && (c = *(env->cursor + 1))) + { + s = start = env->cursor + 1; + while (*++s && *s != ':'); + if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']') + { + if ((i = (s - start)) == 1) + { + switch (c) + { + case '<': + i = REX_WBEG; + break; + case '>': + i = REX_WEND; + break; + default: + i = 0; + break; + } + if (i) + { + env->cursor = s + 3; + drop(env->disc, e); + return node(env, i, 0, 0, 0); + } + } + } + } + env->error = REG_ECTYPE; + goto error; + } + for (c = 0; c <= UCHAR_MAX; c++) + if ((*f)(c)) + setadd(e->re.charclass, c); + inrange = 0; + complicated++; + elements++; + continue; + case '=': + if (env->regexp) + goto normal; + if (inrange == 2) + goto erange; + if (inrange == 1) + { + setadd(e->re.charclass, last); + elements++; + } + if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf), NiL)) < 0) + goto ecollate; + if (c > 1) + collate++; + else + setadd(e->re.charclass, buf[0]); + c = buf[0]; + inrange = 0; + complicated++; + elements++; + continue; + case '.': + if (env->regexp) + goto normal; + if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf), NiL)) < 0) + goto ecollate; + if (c > 1) + collate++; + c = buf[0]; + complicated++; + break; + default: + normal: + if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) + goto error; + break; + } + } + else if (w > 1) + complicated++; + if (inrange == 2) + { + if (last <= c) + { + for (i = last; i <= c; i++) + setadd(e->re.charclass, i); + inrange = env->type >= SRE || (env->flags & REG_LENIENT); + elements += 2; + } + else if (env->type >= SRE) + { + setadd(e->re.charclass, last); + setadd(e->re.charclass, c); + elements += 2; + inrange = 1; + } + else if (!(env->flags & REG_LENIENT)) + goto erange; + else + inrange = 0; + } + else if (inrange == 1) + { + setadd(e->re.charclass, last); + elements++; + } + else + inrange = 1; + last = c; + } +#if _PACKAGE_ast + if (complicated && mbcoll()) + { + Dt_t* dt; + Cchr_t* cc; + Cchr_t* tc; + Cchr_t* xc; + Celt_t* ce; + Cchr_t key; + int rw; + int rc; + wchar_t wc; + unsigned char* rp; + unsigned char* pp; + char* wp; + char cb[2][COLL_KEY_MAX+1]; + + static Dtdisc_t disc; + + static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + + if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data)) + { + disc.key = offsetof(Cchr_t, key); + if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dtoset))) + { + for (i = 0; i < elementsof(primary) - 1; i++, cc++) + { + cc->nam[0] = primary[i]; + mbxfrm(cc->key, cc->nam, COLL_KEY_MAX); + dtinsert(dt, cc); + } + for (i = 0; i < elementsof(cc->key); i++) + cc->key[i] = ~0; + dtinsert(dt, cc); + LCINFO(AST_LC_COLLATE)->data = (void*)dt; + } + else + { + if (cc) + free(cc); + drop(env->disc, e); + return 0; + } + } + if (dt) + { + drop(env->disc, e); + if (ic = env->flags & REG_ICASE) + elements *= 2; + if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 3) * sizeof(Celt_t)))) + return 0; + ce = (Celt_t*)e->re.data; + e->re.collate.invert = neg; + e->re.collate.elements = ce; + env->cursor = first; + inrange = 0; + for (;;) + { + if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) + goto error; + pp = env->cursor; + env->cursor += (w = MBSIZE(env->cursor)); + if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))) + { + if (*env->cursor) + { + if (*env->cursor == 'n') + { + pp = env->cursor++; + c = '\n'; + } + else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) + { + env->token.len = 1; + w = magic(env, *env->cursor, 2); + if (env->token.len > 1 || w != T_BAD) + { + if (env->token.len == 1 && (f = classfun(w))) + { + if (inrange > 1) + { + if (env->type < SRE && !(env->flags & REG_LENIENT)) + goto erange; + inrange = 0; + } + env->cursor++; + ce->fun = f; + ce->typ = COLL_call; + ce++; + continue; + } + if (env->token.len > 1 || w >= 0 && w < T_META) + { + c = w; + w = mbconv(mbc, c); + pp = (unsigned char*)mbc; + env->cursor += env->token.len; + } + } + } + } + } + else if (c == ']') + { + if (env->cursor == begin) + { + rp = pp; + rw = w; + inrange = 1; + continue; + } + if (inrange != 0) + { + ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); + if (inrange == 2) + ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0); + } + break; + } + else if (c == '-') + { + if (!inrange && env->cursor != begin && *env->cursor != ']') + { + if (env->type < SRE && !(env->flags & REG_LENIENT)) + goto erange; + continue; + } + else if (inrange == 1) + { + inrange = 2; + continue; + } + } + else if (c == '[') + { + switch (*env->cursor) + { + case 0: + goto error; + case ':': + if (env->regexp) + goto complicated_normal; + if (inrange == 1) + ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); + if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) + { + if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']') + { + switch (c) + { + case '<': + i = REX_WBEG; + break; + case '>': + i = REX_WEND; + break; + default: + i = 0; + break; + } + if (i) + { + env->cursor += 5; + drop(env->disc, e); + return node(env, i, 0, 0, 0); + } + } + env->error = REG_ECTYPE; + goto error; + } + ce->fun = f; + ce->typ = COLL_call; + ce++; + inrange = 0; + continue; + case '=': + if (env->regexp) + goto complicated_normal; + if (inrange == 2) + goto erange; + if (inrange == 1) + ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); + pp = (unsigned char*)cb[inrange]; + rp = env->cursor + 1; + if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX, &wc)) < 0) + goto ecollate; + c = 0; + if (ic) + { + if (iswupper(wc)) + { + wc = towlower(wc); + rw = mbconv((char*)pp, wc); + c = 'u'; + } + else if (iswlower(wc)) + c = 'l'; + } + i = 1; + for (;;) + { + mbxfrm(key.key, (char*)pp, COLL_KEY_MAX); + if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key))) + { + if (i) + { + c = *pp; + goto singleton; + } + goto ecollate; + } + xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc; + if (c == 'l' || c == 'L' && !(c = 0)) + ce->typ = COLL_range_lc; + else if (c == 'u' || c == 'U' && !(c = 0)) + ce->typ = COLL_range_uc; + else + ce->typ = COLL_range; + strcpy((char*)ce->beg, (char*)xc->key); + if (!(cc = (Cchr_t*)dtnext(dt, cc))) + { + if (i) + { + c = *pp; + goto singleton; + } + goto ecollate; + } + if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc))) + cc = tc; + strcpy((char*)ce->end, (char*)cc->key); + ce->max = -1; + ce++; + if (!c) + break; + if (c == 'u') + { + wc = towlower(wc); + c = 'L'; + } + else + { + wc = towupper(wc); + c = 'U'; + } + rw = mbconv((char*)pp, wc); + i = 0; + } + inrange = 0; + c = *pp; + continue; + case '.': + if (env->regexp) + goto complicated_normal; + pp = (unsigned char*)cb[inrange]; + if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX, NiL)) < 0) + goto ecollate; + c = *pp; + break; + default: + complicated_normal: + if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) + goto error; + break; + } + } + singleton: + if (inrange == 2) + { + ce = col(ce, ic, rp, rw, rc, pp, w, c); + if (strcmp((char*)ce->beg, (char*)ce->end) > 0) + { + if (env->type < SRE && !(env->flags & REG_LENIENT)) + goto erange; + (ce-1)->typ = COLL_char; + strcpy((char*)ce->beg, (char*)(ce-1)->end); + ce->typ = COLL_char; + ce++; + } + inrange = env->type >= SRE || (env->flags & REG_LENIENT); + } + else if (inrange == 1) + ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); + else + inrange = 1; + rp = pp; + rw = w; + rc = c; + } + ce->typ = COLL_end; + return e; + } + } +#endif + if (collate) + goto ecollate; + if (env->flags & REG_ICASE) + for (i = 0; i <= UCHAR_MAX; i++) + if (settst(e->re.charclass, i)) + { + if (isupper(i)) + c = tolower(i); + else if (islower(i)) + c = toupper(i); + else + continue; + setadd(e->re.charclass, c); + } + if (neg) + { + for (i = 0; i < elementsof(e->re.charclass->bits); i++) + e->re.charclass->bits[i] ^= ~0; + if (env->explicit >= 0) + setclr(e->re.charclass, env->explicit); + } + return e; + ecollate: + env->error = REG_ECOLLATE; + goto error; + erange: + env->error = REG_ERANGE; + error: + drop(env->disc, e); + if (!env->error) + env->error = REG_EBRACK; + return 0; +} + +static Rex_t* +ccl(Cenv_t* env, int type) +{ + int i; + Rex_t* e; + Celt_t* ce; + regclass_t f; + + if (!(f = classfun(type))) + { + env->error = REG_BADESC; + return 0; + } + if (!mbcoll()) + { + if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) + return 0; + for (i = 0; i <= UCHAR_MAX; i++) + if ((*f)(i)) + setadd(e->re.charclass, i); + if (env->explicit >= 0) + setclr(e->re.charclass, env->explicit); + } + else + { + if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t)))) + return 0; + ce = (Celt_t*)e->re.data; + e->re.collate.invert = 0; + e->re.collate.elements = ce; + ce->fun = f; + ce->typ = COLL_call; + ce++; + ce->typ = COLL_end; + } + return e; +} + +static Rex_t* +rep(Cenv_t* env, Rex_t* e, int number, int last) +{ + Rex_t* f; + unsigned long m = 0; + unsigned long n = RE_DUP_INF; + int minimal = -1; + + if (!e) + return 0; + switch (token(env)) + { + case T_BANG: + eat(env); + if (!(f = node(env, REX_NEG, m, n, 0))) + { + drop(env->disc, e); + return 0; + } + f->re.group.expr.rex = e; + return f; + case T_QUES: + eat(env); + n = 1; + break; + case T_STAR: + eat(env); + break; + case T_PLUS: + eat(env); + m = 1; + break; + case T_LEFT: + eat(env); + m = env->token.min; + n = env->token.max; + break; + default: + return e; + } + if (env->token.att) + minimal = 1; + else if (env->type < SRE) + switch (token(env)) + { + case T_QUES: + eat(env); + minimal = !(env->flags & REG_MINIMAL); + break; + case T_STAR: /*AST*/ + eat(env); + minimal = !!(env->flags & REG_MINIMAL); + break; + } + switch (e->type) + { + case REX_DOT: + case REX_CLASS: + case REX_COLL_CLASS: + case REX_ONECHAR: + e->lo = m; + e->hi = n; + if (minimal >= 0) + mark(e, minimal); + return e; +#if HUH_2002_08_07 + case REX_BEG: +#endif + case REX_BEG_STR: + case REX_END_STR: + case REX_FIN_STR: + case REX_WBEG: + case REX_WEND: + case REX_WORD: + case REX_WORD_NOT: + env->error = REG_BADRPT; + drop(env->disc, e); + return 0; + } + if (m == 1 && n == 1) + { + if (minimal >= 0) + mark(e, minimal); + return e; + } + if (!(f = node(env, REX_REP, m, n, 0))) + { + drop(env->disc, e); + return 0; + } + f->re.group.expr.rex = e; + f->re.group.number = number; + f->re.group.last = last; + if (minimal >= 0) + mark(f, minimal); + if (m <= n && n) + { + for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex); + if (e && e->type == REX_NEG) + f->type = REX_GROUP; + } + return f; +} + +static int +isstring(Rex_t* e) +{ + switch (e->type) + { + case REX_ONECHAR: + return e->lo == 1 && e->hi == 1; + case REX_STRING: + return 1; + } + return 0; +} + +static Trie_node_t* +trienode(Cenv_t* env, int c) +{ + Trie_node_t* t; + + if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t))) + { + memset(t, 0, sizeof(Trie_node_t)); + t->c = c; + } + return t; +} + +static int +insert(Cenv_t* env, Rex_t* f, Rex_t* g) +{ + unsigned char* s; + unsigned char* e; + Trie_node_t* t; + int len; + unsigned char tmp[2]; + + switch (f->type) + { + case REX_ONECHAR: + *(s = tmp) = f->re.onechar; + e = s + 1; + break; + case REX_STRING: + s = f->re.string.base; + e = s + f->re.string.size; + break; + default: + return 1; + } + if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s))) + return 1; + for (len = 1;;) + { + if (t->c == *s) + { + if (++s >= e) + break; + if (!t->son && !(t->son = trienode(env, *s))) + return 1; + t = t->son; + len++; + } + else + { + if (!t->sib && !(t->sib = trienode(env, *s))) + return 1; + t = t->sib; + } + } + if (g->re.trie.min > len) + g->re.trie.min = len; + if (g->re.trie.max < len) + g->re.trie.max = len; + t->end = 1; + return 0; +} + +/* + * trie() tries to combine nontrivial e and f into a REX_TRIE + * unless 0 is returned, e and f are deleted as far as possible + * also called by regcomb + */ + +static Rex_t* +trie(Cenv_t* env, Rex_t* e, Rex_t* f) +{ + Rex_t* g; + + if (e->next || f->next || !isstring(e) || e->flags != f->flags) + return 0; + if (isstring(f)) + { + if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*)))) + return 0; + g->re.trie.min = INT_MAX; + if (insert(env, f, g)) + goto nospace; + drop(env->disc, f); + } + else if (f->type != REX_TRIE) + return 0; + else + g = f; + if (insert(env, e, g)) + goto nospace; + drop(env->disc, e); + return g; + nospace: + if (g != f) + drop(env->disc, g); + return 0; +} + +static Rex_t* alt(Cenv_t*, int, int); + +static int +chr(register Cenv_t* env, int* escaped) +{ + unsigned char* p; + int c; + + *escaped = 0; + if (!(c = *env->cursor)) + return -1; + env->cursor++; + if (c == '\\') + { + if (env->flags & REG_SHELL_ESCAPED) + return c; + if (!(c = *(env->cursor + 1)) || c == env->terminator) + { + if (env->flags & REG_LENIENT) + return c ? c : '\\'; + env->error = REG_EESCAPE; + return -1; + } + p = env->cursor; + c = chresc((char*)env->cursor - 1, (char**)&env->cursor); + *escaped = env->cursor - p; + } + return c; +} + +/* + * open the perly gates + */ + +static Rex_t* +grp(Cenv_t* env, int parno) +{ + Rex_t* e; + Rex_t* f; + int c; + int i; + int n; + int x; + int esc; + int typ; + int beg; + unsigned char* p; + + beg = env->pattern == env->cursor - env->token.len; + if (!(c = env->token.lex) && (c = *env->cursor)) + env->cursor++; + env->token.len = 0; + env->parnest++; + typ = -1; + switch (c) + { + case '-': + case '+': + case 'a': + case 'g': + case 'i': + case 'l': + case 'm': + case 'p': + case 'r': + case 's': + case 'x': + case 'A': + case 'B': + case 'E': + case 'F': + case 'G': + case 'I': + case 'K': + case 'L': + case 'M': /* glob(3) */ + case 'N': /* glob(3) */ + case 'O': /* glob(3) */ + case 'P': + case 'R': /* pcre */ + case 'S': + case 'U': /* pcre */ + case 'X': /* pcre */ + x = REX_GROUP; + i = 1; + env->token.push = 1; + for (;;) + { + switch (c) + { + case ')': + if (!(env->flags & REG_LITERAL)) + { + env->error = REG_BADRPT; + return 0; + } + /*FALLTHROUGH*/ + case 0: + case T_CLOSE: + x = 0; + goto done; + case ':': + eat(env); + if (token(env) == T_CLOSE) + x = 0; + goto done; + case '-': + i = 0; + break; + case '+': + i = 1; + break; + case 'a': + if (i) + env->flags |= (REG_LEFT|REG_RIGHT); + else + env->flags &= ~(REG_LEFT|REG_RIGHT); + break; + case 'g': + if (i) + env->flags &= ~REG_MINIMAL; + else + env->flags |= REG_MINIMAL; + break; + case 'i': + if (i) + env->flags |= REG_ICASE; + else + env->flags &= ~REG_ICASE; + break; + case 'l': + if (i) + env->flags |= REG_LEFT; + else + env->flags &= ~REG_LEFT; + break; + case 'm': + if (i) + env->flags |= REG_NEWLINE; + else + env->flags &= ~REG_NEWLINE; + env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; + break; + case 'p': + if (i) + env->flags &= ~REG_LENIENT; + else + env->flags |= REG_LENIENT; + break; + case 'r': + if (i) + env->flags |= REG_RIGHT; + else + env->flags &= ~REG_RIGHT; + break; + case 's': + if (i) + env->flags |= REG_SPAN; + else + env->flags &= ~REG_SPAN; + env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; + break; + case 'x': + if (i) + env->flags |= REG_COMMENT; + else + env->flags &= ~REG_COMMENT; + break; + case 'X': + if (typ >= 0 || env->type == ERE && (env->flags & REG_CLASS_ESCAPE)) + break; /* PCRE_EXTRA */ + /*FALLTHROUGH*/ + case 'A': + env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); + env->flags |= REG_AUGMENTED|REG_EXTENDED; + typ = ARE; + break; + case 'B': + case 'G': + env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); + typ = BRE; + break; + case 'E': + env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); + env->flags |= REG_EXTENDED; + typ = ERE; + break; + case 'F': + case 'L': + env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); + env->flags |= REG_LITERAL; + typ = ERE; + break; + case 'K': + env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); + env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT; + typ = KRE; + break; + case 'M': + /* used by caller to disable glob(3) GLOB_BRACE */ + break; + case 'N': + /* used by caller to disable glob(3) GLOB_NOCHECK */ + break; + case 'O': + /* used by caller to disable glob(3) GLOB_STARSTAR */ + break; + case 'P': + env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); + env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE; + typ = ERE; + break; + case 'S': + env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); + env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT; + typ = SRE; + break; + case 'U': /* PCRE_UNGREEDY */ + if (typ >= 0 || env->type == ERE && (env->flags & REG_CLASS_ESCAPE)) + { + if (i) + env->flags |= REG_MINIMAL; + else + env->flags &= ~REG_MINIMAL; + } + break; + default: + env->error = REG_BADRPT; + return 0; + } + eat(env); + c = token(env); + } + done: + break; + case ':': + x = REX_GROUP; + break; + case '=': + x = REX_GROUP_AHEAD; + break; + case '!': + x = REX_GROUP_AHEAD_NOT; + break; + case '<': + switch (token(env)) + { + case '=': + x = REX_GROUP_BEHIND; + break; + case '!': + case T_BANG: + x = REX_GROUP_BEHIND_NOT; + break; + default: + env->error = REG_BADRPT; + return 0; + } + eat(env); + break; + case '>': + x = REX_GROUP_CUT; + break; + case '%': + case T_PERCENT: + e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short)); + e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor; + n = 1; + for (;;) + { + switch (i = chr(env, &esc)) + { + case -1: + case 0: + invalid: + env->cursor -= esc + 1; + env->error = REG_EPAREN; + return 0; + case 'D': + x = REX_NEST_delimiter; + /*FALLTHROUGH*/ + delimiter: + if ((i = chr(env, &esc)) < 0) + goto invalid; + if (e->re.nest.type[i] & ~x) + goto invalid; + e->re.nest.type[i] = x; + continue; + case 'E': + x = REX_NEST_escape; + goto delimiter; + case 'L': + x = REX_NEST_literal; + goto quote; + case 'O': + switch (i = chr(env, &esc)) + { + case 'T': + e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator; + break; + default: + goto invalid; + } + continue; + case 'Q': + x = REX_NEST_quote; + /*FALLTHROUGH*/ + quote: + if ((i = chr(env, &esc)) < 0) + goto invalid; + if (e->re.nest.type[i] & ~x) + goto invalid; + e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT); + continue; + case 'S': + x = REX_NEST_separator; + goto delimiter; + case 'T': + x = REX_NEST_terminator; + goto delimiter; + case '|': + case '&': + if (!esc) + goto invalid; + goto nesting; + case '(': + if (!esc) + n++; + goto nesting; + case ')': + if (!esc && !--n) + break; + goto nesting; + default: + nesting: + if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) + goto invalid; + e->re.nest.type[i] = REX_NEST_open; + if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) + goto invalid; + if (!esc) + { + if (x == ')' && !--n) + goto invalid; + else if (x == '(') + n++; + } + e->re.nest.type[x] |= REX_NEST_close; + e->re.nest.type[i] |= x << REX_NEST_SHIFT; + continue; + } + break; + } + env->parnest--; + if (c == T_PERCENT) + for (n = 0; n < 2; n++) + { + parno = ++env->parno; + if (!(f = node(env, REX_GROUP, 0, 0, 0))) + { + drop(env->disc, e); + return 0; + } + if (parno < elementsof(env->paren)) + env->paren[parno] = f; + f->re.group.back = 0; + f->re.group.number = parno; + f->re.group.expr.rex = e; + e = f; + } + return e; + case '(': + c = 0; + if (isdigit(*env->cursor)) + { + f = 0; + do + { + if (c > (INT_MAX / 10)) + { + env->error = REG_BADRPT; + return 0; + } + c = c * 10 + (*env->cursor++ - '0'); + } while (isdigit(*env->cursor)); + if (*env->cursor++ != ')') + { + env->error = REG_BADRPT; + return 0; + } + if (c && env->type >= SRE) + c = c * 2 - 1; + if (!c || c > env->parno || !env->paren[c]) + { + if (!(env->flags & REG_LENIENT)) + { + env->error = REG_ESUBREG; + return 0; + } + if (c) + c = -1; + } + } + else + { + if (env->type < SRE && *env->cursor++ != '?') + { + env->error = REG_BADRPT; + return 0; + } + if (!(f = grp(env, parno + 1)) && env->error) + return 0; + } + if (!(e = node(env, REX_GROUP_COND, 0, 0, 0))) + { + drop(env->disc, f); + return 0; + } + e->re.group.size = c; + e->re.group.expr.binary.left = f; + if (!(e->re.group.expr.binary.right = alt(env, parno, 1))) + { + drop(env->disc, e); + return 0; + } + if (token(env) != T_CLOSE) + { + env->error = REG_EPAREN; + return 0; + } + eat(env); + env->parnest--; + return rep(env, e, parno, parno); + case '{': + p = env->cursor; + n = 1; + while (c = *env->cursor) + { + if (c == '\\' && *(env->cursor + 1)) + env->cursor++; + else if (c == '{') + n++; + else if (c == '}' && !--n) + break; + else if (c == env->delimiter || c == env->terminator) + break; + env->cursor++; + } + if (c != '}') + { + env->error = REG_EBRACE; + return 0; + } + if (*++env->cursor != ')') + { + env->error = REG_EPAREN; + return 0; + } + env->cursor++; + env->parnest--; + if (env->disc->re_version < REG_VERSION_EXEC) + { + env->error = REG_BADRPT; + return 0; + } + if (!env->disc->re_execf) + return 0; + if (!(e = node(env, REX_EXEC, 0, 0, 0))) + return 0; + e->re.exec.text = (const char*)p; + e->re.exec.size = env->cursor - p - 2; + if (!env->disc->re_compf) + e->re.exec.data = 0; + else + e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc); + return e; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + c -= '0'; + while (isdigit(*env->cursor)) + { + if (c > (INT_MAX / 10)) + { + env->error = REG_ESUBREG; + return 0; + } + c = c * 10 + *env->cursor++ - '0'; + } + if (*env->cursor == ')') + { + env->cursor++; + env->parnest--; + env->token.len = 1; + if (c > env->parno || !env->paren[c]) + { + env->error = REG_ESUBREG; + return 0; + } + env->paren[c]->re.group.back = 1; + return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); + } + /*FALLTHROUGH*/ + default: + env->error = REG_BADRPT; + return 0; + } + if (x && !(e = alt(env, parno, 0))) + return 0; + c = token(env); + env->parnest--; + if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')')) + { + env->error = REG_EPAREN; + return 0; + } + eat(env); + if (typ >= 0) + { + if (beg) + env->pattern = env->cursor; + env->type = typ; + } + if (!x) + return 0; + if (!(f = node(env, x, 0, 0, 0))) + { + drop(env->disc, e); + return 0; + } + f->re.group.expr.rex = e; + if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT) + { + if (stats(env, e)) + { + drop(env->disc, f); + if (!env->error) + env->error = REG_ECOUNT; + return 0; + } + f->re.group.size = env->stats.m; + memset(&env->stats, 0, sizeof(env->stats)); + } + switch (x) + { + case REX_GROUP: + case REX_GROUP_CUT: + f = rep(env, f, parno, env->parno); + break; + } + return f; +} + +static Rex_t* +seq(Cenv_t* env) +{ + Rex_t* e; + Rex_t* f; + Token_t tok; + int c; + int i; + int n; + int x; + int parno; + int type; + regflags_t flags; + unsigned char* s; + unsigned char* p; + unsigned char* t; + unsigned char* u; + unsigned char buf[256]; + + for (;;) + { + s = buf; + while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len]) + { + x = c; + p = env->cursor; + if (c >= 0) + { + n = 1; + *s++ = (env->flags & REG_ICASE) ? toupper(c) : c; + } + else if (c == C_ESC || (env->flags & REG_ICASE)) + { + c = (c == C_ESC) ? env->token.lex : mbchar(p); + if (env->flags & REG_ICASE) + c = towupper(c); + if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX) + break; + if ((n = mbconv((char*)s, c)) < 0) + *s++ = c; + else if (n) + s += n; + else + { + n = 1; + *s++ = 0; + } + } + else + { + n = env->token.len - env->token.esc; + for (t = p, u = s + n; s < u; *s++ = *t++); + } + eat(env); + } + if (c == T_BAD) + return 0; + if (s > buf) + switch (c) + { + case T_STAR: + case T_PLUS: + case T_LEFT: + case T_QUES: + case T_BANG: + if ((s -= n) == buf) + e = 0; + else + { + i = s - buf; + if (!(e = node(env, REX_STRING, 0, 0, i))) + return 0; + memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i); + e->re.string.size = i; + } + if (x >= 0) + { + if (!(f = node(env, REX_ONECHAR, 1, 1, 0))) + { + drop(env->disc, e); + return 0; + } + f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x; + } + else + { + if (!(f = node(env, REX_STRING, 0, 0, n))) + return 0; + memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n); + f->re.string.size = n; + } + if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env)))) + { + drop(env->disc, e); + return 0; + } + if (e) + f = cat(env, e, f); + return f; + default: + c = s - buf; + if (!(e = node(env, REX_STRING, 0, 0, c))) + return 0; + memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c); + e->re.string.size = c; + return cat(env, e, seq(env)); + } + else if (c > T_BACK) + { + eat(env); + c -= T_BACK; + if (c > env->parno || !env->paren[c]) + { + env->error = REG_ESUBREG; + return 0; + } + env->paren[c]->re.group.back = 1; + e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); + } + else + switch (c) + { + case T_AND: + case T_CLOSE: + case T_BAR: + case T_END: + return node(env, REX_NULL, 0, 0, 0); + case T_DOLL: + eat(env); + e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0); + break; + case T_CFLX: + eat(env); + if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED)) + e = rep(env, e, 0, 0); + break; + case T_OPEN: + tok = env->token; + eat(env); + flags = env->flags; + type = env->type; + if (env->token.att) + env->flags |= REG_MINIMAL; + env->parnest++; + if (env->type == KRE) + ++env->parno; + parno = ++env->parno; + if (!(e = alt(env, parno + 1, 0))) + break; + if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL)) + { + drop(env->disc, e); + env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL; + return 0; + } + if (token(env) != T_CLOSE) + { + drop(env->disc, e); + env->error = REG_EPAREN; + return 0; + } + env->parnest--; + eat(env); + if (!(f = node(env, REX_GROUP, 0, 0, 0))) + { + drop(env->disc, e); + return 0; + } + if (parno < elementsof(env->paren)) + env->paren[parno] = f; + f->re.group.back = 0; + f->re.group.number = parno; + f->re.group.expr.rex = e; + if (tok.lex) + { + tok.push = 1; + env->token = tok; + } + if (!(e = rep(env, f, parno, env->parno))) + return 0; + if (env->type == KRE) + { + if (!(f = node(env, REX_GROUP, 0, 0, 0))) + { + drop(env->disc, e); + return 0; + } + if (--parno < elementsof(env->paren)) + env->paren[parno] = f; + f->re.group.back = 0; + f->re.group.number = parno; + f->re.group.expr.rex = e; + e = f; + } + env->flags = flags; + env->type = type; + break; + case T_GROUP: + p = env->cursor; + eat(env); + flags = env->flags; + type = env->type; + if (!(e = grp(env, env->parno + 1))) + { + if (env->error) + return 0; + if (env->literal == env->pattern && env->literal == p) + env->literal = env->cursor; + continue; + } + env->flags = flags; + env->type = type; + break; + case T_BRA: + eat(env); + if (e = bra(env)) + e = rep(env, e, 0, 0); + break; + case T_ALNUM: + case T_ALNUM_NOT: + case T_DIGIT: + case T_DIGIT_NOT: + case T_SPACE: + case T_SPACE_NOT: + eat(env); + if (e = ccl(env, c)) + e = rep(env, e, 0, 0); + break; + case T_LT: + eat(env); + e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0); + break; + case T_GT: + eat(env); + e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0); + break; + case T_DOT: + eat(env); + e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); + break; + case T_DOTSTAR: + eat(env); + env->token.lex = T_STAR; + env->token.push = 1; + e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); + break; + case T_SLASHPLUS: + eat(env); + env->token.lex = T_PLUS; + env->token.push = 1; + if (e = node(env, REX_ONECHAR, 1, 1, 0)) + { + e->re.onechar = '/'; + e = rep(env, e, 0, 0); + } + break; + case T_WORD: + eat(env); + e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0); + break; + case T_WORD_NOT: + eat(env); + e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0); + break; + case T_BEG_STR: + eat(env); + e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0); + break; + case T_END_STR: + eat(env); + e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0); + break; + case T_FIN_STR: + eat(env); + e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0); + break; + default: + env->error = REG_BADRPT; + return 0; + } + if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator) + e = cat(env, e, seq(env)); + return e; + } +} + +static Rex_t* +con(Cenv_t* env) +{ + Rex_t* e; + Rex_t* f; + Rex_t* g; + + if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND) + return e; + eat(env); + if (!(f = con(env))) + { + drop(env->disc, e); + return 0; + } + if (!(g = node(env, REX_CONJ, 0, 0, 0))) + { + drop(env->disc, e); + drop(env->disc, f); + return 0; + } + g->re.group.expr.binary.left = e; + g->re.group.expr.binary.right = f; + return g; +} + +static Rex_t* +alt(Cenv_t* env, int number, int cond) +{ + Rex_t* e; + Rex_t* f; + Rex_t* g; + + if (!(e = con(env))) + return 0; + else if (token(env) != T_BAR) + { + if (!cond) + return e; + f = 0; + if (e->type == REX_NULL) + goto bad; + } + else + { + eat(env); + if (!(f = alt(env, number, 0))) + { + drop(env->disc, e); + return 0; + } + if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL)) + goto bad; + if (!cond && (g = trie(env, e, f))) + return g; + } + if (!(g = node(env, REX_ALT, 0, 0, 0))) + { + env->error = REG_ESPACE; + goto bad; + } + g->re.group.number = number; + g->re.group.last = env->parno; + g->re.group.expr.binary.left = e; + g->re.group.expr.binary.right = f; + return g; + bad: + drop(env->disc, e); + drop(env->disc, f); + if (!env->error) + env->error = REG_ENULL; + return 0; +} + +/* + * add v to REX_BM tables + */ + +static void +bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b) +{ + int c; + int m; + size_t z; + + for (m = 0; m < n; m++) + { + if (!(z = n - m - 1)) + z = HIT; + c = v[m]; + a->re.bm.mask[m][c] |= b; + if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) + a->re.bm.skip[c] = z; + if (a->flags & REG_ICASE) + { + if (isupper(c)) + c = tolower(c); + else if (islower(c)) + c = toupper(c); + else + continue; + a->re.bm.mask[m][c] |= b; + if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) + a->re.bm.skip[c] = z; + } + } +} + +/* + * set up BM table from trie + */ + +static int +bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b) +{ + do + { + v[m] = x->c; + if (m >= (n - 1)) + { + bmstr(env, a, v, n, b); + if (!(b <<= 1)) + { + b = 1; + a->re.bm.complete = 0; + } + else if (x->son) + a->re.bm.complete = 0; + } + else if (x->son) + b = bmtrie(env, a, v, x->son, n, m + 1, b); + } while (x = x->sib); + return b; +} + +/* + * rewrite the expression tree for some special cases + * 1. it is a null expression - illegal + * 2. max length fixed string found -- use BM algorithm + * 3. it begins with an unanchored string - use KMP algorithm + * 0 returned on success + */ + +static int +special(Cenv_t* env, regex_t* p) +{ + Rex_t* a; + Rex_t* e; + Rex_t* t; + Rex_t* x; + Rex_t* y; + unsigned char* s; + int* f; + int n; + int m; + int k; + + DEBUG_INIT(); + if (e = p->env->rex) + { + if ((x = env->stats.x) && x->re.string.size < 3) + x = 0; + if ((t = env->stats.y) && t->re.trie.min < 3) + t = 0; + if (x && t) + { + if (x->re.string.size >= t->re.trie.min) + t = 0; + else + x = 0; + } + if (x || t) + { + Bm_mask_t** mask; + Bm_mask_t* h; + unsigned char* v; + size_t* q; + unsigned long l; + int i; + int j; + + if (x) + { + y = x; + n = m = x->re.string.size; + l = env->stats.l; + } + else + { + y = t; + n = t->re.trie.min; + m = t->re.trie.max; + l = env->stats.k; + } + if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t)))) + return 1; + if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t)))) + { + alloc(env->disc, q, 0); + return 1; + } + a->flags = y->flags; + a->map = y->map; + a->re.bm.size = n; + a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1; + a->re.bm.left = l - 1; + a->re.bm.right = env->stats.m - l - n; + a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n; + h = (Bm_mask_t*)&a->re.bm.mask[n]; + a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1)); + a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1]; + for (m = 0; m <= UCHAR_MAX; m++) + a->re.bm.skip[m] = n; + a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left); + for (i = 1; i <= n; i++) + a->re.bm.fail[i] = 2 * n - i; + mask = a->re.bm.mask; + for (m = 0; m < n; m++) + { + mask[m] = h; + h += UCHAR_MAX + 1; + } + if (x) + bmstr(env, a, x->re.string.base, n, 1); + else + { + v = (unsigned char*)q; + memset(v, 0, n); + m = 1; + for (i = 0; i <= UCHAR_MAX; i++) + if (t->re.trie.root[i]) + m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m); + } + mask--; + memset(q, 0, n * sizeof(*q)); + for (k = (j = n) + 1; j > 0; j--, k--) + { + DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0)); + for (q[j] = k; k <= n; k = q[k]) + { + for (m = 0; m <= UCHAR_MAX; m++) + if (mask[k][m] == mask[j][m]) + { + DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0)); + goto cut; + } + DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0)); + if (a->re.bm.fail[k] > n - j) + a->re.bm.fail[k] = n - j; + } + cut: ; + } + for (i = 1; i <= n; i++) + if (a->re.bm.fail[i] > n + k - i) + { + DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0)); + a->re.bm.fail[i] = n + k - i; + } +#if _AST_REGEX_DEBUG + if (DEBUG_TEST(0x0020,(1),(0))) + { + sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0); + for (m = 0; m < n; m++) + for (i = 1; i <= UCHAR_MAX; i++) + if (a->re.bm.mask[m][i]) + sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]); + for (i = ' '; i <= UCHAR_MAX; i++) + if (a->re.bm.skip[i] >= HIT) + sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i); + else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n) + sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]); + for (j = 31; j >= 0; j--) + { + sfprintf(sfstderr, " "); + next: + for (m = 0; m < n; m++) + { + for (i = 0040; i < 0177; i++) + if (a->re.bm.mask[m][i] & (1 << j)) + { + sfprintf(sfstderr, " %c", i); + break; + } + if (i >= 0177) + { + if (j > 0) + { + j--; + goto next; + } + sfprintf(sfstderr, " ?"); + } + } + sfprintf(sfstderr, "\n"); + } + sfprintf(sfstderr, "FAIL: "); + for (m = 1; m <= n; m++) + sfprintf(sfstderr, "%3d", a->re.bm.fail[m]); + sfprintf(sfstderr, "\n"); + } +#endif + alloc(env->disc, q, 0); + a->next = e; + p->env->rex = a; + return 0; + } + switch (e->type) + { + case REX_BEG: + if (env->flags & REG_NEWLINE) + return 0; + break; + case REX_GROUP: + if (env->stats.b) + return 0; + e = e->re.group.expr.rex; + if (e->type != REX_DOT) + return 0; + /*FALLTHROUGH*/ + case REX_DOT: + if (e->lo == 0 && e->hi == RE_DUP_INF) + break; + return 0; + case REX_NULL: + if (env->flags & REG_NULL) + break; + env->error = REG_ENULL; + return 1; + case REX_STRING: + if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map) + return 0; + s = e->re.string.base; + n = e->re.string.size; + if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1)))) + return 1; + a->flags = e->flags; + a->map = e->map; + f = a->re.string.fail; + memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n); + s = a->re.string.base; + a->re.string.size = n; + f[0] = m = -1; + for (k = 1; k < n; k++) + { + while (m >= 0 && s[m+1] != s[k]) + m = f[m]; + if (s[m+1] == s[k]) + m++; + f[k] = m; + } + a->next = e->next; + p->env->rex = a; + e->next = 0; + drop(env->disc, e); + break; + default: + return 0; + } + } + p->env->once = 1; + return 0; +} + +int +regcomp(regex_t* p, const char* pattern, regflags_t flags) +{ + Rex_t* e; + Rex_t* f; + regdisc_t* disc; + unsigned char* fold; + int i; + Cenv_t env; + + if (!p) + return REG_BADPAT; + if (flags & REG_DISCIPLINE) + { + flags &= ~REG_DISCIPLINE; + disc = p->re_disc; + } + else + disc = &state.disc; + if (!disc->re_errorlevel) + disc->re_errorlevel = 2; + p->env = 0; + if (!pattern) + return fatal(disc, REG_BADPAT, pattern); + if (!state.initialized) + { + state.initialized = 1; + for (i = 0; i < elementsof(state.escape); i++) + state.magic[state.escape[i].key] = state.escape[i].val; + } + if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data)) + { + if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1))) + return fatal(disc, REG_ESPACE, pattern); + for (i = 0; i <= UCHAR_MAX; i++) + fold[i] = toupper(i); + LCINFO(AST_LC_CTYPE)->data = (void*)fold; + } + again: + if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t)))) + return fatal(disc, REG_ESPACE, pattern); + memset(p->env, 0, sizeof(*p->env)); + memset(&env, 0, sizeof(env)); + env.regex = p; + env.flags = flags; + env.disc = p->env->disc = disc; + if (env.flags & REG_AUGMENTED) + env.flags |= REG_EXTENDED; + env.mappeddot = '.'; + env.mappednewline = '\n'; + env.mappedslash = '/'; + if (disc->re_version >= REG_VERSION_MAP && disc->re_map) + { + env.map = disc->re_map; + env.MAP = p->env->fold; + for (i = 0; i <= UCHAR_MAX; i++) + { + env.MAP[i] = fold[env.map[i]]; + if (env.map[i] == '.') + env.mappeddot = i; + if (env.map[i] == '\n') + env.mappednewline = i; + if (env.map[i] == '/') + env.mappedslash = i; + } + } + else + env.MAP = fold; + env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE; + env.explicit = -1; + if (env.flags & REG_SHELL) + { + if (env.flags & REG_SHELL_PATH) + env.explicit = env.mappedslash; + if (!(env.flags & REG_SHELL_ESCAPED)) + env.flags |= REG_CLASS_ESCAPE; + env.flags |= REG_LENIENT|REG_NULL; + env.type = env.type == BRE ? SRE : KRE; + } + else + env.flags &= ~(REG_SHELL_DOT|REG_SHELL_ESCAPED|REG_SHELL_GROUP|REG_SHELL_PATH); + if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE) + env.explicit = env.mappednewline; + p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1; + env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL)); + env.regexp = !!(env.flags & REG_REGEXP); + env.token.lex = 0; + env.token.push = 0; + if (env.flags & REG_DELIMITED) + { + switch (env.delimiter = *pattern++) + { + case 0: + case '\\': + case '\n': + case '\r': + env.error = REG_EDELIM; + goto bad; + } + env.terminator = '\n'; + } + env.literal = env.pattern = env.cursor = (unsigned char*)pattern; + if (!(p->env->rex = alt(&env, 1, 0))) + goto bad; + if (env.parnest) + { + env.error = REG_EPAREN; + goto bad; + } + if ((env.flags & REG_LEFT) && p->env->rex->type != REX_BEG) + { + if (p->env->rex->type == REX_ALT) + env.flags &= ~REG_FIRST; + if (!(e = node(&env, REX_BEG, 0, 0, 0))) + { + regfree(p); + return fatal(disc, REG_ESPACE, pattern); + } + e->next = p->env->rex; + p->env->rex = e; + p->env->once = 1; + } + for (e = p->env->rex; e->next; e = e->next); + p->env->done.type = REX_DONE; + p->env->done.flags = e->flags; + if ((env.flags & REG_RIGHT) && e->type != REX_END) + { + if (p->env->rex->type == REX_ALT) + env.flags &= ~REG_FIRST; + if (!(f = node(&env, REX_END, 0, 0, 0))) + { + regfree(p); + return fatal(disc, REG_ESPACE, pattern); + } + f->flags = e->flags; + f->map = e->map; + e->next = f; + } + if (stats(&env, p->env->rex)) + { + if (!env.error) + env.error = REG_ECOUNT; + goto bad; + } + if (env.stats.b) + p->env->hard = p->env->separate = 1; + else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c))) + p->env->hard = 1; + if (p->env->hard || env.stats.c || env.stats.i) + p->env->stats.re_min = p->env->stats.re_max = -1; + else + { + if (!(p->env->stats.re_min = env.stats.m)) + p->env->stats.re_min = -1; + if (!(p->env->stats.re_max = env.stats.n)) + p->env->stats.re_max = -1; + } + if (special(&env, p)) + goto bad; + serialize(&env, p->env->rex, 1); + p->re_nsub = env.stats.p; + if (env.type == KRE) + p->re_nsub /= 2; + if (env.flags & REG_DELIMITED) + { + p->re_npat = env.cursor - env.pattern + 1; + if (*env.cursor == env.delimiter) + p->re_npat++; + else if (env.flags & REG_MUSTDELIM) + { + env.error = REG_EDELIM; + goto bad; + } + else + env.flags &= ~REG_DELIMITED; + } + p->env->explicit = env.explicit; + p->env->flags = env.flags & REG_COMP; + p->env->min = env.stats.m; + p->env->nsub = env.stats.p + env.stats.u; + p->env->refs = 1; + return 0; + bad: + regfree(p); + if (!env.error) + env.error = REG_ESPACE; + if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL)) + { + flags |= REG_LITERAL; + pattern = (const char*)env.literal; + goto again; + } + return fatal(disc, env.error, pattern); +} + +/* + * regcomp() on sized pattern + * the lazy way around adding and checking env.end + */ + +int +regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags) +{ + char* s; + int r; + + if (!(s = malloc(size + 1))) + return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern); + memcpy(s, pattern, size); + s[size] = 0; + r = regcomp(p, s, flags); + free(s); + return r; +} + +/* + * combine two compiled regular expressions if possible, + * replacing first with the combination and freeing second. + * return 0 on success. + * the only combinations handled are building a Trie + * from String|Kmp|Trie and String|Kmp + */ + +int +regcomb(regex_t* p, regex_t* q) +{ + Rex_t* e = p->env->rex; + Rex_t* f = q->env->rex; + Rex_t* g; + Rex_t* h; + Cenv_t env; + + if (!e || !f) + return fatal(p->env->disc, REG_BADPAT, NiL); + if (p->env->separate || q->env->separate) + return REG_ESUBREG; + memset(&env, 0, sizeof(env)); + env.disc = p->env->disc; + if (e->type == REX_BM) + { + p->env->rex = e->next; + e->next = 0; + drop(env.disc, e); + e = p->env->rex; + } + if (f->type == REX_BM) + { + q->env->rex = f->next; + f->next = 0; + drop(env.disc, f); + f = q->env->rex; + } + if (e->type == REX_BEG && f->type == REX_BEG) + { + p->env->flags |= REG_LEFT; + p->env->rex = e->next; + e->next = 0; + drop(env.disc, e); + e = p->env->rex; + q->env->rex = f->next; + f->next = 0; + drop(env.disc, f); + f = q->env->rex; + } + for (g = e; g->next; g = g->next); + for (h = f; h->next; h = h->next); + if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END) + { + p->env->flags |= REG_RIGHT; + drop(env.disc, g->next); + g->next = 0; + drop(env.disc, h->next); + h->next = 0; + } + if (!(g = trie(&env, f, e))) + return fatal(p->env->disc, REG_BADPAT, NiL); + p->env->rex = g; + if (!q->env->once) + p->env->once = 0; + q->env->rex = 0; + if (p->env->flags & REG_LEFT) + { + if (!(e = node(&env, REX_BEG, 0, 0, 0))) + { + regfree(p); + return fatal(p->env->disc, REG_ESPACE, NiL); + } + e->next = p->env->rex; + p->env->rex = e; + p->env->once = 1; + } + if (p->env->flags & REG_RIGHT) + { + for (f = p->env->rex; f->next; f = f->next); + if (f->type != REX_END) + { + if (!(e = node(&env, REX_END, 0, 0, 0))) + { + regfree(p); + return fatal(p->env->disc, REG_ESPACE, NiL); + } + f->next = e; + } + } + env.explicit = p->env->explicit; + env.flags = p->env->flags; + env.disc = p->env->disc; + if (stats(&env, p->env->rex)) + { + regfree(p); + return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL); + } + if (special(&env, p)) + { + regfree(p); + return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL); + } + p->env->min = g->re.trie.min; + return 0; +} + +/* + * copy a reference of p into q + * p and q may then have separate regsubcomp() instantiations + */ + +int +regdup(regex_t* p, regex_t* q) +{ + if (!p || !q) + return REG_BADPAT; + *q = *p; + p->env->refs++; + q->re_sub = 0; + return 0; +} diff --git a/src/lib/libast/regex/regdecomp.c b/src/lib/libast/regex/regdecomp.c new file mode 100644 index 0000000..a9fe6f3 --- /dev/null +++ b/src/lib/libast/regex/regdecomp.c @@ -0,0 +1,448 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex decompiler + */ + +#include "reglib.h" + +#undef ismeta +#define ismeta(c,t,e,d) (state.magic[c] && state.magic[c][(t)+(e)] >= T_META || (c) == (d)) +#define meta(f,c,t,e,d) do { if (ismeta(c,t,e,d)) sfputc(f, '\\'); sfputc(f, c); } while (0) + +static void +detrie(Trie_node_t* x, Sfio_t* sp, char* b, char* p, char* e, int delimiter) +{ + register Trie_node_t* y; + char* o; + int k; + + o = p; + k = 1; + do + { + if (k) + { + o = p; + if (p < e) + *p++ = x->c; + } + sfputc(sp, x->c); + for (y = x->sib; y; y = y->sib) + { + sfputc(sp, '|'); + sfputc(sp, '<'); + sfwrite(sp, b, p - b); + sfputc(sp, '>'); + detrie(y, sp, b, p, e, delimiter); + } + if (x->end && x->son) + { + sfputc(sp, '|'); + sfputc(sp, '{'); + sfwrite(sp, b, p - b); + sfputc(sp, '}'); + p = o; + } + } while (x = x->son); +} + +static int +decomp(register Rex_t* e, Sfio_t* sp, int type, int delimiter, regflags_t flags) +{ + Rex_t* q; + unsigned char* s; + unsigned char* t; + int c; + int m; + int cb; + int cd; + int cr; + int ib; + int ie; + int nb; + int ne; + unsigned char ic[2*UCHAR_MAX]; + unsigned char nc[2*UCHAR_MAX]; + + do + { + switch (e->type) + { + case REX_ALT: + if (decomp(e->re.group.expr.binary.left, sp, type, delimiter, flags)) + return 1; + sfputc(sp, '|'); + if (e->re.group.expr.binary.right && decomp(e->re.group.expr.binary.right, sp, type, delimiter, flags)) + return 1; + break; + case REX_BACK: + sfprintf(sp, "\\%d", e->lo); + break; + case REX_BEG: + if (type < SRE) + sfputc(sp, '^'); + break; + case REX_END: + if (type < SRE) + sfputc(sp, '$'); + break; + case REX_WBEG: + meta(sp, '<', type, 1, delimiter); + break; + case REX_WEND: + meta(sp, '<', type, 1, delimiter); + break; + case REX_WORD: + sfprintf(sp, "\\w"); + break; + case REX_CLASS: + case REX_COLL_CLASS: + case REX_ONECHAR: + case REX_DOT: + case REX_REP: + if (type >= SRE) + { + c = ')'; + if (e->hi == RE_DUP_INF) + { + if (!e->lo) + sfputc(sp, '*'); + else if (e->lo == 1) + sfputc(sp, '+'); + else + sfprintf(sp, "{%d,}", e->lo); + } + else if (e->hi != 1) + sfprintf(sp, "{%d,%d}", e->lo, e->hi); + else if (e->lo == 0) + sfputc(sp, '?'); + else + c = 0; + } + switch (e->type) + { + case REX_REP: + if (decomp(e->re.group.expr.rex, sp, type, delimiter, flags)) + return 1; + break; + case REX_CLASS: + sfputc(sp, '['); + nb = ne = ib = ie = -2; + cb = cd = cr = 0; + s = nc; + t = ic; + for (m = 0; m <= UCHAR_MAX; m++) + if (settst(e->re.charclass, m)) + { + if (m == ']') + cb = 1; + else if (m == '-') + cr = 1; + else if (m == delimiter) + cd = 1; + else if (nb < 0) + ne = nb = m; + else if (ne == (m - 1)) + ne = m; + else + { + if (ne == nb) + *s++ = ne; + else + { + *s++ = nb; + *s++ = '-'; + *s++ = ne; + } + ne = nb = m; + } + } + else + { + if (m == ']') + cb = -1; + else if (m == '-') + cr = -1; + else if (m == delimiter) + cd = -1; + else if (ib < 0) + ie = ib = m; + else if (ie == (m - 1)) + ie = m; + else + { + if (ie == ib) + *t++ = ie; + else + { + *t++ = ib; + *t++ = '-'; + *t++ = ie; + } + ie = ib = m; + } + } + if (nb >= 0) + { + *s++ = nb; + if (ne != nb) + { + *s++ = '-'; + *s++ = ne; + } + } + if (ib >= 0) + { + *t++ = ib; + if (ie != ib) + { + *t++ = '-'; + *t++ = ie; + } + } + if ((t - ic + 1) < (s - nc + (nc[0] == '^'))) + { + sfputc(sp, '^'); + if (cb < 0) + sfputc(sp, ']'); + if (cr < 0) + sfputc(sp, '-'); + if (cd < 0 && delimiter > 0) + { + if (flags & REG_ESCAPE) + sfputc(sp, '\\'); + sfputc(sp, delimiter); + } + sfwrite(sp, ic, t - ic); + } + else + { + if (cb > 0) + sfputc(sp, ']'); + if (cr > 0) + sfputc(sp, '-'); + if (cd > 0 && delimiter > 0) + { + if (flags & REG_ESCAPE) + sfputc(sp, '\\'); + sfputc(sp, delimiter); + } + if (nc[0] == '^') + { + sfwrite(sp, nc + 1, s - nc - 1); + sfputc(sp, '^'); + } + else + sfwrite(sp, nc, s - nc); + } + sfputc(sp, ']'); + break; + case REX_COLL_CLASS: + break; + case REX_ONECHAR: + meta(sp, e->re.onechar, type, 0, delimiter); + break; + case REX_DOT: + sfputc(sp, '.'); + break; + } + if (type < SRE) + { + if (e->hi == RE_DUP_INF) + { + if (!e->lo) + sfputc(sp, '*'); + else if (e->lo == 1 && ismeta('+', type, 0, delimiter)) + meta(sp, '+', type, 1, delimiter); + else + { + meta(sp, '{', type, 1, delimiter); + sfprintf(sp, "%d,", e->lo); + meta(sp, '}', type, 1, delimiter); + } + } + else if (e->hi != 1 || e->lo == 0 && !ismeta('?', type, 0, delimiter)) + { + meta(sp, '{', type, 1, delimiter); + sfprintf(sp, "%d,%d", e->lo, e->hi); + meta(sp, '}', type, 1, delimiter); + } + else if (e->lo == 0) + meta(sp, '?', type, 1, delimiter); + } + else if (c) + sfputc(sp, c); + break; + case REX_STRING: + case REX_KMP: + t = (s = e->re.string.base) + e->re.string.size; + while (s < t) + { + c = *s++; + meta(sp, c, type, 0, delimiter); + } + break; + case REX_TRIE: + ib = 0; + for (c = 0; c <= UCHAR_MAX; c++) + if (e->re.trie.root[c]) + { + char pfx[1024]; + + if (ib) + sfputc(sp, '|'); + else + ib = 1; + detrie(e->re.trie.root[c], sp, pfx, pfx, &pfx[sizeof(pfx)], delimiter); + } + break; + case REX_NEG: + if (type >= SRE) + sfprintf(sp, "!("); + if (decomp(e->re.group.expr.rex, sp, type, delimiter, flags)) + return 1; + if (type >= SRE) + sfputc(sp, ')'); + else + sfputc(sp, '!'); + break; + case REX_CONJ: + if (decomp(e->re.group.expr.binary.left, sp, type, delimiter, flags)) + return 1; + sfputc(sp, '&'); + if (decomp(e->re.group.expr.binary.right, sp, type, delimiter, flags)) + return 1; + break; + case REX_GROUP: + if (type >= SRE) + sfputc(sp, '@'); + meta(sp, '(', type, 1, delimiter); + if (decomp(e->re.group.expr.rex, sp, type, delimiter, flags)) + return 1; + meta(sp, ')', type, 1, delimiter); + break; + case REX_GROUP_AHEAD: + case REX_GROUP_AHEAD_NOT: + case REX_GROUP_BEHIND: + case REX_GROUP_BEHIND_NOT: + meta(sp, '(', type, 1, delimiter); + sfputc(sp, '?'); + if (decomp(e->re.group.expr.rex, sp, type, delimiter, flags)) + return 1; + meta(sp, ')', type, 1, delimiter); + break; + case REX_GROUP_COND: + meta(sp, '(', type, 1, delimiter); + sfputc(sp, '?'); + if (e->re.group.expr.binary.left && decomp(e->re.group.expr.binary.left, sp, type, delimiter, flags)) + return 1; + if (q = e->re.group.expr.binary.right) + { + sfputc(sp, ':'); + if (q->re.group.expr.binary.left && decomp(q->re.group.expr.binary.left, sp, type, delimiter, flags)) + return 1; + sfputc(sp, ':'); + if (q->re.group.expr.binary.right && decomp(q->re.group.expr.binary.right, sp, type, delimiter, flags)) + return 1; + } + meta(sp, ')', type, 1, delimiter); + break; + case REX_GROUP_CUT: + meta(sp, '(', type, 1, delimiter); + sfputc(sp, '?'); + if (decomp(e->re.group.expr.rex, sp, type, delimiter, flags)) + return 1; + meta(sp, ')', type, 1, delimiter); + break; + case REX_BM: + break; + default: + sfprintf(sp, "<ERROR:REX_%d>", e->type); + break; + } + } while (e = e->next); + return 0; +} + +/* + * reconstruct pattern from compiled re p into sp + */ + +size_t +regdecomp(regex_t* p, regflags_t flags, char* buf, size_t n) +{ + Sfio_t* sp; + char* s; + int type; + int delimiter; + size_t r; + + if (!(sp = sfstropen())) + return 0; + if (flags == (regflags_t)~0) + flags = p->env->flags; + switch (flags & (REG_AUGMENTED|REG_EXTENDED|REG_SHELL)) + { + case 0: + type = BRE; + break; + case REG_AUGMENTED: + case REG_AUGMENTED|REG_EXTENDED: + type = ARE; + break; + case REG_EXTENDED: + type = ERE; + break; + case REG_SHELL: + type = SRE; + break; + default: + type = KRE; + break; + } + if (flags & REG_DELIMITED) + { + delimiter = '/'; + sfputc(sp, delimiter); + } + else + delimiter = -1; + if (decomp(p->env->rex, sp, type, delimiter, flags)) + r = 0; + else + { + if (delimiter > 0) + sfputc(sp, delimiter); + if ((r = sfstrtell(sp) + 1) <= n) + { + if (!(s = sfstruse(sp))) + r = 0; + else + memcpy(buf, s, r); + } + } + sfstrclose(sp); + return r; +} diff --git a/src/lib/libast/regex/regerror.c b/src/lib/libast/regex/regerror.c new file mode 100644 index 0000000..3d0e40d --- /dev/null +++ b/src/lib/libast/regex/regerror.c @@ -0,0 +1,95 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex error message handler + */ + +static const char id[] = "\n@(#)$Id: regex (AT&T Research) 2010-09-22 $\0\n"; + +#include "reglib.h" + +static const char* reg_error[] = +{ + /* REG_ENOSYS */ "not supported", + /* REG_SUCCESS */ "success", + /* REG_NOMATCH */ "no match", + /* REG_BADPAT */ "invalid regular expression", + /* REG_ECOLLATE */ "invalid collation element", + /* REG_ECTYPE */ "invalid character class", + /* REG_EESCAPE */ "trailing \\ in pattern", + /* REG_ESUBREG */ "invalid \\digit backreference", + /* REG_EBRACK */ "[...] imbalance", + /* REG_EPAREN */ "\\(...\\) or (...) imbalance", + /* REG_EBRACE */ "\\{...\\} or {...} imbalance", + /* REG_BADBR */ "invalid {...} digits", + /* REG_ERANGE */ "invalid [...] range endpoint", + /* REG_ESPACE */ "out of space", + /* REG_BADRPT */ "unary op not preceded by re", + /* REG_ENULL */ "empty subexpr in pattern", + /* REG_ECOUNT */ "re component count overflow", + /* REG_BADESC */ "invalid \\char escape", + /* REG_VERSIONID*/ &id[10], + /* REG_EFLAGS */ "conflicting flags", + /* REG_EDELIM */ "invalid or omitted delimiter", + /* REG_PANIC */ "unrecoverable internal error", +}; + +size_t +regerror(int code, const regex_t* p, char* buf, size_t size) +{ + const char* s; + + NoP(p); + if (code++ == REG_VERSIONID) + s = (const char*)fmtident(&id[1]); + else if (code >= 0 && code < elementsof(reg_error)) + s = reg_error[code]; + else + s = (const char*)"unknown error"; + if (size) + { + strlcpy(buf, s, size); + buf[size - 1] = 0; + } + else + buf = (char*)s; + return strlen(buf) + 1; +} + +/* + * discipline error intercept + */ + +int +fatal(regdisc_t* disc, int code, const char* pattern) +{ + if (disc->re_errorf) + { + if (pattern) + (*disc->re_errorf)(NiL, disc, disc->re_errorlevel, "regular expression: %s: %s", pattern, reg_error[code+1]); + else + (*disc->re_errorf)(NiL, disc, disc->re_errorlevel, "regular expression: %s", reg_error[code+1]); + } + return code; +} diff --git a/src/lib/libast/regex/regexec.c b/src/lib/libast/regex/regexec.c new file mode 100644 index 0000000..fb8f428 --- /dev/null +++ b/src/lib/libast/regex/regexec.c @@ -0,0 +1,54 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex executor + * single unsized-string interface + */ + +#include "reglib.h" + +/* + * standard wrapper for the sized-record interface + */ + +int +regexec(const regex_t* p, const char* s, size_t nmatch, regmatch_t* match, regflags_t flags) +{ + if (flags & REG_STARTEND) + { + int r; + int m = match->rm_so; + regmatch_t* e; + + if (!(r = regnexec(p, s + m, match->rm_eo - m, nmatch, match, flags)) && m > 0) + for (e = match + nmatch; match < e; match++) + if (match->rm_so >= 0) + { + match->rm_so += m; + match->rm_eo += m; + } + return r; + } + return regnexec(p, s, s ? strlen(s) : 0, nmatch, match, flags); +} diff --git a/src/lib/libast/regex/regfatal.c b/src/lib/libast/regex/regfatal.c new file mode 100644 index 0000000..09e8689 --- /dev/null +++ b/src/lib/libast/regex/regfatal.c @@ -0,0 +1,49 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex fatal error interface to error() + */ + +#include "reglib.h" + +#include <error.h> + +void +regfatalpat(regex_t* p, int level, int code, const char* pat) +{ + char buf[128]; + + regerror(code, p, buf, sizeof(buf)); + regfree(p); + if (pat) + error(level, "regular expression: %s: %s", pat, buf); + else + error(level, "regular expression: %s", buf); +} + +void +regfatal(regex_t* p, int level, int code) +{ + regfatalpat(p, level, code, NiL); +} diff --git a/src/lib/libast/regex/reginit.c b/src/lib/libast/regex/reginit.c new file mode 100644 index 0000000..e4719b4 --- /dev/null +++ b/src/lib/libast/regex/reginit.c @@ -0,0 +1,412 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex state and alloc + */ + +#include "reglib.h" + +#if _PACKAGE_ast + +#include <ccode.h> + +#else + +#define CC_bel '\a' +#define CC_esc '\033' +#define CC_vt '\v' + +#endif + +/* + * state shared by all threads + */ + +State_t state = +{ + { -1, -1 }, + + /* + * escape code table + * the "funny" things get special treatment at ends of BRE + * + * BRE 0:normal 1:escaped 2:escaped-char-class + * ERE 3:normal 4:escaped 5:escaped-char-class + * ARE 6:normal 7:escaped 8:escaped-char-class + * SRE 9:normal 10:escaped 11:escaped-char-class + * KRE 12:normal 13:escaped 14:escaped-char-class + */ + + '\\', + '\\', '\\', '\\', + '\\', '\\', '\\', + '\\', '\\', '\\', + '\\', '\\', '\\', + '\\', '\\', '\\', + '^', /* funny */ + '^', '^', '^', + T_CFLX, '^', '^', + T_CFLX, '^', '^', + '^', '^', '^', + '^', '^', '^', + '.', + T_DOT, '.', T_BAD, + T_DOT, '.', T_BAD, + T_DOT, '.', T_BAD, + '.', '.', T_BAD, + '.', '.', T_BAD, + '$', /* funny */ + '$', '$', T_BAD, + T_DOLL, '$', T_BAD, + T_DOLL, '$', T_BAD, + '$', '$', T_BAD, + '$', '$', T_BAD, + '*', + T_STAR, '*', T_BAD, + T_STAR, '*', T_BAD, + T_STAR, '*', T_BAD, + T_STAR, '*', '*', + T_STAR, '*', '*', + '[', + T_BRA, '[', '[', + T_BRA, '[', '[', + T_BRA, '[', '[', + T_BRA, '[', '[', + T_BRA, '[', '[', + '|', + '|', T_BAD, T_BAD, + T_BAR, '|', T_BAD, + T_BAR, '|', T_BAD, + '|', '|', T_BAD, + T_BAR, '|', T_BAD, + '+', + '+', T_BAD, T_BAD, + T_PLUS, '+', T_BAD, + T_PLUS, '+', T_BAD, + '+', '+', T_BAD, + T_PLUS, '+', T_BAD, + '?', + '?', T_BAD, T_BAD, + T_QUES, '?', T_BAD, + T_QUES, '?', T_BAD, + T_QUES, '?', '?', + T_QUES, '?', '?', + '(', + '(', T_OPEN, T_BAD, + T_OPEN, '(', T_BAD, + T_OPEN, '(', T_BAD, + '(', '(', '(', + T_OPEN, '(', '(', + ')', + ')', T_CLOSE, T_BAD, + T_CLOSE, ')', T_BAD, + T_CLOSE, ')', T_BAD, + ')', ')', ')', + T_CLOSE, ')', ')', + '{', + '{', T_LEFT, T_BAD, + T_LEFT, '{', T_BAD, + T_LEFT, '{', T_BAD, + '{', '{', '{', + T_LEFT, '{', '{', + '}', + '}', T_RIGHT, T_BAD, + '}', T_BAD, T_BAD, + '}', T_BAD, T_BAD, + '}', '}', '}', + '}', '}', '}', + '&', + '&', T_BAD, T_BAD, + '&', T_AND, T_BAD, + T_AND, '&', T_BAD, + '&', '&', T_BAD, + T_AND, '&', T_BAD, + '!', + '!', T_BAD, T_BAD, + '!', T_BANG, T_BAD, + T_BANG, '!', T_BAD, + '!', '!', T_BAD, + T_BANG, '!', T_BAD, + '@', + '@', T_BAD, T_BAD, + '@', T_BAD, T_BAD, + '@', T_BAD, T_BAD, + '@', '@', T_BAD, + T_AT, '@', T_BAD, + '~', + '~', T_BAD, T_BAD, + '~', T_BAD, T_BAD, + '~', T_BAD, T_BAD, + '~', '~', T_BAD, + T_TILDE, '~', T_BAD, + '%', + '%', T_BAD, T_BAD, + '%', T_BAD, T_BAD, + '%', T_BAD, T_BAD, + '%', '%', T_BAD, + T_PERCENT, '%', T_BAD, + '<', + '<', T_LT, T_BAD, + '<', T_LT, T_BAD, + T_LT, '<', T_BAD, + '<', '<', T_BAD, + '<', '<', T_BAD, + '>', + '>', T_GT, T_BAD, + '>', T_GT, T_BAD, + T_GT, '>', T_BAD, + '>', '>', T_BAD, + '>', '>', T_BAD, + + /* backrefs */ + + '0', + '0', T_BACK+0, T_ESCAPE, + '0', T_BACK+0, T_ESCAPE, + '0', T_BACK+0, T_ESCAPE, + '0', T_BACK+0, T_ESCAPE, + '0', T_BACK+0, T_ESCAPE, + '1', + '1', T_BACK+1, T_ESCAPE, + '1', T_BACK+1, T_ESCAPE, + '1', T_BACK+1, T_ESCAPE, + '1', T_BACK+1, T_ESCAPE, + '1', T_BACK+1, T_ESCAPE, + '2', + '2', T_BACK+2, T_ESCAPE, + '2', T_BACK+2, T_ESCAPE, + '2', T_BACK+2, T_ESCAPE, + '2', T_BACK+2, T_ESCAPE, + '2', T_BACK+2, T_ESCAPE, + '3', + '3', T_BACK+3, T_ESCAPE, + '3', T_BACK+3, T_ESCAPE, + '3', T_BACK+3, T_ESCAPE, + '3', T_BACK+3, T_ESCAPE, + '3', T_BACK+3, T_ESCAPE, + '4', + '4', T_BACK+4, T_ESCAPE, + '4', T_BACK+4, T_ESCAPE, + '4', T_BACK+4, T_ESCAPE, + '4', T_BACK+4, T_ESCAPE, + '4', T_BACK+4, T_ESCAPE, + '5', + '5', T_BACK+5, T_ESCAPE, + '5', T_BACK+5, T_ESCAPE, + '5', T_BACK+5, T_ESCAPE, + '5', T_BACK+5, T_ESCAPE, + '5', T_BACK+5, T_ESCAPE, + '6', + '6', T_BACK+6, T_ESCAPE, + '6', T_BACK+6, T_ESCAPE, + '6', T_BACK+6, T_ESCAPE, + '6', T_BACK+6, T_ESCAPE, + '6', T_BACK+6, T_ESCAPE, + '7', + '7', T_BACK+7, T_ESCAPE, + '7', T_BACK+7, T_ESCAPE, + '7', T_BACK+7, T_ESCAPE, + '7', T_BACK+7, T_ESCAPE, + '7', T_BACK+7, T_ESCAPE, + '8', + '8', T_BACK+8, T_ESCAPE, + '8', T_BACK+8, T_ESCAPE, + '8', T_BACK+8, T_ESCAPE, + '8', '8', T_ESCAPE, + '8', T_BACK+8, T_ESCAPE, + '9', + '9', T_BACK+9, T_ESCAPE, + '9', T_BACK+9, T_ESCAPE, + '9', T_BACK+9, T_ESCAPE, + '9', '9', T_ESCAPE, + '9', T_BACK+9, T_ESCAPE, + + /* perl */ + + 'A', + 'A', T_BEG_STR, T_BAD, + 'A', T_BEG_STR, T_BAD, + 'A', T_BEG_STR, T_BAD, + 'A', T_BEG_STR, T_BAD, + 'A', T_BEG_STR, T_BAD, + 'b', + 'b', T_WORD, '\b', + 'b', T_WORD, '\b', + 'b', T_WORD, '\b', + 'b', T_WORD, '\b', + 'b', T_WORD, '\b', + 'B', + 'B', T_WORD_NOT, T_BAD, + 'B', T_WORD_NOT, T_BAD, + 'B', T_WORD_NOT, T_BAD, + 'B', T_WORD_NOT, T_BAD, + 'B', T_WORD_NOT, T_BAD, + 'd', + 'd', T_DIGIT, T_DIGIT, + 'd', T_DIGIT, T_DIGIT, + 'd', T_DIGIT, T_DIGIT, + 'd', T_DIGIT, T_DIGIT, + 'd', T_DIGIT, T_DIGIT, + 'D', + 'D', T_DIGIT_NOT, T_DIGIT_NOT, + 'D', T_DIGIT_NOT, T_DIGIT_NOT, + 'D', T_DIGIT_NOT, T_DIGIT_NOT, + 'D', T_DIGIT_NOT, T_DIGIT_NOT, + 'D', T_DIGIT_NOT, T_DIGIT_NOT, + 's', + 's', T_SPACE, T_SPACE, + 's', T_SPACE, T_SPACE, + 's', T_SPACE, T_SPACE, + 's', T_SPACE, T_SPACE, + 's', T_SPACE, T_SPACE, + 'S', + 'S', T_SPACE_NOT, T_SPACE_NOT, + 'S', T_SPACE_NOT, T_SPACE_NOT, + 'S', T_SPACE_NOT, T_SPACE_NOT, + 'S', T_SPACE_NOT, T_SPACE_NOT, + 'S', T_SPACE_NOT, T_SPACE_NOT, + 'w', + 'w', T_ALNUM, T_ALNUM, + 'w', T_ALNUM, T_ALNUM, + 'w', T_ALNUM, T_ALNUM, + 'w', T_ALNUM, T_ALNUM, + 'w', T_ALNUM, T_ALNUM, + 'W', + 'W', T_ALNUM_NOT, T_ALNUM_NOT, + 'W', T_ALNUM_NOT, T_ALNUM_NOT, + 'W', T_ALNUM_NOT, T_ALNUM_NOT, + 'W', T_ALNUM_NOT, T_ALNUM_NOT, + 'W', T_ALNUM_NOT, T_ALNUM_NOT, + 'z', + 'z', T_FIN_STR, T_BAD, + 'z', T_FIN_STR, T_BAD, + 'z', T_FIN_STR, T_BAD, + 'z', T_FIN_STR, T_BAD, + 'z', T_FIN_STR, T_BAD, + 'Z', + 'Z', T_END_STR, T_BAD, + 'Z', T_END_STR, T_BAD, + 'Z', T_END_STR, T_BAD, + 'Z', T_END_STR, T_BAD, + 'Z', T_END_STR, T_BAD, + + /* C escapes */ + + 'a', + 'a', CC_bel, CC_bel, + 'a', CC_bel, CC_bel, + 'a', CC_bel, CC_bel, + 'a', CC_bel, CC_bel, + 'a', CC_bel, CC_bel, + 'c', + 'c', T_ESCAPE, T_ESCAPE, + 'c', T_ESCAPE, T_ESCAPE, + 'c', T_ESCAPE, T_ESCAPE, + 'c', T_ESCAPE, T_ESCAPE, + 'c', T_ESCAPE, T_ESCAPE, + 'C', + 'C', T_ESCAPE, T_ESCAPE, + 'C', T_ESCAPE, T_ESCAPE, + 'C', T_ESCAPE, T_ESCAPE, + 'C', T_ESCAPE, T_ESCAPE, + 'C', T_ESCAPE, T_ESCAPE, + 'e', + 'e', CC_esc, CC_esc, + 'e', CC_esc, CC_esc, + 'e', CC_esc, CC_esc, + 'e', CC_esc, CC_esc, + 'e', CC_esc, CC_esc, + 'E', + 'E', CC_esc, CC_esc, + 'E', CC_esc, CC_esc, + 'E', CC_esc, CC_esc, + 'E', CC_esc, CC_esc, + 'E', CC_esc, CC_esc, + 'f', + 'f', '\f', '\f', + 'f', '\f', '\f', + 'f', '\f', '\f', + 'f', '\f', '\f', + 'f', '\f', '\f', + 'n', + 'n', '\n', '\n', + 'n', '\n', '\n', + 'n', '\n', '\n', + 'n', '\n', '\n', + 'n', '\n', '\n', + 'r', + 'r', '\r', '\r', + 'r', '\r', '\r', + 'r', '\r', '\r', + 'r', '\r', '\r', + 'r', '\r', '\r', + 't', + 't', '\t', '\t', + 't', '\t', '\t', + 't', '\t', '\t', + 't', '\t', '\t', + 't', '\t', '\t', + 'v', + 'v', CC_vt, CC_vt, + 'v', CC_vt, CC_vt, + 'v', CC_vt, CC_vt, + 'v', CC_vt, CC_vt, + 'v', CC_vt, CC_vt, + 'x', + 'x', T_ESCAPE, T_ESCAPE, + 'x', T_ESCAPE, T_ESCAPE, + 'x', T_ESCAPE, T_ESCAPE, + 'x', T_ESCAPE, T_ESCAPE, + 'x', T_ESCAPE, T_ESCAPE, +}; + +/* + * all allocation/free done here + * interface compatible with vmresize() + * + * malloc(n) alloc(0,n) + * realloc(p,n) alloc(p,n) + * free(p) alloc(p,0) + */ + +void* +alloc(register regdisc_t* disc, void* p, size_t n) +{ + if (disc->re_resizef) + { + if (!n && (disc->re_flags & REG_NOFREE)) + return 0; + return (*disc->re_resizef)(disc->re_resizehandle, p, n); + } + else if (!n) + { + if (!(disc->re_flags & REG_NOFREE)) + free(p); + return 0; + } + else if (p) + return realloc(p, n); + else + return malloc(n); +} diff --git a/src/lib/libast/regex/reglib.h b/src/lib/libast/regex/reglib.h new file mode 100644 index 0000000..6b84e5d --- /dev/null +++ b/src/lib/libast/regex/reglib.h @@ -0,0 +1,582 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex implementation + * + * based on Doug McIlroy's C++ implementation + * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest + * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume + */ + +#ifndef _REGLIB_H +#define _REGLIB_H + +#define REG_VERSION_EXEC 20020509L +#define REG_VERSION_MAP 20030916L /* regdisc_t.re_map */ + +#define re_info env + +#define alloc _reg_alloc +#define classfun _reg_classfun +#define drop _reg_drop +#define fatal _reg_fatal +#define state _reg_state + +typedef struct regsubop_s +{ + int op; /* REG_SUB_LOWER,REG_SUB_UPPER */ + int off; /* re_rhs or match[] offset */ + int len; /* re_rhs len or len==0 match[] */ +} regsubop_t; + +#define _REG_SUB_PRIVATE_ \ + char* re_cur; /* re_buf cursor */ \ + char* re_end; /* re_buf end */ \ + regsubop_t* re_ops; /* rhs ops */ \ + char re_rhs[1]; /* substitution rhs */ + +#include <ast.h> +#include <cdt.h> +#include <stk.h> + +#include "regex.h" + +#include <ctype.h> +#include <errno.h> + +#if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG) +#define _AST_REGEX_DEBUG 1 +#endif + +#define MBSIZE(p) ((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1) + +#undef RE_DUP_MAX /* posix puts this in limits.h! */ +#define RE_DUP_MAX (INT_MAX/2-1) /* 2*RE_DUP_MAX won't overflow */ +#define RE_DUP_INF (RE_DUP_MAX+1) /* infinity, for * */ +#define BACK_REF_MAX 9 + +#define REG_COMP (~REG_EXEC) +#define REG_EXEC (REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND) + +#define REX_NULL 0 /* null string (internal) */ +#define REX_ALT 1 /* a|b */ +#define REX_ALT_CATCH 2 /* REX_ALT catcher */ +#define REX_BACK 3 /* \1, \2, etc */ +#define REX_BEG 4 /* initial ^ */ +#define REX_BEG_STR 5 /* initial ^ w/ no newline */ +#define REX_BM 6 /* Boyer-Moore */ +#define REX_CAT 7 /* catenation catcher */ +#define REX_CLASS 8 /* [...] */ +#define REX_COLL_CLASS 9 /* collation order [...] */ +#define REX_CONJ 10 /* a&b */ +#define REX_CONJ_LEFT 11 /* REX_CONJ left catcher */ +#define REX_CONJ_RIGHT 12 /* REX_CONJ right catcher */ +#define REX_DONE 13 /* completed match (internal) */ +#define REX_DOT 14 /* . */ +#define REX_END 15 /* final $ */ +#define REX_END_STR 16 /* final $ before tail newline */ +#define REX_EXEC 17 /* call re.re_exec() */ +#define REX_FIN_STR 18 /* final $ w/ no newline */ +#define REX_GROUP 19 /* \(...\) */ +#define REX_GROUP_CATCH 20 /* REX_GROUP catcher */ +#define REX_GROUP_AHEAD 21 /* 0-width lookahead */ +#define REX_GROUP_AHEAD_CATCH 22 /* REX_GROUP_AHEAD catcher */ +#define REX_GROUP_AHEAD_NOT 23 /* inverted 0-width lookahead */ +#define REX_GROUP_BEHIND 24 /* 0-width lookbehind */ +#define REX_GROUP_BEHIND_CATCH 25 /* REX_GROUP_BEHIND catcher */ +#define REX_GROUP_BEHIND_NOT 26 /* inverted 0-width lookbehind */ +#define REX_GROUP_BEHIND_NOT_CATCH 27 /* REX_GROUP_BEHIND_NOT catcher */ +#define REX_GROUP_COND 28 /* conditional group */ +#define REX_GROUP_COND_CATCH 29 /* conditional group catcher */ +#define REX_GROUP_CUT 30 /* don't backtrack over this */ +#define REX_GROUP_CUT_CATCH 31 /* REX_GROUP_CUT catcher */ +#define REX_KMP 32 /* Knuth-Morris-Pratt */ +#define REX_NEG 33 /* negation */ +#define REX_NEG_CATCH 34 /* REX_NEG catcher */ +#define REX_NEST 35 /* nested match */ +#define REX_ONECHAR 36 /* a single-character literal */ +#define REX_REP 37 /* Kleene closure */ +#define REX_REP_CATCH 38 /* REX_REP catcher */ +#define REX_STRING 39 /* some chars */ +#define REX_TRIE 40 /* alternation of strings */ +#define REX_WBEG 41 /* \< */ +#define REX_WEND 42 /* \> */ +#define REX_WORD 43 /* word boundary */ +#define REX_WORD_NOT 44 /* not word boundary */ + +#define T_META ((int)UCHAR_MAX+1) +#define T_STAR (T_META+0) +#define T_PLUS (T_META+1) +#define T_QUES (T_META+2) +#define T_BANG (T_META+3) +#define T_AT (T_META+4) +#define T_TILDE (T_META+5) +#define T_PERCENT (T_META+6) +#define T_LEFT (T_META+7) +#define T_OPEN (T_META+8) +#define T_CLOSE (T_OPEN+1) +#define T_RIGHT (T_OPEN+2) +#define T_CFLX (T_OPEN+3) +#define T_DOT (T_OPEN+4) +#define T_DOTSTAR (T_OPEN+5) +#define T_END (T_OPEN+6) +#define T_BAD (T_OPEN+7) +#define T_DOLL (T_OPEN+8) +#define T_BRA (T_OPEN+9) +#define T_BAR (T_OPEN+10) +#define T_AND (T_OPEN+11) +#define T_LT (T_OPEN+12) +#define T_GT (T_OPEN+13) +#define T_SLASHPLUS (T_OPEN+14) +#define T_GROUP (T_OPEN+15) +#define T_WORD (T_OPEN+16) +#define T_WORD_NOT (T_WORD+1) +#define T_BEG_STR (T_WORD+2) +#define T_END_STR (T_WORD+3) +#define T_FIN_STR (T_WORD+4) +#define T_ESCAPE (T_WORD+5) +#define T_ALNUM (T_WORD+6) +#define T_ALNUM_NOT (T_ALNUM+1) +#define T_DIGIT (T_ALNUM+2) +#define T_DIGIT_NOT (T_ALNUM+3) +#define T_SPACE (T_ALNUM+4) +#define T_SPACE_NOT (T_ALNUM+5) +#define T_BACK (T_ALNUM+6) + +#define BRE 0 +#define ERE 3 +#define ARE 6 +#define SRE 9 +#define KRE 12 + +#define HIT SSIZE_MAX + +#define bitclr(p,c) ((p)[((c)>>3)&037]&=(~(1<<((c)&07)))) +#define bitset(p,c) ((p)[((c)>>3)&037]|=(1<<((c)&07))) +#define bittst(p,c) ((p)[((c)>>3)&037]&(1<<((c)&07))) + +#define setadd(p,c) bitset((p)->bits,c) +#define setclr(p,c) bitclr((p)->bits,c) +#define settst(p,c) bittst((p)->bits,c) + +#if _hdr_wchar && _lib_wctype && _lib_iswctype + +#include <stdio.h> /* because <wchar.h> includes it and we generate it */ +#include <wchar.h> +#if _hdr_wctype +#include <wctype.h> +#endif + +#if !defined(iswblank) && !_lib_iswblank +#define _need_iswblank 1 +#define iswblank(x) _reg_iswblank(x) +extern int _reg_iswblank(wint_t); +#endif + +#if !defined(towupper) && !_lib_towupper +#define towupper(x) toupper(x) +#endif + +#if !defined(towlower) && !_lib_towlower +#define towlower(x) tolower(x) +#endif + +#else + +#undef _lib_wctype + +#ifndef iswalnum +#define iswalnum(x) isalnum(x) +#endif +#ifndef iswalpha +#define iswalpha(x) isalpha(x) +#endif +#ifndef iswcntrl +#define iswcntrl(x) iscntrl(x) +#endif +#ifndef iswdigit +#define iswdigit(x) isdigit(x) +#endif +#ifndef iswgraph +#define iswgraph(x) isgraph(x) +#endif +#ifndef iswlower +#define iswlower(x) islower(x) +#endif +#ifndef iswprint +#define iswprint(x) isprint(x) +#endif +#ifndef iswpunct +#define iswpunct(x) ispunct(x) +#endif +#ifndef iswspace +#define iswspace(x) isspace(x) +#endif +#ifndef iswupper +#define iswupper(x) isupper(x) +#endif +#ifndef iswxdigit +#define iswxdigit(x) isxdigit(x) +#endif + +#ifndef towlower +#define towlower(x) tolower(x) +#endif +#ifndef towupper +#define towupper(x) toupper(x) +#endif + +#endif + +#ifndef iswblank +#define iswblank(x) ((x)==' '||(x)=='\t') +#endif + +#ifndef iswgraph +#define iswgraph(x) (iswprint(x)&&!iswblank(x)) +#endif + +#define isword(x) (isalnum(x)||(x)=='_') + +/* + * collation element support + */ + +#define COLL_KEY_MAX 32 + +#if COLL_KEY_MAX < MB_LEN_MAX +#undef COLL_KEY_MAX +#define COLL_KEY_MAX MB_LEN_MAX +#endif + +typedef unsigned char Ckey_t[COLL_KEY_MAX+1]; + +#define COLL_end 0 +#define COLL_call 1 +#define COLL_char 2 +#define COLL_range 3 +#define COLL_range_lc 4 +#define COLL_range_uc 5 + +typedef struct Celt_s +{ + short typ; + short min; + short max; + regclass_t fun; + Ckey_t beg; + Ckey_t end; +} Celt_t; + +/* + * private stuff hanging off regex_t + */ + +typedef struct Stk_pos_s +{ + off_t offset; + char* base; +} Stk_pos_t; + +typedef struct Vector_s +{ + Stk_t* stk; /* stack pointer */ + char* vec; /* the data */ + int inc; /* growth increment */ + int siz; /* element size */ + int max; /* max index */ + int cur; /* current index -- user domain */ +} Vector_t; + +/* + * Rex_t subtypes + */ + +typedef struct Cond_s +{ + unsigned char* beg; /* beginning of next match */ + struct Rex_s* next[2]; /* 0:no 1:yes next pattern */ + struct Rex_s* cont; /* right catcher */ + int yes; /* yes condition hit */ +} Cond_t; + +typedef struct Conj_left_s +{ + unsigned char* beg; /* beginning of left match */ + struct Rex_s* right; /* right pattern */ + struct Rex_s* cont; /* right catcher */ +} Conj_left_t; + +typedef struct Conj_right_s +{ + unsigned char* end; /* end of left match */ + struct Rex_s* cont; /* ambient continuation */ +} Conj_right_t; + +typedef unsigned int Bm_mask_t; + +typedef struct Bm_s +{ + Bm_mask_t** mask; + size_t* skip; + size_t* fail; + size_t size; + ssize_t back; + ssize_t left; + ssize_t right; + size_t complete; +} Bm_t; + +typedef struct String_s +{ + int* fail; + unsigned char* base; + size_t size; +} String_t; + +typedef struct Set_s +{ + unsigned char bits[(UCHAR_MAX+1)/CHAR_BIT]; +} Set_t; + +typedef struct Collate_s +{ + int invert; + Celt_t* elements; +} Collate_t; + +typedef struct Binary_s +{ + struct Rex_s* left; + struct Rex_s* right; + int serial; +} Binary_t; + +typedef struct Group_s +{ + int number; /* group number */ + int last; /* last contained group number */ + int size; /* lookbehind size */ + int back; /* backreferenced */ + regflags_t flags; /* group flags */ + union + { + Binary_t binary; + struct Rex_s* rex; + } expr; +} Group_t; + +typedef struct Exec_s +{ + void* data; + const char* text; + size_t size; +} Exec_t; + +#define REX_NEST_open 0x01 +#define REX_NEST_close 0x02 +#define REX_NEST_escape 0x04 +#define REX_NEST_quote 0x08 +#define REX_NEST_literal 0x10 +#define REX_NEST_delimiter 0x20 +#define REX_NEST_terminator 0x40 +#define REX_NEST_separator 0x80 + +#define REX_NEST_SHIFT 8 + +typedef struct Nest_s +{ + int primary; + unsigned short none; /* for Nest_t.type[-1] */ + unsigned short type[1]; +} Nest_t; + +/* + * REX_ALT catcher, solely to get control at the end of an + * alternative to keep records for comparing matches. + */ + +typedef struct Alt_catch_s +{ + struct Rex_s* cont; +} Alt_catch_t; + +typedef struct Group_catch_s +{ + struct Rex_s* cont; + regoff_t* eo; +} Group_catch_t; + +typedef struct Behind_catch_s +{ + struct Rex_s* cont; + unsigned char* beg; + unsigned char* end; +} Behind_catch_t; + +/* + * REX_NEG catcher determines what string lengths can be matched, + * then Neg investigates continuations of other lengths. + * This is inefficient. For !POSITIONS expressions, we can do better: + * since matches to rex will be enumerated in decreasing order, + * we can investigate continuations whenever a length is skipped. + */ + +typedef struct Neg_catch_s +{ + unsigned char* beg; + unsigned char* index; +} Neg_catch_t; + +/* + * REX_REP catcher. One is created on the stack for + * each iteration of a complex repetition. + */ + +typedef struct Rep_catch_s +{ + struct Rex_s* cont; + struct Rex_s* ref; + unsigned char* beg; + int n; +} Rep_catch_t; + +/* + * data structure for an alternation of pure strings + * son points to a subtree of all strings with a common + * prefix ending in character c. sib links alternate + * letters in the same position of a word. end=1 if + * some word ends with c. the order of strings is + * irrelevant, except long words must be investigated + * before short ones. + */ + +typedef struct Trie_node_s +{ + unsigned char c; + unsigned char end; + struct Trie_node_s* son; + struct Trie_node_s* sib; +} Trie_node_t; + +typedef struct Trie_s +{ + Trie_node_t** root; + int min; + int max; +} Trie_t; + +/* + * Rex_t is a node in a regular expression + */ + +typedef struct Rex_s +{ + unsigned char type; /* node type */ + unsigned char marked; /* already marked */ + short serial; /* subpattern number */ + regflags_t flags; /* scoped flags */ + int explicit; /* scoped explicit match*/ + struct Rex_s* next; /* remaining parts */ + int lo; /* lo dup count */ + int hi; /* hi dup count */ + unsigned char* map; /* fold and/or ccode map*/ + union + { + Alt_catch_t alt_catch; /* alt catcher */ + Bm_t bm; /* bm */ + Behind_catch_t behind_catch; /* behind catcher */ + Set_t* charclass; /* char class */ + Collate_t collate; /* collation class */ + Cond_t cond_catch; /* cond catcher */ + Conj_left_t conj_left; /* conj left catcher */ + Conj_right_t conj_right; /* conj right catcher */ + void* data; /* data after Rex_t */ + Exec_t exec; /* re.re_exec() args */ + Group_t group; /* a|b or rep */ + Group_catch_t group_catch; /* group catcher */ + Neg_catch_t neg_catch; /* neg catcher */ + Nest_t nest; /* nested match */ + unsigned char onechar; /* single char */ + Rep_catch_t rep_catch; /* rep catcher */ + String_t string; /* string/kmp */ + Trie_t trie; /* trie */ + } re; +} Rex_t; + +typedef struct reglib_s /* library private regex_t info */ +{ + struct Rex_s* rex; /* compiled expression */ + regdisc_t* disc; /* REG_DISCIPLINE discipline */ + const regex_t* regex; /* from regexec */ + unsigned char* beg; /* beginning of string */ + unsigned char* end; /* end of string */ + Vector_t* pos; /* posns of certain subpatterns */ + Vector_t* bestpos; /* ditto for best match */ + regmatch_t* match; /* subexrs in current match */ + regmatch_t* best; /* ditto in best match yet */ + Stk_pos_t stk; /* exec stack pos */ + size_t min; /* minimum match length */ + size_t nsub; /* internal re_nsub */ + regflags_t flags; /* flags from regcomp() */ + int error; /* last error */ + int explicit; /* explicit match on this char */ + int leading; /* leading match on this char */ + int refs; /* regcomp()+regdup() references*/ + Rex_t done; /* the last continuation */ + regstat_t stats; /* for regstat() */ + unsigned char fold[UCHAR_MAX+1]; /* REG_ICASE map */ + unsigned char hard; /* hard comp */ + unsigned char once; /* if 1st parse fails, quit */ + unsigned char separate; /* cannot combine */ + unsigned char stack; /* hard comp or exec */ + unsigned char sub; /* re_sub is valid */ + unsigned char test; /* debug/test bitmask */ +} Env_t; + +typedef struct State_s /* shared state */ +{ + regmatch_t nomatch; + struct + { + unsigned char key; + short val[15]; + } escape[52]; + short* magic[UCHAR_MAX+1]; + regdisc_t disc; + int fatal; + int initialized; + Dt_t* attrs; + Dt_t* names; + Dtdisc_t dtdisc; +} State_t; + +extern State_t state; + +extern void* alloc(regdisc_t*, void*, size_t); +extern regclass_t classfun(int); +extern void drop(regdisc_t*, Rex_t*); +extern int fatal(regdisc_t*, int, const char*); + +#endif diff --git a/src/lib/libast/regex/regnexec.c b/src/lib/libast/regex/regnexec.c new file mode 100644 index 0000000..0c3a0aa --- /dev/null +++ b/src/lib/libast/regex/regnexec.c @@ -0,0 +1,2045 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex executor + * single sized-record interface + */ + +#include "reglib.h" + +#if _AST_REGEX_DEBUG + +#define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) +#define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) +#define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_exec_debug")) debug |= strtoul(t, NiL, 0); } } while (0) + +static unsigned long debug; +static unsigned long debug_flag; + +static const char* rexnames[] = +{ + "REX_NULL", + "REX_ALT", + "REX_ALT_CATCH", + "REX_BACK", + "REX_BEG", + "REX_BEG_STR", + "REX_BM", + "REX_CAT", + "REX_CLASS", + "REX_COLL_CLASS", + "REX_CONJ", + "REX_CONJ_LEFT", + "REX_CONJ_RIGHT", + "REX_DONE", + "REX_DOT", + "REX_END", + "REX_END_STR", + "REX_EXEC", + "REX_FIN_STR", + "REX_GROUP", + "REX_GROUP_CATCH", + "REX_GROUP_AHEAD", + "REX_GROUP_AHEAD_CATCH", + "REX_GROUP_AHEAD_NOT", + "REX_GROUP_BEHIND", + "REX_GROUP_BEHIND_CATCH", + "REX_GROUP_BEHIND_NOT", + "REX_GROUP_BEHIND_NOT_CATCH", + "REX_GROUP_COND", + "REX_GROUP_COND_CATCH", + "REX_GROUP_CUT", + "REX_GROUP_CUT_CATCH", + "REX_KMP", + "REX_NEG", + "REX_NEG_CATCH", + "REX_NEST", + "REX_ONECHAR", + "REX_REP", + "REX_REP_CATCH", + "REX_STRING", + "REX_TRIE", + "REX_WBEG", + "REX_WEND", + "REX_WORD", + "REX_WORD_NOT" +}; + +static const char* rexname(Rex_t* rex) +{ + if (!rex) + return "NIL"; + if (rex->type >= elementsof(rexnames)) + return "ERROR"; + return rexnames[rex->type]; +} + +#else + +#define DEBUG_INIT() +#define DEBUG_TEST(f,y,n) (n) +#define DEBUG_CODE(f,y,n) do {n} while(0) + +#endif + +#define BEG_ALT 1 /* beginning of an alt */ +#define BEG_ONE 2 /* beginning of one iteration of a rep */ +#define BEG_REP 3 /* beginning of a repetition */ +#define BEG_SUB 4 /* beginning of a subexpression */ +#define END_ANY 5 /* end of any of above */ + +/* + * returns from parse() + */ + +#define NONE 0 /* no parse found */ +#define GOOD 1 /* some parse was found */ +#define CUT 2 /* no match and no backtrack */ +#define BEST 3 /* an unbeatable parse was found */ +#define BAD 4 /* error ocurred */ + +/* + * REG_SHELL_DOT test + */ + +#define LEADING(e,r,s) (*(s)==(e)->leading&&((s)==(e)->beg||*((s)-1)==(r)->explicit)) + +/* + * Pos_t is for comparing parses. An entry is made in the + * array at the beginning and at the end of each Group_t, + * each iteration in a Group_t, and each Binary_t. + */ + +typedef struct +{ + unsigned char* p; /* where in string */ + size_t length; /* length in string */ + short serial; /* preorder subpattern number */ + short be; /* which end of pair */ +} Pos_t; + +/* ===== begin library support ===== */ + +#define vector(t,v,i) (((i)<(v)->max)?(t*)((v)->vec+(i)*(v)->siz):(t*)vecseek(&(v),i)) + +static Vector_t* +vecopen(int inc, int siz) +{ + Vector_t* v; + Stk_t* sp; + + if (inc <= 0) + inc = 16; + if (!(sp = stkopen(STK_SMALL|STK_NULL))) + return 0; + if (!(v = (Vector_t*)stkseek(sp, sizeof(Vector_t) + inc * siz))) + { + stkclose(sp); + return 0; + } + v->stk = sp; + v->vec = (char*)v + sizeof(Vector_t); + v->max = v->inc = inc; + v->siz = siz; + v->cur = 0; + return v; +} + +static void* +vecseek(Vector_t** p, int index) +{ + Vector_t* v = *p; + + if (index >= v->max) + { + while ((v->max += v->inc) <= index); + if (!(v = (Vector_t*)stkseek(v->stk, sizeof(Vector_t) + v->max * v->siz))) + return 0; + *p = v; + v->vec = (char*)v + sizeof(Vector_t); + } + return v->vec + index * v->siz; +} + +static void +vecclose(Vector_t* v) +{ + if (v) + stkclose(v->stk); +} + +typedef struct +{ + Stk_pos_t pos; + char data[1]; +} Stk_frame_t; + +#define stknew(s,p) ((p)->offset=stktell(s),(p)->base=stkfreeze(s,0)) +#define stkold(s,p) stkset(s,(p)->base,(p)->offset) + +#define stkframe(s) (*((Stk_frame_t**)stktop(s)-1)) +#define stkdata(s,t) ((t*)stkframe(s)->data) +#define stkpop(s) stkold(s,&(stkframe(s)->pos)) + +static void* +stkpush(Stk_t* sp, size_t size) +{ + Stk_frame_t* f; + Stk_pos_t p; + + stknew(sp, &p); + size = sizeof(Stk_frame_t) + sizeof(size_t) + size - 1; + if (!(f = (Stk_frame_t*)stkalloc(sp, sizeof(Stk_frame_t) + sizeof(Stk_frame_t*) + size - 1))) + return 0; + f->pos = p; + stkframe(sp) = f; + return f->data; +} + +/* ===== end library support ===== */ + +/* + * Match_frame_t is for saving and restoring match records + * around alternate attempts, so that fossils will not be + * left in the match array. These are the only entries in + * the match array that are not otherwise guaranteed to + * have current data in them when they get used. + */ + +typedef struct +{ + size_t size; + regmatch_t* match; + regmatch_t save[1]; +} Match_frame_t; + +#define matchpush(e,x) ((x)->re.group.number?_matchpush(e,x):0) +#define matchcopy(e,x) do if ((x)->re.group.number) { Match_frame_t* fp = (void*)stkframe(stkstd)->data; memcpy(fp->match, fp->save, fp->size); } while (0) +#define matchpop(e,x) do if ((x)->re.group.number) { Match_frame_t* fp = (void*)stkframe(stkstd)->data; memcpy(fp->match, fp->save, fp->size); stkpop(stkstd); } while (0) + +#define pospop(e) (--(e)->pos->cur) + +/* + * allocate a frame and push a match onto the stack + */ + +static int +_matchpush(Env_t* env, Rex_t* rex) +{ + Match_frame_t* f; + regmatch_t* m; + regmatch_t* e; + regmatch_t* s; + int num; + + if (rex->re.group.number <= 0 || (num = rex->re.group.last - rex->re.group.number + 1) <= 0) + num = 0; + if (!(f = (Match_frame_t*)stkpush(stkstd, sizeof(Match_frame_t) + (num - 1) * sizeof(regmatch_t)))) + { + env->error = REG_ESPACE; + return 1; + } + f->size = num * sizeof(regmatch_t); + f->match = m = env->match + rex->re.group.number; + e = m + num; + s = f->save; + while (m < e) + { + *s++ = *m; + *m++ = state.nomatch; + } + return 0; +} + +/* + * allocate a frame and push a pos onto the stack + */ + +static int +pospush(Env_t* env, Rex_t* rex, unsigned char* p, int be) +{ + Pos_t* pos; + + if (!(pos = vector(Pos_t, env->pos, env->pos->cur))) + { + env->error = REG_ESPACE; + return 1; + } + pos->serial = rex->serial; + pos->p = p; + pos->be = be; + env->pos->cur++; + return 0; +} + +/* + * two matches are known to have the same length + * os is start of old pos array, ns is start of new, + * oend and nend are end+1 pointers to ends of arrays. + * oe and ne are ends (not end+1) of subarrays. + * returns 1 if new is better, -1 if old, else 0. + */ + +static int +better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level) +{ + Pos_t* oe; + Pos_t* ne; + int k; + int n; + + if (env->error) + return -1; + for (;;) + { + DEBUG_CODE(0x0080,{sfprintf(sfstdout, " %-*.*sold ", (level + 3) * 4, (level + 3) * 4, "");for (oe = os; oe < oend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n %-*.*snew ", (level + 3) * 4, (level + 3) * 4, "");for (oe = ns; oe < nend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n");},{0;}); + if (ns >= nend) + return DEBUG_TEST(0x8000,(os < oend),(0)); + if (os >= oend) + return DEBUG_TEST(0x8000,(-1),(1)); + n = os->serial; + if (ns->serial > n) + return -1; + if (n > ns->serial) + { + env->error = REG_PANIC; + return -1; + } + if (ns->p > os->p) + return 1; + if (os->p > ns->p) + return -1; + oe = os; + k = 0; + for (;;) + if ((++oe)->serial == n) + { + if (oe->be != END_ANY) + k++; + else if (k-- <= 0) + break; + } + ne = ns; + k = 0; + for (;;) + if ((++ne)->serial == n) + { + if (ne->be != END_ANY) + k++; + else if (k-- <= 0) + break; + } + if (ne->p > oe->p) + return 1; + if (oe->p > ne->p) + return -1; + if (k = better(env, os + 1, ns + 1, oe, ne, level + 1)) + return k; + os = oe + 1; + ns = ne + 1; + } +} + +#if _AST_REGEX_DEBUG + +static void +showmatch(regmatch_t* p) +{ + sfputc(sfstdout, '('); + if (p->rm_so < 0) + sfputc(sfstdout, '?'); + else + sfprintf(sfstdout, "%d", p->rm_so); + sfputc(sfstdout, ','); + if (p->rm_eo < 0) + sfputc(sfstdout, '?'); + else + sfprintf(sfstdout, "%d", p->rm_eo); + sfputc(sfstdout, ')'); +} + +static int +_better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level) +{ + int i; + + DEBUG_CODE(0x0040,{sfprintf(sfstdout, "AHA better old ");for (i = 0; i <= env->nsub; i++)showmatch(&env->best[i]);sfprintf(sfstdout, "\n new ");for (i = 0; i <= env->nsub; i++)showmatch(&env->match[i]);sfprintf(sfstdout, "\n");},{0;}); + i = better(env, os, ns, oend, nend, 0); + DEBUG_TEST(0x0040,(sfprintf(sfstdout, " %s\n", i <= 0 ? "OLD" : "NEW")),(0)); + return i; +} + +#define better _better + +#endif + +#define follow(e,r,c,s) ((r)->next?parse(e,(r)->next,c,s):(c)?parse(e,c,0,s):BEST) + +static int parse(Env_t*, Rex_t*, Rex_t*, unsigned char*); + +static int +parserep(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s, int n) +{ + int i; + int r = NONE; + Rex_t catcher; + + DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep %s %d %d %d %d `%-.*s'\n", __LINE__, debug_flag, rexname(rex->re.group.expr.rex), rex->re.group.number, rex->lo, n, rex->hi, env->end - s, s)),(0)); + if ((rex->flags & REG_MINIMAL) && n >= rex->lo && n < rex->hi) + { + if (env->stack && pospush(env, rex, s, END_ANY)) + return BAD; + i = follow(env, rex, cont, s); + if (env->stack) + pospop(env); + switch (i) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + } + if (n < rex->hi) + { + catcher.type = REX_REP_CATCH; + catcher.serial = rex->serial; + catcher.re.rep_catch.ref = rex; + catcher.re.rep_catch.cont = cont; + catcher.re.rep_catch.beg = s; + catcher.re.rep_catch.n = n + 1; + catcher.next = rex->next; + if (n == 0) + rex->re.rep_catch.beg = s; + if (env->stack) + { + if (matchpush(env, rex)) + return BAD; + if (pospush(env, rex, s, BEG_ONE)) + return BAD; +DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x PUSH %d (%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo, env->match[2].rm_so, env->match[2].rm_eo)),(0)); + } + r = parse(env, rex->re.group.expr.rex, &catcher, s); + DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep parse %d %d `%-.*s'\n", __LINE__, debug_flag, rex->re.group.number, r, env->end - s, s)),(0)); + if (env->stack) + { + pospop(env); + matchpop(env, rex); +DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x POP %d %d (%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, r, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo, env->match[2].rm_so, env->match[2].rm_eo)),(0)); + } + switch (r) + { + case BAD: + return BAD; + case BEST: + return BEST; + case CUT: + r = NONE; + break; + case GOOD: + if (rex->flags & REG_MINIMAL) + return BEST; + r = GOOD; + break; + } + } + if (n < rex->lo) + return r; + if (!(rex->flags & REG_MINIMAL) || n >= rex->hi) + { + if (env->stack && pospush(env, rex, s, END_ANY)) + return BAD; + i = follow(env, rex, cont, s); + if (env->stack) + pospop(env); + switch (i) + { + case BAD: + r = BAD; + break; + case CUT: + r = CUT; + break; + case BEST: + r = BEST; + break; + case GOOD: + r = (rex->flags & REG_MINIMAL) ? BEST : GOOD; + break; + } + } + return r; +} + +static int +parsetrie(Env_t* env, Trie_node_t* x, Rex_t* rex, Rex_t* cont, unsigned char* s) +{ + unsigned char* p; + int r; + + if (p = rex->map) + { + for (;;) + { + if (s >= env->end) + return NONE; + while (x->c != p[*s]) + if (!(x = x->sib)) + return NONE; + if (x->end) + break; + x = x->son; + s++; + } + } + else + { + for (;;) + { + if (s >= env->end) + return NONE; + while (x->c != *s) + if (!(x = x->sib)) + return NONE; + if (x->end) + break; + x = x->son; + s++; + } + } + s++; + if (rex->flags & REG_MINIMAL) + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + if (x->son) + switch (parsetrie(env, x->son, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + return BEST; + case GOOD: + if (rex->flags & REG_MINIMAL) + return BEST; + r = GOOD; + break; + default: + r = NONE; + break; + } + else + r = NONE; + if (!(rex->flags & REG_MINIMAL)) + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + return BEST; + case GOOD: + return GOOD; + } + return r; +} + +static int +collelt(register Celt_t* ce, char* key, int c, int x) +{ + Ckey_t elt; + + mbxfrm(elt, key, COLL_KEY_MAX); + for (;; ce++) + { + switch (ce->typ) + { + case COLL_call: + if (!x && (*ce->fun)(c)) + return 1; + continue; + case COLL_char: + if (!strcmp((char*)ce->beg, (char*)elt)) + return 1; + continue; + case COLL_range: + if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max) + return 1; + continue; + case COLL_range_lc: + if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswlower(c) || !iswupper(c))) + return 1; + continue; + case COLL_range_uc: + if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswupper(c) || !iswlower(c))) + return 1; + continue; + } + break; + } + return 0; +} + +static int +collic(register Celt_t* ce, char* key, register char* nxt, int c, int x) +{ + if (!x) + { + if (collelt(ce, key, c, x)) + return 1; + if (iswlower(c)) + c = towupper(c); + else if (iswupper(c)) + c = towlower(c); + else + return 0; + x = mbconv(key, c); + key[x] = 0; + return collelt(ce, key, c, 0); + } + while (*nxt) + { + if (collic(ce, key, nxt + 1, c, x)) + return 1; + if (islower(*nxt)) + *nxt = toupper(*nxt); + else if (isupper(*nxt)) + *nxt = tolower(*nxt); + else + return 0; + nxt++; + } + return collelt(ce, key, c, x); +} + +static int +collmatch(Rex_t* rex, unsigned char* s, unsigned char* e, unsigned char** p) +{ + unsigned char* t; + wchar_t c; + int w; + int r; + int x; + int ic; + Ckey_t key; + Ckey_t elt; + + ic = (rex->flags & REG_ICASE); + if ((w = MBSIZE(s)) > 1) + { + memcpy((char*)key, (char*)s, w); + key[w] = 0; + t = s; + c = mbchar(t); +#if !_lib_wctype + c &= 0xff; +#endif + x = 0; + } + else + { + c = s[0]; + if (ic && isupper(c)) + c = tolower(c); + key[0] = c; + key[1] = 0; + if (isalpha(c)) + { + x = e - s; + if (x > COLL_KEY_MAX) + x = COLL_KEY_MAX; + while (w < x) + { + c = s[w]; + if (!isalpha(c)) + break; + r = mbxfrm(elt, key, COLL_KEY_MAX); + if (ic && isupper(c)) + c = tolower(c); + key[w] = c; + key[w + 1] = 0; + if (mbxfrm(elt, key, COLL_KEY_MAX) != r) + break; + w++; + } + } + key[w] = 0; + c = key[0]; + x = w - 1; + } + r = 1; + for (;;) + { + if (ic ? collic(rex->re.collate.elements, (char*)key, (char*)key, c, x) : collelt(rex->re.collate.elements, (char*)key, c, x)) + break; + if (!x) + { + r = 0; + break; + } + w = x--; + key[w] = 0; + } + *p = s + w; + return rex->re.collate.invert ? !r : r; +} + +static unsigned char* +nestmatch(register unsigned char* s, register unsigned char* e, const unsigned short* type, register int co) +{ + register int c; + register int cc; + unsigned int n; + int oc; + + if (type[co] & (REX_NEST_literal|REX_NEST_quote)) + { + n = (type[co] & REX_NEST_literal) ? REX_NEST_terminator : (REX_NEST_escape|REX_NEST_terminator); + while (s < e) + { + c = *s++; + if (c == co) + return s; + else if (type[c] & n) + { + if (s >= e || (type[c] & REX_NEST_terminator)) + break; + s++; + } + } + } + else + { + cc = type[co] >> REX_NEST_SHIFT; + oc = type[co] & (REX_NEST_open|REX_NEST_close); + n = 1; + while (s < e) + { + c = *s++; + switch (type[c] & (REX_NEST_escape|REX_NEST_open|REX_NEST_close|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)) + { + case REX_NEST_delimiter: + case REX_NEST_terminator: + return oc ? 0 : s; + case REX_NEST_separator: + if (!oc) + return s; + break; + case REX_NEST_escape: + if (s >= e) + return 0; + s++; + break; + case REX_NEST_open|REX_NEST_close: + if (c == cc) + { + if (!--n) + return s; + } + /*FALLTHROUGH*/ + case REX_NEST_open: + if (c == co) + { + if (!++n) + return 0; + } + else if (!(s = nestmatch(s, e, type, c))) + return 0; + break; + case REX_NEST_close: + if (c != cc) + return 0; + if (!--n) + return s; + break; + } + } + return (oc || !(type[UCHAR_MAX+1] & REX_NEST_terminator)) ? 0 : s; + } + return 0; +} + +static int +parse(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s) +{ + int c; + int d; + int i; + int m; + int n; + int r; + int* f; + unsigned char* p; + unsigned char* t; + unsigned char* b; + unsigned char* e; + char* u; + regmatch_t* o; + Trie_node_t* x; + Rex_t* q; + Rex_t catcher; + Rex_t next; + + for (;;) + { +DEBUG_TEST(0x0008,(sfprintf(sfstdout, "AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0)); + switch (rex->type) + { + case REX_ALT: + if (env->stack) + { + if (matchpush(env, rex)) + return BAD; + if (pospush(env, rex, s, BEG_ALT)) + return BAD; + catcher.type = REX_ALT_CATCH; + catcher.serial = rex->serial; + catcher.re.alt_catch.cont = cont; + catcher.next = rex->next; + r = parse(env, rex->re.group.expr.binary.left, &catcher, s); + if (r < BEST || (rex->flags & REG_MINIMAL)) + { + matchcopy(env, rex); + ((Pos_t*)env->pos->vec + env->pos->cur - 1)->serial = catcher.serial = rex->re.group.expr.binary.serial; + n = parse(env, rex->re.group.expr.binary.right, &catcher, s); + if (n != NONE) + r = n; + } + pospop(env); + matchpop(env, rex); + } + else + { + if ((r = parse(env, rex->re.group.expr.binary.left, cont, s)) == NONE) + r = parse(env, rex->re.group.expr.binary.right, cont, s); + if (r == GOOD) + r = BEST; + } + return r; + case REX_ALT_CATCH: + if (pospush(env, rex, s, END_ANY)) + return BAD; + r = follow(env, rex, rex->re.alt_catch.cont, s); + pospop(env); + return r; + case REX_BACK: + o = &env->match[rex->lo]; + if (o->rm_so < 0) + return NONE; + i = o->rm_eo - o->rm_so; + e = s + i; + if (e > env->end) + return NONE; + t = env->beg + o->rm_so; + if (!(p = rex->map)) + { + while (s < e) + if (*s++ != *t++) + return NONE; + } + else if (!mbwide()) + { + while (s < e) + if (p[*s++] != p[*t++]) + return NONE; + } + else + { + while (s < e) + { + c = mbchar(s); + d = mbchar(t); + if (towupper(c) != towupper(d)) + return NONE; + } + } + break; + case REX_BEG: + if ((!(rex->flags & REG_NEWLINE) || s <= env->beg || *(s - 1) != '\n') && ((env->flags & REG_NOTBOL) || s != env->beg)) + return NONE; + break; + case REX_CLASS: + if (LEADING(env, rex, s)) + return NONE; + n = rex->hi; + if (n > env->end - s) + n = env->end - s; + m = rex->lo; + if (m > n) + return NONE; + r = NONE; + if (!(rex->flags & REG_MINIMAL)) + { + for (i = 0; i < n; i++) + if (!settst(rex->re.charclass, s[i])) + { + n = i; + break; + } + for (s += n; n-- >= m; s--) + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + return BEST; + case GOOD: + r = GOOD; + break; + } + } + else + { + for (e = s + m; s < e; s++) + if (!settst(rex->re.charclass, *s)) + return r; + e += n - m; + for (;;) + { + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + if (s >= e || !settst(rex->re.charclass, *s)) + break; + s++; + } + } + return r; + case REX_COLL_CLASS: + if (LEADING(env, rex, s)) + return NONE; + n = rex->hi; + if (n > env->end - s) + n = env->end - s; + m = rex->lo; + if (m > n) + return NONE; + r = NONE; + e = env->end; + if (!(rex->flags & REG_MINIMAL)) + { + if (!(b = (unsigned char*)stkpush(stkstd, n))) + { + env->error = REG_ESPACE; + return BAD; + } + for (i = 0; s < e && i < n && collmatch(rex, s, e, &t); i++) + { + b[i] = t - s; + s = t; + } + for (; i-- >= rex->lo; s -= b[i]) + switch (follow(env, rex, cont, s)) + { + case BAD: + stkpop(stkstd); + return BAD; + case CUT: + stkpop(stkstd); + return CUT; + case BEST: + stkpop(stkstd); + return BEST; + case GOOD: + r = GOOD; + break; + } + stkpop(stkstd); + } + else + { + for (i = 0; i < m && s < e; i++, s = t) + if (!collmatch(rex, s, e, &t)) + return r; + while (i++ <= n) + { + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + if (s >= e || !collmatch(rex, s, e, &s)) + break; + } + } + return r; + case REX_CONJ: + next.type = REX_CONJ_RIGHT; + next.re.conj_right.cont = cont; + next.next = rex->next; + catcher.type = REX_CONJ_LEFT; + catcher.re.conj_left.right = rex->re.group.expr.binary.right; + catcher.re.conj_left.cont = &next; + catcher.re.conj_left.beg = s; + catcher.next = 0; + return parse(env, rex->re.group.expr.binary.left, &catcher, s); + case REX_CONJ_LEFT: + rex->re.conj_left.cont->re.conj_right.end = s; + cont = rex->re.conj_left.cont; + s = rex->re.conj_left.beg; + rex = rex->re.conj_left.right; + continue; + case REX_CONJ_RIGHT: + if (rex->re.conj_right.end != s) + return NONE; + cont = rex->re.conj_right.cont; + break; + case REX_DONE: + if (!env->stack) + return BEST; + n = s - env->beg; + r = env->nsub; + DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); + if ((i = env->best[0].rm_eo) >= 0) + { + if (rex->flags & REG_MINIMAL) + { + if (n > i) + return GOOD; + } + else + { + if (n < i) + return GOOD; + } + if (n == i && better(env, + (Pos_t*)env->bestpos->vec, + (Pos_t*)env->pos->vec, + (Pos_t*)env->bestpos->vec+env->bestpos->cur, + (Pos_t*)env->pos->vec+env->pos->cur, + 0) <= 0) + return GOOD; + } + env->best[0].rm_eo = n; + memcpy(&env->best[1], &env->match[1], r * sizeof(regmatch_t)); + n = env->pos->cur; + if (!vector(Pos_t, env->bestpos, n)) + { + env->error = REG_ESPACE; + return BAD; + } + env->bestpos->cur = n; + memcpy(env->bestpos->vec, env->pos->vec, n * sizeof(Pos_t)); + DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); + return GOOD; + case REX_DOT: + if (LEADING(env, rex, s)) + return NONE; + n = rex->hi; + if (n > env->end - s) + n = env->end - s; + m = rex->lo; + if (m > n) + return NONE; + if ((c = rex->explicit) >= 0 && !mbwide()) + for (i = 0; i < n; i++) + if (s[i] == c) + { + n = i; + break; + } + r = NONE; + if (!(rex->flags & REG_MINIMAL)) + { + if (!mbwide()) + { + for (s += n; n-- >= m; s--) + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + return BEST; + case GOOD: + r = GOOD; + break; + } + } + else + { + if (!(b = (unsigned char*)stkpush(stkstd, n))) + { + env->error = REG_ESPACE; + return BAD; + } + e = env->end; + for (i = 0; s < e && i < n && *s != c; i++) + s += b[i] = MBSIZE(s); + for (; i-- >= m; s -= b[i]) + switch (follow(env, rex, cont, s)) + { + case BAD: + stkpop(stkstd); + return BAD; + case CUT: + stkpop(stkstd); + return CUT; + case BEST: + stkpop(stkstd); + return BEST; + case GOOD: + r = GOOD; + break; + } + stkpop(stkstd); + } + } + else + { + if (!mbwide()) + { + e = s + n; + for (s += m; s <= e; s++) + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + } + else + { + e = env->end; + for (i = 0; s < e && i < m && *s != c; i++) + s += MBSIZE(s); + if (i >= m) + for (; s <= e && i <= n; s += MBSIZE(s), i++) + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + } + } + return r; + case REX_END: + if ((!(rex->flags & REG_NEWLINE) || *s != '\n') && ((env->flags & REG_NOTEOL) || s < env->end)) + return NONE; + break; + case REX_GROUP: +DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0)); + if (env->stack) + { + if (rex->re.group.number) + env->match[rex->re.group.number].rm_so = s - env->beg; + if (pospush(env, rex, s, BEG_SUB)) + return BAD; + catcher.re.group_catch.eo = rex->re.group.number ? &env->match[rex->re.group.number].rm_eo : (regoff_t*)0; + } + catcher.type = REX_GROUP_CATCH; + catcher.serial = rex->serial; + catcher.re.group_catch.cont = cont; + catcher.next = rex->next; + r = parse(env, rex->re.group.expr.rex, &catcher, s); + if (env->stack) + { + pospop(env); + if (rex->re.group.number) + env->match[rex->re.group.number].rm_so = -1; + } + return r; + case REX_GROUP_CATCH: +DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s=>%s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rexname(rex->re.group_catch.cont), env->end - s, s)),(0)); + if (env->stack) + { + if (rex->re.group_catch.eo) + *rex->re.group_catch.eo = s - env->beg; + if (pospush(env, rex, s, END_ANY)) + return BAD; + } + r = follow(env, rex, rex->re.group_catch.cont, s); + if (env->stack) + { + pospop(env); + if (rex->re.group_catch.eo) + *rex->re.group_catch.eo = -1; + } + return r; + case REX_GROUP_AHEAD: + catcher.type = REX_GROUP_AHEAD_CATCH; + catcher.flags = rex->flags; + catcher.serial = rex->serial; + catcher.re.rep_catch.beg = s; + catcher.re.rep_catch.cont = cont; + catcher.next = rex->next; + return parse(env, rex->re.group.expr.rex, &catcher, s); + case REX_GROUP_AHEAD_CATCH: + return follow(env, rex, rex->re.rep_catch.cont, rex->re.rep_catch.beg); + case REX_GROUP_AHEAD_NOT: + r = parse(env, rex->re.group.expr.rex, NiL, s); + if (r == NONE) + r = follow(env, rex, cont, s); + else if (r != BAD) + r = NONE; + return r; + case REX_GROUP_BEHIND: + if ((s - env->beg) < rex->re.group.size) + return NONE; + catcher.type = REX_GROUP_BEHIND_CATCH; + catcher.flags = rex->flags; + catcher.serial = rex->serial; + catcher.re.behind_catch.beg = s; + catcher.re.behind_catch.end = e = env->end; + catcher.re.behind_catch.cont = cont; + catcher.next = rex->next; + for (t = s - rex->re.group.size; t >= env->beg; t--) + { + env->end = s; + r = parse(env, rex->re.group.expr.rex, &catcher, t); + env->end = e; + if (r != NONE) + return r; + } + return NONE; + case REX_GROUP_BEHIND_CATCH: + if (s != rex->re.behind_catch.beg) + return NONE; + env->end = rex->re.behind_catch.end; + return follow(env, rex, rex->re.behind_catch.cont, rex->re.behind_catch.beg); + case REX_GROUP_BEHIND_NOT: + if ((s - env->beg) < rex->re.group.size) + r = NONE; + else + { + catcher.type = REX_GROUP_BEHIND_NOT_CATCH; + catcher.re.neg_catch.beg = s; + catcher.next = 0; + e = env->end; + env->end = s; + for (t = s - rex->re.group.size; t >= env->beg; t--) + { + r = parse(env, rex->re.group.expr.rex, &catcher, t); + if (r != NONE) + break; + } + env->end = e; + } + if (r == NONE) + r = follow(env, rex, cont, s); + else if (r != BAD) + r = NONE; + return r; + case REX_GROUP_BEHIND_NOT_CATCH: + return s == rex->re.neg_catch.beg ? GOOD : NONE; + case REX_GROUP_COND: + if (q = rex->re.group.expr.binary.right) + { + catcher.re.cond_catch.next[0] = q->re.group.expr.binary.right; + catcher.re.cond_catch.next[1] = q->re.group.expr.binary.left; + } + else + catcher.re.cond_catch.next[0] = catcher.re.cond_catch.next[1] = 0; + if (q = rex->re.group.expr.binary.left) + { + catcher.type = REX_GROUP_COND_CATCH; + catcher.flags = rex->flags; + catcher.serial = rex->serial; + catcher.re.cond_catch.yes = 0; + catcher.re.cond_catch.beg = s; + catcher.re.cond_catch.cont = cont; + catcher.next = rex->next; + r = parse(env, q, &catcher, s); + if (r == BAD || catcher.re.cond_catch.yes) + return r; + } + else if (!rex->re.group.size || rex->re.group.size > 0 && env->match[rex->re.group.size].rm_so >= 0) + r = GOOD; + else + r = NONE; + if (q = catcher.re.cond_catch.next[r != NONE]) + { + catcher.type = REX_CAT; + catcher.flags = q->flags; + catcher.serial = q->serial; + catcher.re.group_catch.cont = cont; + catcher.next = rex->next; + return parse(env, q, &catcher, s); + } + return follow(env, rex, cont, s); + case REX_GROUP_COND_CATCH: + rex->re.cond_catch.yes = 1; + catcher.type = REX_CAT; + catcher.flags = rex->flags; + catcher.serial = rex->serial; + catcher.re.group_catch.cont = rex->re.cond_catch.cont; + catcher.next = rex->next; + return parse(env, rex->re.cond_catch.next[1], &catcher, rex->re.cond_catch.beg); + case REX_CAT: + return follow(env, rex, rex->re.group_catch.cont, s); + case REX_GROUP_CUT: + catcher.type = REX_GROUP_CUT_CATCH; + catcher.flags = rex->flags; + catcher.serial = rex->serial; + catcher.re.group_catch.cont = cont; + catcher.next = rex->next; + return parse(env, rex->re.group.expr.rex, &catcher, s); + case REX_GROUP_CUT_CATCH: + switch (r = follow(env, rex, rex->re.group_catch.cont, s)) + { + case GOOD: + r = BEST; + break; + case NONE: + r = CUT; + break; + } + return r; + case REX_KMP: + f = rex->re.string.fail; + b = rex->re.string.base; + n = rex->re.string.size; + t = s; + e = env->end; + if (p = rex->map) + { + while (t + n <= e) + { + for (i = -1; t < e; t++) + { + while (i >= 0 && b[i+1] != p[*t]) + i = f[i]; + if (b[i+1] == p[*t]) + i++; + if (i + 1 == n) + { + t++; + if (env->stack) + env->best[0].rm_so = t - s - n; + switch (follow(env, rex, cont, t)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + t -= n - 1; + break; + } + } + } + } + else + { + while (t + n <= e) + { + for (i = -1; t < e; t++) + { + while (i >= 0 && b[i+1] != *t) + i = f[i]; + if (b[i+1] == *t) + i++; + if (i + 1 == n) + { + t++; + if (env->stack) + env->best[0].rm_so = t - s - n; + switch (follow(env, rex, cont, t)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + t -= n - 1; + break; + } + } + } + } + return NONE; + case REX_NEG: + if (LEADING(env, rex, s)) + return NONE; + i = env->end - s; + n = ((i + 7) >> 3) + 1; + catcher.type = REX_NEG_CATCH; + catcher.re.neg_catch.beg = s; + if (!(p = (unsigned char*)stkpush(stkstd, n))) + return BAD; + memset(catcher.re.neg_catch.index = p, 0, n); + catcher.next = rex->next; + if (parse(env, rex->re.group.expr.rex, &catcher, s) == BAD) + r = BAD; + else + { + r = NONE; + for (; i >= 0; i--) + if (!bittst(p, i)) + { + switch (follow(env, rex, cont, s + i)) + { + case BAD: + r = BAD; + break; + case BEST: + r = BEST; + break; + case CUT: + r = CUT; + break; + case GOOD: + r = GOOD; + /*FALLTHROUGH*/ + default: + continue; + } + break; + } + } + stkpop(stkstd); + return r; + case REX_NEG_CATCH: + bitset(rex->re.neg_catch.index, s - rex->re.neg_catch.beg); + return NONE; + case REX_NEST: + if (s >= env->end) + return NONE; + do + { + if ((c = *s++) == rex->re.nest.primary) + { + if (s >= env->end || !(s = nestmatch(s, env->end, rex->re.nest.type, c))) + return NONE; + break; + } + if (rex->re.nest.primary >= 0) + return NONE; + if (rex->re.nest.type[c] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)) + break; + if (!(s = nestmatch(s, env->end, rex->re.nest.type, c))) + return NONE; + } while (s < env->end && !(rex->re.nest.type[*(s-1)] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))); + break; + case REX_NULL: + break; + case REX_ONECHAR: + n = rex->hi; + if (n > env->end - s) + n = env->end - s; + m = rex->lo; + if (m > n) + return NONE; + r = NONE; + c = rex->re.onechar; + if (!(rex->flags & REG_MINIMAL)) + { + if (!mbwide()) + { + if (p = rex->map) + { + for (i = 0; i < n; i++, s++) + if (p[*s] != c) + break; + } + else + { + for (i = 0; i < n; i++, s++) + if (*s != c) + break; + } + for (; i-- >= m; s--) + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case BEST: + return BEST; + case CUT: + return CUT; + case GOOD: + r = GOOD; + break; + } + } + else + { + if (!(b = (unsigned char*)stkpush(stkstd, n))) + { + env->error = REG_ESPACE; + return BAD; + } + e = env->end; + if (!(rex->flags & REG_ICASE)) + { + for (i = 0; s < e && i < n; i++, s = t) + { + t = s; + if (mbchar(t) != c) + break; + b[i] = t - s; + } + } + else + { + for (i = 0; s < e && i < n; i++, s = t) + { + t = s; + if (towupper(mbchar(t)) != c) + break; + b[i] = t - s; + } + } + for (; i-- >= m; s -= b[i]) + switch (follow(env, rex, cont, s)) + { + case BAD: + stkpop(stkstd); + return BAD; + case BEST: + stkpop(stkstd); + return BEST; + case CUT: + stkpop(stkstd); + return CUT; + case GOOD: + r = GOOD; + break; + } + stkpop(stkstd); + } + } + else + { + if (!mbwide()) + { + e = s + m; + if (p = rex->map) + { + for (; s < e; s++) + if (p[*s] != c) + return r; + e += n - m; + for (;;) + { + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + if (s >= e || p[*s++] != c) + break; + } + } + else + { + for (; s < e; s++) + if (*s != c) + return r; + e += n - m; + for (;;) + { + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + if (s >= e || *s++ != c) + break; + } + } + } + else + { + if (!(rex->flags & REG_ICASE)) + { + for (i = 0; i < m && s < e; i++, s = t) + { + t = s; + if (mbchar(t) != c) + return r; + } + while (i++ <= n) + { + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + if (s >= e || mbchar(s) != c) + break; + } + } + else + { + for (i = 0; i < m && s < e; i++, s = t) + { + t = s; + if (towupper(mbchar(t)) != c) + return r; + } + while (i++ <= n) + { + switch (follow(env, rex, cont, s)) + { + case BAD: + return BAD; + case CUT: + return CUT; + case BEST: + case GOOD: + return BEST; + } + if (s >= e || towupper(mbchar(s)) != c) + break; + } + } + } + } + return r; + case REX_REP: + if (env->stack && pospush(env, rex, s, BEG_REP)) + return BAD; + r = parserep(env, rex, cont, s, 0); + if (env->stack) + pospop(env); + return r; + case REX_REP_CATCH: + DEBUG_TEST(0x0020,(sfprintf(sfstdout, "AHA#%04d 0x%04x %s n %d len %d s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.rep_catch.n, s - rex->re.rep_catch.beg, env->end - s, s)),(0)); + if (env->stack && pospush(env, rex, s, END_ANY)) + return BAD; + if (s == rex->re.rep_catch.beg && rex->re.rep_catch.n > rex->re.rep_catch.ref->lo) + { + /* + * optional empty iteration + */ + +DEBUG_TEST(0x0002,(sfprintf(sfstdout, "AHA#%04d %p re.group.back=%d re.group.expr.rex=%s\n", __LINE__, rex->re.rep_catch.ref->re.group.expr.rex, rex->re.rep_catch.ref->re.group.expr.rex->re.group.back, rexname(rex->re.rep_catch.ref->re.group.expr.rex))),(0)); + if (!env->stack || s != rex->re.rep_catch.ref->re.rep_catch.beg && !rex->re.rep_catch.ref->re.group.expr.rex->re.group.back) + r = NONE; + else if (pospush(env, rex, s, END_ANY)) + r = BAD; + else + { + r = follow(env, rex, rex->re.rep_catch.cont, s); + pospop(env); + } + } + else + r = parserep(env, rex->re.rep_catch.ref, rex->re.rep_catch.cont, s, rex->re.rep_catch.n); + if (env->stack) + pospop(env); + return r; + case REX_STRING: +DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s \"%-.*s\" `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.string.size, rex->re.string.base, env->end - s, s)),(0)); + if (rex->re.string.size > (env->end - s)) + return NONE; + t = rex->re.string.base; + e = t + rex->re.string.size; + if (!(p = rex->map)) + { + while (t < e) + if (*s++ != *t++) + return NONE; + } + else if (!mbwide()) + { + while (t < e) + if (p[*s++] != *t++) + return NONE; + } + else + { + while (t < e) + { + c = mbchar(s); + d = mbchar(t); + if (towupper(c) != d) + return NONE; + } + } + break; + case REX_TRIE: + if (((s + rex->re.trie.min) > env->end) || !(x = rex->re.trie.root[rex->map ? rex->map[*s] : *s])) + return NONE; + return parsetrie(env, x, rex, cont, s); + case REX_EXEC: + u = 0; + r = (*env->disc->re_execf)(env->regex, rex->re.exec.data, rex->re.exec.text, rex->re.exec.size, (const char*)s, env->end - s, &u, env->disc); + e = (unsigned char*)u; + if (e >= s && e <= env->end) + s = e; + switch (r) + { + case 0: + break; + case REG_NOMATCH: + return NONE; + default: + env->error = r; + return BAD; + } + break; + case REX_WBEG: + if (!isword(*s) || s > env->beg && isword(*(s - 1))) + return NONE; + break; + case REX_WEND: + if (isword(*s) || s > env->beg && !isword(*(s - 1))) + return NONE; + break; + case REX_WORD: + if (s > env->beg && isword(*(s - 1)) == isword(*s)) + return NONE; + break; + case REX_WORD_NOT: + if (s == env->beg || isword(*(s - 1)) != isword(*s)) + return NONE; + break; + case REX_BEG_STR: + if (s != env->beg) + return NONE; + break; + case REX_END_STR: + for (t = s; t < env->end && *t == '\n'; t++); + if (t < env->end) + return NONE; + break; + case REX_FIN_STR: + if (s < env->end) + return NONE; + break; + } + if (!(rex = rex->next)) + { + if (!(rex = cont)) + break; + cont = 0; + } + } + return GOOD; +} + +#if _AST_REGEX_DEBUG + +static void +listnode(Rex_t* e, int level) +{ + int i; + + if (e) + { + do + { + for (i = 0; i < level; i++) + sfprintf(sfstderr, " "); + sfprintf(sfstderr, "%s\n", rexname(e)); + switch (e->type) + { + case REX_ALT: + case REX_CONJ: + listnode(e->re.group.expr.binary.left, level + 1); + listnode(e->re.group.expr.binary.right, level + 1); + break; + case REX_GROUP: + case REX_GROUP_AHEAD: + case REX_GROUP_AHEAD_NOT: + case REX_GROUP_BEHIND: + case REX_GROUP_BEHIND_NOT: + case REX_GROUP_CUT: + case REX_NEG: + case REX_REP: + listnode(e->re.group.expr.rex, level + 1); + break; + } + } while (e = e->next); + } +} + +static int +list(Env_t* env, Rex_t* rex) +{ + sfprintf(sfstderr, "AHA regex hard=%d stack=%p\n", env->hard, env->stack); + if (rex) + listnode(rex, 1); + return 0; +} + +#endif + +/* + * returning REG_BADPAT or REG_ESPACE is not explicitly + * countenanced by the standard + */ + +int +regnexec(const regex_t* p, const char* s, size_t len, size_t nmatch, regmatch_t* match, regflags_t flags) +{ + register int n; + register int i; + int j; + int k; + int m; + int advance; + Env_t* env; + Rex_t* e; + + DEBUG_INIT(); + DEBUG_TEST(0x0001,(sfprintf(sfstdout, "AHA#%04d 0x%04x regnexec %d 0x%08x `%-.*s'\n", __LINE__, debug_flag, nmatch, flags, len, s)),(0)); + if (!p || !(env = p->env)) + return REG_BADPAT; + if (!s) + return fatal(env->disc, REG_BADPAT, NiL); + if (len < env->min) + { + DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, env->min)),(0)); + return REG_NOMATCH; + } + env->regex = p; + env->beg = (unsigned char*)s; + env->end = env->beg + len; + stknew(stkstd, &env->stk); + env->flags &= ~REG_EXEC; + env->flags |= (flags & REG_EXEC); + advance = 0; + if (env->stack = env->hard || !(env->flags & REG_NOSUB) && nmatch) + { + n = env->nsub; + if (!(env->match = (regmatch_t*)stkpush(stkstd, 2 * (n + 1) * sizeof(regmatch_t))) || + !env->pos && !(env->pos = vecopen(16, sizeof(Pos_t))) || + !env->bestpos && !(env->bestpos = vecopen(16, sizeof(Pos_t)))) + { + k = REG_ESPACE; + goto done; + } + env->pos->cur = env->bestpos->cur = 0; + env->best = &env->match[n + 1]; + env->best[0].rm_so = 0; + env->best[0].rm_eo = -1; + for (i = 0; i <= n; i++) + env->match[i] = state.nomatch; + if (flags & REG_ADVANCE) + advance = 1; + } + DEBUG_TEST(0x1000,(list(env,env->rex)),(0)); + k = REG_NOMATCH; + if ((e = env->rex)->type == REX_BM) + { + DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM\n", __LINE__)),(0)); + if (len < e->re.bm.right) + { + DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, e->re.bm.right)),(0)); + goto done; + } + else if (!(flags & REG_LEFT)) + { + register unsigned char* buf = (unsigned char*)s; + register size_t index = e->re.bm.left + e->re.bm.size; + register size_t mid = len - e->re.bm.right; + register size_t* skip = e->re.bm.skip; + register size_t* fail = e->re.bm.fail; + register Bm_mask_t** mask = e->re.bm.mask; + Bm_mask_t m; + size_t x; + + DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM len=%d right=%d left=%d size=%d %d %d\n", __LINE__, len, e->re.bm.right, e->re.bm.left, e->re.bm.size, index, mid)),(0)); + for (;;) + { + while ((index += skip[buf[index]]) < mid); + if (index < HIT) + { + DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, index, HIT)),(0)); + goto done; + } + index -= HIT; + m = mask[n = e->re.bm.size - 1][buf[index]]; + do + { + if (!n--) + { + if (e->re.bm.back < 0) + goto possible; + if (advance) + { + i = index - e->re.bm.back; + s += i; + if (env->stack) + env->best[0].rm_so += i; + goto possible; + } + x = index; + if (index < e->re.bm.back) + index = 0; + else + index -= e->re.bm.back; + while (index <= x) + { + if ((i = parse(env, e->next, &env->done, buf + index)) != NONE) + { + if (env->stack) + env->best[0].rm_so = index; + n = env->nsub; + goto hit; + } + index++; + } + index += e->re.bm.size; + break; + } + } while (m &= mask[n][buf[--index]]); + if ((index += fail[n + 1]) >= len) + goto done; + } + } + possible: + n = env->nsub; + e = e->next; + } + j = env->once || (flags & REG_LEFT); + DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d parse once=%d\n", __LINE__, j)),(0)); + while ((i = parse(env, e, &env->done, (unsigned char*)s)) == NONE || advance && !env->best[0].rm_eo && !(advance = 0)) + { + if (j) + goto done; + i = MBSIZE(s); + s += i; + if ((unsigned char*)s > env->end - env->min) + goto done; + if (env->stack) + env->best[0].rm_so += i; + } + if ((flags & REG_LEFT) && env->stack && env->best[0].rm_so) + goto done; + hit: + if (k = env->error) + goto done; + if (i == CUT) + { + k = env->error = REG_NOMATCH; + goto done; + } + if (!(env->flags & REG_NOSUB)) + { + k = (env->flags & (REG_SHELL|REG_AUGMENTED)) == (REG_SHELL|REG_AUGMENTED); + for (i = j = m = 0; j < nmatch; i++) + if (!i || !k || (i & 1)) + { + if (i > n) + match[j] = state.nomatch; + else + match[m = j] = env->best[i]; + j++; + } + if (k) + { + while (m > 0 && match[m].rm_so == -1 && match[m].rm_eo == -1) + m--; + ((regex_t*)p)->re_nsub = m; + } + } + k = 0; + done: + stkold(stkstd, &env->stk); + env->stk.base = 0; + if (k > REG_NOMATCH) + fatal(p->env->disc, k, NiL); + return k; +} + +void +regfree(regex_t* p) +{ + Env_t* env; + + if (p && (env = p->env)) + { +#if _REG_subcomp + if (env->sub) + { + regsubfree(p); + p->re_sub = 0; + } +#endif + p->env = 0; + if (--env->refs <= 0 && !(env->disc->re_flags & REG_NOFREE)) + { + drop(env->disc, env->rex); + if (env->pos) + vecclose(env->pos); + if (env->bestpos) + vecclose(env->bestpos); + if (env->stk.base) + stkold(stkstd, &env->stk); + alloc(env->disc, env, 0); + } + } +} diff --git a/src/lib/libast/regex/regrecord.c b/src/lib/libast/regex/regrecord.c new file mode 100644 index 0000000..001d14c --- /dev/null +++ b/src/lib/libast/regex/regrecord.c @@ -0,0 +1,34 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * return 1 if regrexec() can be used + */ + +#include "reglib.h" + +int +regrecord(const regex_t* p) +{ + return p && p->env && p->env->rex->type == REX_BM; +} diff --git a/src/lib/libast/regex/regrexec.c b/src/lib/libast/regex/regrexec.c new file mode 100644 index 0000000..6cb1272 --- /dev/null +++ b/src/lib/libast/regex/regrexec.c @@ -0,0 +1,145 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex record executor + * multiple record sized-buffer interface + */ + +#include "reglib.h" + +/* + * call regnexec() on records selected by Boyer-Moore + */ + +int +regrexec(const regex_t* p, const char* s, size_t len, size_t nmatch, regmatch_t* match, regflags_t flags, int sep, void* handle, regrecord_t record) +{ + register unsigned char* buf = (unsigned char*)s; + register unsigned char* beg; + register unsigned char* l; + register unsigned char* r; + register unsigned char* x; + register size_t* skip; + register size_t* fail; + register Bm_mask_t** mask; + register size_t index; + register int n; + unsigned char* end; + size_t mid; + int complete; + int exactlen; + int leftlen; + int rightlen; + int inv; + Bm_mask_t m; + Env_t* env; + Rex_t* e; + + if (!s || !p || !(env = p->env) || (e = env->rex)->type != REX_BM) + return REG_BADPAT; + inv = (flags & REG_INVERT) != 0; + buf = beg = (unsigned char*)s; + end = buf + len; + mid = (len < e->re.bm.right) ? 0 : (len - e->re.bm.right); + skip = e->re.bm.skip; + fail = e->re.bm.fail; + mask = e->re.bm.mask; + complete = e->re.bm.complete && !nmatch; + exactlen = e->re.bm.size; + leftlen = e->re.bm.left + exactlen; + rightlen = exactlen + e->re.bm.right; + index = leftlen++; + for (;;) + { + while ((index += skip[buf[index]]) < mid); + if (index < HIT) + goto impossible; + index -= HIT; + m = mask[n = exactlen - 1][buf[index]]; + do + { + if (!n--) + goto possible; + } while (m &= mask[n][buf[--index]]); + if ((index += fail[n + 1]) < len) + continue; + impossible: + if (inv) + { + l = r = buf + len; + goto invert; + } + n = 0; + goto done; + possible: + r = (l = buf + index) + exactlen; + while (l > beg) + if (*--l == sep) + { + l++; + break; + } + if ((r - l) < leftlen) + goto spanned; + while (r < end && *r != sep) + r++; + if ((r - (buf + index)) < rightlen) + goto spanned; + if (complete || (env->rex = ((r - l) > 128) ? e : e->next) && !(n = regnexec(p, (char*)l, r - l, nmatch, match, flags))) + { + if (inv) + { + invert: + x = beg; + while (beg < l) + { + while (x < l && *x != sep) + x++; + if (n = (*record)(handle, (char*)beg, x - beg)) + goto done; + beg = ++x; + } + } + else if (n = (*record)(handle, (char*)l, r - l)) + goto done; + if ((index = (r - buf) + leftlen) >= len) + { + n = (inv && (++r - buf) < len) ? (*record)(handle, (char*)r, (buf + len) - r): 0; + goto done; + } + beg = r + 1; + } + else if (n != REG_NOMATCH) + goto done; + else + { + spanned: + if ((index += exactlen) >= mid) + goto impossible; + } + } + done: + env->rex = e; + return n; +} diff --git a/src/lib/libast/regex/regstat.c b/src/lib/libast/regex/regstat.c new file mode 100644 index 0000000..c7f725f --- /dev/null +++ b/src/lib/libast/regex/regstat.c @@ -0,0 +1,53 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * return p stat info + */ + +#include "reglib.h" + +regstat_t* +regstat(const regex_t* p) +{ + register Rex_t* e; + + p->env->stats.re_flags = p->env->flags; + p->env->stats.re_info = 0; + e = p->env->rex; + if (e && e->type == REX_BM) + { + p->env->stats.re_record = p->env->rex->re.bm.size; + e = e->next; + } + else + p->env->stats.re_record = 0; + if (e && e->type == REX_BEG) + e = e->next; + if (e && e->type == REX_STRING) + e = e->next; + if (!e || e->type == REX_END && !e->next) + p->env->stats.re_info |= REG_LITERAL; + p->env->stats.re_record = (p && p->env && p->env->rex->type == REX_BM) ? p->env->rex->re.bm.size : -1; + return &p->env->stats; +} diff --git a/src/lib/libast/regex/regsub.c b/src/lib/libast/regex/regsub.c new file mode 100644 index 0000000..8e89bea --- /dev/null +++ b/src/lib/libast/regex/regsub.c @@ -0,0 +1,269 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * OBSOLETE Sfio_t buffer interface -- use regsubcomp(),regsubexec() + */ + +#include "reglib.h" + +/* + * do a single substitution + */ + +static int +subold(register Sfio_t* dp, const char* op, register const char* sp, size_t nmatch, register regmatch_t* match, register regflags_t flags, int sre) +{ + register int c; + char* s; + char* e; + const char* b; + regflags_t f; + + f = flags &= (REG_SUB_LOWER|REG_SUB_UPPER); + for (;;) + { + switch (c = *sp++) + { + case 0: + return 0; + case '~': + if (!sre || *sp != '(') + { + sfputc(dp, c); + continue; + } + b = sp - 1; + sp++; + break; + case '\\': + if (sre) + { + sfputc(dp, chresc(sp - 1, &s)); + sp = (const char*)s; + continue; + } + if (*sp == '&') + { + c = *sp++; + sfputc(dp, c); + continue; + } + break; + case '&': + if (sre) + { + sfputc(dp, c); + continue; + } + sp--; + break; + default: + switch (flags) + { + case REG_SUB_UPPER: + if (islower(c)) + c = toupper(c); + break; + case REG_SUB_LOWER: + if (isupper(c)) + c = tolower(c); + break; + case REG_SUB_UPPER|REG_SUB_LOWER: + if (isupper(c)) + c = tolower(c); + else if (islower(c)) + c = toupper(c); + break; + } + sfputc(dp, c); + continue; + } + switch (c = *sp++) + { + case 0: + sp--; + continue; + case '&': + c = 0; + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + c -= '0'; + if (sre) + while (isdigit(*sp)) + c = c * 10 + *sp++ - '0'; + break; + case 'l': + if (sre && *sp != ')') + { + c = -1; + break; + } + if (c = *sp) + { + sp++; + if (isupper(c)) + c = tolower(c); + sfputc(dp, c); + } + continue; + case 'u': + if (sre) + { + if (*sp != ')') + { + c = -1; + break; + } + sp++; + } + if (c = *sp) + { + sp++; + if (islower(c)) + c = toupper(c); + sfputc(dp, c); + } + continue; + case 'E': + if (sre) + { + if (*sp != ')') + { + c = -1; + break; + } + sp++; + } + flags = f; + continue; + case 'L': + if (sre) + { + if (*sp != ')') + { + c = -1; + break; + } + sp++; + } + f = flags; + flags = REG_SUB_LOWER; + continue; + case 'U': + if (sre) + { + if (*sp != ')') + { + c = -1; + break; + } + sp++; + } + f = flags; + flags = REG_SUB_UPPER; + continue; + default: + if (!sre) + { + sfputc(dp, chresc(sp - 2, &s)); + sp = (const char*)s; + continue; + } + sp--; + c = -1; + break; + } + if (sre) + { + if (c < 0 || *sp != ')') + { + for (; b < sp; b++) + sfputc(dp, *b); + continue; + } + sp++; + } + if (c >= nmatch) + return REG_ESUBREG; + s = (char*)op + match[c].rm_so; + e = (char*)op + match[c].rm_eo; + while (s < e) + { + c = *s++; + switch (flags) + { + case REG_SUB_UPPER: + if (islower(c)) + c = toupper(c); + break; + case REG_SUB_LOWER: + if (isupper(c)) + c = tolower(c); + break; + case REG_SUB_UPPER|REG_SUB_LOWER: + if (isupper(c)) + c = tolower(c); + else if (islower(c)) + c = toupper(c); + break; + } + sfputc(dp, c); + } + } +} + +/* + * ed(1) style substitute using matches from last regexec() + */ + +int +regsub(const regex_t* p, Sfio_t* dp, const char* op, const char* sp, size_t nmatch, regmatch_t* match, regflags_t flags) +{ + int m; + int r; + int sre; + + if ((p->env->flags & REG_NOSUB) || !nmatch) + return fatal(p->env->disc, REG_BADPAT, NiL); + m = (flags >> 16) & 0x3fff; + sre = !!(p->env->flags & REG_SHELL); + r = 0; + do + { + if (--m > 0) + sfwrite(dp, op, match->rm_eo); + else + { + sfwrite(dp, op, match->rm_so); + if (r = subold(dp, op, sp, nmatch, match, flags, sre)) + return fatal(p->env->disc, r, NiL); + } + op += match->rm_eo; + } while ((m > 0 || (flags & REG_SUB_ALL)) && !(r = regexec(p, op, nmatch, match, p->env->flags|(match->rm_so == match->rm_eo ? REG_ADVANCE : 0)))); + if (r && r != REG_NOMATCH) + return fatal(p->env->disc, r, NiL); + sfputr(dp, op, -1); + return 0; +} diff --git a/src/lib/libast/regex/regsubcomp.c b/src/lib/libast/regex/regsubcomp.c new file mode 100644 index 0000000..73bde4c --- /dev/null +++ b/src/lib/libast/regex/regsubcomp.c @@ -0,0 +1,377 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex ed(1) style substitute compile + */ + +#include "reglib.h" + +static const regflags_t submap[] = +{ + 'g', REG_SUB_ALL, + 'l', REG_SUB_LOWER, + 'n', REG_SUB_NUMBER, + 'p', REG_SUB_PRINT, + 's', REG_SUB_STOP, + 'u', REG_SUB_UPPER, + 'w', REG_SUB_WRITE|REG_SUB_LAST, + 0, 0 +}; + +int +regsubflags(regex_t* p, register const char* s, char** e, int delim, register const regflags_t* map, int* pm, regflags_t* pf) +{ + register int c; + register const regflags_t* m; + regflags_t flags; + int minmatch; + regdisc_t* disc; + + flags = pf ? *pf : 0; + minmatch = pm ? *pm : 0; + if (!map) + map = submap; + while (!(flags & REG_SUB_LAST)) + { + if (!(c = *s++) || c == delim) + { + s--; + break; + } + else if (c >= '0' && c <= '9') + { + if (minmatch) + { + disc = p->env->disc; + regfree(p); + return fatal(disc, REG_EFLAGS, s - 1); + } + minmatch = c - '0'; + while (*s >= '0' && *s <= '9') + minmatch = minmatch * 10 + *s++ - '0'; + } + else + { + for (m = map; *m; m++) + if (*m++ == c) + { + if (flags & *m) + { + disc = p->env->disc; + regfree(p); + return fatal(disc, REG_EFLAGS, s - 1); + } + flags |= *m--; + break; + } + if (!*m) + { + s--; + break; + } + } + } + if (pf) + *pf = flags; + if (pm) + *pm = minmatch; + if (e) + *e = (char*)s; + return 0; +} + +/* + * compile substitute rhs and optional flags + */ + +int +regsubcomp(regex_t* p, register const char* s, const regflags_t* map, int minmatch, regflags_t flags) +{ + register regsub_t* sub; + register int c; + register int d; + register char* t; + register regsubop_t* op; + char* e; + const char* r; + int sre; + int f; + int g; + int n; + int nops; + const char* o; + regdisc_t* disc; + + disc = p->env->disc; + if (p->env->flags & REG_NOSUB) + { + regfree(p); + return fatal(disc, REG_BADPAT, NiL); + } + if (!(sub = (regsub_t*)alloc(p->env->disc, 0, sizeof(regsub_t) + strlen(s))) || !(sub->re_ops = (regsubop_t*)alloc(p->env->disc, 0, (nops = 8) * sizeof(regsubop_t)))) + { + if (sub) + alloc(p->env->disc, sub, 0); + regfree(p); + return fatal(disc, REG_ESPACE, s); + } + sub->re_buf = sub->re_end = 0; + p->re_sub = sub; + p->env->sub = 1; + op = sub->re_ops; + o = s; + if (!(p->env->flags & REG_DELIMITED)) + d = 0; + else + switch (d = *(s - 1)) + { + case '\\': + case '\n': + case '\r': + regfree(p); + return fatal(disc, REG_EDELIM, s); + } + sre = p->env->flags & REG_SHELL; + t = sub->re_rhs; + if (d) + { + r = s; + for (;;) + { + if (!*s) + { + if (p->env->flags & REG_MUSTDELIM) + { + regfree(p); + return fatal(disc, REG_EDELIM, r); + } + break; + } + else if (*s == d) + { + flags |= REG_SUB_FULL; + s++; + break; + } + else if (*s++ == '\\' && !*s++) + { + regfree(p); + return fatal(disc, REG_EESCAPE, r); + } + } + if (*s) + { + if (n = regsubflags(p, s, &e, d, map, &minmatch, &flags)) + return n; + s = (const char*)e; + } + p->re_npat = s - o; + s = r; + } + else + p->re_npat = 0; + op->op = f = g = flags & (REG_SUB_LOWER|REG_SUB_UPPER); + op->off = 0; + while ((c = *s++) != d) + { + again: + if (!c) + { + p->re_npat = s - o - 1; + break; + } + else if (c == '\\') + { + if (*s == c) + { + *t++ = *s++; + continue; + } + if ((c = *s++) == d) + goto again; + if (!c) + { + regfree(p); + return fatal(disc, REG_EESCAPE, s - 2); + } + if (c == '&') + { + *t++ = c; + continue; + } + } + else if (c == '&') + { + if (sre) + { + *t++ = c; + continue; + } + } + else + { + switch (op->op) + { + case REG_SUB_UPPER: + if (islower(c)) + c = toupper(c); + break; + case REG_SUB_LOWER: + if (isupper(c)) + c = tolower(c); + break; + case REG_SUB_UPPER|REG_SUB_LOWER: + if (isupper(c)) + c = tolower(c); + else if (islower(c)) + c = toupper(c); + break; + } + *t++ = c; + continue; + } + switch (c) + { + case 0: + s--; + continue; + case '&': + c = 0; + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + c -= '0'; + if (isdigit(*s) && (p->env->flags & REG_MULTIREF)) + c = c * 10 + *s++ - '0'; + break; + case 'l': + if (c = *s) + { + s++; + if (isupper(c)) + c = tolower(c); + *t++ = c; + } + continue; + case 'u': + if (c = *s) + { + s++; + if (islower(c)) + c = toupper(c); + *t++ = c; + } + continue; + case 'E': + f = g; + set: + if ((op->len = (t - sub->re_rhs) - op->off) && (n = ++op - sub->re_ops) >= nops) + { + if (!(sub->re_ops = (regsubop_t*)alloc(p->env->disc, sub->re_ops, (nops *= 2) * sizeof(regsubop_t)))) + { + regfree(p); + return fatal(disc, REG_ESPACE, NiL); + } + op = sub->re_ops + n; + } + op->op = f; + op->off = t - sub->re_rhs; + continue; + case 'L': + g = f; + f = REG_SUB_LOWER; + goto set; + case 'U': + g = f; + f = REG_SUB_UPPER; + goto set; + default: + if (!sre) + { + *t++ = chresc(s - 2, &e); + s = (const char*)e; + continue; + } + s--; + c = -1; + break; + } + if (c > p->re_nsub) + { + regfree(p); + return fatal(disc, REG_ESUBREG, s - 1); + } + if ((n = op - sub->re_ops) >= (nops - 2)) + { + if (!(sub->re_ops = (regsubop_t*)alloc(p->env->disc, sub->re_ops, (nops *= 2) * sizeof(regsubop_t)))) + { + regfree(p); + return fatal(disc, REG_ESPACE, NiL); + } + op = sub->re_ops + n; + } + if (op->len = (t - sub->re_rhs) - op->off) + op++; + op->op = f; + op->off = c; + op->len = 0; + op++; + op->op = f; + op->off = t - sub->re_rhs; + } + if ((op->len = (t - sub->re_rhs) - op->off) && (n = ++op - sub->re_ops) >= nops) + { + if (!(sub->re_ops = (regsubop_t*)alloc(p->env->disc, sub->re_ops, (nops *= 2) * sizeof(regsubop_t)))) + { + regfree(p); + return fatal(disc, REG_ESPACE, NiL); + } + op = sub->re_ops + n; + } + op->len = -1; + sub->re_flags = flags; + sub->re_min = minmatch; + return 0; +} + +void +regsubfree(regex_t* p) +{ + Env_t* env; + regsub_t* sub; + + if (p && (env = p->env) && env->sub && (sub = p->re_sub)) + { + env->sub = 0; + p->re_sub = 0; + if (!(env->disc->re_flags & REG_NOFREE)) + { + if (sub->re_buf) + alloc(env->disc, sub->re_buf, 0); + if (sub->re_ops) + alloc(env->disc, sub->re_ops, 0); + alloc(env->disc, sub, 0); + } + } +} diff --git a/src/lib/libast/regex/regsubexec.c b/src/lib/libast/regex/regsubexec.c new file mode 100644 index 0000000..78caeba --- /dev/null +++ b/src/lib/libast/regex/regsubexec.c @@ -0,0 +1,196 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * posix regex ed(1) style substitute execute + */ + +#include "reglib.h" + +#define NEED(p,b,n,r) \ + do \ + { \ + if (((b)->re_end - (b)->re_cur) < (n)) \ + { \ + size_t o = (b)->re_cur - (b)->re_buf; \ + size_t a = ((b)->re_end - (b)->re_buf); \ + if (a < n) \ + a = roundof(n, 128); \ + a *= 2; \ + if (!((b)->re_buf = alloc(p->env->disc, (b)->re_buf, a))) \ + { \ + (b)->re_buf = (b)->re_cur = (b)->re_end = 0; \ + c = REG_ESPACE; \ + r; \ + } \ + (b)->re_cur = (b)->re_buf + o; \ + (b)->re_end = (b)->re_buf + a; \ + } \ + } while (0) + +#define PUTC(p,b,x,r) \ + do \ + { \ + NEED(p, b, 1, r); \ + *(b)->re_cur++ = (x); \ + } while (0) + +#define PUTS(p,b,x,z,r) \ + do if (z) \ + { \ + NEED(p, b, z, r); \ + memcpy((b)->re_cur, x, z); \ + (b)->re_cur += (z); \ + } while (0) + +/* + * do a single substitution + */ + +static int +sub(const regex_t* p, register regsub_t* b, const char* ss, register regsubop_t* op, size_t nmatch, register regmatch_t* match) +{ + register char* s; + register char* e; + register int c; + + for (;; op++) + { + switch (op->len) + { + case -1: + break; + case 0: + if (op->off >= nmatch) + return REG_ESUBREG; + if ((c = match[op->off].rm_so) < 0) + continue; + s = (char*)ss + c; + if ((c = match[op->off].rm_eo) < 0) + continue; + e = (char*)ss + c; + NEED(p, b, e - s, return c); + switch (op->op) + { + case REG_SUB_UPPER: + while (s < e) + { + c = *s++; + if (islower(c)) + c = toupper(c); + *b->re_cur++ = c; + } + break; + case REG_SUB_LOWER: + while (s < e) + { + c = *s++; + if (isupper(c)) + c = tolower(c); + *b->re_cur++ = c; + } + break; + case REG_SUB_UPPER|REG_SUB_LOWER: + while (s < e) + { + c = *s++; + if (isupper(c)) + c = tolower(c); + else if (islower(c)) + c = toupper(c); + *b->re_cur++ = c; + } + break; + default: + while (s < e) + *b->re_cur++ = *s++; + break; + } + continue; + default: + NEED(p, b, op->len, return c); + s = b->re_rhs + op->off; + e = s + op->len; + while (s < e) + *b->re_cur++ = *s++; + continue; + } + break; + } + return 0; +} + +/* + * ed(1) style substitute using matches from last regexec() + */ + +int +regsubexec(const regex_t* p, const char* s, size_t nmatch, regmatch_t* match) +{ + register int c; + register regsub_t* b; + const char* e; + int m; + + if (!p->env->sub || (p->env->flags & REG_NOSUB) || !nmatch) + return fatal(p->env->disc, REG_BADPAT, NiL); + b = p->re_sub; + m = b->re_min; + b->re_cur = b->re_buf; + e = (const char*)p->env->end; + c = 0; + for (;;) + { + if (--m > 0) + PUTS(p, b, s, match->rm_eo, return fatal(p->env->disc, c, NiL)); + else + { + PUTS(p, b, s, match->rm_so, return fatal(p->env->disc, c, NiL)); + if (!c && (c = sub(p, b, s, b->re_ops, nmatch, match))) + return fatal(p->env->disc, c, NiL); + } + s += match->rm_eo; + if (m <= 0 && !(b->re_flags & REG_SUB_ALL) || !*s) + break; + if (c = regnexec(p, s, e - s, nmatch, match, p->env->flags|(match->rm_so == match->rm_eo ? REG_ADVANCE : 0))) + { + if (c != REG_NOMATCH) + return fatal(p->env->disc, c, NiL); + break; + } + if (!match->rm_so && !match->rm_eo && *s && m <= 1) + { + match->rm_so = match->rm_eo = 1; + c = 1; + } + } + while (s < e) + { + c = *s++; + PUTC(p, b, c, return fatal(p->env->disc, c, NiL)); + } + NEED(p, b, 1, return fatal(p->env->disc, c, NiL)); + *b->re_cur = 0; + b->re_len = b->re_cur - b->re_buf; + return 0; +} |