diff options
Diffstat (limited to 'src/lib/libast/astsa/strmatch.c')
-rw-r--r-- | src/lib/libast/astsa/strmatch.c | 597 |
1 files changed, 597 insertions, 0 deletions
diff --git a/src/lib/libast/astsa/strmatch.c b/src/lib/libast/astsa/strmatch.c new file mode 100644 index 0000000..f6a9ff3 --- /dev/null +++ b/src/lib/libast/astsa/strmatch.c @@ -0,0 +1,597 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2011 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * D. G. Korn + * G. S. Fowler + * AT&T Research + * + * match shell file patterns -- derived from Bourne and Korn shell gmatch() + * + * sh pattern egrep RE description + * ---------- -------- ----------- + * * .* 0 or more chars + * ? . any single char + * [.] [.] char class + * [!.] [^.] negated char class + * [[:.:]] [[:.:]] ctype class + * [[=.=]] [[=.=]] equivalence class + * [[...]] [[...]] collation element + * *(.) (.)* 0 or more of + * +(.) (.)+ 1 or more of + * ?(.) (.)? 0 or 1 of + * (.) (.) 1 of + * @(.) (.) 1 of + * a|b a|b a or b + * \# () subgroup back reference [1-9] + * a&b a and b + * !(.) none of + * + * \ used to escape metacharacters + * + * *, ?, (, |, &, ), [, \ must be \'d outside of [...] + * only ] must be \'d inside [...] + * + * BUG: unbalanced ) terminates top level pattern + */ + +#include <ast.h> +#include <ctype.h> +#include <hashkey.h> + +#ifndef isblank +#define isblank(x) ((x)==' '||(x)=='\t') +#endif + +#ifndef isgraph +#define isgraph(x) (isprint(x)&&!isblank(x)) +#endif + +#define MAXGROUP 10 + +typedef struct +{ + char* beg[MAXGROUP]; + char* end[MAXGROUP]; + char* next_s; + short groups; +} Group_t; + +typedef struct +{ + Group_t current; + Group_t best; + char* last_s; + char* next_p; +} Match_t; + +#define mbgetchar(p) (*p++) + +#ifndef isxdigit +#define isxdigit(c) ((c)>='0'&&(c)<='9'||(c)>='a'&&(c)<='f'||(c)>='A'&&(c)<='F') +#endif + +#define getsource(s,e) (((s)>=(e))?0:mbgetchar(s)) + +#define COLL_MAX 3 + +/* + * gobble chars up to <sub> or ) keeping track of (...) and [...] + * sub must be one of { '|', '&', 0 } + * 0 returned if s runs out + */ + +static char* +gobble(Match_t* mp, register char* s, register int sub, int* g, int clear) +{ + register int p = 0; + register char* b = 0; + int c = 0; + int n; + + for (;;) + switch (mbgetchar(s)) + { + case '\\': + if (mbgetchar(s)) + break; + /*FALLTHROUGH*/ + case 0: + return 0; + case '[': + if (!b) + { + if (*s == '!') + mbgetchar(s); + b = s; + } + else if (*s == '.' || *s == '=' || *s == ':') + c = *s; + break; + case ']': + if (b) + { + if (*(s - 2) == c) + c = 0; + else if (b != (s - 1)) + b = 0; + } + break; + case '(': + if (!b) + { + p++; + n = (*g)++; + if (clear) + { + if (!sub) + n++; + if (n < MAXGROUP) + mp->current.beg[n] = mp->current.end[n] = 0; + } + } + break; + case ')': + if (!b && p-- <= 0) + return sub ? 0 : s; + break; + case '|': + if (!b && !p && sub == '|') + return s; + break; + } +} + +static int grpmatch(Match_t*, int, char*, register char*, char*, int); + +/* + * match a single pattern + * e is the end (0) of the substring in s + * r marks the start of a repeated subgroup pattern + */ + +static int +onematch(Match_t* mp, int g, char* s, char* p, char* e, char* r, int flags) +{ + register int pc; + register int sc; + register int n; + register int icase; + char* olds; + char* oldp; + + icase = flags & STR_ICASE; + do + { + olds = s; + sc = getsource(s, e); + if (icase && isupper(sc)) + sc = tolower(sc); + oldp = p; + switch (pc = mbgetchar(p)) + { + case '(': + case '*': + case '?': + case '+': + case '@': + case '!': + if (pc == '(' || *p == '(') + { + char* subp; + int oldg; + + s = olds; + subp = p + (pc != '('); + oldg = g; + n = ++g; + if (g < MAXGROUP && (!r || g > mp->current.groups)) + mp->current.beg[g] = mp->current.end[g] = 0; + if (!(p = gobble(mp, subp, 0, &g, !r))) + return 0; + if (pc == '*' || pc == '?' || pc == '+' && oldp == r) + { + if (onematch(mp, g, s, p, e, NiL, flags)) + return 1; + if (!sc || !getsource(s, e)) + { + mp->current.groups = oldg; + return 0; + } + } + if (pc == '*' || pc == '+') + { + p = oldp; + sc = n - 1; + } + else + sc = g; + pc = (pc != '!'); + do + { + if (grpmatch(mp, n, olds, subp, s, flags) == pc) + { + if (n < MAXGROUP) + { + if (!mp->current.beg[n] || mp->current.beg[n] > olds) + mp->current.beg[n] = olds; + if (s > mp->current.end[n]) + mp->current.end[n] = s; + } + if (onematch(mp, sc, s, p, e, oldp, flags)) + { + if (p == oldp && n < MAXGROUP) + { + if (!mp->current.beg[n] || mp->current.beg[n] > olds) + mp->current.beg[n] = olds; + if (s > mp->current.end[n]) + mp->current.end[n] = s; + } + return 1; + } + } + } while (s < e && mbgetchar(s)); + mp->current.groups = oldg; + return 0; + } + else if (pc == '*') + { + /* + * several stars are the same as one + */ + + while (*p == '*' && *(p + 1) != '(') + p++; + oldp = p; + switch (pc = mbgetchar(p)) + { + case '@': + case '!': + case '+': + n = *p == '('; + break; + case '(': + case '[': + case '?': + case '*': + n = 1; + break; + case 0: + case '|': + case '&': + case ')': + mp->current.next_s = (flags & STR_MAXIMAL) ? e : olds; + mp->next_p = oldp; + mp->current.groups = g; + if (!pc && (!mp->best.next_s || (flags & STR_MAXIMAL) && mp->current.next_s > mp->best.next_s || !(flags & STR_MAXIMAL) && mp->current.next_s < mp->best.next_s)) + mp->best = mp->current; + return 1; + case '\\': + if (!(pc = mbgetchar(p))) + return 0; + if (pc >= '0' && pc <= '9') + { + n = pc - '0'; + if (n <= g && mp->current.beg[n]) + pc = *mp->current.beg[n]; + } + /*FALLTHROUGH*/ + default: + if (icase && isupper(pc)) + pc = tolower(pc); + n = 0; + break; + } + p = oldp; + for (;;) + { + if ((n || pc == sc) && onematch(mp, g, olds, p, e, NiL, flags)) + return 1; + if (!sc) + return 0; + olds = s; + sc = getsource(s, e); + if ((flags & STR_ICASE) && isupper(sc)) + sc = tolower(sc); + } + } + else if (pc != '?' && pc != sc) + return 0; + break; + case 0: + if (!(flags & STR_MAXIMAL)) + sc = 0; + /*FALLTHROUGH*/ + case '|': + case '&': + case ')': + if (!sc) + { + mp->current.next_s = olds; + mp->next_p = oldp; + mp->current.groups = g; + } + if (!pc && (!mp->best.next_s || (flags & STR_MAXIMAL) && olds > mp->best.next_s || !(flags & STR_MAXIMAL) && olds < mp->best.next_s)) + { + mp->best = mp->current; + mp->best.next_s = olds; + mp->best.groups = g; + } + return !sc; + case '[': + { + /*UNDENT...*/ + + int invert; + int x; + int ok = 0; + char* range; + + if (!sc) + return 0; + range = 0; + n = 0; + if (invert = *p == '!') + p++; + for (;;) + { + oldp = p; + if (!(pc = mbgetchar(p))) + return 0; + else if (pc == '[' && (*p == ':' || *p == '=' || *p == '.')) + { + x = 0; + n = mbgetchar(p); + oldp = p; + for (;;) + { + if (!(pc = mbgetchar(p))) + return 0; + if (pc == n && *p == ']') + break; + x++; + } + mbgetchar(p); + if (ok) + /*NOP*/; + else if (n == ':') + { + switch (HASHNKEY5(x, oldp[0], oldp[1], oldp[2], oldp[3], oldp[4])) + { + case HASHNKEY5(5,'a','l','n','u','m'): + if (isalnum(sc)) + ok = 1; + break; + case HASHNKEY5(5,'a','l','p','h','a'): + if (isalpha(sc)) + ok = 1; + break; + case HASHNKEY5(5,'b','l','a','n','k'): + if (isblank(sc)) + ok = 1; + break; + case HASHNKEY5(5,'c','n','t','r','l'): + if (iscntrl(sc)) + ok = 1; + break; + case HASHNKEY5(5,'d','i','g','i','t'): + if (isdigit(sc)) + ok = 1; + break; + case HASHNKEY5(5,'g','r','a','p','h'): + if (isgraph(sc)) + ok = 1; + break; + case HASHNKEY5(5,'l','o','w','e','r'): + if (islower(sc)) + ok = 1; + break; + case HASHNKEY5(5,'p','r','i','n','t'): + if (isprint(sc)) + ok = 1; + break; + case HASHNKEY5(5,'p','u','n','c','t'): + if (ispunct(sc)) + ok = 1; + break; + case HASHNKEY5(5,'s','p','a','c','e'): + if (isspace(sc)) + ok = 1; + break; + case HASHNKEY5(5,'u','p','p','e','r'): + if (icase ? islower(sc) : isupper(sc)) + ok = 1; + break; + case HASHNKEY5(6,'x','d','i','g','i'): + if (oldp[5] == 't' && isxdigit(sc)) + ok = 1; + break; + } + } + else if (range) + goto getrange; + else if (*p == '-' && *(p + 1) != ']') + { + mbgetchar(p); + range = oldp; + } + else if (isalpha(*oldp) && isalpha(*olds) && tolower(*oldp) == tolower(*olds) || sc == mbgetchar(oldp)) + ok = 1; + n = 1; + } + else if (pc == ']' && n) + { + if (ok != invert) + break; + return 0; + } + else if (pc == '\\' && (oldp = p, !(pc = mbgetchar(p)))) + return 0; + else if (ok) + /*NOP*/; + else if (range) + { + getrange: + if (icase && isupper(pc)) + pc = tolower(pc); + x = mbgetchar(range); + if (icase && isupper(x)) + x = tolower(x); + if (sc == x || sc == pc || sc > x && sc < pc) + ok = 1; + if (*p == '-' && *(p + 1) != ']') + { + mbgetchar(p); + range = oldp; + } + else + range = 0; + n = 1; + } + else if (*p == '-' && *(p + 1) != ']') + { + mbgetchar(p); + range = oldp; + n = 1; + } + else + { + if (icase && isupper(pc)) + pc = tolower(pc); + if (sc == pc) + ok = 1; + n = pc; + } + } + + /*...INDENT*/ + } + break; + case '\\': + if (!(pc = mbgetchar(p))) + return 0; + if (pc >= '0' && pc <= '9') + { + n = pc - '0'; + if (n <= g && (oldp = mp->current.beg[n])) + { + while (oldp < mp->current.end[n]) + if (!*olds || *olds++ != *oldp++) + return 0; + s = olds; + break; + } + } + /*FALLTHROUGH*/ + default: + if (icase && isupper(pc)) + pc = tolower(pc); + if (pc != sc) + return 0; + break; + } + } while (sc); + return 0; +} + +/* + * match any pattern in a group + * | and & subgroups are parsed here + */ + +static int +grpmatch(Match_t* mp, int g, char* s, register char* p, char* e, int flags) +{ + register char* a; + + do + { + for (a = p; onematch(mp, g, s, a, e, NiL, flags); a++) + if (*(a = mp->next_p) != '&') + return 1; + } while (p = gobble(mp, p, '|', &g, 1)); + return 0; +} + +/* + * subgroup match + * 0 returned if no match + * otherwise number of subgroups matched returned + * match group begin offsets are even elements of sub + * match group end offsets are odd elements of sub + * the matched string is from s+sub[0] up to but not + * including s+sub[1] + */ + +int +strgrpmatch(const char* b, const char* p, int* sub, int n, int flags) +{ + register int i; + register char* s; + char* e; + Match_t match; + + s = (char*)b; + match.last_s = e = s + strlen(s); + for (;;) + { + match.best.next_s = 0; + match.current.groups = 0; + if ((i = grpmatch(&match, 0, s, (char*)p, e, flags)) || match.best.next_s) + { + if (!i) + match.current = match.best; + match.current.groups++; + match.current.end[0] = match.current.next_s; + break; + } + if ((flags & STR_LEFT) || s >= e) + return 0; + s++; + } + if ((flags & STR_RIGHT) && match.current.next_s != e) + return 0; + if (!sub) + return 1; + match.current.beg[0] = s; + s = (char*)b; + if (n > match.current.groups) + n = match.current.groups; + for (i = 0; i < n; i++) + { + sub[i * 2] = match.current.end[i] ? match.current.beg[i] - s : 0; + sub[i * 2 + 1] = match.current.end[i] ? match.current.end[i] - s : 0; + } + return n; +} + +/* + * compare the string s with the shell pattern p + * returns 1 for match 0 otherwise + */ + +int +strmatch(const char* s, const char* p) +{ + return strgrpmatch(s, p, NiL, 0, STR_MAXIMAL|STR_LEFT|STR_RIGHT); +} |