diff options
author | stevel@tonic-gate <none@none> | 2005-06-14 00:00:00 -0700 |
---|---|---|
committer | stevel@tonic-gate <none@none> | 2005-06-14 00:00:00 -0700 |
commit | 7c478bd95313f5f23a4c958a745db2134aa03244 (patch) | |
tree | c871e58545497667cbb4b0a4f2daf204743e1fe7 /usr/src/lib/libbc/libc/gen/common/regex.c | |
download | illumos-joyent-7c478bd95313f5f23a4c958a745db2134aa03244.tar.gz |
OpenSolaris Launch
Diffstat (limited to 'usr/src/lib/libbc/libc/gen/common/regex.c')
-rw-r--r-- | usr/src/lib/libbc/libc/gen/common/regex.c | 441 |
1 files changed, 441 insertions, 0 deletions
diff --git a/usr/src/lib/libbc/libc/gen/common/regex.c b/usr/src/lib/libbc/libc/gen/common/regex.c new file mode 100644 index 0000000000..9352d6ca7e --- /dev/null +++ b/usr/src/lib/libbc/libc/gen/common/regex.c @@ -0,0 +1,441 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1987 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + /* from UCB 4.1 80/12/21 */ + +/* + * routines to do regular expression matching + * + * Entry points: + * + * re_comp(s) + * char *s; + * ... returns 0 if the string s was compiled successfully, + * a pointer to an error message otherwise. + * If passed 0 or a null string returns without changing + * the currently compiled re (see note 11 below). + * + * re_exec(s) + * char *s; + * ... returns 1 if the string s matches the last compiled regular + * expression, + * 0 if the string s failed to match the last compiled + * regular expression, and + * -1 if the compiled regular expression was invalid + * (indicating an internal error). + * + * The strings passed to both re_comp and re_exec may have trailing or + * embedded newline characters; they are terminated by nulls. + * + * The identity of the author of these routines is lost in antiquity; + * this is essentially the same as the re code in the original V6 ed. + * + * The regular expressions recognized are described below. This description + * is essentially the same as that for ed. + * + * A regular expression specifies a set of strings of characters. + * A member of this set of strings is said to be matched by + * the regular expression. In the following specification for + * regular expressions the word `character' means any character but NUL. + * + * 1. Any character except a special character matches itself. + * Special characters are the regular expression delimiter plus + * \ [ . and sometimes ^ * $. + * 2. A . matches any character. + * 3. A \ followed by any character except a digit or ( ) + * matches that character. + * 4. A nonempty string s bracketed [s] (or [^s]) matches any + * character in (or not in) s. In s, \ has no special meaning, + * and ] may only appear as the first letter. A substring + * a-b, with a and b in ascending ASCII order, stands for + * the inclusive range of ASCII characters. + * 5. A regular expression of form 1-4 followed by * matches a + * sequence of 0 or more matches of the regular expression. + * 6. A regular expression, x, of form 1-8, bracketed \(x\) + * matches what x matches. + * 7. A \ followed by a digit n matches a copy of the string that the + * bracketed regular expression beginning with the nth \( matched. + * 8. A regular expression of form 1-8, x, followed by a regular + * expression of form 1-7, y matches a match for x followed by + * a match for y, with the x match being as long as possible + * while still permitting a y match. + * 9. A regular expression of form 1-8 preceded by ^ (or followed + * by $), is constrained to matches that begin at the left + * (or end at the right) end of a line. + * 10. A regular expression of form 1-9 picks out the longest among + * the leftmost matches in a line. + * 11. An empty regular expression stands for a copy of the last + * regular expression encountered. + */ + +/* + * constants for re's + */ +#define CBRA 1 +#define CCHR 2 +#define CDOT 4 +#define CCL 6 +#define NCCL 8 +#define CDOL 10 +#define CEOF 11 +#define CKET 12 +#define CBACK 18 + +#define CSTAR 01 + +#define ESIZE 512 +#define NBRA 9 + +static struct re_globals { + char _expbuf[ESIZE]; + char *_braslist[NBRA], *_braelist[NBRA]; + char _circf; +} *re_globals; +#define expbuf (_re->_expbuf) +#define braslist (_re->_braslist) +#define braelist (_re->_braelist) +#define circf (_re->_circf) + +/* + * compile the regular expression argument into a dfa + */ +char * +re_comp(sp) + register char *sp; +{ + register int c; + register struct re_globals *_re = re_globals; + register char *ep; + int cclcnt, numbra = 0; + char *lastep = 0; + char bracket[NBRA]; + char *bracketp = &bracket[0]; + char *retoolong = "Regular expression too long"; + + if (_re == 0) { + _re = (struct re_globals *)calloc(1, sizeof (*_re)); + if (_re == 0) + return ("Out of memory"); + re_globals = _re; + } + ep = expbuf; + +#define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } + + if (sp == 0 || *sp == '\0') { + if (*ep == 0) + return("No previous regular expression"); + return(0); + } + if (*sp == '^') { + circf = 1; + sp++; + } + else + circf = 0; + for (;;) { + if (ep >= &expbuf[ESIZE]) + comerr(retoolong); + if ((c = *sp++) == '\0') { + if (bracketp != bracket) + comerr("unmatched \\("); + *ep++ = CEOF; + *ep++ = 0; + return(0); + } + if (c != '*') + lastep = ep; + switch (c) { + + case '.': + *ep++ = CDOT; + continue; + + case '*': + if (lastep == 0 || *lastep == CBRA || *lastep == CKET) + goto defchar; + *lastep |= CSTAR; + continue; + + case '$': + if (*sp != '\0') + goto defchar; + *ep++ = CDOL; + continue; + + case '[': + *ep++ = CCL; + *ep++ = 0; + cclcnt = 1; + if ((c = *sp++) == '^') { + c = *sp++; + ep[-2] = NCCL; + } + do { + if (c == '\0') + comerr("missing ]"); + if (c == '-' && ep [-1] != 0) { + if ((c = *sp++) == ']') { + *ep++ = '-'; + cclcnt++; + break; + } + while (ep[-1] < c) { + *ep = ep[-1] + 1; + ep++; + cclcnt++; + if (ep >= &expbuf[ESIZE]) + comerr(retoolong); + } + } + *ep++ = c; + cclcnt++; + if (ep >= &expbuf[ESIZE]) + comerr(retoolong); + } while ((c = *sp++) != ']'); + lastep[1] = cclcnt; + continue; + + case '\\': + if ((c = *sp++) == '(') { + if (numbra >= NBRA) + comerr("too many \\(\\) pairs"); + *bracketp++ = numbra; + *ep++ = CBRA; + *ep++ = numbra++; + continue; + } + if (c == ')') { + if (bracketp <= bracket) + comerr("unmatched \\)"); + *ep++ = CKET; + *ep++ = *--bracketp; + continue; + } + if (c >= '1' && c < ('1' + NBRA)) { + *ep++ = CBACK; + *ep++ = c - '1'; + continue; + } + *ep++ = CCHR; + *ep++ = c; + continue; + + defchar: + default: + *ep++ = CCHR; + *ep++ = c; + } + } +} + +/* + * match the argument string against the compiled re + */ +int +re_exec(p1) + register char *p1; +{ + register struct re_globals *_re = re_globals; + register char *p2; + register int c; + int rv; + + if (_re == 0) + return (0); + p2 = expbuf; + for (c = 0; c < NBRA; c++) { + braslist[c] = 0; + braelist[c] = 0; + } + if (circf) + return((advance(p1, p2))); + /* + * fast check for first character + */ + if (*p2 == CCHR) { + c = p2[1]; + do { + if (*p1 != c) + continue; + if (rv = advance(p1, p2)) + return(rv); + } while (*p1++); + return(0); + } + /* + * regular algorithm + */ + do + if (rv = advance(p1, p2)) + return(rv); + while (*p1++); + return(0); +} + +/* + * try to match the next thing in the dfa + */ +static int +advance(lp, ep) + register char *lp, *ep; +{ + register char *curlp; + int ct, i; + int rv; + register struct re_globals *_re = re_globals; + + for (;;) + switch (*ep++) { + + case CCHR: + if (*ep++ == *lp++) + continue; + return(0); + + case CDOT: + if (*lp++) + continue; + return(0); + + case CDOL: + if (*lp == '\0') + continue; + return(0); + + case CEOF: + return(1); + + case CCL: + if (cclass(ep, *lp++, 1)) { + ep += *ep; + continue; + } + return(0); + + case NCCL: + if (cclass(ep, *lp++, 0)) { + ep += *ep; + continue; + } + return(0); + + case CBRA: + braslist[*ep++] = lp; + continue; + + case CKET: + braelist[*ep++] = lp; + continue; + + case CBACK: + if (braelist[i = *ep++] == 0) + return(-1); + if (backref(i, lp)) { + lp += braelist[i] - braslist[i]; + continue; + } + return(0); + + case CBACK|CSTAR: + if (braelist[i = *ep++] == 0) + return(-1); + curlp = lp; + ct = braelist[i] - braslist[i]; + while (backref(i, lp)) + lp += ct; + while (lp >= curlp) { + if (rv = advance(lp, ep)) + return(rv); + lp -= ct; + } + continue; + + case CDOT|CSTAR: + curlp = lp; + while (*lp++) + ; + goto star; + + case CCHR|CSTAR: + curlp = lp; + while (*lp++ == *ep) + ; + ep++; + goto star; + + case CCL|CSTAR: + case NCCL|CSTAR: + curlp = lp; + while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) + ; + ep += *ep; + goto star; + + star: + do { + lp--; + if (rv = advance(lp, ep)) + return(rv); + } while (lp > curlp); + return(0); + + default: + return(-1); + } +} + +static +backref(i, lp) + register int i; + register char *lp; +{ + register char *bp; + register struct re_globals *_re = re_globals; + + bp = braslist[i]; + while (*bp++ == *lp++) + if (bp >= braelist[i]) + return(1); + return(0); +} + +static int +cclass(set, c, af) + register char *set, c; + int af; +{ + register int n; + + if (c == 0) + return(0); + n = *set++; + while (--n) + if (*set++ == c) + return(af); + return(! af); +} |