summaryrefslogtreecommitdiff
path: root/usr/src/lib/libc/port/gen/regexpr.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/lib/libc/port/gen/regexpr.c')
-rw-r--r--usr/src/lib/libc/port/gen/regexpr.c450
1 files changed, 450 insertions, 0 deletions
diff --git a/usr/src/lib/libc/port/gen/regexpr.c b/usr/src/lib/libc/port/gen/regexpr.c
new file mode 100644
index 0000000000..44afcdc9e0
--- /dev/null
+++ b/usr/src/lib/libc/port/gen/regexpr.c
@@ -0,0 +1,450 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * routines to do regular expression matching
+ *
+ * Entry points:
+ *
+ * re_comp(s)
+ * char *s;
+ * ... returns 0 if the string s was compiled successfully,
+ * a pointer to an error message otherwise.
+ * If passed 0 or a null string returns without changing
+ * the currently compiled re (see note 11 below).
+ *
+ * re_exec(s)
+ * char *s;
+ * ... returns 1 if the string s matches the last compiled regular
+ * expression,
+ * 0 if the string s failed to match the last compiled
+ * regular expression, and
+ * -1 if the compiled regular expression was invalid
+ * (indicating an internal error).
+ *
+ * The strings passed to both re_comp and re_exec may have trailing or
+ * embedded newline characters; they are terminated by nulls.
+ *
+ * The identity of the author of these routines is lost in antiquity;
+ * this is essentially the same as the re code in the original V6 ed.
+ *
+ * The regular expressions recognized are described below. This description
+ * is essentially the same as that for ed.
+ *
+ * A regular expression specifies a set of strings of characters.
+ * A member of this set of strings is said to be matched by
+ * the regular expression. In the following specification for
+ * regular expressions the word `character' means any character but NUL.
+ *
+ * 1. Any character except a special character matches itself.
+ * Special characters are the regular expression delimiter plus
+ * \ [ . and sometimes ^ * $.
+ * 2. A . matches any character.
+ * 3. A \ followed by any character except a digit or ( )
+ * matches that character.
+ * 4. A nonempty string s bracketed [s] (or [^s]) matches any
+ * character in (or not in) s. In s, \ has no special meaning,
+ * and ] may only appear as the first letter. A substring
+ * a-b, with a and b in ascending ASCII order, stands for
+ * the inclusive range of ASCII characters.
+ * 5. A regular expression of form 1-4 followed by * matches a
+ * sequence of 0 or more matches of the regular expression.
+ * 6. A regular expression, x, of form 1-8, bracketed \(x\)
+ * matches what x matches.
+ * 7. A \ followed by a digit n matches a copy of the string that the
+ * bracketed regular expression beginning with the nth \( matched.
+ * 8. A regular expression of form 1-8, x, followed by a regular
+ * expression of form 1-7, y matches a match for x followed by
+ * a match for y, with the x match being as long as possible
+ * while still permitting a y match.
+ * 9. A regular expression of form 1-8 preceded by ^ (or followed
+ * by $), is constrained to matches that begin at the left
+ * (or end at the right) end of a line.
+ * 10. A regular expression of form 1-9 picks out the longest among
+ * the leftmost matches in a line.
+ * 11. An empty regular expression stands for a copy of the last
+ * regular expression encountered.
+ */
+
+#include "synonyms.h"
+
+#include <stdlib.h>
+#include <re_comp.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+/*
+ * constants for re's
+ */
+#define CBRA 1
+#define CCHR 2
+#define CDOT 4
+#define CCL 6
+#define NCCL 8
+#define CDOL 10
+#define CEOF 11
+#define CKET 12
+#define CBACK 18
+
+#define CSTAR 01
+
+#define ESIZE 512
+#define NBRA 9
+
+static struct re_globals {
+ char _expbuf[ESIZE];
+ char *_braslist[NBRA], *_braelist[NBRA];
+ char _circf;
+} *re_globals;
+#define expbuf (_re->_expbuf)
+#define braslist (_re->_braslist)
+#define braelist (_re->_braelist)
+#define circf (_re->_circf)
+
+static int advance(const char *, char *);
+static int backref(int, const char *);
+static int cclass(char *, char, int);
+
+/*
+ * compile the regular expression argument into a dfa
+ */
+char *
+re_comp(const char *sp)
+{
+ char c;
+ struct re_globals *_re = re_globals;
+ char *ep;
+ char cclcnt, numbra = 0;
+ char *lastep = NULL;
+ char bracket[NBRA];
+ char *bracketp = &bracket[0];
+ char *retoolong = "Regular expression too long";
+
+ if (_re == NULL) {
+ _re = (struct re_globals *)calloc(1, sizeof (*_re));
+ if (_re == NULL)
+ return ("Out of memory");
+ re_globals = _re;
+ }
+ ep = expbuf;
+
+#define comerr(msg) {expbuf[0] = 0; return (msg); }
+
+ if (sp == NULL || *sp == '\0') {
+ if (*ep == 0)
+ return ("No previous regular expression");
+ return (NULL);
+ }
+ if (*sp == '^') {
+ circf = 1;
+ sp++;
+ }
+ else
+ circf = 0;
+ for (;;) {
+ if (ep >= &expbuf[ESIZE])
+ comerr(retoolong);
+ if ((c = *sp++) == '\0') {
+ if (bracketp != bracket)
+ comerr("unmatched \\(");
+ *ep++ = CEOF;
+ *ep++ = 0;
+ return (NULL);
+ }
+ if (c != '*')
+ lastep = ep;
+ switch (c) {
+
+ case '.':
+ *ep++ = CDOT;
+ continue;
+
+ case '*':
+ if (lastep == NULL || *lastep == CBRA ||
+ *lastep == CKET)
+ goto defchar;
+ *lastep |= CSTAR;
+ continue;
+
+ case '$':
+ if (*sp != '\0')
+ goto defchar;
+ *ep++ = CDOL;
+ continue;
+
+ case '[':
+ *ep++ = CCL;
+ *ep++ = 0;
+ cclcnt = 1;
+ if ((c = *sp++) == '^') {
+ c = *sp++;
+ ep[-2] = NCCL;
+ }
+ do {
+ if (c == '\0')
+ comerr("missing ]");
+ if (c == '-' && ep [-1] != 0) {
+ if ((c = *sp++) == ']') {
+ *ep++ = '-';
+ cclcnt++;
+ break;
+ }
+ while (ep[-1] < c) {
+ *ep = ep[-1] + 1;
+ ep++;
+ cclcnt++;
+ if (ep >= &expbuf[ESIZE])
+ comerr(retoolong);
+ }
+ }
+ *ep++ = c;
+ cclcnt++;
+ if (ep >= &expbuf[ESIZE])
+ comerr(retoolong);
+ } while ((c = *sp++) != ']');
+ lastep[1] = cclcnt;
+ continue;
+
+ case '\\':
+ if ((c = *sp++) == '(') {
+ if (numbra >= NBRA)
+ comerr("too many \\(\\) pairs");
+ *bracketp++ = numbra;
+ *ep++ = CBRA;
+ *ep++ = numbra++;
+ continue;
+ }
+ if (c == ')') {
+ if (bracketp <= bracket)
+ comerr("unmatched \\)");
+ *ep++ = CKET;
+ *ep++ = *--bracketp;
+ continue;
+ }
+ if (c >= '1' && c < ('1' + NBRA)) {
+ *ep++ = CBACK;
+ *ep++ = c - '1';
+ continue;
+ }
+ *ep++ = CCHR;
+ *ep++ = c;
+ continue;
+
+ defchar:
+ default:
+ *ep++ = CCHR;
+ *ep++ = c;
+ }
+ }
+}
+
+/*
+ * match the argument string against the compiled re
+ */
+int
+re_exec(const char *p1)
+{
+ struct re_globals *_re = re_globals;
+ char *p2;
+ int c;
+ int rv;
+
+ if (_re == NULL)
+ return (0);
+ p2 = expbuf;
+ for (c = 0; c < NBRA; c++) {
+ braslist[c] = 0;
+ braelist[c] = 0;
+ }
+ if (circf)
+ return ((advance(p1, p2)));
+ /*
+ * fast check for first character
+ */
+ if (*p2 == CCHR) {
+ c = p2[1];
+ do {
+ if (*p1 != c)
+ continue;
+ if (rv = advance(p1, p2))
+ return (rv);
+ } while (*p1++);
+ return (0);
+ }
+ /*
+ * regular algorithm
+ */
+ do
+ if (rv = advance(p1, p2))
+ return (rv);
+ while (*p1++);
+ return (0);
+}
+
+/*
+ * try to match the next thing in the dfa
+ */
+static int
+advance(const char *lp, char *ep)
+{
+ const char *curlp;
+ ptrdiff_t ct;
+ int i;
+ int rv;
+ struct re_globals *_re = re_globals;
+
+ for (;;)
+ switch (*ep++) {
+
+ case CCHR:
+ if (*ep++ == *lp++)
+ continue;
+ return (0);
+
+ case CDOT:
+ if (*lp++)
+ continue;
+ return (0);
+
+ case CDOL:
+ if (*lp == '\0')
+ continue;
+ return (0);
+
+ case CEOF:
+ return (1);
+
+ case CCL:
+ if (cclass(ep, *lp++, 1)) {
+ ep += *ep;
+ continue;
+ }
+ return (0);
+
+ case NCCL:
+ if (cclass(ep, *lp++, 0)) {
+ ep += *ep;
+ continue;
+ }
+ return (0);
+
+ case CBRA:
+ braslist[*ep++] = (char *)lp;
+ continue;
+
+ case CKET:
+ braelist[*ep++] = (char *)lp;
+ continue;
+
+ case CBACK:
+ if (braelist[i = *ep++] == NULL)
+ return (-1);
+ if (backref(i, lp)) {
+ lp += braelist[i] - braslist[i];
+ continue;
+ }
+ return (0);
+
+ case CBACK|CSTAR:
+ if (braelist[i = *ep++] == NULL)
+ return (-1);
+ curlp = lp;
+ ct = braelist[i] - braslist[i];
+ while (backref(i, lp))
+ lp += ct;
+ while (lp >= curlp) {
+ if (rv = advance(lp, ep))
+ return (rv);
+ lp -= ct;
+ }
+ continue;
+
+ case CDOT|CSTAR:
+ curlp = lp;
+ while (*lp++)
+ ;
+ goto star;
+
+ case CCHR|CSTAR:
+ curlp = lp;
+ while (*lp++ == *ep)
+ ;
+ ep++;
+ goto star;
+
+ case CCL|CSTAR:
+ case NCCL|CSTAR:
+ curlp = lp;
+ while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
+ ;
+ ep += *ep;
+ goto star;
+
+ star:
+ do {
+ lp--;
+ if (rv = advance(lp, ep))
+ return (rv);
+ } while (lp > curlp);
+ return (0);
+
+ default:
+ return (-1);
+ }
+}
+
+static int
+backref(int i, const char *lp)
+{
+ char *bp;
+ struct re_globals *_re = re_globals;
+
+ bp = braslist[i];
+ while (*bp++ == *lp++)
+ if (bp >= braelist[i])
+ return (1);
+ return (0);
+}
+
+static int
+cclass(char *set, char c, int af)
+{
+ int n;
+
+ if (c == 0)
+ return (0);
+ n = *set++;
+ while (--n)
+ if (*set++ == c)
+ return (af);
+ return (! af);
+}