summaryrefslogtreecommitdiff
path: root/usr/src/lib/libc/port/regex
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/lib/libc/port/regex')
-rw-r--r--usr/src/lib/libc/port/regex/engine.c213
-rw-r--r--usr/src/lib/libc/port/regex/regcomp.c341
-rw-r--r--usr/src/lib/libc/port/regex/regex2.h1
3 files changed, 304 insertions, 251 deletions
diff --git a/usr/src/lib/libc/port/regex/engine.c b/usr/src/lib/libc/port/regex/engine.c
index c66c08e59e..2e04a9085b 100644
--- a/usr/src/lib/libc/port/regex/engine.c
+++ b/usr/src/lib/libc/port/regex/engine.c
@@ -34,6 +34,8 @@
* SUCH DAMAGE.
*/
+#include <stdbool.h>
+
/*
* The matching engine and friends. This file is #included by regexec.c
* after suitable #defines of a variety of macros used herein, so that
@@ -43,8 +45,7 @@
#ifdef SNAMES
#define matcher smatcher
-#define fast sfast
-#define slow sslow
+#define walk swalk
#define dissect sdissect
#define backref sbackref
#define step sstep
@@ -54,8 +55,7 @@
#endif
#ifdef LNAMES
#define matcher lmatcher
-#define fast lfast
-#define slow lslow
+#define walk lwalk
#define dissect ldissect
#define backref lbackref
#define step lstep
@@ -65,8 +65,7 @@
#endif
#ifdef MNAMES
#define matcher mmatcher
-#define fast mfast
-#define slow mslow
+#define walk mwalk
#define dissect mdissect
#define backref mbackref
#define step mstep
@@ -104,10 +103,8 @@ static const char *dissect(struct match *, const char *, const char *,
sopno, sopno);
static const char *backref(struct match *, const char *, const char *, sopno,
sopno, sopno, int);
-static const char *fast(struct match *, const char *, const char *, sopno,
- sopno);
-static const char *slow(struct match *, const char *, const char *, sopno,
- sopno);
+static const char *walk(struct match *m, const char *start, const char *stop,
+ sopno startst, sopno stopst, bool fast);
static states step(struct re_guts *, sopno, sopno, states, wint_t, states);
#define MAX_RECURSION 100
#define BOL (OUT-1)
@@ -253,7 +250,7 @@ matcher(struct re_guts *g, const char *string, size_t nmatch,
/* this loop does only one repetition except for backrefs */
for (;;) {
- endp = fast(m, start, stop, gf, gl);
+ endp = walk(m, start, stop, gf, gl, true);
if (endp == NULL) { /* a miss */
if (m->pmatch != NULL)
free((char *)m->pmatch);
@@ -269,7 +266,7 @@ matcher(struct re_guts *g, const char *string, size_t nmatch,
assert(m->coldp != NULL);
for (;;) {
NOTE("finding start");
- endp = slow(m, m->coldp, stop, gf, gl);
+ endp = walk(m, m->coldp, stop, gf, gl, false);
if (endp != NULL)
break;
assert(m->coldp < m->endp);
@@ -314,7 +311,7 @@ matcher(struct re_guts *g, const char *string, size_t nmatch,
if (dp != NULL || endp <= m->coldp)
break; /* defeat */
NOTE("backoff");
- endp = slow(m, m->coldp, endp-1, gf, gl);
+ endp = walk(m, m->coldp, endp-1, gf, gl, false);
if (endp == NULL)
break; /* defeat */
/* try it on a shorter possibility */
@@ -395,7 +392,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst,
es += OPND(m->g->strip[es]);
break;
case OCH_:
- while (OP(m->g->strip[es]) != O_CH)
+ while (OP(m->g->strip[es]) != (sop)O_CH)
es += OPND(m->g->strip[es]);
break;
}
@@ -427,10 +424,10 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst,
stp = stop;
for (;;) {
/* how long could this one be? */
- rest = slow(m, sp, stp, ss, es);
+ rest = walk(m, sp, stp, ss, es, false);
assert(rest != NULL); /* it did match */
/* could the rest match the rest? */
- tail = slow(m, rest, stop, es, stopst);
+ tail = walk(m, rest, stop, es, stopst, false);
if (tail == stop)
break; /* yes! */
/* no -- try a shorter match for this one */
@@ -440,12 +437,9 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst,
ssub = ss + 1;
esub = es - 1;
/* did innards match? */
- if (slow(m, sp, rest, ssub, esub) != NULL) {
+ if (walk(m, sp, rest, ssub, esub, false) != NULL) {
dp = dissect(m, sp, rest, ssub, esub);
assert(dp == rest);
-#if defined(__lint)
- (void) dp;
-#endif
} else /* no */
assert(sp == rest);
sp = rest;
@@ -454,10 +448,10 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst,
stp = stop;
for (;;) {
/* how long could this one be? */
- rest = slow(m, sp, stp, ss, es);
+ rest = walk(m, sp, stp, ss, es, false);
assert(rest != NULL); /* it did match */
/* could the rest match the rest? */
- tail = slow(m, rest, stop, es, stopst);
+ tail = walk(m, rest, stop, es, stopst, false);
if (tail == stop)
break; /* yes! */
/* no -- try a shorter match for this one */
@@ -469,7 +463,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst,
ssp = sp;
oldssp = ssp;
for (;;) { /* find last match of innards */
- sep = slow(m, ssp, rest, ssub, esub);
+ sep = walk(m, ssp, rest, ssub, esub, false);
if (sep == NULL || sep == ssp)
break; /* failed or matched null */
oldssp = ssp; /* on to next try */
@@ -481,7 +475,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst,
ssp = oldssp;
}
assert(sep == rest); /* must exhaust substring */
- assert(slow(m, ssp, sep, ssub, esub) == rest);
+ assert(walk(m, ssp, sep, ssub, esub, false) == rest);
dp = dissect(m, ssp, sep, ssub, esub);
assert(dp == sep);
sp = rest;
@@ -490,10 +484,10 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst,
stp = stop;
for (;;) {
/* how long could this one be? */
- rest = slow(m, sp, stp, ss, es);
+ rest = walk(m, sp, stp, ss, es, false);
assert(rest != NULL); /* it did match */
/* could the rest match the rest? */
- tail = slow(m, rest, stop, es, stopst);
+ tail = walk(m, rest, stop, es, stopst, false);
if (tail == stop)
break; /* yes! */
/* no -- try a shorter match for this one */
@@ -504,7 +498,8 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst,
esub = ss + OPND(m->g->strip[ss]) - 1;
assert(OP(m->g->strip[esub]) == OOR1);
for (;;) { /* find first matching branch */
- if (slow(m, sp, rest, ssub, esub) == rest)
+ if (walk(m, sp, rest, ssub, esub,
+ false) == rest)
break; /* it matched all of it */
/* that one missed, try next one */
assert(OP(m->g->strip[esub]) == OOR1);
@@ -512,7 +507,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst,
assert(OP(m->g->strip[esub]) == OOR2);
ssub = esub + 1;
esub += OPND(m->g->strip[esub]);
- if (OP(m->g->strip[esub]) == OOR2)
+ if (OP(m->g->strip[esub]) == (sop)OOR2)
esub--;
else
assert(OP(m->g->strip[esub]) == O_CH);
@@ -637,7 +632,7 @@ backref(struct match *m, const char *start, const char *stop, sopno startst,
do {
assert(OP(s) == OOR2);
ss += OPND(s);
- } while (OP(s = m->g->strip[ss]) != O_CH);
+ } while (OP(s = m->g->strip[ss]) != (sop)O_CH);
/* note that the ss++ gets us past the O_CH */
break;
default: /* have to make a choice */
@@ -670,7 +665,7 @@ backref(struct match *m, const char *start, const char *stop, sopno startst,
ssp = m->offp + m->pmatch[i].rm_so;
if (memcmp(sp, ssp, len) != 0)
return (NULL);
- while (m->g->strip[ss] != SOP(O_BACK, i))
+ while (m->g->strip[ss] != (sop)SOP(O_BACK, i))
ss++;
return (backref(m, sp+len, stop, ss+1, stopst, lev, rec));
case OQUEST_: /* to null or not */
@@ -701,13 +696,13 @@ backref(struct match *m, const char *start, const char *stop, sopno startst,
if (dp != NULL)
return (dp);
/* that one missed, try next one */
- if (OP(m->g->strip[esub]) == O_CH)
+ if (OP(m->g->strip[esub]) == (sop)O_CH)
return (NULL); /* there is none */
esub++;
- assert(OP(m->g->strip[esub]) == OOR2);
+ assert(OP(m->g->strip[esub]) == (sop)OOR2);
ssub = esub + 1;
esub += OPND(m->g->strip[esub]);
- if (OP(m->g->strip[esub]) == OOR2)
+ if (OP(m->g->strip[esub]) == (sop)OOR2)
esub--;
else
assert(OP(m->g->strip[esub]) == O_CH);
@@ -745,115 +740,15 @@ backref(struct match *m, const char *start, const char *stop, sopno startst,
}
/*
- * fast - step through the string at top speed
+ * Step through the string either quickly or slowly. Returns where it ended
+ * or NULL.
*/
static const char *
-fast(struct match *m, const char *start, const char *stop, sopno startst,
- sopno stopst)
+walk(struct match *m, const char *start, const char *stop, sopno startst,
+ sopno stopst, bool fast)
{
states st = m->st;
states fresh = m->fresh;
- states tmp = m->tmp;
- const char *p = start;
- wint_t c;
- wint_t lastc; /* previous c */
- wint_t flagch;
- int i;
- const char *coldp; /* last p after which no match was underway */
- size_t clen;
-
- CLEAR(st);
- SET1(st, startst);
- SP("fast", st, *p);
- st = step(m->g, startst, stopst, st, NOTHING, st);
- ASSIGN(fresh, st);
- SP("start", st, *p);
- coldp = NULL;
- if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
- c = OUT;
- else {
- /*
- * XXX Wrong if the previous character was multi-byte.
- * Newline never is (in encodings supported by FreeBSD),
- * so this only breaks the ISWORD tests below.
- */
- c = (uch)*(start - 1);
- }
- for (;;) {
- /* next character */
- lastc = c;
- if (p == m->endp) {
- clen = 0;
- c = OUT;
- } else
- clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
- if (EQ(st, fresh))
- coldp = p;
-
- /* is there an EOL and/or BOL between lastc and c? */
- flagch = '\0';
- i = 0;
- if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
- (lastc == OUT && !(m->eflags&REG_NOTBOL))) {
- flagch = BOL;
- i = m->g->nbol;
- }
- if ((c == '\n' && m->g->cflags&REG_NEWLINE) ||
- (c == OUT && !(m->eflags&REG_NOTEOL))) {
- flagch = (flagch == BOL) ? BOLEOL : EOL;
- i += m->g->neol;
- }
- if (i != 0) {
- for (; i > 0; i--)
- st = step(m->g, startst, stopst, st,
- flagch, st);
- SP("boleol", st, c);
- }
-
- /* how about a word boundary? */
- if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
- (c != OUT && ISWORD(c))) {
- flagch = BOW;
- }
- if ((lastc != OUT && ISWORD(lastc)) &&
- (flagch == EOL || (c != OUT && !ISWORD(c)))) {
- flagch = EOW;
- }
- if (flagch == BOW || flagch == EOW) {
- st = step(m->g, startst, stopst, st, flagch, st);
- SP("boweow", st, c);
- }
-
- /* are we done? */
- if (ISSET(st, stopst) || p == stop || clen > stop - p)
- break; /* NOTE BREAK OUT */
-
- /* no, we must deal with this character */
- ASSIGN(tmp, st);
- ASSIGN(st, fresh);
- assert(c != OUT);
- st = step(m->g, startst, stopst, tmp, c, st);
- SP("aft", st, c);
- assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
- p += clen;
- }
-
- assert(coldp != NULL);
- m->coldp = coldp;
- if (ISSET(st, stopst))
- return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
- else
- return (NULL);
-}
-
-/*
- * slow - step through the string more deliberately
- */
-static const char *
-slow(struct match *m, const char *start, const char *stop, sopno startst,
- sopno stopst)
-{
- states st = m->st;
states empty = m->empty;
states tmp = m->tmp;
const char *p = start;
@@ -869,6 +764,8 @@ slow(struct match *m, const char *start, const char *stop, sopno startst,
SET1(st, startst);
SP("sstart", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st);
+ if (fast)
+ ASSIGN(fresh, st);
matchp = NULL;
if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
c = OUT;
@@ -889,6 +786,9 @@ slow(struct match *m, const char *start, const char *stop, sopno startst,
} else
clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
+ if (fast && EQ(st, fresh))
+ matchp = p;
+
/* is there an EOL and/or BOL between lastc and c? */
flagch = '\0';
i = 0;
@@ -924,14 +824,21 @@ slow(struct match *m, const char *start, const char *stop, sopno startst,
}
/* are we done? */
- if (ISSET(st, stopst))
- matchp = p;
- if (EQ(st, empty) || p == stop || clen > stop - p)
+ if (ISSET(st, stopst)) {
+ if (fast)
+ break;
+ else
+ matchp = p;
+ }
+ if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p))
break; /* NOTE BREAK OUT */
/* no, we must deal with this character */
ASSIGN(tmp, st);
- ASSIGN(st, empty);
+ if (fast)
+ ASSIGN(st, fresh);
+ else
+ ASSIGN(st, empty);
assert(c != OUT);
st = step(m->g, startst, stopst, tmp, c, st);
SP("saft", st, c);
@@ -939,10 +846,17 @@ slow(struct match *m, const char *start, const char *stop, sopno startst,
p += clen;
}
- return (matchp);
+ if (fast) {
+ assert(matchp != NULL);
+ m->coldp = matchp;
+ if (ISSET(st, stopst))
+ return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
+ else
+ return (NULL);
+ } else
+ return (matchp);
}
-
/*
* step - map set of states reachable before char to set reachable after
*/
@@ -1028,22 +942,22 @@ step(struct re_guts *g,
break;
case OCH_: /* mark the first two branches */
FWD(aft, aft, 1);
- assert(OP(g->strip[pc+OPND(s)]) == OOR2);
+ assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
FWD(aft, aft, OPND(s));
break;
case OOR1: /* done a branch, find the O_CH */
if (ISSTATEIN(aft, here)) {
for (look = 1;
- OP(s = g->strip[pc+look]) != O_CH;
+ OP(s = g->strip[pc+look]) != (sop)O_CH;
look += OPND(s))
- assert(OP(s) == OOR2);
+ assert(OP(s) == (sop)OOR2);
FWD(aft, aft, look + 1);
}
break;
case OOR2: /* propagate OCH_'s marking */
FWD(aft, aft, 1);
- if (OP(g->strip[pc+OPND(s)]) != O_CH) {
- assert(OP(g->strip[pc+OPND(s)]) == OOR2);
+ if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) {
+ assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
FWD(aft, aft, OPND(s));
}
break;
@@ -1124,8 +1038,7 @@ pchar(int ch)
#endif
#undef matcher
-#undef fast
-#undef slow
+#undef walk
#undef dissect
#undef backref
#undef step
diff --git a/usr/src/lib/libc/port/regex/regcomp.c b/usr/src/lib/libc/port/regex/regcomp.c
index 3bd27129a0..8df8c337cd 100644
--- a/usr/src/lib/libc/port/regex/regcomp.c
+++ b/usr/src/lib/libc/port/regex/regcomp.c
@@ -41,8 +41,9 @@
#include <string.h>
#include <ctype.h>
#include <limits.h>
-#include <stdlib.h>
#include <regex.h>
+#include <stdlib.h>
+#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
@@ -56,6 +57,24 @@
#include "../locale/mblocal.h"
/*
+ * Branching context, used to keep track of branch state for all of the branch-
+ * aware functions. In addition to keeping track of branch positions for the
+ * p_branch_* functions, we use this to simplify some clumsiness in BREs for
+ * detection of whether ^ is acting as an anchor or being used erroneously and
+ * also for whether we're in a sub-expression or not.
+ */
+struct branchc {
+ sopno start;
+ sopno back;
+ sopno fwd;
+
+ int nbranch;
+ int nchain;
+ bool outer;
+ bool terminate;
+};
+
+/*
* parse structure, passed up and down to avoid global variables and
* other clumsinesses
*/
@@ -71,6 +90,11 @@ struct parse {
#define NPAREN 10 /* we need to remember () 1-9 for back refs */
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
sopno pend[NPAREN]; /* -> ) ([0] unused) */
+ bool allowbranch; /* can this expression branch? */
+ bool bre; /* convenience; is this a BRE? */
+ bool (*parse_expr)(struct parse *, struct branchc *);
+ void (*pre_parse)(struct parse *, struct branchc *);
+ void (*post_parse)(struct parse *, struct branchc *);
};
/* ========= begin header generated by ./mkh ========= */
@@ -79,11 +103,17 @@ extern "C" {
#endif
/* === regcomp.c === */
-static void p_ere(struct parse *p, int stop);
-static void p_ere_exp(struct parse *p);
+static bool p_ere_exp(struct parse *p, struct branchc *bc);
static void p_str(struct parse *p);
-static void p_bre(struct parse *p, int end1, int end2);
-static int p_simp_re(struct parse *p, int starordinary);
+static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
+static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
+static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
+static bool p_branch_empty(struct parse *p, struct branchc *bc);
+static bool p_branch_do(struct parse *p, struct branchc *bc);
+static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
+static void p_bre_post_parse(struct parse *p, struct branchc *bc);
+static void p_re(struct parse *p, int end1, int end2);
+static bool p_simp_re(struct parse *p, struct branchc *bc);
static int p_count(struct parse *p);
static void p_bracket(struct parse *p);
static void p_b_term(struct parse *p, cset *cs);
@@ -133,6 +163,7 @@ static char nuls[10]; /* place to point scanner in event of error */
#define MORE2() (p->next+1 < p->end)
#define SEE(c) (MORE() && PEEK() == (c))
#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
+#define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
#define NEXT() (p->next++)
@@ -229,6 +260,19 @@ regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
p->pbegin[i] = 0;
p->pend[i] = 0;
}
+ if (cflags & REG_EXTENDED) {
+ p->allowbranch = true;
+ p->bre = false;
+ p->parse_expr = p_ere_exp;
+ p->pre_parse = NULL;
+ p->post_parse = NULL;
+ } else {
+ p->allowbranch = false;
+ p->bre = true;
+ p->parse_expr = p_simp_re;
+ p->pre_parse = p_bre_pre_parse;
+ p->post_parse = p_bre_post_parse;
+ }
g->sets = NULL;
g->ncsets = 0;
g->cflags = cflags;
@@ -246,12 +290,10 @@ regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
/* do it */
EMIT(OEND, 0);
g->firststate = THERE();
- if (cflags&REG_EXTENDED)
- p_ere(p, OUT);
- else if (cflags&REG_NOSPEC)
+ if (cflags & REG_NOSPEC)
p_str(p);
else
- p_bre(p, OUT, OUT);
+ p_re(p, OUT, OUT);
EMIT(OEND, 0);
g->laststate = THERE();
@@ -288,55 +330,11 @@ regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
}
/*
- * p_ere - ERE parser top level, concatenation and alternation
- */
-static void
-p_ere(struct parse *p,
- int stop) /* character this ERE should end at */
-{
- char c;
- sopno prevback;
- sopno prevfwd;
- sopno conc;
- int first = 1; /* is this the first alternative? */
-
- for (;;) {
- /* do a bunch of concatenated expressions */
- conc = HERE();
- while (MORE() && (c = PEEK()) != '|' && c != stop)
- p_ere_exp(p);
- /* require nonempty */
- (void) REQUIRE(HERE() != conc, REG_BADPAT);
-
- if (!EAT('|'))
- break; /* NOTE BREAK OUT */
-
- if (first) {
- INSERT(OCH_, conc); /* offset is wrong */
- prevfwd = conc;
- prevback = conc;
- first = 0;
- }
- ASTERN(OOR1, prevback);
- prevback = THERE();
- AHEAD(prevfwd); /* fix previous offset */
- prevfwd = HERE();
- EMIT(OOR2, 0); /* offset is very wrong */
- }
-
- if (!first) { /* tail-end fixups */
- AHEAD(prevfwd);
- ASTERN(O_CH, prevback);
- }
-
- assert(!MORE() || SEE(stop));
-}
-
-/*
- * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
+ * Parse one subERE, an atom possibly followed by a repetition op,
+ * return whether we should terminate or not.
*/
-static void
-p_ere_exp(struct parse *p)
+static bool
+p_ere_exp(struct parse *p, struct branchc *bc)
{
char c;
wint_t wc;
@@ -346,6 +344,7 @@ p_ere_exp(struct parse *p)
sopno subno;
int wascaret = 0;
+ (void) bc;
assert(MORE()); /* caller should have ensured this */
c = GETNEXT();
@@ -359,7 +358,7 @@ p_ere_exp(struct parse *p)
p->pbegin[subno] = HERE();
EMIT(OLPAREN, subno);
if (!SEE(')'))
- p_ere(p, ')');
+ p_re(p, ')', IGN);
if (subno < NPAREN) {
p->pend[subno] = HERE();
assert(p->pend[subno] != 0);
@@ -425,7 +424,7 @@ p_ere_exp(struct parse *p)
break;
default:
if (p->error != 0)
- return;
+ return (false);
p->next--;
wc = WGETNEXT();
ordinary(p, wc);
@@ -433,11 +432,11 @@ p_ere_exp(struct parse *p)
}
if (!MORE())
- return;
+ return (false);
c = PEEK();
/* we call { a repetition if followed by a digit */
if (!(c == '*' || c == '+' || c == '?' || c == '{'))
- return; /* no repetition, we're done */
+ return (false); /* no repetition, we're done */
else if (c == '{')
(void) REQUIRE(MORE2() && \
(isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
@@ -486,12 +485,13 @@ p_ere_exp(struct parse *p)
}
if (!MORE())
- return;
+ return (false);
c = PEEK();
if (!(c == '*' || c == '+' || c == '?' ||
(c == '{' && MORE2() && isdigit((uch)PEEK2()))))
- return;
+ return (false);
SETERROR(REG_BADRPT);
+ return (false);
}
/*
@@ -506,47 +506,181 @@ p_str(struct parse *p)
}
/*
- * p_bre - BRE parser top level, anchoring and concatenation
- * Giving end1 as OUT essentially eliminates the end1/end2 check.
- *
- * This implementation is a bit of a kludge, in that a trailing $ is first
- * taken as an ordinary character and then revised to be an anchor.
- * The amount of lookahead needed to avoid this kludge is excessive.
+ * Eat consecutive branch delimiters for the kind of expression that we are
+ * parsing, return the number of delimiters that we ate.
+ */
+static int
+p_branch_eat_delim(struct parse *p, struct branchc *bc)
+{
+ int nskip;
+
+ (void) bc;
+ nskip = 0;
+ while (EAT('|'))
+ ++nskip;
+ return (nskip);
+}
+
+/*
+ * Insert necessary branch book-keeping operations. This emits a
+ * bogus 'next' offset, since we still have more to parse
*/
static void
-p_bre(struct parse *p,
- int end1, /* first terminating character */
- int end2) /* second terminating character */
+p_branch_ins_offset(struct parse *p, struct branchc *bc)
+{
+ if (bc->nbranch == 0) {
+ INSERT(OCH_, bc->start); /* offset is wrong */
+ bc->fwd = bc->start;
+ bc->back = bc->start;
+ }
+
+ ASTERN(OOR1, bc->back);
+ bc->back = THERE();
+ AHEAD(bc->fwd); /* fix previous offset */
+ bc->fwd = HERE();
+ EMIT(OOR2, 0); /* offset is very wrong */
+ ++bc->nbranch;
+}
+
+/*
+ * Fix the offset of the tail branch, if we actually had any branches.
+ * This is to correct the bogus placeholder offset that we use.
+ */
+static void
+p_branch_fix_tail(struct parse *p, struct branchc *bc)
+{
+ /* Fix bogus offset at the tail if we actually have branches */
+ if (bc->nbranch > 0) {
+ AHEAD(bc->fwd);
+ ASTERN(O_CH, bc->back);
+ }
+}
+
+/*
+ * Signal to the parser that an empty branch has been encountered; this will,
+ * in the future, be used to allow for more permissive behavior with empty
+ * branches. The return value should indicate whether parsing may continue
+ * or not.
+ */
+static bool
+p_branch_empty(struct parse *p, struct branchc *bc)
+{
+ (void) bc;
+ SETERROR(REG_BADPAT);
+ return (false);
+}
+
+/*
+ * Take care of any branching requirements. This includes inserting the
+ * appropriate branching instructions as well as eating all of the branch
+ * delimiters until we either run out of pattern or need to parse more pattern.
+ */
+static bool
+p_branch_do(struct parse *p, struct branchc *bc)
{
- sopno start = HERE();
- int first = 1; /* first subexpression? */
- int wasdollar = 0;
+ int ate = 0;
+
+ ate = p_branch_eat_delim(p, bc);
+ if (ate == 0)
+ return (false);
+ else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
+ /*
+ * Halt parsing only if we have an empty branch and
+ * p_branch_empty indicates that we must not continue.
+ * In the future, this will not necessarily be an error.
+ */
+ return (false);
+ p_branch_ins_offset(p, bc);
+
+ return (true);
+}
+static void
+p_bre_pre_parse(struct parse *p, struct branchc *bc)
+{
+ (void) bc;
+ /*
+ * Does not move cleanly into expression parser because of
+ * ordinary interpration of * at the beginning position of
+ * an expression.
+ */
if (EAT('^')) {
EMIT(OBOL, 0);
p->g->iflags |= USEBOL;
p->g->nbol++;
}
- while (MORE() && !SEETWO(end1, end2)) {
- wasdollar = p_simp_re(p, first);
- first = 0;
- }
- if (wasdollar) { /* oops, that was a trailing anchor */
+}
+
+static void
+p_bre_post_parse(struct parse *p, struct branchc *bc)
+{
+ /* Expression is terminating due to EOL token */
+ if (bc->terminate) {
DROP(1);
EMIT(OEOL, 0);
p->g->iflags |= USEEOL;
p->g->neol++;
}
+}
+
+/*
+ * Top level parser, concatenation and BRE anchoring.
+ * Giving end1 as OUT essentially eliminates the end1/end2 check.
+ *
+ * This implementation is a bit of a kludge, in that a trailing $ is first
+ * taken as an ordinary character and then revised to be an anchor.
+ * The amount of lookahead needed to avoid this kludge is excessive.
+ */
+static void
+p_re(struct parse *p,
+ int end1, /* first terminating character */
+ int end2) /* second terminating character; ignored for EREs */
+{
+ struct branchc bc;
- (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */
+ bc.nbranch = 0;
+ if (end1 == OUT && end2 == OUT)
+ bc.outer = true;
+ else
+ bc.outer = false;
+#define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
+ for (;;) {
+ bc.start = HERE();
+ bc.nchain = 0;
+ bc.terminate = false;
+ if (p->pre_parse != NULL)
+ p->pre_parse(p, &bc);
+ while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
+ !SEEEND()) {
+ bc.terminate = p->parse_expr(p, &bc);
+ ++bc.nchain;
+ }
+ if (p->post_parse != NULL)
+ p->post_parse(p, &bc);
+ (void) REQUIRE(HERE() != bc.start, REG_BADPAT);
+ if (!p->allowbranch)
+ break;
+ /*
+ * p_branch_do's return value indicates whether we should
+ * continue parsing or not. This is both for correctness and
+ * a slight optimization, because it will check if we've
+ * encountered an empty branch or the end of the string
+ * immediately following a branch delimiter.
+ */
+ if (!p_branch_do(p, &bc))
+ break;
+ }
+#undef SEE_END
+ if (p->allowbranch)
+ p_branch_fix_tail(p, &bc);
+ assert(!MORE() || SEE(end1));
}
/*
* p_simp_re - parse a simple RE, an atom possibly followed by a repetition
*/
-static int /* was the simple RE an unbackslashed $? */
-p_simp_re(struct parse *p,
- int starordinary) /* is a leading * an ordinary character? */
+static bool /* was the simple RE an unbackslashed $? */
+p_simp_re(struct parse *p, struct branchc *bc)
{
int c;
int count;
@@ -592,7 +726,7 @@ p_simp_re(struct parse *p,
EMIT(OLPAREN, subno);
/* the MORE here is an error heuristic */
if (MORE() && !SEETWO('\\', ')'))
- p_bre(p, '\\', ')');
+ p_re(p, '\\', ')');
if (subno < NPAREN) {
p->pend[subno] = HERE();
assert(p->pend[subno] != 0);
@@ -627,11 +761,16 @@ p_simp_re(struct parse *p,
p->g->backrefs = 1;
break;
case '*':
- (void) REQUIRE(starordinary, REG_BADRPT);
+ /*
+ * Ordinary if used as the first character beyond BOL anchor of
+ * a (sub-)expression, counts as a bad repetition operator if it
+ * appears otherwise.
+ */
+ (void) REQUIRE(bc->nchain == 0, REG_BADRPT);
/* FALLTHROUGH */
default:
if (p->error != 0)
- return (0); /* Definitely not $... */
+ return (false); /* Definitely not $... */
p->next--;
wc = WGETNEXT();
ordinary(p, wc);
@@ -662,9 +801,9 @@ p_simp_re(struct parse *p,
SETERROR(REG_BADBR);
}
} else if (c == '$') /* $ (but not \$) ends it */
- return (1);
+ return (true);
- return (0);
+ return (false);
}
/*
@@ -893,7 +1032,7 @@ p_b_coll_elem(struct parse *p,
}
len = p->next - sp;
for (cp = cnames; cp->name != NULL; cp++)
- if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+ if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
return (cp->code); /* known name */
(void) memset(&mbs, 0, sizeof (mbs));
if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
@@ -1436,12 +1575,12 @@ findmust(struct parse *p, struct re_guts *g)
scan += OPND(s);
s = *scan;
/* assert() interferes w debug printouts */
- if (OP(s) != O_QUEST && OP(s) != O_CH &&
- OP(s) != OOR2) {
+ if (OP(s) != (sop)O_QUEST &&
+ OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
g->iflags |= BAD;
return;
}
- } while (OP(s) != O_QUEST && OP(s) != O_CH);
+ } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
/* FALLTHROUGH */
case OBOW: /* things that break a sequence */
case OEOW:
@@ -1450,7 +1589,7 @@ findmust(struct parse *p, struct re_guts *g)
case O_QUEST:
case O_CH:
case OEND:
- if (newlen > g->mlen) { /* ends one */
+ if (newlen > (sopno)g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
if (offset > -1) {
@@ -1465,7 +1604,7 @@ findmust(struct parse *p, struct re_guts *g)
newlen = 0;
break;
case OANY:
- if (newlen > g->mlen) { /* ends one */
+ if (newlen > (sopno)g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
if (offset > -1) {
@@ -1483,7 +1622,7 @@ findmust(struct parse *p, struct re_guts *g)
break;
case OANYOF: /* may or may not invalidate offset */
/* First, everything as OANY */
- if (newlen > g->mlen) { /* ends one */
+ if (newlen > (sopno)g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
if (offset > -1) {
@@ -1507,7 +1646,7 @@ findmust(struct parse *p, struct re_guts *g)
* save the last known good offset, in case the
* must sequence doesn't occur later.
*/
- if (newlen > g->mlen) { /* ends one */
+ if (newlen > (sopno)g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
if (offset > -1)
@@ -1567,7 +1706,7 @@ altoffset(sop *scan, int offset)
largest = 0;
try = 0;
s = *scan++;
- while (OP(s) != O_QUEST && OP(s) != O_CH) {
+ while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
switch (OP(s)) {
case OOR1:
if (try > largest)
@@ -1583,10 +1722,10 @@ altoffset(sop *scan, int offset)
do {
scan += OPND(s);
s = *scan;
- if (OP(s) != O_QUEST && OP(s) != O_CH &&
- OP(s) != OOR2)
+ if (OP(s) != (sop)O_QUEST &&
+ OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
return (-1);
- } while (OP(s) != O_QUEST && OP(s) != O_CH);
+ } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
/*
* We must skip to the next position, or we'll
* leave altoffset() too early.
diff --git a/usr/src/lib/libc/port/regex/regex2.h b/usr/src/lib/libc/port/regex/regex2.h
index 551611c610..44b20c3c24 100644
--- a/usr/src/lib/libc/port/regex/regex2.h
+++ b/usr/src/lib/libc/port/regex/regex2.h
@@ -186,4 +186,5 @@ struct re_guts {
/* misc utilities */
#define OUT (CHAR_MIN - 1) /* a non-character value */
+#define IGN (CHAR_MIN - 2)
#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_')