diff options
Diffstat (limited to 'usr/src/lib/libc/port/regex')
| -rw-r--r-- | usr/src/lib/libc/port/regex/engine.c | 213 | ||||
| -rw-r--r-- | usr/src/lib/libc/port/regex/regcomp.c | 341 | ||||
| -rw-r--r-- | usr/src/lib/libc/port/regex/regex2.h | 1 |
3 files changed, 304 insertions, 251 deletions
diff --git a/usr/src/lib/libc/port/regex/engine.c b/usr/src/lib/libc/port/regex/engine.c index c66c08e59e..2e04a9085b 100644 --- a/usr/src/lib/libc/port/regex/engine.c +++ b/usr/src/lib/libc/port/regex/engine.c @@ -34,6 +34,8 @@ * SUCH DAMAGE. */ +#include <stdbool.h> + /* * The matching engine and friends. This file is #included by regexec.c * after suitable #defines of a variety of macros used herein, so that @@ -43,8 +45,7 @@ #ifdef SNAMES #define matcher smatcher -#define fast sfast -#define slow sslow +#define walk swalk #define dissect sdissect #define backref sbackref #define step sstep @@ -54,8 +55,7 @@ #endif #ifdef LNAMES #define matcher lmatcher -#define fast lfast -#define slow lslow +#define walk lwalk #define dissect ldissect #define backref lbackref #define step lstep @@ -65,8 +65,7 @@ #endif #ifdef MNAMES #define matcher mmatcher -#define fast mfast -#define slow mslow +#define walk mwalk #define dissect mdissect #define backref mbackref #define step mstep @@ -104,10 +103,8 @@ static const char *dissect(struct match *, const char *, const char *, sopno, sopno); static const char *backref(struct match *, const char *, const char *, sopno, sopno, sopno, int); -static const char *fast(struct match *, const char *, const char *, sopno, - sopno); -static const char *slow(struct match *, const char *, const char *, sopno, - sopno); +static const char *walk(struct match *m, const char *start, const char *stop, + sopno startst, sopno stopst, bool fast); static states step(struct re_guts *, sopno, sopno, states, wint_t, states); #define MAX_RECURSION 100 #define BOL (OUT-1) @@ -253,7 +250,7 @@ matcher(struct re_guts *g, const char *string, size_t nmatch, /* this loop does only one repetition except for backrefs */ for (;;) { - endp = fast(m, start, stop, gf, gl); + endp = walk(m, start, stop, gf, gl, true); if (endp == NULL) { /* a miss */ if (m->pmatch != NULL) free((char *)m->pmatch); @@ -269,7 +266,7 @@ matcher(struct re_guts *g, const char *string, size_t nmatch, assert(m->coldp != NULL); for (;;) { NOTE("finding start"); - endp = slow(m, m->coldp, stop, gf, gl); + endp = walk(m, m->coldp, stop, gf, gl, false); if (endp != NULL) break; assert(m->coldp < m->endp); @@ -314,7 +311,7 @@ matcher(struct re_guts *g, const char *string, size_t nmatch, if (dp != NULL || endp <= m->coldp) break; /* defeat */ NOTE("backoff"); - endp = slow(m, m->coldp, endp-1, gf, gl); + endp = walk(m, m->coldp, endp-1, gf, gl, false); if (endp == NULL) break; /* defeat */ /* try it on a shorter possibility */ @@ -395,7 +392,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, es += OPND(m->g->strip[es]); break; case OCH_: - while (OP(m->g->strip[es]) != O_CH) + while (OP(m->g->strip[es]) != (sop)O_CH) es += OPND(m->g->strip[es]); break; } @@ -427,10 +424,10 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, stp = stop; for (;;) { /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); + rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); + tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ @@ -440,12 +437,9 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, ssub = ss + 1; esub = es - 1; /* did innards match? */ - if (slow(m, sp, rest, ssub, esub) != NULL) { + if (walk(m, sp, rest, ssub, esub, false) != NULL) { dp = dissect(m, sp, rest, ssub, esub); assert(dp == rest); -#if defined(__lint) - (void) dp; -#endif } else /* no */ assert(sp == rest); sp = rest; @@ -454,10 +448,10 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, stp = stop; for (;;) { /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); + rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); + tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ @@ -469,7 +463,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, ssp = sp; oldssp = ssp; for (;;) { /* find last match of innards */ - sep = slow(m, ssp, rest, ssub, esub); + sep = walk(m, ssp, rest, ssub, esub, false); if (sep == NULL || sep == ssp) break; /* failed or matched null */ oldssp = ssp; /* on to next try */ @@ -481,7 +475,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, ssp = oldssp; } assert(sep == rest); /* must exhaust substring */ - assert(slow(m, ssp, sep, ssub, esub) == rest); + assert(walk(m, ssp, sep, ssub, esub, false) == rest); dp = dissect(m, ssp, sep, ssub, esub); assert(dp == sep); sp = rest; @@ -490,10 +484,10 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, stp = stop; for (;;) { /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); + rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); + tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ @@ -504,7 +498,8 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, esub = ss + OPND(m->g->strip[ss]) - 1; assert(OP(m->g->strip[esub]) == OOR1); for (;;) { /* find first matching branch */ - if (slow(m, sp, rest, ssub, esub) == rest) + if (walk(m, sp, rest, ssub, esub, + false) == rest) break; /* it matched all of it */ /* that one missed, try next one */ assert(OP(m->g->strip[esub]) == OOR1); @@ -512,7 +507,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, assert(OP(m->g->strip[esub]) == OOR2); ssub = esub + 1; esub += OPND(m->g->strip[esub]); - if (OP(m->g->strip[esub]) == OOR2) + if (OP(m->g->strip[esub]) == (sop)OOR2) esub--; else assert(OP(m->g->strip[esub]) == O_CH); @@ -637,7 +632,7 @@ backref(struct match *m, const char *start, const char *stop, sopno startst, do { assert(OP(s) == OOR2); ss += OPND(s); - } while (OP(s = m->g->strip[ss]) != O_CH); + } while (OP(s = m->g->strip[ss]) != (sop)O_CH); /* note that the ss++ gets us past the O_CH */ break; default: /* have to make a choice */ @@ -670,7 +665,7 @@ backref(struct match *m, const char *start, const char *stop, sopno startst, ssp = m->offp + m->pmatch[i].rm_so; if (memcmp(sp, ssp, len) != 0) return (NULL); - while (m->g->strip[ss] != SOP(O_BACK, i)) + while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) ss++; return (backref(m, sp+len, stop, ss+1, stopst, lev, rec)); case OQUEST_: /* to null or not */ @@ -701,13 +696,13 @@ backref(struct match *m, const char *start, const char *stop, sopno startst, if (dp != NULL) return (dp); /* that one missed, try next one */ - if (OP(m->g->strip[esub]) == O_CH) + if (OP(m->g->strip[esub]) == (sop)O_CH) return (NULL); /* there is none */ esub++; - assert(OP(m->g->strip[esub]) == OOR2); + assert(OP(m->g->strip[esub]) == (sop)OOR2); ssub = esub + 1; esub += OPND(m->g->strip[esub]); - if (OP(m->g->strip[esub]) == OOR2) + if (OP(m->g->strip[esub]) == (sop)OOR2) esub--; else assert(OP(m->g->strip[esub]) == O_CH); @@ -745,115 +740,15 @@ backref(struct match *m, const char *start, const char *stop, sopno startst, } /* - * fast - step through the string at top speed + * Step through the string either quickly or slowly. Returns where it ended + * or NULL. */ static const char * -fast(struct match *m, const char *start, const char *stop, sopno startst, - sopno stopst) +walk(struct match *m, const char *start, const char *stop, sopno startst, + sopno stopst, bool fast) { states st = m->st; states fresh = m->fresh; - states tmp = m->tmp; - const char *p = start; - wint_t c; - wint_t lastc; /* previous c */ - wint_t flagch; - int i; - const char *coldp; /* last p after which no match was underway */ - size_t clen; - - CLEAR(st); - SET1(st, startst); - SP("fast", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); - ASSIGN(fresh, st); - SP("start", st, *p); - coldp = NULL; - if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) - c = OUT; - else { - /* - * XXX Wrong if the previous character was multi-byte. - * Newline never is (in encodings supported by FreeBSD), - * so this only breaks the ISWORD tests below. - */ - c = (uch)*(start - 1); - } - for (;;) { - /* next character */ - lastc = c; - if (p == m->endp) { - clen = 0; - c = OUT; - } else - clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); - if (EQ(st, fresh)) - coldp = p; - - /* is there an EOL and/or BOL between lastc and c? */ - flagch = '\0'; - i = 0; - if ((lastc == '\n' && m->g->cflags®_NEWLINE) || - (lastc == OUT && !(m->eflags®_NOTBOL))) { - flagch = BOL; - i = m->g->nbol; - } - if ((c == '\n' && m->g->cflags®_NEWLINE) || - (c == OUT && !(m->eflags®_NOTEOL))) { - flagch = (flagch == BOL) ? BOLEOL : EOL; - i += m->g->neol; - } - if (i != 0) { - for (; i > 0; i--) - st = step(m->g, startst, stopst, st, - flagch, st); - SP("boleol", st, c); - } - - /* how about a word boundary? */ - if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c))) { - flagch = BOW; - } - if ((lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c)))) { - flagch = EOW; - } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("boweow", st, c); - } - - /* are we done? */ - if (ISSET(st, stopst) || p == stop || clen > stop - p) - break; /* NOTE BREAK OUT */ - - /* no, we must deal with this character */ - ASSIGN(tmp, st); - ASSIGN(st, fresh); - assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); - SP("aft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p += clen; - } - - assert(coldp != NULL); - m->coldp = coldp; - if (ISSET(st, stopst)) - return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); - else - return (NULL); -} - -/* - * slow - step through the string more deliberately - */ -static const char * -slow(struct match *m, const char *start, const char *stop, sopno startst, - sopno stopst) -{ - states st = m->st; states empty = m->empty; states tmp = m->tmp; const char *p = start; @@ -869,6 +764,8 @@ slow(struct match *m, const char *start, const char *stop, sopno startst, SET1(st, startst); SP("sstart", st, *p); st = step(m->g, startst, stopst, st, NOTHING, st); + if (fast) + ASSIGN(fresh, st); matchp = NULL; if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) c = OUT; @@ -889,6 +786,9 @@ slow(struct match *m, const char *start, const char *stop, sopno startst, } else clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); + if (fast && EQ(st, fresh)) + matchp = p; + /* is there an EOL and/or BOL between lastc and c? */ flagch = '\0'; i = 0; @@ -924,14 +824,21 @@ slow(struct match *m, const char *start, const char *stop, sopno startst, } /* are we done? */ - if (ISSET(st, stopst)) - matchp = p; - if (EQ(st, empty) || p == stop || clen > stop - p) + if (ISSET(st, stopst)) { + if (fast) + break; + else + matchp = p; + } + if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) break; /* NOTE BREAK OUT */ /* no, we must deal with this character */ ASSIGN(tmp, st); - ASSIGN(st, empty); + if (fast) + ASSIGN(st, fresh); + else + ASSIGN(st, empty); assert(c != OUT); st = step(m->g, startst, stopst, tmp, c, st); SP("saft", st, c); @@ -939,10 +846,17 @@ slow(struct match *m, const char *start, const char *stop, sopno startst, p += clen; } - return (matchp); + if (fast) { + assert(matchp != NULL); + m->coldp = matchp; + if (ISSET(st, stopst)) + return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); + else + return (NULL); + } else + return (matchp); } - /* * step - map set of states reachable before char to set reachable after */ @@ -1028,22 +942,22 @@ step(struct re_guts *g, break; case OCH_: /* mark the first two branches */ FWD(aft, aft, 1); - assert(OP(g->strip[pc+OPND(s)]) == OOR2); + assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); FWD(aft, aft, OPND(s)); break; case OOR1: /* done a branch, find the O_CH */ if (ISSTATEIN(aft, here)) { for (look = 1; - OP(s = g->strip[pc+look]) != O_CH; + OP(s = g->strip[pc+look]) != (sop)O_CH; look += OPND(s)) - assert(OP(s) == OOR2); + assert(OP(s) == (sop)OOR2); FWD(aft, aft, look + 1); } break; case OOR2: /* propagate OCH_'s marking */ FWD(aft, aft, 1); - if (OP(g->strip[pc+OPND(s)]) != O_CH) { - assert(OP(g->strip[pc+OPND(s)]) == OOR2); + if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { + assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); FWD(aft, aft, OPND(s)); } break; @@ -1124,8 +1038,7 @@ pchar(int ch) #endif #undef matcher -#undef fast -#undef slow +#undef walk #undef dissect #undef backref #undef step diff --git a/usr/src/lib/libc/port/regex/regcomp.c b/usr/src/lib/libc/port/regex/regcomp.c index 3bd27129a0..8df8c337cd 100644 --- a/usr/src/lib/libc/port/regex/regcomp.c +++ b/usr/src/lib/libc/port/regex/regcomp.c @@ -41,8 +41,9 @@ #include <string.h> #include <ctype.h> #include <limits.h> -#include <stdlib.h> #include <regex.h> +#include <stdlib.h> +#include <stdbool.h> #include <wchar.h> #include <wctype.h> @@ -56,6 +57,24 @@ #include "../locale/mblocal.h" /* + * Branching context, used to keep track of branch state for all of the branch- + * aware functions. In addition to keeping track of branch positions for the + * p_branch_* functions, we use this to simplify some clumsiness in BREs for + * detection of whether ^ is acting as an anchor or being used erroneously and + * also for whether we're in a sub-expression or not. + */ +struct branchc { + sopno start; + sopno back; + sopno fwd; + + int nbranch; + int nchain; + bool outer; + bool terminate; +}; + +/* * parse structure, passed up and down to avoid global variables and * other clumsinesses */ @@ -71,6 +90,11 @@ struct parse { #define NPAREN 10 /* we need to remember () 1-9 for back refs */ sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ sopno pend[NPAREN]; /* -> ) ([0] unused) */ + bool allowbranch; /* can this expression branch? */ + bool bre; /* convenience; is this a BRE? */ + bool (*parse_expr)(struct parse *, struct branchc *); + void (*pre_parse)(struct parse *, struct branchc *); + void (*post_parse)(struct parse *, struct branchc *); }; /* ========= begin header generated by ./mkh ========= */ @@ -79,11 +103,17 @@ extern "C" { #endif /* === regcomp.c === */ -static void p_ere(struct parse *p, int stop); -static void p_ere_exp(struct parse *p); +static bool p_ere_exp(struct parse *p, struct branchc *bc); static void p_str(struct parse *p); -static void p_bre(struct parse *p, int end1, int end2); -static int p_simp_re(struct parse *p, int starordinary); +static int p_branch_eat_delim(struct parse *p, struct branchc *bc); +static void p_branch_ins_offset(struct parse *p, struct branchc *bc); +static void p_branch_fix_tail(struct parse *p, struct branchc *bc); +static bool p_branch_empty(struct parse *p, struct branchc *bc); +static bool p_branch_do(struct parse *p, struct branchc *bc); +static void p_bre_pre_parse(struct parse *p, struct branchc *bc); +static void p_bre_post_parse(struct parse *p, struct branchc *bc); +static void p_re(struct parse *p, int end1, int end2); +static bool p_simp_re(struct parse *p, struct branchc *bc); static int p_count(struct parse *p); static void p_bracket(struct parse *p); static void p_b_term(struct parse *p, cset *cs); @@ -133,6 +163,7 @@ static char nuls[10]; /* place to point scanner in event of error */ #define MORE2() (p->next+1 < p->end) #define SEE(c) (MORE() && PEEK() == (c)) #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) +#define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) #define NEXT() (p->next++) @@ -229,6 +260,19 @@ regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern, p->pbegin[i] = 0; p->pend[i] = 0; } + if (cflags & REG_EXTENDED) { + p->allowbranch = true; + p->bre = false; + p->parse_expr = p_ere_exp; + p->pre_parse = NULL; + p->post_parse = NULL; + } else { + p->allowbranch = false; + p->bre = true; + p->parse_expr = p_simp_re; + p->pre_parse = p_bre_pre_parse; + p->post_parse = p_bre_post_parse; + } g->sets = NULL; g->ncsets = 0; g->cflags = cflags; @@ -246,12 +290,10 @@ regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern, /* do it */ EMIT(OEND, 0); g->firststate = THERE(); - if (cflags®_EXTENDED) - p_ere(p, OUT); - else if (cflags®_NOSPEC) + if (cflags & REG_NOSPEC) p_str(p); else - p_bre(p, OUT, OUT); + p_re(p, OUT, OUT); EMIT(OEND, 0); g->laststate = THERE(); @@ -288,55 +330,11 @@ regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern, } /* - * p_ere - ERE parser top level, concatenation and alternation - */ -static void -p_ere(struct parse *p, - int stop) /* character this ERE should end at */ -{ - char c; - sopno prevback; - sopno prevfwd; - sopno conc; - int first = 1; /* is this the first alternative? */ - - for (;;) { - /* do a bunch of concatenated expressions */ - conc = HERE(); - while (MORE() && (c = PEEK()) != '|' && c != stop) - p_ere_exp(p); - /* require nonempty */ - (void) REQUIRE(HERE() != conc, REG_BADPAT); - - if (!EAT('|')) - break; /* NOTE BREAK OUT */ - - if (first) { - INSERT(OCH_, conc); /* offset is wrong */ - prevfwd = conc; - prevback = conc; - first = 0; - } - ASTERN(OOR1, prevback); - prevback = THERE(); - AHEAD(prevfwd); /* fix previous offset */ - prevfwd = HERE(); - EMIT(OOR2, 0); /* offset is very wrong */ - } - - if (!first) { /* tail-end fixups */ - AHEAD(prevfwd); - ASTERN(O_CH, prevback); - } - - assert(!MORE() || SEE(stop)); -} - -/* - * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op + * Parse one subERE, an atom possibly followed by a repetition op, + * return whether we should terminate or not. */ -static void -p_ere_exp(struct parse *p) +static bool +p_ere_exp(struct parse *p, struct branchc *bc) { char c; wint_t wc; @@ -346,6 +344,7 @@ p_ere_exp(struct parse *p) sopno subno; int wascaret = 0; + (void) bc; assert(MORE()); /* caller should have ensured this */ c = GETNEXT(); @@ -359,7 +358,7 @@ p_ere_exp(struct parse *p) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); if (!SEE(')')) - p_ere(p, ')'); + p_re(p, ')', IGN); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); @@ -425,7 +424,7 @@ p_ere_exp(struct parse *p) break; default: if (p->error != 0) - return; + return (false); p->next--; wc = WGETNEXT(); ordinary(p, wc); @@ -433,11 +432,11 @@ p_ere_exp(struct parse *p) } if (!MORE()) - return; + return (false); c = PEEK(); /* we call { a repetition if followed by a digit */ if (!(c == '*' || c == '+' || c == '?' || c == '{')) - return; /* no repetition, we're done */ + return (false); /* no repetition, we're done */ else if (c == '{') (void) REQUIRE(MORE2() && \ (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); @@ -486,12 +485,13 @@ p_ere_exp(struct parse *p) } if (!MORE()) - return; + return (false); c = PEEK(); if (!(c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())))) - return; + return (false); SETERROR(REG_BADRPT); + return (false); } /* @@ -506,47 +506,181 @@ p_str(struct parse *p) } /* - * p_bre - BRE parser top level, anchoring and concatenation - * Giving end1 as OUT essentially eliminates the end1/end2 check. - * - * This implementation is a bit of a kludge, in that a trailing $ is first - * taken as an ordinary character and then revised to be an anchor. - * The amount of lookahead needed to avoid this kludge is excessive. + * Eat consecutive branch delimiters for the kind of expression that we are + * parsing, return the number of delimiters that we ate. + */ +static int +p_branch_eat_delim(struct parse *p, struct branchc *bc) +{ + int nskip; + + (void) bc; + nskip = 0; + while (EAT('|')) + ++nskip; + return (nskip); +} + +/* + * Insert necessary branch book-keeping operations. This emits a + * bogus 'next' offset, since we still have more to parse */ static void -p_bre(struct parse *p, - int end1, /* first terminating character */ - int end2) /* second terminating character */ +p_branch_ins_offset(struct parse *p, struct branchc *bc) +{ + if (bc->nbranch == 0) { + INSERT(OCH_, bc->start); /* offset is wrong */ + bc->fwd = bc->start; + bc->back = bc->start; + } + + ASTERN(OOR1, bc->back); + bc->back = THERE(); + AHEAD(bc->fwd); /* fix previous offset */ + bc->fwd = HERE(); + EMIT(OOR2, 0); /* offset is very wrong */ + ++bc->nbranch; +} + +/* + * Fix the offset of the tail branch, if we actually had any branches. + * This is to correct the bogus placeholder offset that we use. + */ +static void +p_branch_fix_tail(struct parse *p, struct branchc *bc) +{ + /* Fix bogus offset at the tail if we actually have branches */ + if (bc->nbranch > 0) { + AHEAD(bc->fwd); + ASTERN(O_CH, bc->back); + } +} + +/* + * Signal to the parser that an empty branch has been encountered; this will, + * in the future, be used to allow for more permissive behavior with empty + * branches. The return value should indicate whether parsing may continue + * or not. + */ +static bool +p_branch_empty(struct parse *p, struct branchc *bc) +{ + (void) bc; + SETERROR(REG_BADPAT); + return (false); +} + +/* + * Take care of any branching requirements. This includes inserting the + * appropriate branching instructions as well as eating all of the branch + * delimiters until we either run out of pattern or need to parse more pattern. + */ +static bool +p_branch_do(struct parse *p, struct branchc *bc) { - sopno start = HERE(); - int first = 1; /* first subexpression? */ - int wasdollar = 0; + int ate = 0; + + ate = p_branch_eat_delim(p, bc); + if (ate == 0) + return (false); + else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) + /* + * Halt parsing only if we have an empty branch and + * p_branch_empty indicates that we must not continue. + * In the future, this will not necessarily be an error. + */ + return (false); + p_branch_ins_offset(p, bc); + + return (true); +} +static void +p_bre_pre_parse(struct parse *p, struct branchc *bc) +{ + (void) bc; + /* + * Does not move cleanly into expression parser because of + * ordinary interpration of * at the beginning position of + * an expression. + */ if (EAT('^')) { EMIT(OBOL, 0); p->g->iflags |= USEBOL; p->g->nbol++; } - while (MORE() && !SEETWO(end1, end2)) { - wasdollar = p_simp_re(p, first); - first = 0; - } - if (wasdollar) { /* oops, that was a trailing anchor */ +} + +static void +p_bre_post_parse(struct parse *p, struct branchc *bc) +{ + /* Expression is terminating due to EOL token */ + if (bc->terminate) { DROP(1); EMIT(OEOL, 0); p->g->iflags |= USEEOL; p->g->neol++; } +} + +/* + * Top level parser, concatenation and BRE anchoring. + * Giving end1 as OUT essentially eliminates the end1/end2 check. + * + * This implementation is a bit of a kludge, in that a trailing $ is first + * taken as an ordinary character and then revised to be an anchor. + * The amount of lookahead needed to avoid this kludge is excessive. + */ +static void +p_re(struct parse *p, + int end1, /* first terminating character */ + int end2) /* second terminating character; ignored for EREs */ +{ + struct branchc bc; - (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */ + bc.nbranch = 0; + if (end1 == OUT && end2 == OUT) + bc.outer = true; + else + bc.outer = false; +#define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) + for (;;) { + bc.start = HERE(); + bc.nchain = 0; + bc.terminate = false; + if (p->pre_parse != NULL) + p->pre_parse(p, &bc); + while (MORE() && (!p->allowbranch || !SEESPEC('|')) && + !SEEEND()) { + bc.terminate = p->parse_expr(p, &bc); + ++bc.nchain; + } + if (p->post_parse != NULL) + p->post_parse(p, &bc); + (void) REQUIRE(HERE() != bc.start, REG_BADPAT); + if (!p->allowbranch) + break; + /* + * p_branch_do's return value indicates whether we should + * continue parsing or not. This is both for correctness and + * a slight optimization, because it will check if we've + * encountered an empty branch or the end of the string + * immediately following a branch delimiter. + */ + if (!p_branch_do(p, &bc)) + break; + } +#undef SEE_END + if (p->allowbranch) + p_branch_fix_tail(p, &bc); + assert(!MORE() || SEE(end1)); } /* * p_simp_re - parse a simple RE, an atom possibly followed by a repetition */ -static int /* was the simple RE an unbackslashed $? */ -p_simp_re(struct parse *p, - int starordinary) /* is a leading * an ordinary character? */ +static bool /* was the simple RE an unbackslashed $? */ +p_simp_re(struct parse *p, struct branchc *bc) { int c; int count; @@ -592,7 +726,7 @@ p_simp_re(struct parse *p, EMIT(OLPAREN, subno); /* the MORE here is an error heuristic */ if (MORE() && !SEETWO('\\', ')')) - p_bre(p, '\\', ')'); + p_re(p, '\\', ')'); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); @@ -627,11 +761,16 @@ p_simp_re(struct parse *p, p->g->backrefs = 1; break; case '*': - (void) REQUIRE(starordinary, REG_BADRPT); + /* + * Ordinary if used as the first character beyond BOL anchor of + * a (sub-)expression, counts as a bad repetition operator if it + * appears otherwise. + */ + (void) REQUIRE(bc->nchain == 0, REG_BADRPT); /* FALLTHROUGH */ default: if (p->error != 0) - return (0); /* Definitely not $... */ + return (false); /* Definitely not $... */ p->next--; wc = WGETNEXT(); ordinary(p, wc); @@ -662,9 +801,9 @@ p_simp_re(struct parse *p, SETERROR(REG_BADBR); } } else if (c == '$') /* $ (but not \$) ends it */ - return (1); + return (true); - return (0); + return (false); } /* @@ -893,7 +1032,7 @@ p_b_coll_elem(struct parse *p, } len = p->next - sp; for (cp = cnames; cp->name != NULL; cp++) - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') + if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) return (cp->code); /* known name */ (void) memset(&mbs, 0, sizeof (mbs)); if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) @@ -1436,12 +1575,12 @@ findmust(struct parse *p, struct re_guts *g) scan += OPND(s); s = *scan; /* assert() interferes w debug printouts */ - if (OP(s) != O_QUEST && OP(s) != O_CH && - OP(s) != OOR2) { + if (OP(s) != (sop)O_QUEST && + OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) { g->iflags |= BAD; return; } - } while (OP(s) != O_QUEST && OP(s) != O_CH); + } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); /* FALLTHROUGH */ case OBOW: /* things that break a sequence */ case OEOW: @@ -1450,7 +1589,7 @@ findmust(struct parse *p, struct re_guts *g) case O_QUEST: case O_CH: case OEND: - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { @@ -1465,7 +1604,7 @@ findmust(struct parse *p, struct re_guts *g) newlen = 0; break; case OANY: - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { @@ -1483,7 +1622,7 @@ findmust(struct parse *p, struct re_guts *g) break; case OANYOF: /* may or may not invalidate offset */ /* First, everything as OANY */ - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { @@ -1507,7 +1646,7 @@ findmust(struct parse *p, struct re_guts *g) * save the last known good offset, in case the * must sequence doesn't occur later. */ - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) @@ -1567,7 +1706,7 @@ altoffset(sop *scan, int offset) largest = 0; try = 0; s = *scan++; - while (OP(s) != O_QUEST && OP(s) != O_CH) { + while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) { switch (OP(s)) { case OOR1: if (try > largest) @@ -1583,10 +1722,10 @@ altoffset(sop *scan, int offset) do { scan += OPND(s); s = *scan; - if (OP(s) != O_QUEST && OP(s) != O_CH && - OP(s) != OOR2) + if (OP(s) != (sop)O_QUEST && + OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) return (-1); - } while (OP(s) != O_QUEST && OP(s) != O_CH); + } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); /* * We must skip to the next position, or we'll * leave altoffset() too early. diff --git a/usr/src/lib/libc/port/regex/regex2.h b/usr/src/lib/libc/port/regex/regex2.h index 551611c610..44b20c3c24 100644 --- a/usr/src/lib/libc/port/regex/regex2.h +++ b/usr/src/lib/libc/port/regex/regex2.h @@ -186,4 +186,5 @@ struct re_guts { /* misc utilities */ #define OUT (CHAR_MIN - 1) /* a non-character value */ +#define IGN (CHAR_MIN - 2) #define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_') |
