diff options
Diffstat (limited to 'usr/src/lib/libc/port/regex/regcomp.c')
-rw-r--r-- | usr/src/lib/libc/port/regex/regcomp.c | 341 |
1 files changed, 240 insertions, 101 deletions
diff --git a/usr/src/lib/libc/port/regex/regcomp.c b/usr/src/lib/libc/port/regex/regcomp.c index 3bd27129a0..8df8c337cd 100644 --- a/usr/src/lib/libc/port/regex/regcomp.c +++ b/usr/src/lib/libc/port/regex/regcomp.c @@ -41,8 +41,9 @@ #include <string.h> #include <ctype.h> #include <limits.h> -#include <stdlib.h> #include <regex.h> +#include <stdlib.h> +#include <stdbool.h> #include <wchar.h> #include <wctype.h> @@ -56,6 +57,24 @@ #include "../locale/mblocal.h" /* + * Branching context, used to keep track of branch state for all of the branch- + * aware functions. In addition to keeping track of branch positions for the + * p_branch_* functions, we use this to simplify some clumsiness in BREs for + * detection of whether ^ is acting as an anchor or being used erroneously and + * also for whether we're in a sub-expression or not. + */ +struct branchc { + sopno start; + sopno back; + sopno fwd; + + int nbranch; + int nchain; + bool outer; + bool terminate; +}; + +/* * parse structure, passed up and down to avoid global variables and * other clumsinesses */ @@ -71,6 +90,11 @@ struct parse { #define NPAREN 10 /* we need to remember () 1-9 for back refs */ sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ sopno pend[NPAREN]; /* -> ) ([0] unused) */ + bool allowbranch; /* can this expression branch? */ + bool bre; /* convenience; is this a BRE? */ + bool (*parse_expr)(struct parse *, struct branchc *); + void (*pre_parse)(struct parse *, struct branchc *); + void (*post_parse)(struct parse *, struct branchc *); }; /* ========= begin header generated by ./mkh ========= */ @@ -79,11 +103,17 @@ extern "C" { #endif /* === regcomp.c === */ -static void p_ere(struct parse *p, int stop); -static void p_ere_exp(struct parse *p); +static bool p_ere_exp(struct parse *p, struct branchc *bc); static void p_str(struct parse *p); -static void p_bre(struct parse *p, int end1, int end2); -static int p_simp_re(struct parse *p, int starordinary); +static int p_branch_eat_delim(struct parse *p, struct branchc *bc); +static void p_branch_ins_offset(struct parse *p, struct branchc *bc); +static void p_branch_fix_tail(struct parse *p, struct branchc *bc); +static bool p_branch_empty(struct parse *p, struct branchc *bc); +static bool p_branch_do(struct parse *p, struct branchc *bc); +static void p_bre_pre_parse(struct parse *p, struct branchc *bc); +static void p_bre_post_parse(struct parse *p, struct branchc *bc); +static void p_re(struct parse *p, int end1, int end2); +static bool p_simp_re(struct parse *p, struct branchc *bc); static int p_count(struct parse *p); static void p_bracket(struct parse *p); static void p_b_term(struct parse *p, cset *cs); @@ -133,6 +163,7 @@ static char nuls[10]; /* place to point scanner in event of error */ #define MORE2() (p->next+1 < p->end) #define SEE(c) (MORE() && PEEK() == (c)) #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) +#define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) #define NEXT() (p->next++) @@ -229,6 +260,19 @@ regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern, p->pbegin[i] = 0; p->pend[i] = 0; } + if (cflags & REG_EXTENDED) { + p->allowbranch = true; + p->bre = false; + p->parse_expr = p_ere_exp; + p->pre_parse = NULL; + p->post_parse = NULL; + } else { + p->allowbranch = false; + p->bre = true; + p->parse_expr = p_simp_re; + p->pre_parse = p_bre_pre_parse; + p->post_parse = p_bre_post_parse; + } g->sets = NULL; g->ncsets = 0; g->cflags = cflags; @@ -246,12 +290,10 @@ regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern, /* do it */ EMIT(OEND, 0); g->firststate = THERE(); - if (cflags®_EXTENDED) - p_ere(p, OUT); - else if (cflags®_NOSPEC) + if (cflags & REG_NOSPEC) p_str(p); else - p_bre(p, OUT, OUT); + p_re(p, OUT, OUT); EMIT(OEND, 0); g->laststate = THERE(); @@ -288,55 +330,11 @@ regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern, } /* - * p_ere - ERE parser top level, concatenation and alternation - */ -static void -p_ere(struct parse *p, - int stop) /* character this ERE should end at */ -{ - char c; - sopno prevback; - sopno prevfwd; - sopno conc; - int first = 1; /* is this the first alternative? */ - - for (;;) { - /* do a bunch of concatenated expressions */ - conc = HERE(); - while (MORE() && (c = PEEK()) != '|' && c != stop) - p_ere_exp(p); - /* require nonempty */ - (void) REQUIRE(HERE() != conc, REG_BADPAT); - - if (!EAT('|')) - break; /* NOTE BREAK OUT */ - - if (first) { - INSERT(OCH_, conc); /* offset is wrong */ - prevfwd = conc; - prevback = conc; - first = 0; - } - ASTERN(OOR1, prevback); - prevback = THERE(); - AHEAD(prevfwd); /* fix previous offset */ - prevfwd = HERE(); - EMIT(OOR2, 0); /* offset is very wrong */ - } - - if (!first) { /* tail-end fixups */ - AHEAD(prevfwd); - ASTERN(O_CH, prevback); - } - - assert(!MORE() || SEE(stop)); -} - -/* - * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op + * Parse one subERE, an atom possibly followed by a repetition op, + * return whether we should terminate or not. */ -static void -p_ere_exp(struct parse *p) +static bool +p_ere_exp(struct parse *p, struct branchc *bc) { char c; wint_t wc; @@ -346,6 +344,7 @@ p_ere_exp(struct parse *p) sopno subno; int wascaret = 0; + (void) bc; assert(MORE()); /* caller should have ensured this */ c = GETNEXT(); @@ -359,7 +358,7 @@ p_ere_exp(struct parse *p) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); if (!SEE(')')) - p_ere(p, ')'); + p_re(p, ')', IGN); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); @@ -425,7 +424,7 @@ p_ere_exp(struct parse *p) break; default: if (p->error != 0) - return; + return (false); p->next--; wc = WGETNEXT(); ordinary(p, wc); @@ -433,11 +432,11 @@ p_ere_exp(struct parse *p) } if (!MORE()) - return; + return (false); c = PEEK(); /* we call { a repetition if followed by a digit */ if (!(c == '*' || c == '+' || c == '?' || c == '{')) - return; /* no repetition, we're done */ + return (false); /* no repetition, we're done */ else if (c == '{') (void) REQUIRE(MORE2() && \ (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); @@ -486,12 +485,13 @@ p_ere_exp(struct parse *p) } if (!MORE()) - return; + return (false); c = PEEK(); if (!(c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())))) - return; + return (false); SETERROR(REG_BADRPT); + return (false); } /* @@ -506,47 +506,181 @@ p_str(struct parse *p) } /* - * p_bre - BRE parser top level, anchoring and concatenation - * Giving end1 as OUT essentially eliminates the end1/end2 check. - * - * This implementation is a bit of a kludge, in that a trailing $ is first - * taken as an ordinary character and then revised to be an anchor. - * The amount of lookahead needed to avoid this kludge is excessive. + * Eat consecutive branch delimiters for the kind of expression that we are + * parsing, return the number of delimiters that we ate. + */ +static int +p_branch_eat_delim(struct parse *p, struct branchc *bc) +{ + int nskip; + + (void) bc; + nskip = 0; + while (EAT('|')) + ++nskip; + return (nskip); +} + +/* + * Insert necessary branch book-keeping operations. This emits a + * bogus 'next' offset, since we still have more to parse */ static void -p_bre(struct parse *p, - int end1, /* first terminating character */ - int end2) /* second terminating character */ +p_branch_ins_offset(struct parse *p, struct branchc *bc) +{ + if (bc->nbranch == 0) { + INSERT(OCH_, bc->start); /* offset is wrong */ + bc->fwd = bc->start; + bc->back = bc->start; + } + + ASTERN(OOR1, bc->back); + bc->back = THERE(); + AHEAD(bc->fwd); /* fix previous offset */ + bc->fwd = HERE(); + EMIT(OOR2, 0); /* offset is very wrong */ + ++bc->nbranch; +} + +/* + * Fix the offset of the tail branch, if we actually had any branches. + * This is to correct the bogus placeholder offset that we use. + */ +static void +p_branch_fix_tail(struct parse *p, struct branchc *bc) +{ + /* Fix bogus offset at the tail if we actually have branches */ + if (bc->nbranch > 0) { + AHEAD(bc->fwd); + ASTERN(O_CH, bc->back); + } +} + +/* + * Signal to the parser that an empty branch has been encountered; this will, + * in the future, be used to allow for more permissive behavior with empty + * branches. The return value should indicate whether parsing may continue + * or not. + */ +static bool +p_branch_empty(struct parse *p, struct branchc *bc) +{ + (void) bc; + SETERROR(REG_BADPAT); + return (false); +} + +/* + * Take care of any branching requirements. This includes inserting the + * appropriate branching instructions as well as eating all of the branch + * delimiters until we either run out of pattern or need to parse more pattern. + */ +static bool +p_branch_do(struct parse *p, struct branchc *bc) { - sopno start = HERE(); - int first = 1; /* first subexpression? */ - int wasdollar = 0; + int ate = 0; + + ate = p_branch_eat_delim(p, bc); + if (ate == 0) + return (false); + else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) + /* + * Halt parsing only if we have an empty branch and + * p_branch_empty indicates that we must not continue. + * In the future, this will not necessarily be an error. + */ + return (false); + p_branch_ins_offset(p, bc); + + return (true); +} +static void +p_bre_pre_parse(struct parse *p, struct branchc *bc) +{ + (void) bc; + /* + * Does not move cleanly into expression parser because of + * ordinary interpration of * at the beginning position of + * an expression. + */ if (EAT('^')) { EMIT(OBOL, 0); p->g->iflags |= USEBOL; p->g->nbol++; } - while (MORE() && !SEETWO(end1, end2)) { - wasdollar = p_simp_re(p, first); - first = 0; - } - if (wasdollar) { /* oops, that was a trailing anchor */ +} + +static void +p_bre_post_parse(struct parse *p, struct branchc *bc) +{ + /* Expression is terminating due to EOL token */ + if (bc->terminate) { DROP(1); EMIT(OEOL, 0); p->g->iflags |= USEEOL; p->g->neol++; } +} + +/* + * Top level parser, concatenation and BRE anchoring. + * Giving end1 as OUT essentially eliminates the end1/end2 check. + * + * This implementation is a bit of a kludge, in that a trailing $ is first + * taken as an ordinary character and then revised to be an anchor. + * The amount of lookahead needed to avoid this kludge is excessive. + */ +static void +p_re(struct parse *p, + int end1, /* first terminating character */ + int end2) /* second terminating character; ignored for EREs */ +{ + struct branchc bc; - (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */ + bc.nbranch = 0; + if (end1 == OUT && end2 == OUT) + bc.outer = true; + else + bc.outer = false; +#define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) + for (;;) { + bc.start = HERE(); + bc.nchain = 0; + bc.terminate = false; + if (p->pre_parse != NULL) + p->pre_parse(p, &bc); + while (MORE() && (!p->allowbranch || !SEESPEC('|')) && + !SEEEND()) { + bc.terminate = p->parse_expr(p, &bc); + ++bc.nchain; + } + if (p->post_parse != NULL) + p->post_parse(p, &bc); + (void) REQUIRE(HERE() != bc.start, REG_BADPAT); + if (!p->allowbranch) + break; + /* + * p_branch_do's return value indicates whether we should + * continue parsing or not. This is both for correctness and + * a slight optimization, because it will check if we've + * encountered an empty branch or the end of the string + * immediately following a branch delimiter. + */ + if (!p_branch_do(p, &bc)) + break; + } +#undef SEE_END + if (p->allowbranch) + p_branch_fix_tail(p, &bc); + assert(!MORE() || SEE(end1)); } /* * p_simp_re - parse a simple RE, an atom possibly followed by a repetition */ -static int /* was the simple RE an unbackslashed $? */ -p_simp_re(struct parse *p, - int starordinary) /* is a leading * an ordinary character? */ +static bool /* was the simple RE an unbackslashed $? */ +p_simp_re(struct parse *p, struct branchc *bc) { int c; int count; @@ -592,7 +726,7 @@ p_simp_re(struct parse *p, EMIT(OLPAREN, subno); /* the MORE here is an error heuristic */ if (MORE() && !SEETWO('\\', ')')) - p_bre(p, '\\', ')'); + p_re(p, '\\', ')'); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); @@ -627,11 +761,16 @@ p_simp_re(struct parse *p, p->g->backrefs = 1; break; case '*': - (void) REQUIRE(starordinary, REG_BADRPT); + /* + * Ordinary if used as the first character beyond BOL anchor of + * a (sub-)expression, counts as a bad repetition operator if it + * appears otherwise. + */ + (void) REQUIRE(bc->nchain == 0, REG_BADRPT); /* FALLTHROUGH */ default: if (p->error != 0) - return (0); /* Definitely not $... */ + return (false); /* Definitely not $... */ p->next--; wc = WGETNEXT(); ordinary(p, wc); @@ -662,9 +801,9 @@ p_simp_re(struct parse *p, SETERROR(REG_BADBR); } } else if (c == '$') /* $ (but not \$) ends it */ - return (1); + return (true); - return (0); + return (false); } /* @@ -893,7 +1032,7 @@ p_b_coll_elem(struct parse *p, } len = p->next - sp; for (cp = cnames; cp->name != NULL; cp++) - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') + if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) return (cp->code); /* known name */ (void) memset(&mbs, 0, sizeof (mbs)); if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) @@ -1436,12 +1575,12 @@ findmust(struct parse *p, struct re_guts *g) scan += OPND(s); s = *scan; /* assert() interferes w debug printouts */ - if (OP(s) != O_QUEST && OP(s) != O_CH && - OP(s) != OOR2) { + if (OP(s) != (sop)O_QUEST && + OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) { g->iflags |= BAD; return; } - } while (OP(s) != O_QUEST && OP(s) != O_CH); + } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); /* FALLTHROUGH */ case OBOW: /* things that break a sequence */ case OEOW: @@ -1450,7 +1589,7 @@ findmust(struct parse *p, struct re_guts *g) case O_QUEST: case O_CH: case OEND: - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { @@ -1465,7 +1604,7 @@ findmust(struct parse *p, struct re_guts *g) newlen = 0; break; case OANY: - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { @@ -1483,7 +1622,7 @@ findmust(struct parse *p, struct re_guts *g) break; case OANYOF: /* may or may not invalidate offset */ /* First, everything as OANY */ - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { @@ -1507,7 +1646,7 @@ findmust(struct parse *p, struct re_guts *g) * save the last known good offset, in case the * must sequence doesn't occur later. */ - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) @@ -1567,7 +1706,7 @@ altoffset(sop *scan, int offset) largest = 0; try = 0; s = *scan++; - while (OP(s) != O_QUEST && OP(s) != O_CH) { + while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) { switch (OP(s)) { case OOR1: if (try > largest) @@ -1583,10 +1722,10 @@ altoffset(sop *scan, int offset) do { scan += OPND(s); s = *scan; - if (OP(s) != O_QUEST && OP(s) != O_CH && - OP(s) != OOR2) + if (OP(s) != (sop)O_QUEST && + OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) return (-1); - } while (OP(s) != O_QUEST && OP(s) != O_CH); + } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); /* * We must skip to the next position, or we'll * leave altoffset() too early. |