diff options
author | Kyle Evans <kevans@FreeBSD.org> | 2019-09-19 13:28:47 +0300 |
---|---|---|
committer | Dan McDonald <danmcd@joyent.com> | 2019-10-22 11:09:13 -0400 |
commit | 62e7498000cb79c72fafb7270dbc3beb8a41863d (patch) | |
tree | fae4d56a53784eb331008a57d80d48e832d4e09c /usr/src/lib/libc/port/regex/engine.c | |
parent | 65f204200cf9a50fd6bad4093ee0b07bc35105ac (diff) | |
download | illumos-joyent-62e7498000cb79c72fafb7270dbc3beb8a41863d.tar.gz |
8993 regex: Refactor fast/slow stepping bits in the matching engine
Portions contributed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src/lib/libc/port/regex/engine.c')
-rw-r--r-- | usr/src/lib/libc/port/regex/engine.c | 213 |
1 files changed, 63 insertions, 150 deletions
diff --git a/usr/src/lib/libc/port/regex/engine.c b/usr/src/lib/libc/port/regex/engine.c index c66c08e59e..2e04a9085b 100644 --- a/usr/src/lib/libc/port/regex/engine.c +++ b/usr/src/lib/libc/port/regex/engine.c @@ -34,6 +34,8 @@ * SUCH DAMAGE. */ +#include <stdbool.h> + /* * The matching engine and friends. This file is #included by regexec.c * after suitable #defines of a variety of macros used herein, so that @@ -43,8 +45,7 @@ #ifdef SNAMES #define matcher smatcher -#define fast sfast -#define slow sslow +#define walk swalk #define dissect sdissect #define backref sbackref #define step sstep @@ -54,8 +55,7 @@ #endif #ifdef LNAMES #define matcher lmatcher -#define fast lfast -#define slow lslow +#define walk lwalk #define dissect ldissect #define backref lbackref #define step lstep @@ -65,8 +65,7 @@ #endif #ifdef MNAMES #define matcher mmatcher -#define fast mfast -#define slow mslow +#define walk mwalk #define dissect mdissect #define backref mbackref #define step mstep @@ -104,10 +103,8 @@ static const char *dissect(struct match *, const char *, const char *, sopno, sopno); static const char *backref(struct match *, const char *, const char *, sopno, sopno, sopno, int); -static const char *fast(struct match *, const char *, const char *, sopno, - sopno); -static const char *slow(struct match *, const char *, const char *, sopno, - sopno); +static const char *walk(struct match *m, const char *start, const char *stop, + sopno startst, sopno stopst, bool fast); static states step(struct re_guts *, sopno, sopno, states, wint_t, states); #define MAX_RECURSION 100 #define BOL (OUT-1) @@ -253,7 +250,7 @@ matcher(struct re_guts *g, const char *string, size_t nmatch, /* this loop does only one repetition except for backrefs */ for (;;) { - endp = fast(m, start, stop, gf, gl); + endp = walk(m, start, stop, gf, gl, true); if (endp == NULL) { /* a miss */ if (m->pmatch != NULL) free((char *)m->pmatch); @@ -269,7 +266,7 @@ matcher(struct re_guts *g, const char *string, size_t nmatch, assert(m->coldp != NULL); for (;;) { NOTE("finding start"); - endp = slow(m, m->coldp, stop, gf, gl); + endp = walk(m, m->coldp, stop, gf, gl, false); if (endp != NULL) break; assert(m->coldp < m->endp); @@ -314,7 +311,7 @@ matcher(struct re_guts *g, const char *string, size_t nmatch, if (dp != NULL || endp <= m->coldp) break; /* defeat */ NOTE("backoff"); - endp = slow(m, m->coldp, endp-1, gf, gl); + endp = walk(m, m->coldp, endp-1, gf, gl, false); if (endp == NULL) break; /* defeat */ /* try it on a shorter possibility */ @@ -395,7 +392,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, es += OPND(m->g->strip[es]); break; case OCH_: - while (OP(m->g->strip[es]) != O_CH) + while (OP(m->g->strip[es]) != (sop)O_CH) es += OPND(m->g->strip[es]); break; } @@ -427,10 +424,10 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, stp = stop; for (;;) { /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); + rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); + tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ @@ -440,12 +437,9 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, ssub = ss + 1; esub = es - 1; /* did innards match? */ - if (slow(m, sp, rest, ssub, esub) != NULL) { + if (walk(m, sp, rest, ssub, esub, false) != NULL) { dp = dissect(m, sp, rest, ssub, esub); assert(dp == rest); -#if defined(__lint) - (void) dp; -#endif } else /* no */ assert(sp == rest); sp = rest; @@ -454,10 +448,10 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, stp = stop; for (;;) { /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); + rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); + tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ @@ -469,7 +463,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, ssp = sp; oldssp = ssp; for (;;) { /* find last match of innards */ - sep = slow(m, ssp, rest, ssub, esub); + sep = walk(m, ssp, rest, ssub, esub, false); if (sep == NULL || sep == ssp) break; /* failed or matched null */ oldssp = ssp; /* on to next try */ @@ -481,7 +475,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, ssp = oldssp; } assert(sep == rest); /* must exhaust substring */ - assert(slow(m, ssp, sep, ssub, esub) == rest); + assert(walk(m, ssp, sep, ssub, esub, false) == rest); dp = dissect(m, ssp, sep, ssub, esub); assert(dp == sep); sp = rest; @@ -490,10 +484,10 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, stp = stop; for (;;) { /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); + rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); + tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ @@ -504,7 +498,8 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, esub = ss + OPND(m->g->strip[ss]) - 1; assert(OP(m->g->strip[esub]) == OOR1); for (;;) { /* find first matching branch */ - if (slow(m, sp, rest, ssub, esub) == rest) + if (walk(m, sp, rest, ssub, esub, + false) == rest) break; /* it matched all of it */ /* that one missed, try next one */ assert(OP(m->g->strip[esub]) == OOR1); @@ -512,7 +507,7 @@ dissect(struct match *m, const char *start, const char *stop, sopno startst, assert(OP(m->g->strip[esub]) == OOR2); ssub = esub + 1; esub += OPND(m->g->strip[esub]); - if (OP(m->g->strip[esub]) == OOR2) + if (OP(m->g->strip[esub]) == (sop)OOR2) esub--; else assert(OP(m->g->strip[esub]) == O_CH); @@ -637,7 +632,7 @@ backref(struct match *m, const char *start, const char *stop, sopno startst, do { assert(OP(s) == OOR2); ss += OPND(s); - } while (OP(s = m->g->strip[ss]) != O_CH); + } while (OP(s = m->g->strip[ss]) != (sop)O_CH); /* note that the ss++ gets us past the O_CH */ break; default: /* have to make a choice */ @@ -670,7 +665,7 @@ backref(struct match *m, const char *start, const char *stop, sopno startst, ssp = m->offp + m->pmatch[i].rm_so; if (memcmp(sp, ssp, len) != 0) return (NULL); - while (m->g->strip[ss] != SOP(O_BACK, i)) + while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) ss++; return (backref(m, sp+len, stop, ss+1, stopst, lev, rec)); case OQUEST_: /* to null or not */ @@ -701,13 +696,13 @@ backref(struct match *m, const char *start, const char *stop, sopno startst, if (dp != NULL) return (dp); /* that one missed, try next one */ - if (OP(m->g->strip[esub]) == O_CH) + if (OP(m->g->strip[esub]) == (sop)O_CH) return (NULL); /* there is none */ esub++; - assert(OP(m->g->strip[esub]) == OOR2); + assert(OP(m->g->strip[esub]) == (sop)OOR2); ssub = esub + 1; esub += OPND(m->g->strip[esub]); - if (OP(m->g->strip[esub]) == OOR2) + if (OP(m->g->strip[esub]) == (sop)OOR2) esub--; else assert(OP(m->g->strip[esub]) == O_CH); @@ -745,115 +740,15 @@ backref(struct match *m, const char *start, const char *stop, sopno startst, } /* - * fast - step through the string at top speed + * Step through the string either quickly or slowly. Returns where it ended + * or NULL. */ static const char * -fast(struct match *m, const char *start, const char *stop, sopno startst, - sopno stopst) +walk(struct match *m, const char *start, const char *stop, sopno startst, + sopno stopst, bool fast) { states st = m->st; states fresh = m->fresh; - states tmp = m->tmp; - const char *p = start; - wint_t c; - wint_t lastc; /* previous c */ - wint_t flagch; - int i; - const char *coldp; /* last p after which no match was underway */ - size_t clen; - - CLEAR(st); - SET1(st, startst); - SP("fast", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); - ASSIGN(fresh, st); - SP("start", st, *p); - coldp = NULL; - if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) - c = OUT; - else { - /* - * XXX Wrong if the previous character was multi-byte. - * Newline never is (in encodings supported by FreeBSD), - * so this only breaks the ISWORD tests below. - */ - c = (uch)*(start - 1); - } - for (;;) { - /* next character */ - lastc = c; - if (p == m->endp) { - clen = 0; - c = OUT; - } else - clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); - if (EQ(st, fresh)) - coldp = p; - - /* is there an EOL and/or BOL between lastc and c? */ - flagch = '\0'; - i = 0; - if ((lastc == '\n' && m->g->cflags®_NEWLINE) || - (lastc == OUT && !(m->eflags®_NOTBOL))) { - flagch = BOL; - i = m->g->nbol; - } - if ((c == '\n' && m->g->cflags®_NEWLINE) || - (c == OUT && !(m->eflags®_NOTEOL))) { - flagch = (flagch == BOL) ? BOLEOL : EOL; - i += m->g->neol; - } - if (i != 0) { - for (; i > 0; i--) - st = step(m->g, startst, stopst, st, - flagch, st); - SP("boleol", st, c); - } - - /* how about a word boundary? */ - if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c))) { - flagch = BOW; - } - if ((lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c)))) { - flagch = EOW; - } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("boweow", st, c); - } - - /* are we done? */ - if (ISSET(st, stopst) || p == stop || clen > stop - p) - break; /* NOTE BREAK OUT */ - - /* no, we must deal with this character */ - ASSIGN(tmp, st); - ASSIGN(st, fresh); - assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); - SP("aft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p += clen; - } - - assert(coldp != NULL); - m->coldp = coldp; - if (ISSET(st, stopst)) - return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); - else - return (NULL); -} - -/* - * slow - step through the string more deliberately - */ -static const char * -slow(struct match *m, const char *start, const char *stop, sopno startst, - sopno stopst) -{ - states st = m->st; states empty = m->empty; states tmp = m->tmp; const char *p = start; @@ -869,6 +764,8 @@ slow(struct match *m, const char *start, const char *stop, sopno startst, SET1(st, startst); SP("sstart", st, *p); st = step(m->g, startst, stopst, st, NOTHING, st); + if (fast) + ASSIGN(fresh, st); matchp = NULL; if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) c = OUT; @@ -889,6 +786,9 @@ slow(struct match *m, const char *start, const char *stop, sopno startst, } else clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); + if (fast && EQ(st, fresh)) + matchp = p; + /* is there an EOL and/or BOL between lastc and c? */ flagch = '\0'; i = 0; @@ -924,14 +824,21 @@ slow(struct match *m, const char *start, const char *stop, sopno startst, } /* are we done? */ - if (ISSET(st, stopst)) - matchp = p; - if (EQ(st, empty) || p == stop || clen > stop - p) + if (ISSET(st, stopst)) { + if (fast) + break; + else + matchp = p; + } + if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) break; /* NOTE BREAK OUT */ /* no, we must deal with this character */ ASSIGN(tmp, st); - ASSIGN(st, empty); + if (fast) + ASSIGN(st, fresh); + else + ASSIGN(st, empty); assert(c != OUT); st = step(m->g, startst, stopst, tmp, c, st); SP("saft", st, c); @@ -939,10 +846,17 @@ slow(struct match *m, const char *start, const char *stop, sopno startst, p += clen; } - return (matchp); + if (fast) { + assert(matchp != NULL); + m->coldp = matchp; + if (ISSET(st, stopst)) + return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); + else + return (NULL); + } else + return (matchp); } - /* * step - map set of states reachable before char to set reachable after */ @@ -1028,22 +942,22 @@ step(struct re_guts *g, break; case OCH_: /* mark the first two branches */ FWD(aft, aft, 1); - assert(OP(g->strip[pc+OPND(s)]) == OOR2); + assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); FWD(aft, aft, OPND(s)); break; case OOR1: /* done a branch, find the O_CH */ if (ISSTATEIN(aft, here)) { for (look = 1; - OP(s = g->strip[pc+look]) != O_CH; + OP(s = g->strip[pc+look]) != (sop)O_CH; look += OPND(s)) - assert(OP(s) == OOR2); + assert(OP(s) == (sop)OOR2); FWD(aft, aft, look + 1); } break; case OOR2: /* propagate OCH_'s marking */ FWD(aft, aft, 1); - if (OP(g->strip[pc+OPND(s)]) != O_CH) { - assert(OP(g->strip[pc+OPND(s)]) == OOR2); + if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { + assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); FWD(aft, aft, OPND(s)); } break; @@ -1124,8 +1038,7 @@ pchar(int ch) #endif #undef matcher -#undef fast -#undef slow +#undef walk #undef dissect #undef backref #undef step |