diff options
author | Tianon Gravi <admwiggin@gmail.com> | 2015-01-15 11:54:00 -0700 |
---|---|---|
committer | Tianon Gravi <admwiggin@gmail.com> | 2015-01-15 11:54:00 -0700 |
commit | f154da9e12608589e8d5f0508f908a0c3e88a1bb (patch) | |
tree | f8255d51e10c6f1e0ed69702200b966c9556a431 /src/pkg/regexp/syntax/parse.go | |
parent | 8d8329ed5dfb9622c82a9fbec6fd99a580f9c9f6 (diff) | |
download | golang-upstream/1.4.tar.gz |
Imported Upstream version 1.4upstream/1.4
Diffstat (limited to 'src/pkg/regexp/syntax/parse.go')
-rw-r--r-- | src/pkg/regexp/syntax/parse.go | 1863 |
1 files changed, 0 insertions, 1863 deletions
diff --git a/src/pkg/regexp/syntax/parse.go b/src/pkg/regexp/syntax/parse.go deleted file mode 100644 index cb25dca39..000000000 --- a/src/pkg/regexp/syntax/parse.go +++ /dev/null @@ -1,1863 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package syntax - -import ( - "sort" - "strings" - "unicode" - "unicode/utf8" -) - -// An Error describes a failure to parse a regular expression -// and gives the offending expression. -type Error struct { - Code ErrorCode - Expr string -} - -func (e *Error) Error() string { - return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" -} - -// An ErrorCode describes a failure to parse a regular expression. -type ErrorCode string - -const ( - // Unexpected error - ErrInternalError ErrorCode = "regexp/syntax: internal error" - - // Parse errors - ErrInvalidCharClass ErrorCode = "invalid character class" - ErrInvalidCharRange ErrorCode = "invalid character class range" - ErrInvalidEscape ErrorCode = "invalid escape sequence" - ErrInvalidNamedCapture ErrorCode = "invalid named capture" - ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" - ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" - ErrInvalidRepeatSize ErrorCode = "invalid repeat count" - ErrInvalidUTF8 ErrorCode = "invalid UTF-8" - ErrMissingBracket ErrorCode = "missing closing ]" - ErrMissingParen ErrorCode = "missing closing )" - ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" - ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" - ErrUnexpectedParen ErrorCode = "unexpected )" -) - -func (e ErrorCode) String() string { - return string(e) -} - -// Flags control the behavior of the parser and record information about regexp context. -type Flags uint16 - -const ( - FoldCase Flags = 1 << iota // case-insensitive match - Literal // treat pattern as literal string - ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline - DotNL // allow . to match newline - OneLine // treat ^ and $ as only matching at beginning and end of text - NonGreedy // make repetition operators default to non-greedy - PerlX // allow Perl extensions - UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation - WasDollar // regexp OpEndText was $, not \z - Simple // regexp contains no counted repetition - - MatchNL = ClassNL | DotNL - - Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible - POSIX Flags = 0 // POSIX syntax -) - -// Pseudo-ops for parsing stack. -const ( - opLeftParen = opPseudo + iota - opVerticalBar -) - -type parser struct { - flags Flags // parse mode flags - stack []*Regexp // stack of parsed expressions - free *Regexp - numCap int // number of capturing groups seen - wholeRegexp string - tmpClass []rune // temporary char class work space -} - -func (p *parser) newRegexp(op Op) *Regexp { - re := p.free - if re != nil { - p.free = re.Sub0[0] - *re = Regexp{} - } else { - re = new(Regexp) - } - re.Op = op - return re -} - -func (p *parser) reuse(re *Regexp) { - re.Sub0[0] = p.free - p.free = re -} - -// Parse stack manipulation. - -// push pushes the regexp re onto the parse stack and returns the regexp. -func (p *parser) push(re *Regexp) *Regexp { - if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { - // Single rune. - if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { - return nil - } - re.Op = OpLiteral - re.Rune = re.Rune[:1] - re.Flags = p.flags &^ FoldCase - } else if re.Op == OpCharClass && len(re.Rune) == 4 && - re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && - unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && - unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || - re.Op == OpCharClass && len(re.Rune) == 2 && - re.Rune[0]+1 == re.Rune[1] && - unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && - unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { - // Case-insensitive rune like [Aa] or [Δδ]. - if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { - return nil - } - - // Rewrite as (case-insensitive) literal. - re.Op = OpLiteral - re.Rune = re.Rune[:1] - re.Flags = p.flags | FoldCase - } else { - // Incremental concatenation. - p.maybeConcat(-1, 0) - } - - p.stack = append(p.stack, re) - return re -} - -// maybeConcat implements incremental concatenation -// of literal runes into string nodes. The parser calls this -// before each push, so only the top fragment of the stack -// might need processing. Since this is called before a push, -// the topmost literal is no longer subject to operators like * -// (Otherwise ab* would turn into (ab)*.) -// If r >= 0 and there's a node left over, maybeConcat uses it -// to push r with the given flags. -// maybeConcat reports whether r was pushed. -func (p *parser) maybeConcat(r rune, flags Flags) bool { - n := len(p.stack) - if n < 2 { - return false - } - - re1 := p.stack[n-1] - re2 := p.stack[n-2] - if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { - return false - } - - // Push re1 into re2. - re2.Rune = append(re2.Rune, re1.Rune...) - - // Reuse re1 if possible. - if r >= 0 { - re1.Rune = re1.Rune0[:1] - re1.Rune[0] = r - re1.Flags = flags - return true - } - - p.stack = p.stack[:n-1] - p.reuse(re1) - return false // did not push r -} - -// newLiteral returns a new OpLiteral Regexp with the given flags -func (p *parser) newLiteral(r rune, flags Flags) *Regexp { - re := p.newRegexp(OpLiteral) - re.Flags = flags - if flags&FoldCase != 0 { - r = minFoldRune(r) - } - re.Rune0[0] = r - re.Rune = re.Rune0[:1] - return re -} - -// minFoldRune returns the minimum rune fold-equivalent to r. -func minFoldRune(r rune) rune { - if r < minFold || r > maxFold { - return r - } - min := r - r0 := r - for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { - if min > r { - min = r - } - } - return min -} - -// literal pushes a literal regexp for the rune r on the stack -// and returns that regexp. -func (p *parser) literal(r rune) { - p.push(p.newLiteral(r, p.flags)) -} - -// op pushes a regexp with the given op onto the stack -// and returns that regexp. -func (p *parser) op(op Op) *Regexp { - re := p.newRegexp(op) - re.Flags = p.flags - return p.push(re) -} - -// repeat replaces the top stack element with itself repeated according to op, min, max. -// before is the regexp suffix starting at the repetition operator. -// after is the regexp suffix following after the repetition operator. -// repeat returns an updated 'after' and an error, if any. -func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) { - flags := p.flags - if p.flags&PerlX != 0 { - if len(after) > 0 && after[0] == '?' { - after = after[1:] - flags ^= NonGreedy - } - if lastRepeat != "" { - // In Perl it is not allowed to stack repetition operators: - // a** is a syntax error, not a doubled star, and a++ means - // something else entirely, which we don't support! - return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]} - } - } - n := len(p.stack) - if n == 0 { - return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} - } - sub := p.stack[n-1] - if sub.Op >= opPseudo { - return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} - } - re := p.newRegexp(op) - re.Min = min - re.Max = max - re.Flags = flags - re.Sub = re.Sub0[:1] - re.Sub[0] = sub - p.stack[n-1] = re - return after, nil -} - -// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. -func (p *parser) concat() *Regexp { - p.maybeConcat(-1, 0) - - // Scan down to find pseudo-operator | or (. - i := len(p.stack) - for i > 0 && p.stack[i-1].Op < opPseudo { - i-- - } - subs := p.stack[i:] - p.stack = p.stack[:i] - - // Empty concatenation is special case. - if len(subs) == 0 { - return p.push(p.newRegexp(OpEmptyMatch)) - } - - return p.push(p.collapse(subs, OpConcat)) -} - -// alternate replaces the top of the stack (above the topmost '(') with its alternation. -func (p *parser) alternate() *Regexp { - // Scan down to find pseudo-operator (. - // There are no | above (. - i := len(p.stack) - for i > 0 && p.stack[i-1].Op < opPseudo { - i-- - } - subs := p.stack[i:] - p.stack = p.stack[:i] - - // Make sure top class is clean. - // All the others already are (see swapVerticalBar). - if len(subs) > 0 { - cleanAlt(subs[len(subs)-1]) - } - - // Empty alternate is special case - // (shouldn't happen but easy to handle). - if len(subs) == 0 { - return p.push(p.newRegexp(OpNoMatch)) - } - - return p.push(p.collapse(subs, OpAlternate)) -} - -// cleanAlt cleans re for eventual inclusion in an alternation. -func cleanAlt(re *Regexp) { - switch re.Op { - case OpCharClass: - re.Rune = cleanClass(&re.Rune) - if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { - re.Rune = nil - re.Op = OpAnyChar - return - } - if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { - re.Rune = nil - re.Op = OpAnyCharNotNL - return - } - if cap(re.Rune)-len(re.Rune) > 100 { - // re.Rune will not grow any more. - // Make a copy or inline to reclaim storage. - re.Rune = append(re.Rune0[:0], re.Rune...) - } - } -} - -// collapse returns the result of applying op to sub. -// If sub contains op nodes, they all get hoisted up -// so that there is never a concat of a concat or an -// alternate of an alternate. -func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { - if len(subs) == 1 { - return subs[0] - } - re := p.newRegexp(op) - re.Sub = re.Sub0[:0] - for _, sub := range subs { - if sub.Op == op { - re.Sub = append(re.Sub, sub.Sub...) - p.reuse(sub) - } else { - re.Sub = append(re.Sub, sub) - } - } - if op == OpAlternate { - re.Sub = p.factor(re.Sub, re.Flags) - if len(re.Sub) == 1 { - old := re - re = re.Sub[0] - p.reuse(old) - } - } - return re -} - -// factor factors common prefixes from the alternation list sub. -// It returns a replacement list that reuses the same storage and -// frees (passes to p.reuse) any removed *Regexps. -// -// For example, -// ABC|ABD|AEF|BCX|BCY -// simplifies by literal prefix extraction to -// A(B(C|D)|EF)|BC(X|Y) -// which simplifies by character class introduction to -// A(B[CD]|EF)|BC[XY] -// -func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { - if len(sub) < 2 { - return sub - } - - // Round 1: Factor out common literal prefixes. - var str []rune - var strflags Flags - start := 0 - out := sub[:0] - for i := 0; i <= len(sub); i++ { - // Invariant: the Regexps that were in sub[0:start] have been - // used or marked for reuse, and the slice space has been reused - // for out (len(out) <= start). - // - // Invariant: sub[start:i] consists of regexps that all begin - // with str as modified by strflags. - var istr []rune - var iflags Flags - if i < len(sub) { - istr, iflags = p.leadingString(sub[i]) - if iflags == strflags { - same := 0 - for same < len(str) && same < len(istr) && str[same] == istr[same] { - same++ - } - if same > 0 { - // Matches at least one rune in current range. - // Keep going around. - str = str[:same] - continue - } - } - } - - // Found end of a run with common leading literal string: - // sub[start:i] all begin with str[0:len(str)], but sub[i] - // does not even begin with str[0]. - // - // Factor out common string and append factored expression to out. - if i == start { - // Nothing to do - run of length 0. - } else if i == start+1 { - // Just one: don't bother factoring. - out = append(out, sub[start]) - } else { - // Construct factored form: prefix(suffix1|suffix2|...) - prefix := p.newRegexp(OpLiteral) - prefix.Flags = strflags - prefix.Rune = append(prefix.Rune[:0], str...) - - for j := start; j < i; j++ { - sub[j] = p.removeLeadingString(sub[j], len(str)) - } - suffix := p.collapse(sub[start:i], OpAlternate) // recurse - - re := p.newRegexp(OpConcat) - re.Sub = append(re.Sub[:0], prefix, suffix) - out = append(out, re) - } - - // Prepare for next iteration. - start = i - str = istr - strflags = iflags - } - sub = out - - // Round 2: Factor out common complex prefixes, - // just the first piece of each concatenation, - // whatever it is. This is good enough a lot of the time. - start = 0 - out = sub[:0] - var first *Regexp - for i := 0; i <= len(sub); i++ { - // Invariant: the Regexps that were in sub[0:start] have been - // used or marked for reuse, and the slice space has been reused - // for out (len(out) <= start). - // - // Invariant: sub[start:i] consists of regexps that all begin with ifirst. - var ifirst *Regexp - if i < len(sub) { - ifirst = p.leadingRegexp(sub[i]) - if first != nil && first.Equal(ifirst) { - continue - } - } - - // Found end of a run with common leading regexp: - // sub[start:i] all begin with first but sub[i] does not. - // - // Factor out common regexp and append factored expression to out. - if i == start { - // Nothing to do - run of length 0. - } else if i == start+1 { - // Just one: don't bother factoring. - out = append(out, sub[start]) - } else { - // Construct factored form: prefix(suffix1|suffix2|...) - prefix := first - for j := start; j < i; j++ { - reuse := j != start // prefix came from sub[start] - sub[j] = p.removeLeadingRegexp(sub[j], reuse) - } - suffix := p.collapse(sub[start:i], OpAlternate) // recurse - - re := p.newRegexp(OpConcat) - re.Sub = append(re.Sub[:0], prefix, suffix) - out = append(out, re) - } - - // Prepare for next iteration. - start = i - first = ifirst - } - sub = out - - // Round 3: Collapse runs of single literals into character classes. - start = 0 - out = sub[:0] - for i := 0; i <= len(sub); i++ { - // Invariant: the Regexps that were in sub[0:start] have been - // used or marked for reuse, and the slice space has been reused - // for out (len(out) <= start). - // - // Invariant: sub[start:i] consists of regexps that are either - // literal runes or character classes. - if i < len(sub) && isCharClass(sub[i]) { - continue - } - - // sub[i] is not a char or char class; - // emit char class for sub[start:i]... - if i == start { - // Nothing to do - run of length 0. - } else if i == start+1 { - out = append(out, sub[start]) - } else { - // Make new char class. - // Start with most complex regexp in sub[start]. - max := start - for j := start + 1; j < i; j++ { - if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { - max = j - } - } - sub[start], sub[max] = sub[max], sub[start] - - for j := start + 1; j < i; j++ { - mergeCharClass(sub[start], sub[j]) - p.reuse(sub[j]) - } - cleanAlt(sub[start]) - out = append(out, sub[start]) - } - - // ... and then emit sub[i]. - if i < len(sub) { - out = append(out, sub[i]) - } - start = i + 1 - } - sub = out - - // Round 4: Collapse runs of empty matches into a single empty match. - start = 0 - out = sub[:0] - for i := range sub { - if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { - continue - } - out = append(out, sub[i]) - } - sub = out - - return sub -} - -// leadingString returns the leading literal string that re begins with. -// The string refers to storage in re or its children. -func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { - if re.Op == OpConcat && len(re.Sub) > 0 { - re = re.Sub[0] - } - if re.Op != OpLiteral { - return nil, 0 - } - return re.Rune, re.Flags & FoldCase -} - -// removeLeadingString removes the first n leading runes -// from the beginning of re. It returns the replacement for re. -func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { - if re.Op == OpConcat && len(re.Sub) > 0 { - // Removing a leading string in a concatenation - // might simplify the concatenation. - sub := re.Sub[0] - sub = p.removeLeadingString(sub, n) - re.Sub[0] = sub - if sub.Op == OpEmptyMatch { - p.reuse(sub) - switch len(re.Sub) { - case 0, 1: - // Impossible but handle. - re.Op = OpEmptyMatch - re.Sub = nil - case 2: - old := re - re = re.Sub[1] - p.reuse(old) - default: - copy(re.Sub, re.Sub[1:]) - re.Sub = re.Sub[:len(re.Sub)-1] - } - } - return re - } - - if re.Op == OpLiteral { - re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] - if len(re.Rune) == 0 { - re.Op = OpEmptyMatch - } - } - return re -} - -// leadingRegexp returns the leading regexp that re begins with. -// The regexp refers to storage in re or its children. -func (p *parser) leadingRegexp(re *Regexp) *Regexp { - if re.Op == OpEmptyMatch { - return nil - } - if re.Op == OpConcat && len(re.Sub) > 0 { - sub := re.Sub[0] - if sub.Op == OpEmptyMatch { - return nil - } - return sub - } - return re -} - -// removeLeadingRegexp removes the leading regexp in re. -// It returns the replacement for re. -// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. -func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { - if re.Op == OpConcat && len(re.Sub) > 0 { - if reuse { - p.reuse(re.Sub[0]) - } - re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] - switch len(re.Sub) { - case 0: - re.Op = OpEmptyMatch - re.Sub = nil - case 1: - old := re - re = re.Sub[0] - p.reuse(old) - } - return re - } - if reuse { - p.reuse(re) - } - return p.newRegexp(OpEmptyMatch) -} - -func literalRegexp(s string, flags Flags) *Regexp { - re := &Regexp{Op: OpLiteral} - re.Flags = flags - re.Rune = re.Rune0[:0] // use local storage for small strings - for _, c := range s { - if len(re.Rune) >= cap(re.Rune) { - // string is too long to fit in Rune0. let Go handle it - re.Rune = []rune(s) - break - } - re.Rune = append(re.Rune, c) - } - return re -} - -// Parsing. - -// Parse parses a regular expression string s, controlled by the specified -// Flags, and returns a regular expression parse tree. The syntax is -// described in the top-level comment. -func Parse(s string, flags Flags) (*Regexp, error) { - if flags&Literal != 0 { - // Trivial parser for literal string. - if err := checkUTF8(s); err != nil { - return nil, err - } - return literalRegexp(s, flags), nil - } - - // Otherwise, must do real work. - var ( - p parser - err error - c rune - op Op - lastRepeat string - ) - p.flags = flags - p.wholeRegexp = s - t := s - for t != "" { - repeat := "" - BigSwitch: - switch t[0] { - default: - if c, t, err = nextRune(t); err != nil { - return nil, err - } - p.literal(c) - - case '(': - if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { - // Flag changes and non-capturing groups. - if t, err = p.parsePerlFlags(t); err != nil { - return nil, err - } - break - } - p.numCap++ - p.op(opLeftParen).Cap = p.numCap - t = t[1:] - case '|': - if err = p.parseVerticalBar(); err != nil { - return nil, err - } - t = t[1:] - case ')': - if err = p.parseRightParen(); err != nil { - return nil, err - } - t = t[1:] - case '^': - if p.flags&OneLine != 0 { - p.op(OpBeginText) - } else { - p.op(OpBeginLine) - } - t = t[1:] - case '$': - if p.flags&OneLine != 0 { - p.op(OpEndText).Flags |= WasDollar - } else { - p.op(OpEndLine) - } - t = t[1:] - case '.': - if p.flags&DotNL != 0 { - p.op(OpAnyChar) - } else { - p.op(OpAnyCharNotNL) - } - t = t[1:] - case '[': - if t, err = p.parseClass(t); err != nil { - return nil, err - } - case '*', '+', '?': - before := t - switch t[0] { - case '*': - op = OpStar - case '+': - op = OpPlus - case '?': - op = OpQuest - } - after := t[1:] - if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil { - return nil, err - } - repeat = before - t = after - case '{': - op = OpRepeat - before := t - min, max, after, ok := p.parseRepeat(t) - if !ok { - // If the repeat cannot be parsed, { is a literal. - p.literal('{') - t = t[1:] - break - } - if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max { - // Numbers were too big, or max is present and min > max. - return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} - } - if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { - return nil, err - } - repeat = before - t = after - case '\\': - if p.flags&PerlX != 0 && len(t) >= 2 { - switch t[1] { - case 'A': - p.op(OpBeginText) - t = t[2:] - break BigSwitch - case 'b': - p.op(OpWordBoundary) - t = t[2:] - break BigSwitch - case 'B': - p.op(OpNoWordBoundary) - t = t[2:] - break BigSwitch - case 'C': - // any byte; not supported - return nil, &Error{ErrInvalidEscape, t[:2]} - case 'Q': - // \Q ... \E: the ... is always literals - var lit string - if i := strings.Index(t, `\E`); i < 0 { - lit = t[2:] - t = "" - } else { - lit = t[2:i] - t = t[i+2:] - } - p.push(literalRegexp(lit, p.flags)) - break BigSwitch - case 'z': - p.op(OpEndText) - t = t[2:] - break BigSwitch - } - } - - re := p.newRegexp(OpCharClass) - re.Flags = p.flags - - // Look for Unicode character group like \p{Han} - if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { - r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) - if err != nil { - return nil, err - } - if r != nil { - re.Rune = r - t = rest - p.push(re) - break BigSwitch - } - } - - // Perl character class escape. - if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { - re.Rune = r - t = rest - p.push(re) - break BigSwitch - } - p.reuse(re) - - // Ordinary single-character escape. - if c, t, err = p.parseEscape(t); err != nil { - return nil, err - } - p.literal(c) - } - lastRepeat = repeat - } - - p.concat() - if p.swapVerticalBar() { - // pop vertical bar - p.stack = p.stack[:len(p.stack)-1] - } - p.alternate() - - n := len(p.stack) - if n != 1 { - return nil, &Error{ErrMissingParen, s} - } - return p.stack[0], nil -} - -// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. -// If s is not of that form, it returns ok == false. -// If s has the right form but the values are too big, it returns min == -1, ok == true. -func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { - if s == "" || s[0] != '{' { - return - } - s = s[1:] - var ok1 bool - if min, s, ok1 = p.parseInt(s); !ok1 { - return - } - if s == "" { - return - } - if s[0] != ',' { - max = min - } else { - s = s[1:] - if s == "" { - return - } - if s[0] == '}' { - max = -1 - } else if max, s, ok1 = p.parseInt(s); !ok1 { - return - } else if max < 0 { - // parseInt found too big a number - min = -1 - } - } - if s == "" || s[0] != '}' { - return - } - rest = s[1:] - ok = true - return -} - -// parsePerlFlags parses a Perl flag setting or non-capturing group or both, -// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. -// The caller must have ensured that s begins with "(?". -func (p *parser) parsePerlFlags(s string) (rest string, err error) { - t := s - - // Check for named captures, first introduced in Python's regexp library. - // As usual, there are three slightly different syntaxes: - // - // (?P<name>expr) the original, introduced by Python - // (?<name>expr) the .NET alteration, adopted by Perl 5.10 - // (?'name'expr) another .NET alteration, adopted by Perl 5.10 - // - // Perl 5.10 gave in and implemented the Python version too, - // but they claim that the last two are the preferred forms. - // PCRE and languages based on it (specifically, PHP and Ruby) - // support all three as well. EcmaScript 4 uses only the Python form. - // - // In both the open source world (via Code Search) and the - // Google source tree, (?P<expr>name) is the dominant form, - // so that's the one we implement. One is enough. - if len(t) > 4 && t[2] == 'P' && t[3] == '<' { - // Pull out name. - end := strings.IndexRune(t, '>') - if end < 0 { - if err = checkUTF8(t); err != nil { - return "", err - } - return "", &Error{ErrInvalidNamedCapture, s} - } - - capture := t[:end+1] // "(?P<name>" - name := t[4:end] // "name" - if err = checkUTF8(name); err != nil { - return "", err - } - if !isValidCaptureName(name) { - return "", &Error{ErrInvalidNamedCapture, capture} - } - - // Like ordinary capture, but named. - p.numCap++ - re := p.op(opLeftParen) - re.Cap = p.numCap - re.Name = name - return t[end+1:], nil - } - - // Non-capturing group. Might also twiddle Perl flags. - var c rune - t = t[2:] // skip (? - flags := p.flags - sign := +1 - sawFlag := false -Loop: - for t != "" { - if c, t, err = nextRune(t); err != nil { - return "", err - } - switch c { - default: - break Loop - - // Flags. - case 'i': - flags |= FoldCase - sawFlag = true - case 'm': - flags &^= OneLine - sawFlag = true - case 's': - flags |= DotNL - sawFlag = true - case 'U': - flags |= NonGreedy - sawFlag = true - - // Switch to negation. - case '-': - if sign < 0 { - break Loop - } - sign = -1 - // Invert flags so that | above turn into &^ and vice versa. - // We'll invert flags again before using it below. - flags = ^flags - sawFlag = false - - // End of flags, starting group or not. - case ':', ')': - if sign < 0 { - if !sawFlag { - break Loop - } - flags = ^flags - } - if c == ':' { - // Open new group - p.op(opLeftParen) - } - p.flags = flags - return t, nil - } - } - - return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} -} - -// isValidCaptureName reports whether name -// is a valid capture name: [A-Za-z0-9_]+. -// PCRE limits names to 32 bytes. -// Python rejects names starting with digits. -// We don't enforce either of those. -func isValidCaptureName(name string) bool { - if name == "" { - return false - } - for _, c := range name { - if c != '_' && !isalnum(c) { - return false - } - } - return true -} - -// parseInt parses a decimal integer. -func (p *parser) parseInt(s string) (n int, rest string, ok bool) { - if s == "" || s[0] < '0' || '9' < s[0] { - return - } - // Disallow leading zeros. - if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { - return - } - t := s - for s != "" && '0' <= s[0] && s[0] <= '9' { - s = s[1:] - } - rest = s - ok = true - // Have digits, compute value. - t = t[:len(t)-len(s)] - for i := 0; i < len(t); i++ { - // Avoid overflow. - if n >= 1e8 { - n = -1 - break - } - n = n*10 + int(t[i]) - '0' - } - return -} - -// can this be represented as a character class? -// single-rune literal string, char class, ., and .|\n. -func isCharClass(re *Regexp) bool { - return re.Op == OpLiteral && len(re.Rune) == 1 || - re.Op == OpCharClass || - re.Op == OpAnyCharNotNL || - re.Op == OpAnyChar -} - -// does re match r? -func matchRune(re *Regexp, r rune) bool { - switch re.Op { - case OpLiteral: - return len(re.Rune) == 1 && re.Rune[0] == r - case OpCharClass: - for i := 0; i < len(re.Rune); i += 2 { - if re.Rune[i] <= r && r <= re.Rune[i+1] { - return true - } - } - return false - case OpAnyCharNotNL: - return r != '\n' - case OpAnyChar: - return true - } - return false -} - -// parseVerticalBar handles a | in the input. -func (p *parser) parseVerticalBar() error { - p.concat() - - // The concatenation we just parsed is on top of the stack. - // If it sits above an opVerticalBar, swap it below - // (things below an opVerticalBar become an alternation). - // Otherwise, push a new vertical bar. - if !p.swapVerticalBar() { - p.op(opVerticalBar) - } - - return nil -} - -// mergeCharClass makes dst = dst|src. -// The caller must ensure that dst.Op >= src.Op, -// to reduce the amount of copying. -func mergeCharClass(dst, src *Regexp) { - switch dst.Op { - case OpAnyChar: - // src doesn't add anything. - case OpAnyCharNotNL: - // src might add \n - if matchRune(src, '\n') { - dst.Op = OpAnyChar - } - case OpCharClass: - // src is simpler, so either literal or char class - if src.Op == OpLiteral { - dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) - } else { - dst.Rune = appendClass(dst.Rune, src.Rune) - } - case OpLiteral: - // both literal - if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { - break - } - dst.Op = OpCharClass - dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) - dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) - } -} - -// If the top of the stack is an element followed by an opVerticalBar -// swapVerticalBar swaps the two and returns true. -// Otherwise it returns false. -func (p *parser) swapVerticalBar() bool { - // If above and below vertical bar are literal or char class, - // can merge into a single char class. - n := len(p.stack) - if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { - re1 := p.stack[n-1] - re3 := p.stack[n-3] - // Make re3 the more complex of the two. - if re1.Op > re3.Op { - re1, re3 = re3, re1 - p.stack[n-3] = re3 - } - mergeCharClass(re3, re1) - p.reuse(re1) - p.stack = p.stack[:n-1] - return true - } - - if n >= 2 { - re1 := p.stack[n-1] - re2 := p.stack[n-2] - if re2.Op == opVerticalBar { - if n >= 3 { - // Now out of reach. - // Clean opportunistically. - cleanAlt(p.stack[n-3]) - } - p.stack[n-2] = re1 - p.stack[n-1] = re2 - return true - } - } - return false -} - -// parseRightParen handles a ) in the input. -func (p *parser) parseRightParen() error { - p.concat() - if p.swapVerticalBar() { - // pop vertical bar - p.stack = p.stack[:len(p.stack)-1] - } - p.alternate() - - n := len(p.stack) - if n < 2 { - return &Error{ErrUnexpectedParen, p.wholeRegexp} - } - re1 := p.stack[n-1] - re2 := p.stack[n-2] - p.stack = p.stack[:n-2] - if re2.Op != opLeftParen { - return &Error{ErrUnexpectedParen, p.wholeRegexp} - } - // Restore flags at time of paren. - p.flags = re2.Flags - if re2.Cap == 0 { - // Just for grouping. - p.push(re1) - } else { - re2.Op = OpCapture - re2.Sub = re2.Sub0[:1] - re2.Sub[0] = re1 - p.push(re2) - } - return nil -} - -// parseEscape parses an escape sequence at the beginning of s -// and returns the rune. -func (p *parser) parseEscape(s string) (r rune, rest string, err error) { - t := s[1:] - if t == "" { - return 0, "", &Error{ErrTrailingBackslash, ""} - } - c, t, err := nextRune(t) - if err != nil { - return 0, "", err - } - -Switch: - switch c { - default: - if c < utf8.RuneSelf && !isalnum(c) { - // Escaped non-word characters are always themselves. - // PCRE is not quite so rigorous: it accepts things like - // \q, but we don't. We once rejected \_, but too many - // programs and people insist on using it, so allow \_. - return c, t, nil - } - - // Octal escapes. - case '1', '2', '3', '4', '5', '6', '7': - // Single non-zero digit is a backreference; not supported - if t == "" || t[0] < '0' || t[0] > '7' { - break - } - fallthrough - case '0': - // Consume up to three octal digits; already have one. - r = c - '0' - for i := 1; i < 3; i++ { - if t == "" || t[0] < '0' || t[0] > '7' { - break - } - r = r*8 + rune(t[0]) - '0' - t = t[1:] - } - return r, t, nil - - // Hexadecimal escapes. - case 'x': - if t == "" { - break - } - if c, t, err = nextRune(t); err != nil { - return 0, "", err - } - if c == '{' { - // Any number of digits in braces. - // Perl accepts any text at all; it ignores all text - // after the first non-hex digit. We require only hex digits, - // and at least one. - nhex := 0 - r = 0 - for { - if t == "" { - break Switch - } - if c, t, err = nextRune(t); err != nil { - return 0, "", err - } - if c == '}' { - break - } - v := unhex(c) - if v < 0 { - break Switch - } - r = r*16 + v - if r > unicode.MaxRune { - break Switch - } - nhex++ - } - if nhex == 0 { - break Switch - } - return r, t, nil - } - - // Easy case: two hex digits. - x := unhex(c) - if c, t, err = nextRune(t); err != nil { - return 0, "", err - } - y := unhex(c) - if x < 0 || y < 0 { - break - } - return x*16 + y, t, nil - - // C escapes. There is no case 'b', to avoid misparsing - // the Perl word-boundary \b as the C backspace \b - // when in POSIX mode. In Perl, /\b/ means word-boundary - // but /[\b]/ means backspace. We don't support that. - // If you want a backspace, embed a literal backspace - // character or use \x08. - case 'a': - return '\a', t, err - case 'f': - return '\f', t, err - case 'n': - return '\n', t, err - case 'r': - return '\r', t, err - case 't': - return '\t', t, err - case 'v': - return '\v', t, err - } - return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} -} - -// parseClassChar parses a character class character at the beginning of s -// and returns it. -func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) { - if s == "" { - return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} - } - - // Allow regular escape sequences even though - // many need not be escaped in this context. - if s[0] == '\\' { - return p.parseEscape(s) - } - - return nextRune(s) -} - -type charGroup struct { - sign int - class []rune -} - -// parsePerlClassEscape parses a leading Perl character class escape like \d -// from the beginning of s. If one is present, it appends the characters to r -// and returns the new slice r and the remainder of the string. -func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) { - if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { - return - } - g := perlGroup[s[0:2]] - if g.sign == 0 { - return - } - return p.appendGroup(r, g), s[2:] -} - -// parseNamedClass parses a leading POSIX named character class like [:alnum:] -// from the beginning of s. If one is present, it appends the characters to r -// and returns the new slice r and the remainder of the string. -func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) { - if len(s) < 2 || s[0] != '[' || s[1] != ':' { - return - } - - i := strings.Index(s[2:], ":]") - if i < 0 { - return - } - i += 2 - name, s := s[0:i+2], s[i+2:] - g := posixGroup[name] - if g.sign == 0 { - return nil, "", &Error{ErrInvalidCharRange, name} - } - return p.appendGroup(r, g), s, nil -} - -func (p *parser) appendGroup(r []rune, g charGroup) []rune { - if p.flags&FoldCase == 0 { - if g.sign < 0 { - r = appendNegatedClass(r, g.class) - } else { - r = appendClass(r, g.class) - } - } else { - tmp := p.tmpClass[:0] - tmp = appendFoldedClass(tmp, g.class) - p.tmpClass = tmp - tmp = cleanClass(&p.tmpClass) - if g.sign < 0 { - r = appendNegatedClass(r, tmp) - } else { - r = appendClass(r, tmp) - } - } - return r -} - -var anyTable = &unicode.RangeTable{ - R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, - R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, -} - -// unicodeTable returns the unicode.RangeTable identified by name -// and the table of additional fold-equivalent code points. -func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { - // Special case: "Any" means any. - if name == "Any" { - return anyTable, anyTable - } - if t := unicode.Categories[name]; t != nil { - return t, unicode.FoldCategory[name] - } - if t := unicode.Scripts[name]; t != nil { - return t, unicode.FoldScript[name] - } - return nil, nil -} - -// parseUnicodeClass parses a leading Unicode character class like \p{Han} -// from the beginning of s. If one is present, it appends the characters to r -// and returns the new slice r and the remainder of the string. -func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) { - if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { - return - } - - // Committed to parse or return error. - sign := +1 - if s[1] == 'P' { - sign = -1 - } - t := s[2:] - c, t, err := nextRune(t) - if err != nil { - return - } - var seq, name string - if c != '{' { - // Single-letter name. - seq = s[:len(s)-len(t)] - name = seq[2:] - } else { - // Name is in braces. - end := strings.IndexRune(s, '}') - if end < 0 { - if err = checkUTF8(s); err != nil { - return - } - return nil, "", &Error{ErrInvalidCharRange, s} - } - seq, t = s[:end+1], s[end+1:] - name = s[3:end] - if err = checkUTF8(name); err != nil { - return - } - } - - // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. - if name != "" && name[0] == '^' { - sign = -sign - name = name[1:] - } - - tab, fold := unicodeTable(name) - if tab == nil { - return nil, "", &Error{ErrInvalidCharRange, seq} - } - - if p.flags&FoldCase == 0 || fold == nil { - if sign > 0 { - r = appendTable(r, tab) - } else { - r = appendNegatedTable(r, tab) - } - } else { - // Merge and clean tab and fold in a temporary buffer. - // This is necessary for the negative case and just tidy - // for the positive case. - tmp := p.tmpClass[:0] - tmp = appendTable(tmp, tab) - tmp = appendTable(tmp, fold) - p.tmpClass = tmp - tmp = cleanClass(&p.tmpClass) - if sign > 0 { - r = appendClass(r, tmp) - } else { - r = appendNegatedClass(r, tmp) - } - } - return r, t, nil -} - -// parseClass parses a character class at the beginning of s -// and pushes it onto the parse stack. -func (p *parser) parseClass(s string) (rest string, err error) { - t := s[1:] // chop [ - re := p.newRegexp(OpCharClass) - re.Flags = p.flags - re.Rune = re.Rune0[:0] - - sign := +1 - if t != "" && t[0] == '^' { - sign = -1 - t = t[1:] - - // If character class does not match \n, add it here, - // so that negation later will do the right thing. - if p.flags&ClassNL == 0 { - re.Rune = append(re.Rune, '\n', '\n') - } - } - - class := re.Rune - first := true // ] and - are okay as first char in class - for t == "" || t[0] != ']' || first { - // POSIX: - is only okay unescaped as first or last in class. - // Perl: - is okay anywhere. - if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { - _, size := utf8.DecodeRuneInString(t[1:]) - return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} - } - first = false - - // Look for POSIX [:alnum:] etc. - if len(t) > 2 && t[0] == '[' && t[1] == ':' { - nclass, nt, err := p.parseNamedClass(t, class) - if err != nil { - return "", err - } - if nclass != nil { - class, t = nclass, nt - continue - } - } - - // Look for Unicode character group like \p{Han}. - nclass, nt, err := p.parseUnicodeClass(t, class) - if err != nil { - return "", err - } - if nclass != nil { - class, t = nclass, nt - continue - } - - // Look for Perl character class symbols (extension). - if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { - class, t = nclass, nt - continue - } - - // Single character or simple range. - rng := t - var lo, hi rune - if lo, t, err = p.parseClassChar(t, s); err != nil { - return "", err - } - hi = lo - // [a-] means (a|-) so check for final ]. - if len(t) >= 2 && t[0] == '-' && t[1] != ']' { - t = t[1:] - if hi, t, err = p.parseClassChar(t, s); err != nil { - return "", err - } - if hi < lo { - rng = rng[:len(rng)-len(t)] - return "", &Error{Code: ErrInvalidCharRange, Expr: rng} - } - } - if p.flags&FoldCase == 0 { - class = appendRange(class, lo, hi) - } else { - class = appendFoldedRange(class, lo, hi) - } - } - t = t[1:] // chop ] - - // Use &re.Rune instead of &class to avoid allocation. - re.Rune = class - class = cleanClass(&re.Rune) - if sign < 0 { - class = negateClass(class) - } - re.Rune = class - p.push(re) - return t, nil -} - -// cleanClass sorts the ranges (pairs of elements of r), -// merges them, and eliminates duplicates. -func cleanClass(rp *[]rune) []rune { - - // Sort by lo increasing, hi decreasing to break ties. - sort.Sort(ranges{rp}) - - r := *rp - if len(r) < 2 { - return r - } - - // Merge abutting, overlapping. - w := 2 // write index - for i := 2; i < len(r); i += 2 { - lo, hi := r[i], r[i+1] - if lo <= r[w-1]+1 { - // merge with previous range - if hi > r[w-1] { - r[w-1] = hi - } - continue - } - // new disjoint range - r[w] = lo - r[w+1] = hi - w += 2 - } - - return r[:w] -} - -// appendLiteral returns the result of appending the literal x to the class r. -func appendLiteral(r []rune, x rune, flags Flags) []rune { - if flags&FoldCase != 0 { - return appendFoldedRange(r, x, x) - } - return appendRange(r, x, x) -} - -// appendRange returns the result of appending the range lo-hi to the class r. -func appendRange(r []rune, lo, hi rune) []rune { - // Expand last range or next to last range if it overlaps or abuts. - // Checking two ranges helps when appending case-folded - // alphabets, so that one range can be expanding A-Z and the - // other expanding a-z. - n := len(r) - for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 - if n >= i { - rlo, rhi := r[n-i], r[n-i+1] - if lo <= rhi+1 && rlo <= hi+1 { - if lo < rlo { - r[n-i] = lo - } - if hi > rhi { - r[n-i+1] = hi - } - return r - } - } - } - - return append(r, lo, hi) -} - -const ( - // minimum and maximum runes involved in folding. - // checked during test. - minFold = 0x0041 - maxFold = 0x1044f -) - -// appendFoldedRange returns the result of appending the range lo-hi -// and its case folding-equivalent runes to the class r. -func appendFoldedRange(r []rune, lo, hi rune) []rune { - // Optimizations. - if lo <= minFold && hi >= maxFold { - // Range is full: folding can't add more. - return appendRange(r, lo, hi) - } - if hi < minFold || lo > maxFold { - // Range is outside folding possibilities. - return appendRange(r, lo, hi) - } - if lo < minFold { - // [lo, minFold-1] needs no folding. - r = appendRange(r, lo, minFold-1) - lo = minFold - } - if hi > maxFold { - // [maxFold+1, hi] needs no folding. - r = appendRange(r, maxFold+1, hi) - hi = maxFold - } - - // Brute force. Depend on appendRange to coalesce ranges on the fly. - for c := lo; c <= hi; c++ { - r = appendRange(r, c, c) - f := unicode.SimpleFold(c) - for f != c { - r = appendRange(r, f, f) - f = unicode.SimpleFold(f) - } - } - return r -} - -// appendClass returns the result of appending the class x to the class r. -// It assume x is clean. -func appendClass(r []rune, x []rune) []rune { - for i := 0; i < len(x); i += 2 { - r = appendRange(r, x[i], x[i+1]) - } - return r -} - -// appendFolded returns the result of appending the case folding of the class x to the class r. -func appendFoldedClass(r []rune, x []rune) []rune { - for i := 0; i < len(x); i += 2 { - r = appendFoldedRange(r, x[i], x[i+1]) - } - return r -} - -// appendNegatedClass returns the result of appending the negation of the class x to the class r. -// It assumes x is clean. -func appendNegatedClass(r []rune, x []rune) []rune { - nextLo := '\u0000' - for i := 0; i < len(x); i += 2 { - lo, hi := x[i], x[i+1] - if nextLo <= lo-1 { - r = appendRange(r, nextLo, lo-1) - } - nextLo = hi + 1 - } - if nextLo <= unicode.MaxRune { - r = appendRange(r, nextLo, unicode.MaxRune) - } - return r -} - -// appendTable returns the result of appending x to the class r. -func appendTable(r []rune, x *unicode.RangeTable) []rune { - for _, xr := range x.R16 { - lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) - if stride == 1 { - r = appendRange(r, lo, hi) - continue - } - for c := lo; c <= hi; c += stride { - r = appendRange(r, c, c) - } - } - for _, xr := range x.R32 { - lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) - if stride == 1 { - r = appendRange(r, lo, hi) - continue - } - for c := lo; c <= hi; c += stride { - r = appendRange(r, c, c) - } - } - return r -} - -// appendNegatedTable returns the result of appending the negation of x to the class r. -func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { - nextLo := '\u0000' // lo end of next class to add - for _, xr := range x.R16 { - lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) - if stride == 1 { - if nextLo <= lo-1 { - r = appendRange(r, nextLo, lo-1) - } - nextLo = hi + 1 - continue - } - for c := lo; c <= hi; c += stride { - if nextLo <= c-1 { - r = appendRange(r, nextLo, c-1) - } - nextLo = c + 1 - } - } - for _, xr := range x.R32 { - lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) - if stride == 1 { - if nextLo <= lo-1 { - r = appendRange(r, nextLo, lo-1) - } - nextLo = hi + 1 - continue - } - for c := lo; c <= hi; c += stride { - if nextLo <= c-1 { - r = appendRange(r, nextLo, c-1) - } - nextLo = c + 1 - } - } - if nextLo <= unicode.MaxRune { - r = appendRange(r, nextLo, unicode.MaxRune) - } - return r -} - -// negateClass overwrites r and returns r's negation. -// It assumes the class r is already clean. -func negateClass(r []rune) []rune { - nextLo := '\u0000' // lo end of next class to add - w := 0 // write index - for i := 0; i < len(r); i += 2 { - lo, hi := r[i], r[i+1] - if nextLo <= lo-1 { - r[w] = nextLo - r[w+1] = lo - 1 - w += 2 - } - nextLo = hi + 1 - } - r = r[:w] - if nextLo <= unicode.MaxRune { - // It's possible for the negation to have one more - // range - this one - than the original class, so use append. - r = append(r, nextLo, unicode.MaxRune) - } - return r -} - -// ranges implements sort.Interface on a []rune. -// The choice of receiver type definition is strange -// but avoids an allocation since we already have -// a *[]rune. -type ranges struct { - p *[]rune -} - -func (ra ranges) Less(i, j int) bool { - p := *ra.p - i *= 2 - j *= 2 - return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] -} - -func (ra ranges) Len() int { - return len(*ra.p) / 2 -} - -func (ra ranges) Swap(i, j int) { - p := *ra.p - i *= 2 - j *= 2 - p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] -} - -func checkUTF8(s string) error { - for s != "" { - rune, size := utf8.DecodeRuneInString(s) - if rune == utf8.RuneError && size == 1 { - return &Error{Code: ErrInvalidUTF8, Expr: s} - } - s = s[size:] - } - return nil -} - -func nextRune(s string) (c rune, t string, err error) { - c, size := utf8.DecodeRuneInString(s) - if c == utf8.RuneError && size == 1 { - return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} - } - return c, s[size:], nil -} - -func isalnum(c rune) bool { - return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' -} - -func unhex(c rune) rune { - if '0' <= c && c <= '9' { - return c - '0' - } - if 'a' <= c && c <= 'f' { - return c - 'a' + 10 - } - if 'A' <= c && c <= 'F' { - return c - 'A' + 10 - } - return -1 -} |