diff options
Diffstat (limited to 'src/pkg/regexp')
-rw-r--r-- | src/pkg/regexp/all_test.go | 65 | ||||
-rw-r--r-- | src/pkg/regexp/example_test.go | 4 | ||||
-rw-r--r-- | src/pkg/regexp/exec.go | 121 | ||||
-rw-r--r-- | src/pkg/regexp/onepass.go | 582 | ||||
-rw-r--r-- | src/pkg/regexp/onepass_test.go | 208 | ||||
-rw-r--r-- | src/pkg/regexp/regexp.go | 20 | ||||
-rw-r--r-- | src/pkg/regexp/syntax/doc.go | 4 | ||||
-rwxr-xr-x | src/pkg/regexp/syntax/make_perl_groups.pl | 4 | ||||
-rw-r--r-- | src/pkg/regexp/syntax/parse.go | 3 | ||||
-rw-r--r-- | src/pkg/regexp/syntax/parse_test.go | 4 | ||||
-rw-r--r-- | src/pkg/regexp/syntax/perl_groups.go | 4 | ||||
-rw-r--r-- | src/pkg/regexp/syntax/prog.go | 54 | ||||
-rw-r--r-- | src/pkg/regexp/syntax/prog_test.go | 4 |
13 files changed, 1049 insertions, 28 deletions
diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go index e914a7ccb..301a1dfcd 100644 --- a/src/pkg/regexp/all_test.go +++ b/src/pkg/regexp/all_test.go @@ -473,6 +473,11 @@ func TestSplit(t *testing.T) { } } +// This ran out of stack before issue 7608 was fixed. +func TestOnePassCutoff(t *testing.T) { + MustCompile(`^(?:x{1,1000}){1,1000}$`) +} + func BenchmarkLiteral(b *testing.B) { x := strings.Repeat("x", 50) + "y" b.StopTimer() @@ -578,3 +583,63 @@ func BenchmarkAnchoredLongMatch(b *testing.B) { re.Match(x) } } + +func BenchmarkOnePassShortA(b *testing.B) { + b.StopTimer() + x := []byte("abcddddddeeeededd") + re := MustCompile("^.bc(d|e)*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkNotOnePassShortA(b *testing.B) { + b.StopTimer() + x := []byte("abcddddddeeeededd") + re := MustCompile(".bc(d|e)*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkOnePassShortB(b *testing.B) { + b.StopTimer() + x := []byte("abcddddddeeeededd") + re := MustCompile("^.bc(?:d|e)*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkNotOnePassShortB(b *testing.B) { + b.StopTimer() + x := []byte("abcddddddeeeededd") + re := MustCompile(".bc(?:d|e)*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkOnePassLongPrefix(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^abcdefghijklmnopqrstuvwxyz.*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkOnePassLongNotPrefix(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^.bcdefghijklmnopqrstuvwxyz.*$") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} diff --git a/src/pkg/regexp/example_test.go b/src/pkg/regexp/example_test.go index b0ad9d340..a4e0da8ea 100644 --- a/src/pkg/regexp/example_test.go +++ b/src/pkg/regexp/example_test.go @@ -1,3 +1,7 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + package regexp_test import ( diff --git a/src/pkg/regexp/exec.go b/src/pkg/regexp/exec.go index 333ca2554..c4cb201f6 100644 --- a/src/pkg/regexp/exec.go +++ b/src/pkg/regexp/exec.go @@ -37,6 +37,7 @@ type thread struct { type machine struct { re *Regexp // corresponding Regexp p *syntax.Prog // compiled program + op *onePassProg // compiled onepass program, or notOnePass q0, q1 queue // two queues for runq, nextq pool []*thread // pool of available threads matched bool // whether a match was found @@ -66,8 +67,8 @@ func (m *machine) newInputReader(r io.RuneReader) input { } // progMachine returns a new machine running the prog p. -func progMachine(p *syntax.Prog) *machine { - m := &machine{p: p} +func progMachine(p *syntax.Prog, op *onePassProg) *machine { + m := &machine{p: p, op: op} n := len(m.p.Inst) m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} @@ -312,6 +313,105 @@ func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.Empty return t } +// onepass runs the machine over the input starting at pos. +// It reports whether a match was found. +// If so, m.matchcap holds the submatch information. +func (m *machine) onepass(i input, pos int) bool { + startCond := m.re.cond + if startCond == ^syntax.EmptyOp(0) { // impossible + return false + } + m.matched = false + for i := range m.matchcap { + m.matchcap[i] = -1 + } + r, r1 := endOfText, endOfText + width, width1 := 0, 0 + r, width = i.step(pos) + if r != endOfText { + r1, width1 = i.step(pos + width) + } + var flag syntax.EmptyOp + if pos == 0 { + flag = syntax.EmptyOpContext(-1, r) + } else { + flag = i.context(pos) + } + pc := m.op.Start + inst := m.op.Inst[pc] + // If there is a simple literal prefix, skip over it. + if pos == 0 && syntax.EmptyOp(inst.Arg)&^flag == 0 && + len(m.re.prefix) > 0 && i.canCheckPrefix() { + // Match requires literal prefix; fast search for it. + if i.hasPrefix(m.re) { + pos += len(m.re.prefix) + r, width = i.step(pos) + r1, width1 = i.step(pos + width) + flag = i.context(pos) + pc = int(m.re.prefixEnd) + } else { + return m.matched + } + } + for { + inst = m.op.Inst[pc] + pc = int(inst.Out) + switch inst.Op { + default: + panic("bad inst") + case syntax.InstMatch: + m.matched = true + if len(m.matchcap) > 0 { + m.matchcap[0] = 0 + m.matchcap[1] = pos + } + return m.matched + case syntax.InstRune: + if !inst.MatchRune(r) { + return m.matched + } + case syntax.InstRune1: + if r != inst.Rune[0] { + return m.matched + } + case syntax.InstRuneAny: + // Nothing + case syntax.InstRuneAnyNotNL: + if r == '\n' { + return m.matched + } + // peek at the input rune to see which branch of the Alt to take + case syntax.InstAlt, syntax.InstAltMatch: + pc = int(onePassNext(&inst, r)) + continue + case syntax.InstFail: + return m.matched + case syntax.InstNop: + continue + case syntax.InstEmptyWidth: + if syntax.EmptyOp(inst.Arg)&^flag != 0 { + return m.matched + } + continue + case syntax.InstCapture: + if int(inst.Arg) < len(m.matchcap) { + m.matchcap[inst.Arg] = pos + } + continue + } + if width == 0 { + break + } + flag = syntax.EmptyOpContext(r, r1) + pos += width + r, width = r1, width1 + if r != endOfText { + r1, width1 = i.step(pos + width) + } + } + return m.matched +} + // empty is a non-nil 0-element slice, // so doExecute can avoid an allocation // when 0 captures are requested from a successful match. @@ -329,16 +429,23 @@ func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap i } else { i = m.newInputString(s) } - m.init(ncap) - if !m.match(i, pos) { - re.put(m) - return nil + if m.op != notOnePass { + if !m.onepass(i, pos) { + re.put(m) + return nil + } + } else { + m.init(ncap) + if !m.match(i, pos) { + re.put(m) + return nil + } } if ncap == 0 { re.put(m) return empty // empty but not nil } - cap := make([]int, ncap) + cap := make([]int, len(m.matchcap)) copy(cap, m.matchcap) re.put(m) return cap diff --git a/src/pkg/regexp/onepass.go b/src/pkg/regexp/onepass.go new file mode 100644 index 000000000..501fb28af --- /dev/null +++ b/src/pkg/regexp/onepass.go @@ -0,0 +1,582 @@ +// Copyright 2014 The Go Authors. All rights reserved. + +package regexp + +import ( + "bytes" + "regexp/syntax" + "sort" + "unicode" +) + +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// "One-pass" regexp execution. +// Some regexps can be analyzed to determine that they never need +// backtracking: they are guaranteed to run in one pass over the string +// without bothering to save all the usual NFA state. +// Detect those and execute them more quickly. + +// A onePassProg is a compiled one-pass regular expression program. +// It is the same as syntax.Prog except for the use of onePassInst. +type onePassProg struct { + Inst []onePassInst + Start int // index of start instruction + NumCap int // number of InstCapture insts in re +} + +// A onePassInst is a single instruction in a one-pass regular expression program. +// It is the same as syntax.Inst except for the new 'Next' field. +type onePassInst struct { + syntax.Inst + Next []uint32 +} + +// OnePassPrefix returns a literal string that all matches for the +// regexp must start with. Complete is true if the prefix +// is the entire match. Pc is the index of the last rune instruction +// in the string. The OnePassPrefix skips over the mandatory +// EmptyBeginText +func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) { + i := &p.Inst[p.Start] + if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 { + return "", i.Op == syntax.InstMatch, uint32(p.Start) + } + pc = i.Out + i = &p.Inst[pc] + for i.Op == syntax.InstNop { + pc = i.Out + i = &p.Inst[pc] + } + // Avoid allocation of buffer if prefix is empty. + if iop(i) != syntax.InstRune || len(i.Rune) != 1 { + return "", i.Op == syntax.InstMatch, uint32(p.Start) + } + + // Have prefix; gather characters. + var buf bytes.Buffer + for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 { + buf.WriteRune(i.Rune[0]) + pc, i = i.Out, &p.Inst[i.Out] + } + return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc +} + +// OnePassNext selects the next actionable state of the prog, based on the input character. +// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine. +// One of the alternates may ultimately lead without input to end of line. If the instruction +// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next. +func onePassNext(i *onePassInst, r rune) uint32 { + next := i.MatchRunePos(r) + if next >= 0 { + return i.Next[next] + } + if i.Op == syntax.InstAltMatch { + return i.Out + } + return 0 +} + +func iop(i *syntax.Inst) syntax.InstOp { + op := i.Op + switch op { + case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + op = syntax.InstRune + } + return op +} + +// Sparse Array implementation is used as a queueOnePass. +type queueOnePass struct { + sparse []uint32 + dense []uint32 + size, nextIndex uint32 +} + +func (q *queueOnePass) empty() bool { + return q.nextIndex >= q.size +} + +func (q *queueOnePass) next() (n uint32) { + n = q.dense[q.nextIndex] + q.nextIndex++ + return +} + +func (q *queueOnePass) clear() { + q.size = 0 + q.nextIndex = 0 +} + +func (q *queueOnePass) reset() { + q.nextIndex = 0 +} + +func (q *queueOnePass) contains(u uint32) bool { + if u >= uint32(len(q.sparse)) { + return false + } + return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u +} + +func (q *queueOnePass) insert(u uint32) { + if !q.contains(u) { + q.insertNew(u) + } +} + +func (q *queueOnePass) insertNew(u uint32) { + if u >= uint32(len(q.sparse)) { + return + } + q.sparse[u] = q.size + q.dense[q.size] = u + q.size++ +} + +func newQueue(size int) (q *queueOnePass) { + return &queueOnePass{ + sparse: make([]uint32, size), + dense: make([]uint32, size), + } +} + +// mergeRuneSets merges two non-intersecting runesets, and returns the merged result, +// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index +// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a +// NextIp array with the single element mergeFailed is returned. +// The code assumes that both inputs contain ordered and non-intersecting rune pairs. +const mergeFailed = uint32(0xffffffff) + +var ( + noRune = []rune{} + noNext = []uint32{mergeFailed} +) + +func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) { + leftLen := len(*leftRunes) + rightLen := len(*rightRunes) + if leftLen&0x1 != 0 || rightLen&0x1 != 0 { + panic("mergeRuneSets odd length []rune") + } + var ( + lx, rx int + ) + merged := make([]rune, 0) + next := make([]uint32, 0) + ok := true + defer func() { + if !ok { + merged = nil + next = nil + } + }() + + ix := -1 + extend := func(newLow *int, newArray *[]rune, pc uint32) bool { + if ix > 0 && (*newArray)[*newLow] <= merged[ix] { + return false + } + merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1]) + *newLow += 2 + ix += 2 + next = append(next, pc) + return true + } + + for lx < leftLen || rx < rightLen { + switch { + case rx >= rightLen: + ok = extend(&lx, leftRunes, leftPC) + case lx >= leftLen: + ok = extend(&rx, rightRunes, rightPC) + case (*rightRunes)[rx] < (*leftRunes)[lx]: + ok = extend(&rx, rightRunes, rightPC) + default: + ok = extend(&lx, leftRunes, leftPC) + } + if !ok { + return noRune, noNext + } + } + return merged, next +} + +// cleanupOnePass drops working memory, and restores certain shortcut instructions. +func cleanupOnePass(prog *onePassProg, original *syntax.Prog) { + for ix, instOriginal := range original.Inst { + switch instOriginal.Op { + case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune: + case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail: + prog.Inst[ix].Next = nil + case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + prog.Inst[ix].Next = nil + prog.Inst[ix] = onePassInst{Inst: instOriginal} + } + } +} + +// onePassCopy creates a copy of the original Prog, as we'll be modifying it +func onePassCopy(prog *syntax.Prog) *onePassProg { + p := &onePassProg{ + Start: prog.Start, + NumCap: prog.NumCap, + } + for _, inst := range prog.Inst { + p.Inst = append(p.Inst, onePassInst{Inst: inst}) + } + + // rewrites one or more common Prog constructs that enable some otherwise + // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at + // ip A, that points to ips B & C. + // A:BC + B:DA => A:BC + B:CD + // A:BC + B:DC => A:DC + B:DC + for pc := range p.Inst { + switch p.Inst[pc].Op { + default: + continue + case syntax.InstAlt, syntax.InstAltMatch: + // A:Bx + B:Ay + p_A_Other := &p.Inst[pc].Out + p_A_Alt := &p.Inst[pc].Arg + // make sure a target is another Alt + instAlt := p.Inst[*p_A_Alt] + if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { + p_A_Alt, p_A_Other = p_A_Other, p_A_Alt + instAlt = p.Inst[*p_A_Alt] + if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { + continue + } + } + instOther := p.Inst[*p_A_Other] + // Analyzing both legs pointing to Alts is for another day + if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch { + // too complicated + continue + } + // simple empty transition loop + // A:BC + B:DA => A:BC + B:DC + p_B_Alt := &p.Inst[*p_A_Alt].Out + p_B_Other := &p.Inst[*p_A_Alt].Arg + patch := false + if instAlt.Out == uint32(pc) { + patch = true + } else if instAlt.Arg == uint32(pc) { + patch = true + p_B_Alt, p_B_Other = p_B_Other, p_B_Alt + } + if patch { + *p_B_Alt = *p_A_Other + } + + // empty transition to common target + // A:BC + B:DC => A:DC + B:DC + if *p_A_Other == *p_B_Alt { + *p_A_Alt = *p_B_Other + } + } + } + return p +} + +// runeSlice exists to permit sorting the case-folded rune sets. +type runeSlice []rune + +func (p runeSlice) Len() int { return len(p) } +func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } +func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +// Sort is a convenience method. +func (p runeSlice) Sort() { + sort.Sort(p) +} + +var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} +var anyRune = []rune{0, unicode.MaxRune} + +// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt, +// the match engine can always tell which branch to take. The routine may modify +// p if it is turned into a onepass Prog. If it isn't possible for this to be a +// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive +// to the size of the Prog. +func makeOnePass(p *onePassProg) *onePassProg { + // If the machine is very long, it's not worth the time to check if we can use one pass. + if len(p.Inst) >= 1000 { + return notOnePass + } + + var ( + instQueue = newQueue(len(p.Inst)) + visitQueue = newQueue(len(p.Inst)) + build func(uint32, *queueOnePass) + check func(uint32, map[uint32]bool) bool + onePassRunes = make([][]rune, len(p.Inst)) + ) + build = func(pc uint32, q *queueOnePass) { + if q.contains(pc) { + return + } + inst := p.Inst[pc] + switch inst.Op { + case syntax.InstAlt, syntax.InstAltMatch: + q.insert(inst.Out) + build(inst.Out, q) + q.insert(inst.Arg) + case syntax.InstMatch, syntax.InstFail: + default: + q.insert(inst.Out) + } + } + + // check that paths from Alt instructions are unambiguous, and rebuild the new + // program as a onepass program + check = func(pc uint32, m map[uint32]bool) (ok bool) { + ok = true + inst := &p.Inst[pc] + if visitQueue.contains(pc) { + return + } + visitQueue.insert(pc) + switch inst.Op { + case syntax.InstAlt, syntax.InstAltMatch: + ok = check(inst.Out, m) && check(inst.Arg, m) + // check no-input paths to InstMatch + matchOut := m[inst.Out] + matchArg := m[inst.Arg] + if matchOut && matchArg { + ok = false + break + } + // Match on empty goes in inst.Out + if matchArg { + inst.Out, inst.Arg = inst.Arg, inst.Out + matchOut, matchArg = matchArg, matchOut + } + if matchOut { + m[pc] = true + inst.Op = syntax.InstAltMatch + } + + // build a dispatch operator from the two legs of the alt. + onePassRunes[pc], inst.Next = mergeRuneSets( + &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) + if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { + ok = false + break + } + case syntax.InstCapture, syntax.InstNop: + ok = check(inst.Out, m) + m[pc] = m[inst.Out] + // pass matching runes back through these no-ops. + onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) + inst.Next = []uint32{} + for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { + inst.Next = append(inst.Next, inst.Out) + } + case syntax.InstEmptyWidth: + ok = check(inst.Out, m) + m[pc] = m[inst.Out] + onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) + inst.Next = []uint32{} + for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { + inst.Next = append(inst.Next, inst.Out) + } + case syntax.InstMatch, syntax.InstFail: + m[pc] = inst.Op == syntax.InstMatch + break + case syntax.InstRune: + ok = check(inst.Out, m) + m[pc] = false + if len(inst.Next) > 0 { + break + } + if len(inst.Rune) == 0 { + onePassRunes[pc] = []rune{} + inst.Next = []uint32{inst.Out} + break + } + runes := make([]rune, 0) + if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { + r0 := inst.Rune[0] + runes = append(runes, r0, r0) + for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { + runes = append(runes, r1, r1) + } + sort.Sort(runeSlice(runes)) + } else { + runes = append(runes, inst.Rune...) + } + onePassRunes[pc] = runes + inst.Next = []uint32{} + for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { + inst.Next = append(inst.Next, inst.Out) + } + inst.Op = syntax.InstRune + case syntax.InstRune1: + ok = check(inst.Out, m) + m[pc] = false + if len(inst.Next) > 0 { + break + } + runes := []rune{} + // expand case-folded runes + if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { + r0 := inst.Rune[0] + runes = append(runes, r0, r0) + for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { + runes = append(runes, r1, r1) + } + sort.Sort(runeSlice(runes)) + } else { + runes = append(runes, inst.Rune[0], inst.Rune[0]) + } + onePassRunes[pc] = runes + inst.Next = []uint32{} + for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { + inst.Next = append(inst.Next, inst.Out) + } + inst.Op = syntax.InstRune + case syntax.InstRuneAny: + ok = check(inst.Out, m) + m[pc] = false + if len(inst.Next) > 0 { + break + } + onePassRunes[pc] = append([]rune{}, anyRune...) + inst.Next = []uint32{inst.Out} + case syntax.InstRuneAnyNotNL: + ok = check(inst.Out, m) + m[pc] = false + if len(inst.Next) > 0 { + break + } + onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) + inst.Next = []uint32{} + for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { + inst.Next = append(inst.Next, inst.Out) + } + } + return + } + + instQueue.clear() + instQueue.insert(uint32(p.Start)) + m := make(map[uint32]bool, len(p.Inst)) + for !instQueue.empty() { + pc := instQueue.next() + inst := p.Inst[pc] + visitQueue.clear() + if !check(uint32(pc), m) { + p = notOnePass + break + } + switch inst.Op { + case syntax.InstAlt, syntax.InstAltMatch: + instQueue.insert(inst.Out) + instQueue.insert(inst.Arg) + case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop: + instQueue.insert(inst.Out) + case syntax.InstMatch: + case syntax.InstFail: + case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + default: + } + } + if p != notOnePass { + for i, _ := range p.Inst { + p.Inst[i].Rune = onePassRunes[i] + } + } + return p +} + +// walk visits each Inst in the prog once, and applies the argument +// function(ip, next), in pre-order. +func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) { + var walk1 func(uint32) + progQueue := newQueue(len(prog.Inst)) + walk1 = func(ip uint32) { + if progQueue.contains(ip) { + return + } + progQueue.insert(ip) + inst := prog.Inst[ip] + switch inst.Op { + case syntax.InstAlt, syntax.InstAltMatch: + for _, f := range funcs { + f(ip, inst.Out) + f(ip, inst.Arg) + } + walk1(inst.Out) + walk1(inst.Arg) + default: + for _, f := range funcs { + f(ip, inst.Out) + } + walk1(inst.Out) + } + } + walk1(uint32(prog.Start)) +} + +// find returns the Insts that match the argument predicate function +func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) { + matches = []uint32{} + + for ip := range prog.Inst { + if f(prog, ip) { + matches = append(matches, uint32(ip)) + } + } + return +} + +var notOnePass *onePassProg = nil + +// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog +// can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the +// Prog cannot be converted. For a one pass prog, the fundamental condition that must +// be true is: at any InstAlt, there must be no ambiguity about what branch to take. +func compileOnePass(prog *syntax.Prog) (p *onePassProg) { + if prog.Start == 0 { + return notOnePass + } + // onepass regexp is anchored + if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || + syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { + return notOnePass + } + // every instruction leading to InstMatch must be EmptyEndText + for _, inst := range prog.Inst { + opOut := prog.Inst[inst.Out].Op + switch inst.Op { + default: + if opOut == syntax.InstMatch { + return notOnePass + } + case syntax.InstAlt, syntax.InstAltMatch: + if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { + return notOnePass + } + case syntax.InstEmptyWidth: + if opOut == syntax.InstMatch { + if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { + continue + } + return notOnePass + } + } + } + // Creates a slightly optimized copy of the original Prog + // that cleans up some Prog idioms that block valid onepass programs + p = onePassCopy(prog) + + // checkAmbiguity on InstAlts, build onepass Prog if possible + p = makeOnePass(p) + + if p != notOnePass { + cleanupOnePass(p, prog) + } + return p +} diff --git a/src/pkg/regexp/onepass_test.go b/src/pkg/regexp/onepass_test.go new file mode 100644 index 000000000..7b2beea67 --- /dev/null +++ b/src/pkg/regexp/onepass_test.go @@ -0,0 +1,208 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "reflect" + "regexp/syntax" + "testing" +) + +var runeMergeTests = []struct { + left, right, merged []rune + next []uint32 + leftPC, rightPC uint32 +}{ + { + // empty rhs + []rune{69, 69}, + []rune{}, + []rune{69, 69}, + []uint32{1}, + 1, 2, + }, + { + // identical runes, identical targets + []rune{69, 69}, + []rune{69, 69}, + []rune{}, + []uint32{mergeFailed}, + 1, 1, + }, + { + // identical runes, different targets + []rune{69, 69}, + []rune{69, 69}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // append right-first + []rune{69, 69}, + []rune{71, 71}, + []rune{69, 69, 71, 71}, + []uint32{1, 2}, + 1, 2, + }, + { + // append, left-first + []rune{71, 71}, + []rune{69, 69}, + []rune{69, 69, 71, 71}, + []uint32{2, 1}, + 1, 2, + }, + { + // successful interleave + []rune{60, 60, 71, 71, 101, 101}, + []rune{69, 69, 88, 88}, + []rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101}, + []uint32{1, 2, 1, 2, 1}, + 1, 2, + }, + { + // left surrounds right + []rune{69, 74}, + []rune{71, 71}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // right surrounds left + []rune{69, 74}, + []rune{68, 75}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap at interval begin + []rune{69, 74}, + []rune{74, 75}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap ar interval end + []rune{69, 74}, + []rune{65, 69}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap from above + []rune{69, 74}, + []rune{71, 74}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap from below + []rune{69, 74}, + []rune{65, 71}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // out of order []rune + []rune{69, 74, 60, 65}, + []rune{66, 67}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, +} + +func TestMergeRuneSet(t *testing.T) { + for ix, test := range runeMergeTests { + merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC) + if !reflect.DeepEqual(merged, test.merged) { + t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged) + } + if !reflect.DeepEqual(next, test.next) { + t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next) + } + } +} + +const noStr = `!` + +var onePass = &onePassProg{} + +var onePassTests = []struct { + re string + onePass *onePassProg + prog string +}{ + {`^(?:a|(?:a*))$`, notOnePass, noStr}, + {`^(?:(a)|(?:a*))$`, notOnePass, noStr}, + {`^(?:(?:(?:.(?:$))?))$`, onePass, `a`}, + {`^abcd$`, onePass, `abcd`}, + {`^abcd$`, onePass, `abcde`}, + {`^(?:(?:a{0,})*?)$`, onePass, `a`}, + {`^(?:(?:a+)*)$`, onePass, ``}, + {`^(?:(?:a|(?:aa)))$`, onePass, ``}, + {`^(?:[^\s\S])$`, onePass, ``}, + {`^(?:(?:a{3,4}){0,})$`, notOnePass, `aaaaaa`}, + {`^(?:(?:a+)*)$`, onePass, `a`}, + {`^(?:(?:(?:a*)+))$`, onePass, noStr}, + {`^(?:(?:a+)*)$`, onePass, ``}, + {`^[a-c]+$`, onePass, `abc`}, + {`^[a-c]*$`, onePass, `abcdabc`}, + {`^(?:a*)$`, onePass, `aaaaaaa`}, + {`^(?:(?:aa)|a)$`, onePass, `a`}, + {`^[a-c]*`, notOnePass, `abcdabc`}, + {`^[a-c]*$`, onePass, `abc`}, + {`^...$`, onePass, ``}, + {`^(?:a|(?:aa))$`, onePass, `a`}, + {`^[a-c]*`, notOnePass, `abcabc`}, + {`^a((b))c$`, onePass, noStr}, + {`^a.[l-nA-Cg-j]?e$`, onePass, noStr}, + {`^a((b))$`, onePass, noStr}, + {`^a(?:(b)|(c))c$`, onePass, noStr}, + {`^a(?:(b*)|(c))c$`, notOnePass, noStr}, + {`^a(?:b|c)$`, onePass, noStr}, + {`^a(?:b?|c)$`, onePass, noStr}, + {`^a(?:b?|c?)$`, notOnePass, noStr}, + {`^a(?:b?|c+)$`, onePass, noStr}, + {`^a(?:b+|(bc))d$`, notOnePass, noStr}, + {`^a(?:bc)+$`, onePass, noStr}, + {`^a(?:[bcd])+$`, onePass, noStr}, + {`^a((?:[bcd])+)$`, onePass, noStr}, + {`^a(:?b|c)*d$`, onePass, `abbbccbbcbbd"`}, + {`^.bc(d|e)*$`, onePass, `abcddddddeeeededd`}, + {`^(?:(?:aa)|.)$`, notOnePass, `a`}, + {`^(?:(?:a{1,2}){1,2})$`, notOnePass, `aaaa`}, +} + +func TestCompileOnePass(t *testing.T) { + var ( + p *syntax.Prog + re *syntax.Regexp + err error + ) + for _, test := range onePassTests { + if re, err = syntax.Parse(test.re, syntax.Perl); err != nil { + t.Errorf("Parse(%q) got err:%s, want success", test.re, err) + continue + } + // needs to be done before compile... + re = re.Simplify() + if p, err = syntax.Compile(re); err != nil { + t.Errorf("Compile(%q) got err:%s, want success", test.re, err) + continue + } + onePass = compileOnePass(p) + if (onePass == notOnePass) != (test.onePass == notOnePass) { + t.Errorf("CompileOnePass(%q) got %v, expected %v", test.re, onePass, test.onePass) + } + } +} diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index 0046026ea..0b8336a04 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -11,6 +11,14 @@ // For an overview of the syntax, run // godoc regexp/syntax // +// The regexp implementation provided by this package is +// guaranteed to run in time linear in the size of the input. +// (This is a property not guaranteed by most open source +// implementations of regular expressions.) For more information +// about this property, see +// http://swtch.com/~rsc/regexp/regexp1.html +// or any book about automata theory. +// // All characters are UTF-8-encoded code points. // // There are 16 methods of Regexp that match a regular expression and identify @@ -70,16 +78,17 @@ import ( var debug = false // Regexp is the representation of a compiled regular expression. -// The public interface is entirely through methods. // A Regexp is safe for concurrent use by multiple goroutines. type Regexp struct { // read-only after Compile expr string // as passed to Compile prog *syntax.Prog // compiled program + onepass *onePassProg // onpass program or nil prefix string // required prefix in unanchored matches prefixBytes []byte // prefix, as a []byte prefixComplete bool // prefix is the entire regexp prefixRune rune // first rune in prefix + prefixEnd uint32 // pc for last rune in prefix cond syntax.EmptyOp // empty-width conditions required at start of match numSubexp int subexpNames []string @@ -156,12 +165,17 @@ func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { regexp := &Regexp{ expr: expr, prog: prog, + onepass: compileOnePass(prog), numSubexp: maxCap, subexpNames: capNames, cond: prog.StartCond(), longest: longest, } - regexp.prefix, regexp.prefixComplete = prog.Prefix() + if regexp.onepass == notOnePass { + regexp.prefix, regexp.prefixComplete = prog.Prefix() + } else { + regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) + } if regexp.prefix != "" { // TODO(rsc): Remove this allocation by adding // IndexString to package bytes. @@ -183,7 +197,7 @@ func (re *Regexp) get() *machine { return z } re.mu.Unlock() - z := progMachine(re.prog) + z := progMachine(re.prog, re.onepass) z.re = re return z } diff --git a/src/pkg/regexp/syntax/doc.go b/src/pkg/regexp/syntax/doc.go index e52632ef7..8e72c90d3 100644 --- a/src/pkg/regexp/syntax/doc.go +++ b/src/pkg/regexp/syntax/doc.go @@ -46,6 +46,10 @@ Repetitions: x{n,}? n or more x, prefer fewer x{n}? exactly n x +Implementation restriction: The counting forms x{n} etc. (but not the other +forms x* etc.) have an upper limit of n=1000. Negative or higher explicit +counts yield the parse error ErrInvalidRepeatSize. + Grouping: (re) numbered capturing group (submatch) (?P<name>re) named & numbered capturing group (submatch) diff --git a/src/pkg/regexp/syntax/make_perl_groups.pl b/src/pkg/regexp/syntax/make_perl_groups.pl index d024f5090..90040fcb4 100755 --- a/src/pkg/regexp/syntax/make_perl_groups.pl +++ b/src/pkg/regexp/syntax/make_perl_groups.pl @@ -92,6 +92,10 @@ sub PrintClasses($@) { } print <<EOF; +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + // GENERATED BY make_perl_groups.pl; DO NOT EDIT. // make_perl_groups.pl >perl_groups.go diff --git a/src/pkg/regexp/syntax/parse.go b/src/pkg/regexp/syntax/parse.go index 42d0bf4a1..cb25dca39 100644 --- a/src/pkg/regexp/syntax/parse.go +++ b/src/pkg/regexp/syntax/parse.go @@ -668,7 +668,6 @@ func Parse(s string, flags Flags) (*Regexp, error) { c rune op Op lastRepeat string - min, max int ) p.flags = flags p.wholeRegexp = s @@ -740,7 +739,7 @@ func Parse(s string, flags Flags) (*Regexp, error) { op = OpQuest } after := t[1:] - if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { + if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil { return nil, err } repeat = before diff --git a/src/pkg/regexp/syntax/parse_test.go b/src/pkg/regexp/syntax/parse_test.go index 269d6c3b8..f3089294c 100644 --- a/src/pkg/regexp/syntax/parse_test.go +++ b/src/pkg/regexp/syntax/parse_test.go @@ -100,12 +100,12 @@ var parseTests = []parseTest{ {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, - {`\pZ`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, - {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, diff --git a/src/pkg/regexp/syntax/perl_groups.go b/src/pkg/regexp/syntax/perl_groups.go index 1a11ca62f..effe4e686 100644 --- a/src/pkg/regexp/syntax/perl_groups.go +++ b/src/pkg/regexp/syntax/perl_groups.go @@ -1,3 +1,7 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + // GENERATED BY make_perl_groups.pl; DO NOT EDIT. // make_perl_groups.pl >perl_groups.go diff --git a/src/pkg/regexp/syntax/prog.go b/src/pkg/regexp/syntax/prog.go index a482a82f2..29bd282d0 100644 --- a/src/pkg/regexp/syntax/prog.go +++ b/src/pkg/regexp/syntax/prog.go @@ -37,6 +37,27 @@ const ( InstRuneAnyNotNL ) +var instOpNames = []string{ + "InstAlt", + "InstAltMatch", + "InstCapture", + "InstEmptyWidth", + "InstMatch", + "InstFail", + "InstNop", + "InstRune", + "InstRune1", + "InstRuneAny", + "InstRuneAnyNotNL", +} + +func (i InstOp) String() string { + if uint(i) >= uint(len(instOpNames)) { + return "" + } + return instOpNames[i] +} + // An EmptyOp specifies a kind or mixture of zero-width assertions. type EmptyOp uint8 @@ -103,13 +124,13 @@ func (p *Prog) String() string { // skipNop follows any no-op or capturing instructions // and returns the resulting pc. -func (p *Prog) skipNop(pc uint32) *Inst { +func (p *Prog) skipNop(pc uint32) (*Inst, uint32) { i := &p.Inst[pc] for i.Op == InstNop || i.Op == InstCapture { pc = i.Out i = &p.Inst[pc] } - return i + return i, pc } // op returns i.Op but merges all the Rune special cases into InstRune @@ -126,7 +147,7 @@ func (i *Inst) op() InstOp { // regexp must start with. Complete is true if the prefix // is the entire match. func (p *Prog) Prefix() (prefix string, complete bool) { - i := p.skipNop(uint32(p.Start)) + i, _ := p.skipNop(uint32(p.Start)) // Avoid allocation of buffer if prefix is empty. if i.op() != InstRune || len(i.Rune) != 1 { @@ -137,7 +158,7 @@ func (p *Prog) Prefix() (prefix string, complete bool) { var buf bytes.Buffer for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 { buf.WriteRune(i.Rune[0]) - i = p.skipNop(i.Out) + i, _ = p.skipNop(i.Out) } return buf.String(), i.Op == InstMatch } @@ -166,35 +187,46 @@ Loop: return flag } +const noMatch = -1 + // MatchRune returns true if the instruction matches (and consumes) r. // It should only be called when i.Op == InstRune. func (i *Inst) MatchRune(r rune) bool { + return i.MatchRunePos(r) != noMatch +} + +// MatchRunePos checks whether the instruction matches (and consumes) r. +// If so, MatchRunePos returns the index of the matching rune pair +// (or, when len(i.Rune) == 1, rune singleton). +// If not, MatchRunePos returns -1. +// MatchRunePos should only be called when i.Op == InstRune. +func (i *Inst) MatchRunePos(r rune) int { rune := i.Rune // Special case: single-rune slice is from literal string, not char class. if len(rune) == 1 { r0 := rune[0] if r == r0 { - return true + return 0 } if Flags(i.Arg)&FoldCase != 0 { for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { if r == r1 { - return true + return 0 } } } - return false + return noMatch } // Peek at the first few pairs. // Should handle ASCII well. for j := 0; j < len(rune) && j <= 8; j += 2 { if r < rune[j] { - return false + return noMatch } if r <= rune[j+1] { - return true + return j / 2 } } @@ -205,14 +237,14 @@ func (i *Inst) MatchRune(r rune) bool { m := lo + (hi-lo)/2 if c := rune[2*m]; c <= r { if r <= rune[2*m+1] { - return true + return m } lo = m + 1 } else { hi = m } } - return false + return noMatch } // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char. diff --git a/src/pkg/regexp/syntax/prog_test.go b/src/pkg/regexp/syntax/prog_test.go index cd71abc2a..50bfa3d4b 100644 --- a/src/pkg/regexp/syntax/prog_test.go +++ b/src/pkg/regexp/syntax/prog_test.go @@ -4,9 +4,7 @@ package syntax -import ( - "testing" -) +import "testing" var compileTests = []struct { Regexp string |