diff options
author | Ondřej Surý <ondrej@sury.org> | 2011-01-17 12:40:45 +0100 |
---|---|---|
committer | Ondřej Surý <ondrej@sury.org> | 2011-01-17 12:40:45 +0100 |
commit | 3e45412327a2654a77944249962b3652e6142299 (patch) | |
tree | bc3bf69452afa055423cbe0c5cfa8ca357df6ccf /src/pkg/regexp | |
parent | c533680039762cacbc37db8dc7eed074c3e497be (diff) | |
download | golang-3e45412327a2654a77944249962b3652e6142299.tar.gz |
Imported Upstream version 2011.01.12upstream/2011.01.12
Diffstat (limited to 'src/pkg/regexp')
-rw-r--r-- | src/pkg/regexp/Makefile | 2 | ||||
-rw-r--r-- | src/pkg/regexp/all_test.go | 479 | ||||
-rw-r--r-- | src/pkg/regexp/find_test.go | 459 | ||||
-rw-r--r-- | src/pkg/regexp/regexp.go | 950 |
4 files changed, 1145 insertions, 745 deletions
diff --git a/src/pkg/regexp/Makefile b/src/pkg/regexp/Makefile index 9f91c8e7a..9024e66da 100644 --- a/src/pkg/regexp/Makefile +++ b/src/pkg/regexp/Makefile @@ -2,7 +2,7 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -include ../../Make.$(GOARCH) +include ../../Make.inc TARG=regexp GOFILES=\ diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go index 4bdd6c67e..3b2c489bc 100644 --- a/src/pkg/regexp/all_test.go +++ b/src/pkg/regexp/all_test.go @@ -37,80 +37,18 @@ type stringError struct { } var bad_re = []stringError{ - stringError{`*`, ErrBareClosure}, - stringError{`(abc`, ErrUnmatchedLpar}, - stringError{`abc)`, ErrUnmatchedRpar}, - stringError{`x[a-z`, ErrUnmatchedLbkt}, - stringError{`abc]`, ErrUnmatchedRbkt}, - stringError{`[z-a]`, ErrBadRange}, - stringError{`abc\`, ErrExtraneousBackslash}, - stringError{`a**`, ErrBadClosure}, - stringError{`a*+`, ErrBadClosure}, - stringError{`a??`, ErrBadClosure}, - stringError{`*`, ErrBareClosure}, - stringError{`\x`, ErrBadBackslash}, -} - -type vec []int - -type tester struct { - re string - text string - match vec -} - -var matches = []tester{ - tester{`^abcdefg`, "abcdefg", vec{0, 7}}, - tester{`a+`, "baaab", vec{1, 4}}, - tester{"abcd..", "abcdef", vec{0, 6}}, - tester{``, "", vec{0, 0}}, - tester{`a`, "a", vec{0, 1}}, - tester{`x`, "y", vec{}}, - tester{`b`, "abc", vec{1, 2}}, - tester{`.`, "a", vec{0, 1}}, - tester{`.*`, "abcdef", vec{0, 6}}, - tester{`^`, "abcde", vec{0, 0}}, - tester{`$`, "abcde", vec{5, 5}}, - tester{`^abcd$`, "abcd", vec{0, 4}}, - tester{`^bcd'`, "abcdef", vec{}}, - tester{`^abcd$`, "abcde", vec{}}, - tester{`a+`, "baaab", vec{1, 4}}, - tester{`a*`, "baaab", vec{0, 0}}, - tester{`[a-z]+`, "abcd", vec{0, 4}}, - tester{`[^a-z]+`, "ab1234cd", vec{2, 6}}, - tester{`[a\-\]z]+`, "az]-bcz", vec{0, 4}}, - tester{`[^\n]+`, "abcd\n", vec{0, 4}}, - tester{`[日本語]+`, "日本語日本語", vec{0, 18}}, - tester{`日本語+`, "日本語", vec{0, 9}}, - tester{`日本語+`, "日本語語語語", vec{0, 18}}, - tester{`()`, "", vec{0, 0, 0, 0}}, - tester{`(a)`, "a", vec{0, 1, 0, 1}}, - tester{`(.)(.)`, "日a", vec{0, 4, 0, 3, 3, 4}}, - tester{`(.*)`, "", vec{0, 0, 0, 0}}, - tester{`(.*)`, "abcd", vec{0, 4, 0, 4}}, - tester{`(..)(..)`, "abcd", vec{0, 4, 0, 2, 2, 4}}, - tester{`(([^xyz]*)(d))`, "abcd", vec{0, 4, 0, 4, 0, 3, 3, 4}}, - tester{`((a|b|c)*(d))`, "abcd", vec{0, 4, 0, 4, 2, 3, 3, 4}}, - tester{`(((a|b|c)*)(d))`, "abcd", vec{0, 4, 0, 4, 0, 3, 2, 3, 3, 4}}, - tester{`a*(|(b))c*`, "aacc", vec{0, 4, 2, 2, -1, -1}}, - tester{`(.*).*`, "ab", vec{0, 2, 0, 2}}, - tester{`[.]`, ".", vec{0, 1}}, - tester{`/$`, "/abc/", vec{4, 5}}, - tester{`/$`, "/abc", vec{}}, - - // fixed bugs - tester{`ab$`, "cab", vec{1, 3}}, - tester{`axxb$`, "axxcb", vec{}}, - tester{`data`, "daXY data", vec{5, 9}}, - tester{`da(.)a$`, "daXY data", vec{5, 9, 7, 8}}, - - // can backslash-escape any punctuation - tester{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, - `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, vec{0, 31}}, - tester{`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, - `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, vec{0, 31}}, - tester{"\\`", "`", vec{0, 1}}, - tester{"[\\`]+", "`", vec{0, 1}}, + {`*`, ErrBareClosure}, + {`(abc`, ErrUnmatchedLpar}, + {`abc)`, ErrUnmatchedRpar}, + {`x[a-z`, ErrUnmatchedLbkt}, + {`abc]`, ErrUnmatchedRbkt}, + {`[z-a]`, ErrBadRange}, + {`abc\`, ErrExtraneousBackslash}, + {`a**`, ErrBadClosure}, + {`a*+`, ErrBadClosure}, + {`a??`, ErrBadClosure}, + {`*`, ErrBareClosure}, + {`\x`, ErrBadBackslash}, } func compileTest(t *testing.T, expr string, error os.Error) *Regexp { @@ -121,66 +59,6 @@ func compileTest(t *testing.T, expr string, error os.Error) *Regexp { return re } -func printVec(t *testing.T, m []int) { - l := len(m) - if l == 0 { - t.Log("\t<no match>") - } else { - if m[len(m)-1] == -1 { - m = m[0 : len(m)-2] - } - t.Log("\t", m) - } -} - -func equal(m1, m2 []int) bool { - l := len(m1) - if l != len(m2) { - return false - } - for i := 0; i < l; i++ { - if m1[i] != m2[i] { - return false - } - } - return true -} - -func equalStrings(m1, m2 []string) bool { - l := len(m1) - if l != len(m2) { - return false - } - for i := 0; i < l; i++ { - if m1[i] != m2[i] { - return false - } - } - return true -} - -func executeTest(t *testing.T, expr string, str string, match []int) { - re := compileTest(t, expr, nil) - if re == nil { - return - } - m := re.ExecuteString(str) - if !equal(m, match) { - t.Errorf("ExecuteString failure on %#q matching %q:", expr, str) - printVec(t, m) - t.Log("should be:") - printVec(t, match) - } - // now try bytes - m = re.Execute([]byte(str)) - if !equal(m, match) { - t.Errorf("Execute failure on %#q matching %q:", expr, str) - printVec(t, m) - t.Log("should be:") - printVec(t, match) - } -} - func TestGoodCompile(t *testing.T) { for i := 0; i < len(good_re); i++ { compileTest(t, good_re[i], nil) @@ -193,57 +71,41 @@ func TestBadCompile(t *testing.T) { } } -func TestExecute(t *testing.T) { - for i := 0; i < len(matches); i++ { - test := &matches[i] - executeTest(t, test.re, test.text, test.match) - } -} - -func matchTest(t *testing.T, expr string, str string, match []int) { - re := compileTest(t, expr, nil) +func matchTest(t *testing.T, test *FindTest) { + re := compileTest(t, test.pat, nil) if re == nil { return } - m := re.MatchString(str) - if m != (len(match) > 0) { - t.Errorf("MatchString failure on %#q matching %q: %t should be %t", expr, str, m, len(match) > 0) + m := re.MatchString(test.text) + if m != (len(test.matches) > 0) { + t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0) } // now try bytes - m = re.Match([]byte(str)) - if m != (len(match) > 0) { - t.Errorf("Match failure on %#q matching %q: %t should be %t", expr, str, m, len(match) > 0) + m = re.Match([]byte(test.text)) + if m != (len(test.matches) > 0) { + t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) } } func TestMatch(t *testing.T) { - for i := 0; i < len(matches); i++ { - test := &matches[i] - matchTest(t, test.re, test.text, test.match) + for _, test := range findTests { + matchTest(t, &test) } } -func TestMatchStrings(t *testing.T) { - for i := 0; i < len(matches); i++ { - test := &matches[i] - matchTest(t, test.re, test.text, test.match) - } -} - -func matchFunctionTest(t *testing.T, expr string, str string, match []int) { - m, err := MatchString(expr, str) +func matchFunctionTest(t *testing.T, test *FindTest) { + m, err := MatchString(test.pat, test.text) if err == nil { return } - if m != (len(match) > 0) { - t.Errorf("Match failure on %#q matching %q: %d should be %d", expr, str, m, len(match) > 0) + if m != (len(test.matches) > 0) { + t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) } } func TestMatchFunction(t *testing.T) { - for i := 0; i < len(matches); i++ { - test := &matches[i] - matchFunctionTest(t, test.re, test.text, test.match) + for _, test := range findTests { + matchFunctionTest(t, &test) } } @@ -253,64 +115,64 @@ type ReplaceTest struct { var replaceTests = []ReplaceTest{ // Test empty input and/or replacement, with pattern that matches the empty string. - ReplaceTest{"", "", "", ""}, - ReplaceTest{"", "x", "", "x"}, - ReplaceTest{"", "", "abc", "abc"}, - ReplaceTest{"", "x", "abc", "xaxbxcx"}, + {"", "", "", ""}, + {"", "x", "", "x"}, + {"", "", "abc", "abc"}, + {"", "x", "abc", "xaxbxcx"}, // Test empty input and/or replacement, with pattern that does not match the empty string. - ReplaceTest{"b", "", "", ""}, - ReplaceTest{"b", "x", "", ""}, - ReplaceTest{"b", "", "abc", "ac"}, - ReplaceTest{"b", "x", "abc", "axc"}, - ReplaceTest{"y", "", "", ""}, - ReplaceTest{"y", "x", "", ""}, - ReplaceTest{"y", "", "abc", "abc"}, - ReplaceTest{"y", "x", "abc", "abc"}, + {"b", "", "", ""}, + {"b", "x", "", ""}, + {"b", "", "abc", "ac"}, + {"b", "x", "abc", "axc"}, + {"y", "", "", ""}, + {"y", "x", "", ""}, + {"y", "", "abc", "abc"}, + {"y", "x", "abc", "abc"}, // Multibyte characters -- verify that we don't try to match in the middle // of a character. - ReplaceTest{"[a-c]*", "x", "\u65e5", "x\u65e5x"}, - ReplaceTest{"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"}, + {"[a-c]*", "x", "\u65e5", "x\u65e5x"}, + {"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"}, // Start and end of a string. - ReplaceTest{"^[a-c]*", "x", "abcdabc", "xdabc"}, - ReplaceTest{"[a-c]*$", "x", "abcdabc", "abcdx"}, - ReplaceTest{"^[a-c]*$", "x", "abcdabc", "abcdabc"}, - ReplaceTest{"^[a-c]*", "x", "abc", "x"}, - ReplaceTest{"[a-c]*$", "x", "abc", "x"}, - ReplaceTest{"^[a-c]*$", "x", "abc", "x"}, - ReplaceTest{"^[a-c]*", "x", "dabce", "xdabce"}, - ReplaceTest{"[a-c]*$", "x", "dabce", "dabcex"}, - ReplaceTest{"^[a-c]*$", "x", "dabce", "dabce"}, - ReplaceTest{"^[a-c]*", "x", "", "x"}, - ReplaceTest{"[a-c]*$", "x", "", "x"}, - ReplaceTest{"^[a-c]*$", "x", "", "x"}, - - ReplaceTest{"^[a-c]+", "x", "abcdabc", "xdabc"}, - ReplaceTest{"[a-c]+$", "x", "abcdabc", "abcdx"}, - ReplaceTest{"^[a-c]+$", "x", "abcdabc", "abcdabc"}, - ReplaceTest{"^[a-c]+", "x", "abc", "x"}, - ReplaceTest{"[a-c]+$", "x", "abc", "x"}, - ReplaceTest{"^[a-c]+$", "x", "abc", "x"}, - ReplaceTest{"^[a-c]+", "x", "dabce", "dabce"}, - ReplaceTest{"[a-c]+$", "x", "dabce", "dabce"}, - ReplaceTest{"^[a-c]+$", "x", "dabce", "dabce"}, - ReplaceTest{"^[a-c]+", "x", "", ""}, - ReplaceTest{"[a-c]+$", "x", "", ""}, - ReplaceTest{"^[a-c]+$", "x", "", ""}, + {"^[a-c]*", "x", "abcdabc", "xdabc"}, + {"[a-c]*$", "x", "abcdabc", "abcdx"}, + {"^[a-c]*$", "x", "abcdabc", "abcdabc"}, + {"^[a-c]*", "x", "abc", "x"}, + {"[a-c]*$", "x", "abc", "x"}, + {"^[a-c]*$", "x", "abc", "x"}, + {"^[a-c]*", "x", "dabce", "xdabce"}, + {"[a-c]*$", "x", "dabce", "dabcex"}, + {"^[a-c]*$", "x", "dabce", "dabce"}, + {"^[a-c]*", "x", "", "x"}, + {"[a-c]*$", "x", "", "x"}, + {"^[a-c]*$", "x", "", "x"}, + + {"^[a-c]+", "x", "abcdabc", "xdabc"}, + {"[a-c]+$", "x", "abcdabc", "abcdx"}, + {"^[a-c]+$", "x", "abcdabc", "abcdabc"}, + {"^[a-c]+", "x", "abc", "x"}, + {"[a-c]+$", "x", "abc", "x"}, + {"^[a-c]+$", "x", "abc", "x"}, + {"^[a-c]+", "x", "dabce", "dabce"}, + {"[a-c]+$", "x", "dabce", "dabce"}, + {"^[a-c]+$", "x", "dabce", "dabce"}, + {"^[a-c]+", "x", "", ""}, + {"[a-c]+$", "x", "", ""}, + {"^[a-c]+$", "x", "", ""}, // Other cases. - ReplaceTest{"abc", "def", "abcdefg", "defdefg"}, - ReplaceTest{"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"}, - ReplaceTest{"abc", "", "abcdabc", "d"}, - ReplaceTest{"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"}, - ReplaceTest{"abc", "d", "", ""}, - ReplaceTest{"abc", "d", "abc", "d"}, - ReplaceTest{".+", "x", "abc", "x"}, - ReplaceTest{"[a-c]*", "x", "def", "xdxexfx"}, - ReplaceTest{"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, - ReplaceTest{"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"}, + {"abc", "def", "abcdefg", "defdefg"}, + {"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"}, + {"abc", "", "abcdabc", "d"}, + {"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"}, + {"abc", "d", "", ""}, + {"abc", "d", "abc", "d"}, + {".+", "x", "abc", "x"}, + {"[a-c]*", "x", "def", "xdxexfx"}, + {"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, + {"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"}, } type ReplaceFuncTest struct { @@ -320,9 +182,9 @@ type ReplaceFuncTest struct { } var replaceFuncTests = []ReplaceFuncTest{ - ReplaceFuncTest{"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"}, - ReplaceFuncTest{"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"}, - ReplaceFuncTest{"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"}, + {"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"}, + {"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"}, + {"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"}, } func TestReplaceAll(t *testing.T) { @@ -367,18 +229,21 @@ func TestReplaceAllFunc(t *testing.T) { } } -type QuoteMetaTest struct { - pattern, output string +type MetaTest struct { + pattern, output, literal string + isLiteral bool } -var quoteMetaTests = []QuoteMetaTest{ - QuoteMetaTest{``, ``}, - QuoteMetaTest{`foo`, `foo`}, - QuoteMetaTest{`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[{\]}\\\|,<\.>/\?~`}, +var metaTests = []MetaTest{ + {``, ``, ``, true}, + {`foo`, `foo`, `foo`, true}, + {`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator + {`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators + {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[{\]}\\\|,<\.>/\?~`, `!@#`, false}, } func TestQuoteMeta(t *testing.T) { - for _, tc := range quoteMetaTests { + for _, tc := range metaTests { // Verify that QuoteMeta returns the expected string. quoted := QuoteMeta(tc.pattern) if quoted != tc.output { @@ -407,96 +272,16 @@ func TestQuoteMeta(t *testing.T) { } } -type matchCase struct { - matchfunc string - input string - n int - regexp string - expected []string -} - -var matchCases = []matchCase{ - matchCase{"match", " aa b", 0, "[^ ]+", []string{"aa", "b"}}, - matchCase{"match", " aa b", 0, "[^ ]*", []string{"", "aa", "b"}}, - matchCase{"match", "a b c", 0, "[^ ]*", []string{"a", "b", "c"}}, - matchCase{"match", "a:a: a:", 0, "^.:", []string{"a:"}}, - matchCase{"match", "", 0, "[^ ]*", []string{""}}, - matchCase{"match", "", 0, "", []string{""}}, - matchCase{"match", "a", 0, "", []string{"", ""}}, - matchCase{"match", "ab", 0, "^", []string{""}}, - matchCase{"match", "ab", 0, "$", []string{""}}, - matchCase{"match", "ab", 0, "X*", []string{"", "", ""}}, - matchCase{"match", "aX", 0, "X*", []string{"", "X"}}, - matchCase{"match", "XabX", 0, "X*", []string{"X", "", "X"}}, - - matchCase{"matchit", "", 0, ".", []string{}}, - matchCase{"matchit", "abc", 2, ".", []string{"a", "b"}}, - matchCase{"matchit", "abc", 0, ".", []string{"a", "b", "c"}}, -} - -func printStringSlice(t *testing.T, s []string) { - t.Logf("%#v", s) -} - -func TestAllMatches(t *testing.T) { - ch := make(chan matchCase) - go func() { - for _, c := range matchCases { - ch <- c - stringCase := matchCase{ - "string" + c.matchfunc, - c.input, - c.n, - c.regexp, - c.expected, - } - ch <- stringCase - } - close(ch) - }() - - for c := range ch { - var result []string - re, _ := Compile(c.regexp) - - switch c.matchfunc { - case "matchit": - result = make([]string, len(c.input)+1) - i := 0 - b := []byte(c.input) - for match := range re.AllMatchesIter(b, c.n) { - result[i] = string(match) - i++ - } - result = result[0:i] - case "stringmatchit": - result = make([]string, len(c.input)+1) - i := 0 - for match := range re.AllMatchesStringIter(c.input, c.n) { - result[i] = match - i++ - } - result = result[0:i] - case "match": - result = make([]string, len(c.input)+1) - b := []byte(c.input) - i := 0 - for _, match := range re.AllMatches(b, c.n) { - result[i] = string(match) - i++ - } - result = result[0:i] - case "stringmatch": - result = re.AllMatchesString(c.input, c.n) +func TestLiteralPrefix(t *testing.T) { + for _, tc := range metaTests { + // Literal method needs to scan the pattern. + re := MustCompile(tc.pattern) + str, complete := re.LiteralPrefix() + if complete != tc.isLiteral { + t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral) } - - if !equalStrings(result, c.expected) { - t.Errorf("testing '%s'.%s('%s', %d), expected: ", - c.regexp, c.matchfunc, c.input, c.n) - printStringSlice(t, c.expected) - t.Log("got: ") - printStringSlice(t, result) - t.Log("\n") + if str != tc.literal { + t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal) } } } @@ -507,16 +292,16 @@ type numSubexpCase struct { } var numSubexpCases = []numSubexpCase{ - numSubexpCase{``, 0}, - numSubexpCase{`.*`, 0}, - numSubexpCase{`abba`, 0}, - numSubexpCase{`ab(b)a`, 1}, - numSubexpCase{`ab(.*)a`, 1}, - numSubexpCase{`(.*)ab(.*)a`, 2}, - numSubexpCase{`(.*)(ab)(.*)a`, 3}, - numSubexpCase{`(.*)((a)b)(.*)a`, 4}, - numSubexpCase{`(.*)(\(ab)(.*)a`, 3}, - numSubexpCase{`(.*)(\(a\)b)(.*)a`, 3}, + {``, 0}, + {`.*`, 0}, + {`abba`, 0}, + {`ab(b)a`, 1}, + {`ab(.*)a`, 1}, + {`(.*)ab(.*)a`, 2}, + {`(.*)(ab)(.*)a`, 3}, + {`(.*)((a)b)(.*)a`, 4}, + {`(.*)(\(ab)(.*)a`, 3}, + {`(.*)(\(a\)b)(.*)a`, 3}, } func TestNumSubexp(t *testing.T) { @@ -592,3 +377,49 @@ func BenchmarkReplaceAll(b *testing.B) { re.ReplaceAllString(x, "") } } + +func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) + } + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredShortMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLongMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) + } + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} diff --git a/src/pkg/regexp/find_test.go b/src/pkg/regexp/find_test.go new file mode 100644 index 000000000..1690711dd --- /dev/null +++ b/src/pkg/regexp/find_test.go @@ -0,0 +1,459 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "fmt" + "testing" +) + +// For each pattern/text pair, what is the expected output of each function? +// We can derive the textual results from the indexed results, the non-submatch +// results from the submatched results, the single results from the 'all' results, +// and the byte results from the string results. Therefore the table includes +// only the FindAllStringSubmatchIndex result. +type FindTest struct { + pat string + text string + matches [][]int +} + +func (t FindTest) String() string { + return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text) +} + +var findTests = []FindTest{ + {``, ``, build(1, 0, 0)}, + {`^abcdefg`, "abcdefg", build(1, 0, 7)}, + {`a+`, "baaab", build(1, 1, 4)}, + {"abcd..", "abcdef", build(1, 0, 6)}, + {`a`, "a", build(1, 0, 1)}, + {`x`, "y", nil}, + {`b`, "abc", build(1, 1, 2)}, + {`.`, "a", build(1, 0, 1)}, + {`.*`, "abcdef", build(1, 0, 6)}, + {`^`, "abcde", build(1, 0, 0)}, + {`$`, "abcde", build(1, 5, 5)}, + {`^abcd$`, "abcd", build(1, 0, 4)}, + {`^bcd'`, "abcdef", nil}, + {`^abcd$`, "abcde", nil}, + {`a+`, "baaab", build(1, 1, 4)}, + {`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)}, + {`[a-z]+`, "abcd", build(1, 0, 4)}, + {`[^a-z]+`, "ab1234cd", build(1, 2, 6)}, + {`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)}, + {`[^\n]+`, "abcd\n", build(1, 0, 4)}, + {`[日本語]+`, "日本語日本語", build(1, 0, 18)}, + {`日本語+`, "日本語", build(1, 0, 9)}, + {`日本語+`, "日本語語語語", build(1, 0, 18)}, + {`()`, "", build(1, 0, 0, 0, 0)}, + {`(a)`, "a", build(1, 0, 1, 0, 1)}, + {`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)}, + {`(.*)`, "", build(1, 0, 0, 0, 0)}, + {`(.*)`, "abcd", build(1, 0, 4, 0, 4)}, + {`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)}, + {`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)}, + {`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)}, + {`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)}, + {`\a\b\f\n\r\t\v`, "\a\b\f\n\r\t\v", build(1, 0, 7)}, + {`[\a\b\f\n\r\t\v]+`, "\a\b\f\n\r\t\v", build(1, 0, 7)}, + + {`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)}, + {`(.*).*`, "ab", build(1, 0, 2, 0, 2)}, + {`[.]`, ".", build(1, 0, 1)}, + {`/$`, "/abc/", build(1, 4, 5)}, + {`/$`, "/abc", nil}, + + // multiple matches + {`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)}, + {`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)}, + {`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)}, + {`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)}, + {`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)}, + + // fixed bugs + {`ab$`, "cab", build(1, 1, 3)}, + {`axxb$`, "axxcb", nil}, + {`data`, "daXY data", build(1, 5, 9)}, + {`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)}, + {`zx+`, "zzx", build(1, 1, 3)}, + + // can backslash-escape any punctuation + {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, + `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, + {`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, + `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, + {"\\`", "`", build(1, 0, 1)}, + {"[\\`]+", "`", build(1, 0, 1)}, + + // long set of matches (longer than startSize) + { + ".", + "qwertyuiopasdfghjklzxcvbnm1234567890", + build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, + 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, + 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36), + }, +} + +// build is a helper to construct a [][]int by extracting n sequences from x. +// This represents n matches with len(x)/n submatches each. +func build(n int, x ...int) [][]int { + ret := make([][]int, n) + runLength := len(x) / n + j := 0 + for i := range ret { + ret[i] = make([]int, runLength) + copy(ret[i], x[j:]) + j += runLength + if j > len(x) { + panic("invalid build entry") + } + } + return ret +} + +// First the simple cases. + +func TestFind(t *testing.T) { + for _, test := range findTests { + re := MustCompile(test.pat) + if re.String() != test.pat { + t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat) + } + result := re.Find([]byte(test.text)) + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + expect := test.text[test.matches[0][0]:test.matches[0][1]] + if expect != string(result) { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } +} + +func TestFindString(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindString(test.text) + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != "": + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == "": + // Tricky because an empty result has two meanings: no match or empty match. + if test.matches[0][0] != test.matches[0][1] { + t.Errorf("expected match; got none: %s", test) + } + case test.matches != nil && result != "": + expect := test.text[test.matches[0][0]:test.matches[0][1]] + if expect != result { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } +} + +func testFindIndex(test *FindTest, result []int, t *testing.T) { + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + expect := test.matches[0] + if expect[0] != result[0] || expect[1] != result[1] { + t.Errorf("expected %v got %v: %s", expect, result, test) + } + } +} + +func TestFindIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t) + } +} + +func TestFindStringIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t) + } +} + +// Now come the simple All cases. + +func TestFindAll(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAll([]byte(test.text), -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + continue + } + for k, e := range test.matches { + expect := test.text[e[0]:e[1]] + if expect != string(result[k]) { + t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test) + } + } + } + } +} + +func TestFindAllString(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllString(test.text, -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + continue + } + for k, e := range test.matches { + expect := test.text[e[0]:e[1]] + if expect != result[k] { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } + } +} + +func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + return + } + for k, e := range test.matches { + if e[0] != result[k][0] || e[1] != result[k][1] { + t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test) + } + } + } +} + +func TestFindAllIndex(t *testing.T) { + for _, test := range findTests { + testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t) + } +} + +func TestFindAllStringIndex(t *testing.T) { + for _, test := range findTests { + testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t) + } +} + +// Now come the Submatch cases. + +func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) { + if len(submatches) != len(result)*2 { + t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + return + } + for k := 0; k < len(submatches); k += 2 { + if submatches[k] == -1 { + if result[k/2] != nil { + t.Errorf("match %d: expected nil got %q: %s", n, result, test) + } + continue + } + expect := test.text[submatches[k]:submatches[k+1]] + if expect != string(result[k/2]) { + t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) + return + } + } +} + +func TestFindSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindSubmatch([]byte(test.text)) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchBytes(&test, 0, test.matches[0], result, t) + } + } +} + +func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) { + if len(submatches) != len(result)*2 { + t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + return + } + for k := 0; k < len(submatches); k += 2 { + if submatches[k] == -1 { + if result[k/2] != "" { + t.Errorf("match %d: expected nil got %q: %s", n, result, test) + } + continue + } + expect := test.text[submatches[k]:submatches[k+1]] + if expect != result[k/2] { + t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) + return + } + } +} + +func TestFindStringSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindStringSubmatch(test.text) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchString(&test, 0, test.matches[0], result, t) + } + } +} + +func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) { + if len(expect) != len(result) { + t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test) + return + } + for k, e := range expect { + if e != result[k] { + t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test) + } + } +} + +func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchIndices(test, 0, test.matches[0], result, t) + } +} + +func TestFindSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t) + } +} + +func TestFindStringSubmatchndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t) + } +} + +// Now come the monster AllSubmatch cases. + +func TestFindAllSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchBytes(&test, k, match, result[k], t) + } + } + } +} + +func TestFindAllStringSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchString(&test, k, match, result[k], t) + } + } + } +} + +func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchIndices(test, k, match, result[k], t) + } + } +} + +func TestFindAllSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t) + } +} + +func TestFindAllStringSubmatchndex(t *testing.T) { + for _, test := range findTests { + testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t) + } +} diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index 4dd430ea6..2e03b798a 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -16,14 +16,50 @@ // '$' // '.' // character -// '[' [ '^' ] character-ranges ']' +// '[' [ '^' ] { character-range } ']' // '(' regexp ')' +// character-range: +// character [ '-' character ] +// +// All characters are UTF-8-encoded code points. Backslashes escape special +// characters, including inside character classes. The standard Go character +// escapes are also recognized: \a \b \f \n \r \t \v. +// +// There are 16 methods of Regexp that match a regular expression and identify +// the matched text. Their names are matched by this regular expression: +// +// Find(All)?(String)?(Submatch)?(Index)? +// +// If 'All' is present, the routine matches successive non-overlapping +// matches of the entire expression. Empty matches abutting a preceding +// match are ignored. The return value is a slice containing the successive +// return values of the corresponding non-'All' routine. These routines take +// an extra integer argument, n; if n >= 0, the function returns at most n +// matches/submatches. +// +// If 'String' is present, the argument is a string; otherwise it is a slice +// of bytes; return values are adjusted as appropriate. +// +// If 'Submatch' is present, the return value is a slice identifying the +// successive submatches of the expression. Submatches are matches of +// parenthesized subexpressions within the regular expression, numbered from +// left to right in order of opening parenthesis. Submatch 0 is the match of +// the entire expression, submatch 1 the match of the first parenthesized +// subexpression, and so on. +// +// If 'Index' is present, matches and submatches are identified by byte index +// pairs within the input string: result[2*n:2*n+1] identifies the indexes of +// the nth submatch. The pair for n==0 identifies the match of the entire +// expression. If 'Index' is not present, the match is identified by the +// text of the match/submatch. If an index is negative, it means that +// subexpression did not match any string in the input. +// +// (There are a few other methods that do not match this pattern.) // package regexp import ( "bytes" - "container/vector" "io" "os" "strings" @@ -53,121 +89,90 @@ var ( ErrBadBackslash = Error("illegal backslash escape") ) -// An instruction executed by the NFA -type instr interface { - kind() int // the type of this instruction: _CHAR, _ANY, etc. - next() instr // the instruction to execute after this one - setNext(i instr) - index() int - setIndex(i int) - print() -} +const ( + iStart = iota // beginning of program + iEnd // end of program: success + iBOT // '^' beginning of text + iEOT // '$' end of text + iChar // 'a' regular character + iCharClass // [a-z] character class + iAny // '.' any character including newline + iNotNL // [^\n] special case: any character but newline + iBra // '(' parenthesized expression: 2*braNum for left, 2*braNum+1 for right + iAlt // '|' alternation + iNop // do nothing; makes it easy to link without patching +) -// Fields and methods common to all instructions -type common struct { - _next instr - _index int +// An instruction executed by the NFA +type instr struct { + kind int // the type of this instruction: iChar, iAny, etc. + index int // used only in debugging; could be eliminated + next *instr // the instruction to execute after this one + // Special fields valid only for some items. + char int // iChar + braNum int // iBra, iEbra + cclass *charClass // iCharClass + left *instr // iAlt, other branch +} + +func (i *instr) print() { + switch i.kind { + case iStart: + print("start") + case iEnd: + print("end") + case iBOT: + print("bot") + case iEOT: + print("eot") + case iChar: + print("char ", string(i.char)) + case iCharClass: + i.cclass.print() + case iAny: + print("any") + case iNotNL: + print("notnl") + case iBra: + if i.braNum&1 == 0 { + print("bra", i.braNum/2) + } else { + print("ebra", i.braNum/2) + } + case iAlt: + print("alt(", i.left.index, ")") + case iNop: + print("nop") + } } -func (c *common) next() instr { return c._next } -func (c *common) setNext(i instr) { c._next = i } -func (c *common) index() int { return c._index } -func (c *common) setIndex(i int) { c._index = i } - // Regexp is the representation of a compiled regular expression. // The public interface is entirely through methods. type Regexp struct { expr string // the original expression prefix string // initial plain text string prefixBytes []byte // initial plain text bytes - inst *vector.Vector - start instr // first instruction of machine - prefixStart instr // where to start if there is a prefix - nbra int // number of brackets in expression, for subexpressions -} - -const ( - _START = iota // beginning of program - _END // end of program: success - _BOT // '^' beginning of text - _EOT // '$' end of text - _CHAR // 'a' regular character - _CHARCLASS // [a-z] character class - _ANY // '.' any character including newline - _NOTNL // [^\n] special case: any character but newline - _BRA // '(' parenthesized expression - _EBRA // ')'; end of '(' parenthesized expression - _ALT // '|' alternation - _NOP // do nothing; makes it easy to link without patching -) - -// --- START start of program -type _Start struct { - common + inst []*instr + start *instr // first instruction of machine + prefixStart *instr // where to start if there is a prefix + nbra int // number of brackets in expression, for subexpressions } -func (start *_Start) kind() int { return _START } -func (start *_Start) print() { print("start") } - -// --- END end of program -type _End struct { - common -} - -func (end *_End) kind() int { return _END } -func (end *_End) print() { print("end") } - -// --- BOT beginning of text -type _Bot struct { - common -} - -func (bot *_Bot) kind() int { return _BOT } -func (bot *_Bot) print() { print("bot") } - -// --- EOT end of text -type _Eot struct { - common -} - -func (eot *_Eot) kind() int { return _EOT } -func (eot *_Eot) print() { print("eot") } - -// --- CHAR a regular character -type _Char struct { - common - char int -} - -func (char *_Char) kind() int { return _CHAR } -func (char *_Char) print() { print("char ", string(char.char)) } - -func newChar(char int) *_Char { - c := new(_Char) - c.char = char - return c -} - -// --- CHARCLASS [a-z] - -type _CharClass struct { - common +type charClass struct { negate bool // is character class negated? ([^a-z]) - // vector of int, stored pairwise: [a-z] is (a,z); x is (x,x): - ranges *vector.IntVector + // slice of int, stored pairwise: [a-z] is (a,z); x is (x,x): + ranges []int cmin, cmax int } -func (cclass *_CharClass) kind() int { return _CHARCLASS } - -func (cclass *_CharClass) print() { +func (cclass *charClass) print() { print("charclass") if cclass.negate { print(" (negated)") } - for i := 0; i < cclass.ranges.Len(); i += 2 { - l := cclass.ranges.At(i) - r := cclass.ranges.At(i + 1) + for i := 0; i < len(cclass.ranges); i += 2 { + l := cclass.ranges[i] + r := cclass.ranges[i+1] if l == r { print(" [", string(l), "]") } else { @@ -176,10 +181,9 @@ func (cclass *_CharClass) print() { } } -func (cclass *_CharClass) addRange(a, b int) { +func (cclass *charClass) addRange(a, b int) { // range is a through b inclusive - cclass.ranges.Push(a) - cclass.ranges.Push(b) + cclass.ranges = append(cclass.ranges, a, b) if a < cclass.cmin { cclass.cmin = a } @@ -188,11 +192,11 @@ func (cclass *_CharClass) addRange(a, b int) { } } -func (cclass *_CharClass) matches(c int) bool { +func (cclass *charClass) matches(c int) bool { if c < cclass.cmin || c > cclass.cmax { return cclass.negate } - ranges := []int(*cclass.ranges) + ranges := cclass.ranges for i := 0; i < len(ranges); i = i + 2 { if ranges[i] <= c && c <= ranges[i+1] { return !cclass.negate @@ -201,68 +205,18 @@ func (cclass *_CharClass) matches(c int) bool { return cclass.negate } -func newCharClass() *_CharClass { - c := new(_CharClass) - c.ranges = new(vector.IntVector) - c.cmin = 0x10FFFF + 1 // MaxRune + 1 - c.cmax = -1 - return c -} - -// --- ANY any character -type _Any struct { - common -} - -func (any *_Any) kind() int { return _ANY } -func (any *_Any) print() { print("any") } - -// --- NOTNL any character but newline -type _NotNl struct { - common -} - -func (notnl *_NotNl) kind() int { return _NOTNL } -func (notnl *_NotNl) print() { print("notnl") } - -// --- BRA parenthesized expression -type _Bra struct { - common - n int // subexpression number -} - -func (bra *_Bra) kind() int { return _BRA } -func (bra *_Bra) print() { print("bra", bra.n) } - -// --- EBRA end of parenthesized expression -type _Ebra struct { - common - n int // subexpression number -} - -func (ebra *_Ebra) kind() int { return _EBRA } -func (ebra *_Ebra) print() { print("ebra ", ebra.n) } - -// --- ALT alternation -type _Alt struct { - common - left instr // other branch -} - -func (alt *_Alt) kind() int { return _ALT } -func (alt *_Alt) print() { print("alt(", alt.left.index(), ")") } - -// --- NOP no operation -type _Nop struct { - common +func newCharClass() *instr { + i := &instr{kind: iCharClass} + i.cclass = new(charClass) + i.cclass.ranges = make([]int, 0, 4) + i.cclass.cmin = 0x10FFFF + 1 // MaxRune + 1 + i.cclass.cmax = -1 + return i } -func (nop *_Nop) kind() int { return _NOP } -func (nop *_Nop) print() { print("nop") } - -func (re *Regexp) add(i instr) instr { - i.setIndex(re.inst.Len()) - re.inst.Push(i) +func (re *Regexp) add(i *instr) *instr { + i.index = len(re.inst) + re.inst = append(re.inst, i) return i } @@ -317,8 +271,21 @@ func ispunct(c int) bool { return false } -func (p *parser) charClass() instr { - cc := newCharClass() +var escapes = []byte("abfnrtv") +var escaped = []byte("\a\b\f\n\r\t\v") + +func escape(c int) int { + for i, b := range escapes { + if int(b) == c { + return i + } + } + return -1 +} + +func (p *parser) charClass() *instr { + i := newCharClass() + cc := i.cclass if p.c() == '^' { cc.negate = true p.nextc() @@ -331,20 +298,20 @@ func (p *parser) charClass() instr { p.error(ErrBadRange) } // Is it [^\n]? - if cc.negate && cc.ranges.Len() == 2 && - cc.ranges.At(0) == '\n' && cc.ranges.At(1) == '\n' { - nl := new(_NotNl) + if cc.negate && len(cc.ranges) == 2 && + cc.ranges[0] == '\n' && cc.ranges[1] == '\n' { + nl := &instr{kind: iNotNL} p.re.add(nl) return nl } // Special common case: "[a]" -> "a" - if !cc.negate && cc.ranges.Len() == 2 && cc.ranges.At(0) == cc.ranges.At(1) { - c := newChar(cc.ranges.At(0)) + if !cc.negate && len(cc.ranges) == 2 && cc.ranges[0] == cc.ranges[1] { + c := &instr{kind: iChar, char: cc.ranges[0]} p.re.add(c) return c } - p.re.add(cc) - return cc + p.re.add(i) + return i case '-': // do this before backslash processing p.error(ErrBadRange) case '\\': @@ -352,10 +319,10 @@ func (p *parser) charClass() instr { switch { case c == endOfFile: p.error(ErrExtraneousBackslash) - case c == 'n': - c = '\n' case ispunct(c): // c is as delivered + case escape(c) >= 0: + c = int(escaped[escape(c)]) default: p.error(ErrBadBackslash) } @@ -381,7 +348,7 @@ func (p *parser) charClass() instr { return nil } -func (p *parser) term() (start, end instr) { +func (p *parser) term() (start, end *instr) { switch c := p.c(); c { case '|', endOfFile: return nil, nil @@ -396,15 +363,15 @@ func (p *parser) term() (start, end instr) { p.error(ErrUnmatchedRbkt) case '^': p.nextc() - start = p.re.add(new(_Bot)) + start = p.re.add(&instr{kind: iBOT}) return start, start case '$': p.nextc() - start = p.re.add(new(_Eot)) + start = p.re.add(&instr{kind: iEOT}) return start, start case '.': p.nextc() - start = p.re.add(new(_Any)) + start = p.re.add(&instr{kind: iAny}) return start, start case '[': p.nextc() @@ -425,12 +392,10 @@ func (p *parser) term() (start, end instr) { } p.nlpar-- p.nextc() - bra := new(_Bra) + bra := &instr{kind: iBra, braNum: 2 * nbra} p.re.add(bra) - ebra := new(_Ebra) + ebra := &instr{kind: iBra, braNum: 2*nbra + 1} p.re.add(ebra) - bra.n = nbra - ebra.n = nbra if start == nil { if end == nil { p.error(ErrInternal) @@ -438,33 +403,33 @@ func (p *parser) term() (start, end instr) { } start = ebra } else { - end.setNext(ebra) + end.next = ebra } - bra.setNext(start) + bra.next = start return bra, ebra case '\\': c = p.nextc() switch { case c == endOfFile: p.error(ErrExtraneousBackslash) - case c == 'n': - c = '\n' case ispunct(c): // c is as delivered + case escape(c) >= 0: + c = int(escaped[escape(c)]) default: p.error(ErrBadBackslash) } fallthrough default: p.nextc() - start = newChar(c) + start = &instr{kind: iChar, char: c} p.re.add(start) return start, start } panic("unreachable") } -func (p *parser) closure() (start, end instr) { +func (p *parser) closure() (start, end *instr) { start, end = p.term() if start == nil { return @@ -472,28 +437,28 @@ func (p *parser) closure() (start, end instr) { switch p.c() { case '*': // (start,end)*: - alt := new(_Alt) + alt := &instr{kind: iAlt} p.re.add(alt) - end.setNext(alt) // after end, do alt + end.next = alt // after end, do alt alt.left = start // alternate brach: return to start start = alt // alt becomes new (start, end) end = alt case '+': // (start,end)+: - alt := new(_Alt) + alt := &instr{kind: iAlt} p.re.add(alt) - end.setNext(alt) // after end, do alt + end.next = alt // after end, do alt alt.left = start // alternate brach: return to start end = alt // start is unchanged; end is alt case '?': // (start,end)?: - alt := new(_Alt) + alt := &instr{kind: iAlt} p.re.add(alt) - nop := new(_Nop) + nop := &instr{kind: iNop} p.re.add(nop) alt.left = start // alternate branch is start - alt.setNext(nop) // follow on to nop - end.setNext(nop) // after end, go to nop + alt.next = nop // follow on to nop + end.next = nop // after end, go to nop start = alt // start is now alt end = nop // end is nop pointed to by both branches default: @@ -506,27 +471,27 @@ func (p *parser) closure() (start, end instr) { return } -func (p *parser) concatenation() (start, end instr) { +func (p *parser) concatenation() (start, end *instr) { for { nstart, nend := p.closure() switch { case nstart == nil: // end of this concatenation if start == nil { // this is the empty string - nop := p.re.add(new(_Nop)) + nop := p.re.add(&instr{kind: iNop}) return nop, nop } return case start == nil: // this is first element of concatenation start, end = nstart, nend default: - end.setNext(nstart) + end.next = nstart end = nend } } panic("unreachable") } -func (p *parser) regexp() (start, end instr) { +func (p *parser) regexp() (start, end *instr) { start, end = p.concatenation() for { switch p.c() { @@ -535,49 +500,46 @@ func (p *parser) regexp() (start, end instr) { case '|': p.nextc() nstart, nend := p.concatenation() - alt := new(_Alt) + alt := &instr{kind: iAlt} p.re.add(alt) alt.left = start - alt.setNext(nstart) - nop := new(_Nop) + alt.next = nstart + nop := &instr{kind: iNop} p.re.add(nop) - end.setNext(nop) - nend.setNext(nop) + end.next = nop + nend.next = nop start, end = alt, nop } } panic("unreachable") } -func unNop(i instr) instr { - for i.kind() == _NOP { - i = i.next() +func unNop(i *instr) *instr { + for i.kind == iNop { + i = i.next } return i } func (re *Regexp) eliminateNops() { - for i := 0; i < re.inst.Len(); i++ { - inst := re.inst.At(i).(instr) - if inst.kind() == _END { + for _, inst := range re.inst { + if inst.kind == iEnd { continue } - inst.setNext(unNop(inst.next())) - if inst.kind() == _ALT { - alt := inst.(*_Alt) - alt.left = unNop(alt.left) + inst.next = unNop(inst.next) + if inst.kind == iAlt { + inst.left = unNop(inst.left) } } } func (re *Regexp) dump() { print("prefix <", re.prefix, ">\n") - for i := 0; i < re.inst.Len(); i++ { - inst := re.inst.At(i).(instr) - print(inst.index(), ": ") + for _, inst := range re.inst { + print(inst.index, ": ") inst.print() - if inst.kind() != _END { - print(" -> ", inst.next().index()) + if inst.kind != iEnd { + print(" -> ", inst.next.index) } print("\n") } @@ -585,12 +547,12 @@ func (re *Regexp) dump() { func (re *Regexp) doParse() { p := newParser(re) - start := new(_Start) + start := &instr{kind: iStart} re.add(start) s, e := p.regexp() - start.setNext(s) + start.next = s re.start = start - e.setNext(re.add(new(_End))) + e.next = re.add(&instr{kind: iEnd}) if debug { re.dump() @@ -609,36 +571,44 @@ func (re *Regexp) doParse() { } } -// Extract regular text from the beginning of the pattern. +// Extract regular text from the beginning of the pattern, +// possibly after a leading iBOT. // That text can be used by doExecute to speed up matching. func (re *Regexp) setPrefix() { var b []byte var utf = make([]byte, utf8.UTFMax) - // First instruction is start; skip that. - i := re.inst.At(0).(instr).next().index() + var inst *instr + // First instruction is start; skip that. Also skip any initial iBOT. + inst = re.inst[0].next + for inst.kind == iBOT { + inst = inst.next + } Loop: - for i < re.inst.Len() { - inst := re.inst.At(i).(instr) + for ; inst.kind != iEnd; inst = inst.next { // stop if this is not a char - if inst.kind() != _CHAR { + if inst.kind != iChar { break } // stop if this char can be followed by a match for an empty string, // which includes closures, ^, and $. - switch re.inst.At(inst.next().index()).(instr).kind() { - case _BOT, _EOT, _ALT: + switch inst.next.kind { + case iBOT, iEOT, iAlt: break Loop } - n := utf8.EncodeRune(inst.(*_Char).char, utf) - b = bytes.Add(b, utf[0:n]) - i = inst.next().index() + n := utf8.EncodeRune(utf, inst.char) + b = append(b, utf[0:n]...) } // point prefixStart instruction to first non-CHAR after prefix - re.prefixStart = re.inst.At(i).(instr) + re.prefixStart = inst re.prefixBytes = b re.prefix = string(b) } +// String returns the source text used to compile the regular expression. +func (re *Regexp) String() string { + return re.expr +} + // Compile parses a regular expression and returns, if successful, a Regexp // object that can be used to match against text. func Compile(str string) (regexp *Regexp, error os.Error) { @@ -651,7 +621,7 @@ func Compile(str string) (regexp *Regexp, error os.Error) { } }() regexp.expr = str - regexp.inst = new(vector.Vector) + regexp.inst = make([]*instr, 0, 10) regexp.doParse() return } @@ -727,60 +697,45 @@ func (a *matchArena) noMatch() *matchVec { } type state struct { - inst instr // next instruction to execute - prefixed bool // this match began with a fixed prefix + inst *instr // next instruction to execute + prefixed bool // this match began with a fixed prefix match *matchVec } // Append new state to to-do list. Leftmost-longest wins so avoid // adding a state that's already active. The matchVec will be inc-ref'ed // if it is assigned to a state. -func (a *matchArena) addState(s []state, inst instr, prefixed bool, match *matchVec, pos, end int) []state { - switch inst.kind() { - case _BOT: +func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec, pos, end int) []state { + switch inst.kind { + case iBOT: if pos == 0 { - s = a.addState(s, inst.next(), prefixed, match, pos, end) + s = a.addState(s, inst.next, prefixed, match, pos, end) } return s - case _EOT: + case iEOT: if pos == end { - s = a.addState(s, inst.next(), prefixed, match, pos, end) + s = a.addState(s, inst.next, prefixed, match, pos, end) } return s - case _BRA: - n := inst.(*_Bra).n - match.m[2*n] = pos - s = a.addState(s, inst.next(), prefixed, match, pos, end) - return s - case _EBRA: - n := inst.(*_Ebra).n - match.m[2*n+1] = pos - s = a.addState(s, inst.next(), prefixed, match, pos, end) + case iBra: + match.m[inst.braNum] = pos + s = a.addState(s, inst.next, prefixed, match, pos, end) return s } - index := inst.index() l := len(s) // States are inserted in order so it's sufficient to see if we have the same // instruction; no need to see if existing match is earlier (it is). for i := 0; i < l; i++ { - if s[i].inst.index() == index { + if s[i].inst == inst { return s } } - if l == cap(s) { - s1 := make([]state, 2*l)[0:l] - copy(s1, s) - s = s1 - } - s = s[0 : l+1] - s[l].inst = inst - s[l].prefixed = prefixed - s[l].match = match + s = append(s, state{inst, prefixed, match}) match.ref++ - if inst.kind() == _ALT { - s = a.addState(s, inst.(*_Alt).left, prefixed, a.copy(match), pos, end) + if inst.kind == iAlt { + s = a.addState(s, inst.left, prefixed, a.copy(match), pos, end) // give other branch a copy of this match vector - s = a.addState(s, inst.next(), prefixed, a.copy(match), pos, end) + s = a.addState(s, inst.next, prefixed, a.copy(match), pos, end) } return s } @@ -789,8 +744,8 @@ func (a *matchArena) addState(s []state, inst instr, prefixed bool, match *match // If bytes == nil, scan str. func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { var s [2][]state - s[0] = make([]state, 10)[0:0] - s[1] = make([]state, 10)[0:0] + s[0] = make([]state, 0, 10) + s[1] = make([]state, 0, 10) in, out := 0, 1 var final state found := false @@ -798,34 +753,46 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { if bytestr != nil { end = len(bytestr) } + anchored := re.inst[0].next.kind == iBOT + if anchored && pos > 0 { + return nil + } // fast check for initial plain substring - prefixed := false // has this iteration begun by skipping a prefix? if re.prefix != "" { - var advance int - if bytestr == nil { - advance = strings.Index(str[pos:], re.prefix) + advance := 0 + if anchored { + if bytestr == nil { + if !strings.HasPrefix(str, re.prefix) { + return nil + } + } else { + if !bytes.HasPrefix(bytestr, re.prefixBytes) { + return nil + } + } } else { - advance = bytes.Index(bytestr[pos:], re.prefixBytes) + if bytestr == nil { + advance = strings.Index(str[pos:], re.prefix) + } else { + advance = bytes.Index(bytestr[pos:], re.prefixBytes) + } } if advance == -1 { - return []int{} + return nil } - pos += advance + len(re.prefix) - prefixed = true + pos += advance } arena := &matchArena{nil, 2 * (re.nbra + 1)} - for pos <= end { - if !found { + for startPos := pos; pos <= end; { + if !found && (pos == startPos || !anchored) { // prime the pump if we haven't seen a match yet match := arena.noMatch() match.m[0] = pos - if prefixed { - s[out] = arena.addState(s[out], re.prefixStart, true, match, pos, end) - prefixed = false // next iteration should start at beginning of machine. - } else { - s[out] = arena.addState(s[out], re.start.next(), false, match, pos, end) - } + s[out] = arena.addState(s[out], re.start.next, false, match, pos, end) arena.free(match) // if addState saved it, ref was incremented + } else if len(s[out]) == 0 { + // machine has completed + break } in, out = out, in // old out state is new in state // clear out old state @@ -834,10 +801,6 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { arena.free(state.match) } s[out] = old[0:0] // truncate state vector - if found && len(s[in]) == 0 { - // machine has completed - break - } charwidth := 1 c := endOfFile if pos < end { @@ -849,29 +812,28 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { } pos += charwidth for _, st := range s[in] { - switch st.inst.kind() { - case _BOT: - case _EOT: - case _CHAR: - if c == st.inst.(*_Char).char { - s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) + switch st.inst.kind { + case iBOT: + case iEOT: + case iChar: + if c == st.inst.char { + s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) } - case _CHARCLASS: - if st.inst.(*_CharClass).matches(c) { - s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) + case iCharClass: + if st.inst.cclass.matches(c) { + s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) } - case _ANY: + case iAny: if c != endOfFile { - s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) + s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) } - case _NOTNL: + case iNotNL: if c != endOfFile && c != '\n' { - s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) + s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) } - case _BRA: - case _EBRA: - case _ALT: - case _END: + case iBra: + case iAlt: + case iEnd: // choose leftmost longest if !found || // first st.match.m[0] < final.match.m[0] || // leftmost @@ -900,77 +862,33 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { return final.match.m } - -// ExecuteString matches the Regexp against the string s. -// The return value is an array of integers, in pairs, identifying the positions of -// substrings matched by the expression. -// s[a[0]:a[1]] is the substring matched by the entire expression. -// s[a[2*i]:a[2*i+1]] for i > 0 is the substring matched by the ith parenthesized subexpression. -// A negative value means the subexpression did not match any element of the string. -// An empty array means "no match". -func (re *Regexp) ExecuteString(s string) (a []int) { - return re.doExecute(s, nil, 0) +// LiteralPrefix returns a literal string that must begin any match +// of the regular expression re. It returns the boolean true if the +// literal string comprises the entire regular expression. +func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { + c := make([]int, len(re.inst)-2) // minus start and end. + // First instruction is start; skip that. + i := 0 + for inst := re.inst[0].next; inst.kind != iEnd; inst = inst.next { + // stop if this is not a char + if inst.kind != iChar { + return string(c[:i]), false + } + c[i] = inst.char + i++ + } + return string(c[:i]), true } - -// Execute matches the Regexp against the byte slice b. -// The return value is an array of integers, in pairs, identifying the positions of -// subslices matched by the expression. -// b[a[0]:a[1]] is the subslice matched by the entire expression. -// b[a[2*i]:a[2*i+1]] for i > 0 is the subslice matched by the ith parenthesized subexpression. -// A negative value means the subexpression did not match any element of the slice. -// An empty array means "no match". -func (re *Regexp) Execute(b []byte) (a []int) { return re.doExecute("", b, 0) } - - // MatchString returns whether the Regexp matches the string s. // The return value is a boolean: true for match, false for no match. func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 } - // Match returns whether the Regexp matches the byte slice b. // The return value is a boolean: true for match, false for no match. func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 } -// MatchStrings matches the Regexp against the string s. -// The return value is an array of strings matched by the expression. -// a[0] is the substring matched by the entire expression. -// a[i] for i > 0 is the substring matched by the ith parenthesized subexpression. -// An empty array means ``no match''. -func (re *Regexp) MatchStrings(s string) (a []string) { - r := re.doExecute(s, nil, 0) - if r == nil { - return nil - } - a = make([]string, len(r)/2) - for i := 0; i < len(r); i += 2 { - if r[i] != -1 { // -1 means no match for this subexpression - a[i/2] = s[r[i]:r[i+1]] - } - } - return -} - -// MatchSlices matches the Regexp against the byte slice b. -// The return value is an array of subslices matched by the expression. -// a[0] is the subslice matched by the entire expression. -// a[i] for i > 0 is the subslice matched by the ith parenthesized subexpression. -// An empty array means ``no match''. -func (re *Regexp) MatchSlices(b []byte) (a [][]byte) { - r := re.doExecute("", b, 0) - if r == nil { - return nil - } - a = make([][]byte, len(r)/2) - for i := 0; i < len(r); i += 2 { - if r[i] != -1 { // -1 means no match for this subexpression - a[i/2] = b[r[i]:r[i+1]] - } - } - return -} - // MatchString checks whether a textual regular expression // matches a string. More complicated queries need // to use Compile and the full Regexp interface. @@ -1117,7 +1035,7 @@ func QuoteMeta(s string) string { } // Find matches in slice b if b is non-nil, otherwise find matches in string s. -func (re *Regexp) allMatches(s string, b []byte, n int, deliver func(int, int)) { +func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { var end int if b == nil { end = len(s) @@ -1156,78 +1074,270 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func(int, int)) prevMatchEnd = matches[1] if accept { - deliver(matches[0], matches[1]) + deliver(matches) i++ } } } -// AllMatches slices the byte slice b into substrings that are successive -// matches of the Regexp within b. If n > 0, the function returns at most n -// matches. Text that does not match the expression will be skipped. Empty -// matches abutting a preceding match are ignored. The function returns a slice -// containing the matching substrings. -func (re *Regexp) AllMatches(b []byte, n int) [][]byte { - if n <= 0 { +// Find returns a slice holding the text of the leftmost match in b of the regular expression. +// A return value of nil indicates no match. +func (re *Regexp) Find(b []byte) []byte { + a := re.doExecute("", b, 0) + if a == nil { + return nil + } + return b[a[0]:a[1]] +} + +// FindIndex returns a two-element slice of integers defining the location of +// the leftmost match in b of the regular expression. The match itself is at +// b[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindIndex(b []byte) (loc []int) { + a := re.doExecute("", b, 0) + if a == nil { + return nil + } + return a[0:2] +} + +// FindString returns a string holding the text of the leftmost match in s of the regular +// expression. If there is no match, the return value is an empty string, +// but it will also be empty if the regular expression successfully matches +// an empty string. Use FindStringIndex or FindStringSubmatch if it is +// necessary to distinguish these cases. +func (re *Regexp) FindString(s string) string { + a := re.doExecute(s, nil, 0) + if a == nil { + return "" + } + return s[a[0]:a[1]] +} + +// FindStringIndex returns a two-element slice of integers defining the +// location of the leftmost match in s of the regular expression. The match +// itself is at s[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindStringIndex(s string) []int { + a := re.doExecute(s, nil, 0) + if a == nil { + return nil + } + return a[0:2] +} + +// FindSubmatch returns a slice of slices holding the text of the leftmost +// match of the regular expression in b and the matches, if any, of its +// subexpressions, as defined by the 'Submatch' descriptions in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatch(b []byte) [][]byte { + a := re.doExecute("", b, 0) + if a == nil { + return nil + } + ret := make([][]byte, len(a)/2) + for i := range ret { + if a[2*i] >= 0 { + ret[i] = b[a[2*i]:a[2*i+1]] + } + } + return ret +} + +// FindSubmatchIndex returns a slice holding the index pairs identifying the +// leftmost match of the regular expression in b and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatchIndex(b []byte) []int { + return re.doExecute("", b, 0) +} + +// FindStringSubmatch returns a slice of strings holding the text of the +// leftmost match of the regular expression in s and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatch(s string) []string { + a := re.doExecute(s, nil, 0) + if a == nil { + return nil + } + ret := make([]string, len(a)/2) + for i := range ret { + if a[2*i] >= 0 { + ret[i] = s[a[2*i]:a[2*i+1]] + } + } + return ret +} + +// FindStringSubmatchIndex returns a slice holding the index pairs +// identifying the leftmost match of the regular expression in s and the +// matches, if any, of its subexpressions, as defined by the 'Submatch' and +// 'Index' descriptions in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatchIndex(s string) []int { + return re.doExecute(s, nil, 0) +} + +const startSize = 10 // The size at which to start a slice in the 'All' routines. + +// FindAll is the 'All' version of Find; it returns a slice of all successive +// matches of the expression, as defined by the 'All' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAll(b []byte, n int) [][]byte { + if n < 0 { n = len(b) + 1 } - result := make([][]byte, n) - i := 0 - re.allMatches("", b, n, func(start, end int) { - result[i] = b[start:end] - i++ + result := make([][]byte, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, b[match[0]:match[1]]) }) - return result[0:i] + if len(result) == 0 { + return nil + } + return result +} + +// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { + if n < 0 { + n = len(b) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, match[0:2]) + }) + if len(result) == 0 { + return nil + } + return result } -// AllMatchesString slices the string s into substrings that are successive -// matches of the Regexp within s. If n > 0, the function returns at most n -// matches. Text that does not match the expression will be skipped. Empty -// matches abutting a preceding match are ignored. The function returns a slice -// containing the matching substrings. -func (re *Regexp) AllMatchesString(s string, n int) []string { - if n <= 0 { +// FindAllString is the 'All' version of FindString; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllString(s string, n int) []string { + if n < 0 { n = len(s) + 1 } - result := make([]string, n) - i := 0 - re.allMatches(s, nil, n, func(start, end int) { - result[i] = s[start:end] - i++ + result := make([]string, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, s[match[0]:match[1]]) }) - return result[0:i] + if len(result) == 0 { + return nil + } + return result } -// AllMatchesIter slices the byte slice b into substrings that are successive -// matches of the Regexp within b. If n > 0, the function returns at most n -// matches. Text that does not match the expression will be skipped. Empty -// matches abutting a preceding match are ignored. The function returns a -// channel that iterates over the matching substrings. -func (re *Regexp) AllMatchesIter(b []byte, n int) <-chan []byte { - if n <= 0 { +// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a +// slice of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { + if n < 0 { + n = len(s) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, match[0:2]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice +// of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { + if n < 0 { n = len(b) + 1 } - c := make(chan []byte, 10) - go func() { - re.allMatches("", b, n, func(start, end int) { c <- b[start:end] }) - close(c) - }() - return c + result := make([][][]byte, 0, startSize) + re.allMatches("", b, n, func(match []int) { + slice := make([][]byte, len(match)/2) + for j := range slice { + if match[2*j] >= 0 { + slice[j] = b[match[2*j]:match[2*j+1]] + } + } + result = append(result, slice) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns +// a slice of all successive matches of the expression, as defined by the +// 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { + if n < 0 { + n = len(b) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, match) + }) + if len(result) == 0 { + return nil + } + return result } -// AllMatchesStringIter slices the string s into substrings that are successive -// matches of the Regexp within s. If n > 0, the function returns at most n -// matches. Text that does not match the expression will be skipped. Empty -// matches abutting a preceding match are ignored. The function returns a -// channel that iterates over the matching substrings. -func (re *Regexp) AllMatchesStringIter(s string, n int) <-chan string { - if n <= 0 { +// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it +// returns a slice of all successive matches of the expression, as defined by +// the 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { + if n < 0 { n = len(s) + 1 } - c := make(chan string, 10) - go func() { - re.allMatches(s, nil, n, func(start, end int) { c <- s[start:end] }) - close(c) - }() - return c + result := make([][]string, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + slice := make([]string, len(match)/2) + for j := range slice { + if match[2*j] >= 0 { + slice[j] = s[match[2*j]:match[2*j+1]] + } + } + result = append(result, slice) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllStringSubmatchIndex is the 'All' version of +// FindStringSubmatchIndex; it returns a slice of all successive matches of +// the expression, as defined by the 'All' description in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { + if n < 0 { + n = len(s) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, match) + }) + if len(result) == 0 { + return nil + } + return result } |