diff options
author | Tianon Gravi <admwiggin@gmail.com> | 2015-01-15 11:54:00 -0700 |
---|---|---|
committer | Tianon Gravi <admwiggin@gmail.com> | 2015-01-15 11:54:00 -0700 |
commit | f154da9e12608589e8d5f0508f908a0c3e88a1bb (patch) | |
tree | f8255d51e10c6f1e0ed69702200b966c9556a431 /src/pkg/regexp/exec_test.go | |
parent | 8d8329ed5dfb9622c82a9fbec6fd99a580f9c9f6 (diff) | |
download | golang-upstream/1.4.tar.gz |
Imported Upstream version 1.4upstream/1.4
Diffstat (limited to 'src/pkg/regexp/exec_test.go')
-rw-r--r-- | src/pkg/regexp/exec_test.go | 715 |
1 files changed, 0 insertions, 715 deletions
diff --git a/src/pkg/regexp/exec_test.go b/src/pkg/regexp/exec_test.go deleted file mode 100644 index 70d069c06..000000000 --- a/src/pkg/regexp/exec_test.go +++ /dev/null @@ -1,715 +0,0 @@ -// Copyright 2010 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package regexp - -import ( - "bufio" - "compress/bzip2" - "fmt" - "io" - "os" - "path/filepath" - "regexp/syntax" - "strconv" - "strings" - "testing" - "unicode/utf8" -) - -// TestRE2 tests this package's regexp API against test cases -// considered during RE2's exhaustive tests, which run all possible -// regexps over a given set of atoms and operators, up to a given -// complexity, over all possible strings over a given alphabet, -// up to a given size. Rather than try to link with RE2, we read a -// log file containing the test cases and the expected matches. -// The log file, re2.txt, is generated by running 'make exhaustive-log' -// in the open source RE2 distribution. http://code.google.com/p/re2/ -// -// The test file format is a sequence of stanzas like: -// -// strings -// "abc" -// "123x" -// regexps -// "[a-z]+" -// 0-3;0-3 -// -;- -// "([0-9])([0-9])([0-9])" -// -;- -// -;0-3 0-1 1-2 2-3 -// -// The stanza begins by defining a set of strings, quoted -// using Go double-quote syntax, one per line. Then the -// regexps section gives a sequence of regexps to run on -// the strings. In the block that follows a regexp, each line -// gives the semicolon-separated match results of running -// the regexp on the corresponding string. -// Each match result is either a single -, meaning no match, or a -// space-separated sequence of pairs giving the match and -// submatch indices. An unmatched subexpression formats -// its pair as a single - (not illustrated above). For now -// each regexp run produces two match results, one for a -// ``full match'' that restricts the regexp to matching the entire -// string or nothing, and one for a ``partial match'' that gives -// the leftmost first match found in the string. -// -// Lines beginning with # are comments. Lines beginning with -// a capital letter are test names printed during RE2's test suite -// and are echoed into t but otherwise ignored. -// -// At time of writing, re2.txt is 32 MB but compresses to 760 kB, -// so we store re2.txt.gz in the repository and decompress it on the fly. -// -func TestRE2Search(t *testing.T) { - testRE2(t, "testdata/re2-search.txt") -} - -func testRE2(t *testing.T, file string) { - f, err := os.Open(file) - if err != nil { - t.Fatal(err) - } - defer f.Close() - var txt io.Reader - if strings.HasSuffix(file, ".bz2") { - z := bzip2.NewReader(f) - txt = z - file = file[:len(file)-len(".bz2")] // for error messages - } else { - txt = f - } - lineno := 0 - scanner := bufio.NewScanner(txt) - var ( - str []string - input []string - inStrings bool - re *Regexp - refull *Regexp - nfail int - ncase int - ) - for lineno := 1; scanner.Scan(); lineno++ { - line := scanner.Text() - switch { - case line == "": - t.Fatalf("%s:%d: unexpected blank line", file, lineno) - case line[0] == '#': - continue - case 'A' <= line[0] && line[0] <= 'Z': - // Test name. - t.Logf("%s\n", line) - continue - case line == "strings": - str = str[:0] - inStrings = true - case line == "regexps": - inStrings = false - case line[0] == '"': - q, err := strconv.Unquote(line) - if err != nil { - // Fatal because we'll get out of sync. - t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err) - } - if inStrings { - str = append(str, q) - continue - } - // Is a regexp. - if len(input) != 0 { - t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q) - } - re, err = tryCompile(q) - if err != nil { - if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" { - // We don't and likely never will support \C; keep going. - continue - } - t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err) - if nfail++; nfail >= 100 { - t.Fatalf("stopping after %d errors", nfail) - } - continue - } - full := `\A(?:` + q + `)\z` - refull, err = tryCompile(full) - if err != nil { - // Fatal because q worked, so this should always work. - t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err) - } - input = str - case line[0] == '-' || '0' <= line[0] && line[0] <= '9': - // A sequence of match results. - ncase++ - if re == nil { - // Failed to compile: skip results. - continue - } - if len(input) == 0 { - t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno) - } - var text string - text, input = input[0], input[1:] - if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) { - // RE2's \B considers every byte position, - // so it sees 'not word boundary' in the - // middle of UTF-8 sequences. This package - // only considers the positions between runes, - // so it disagrees. Skip those cases. - continue - } - res := strings.Split(line, ";") - if len(res) != len(run) { - t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run)) - } - for i := range res { - have, suffix := run[i](re, refull, text) - want := parseResult(t, file, lineno, res[i]) - if !same(have, want) { - t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want) - if nfail++; nfail >= 100 { - t.Fatalf("stopping after %d errors", nfail) - } - continue - } - b, suffix := match[i](re, refull, text) - if b != (want != nil) { - t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b) - if nfail++; nfail >= 100 { - t.Fatalf("stopping after %d errors", nfail) - } - continue - } - } - - default: - t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line) - } - } - if err := scanner.Err(); err != nil { - t.Fatalf("%s:%d: %v", file, lineno, err) - } - if len(input) != 0 { - t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input)) - } - t.Logf("%d cases tested", ncase) -} - -var run = []func(*Regexp, *Regexp, string) ([]int, string){ - runFull, - runPartial, - runFullLongest, - runPartialLongest, -} - -func runFull(re, refull *Regexp, text string) ([]int, string) { - refull.longest = false - return refull.FindStringSubmatchIndex(text), "[full]" -} - -func runPartial(re, refull *Regexp, text string) ([]int, string) { - re.longest = false - return re.FindStringSubmatchIndex(text), "" -} - -func runFullLongest(re, refull *Regexp, text string) ([]int, string) { - refull.longest = true - return refull.FindStringSubmatchIndex(text), "[full,longest]" -} - -func runPartialLongest(re, refull *Regexp, text string) ([]int, string) { - re.longest = true - return re.FindStringSubmatchIndex(text), "[longest]" -} - -var match = []func(*Regexp, *Regexp, string) (bool, string){ - matchFull, - matchPartial, - matchFullLongest, - matchPartialLongest, -} - -func matchFull(re, refull *Regexp, text string) (bool, string) { - refull.longest = false - return refull.MatchString(text), "[full]" -} - -func matchPartial(re, refull *Regexp, text string) (bool, string) { - re.longest = false - return re.MatchString(text), "" -} - -func matchFullLongest(re, refull *Regexp, text string) (bool, string) { - refull.longest = true - return refull.MatchString(text), "[full,longest]" -} - -func matchPartialLongest(re, refull *Regexp, text string) (bool, string) { - re.longest = true - return re.MatchString(text), "[longest]" -} - -func isSingleBytes(s string) bool { - for _, c := range s { - if c >= utf8.RuneSelf { - return false - } - } - return true -} - -func tryCompile(s string) (re *Regexp, err error) { - // Protect against panic during Compile. - defer func() { - if r := recover(); r != nil { - err = fmt.Errorf("panic: %v", r) - } - }() - return Compile(s) -} - -func parseResult(t *testing.T, file string, lineno int, res string) []int { - // A single - indicates no match. - if res == "-" { - return nil - } - // Otherwise, a space-separated list of pairs. - n := 1 - for j := 0; j < len(res); j++ { - if res[j] == ' ' { - n++ - } - } - out := make([]int, 2*n) - i := 0 - n = 0 - for j := 0; j <= len(res); j++ { - if j == len(res) || res[j] == ' ' { - // Process a single pair. - means no submatch. - pair := res[i:j] - if pair == "-" { - out[n] = -1 - out[n+1] = -1 - } else { - k := strings.Index(pair, "-") - if k < 0 { - t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair) - } - lo, err1 := strconv.Atoi(pair[:k]) - hi, err2 := strconv.Atoi(pair[k+1:]) - if err1 != nil || err2 != nil || lo > hi { - t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair) - } - out[n] = lo - out[n+1] = hi - } - n += 2 - i = j + 1 - } - } - return out -} - -func same(x, y []int) bool { - if len(x) != len(y) { - return false - } - for i, xi := range x { - if xi != y[i] { - return false - } - } - return true -} - -// TestFowler runs this package's regexp API against the -// POSIX regular expression tests collected by Glenn Fowler -// at http://www2.research.att.com/~gsf/testregex/. -func TestFowler(t *testing.T) { - files, err := filepath.Glob("testdata/*.dat") - if err != nil { - t.Fatal(err) - } - for _, file := range files { - t.Log(file) - testFowler(t, file) - } -} - -var notab = MustCompilePOSIX(`[^\t]+`) - -func testFowler(t *testing.T, file string) { - f, err := os.Open(file) - if err != nil { - t.Error(err) - return - } - defer f.Close() - b := bufio.NewReader(f) - lineno := 0 - lastRegexp := "" -Reading: - for { - lineno++ - line, err := b.ReadString('\n') - if err != nil { - if err != io.EOF { - t.Errorf("%s:%d: %v", file, lineno, err) - } - break Reading - } - - // http://www2.research.att.com/~gsf/man/man1/testregex.html - // - // INPUT FORMAT - // Input lines may be blank, a comment beginning with #, or a test - // specification. A specification is five fields separated by one - // or more tabs. NULL denotes the empty string and NIL denotes the - // 0 pointer. - if line[0] == '#' || line[0] == '\n' { - continue Reading - } - line = line[:len(line)-1] - field := notab.FindAllString(line, -1) - for i, f := range field { - if f == "NULL" { - field[i] = "" - } - if f == "NIL" { - t.Logf("%s:%d: skip: %s", file, lineno, line) - continue Reading - } - } - if len(field) == 0 { - continue Reading - } - - // Field 1: the regex(3) flags to apply, one character per REG_feature - // flag. The test is skipped if REG_feature is not supported by the - // implementation. If the first character is not [BEASKLP] then the - // specification is a global control line. One or more of [BEASKLP] may be - // specified; the test will be repeated for each mode. - // - // B basic BRE (grep, ed, sed) - // E REG_EXTENDED ERE (egrep) - // A REG_AUGMENTED ARE (egrep with negation) - // S REG_SHELL SRE (sh glob) - // K REG_SHELL|REG_AUGMENTED KRE (ksh glob) - // L REG_LITERAL LRE (fgrep) - // - // a REG_LEFT|REG_RIGHT implicit ^...$ - // b REG_NOTBOL lhs does not match ^ - // c REG_COMMENT ignore space and #...\n - // d REG_SHELL_DOT explicit leading . match - // e REG_NOTEOL rhs does not match $ - // f REG_MULTIPLE multiple \n separated patterns - // g FNM_LEADING_DIR testfnmatch only -- match until / - // h REG_MULTIREF multiple digit backref - // i REG_ICASE ignore case - // j REG_SPAN . matches \n - // k REG_ESCAPE \ to ecape [...] delimiter - // l REG_LEFT implicit ^... - // m REG_MINIMAL minimal match - // n REG_NEWLINE explicit \n match - // o REG_ENCLOSED (|&) magic inside [@|&](...) - // p REG_SHELL_PATH explicit / match - // q REG_DELIMITED delimited pattern - // r REG_RIGHT implicit ...$ - // s REG_SHELL_ESCAPED \ not special - // t REG_MUSTDELIM all delimiters must be specified - // u standard unspecified behavior -- errors not counted - // v REG_CLASS_ESCAPE \ special inside [...] - // w REG_NOSUB no subexpression match array - // x REG_LENIENT let some errors slide - // y REG_LEFT regexec() implicit ^... - // z REG_NULL NULL subexpressions ok - // $ expand C \c escapes in fields 2 and 3 - // / field 2 is a regsubcomp() expression - // = field 3 is a regdecomp() expression - // - // Field 1 control lines: - // - // C set LC_COLLATE and LC_CTYPE to locale in field 2 - // - // ?test ... output field 5 if passed and != EXPECTED, silent otherwise - // &test ... output field 5 if current and previous passed - // |test ... output field 5 if current passed and previous failed - // ; ... output field 2 if previous failed - // {test ... skip if failed until } - // } end of skip - // - // : comment comment copied as output NOTE - // :comment:test :comment: ignored - // N[OTE] comment comment copied as output NOTE - // T[EST] comment comment - // - // number use number for nmatch (20 by default) - flag := field[0] - switch flag[0] { - case '?', '&', '|', ';', '{', '}': - // Ignore all the control operators. - // Just run everything. - flag = flag[1:] - if flag == "" { - continue Reading - } - case ':': - i := strings.Index(flag[1:], ":") - if i < 0 { - t.Logf("skip: %s", line) - continue Reading - } - flag = flag[1+i+1:] - case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - t.Logf("skip: %s", line) - continue Reading - } - - // Can check field count now that we've handled the myriad comment formats. - if len(field) < 4 { - t.Errorf("%s:%d: too few fields: %s", file, lineno, line) - continue Reading - } - - // Expand C escapes (a.k.a. Go escapes). - if strings.Contains(flag, "$") { - f := `"` + field[1] + `"` - if field[1], err = strconv.Unquote(f); err != nil { - t.Errorf("%s:%d: cannot unquote %s", file, lineno, f) - } - f = `"` + field[2] + `"` - if field[2], err = strconv.Unquote(f); err != nil { - t.Errorf("%s:%d: cannot unquote %s", file, lineno, f) - } - } - - // Field 2: the regular expression pattern; SAME uses the pattern from - // the previous specification. - // - if field[1] == "SAME" { - field[1] = lastRegexp - } - lastRegexp = field[1] - - // Field 3: the string to match. - text := field[2] - - // Field 4: the test outcome... - ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3]) - if !ok { - t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3]) - continue Reading - } - - // Field 5: optional comment appended to the report. - - Testing: - // Run test once for each specified capital letter mode that we support. - for _, c := range flag { - pattern := field[1] - syn := syntax.POSIX | syntax.ClassNL - switch c { - default: - continue Testing - case 'E': - // extended regexp (what we support) - case 'L': - // literal - pattern = QuoteMeta(pattern) - } - - for _, c := range flag { - switch c { - case 'i': - syn |= syntax.FoldCase - } - } - - re, err := compile(pattern, syn, true) - if err != nil { - if shouldCompile { - t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern) - } - continue Testing - } - if !shouldCompile { - t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern) - continue Testing - } - match := re.MatchString(text) - if match != shouldMatch { - t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch) - continue Testing - } - have := re.FindStringSubmatchIndex(text) - if (len(have) > 0) != match { - t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have) - continue Testing - } - if len(have) > len(pos) { - have = have[:len(pos)] - } - if !same(have, pos) { - t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos) - } - } - } -} - -func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) { - // Field 4: the test outcome. This is either one of the posix error - // codes (with REG_ omitted) or the match array, a list of (m,n) - // entries with m and n being first and last+1 positions in the - // field 3 string, or NULL if REG_NOSUB is in effect and success - // is expected. BADPAT is acceptable in place of any regcomp(3) - // error code. The match[] array is initialized to (-2,-2) before - // each test. All array elements from 0 to nmatch-1 must be specified - // in the outcome. Unspecified endpoints (offset -1) are denoted by ?. - // Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a - // matched (?{...}) expression, where x is the text enclosed by {...}, - // o is the expression ordinal counting from 1, and n is the length of - // the unmatched portion of the subject string. If x starts with a - // number then that is the return value of re_execf(), otherwise 0 is - // returned. - switch { - case s == "": - // Match with no position information. - ok = true - compiled = true - matched = true - return - case s == "NOMATCH": - // Match failure. - ok = true - compiled = true - matched = false - return - case 'A' <= s[0] && s[0] <= 'Z': - // All the other error codes are compile errors. - ok = true - compiled = false - return - } - compiled = true - - var x []int - for s != "" { - var end byte = ')' - if len(x)%2 == 0 { - if s[0] != '(' { - ok = false - return - } - s = s[1:] - end = ',' - } - i := 0 - for i < len(s) && s[i] != end { - i++ - } - if i == 0 || i == len(s) { - ok = false - return - } - var v = -1 - var err error - if s[:i] != "?" { - v, err = strconv.Atoi(s[:i]) - if err != nil { - ok = false - return - } - } - x = append(x, v) - s = s[i+1:] - } - if len(x)%2 != 0 { - ok = false - return - } - ok = true - matched = true - pos = x - return -} - -var text []byte - -func makeText(n int) []byte { - if len(text) >= n { - return text[:n] - } - text = make([]byte, n) - x := ^uint32(0) - for i := range text { - x += x - x ^= 1 - if int32(x) < 0 { - x ^= 0x88888eef - } - if x%31 == 0 { - text[i] = '\n' - } else { - text[i] = byte(x%(0x7E+1-0x20) + 0x20) - } - } - return text -} - -func benchmark(b *testing.B, re string, n int) { - r := MustCompile(re) - t := makeText(n) - b.ResetTimer() - b.SetBytes(int64(n)) - for i := 0; i < b.N; i++ { - if r.Match(t) { - b.Fatal("match!") - } - } -} - -const ( - easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$" - easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$" - medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$" - hard = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$" - parens = "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" + - "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$" -) - -func BenchmarkMatchEasy0_32(b *testing.B) { benchmark(b, easy0, 32<<0) } -func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) } -func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) } -func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) } -func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) } -func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) } -func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) } -func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) } -func BenchmarkMatchEasy1_1M(b *testing.B) { benchmark(b, easy1, 1<<20) } -func BenchmarkMatchEasy1_32M(b *testing.B) { benchmark(b, easy1, 32<<20) } -func BenchmarkMatchMedium_32(b *testing.B) { benchmark(b, medium, 32<<0) } -func BenchmarkMatchMedium_1K(b *testing.B) { benchmark(b, medium, 1<<10) } -func BenchmarkMatchMedium_32K(b *testing.B) { benchmark(b, medium, 32<<10) } -func BenchmarkMatchMedium_1M(b *testing.B) { benchmark(b, medium, 1<<20) } -func BenchmarkMatchMedium_32M(b *testing.B) { benchmark(b, medium, 32<<20) } -func BenchmarkMatchHard_32(b *testing.B) { benchmark(b, hard, 32<<0) } -func BenchmarkMatchHard_1K(b *testing.B) { benchmark(b, hard, 1<<10) } -func BenchmarkMatchHard_32K(b *testing.B) { benchmark(b, hard, 32<<10) } -func BenchmarkMatchHard_1M(b *testing.B) { benchmark(b, hard, 1<<20) } -func BenchmarkMatchHard_32M(b *testing.B) { benchmark(b, hard, 32<<20) } - -func TestLongest(t *testing.T) { - re, err := Compile(`a(|b)`) - if err != nil { - t.Fatal(err) - } - if g, w := re.FindString("ab"), "a"; g != w { - t.Errorf("first match was %q, want %q", g, w) - } - re.Longest() - if g, w := re.FindString("ab"), "ab"; g != w { - t.Errorf("longest match was %q, want %q", g, w) - } -} |