From 4cecda6c347bd6902b960c6a35a967add7070b0d Mon Sep 17 00:00:00 2001 From: Ondřej Surý Date: Mon, 30 Jan 2012 15:38:19 +0100 Subject: Imported Upstream version 2012.01.27 --- src/pkg/regexp/syntax/simplify_test.go | 151 +++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 src/pkg/regexp/syntax/simplify_test.go (limited to 'src/pkg/regexp/syntax/simplify_test.go') diff --git a/src/pkg/regexp/syntax/simplify_test.go b/src/pkg/regexp/syntax/simplify_test.go new file mode 100644 index 000000000..879eff5be --- /dev/null +++ b/src/pkg/regexp/syntax/simplify_test.go @@ -0,0 +1,151 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import "testing" + +var simplifyTests = []struct { + Regexp string + Simple string +}{ + // Already-simple constructs + {`a`, `a`}, + {`ab`, `ab`}, + {`a|b`, `[a-b]`}, + {`ab|cd`, `ab|cd`}, + {`(ab)*`, `(ab)*`}, + {`(ab)+`, `(ab)+`}, + {`(ab)?`, `(ab)?`}, + {`.`, `(?s:.)`}, + {`^`, `^`}, + {`$`, `$`}, + {`[ac]`, `[ac]`}, + {`[^ac]`, `[^ac]`}, + + // Posix character classes + {`[[:alnum:]]`, `[0-9A-Za-z]`}, + {`[[:alpha:]]`, `[A-Za-z]`}, + {`[[:blank:]]`, `[\t ]`}, + {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`}, + {`[[:digit:]]`, `[0-9]`}, + {`[[:graph:]]`, `[!-~]`}, + {`[[:lower:]]`, `[a-z]`}, + {`[[:print:]]`, `[ -~]`}, + {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"}, + {`[[:space:]]`, `[\t-\r ]`}, + {`[[:upper:]]`, `[A-Z]`}, + {`[[:xdigit:]]`, `[0-9A-Fa-f]`}, + + // Perl character classes + {`\d`, `[0-9]`}, + {`\s`, `[\t-\n\f-\r ]`}, + {`\w`, `[0-9A-Z_a-z]`}, + {`\D`, `[^0-9]`}, + {`\S`, `[^\t-\n\f-\r ]`}, + {`\W`, `[^0-9A-Z_a-z]`}, + {`[\d]`, `[0-9]`}, + {`[\s]`, `[\t-\n\f-\r ]`}, + {`[\w]`, `[0-9A-Z_a-z]`}, + {`[\D]`, `[^0-9]`}, + {`[\S]`, `[^\t-\n\f-\r ]`}, + {`[\W]`, `[^0-9A-Z_a-z]`}, + + // Posix repetitions + {`a{1}`, `a`}, + {`a{2}`, `aa`}, + {`a{5}`, `aaaaa`}, + {`a{0,1}`, `a?`}, + // The next three are illegible because Simplify inserts (?:) + // parens instead of () parens to avoid creating extra + // captured subexpressions. The comments show a version with fewer parens. + {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)? + {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)? + {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)? + {`a{0,2}`, `(?:aa?)?`}, // (aa?)? + {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)? + {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)? + {`a{0,}`, `a*`}, + {`a{1,}`, `a+`}, + {`a{2,}`, `aa+`}, + {`a{5,}`, `aaaaa+`}, + + // Test that operators simplify their arguments. + {`(?:a{1,}){1,}`, `a+`}, + {`(a{1,}b{1,})`, `(a+b+)`}, + {`a{1,}|b{1,}`, `a+|b+`}, + {`(?:a{1,})*`, `(?:a+)*`}, + {`(?:a{1,})+`, `a+`}, + {`(?:a{1,})?`, `(?:a+)?`}, + {``, `(?:)`}, + {`a{0}`, `(?:)`}, + + // Character class simplification + {`[ab]`, `[a-b]`}, + {`[a-za-za-z]`, `[a-z]`}, + {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, + {`[ABCDEFGH]`, `[A-H]`}, + {`[AB-CD-EF-GH]`, `[A-H]`}, + {`[W-ZP-XE-R]`, `[E-Z]`}, + {`[a-ee-gg-m]`, `[a-m]`}, + {`[a-ea-ha-m]`, `[a-m]`}, + {`[a-ma-ha-e]`, `[a-m]`}, + {`[a-zA-Z0-9 -~]`, `[ -~]`}, + + // Empty character classes + {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`}, + + // Full character classes + {`[[:cntrl:][:^cntrl:]]`, `(?s:.)`}, + + // Unicode case folding. + {`(?i)A`, `(?i:A)`}, + {`(?i)a`, `(?i:A)`}, + {`(?i)[A]`, `(?i:A)`}, + {`(?i)[a]`, `(?i:A)`}, + {`(?i)K`, `(?i:K)`}, + {`(?i)k`, `(?i:K)`}, + {`(?i)\x{212a}`, "(?i:K)"}, + {`(?i)[K]`, "[Kk\u212A]"}, + {`(?i)[k]`, "[Kk\u212A]"}, + {`(?i)[\x{212a}]`, "[Kk\u212A]"}, + {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"}, + {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"}, + {`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`}, + + // Empty string as a regular expression. + // The empty string must be preserved inside parens in order + // to make submatches work right, so these tests are less + // interesting than they might otherwise be. String inserts + // explicit (?:) in place of non-parenthesized empty strings, + // to make them easier to spot for other parsers. + {`(a|b|)`, `([a-b]|(?:))`}, + {`(|)`, `()`}, + {`a()`, `a()`}, + {`(()|())`, `(()|())`}, + {`(a|)`, `(a|(?:))`}, + {`ab()cd()`, `ab()cd()`}, + {`()`, `()`}, + {`()*`, `()*`}, + {`()+`, `()+`}, + {`()?`, `()?`}, + {`(){0}`, `(?:)`}, + {`(){1}`, `()`}, + {`(){1,}`, `()+`}, + {`(){0,2}`, `(?:()()?)?`}, +} + +func TestSimplify(t *testing.T) { + for _, tt := range simplifyTests { + re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine) + if err != nil { + t.Errorf("Parse(%#q) = error %v", tt.Regexp, err) + continue + } + s := re.Simplify().String() + if s != tt.Simple { + t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple) + } + } +} -- cgit v1.2.3