diff options
Diffstat (limited to 'src/pkg/regexp')
-rw-r--r-- | src/pkg/regexp/Makefile | 12 | ||||
-rw-r--r-- | src/pkg/regexp/all_test.go | 88 | ||||
-rw-r--r-- | src/pkg/regexp/regexp.go | 284 | ||||
-rw-r--r-- | src/pkg/regexp/syntax/Makefile | 16 | ||||
-rw-r--r-- | src/pkg/regexp/syntax/parse.go | 11 | ||||
-rw-r--r-- | src/pkg/regexp/syntax/regexp.go | 2 |
6 files changed, 315 insertions, 98 deletions
diff --git a/src/pkg/regexp/Makefile b/src/pkg/regexp/Makefile deleted file mode 100644 index 60406c773..000000000 --- a/src/pkg/regexp/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright 2011 The Go Authors. All rights reserved. -# Use of this source code is governed by a BSD-style -# license that can be found in the LICENSE file. - -include ../../Make.inc - -TARG=regexp -GOFILES=\ - exec.go\ - regexp.go\ - -include ../../Make.pkg diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go index 107dfe37c..f7b41a674 100644 --- a/src/pkg/regexp/all_test.go +++ b/src/pkg/regexp/all_test.go @@ -176,6 +176,45 @@ var replaceTests = []ReplaceTest{ {"[a-c]*", "x", "def", "xdxexfx"}, {"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, {"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"}, + + // Substitutions + {"a+", "($0)", "banana", "b(a)n(a)n(a)"}, + {"a+", "(${0})", "banana", "b(a)n(a)n(a)"}, + {"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"}, + {"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"}, + {"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, world"}, + {"hello, (.+)", "goodbye, $1x", "hello, world", "goodbye, "}, + {"hello, (.+)", "goodbye, ${1}x", "hello, world", "goodbye, worldx"}, + {"hello, (.+)", "<$0><$1><$2><$3>", "hello, world", "<hello, world><world><><>"}, + {"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, world!"}, + {"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, world"}, + {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "hihihi"}, + {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "byebyebye"}, + {"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", ""}, + {"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "hiyz"}, + {"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $x"}, + {"a+", "${oops", "aaa", "${oops"}, + {"a+", "$$", "aaa", "$"}, + {"a+", "$", "aaa", "$"}, +} + +var replaceLiteralTests = []ReplaceTest{ + // Substitutions + {"a+", "($0)", "banana", "b($0)n($0)n($0)"}, + {"a+", "(${0})", "banana", "b(${0})n(${0})n(${0})"}, + {"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"}, + {"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"}, + {"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, ${1}"}, + {"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, $noun!"}, + {"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, ${noun}"}, + {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "$x$x$x"}, + {"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "$x$x$x"}, + {"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", "$xyz"}, + {"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "${x}yz"}, + {"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $$x"}, + {"a+", "${oops", "aaa", "${oops"}, + {"a+", "$$", "aaa", "$$"}, + {"a+", "$", "aaa", "$"}, } type ReplaceFuncTest struct { @@ -199,13 +238,58 @@ func TestReplaceAll(t *testing.T) { } actual := re.ReplaceAllString(tc.input, tc.replacement) if actual != tc.output { - t.Errorf("%q.Replace(%q,%q) = %q; want %q", + t.Errorf("%q.ReplaceAllString(%q,%q) = %q; want %q", tc.pattern, tc.input, tc.replacement, actual, tc.output) } // now try bytes actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement))) if actual != tc.output { - t.Errorf("%q.Replace(%q,%q) = %q; want %q", + t.Errorf("%q.ReplaceAll(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } +} + +func TestReplaceAllLiteral(t *testing.T) { + // Run ReplaceAll tests that do not have $ expansions. + for _, tc := range replaceTests { + if strings.Contains(tc.replacement, "$") { + continue + } + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllLiteralString(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement))) + if actual != tc.output { + t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } + + // Run literal-specific tests. + for _, tc := range replaceLiteralTests { + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllLiteralString(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement))) + if actual != tc.output { + t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q", tc.pattern, tc.input, tc.replacement, actual, tc.output) } } diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index 7aebd3728..54c53776c 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -61,6 +61,7 @@ import ( "strconv" "strings" "sync" + "unicode" "unicode/utf8" ) @@ -416,41 +417,79 @@ func Match(pattern string, b []byte) (matched bool, error error) { return re.Match(b), nil } -// ReplaceAllString returns a copy of src in which all matches for the Regexp -// have been replaced by repl. No support is provided for expressions -// (e.g. \1 or $1) in the replacement string. +// ReplaceAllString returns a copy of src, replacing matches of the Regexp +// with the replacement string repl. Inside repl, $ signs are interpreted as +// in Expand, so for instance $1 represents the text of the first submatch. func (re *Regexp) ReplaceAllString(src, repl string) string { - return re.ReplaceAllStringFunc(src, func(string) string { return repl }) + n := 2 + if strings.Index(repl, "$") >= 0 { + n = 2 * (re.numSubexp + 1) + } + b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { + return re.expand(dst, repl, nil, src, match) + }) + return string(b) } -// ReplaceAllStringFunc returns a copy of src in which all matches for the -// Regexp have been replaced by the return value of of function repl (whose -// first argument is the matched string). No support is provided for -// expressions (e.g. \1 or $1) in the replacement string. +// ReplaceAllStringLiteral returns a copy of src, replacing matches of the Regexp +// with the replacement string repl. The replacement repl is substituted directly, +// without using Expand. +func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { + return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { + return append(dst, repl...) + })) +} + +// ReplaceAllStringFunc returns a copy of src in which all matches of the +// Regexp have been replaced by the return value of of function repl applied +// to the matched substring. The replacement returned by repl is substituted +// directly, without using Expand. func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { + b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { + return append(dst, repl(src[match[0]:match[1]])...) + }) + return string(b) +} + +func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { lastMatchEnd := 0 // end position of the most recent match searchPos := 0 // position where we next look for a match - buf := new(bytes.Buffer) - for searchPos <= len(src) { - a := re.doExecute(nil, nil, src, searchPos, 2) + var buf []byte + var endPos int + if bsrc != nil { + endPos = len(bsrc) + } else { + endPos = len(src) + } + for searchPos <= endPos { + a := re.doExecute(nil, bsrc, src, searchPos, nmatch) if len(a) == 0 { break // no more matches } // Copy the unmatched characters before this match. - io.WriteString(buf, src[lastMatchEnd:a[0]]) + if bsrc != nil { + buf = append(buf, bsrc[lastMatchEnd:a[0]]...) + } else { + buf = append(buf, src[lastMatchEnd:a[0]]...) + } // Now insert a copy of the replacement string, but not for a // match of the empty string immediately after another match. // (Otherwise, we get double replacement for patterns that // match both empty and nonempty strings.) if a[1] > lastMatchEnd || a[0] == 0 { - io.WriteString(buf, repl(src[a[0]:a[1]])) + buf = repl(buf, a) } lastMatchEnd = a[1] // Advance past this match; always advance at least one character. - _, width := utf8.DecodeRuneInString(src[searchPos:]) + var width int + if bsrc != nil { + _, width = utf8.DecodeRune(bsrc[searchPos:]) + } else { + _, width = utf8.DecodeRuneInString(src[searchPos:]) + } if searchPos+width > a[1] { searchPos += width } else if searchPos+1 > a[1] { @@ -463,61 +502,50 @@ func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) str } // Copy the unmatched characters after the last match. - io.WriteString(buf, src[lastMatchEnd:]) + if bsrc != nil { + buf = append(buf, bsrc[lastMatchEnd:]...) + } else { + buf = append(buf, src[lastMatchEnd:]...) + } - return buf.String() + return buf } -// ReplaceAll returns a copy of src in which all matches for the Regexp -// have been replaced by repl. No support is provided for expressions -// (e.g. \1 or $1) in the replacement text. +// ReplaceAll returns a copy of src, replacing matches of the Regexp +// with the replacement string repl. Inside repl, $ signs are interpreted as +// in Expand, so for instance $1 represents the text of the first submatch. func (re *Regexp) ReplaceAll(src, repl []byte) []byte { - return re.ReplaceAllFunc(src, func([]byte) []byte { return repl }) -} - -// ReplaceAllFunc returns a copy of src in which all matches for the -// Regexp have been replaced by the return value of of function repl (whose -// first argument is the matched []byte). No support is provided for -// expressions (e.g. \1 or $1) in the replacement string. -func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { - lastMatchEnd := 0 // end position of the most recent match - searchPos := 0 // position where we next look for a match - buf := new(bytes.Buffer) - for searchPos <= len(src) { - a := re.doExecute(nil, src, "", searchPos, 2) - if len(a) == 0 { - break // no more matches - } - - // Copy the unmatched characters before this match. - buf.Write(src[lastMatchEnd:a[0]]) - - // Now insert a copy of the replacement string, but not for a - // match of the empty string immediately after another match. - // (Otherwise, we get double replacement for patterns that - // match both empty and nonempty strings.) - if a[1] > lastMatchEnd || a[0] == 0 { - buf.Write(repl(src[a[0]:a[1]])) - } - lastMatchEnd = a[1] - - // Advance past this match; always advance at least one character. - _, width := utf8.DecodeRune(src[searchPos:]) - if searchPos+width > a[1] { - searchPos += width - } else if searchPos+1 > a[1] { - // This clause is only needed at the end of the input - // string. In that case, DecodeRuneInString returns width=0. - searchPos++ - } else { - searchPos = a[1] - } + n := 2 + if bytes.IndexByte(repl, '$') >= 0 { + n = 2 * (re.numSubexp + 1) } + srepl := "" + b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { + if len(srepl) != len(repl) { + srepl = string(repl) + } + return re.expand(dst, srepl, src, "", match) + }) + return b +} - // Copy the unmatched characters after the last match. - buf.Write(src[lastMatchEnd:]) +// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp +// with the replacement bytes repl. The replacement repl is substituted directly, +// without using Expand. +func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { + return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { + return append(dst, repl...) + }) +} - return buf.Bytes() +// ReplaceAllFunc returns a copy of src in which all matches of the +// Regexp have been replaced by the return value of of function repl applied +// to the matched byte slice. The replacement returned by repl is substituted +// directly, without using Expand. +func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { + return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { + return append(dst, repl(src[match[0]:match[1]])...) + }) } var specialBytes = []byte(`\.+*?()|[]{}^$`) @@ -648,7 +676,7 @@ func (re *Regexp) FindString(s string) string { // location of the leftmost match in s of the regular expression. The match // itself is at s[loc[0]:loc[1]]. // A return value of nil indicates no match. -func (re *Regexp) FindStringIndex(s string) []int { +func (re *Regexp) FindStringIndex(s string) (loc []int) { a := re.doExecute(nil, nil, s, 0, 2) if a == nil { return nil @@ -660,7 +688,7 @@ func (re *Regexp) FindStringIndex(s string) []int { // location of the leftmost match of the regular expression in text read from // the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return // value of nil indicates no match. -func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { +func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { a := re.doExecute(r, nil, "", 0, 2) if a == nil { return nil @@ -687,6 +715,134 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte { return ret } +// Expand appends template to dst and returns the result; during the +// append, Expand replaces variables in the template with corresponding +// matches drawn from src. The match slice should have been returned by +// FindSubmatchIndex. +// +// In the template, a variable is denoted by a substring of the form +// $name or ${name}, where name is a non-empty sequence of letters, +// digits, and underscores. A purely numeric name like $1 refers to +// the submatch with the corresponding index; other names refer to +// capturing parentheses named with the (?P<name>...) syntax. A +// reference to an out of range or unmatched index or a name that is not +// present in the regular expression is replaced with an empty string. +// +// In the $name form, name is taken to be as long as possible: $1x is +// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. +// +// To insert a literal $ in the output, use $$ in the template. +func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { + return re.expand(dst, string(template), src, "", match) +} + +// ExpandString is like Expand but the template and source are strings. +// It appends to and returns a byte slice in order to give the calling +// code control over allocation. +func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { + return re.expand(dst, template, nil, src, match) +} + +func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { + for len(template) > 0 { + i := strings.Index(template, "$") + if i < 0 { + break + } + dst = append(dst, template[:i]...) + template = template[i:] + if len(template) > 1 && template[1] == '$' { + // Treat $$ as $. + dst = append(dst, '$') + template = template[2:] + continue + } + name, num, rest, ok := extract(template) + if !ok { + // Malformed; treat $ as raw text. + dst = append(dst, '$') + template = template[1:] + continue + } + template = rest + if num >= 0 { + if 2*num+1 < len(match) { + if bsrc != nil { + dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) + } else { + dst = append(dst, src[match[2*num]:match[2*num+1]]...) + } + } + } else { + for i, namei := range re.subexpNames { + if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { + if bsrc != nil { + dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) + } else { + dst = append(dst, src[match[2*i]:match[2*i+1]]...) + } + break + } + } + } + } + dst = append(dst, template...) + return dst +} + +// extract returns the name from a leading "$name" or "${name}" in str. +// If it is a number, extract returns num set to that number; otherwise num = -1. +func extract(str string) (name string, num int, rest string, ok bool) { + if len(str) < 2 || str[0] != '$' { + return + } + brace := false + if str[1] == '{' { + brace = true + str = str[2:] + } else { + str = str[1:] + } + i := 0 + for i < len(str) { + rune, size := utf8.DecodeRuneInString(str[i:]) + if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { + break + } + i += size + } + if i == 0 { + // empty name is not okay + return + } + name = str[:i] + if brace { + if i >= len(str) || str[i] != '}' { + // missing closing brace + return + } + i++ + } + + // Parse number. + num = 0 + for i := 0; i < len(name); i++ { + if name[i] < '0' || '9' < name[i] || num >= 1e8 { + num = -1 + break + } + num = num*10 + int(name[i]) - '0' + } + // Disallow leading zeros. + if name[0] == '0' && len(name) > 1 { + num = -1 + } + + rest = str[i:] + ok = true + return +} + // FindSubmatchIndex returns a slice holding the index pairs identifying the // leftmost match of the regular expression in b and the matches, if any, of // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions diff --git a/src/pkg/regexp/syntax/Makefile b/src/pkg/regexp/syntax/Makefile deleted file mode 100644 index 0b3764ea7..000000000 --- a/src/pkg/regexp/syntax/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright 2011 The Go Authors. All rights reserved. -# Use of this source code is governed by a BSD-style -# license that can be found in the LICENSE file. - -include ../../../Make.inc - -TARG=regexp/syntax -GOFILES=\ - compile.go\ - parse.go\ - perl_groups.go\ - prog.go\ - regexp.go\ - simplify.go\ - -include ../../../Make.pkg diff --git a/src/pkg/regexp/syntax/parse.go b/src/pkg/regexp/syntax/parse.go index 6f8acbbef..2df775025 100644 --- a/src/pkg/regexp/syntax/parse.go +++ b/src/pkg/regexp/syntax/parse.go @@ -2,6 +2,10 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +// Package syntax parses regular expressions into parse trees and compiles +// parse trees into programs. Most clients of regular expressions will use +// the facilities of package regexp (such as Compile and Match) instead of +// this package. package syntax import ( @@ -648,6 +652,9 @@ func literalRegexp(s string, flags Flags) *Regexp { // Parsing. +// Parse parses a regular expression string s, controlled by the specified +// Flags, and returns a regular expression parse tree. The syntax is +// described in the top-level comment for package regexp. func Parse(s string, flags Flags) (*Regexp, error) { if flags&Literal != 0 { // Trivial parser for literal string. @@ -1377,8 +1384,8 @@ func (p *parser) appendGroup(r []rune, g charGroup) []rune { } var anyTable = &unicode.RangeTable{ - []unicode.Range16{{0, 1<<16 - 1, 1}}, - []unicode.Range32{{1 << 16, unicode.MaxRune, 1}}, + R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, + R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, } // unicodeTable returns the unicode.RangeTable identified by name diff --git a/src/pkg/regexp/syntax/regexp.go b/src/pkg/regexp/syntax/regexp.go index 668a07764..329a90e01 100644 --- a/src/pkg/regexp/syntax/regexp.go +++ b/src/pkg/regexp/syntax/regexp.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Package syntax parses regular expressions into syntax trees. -// WORK IN PROGRESS. package syntax // Note to implementers: |