diff options
Diffstat (limited to 'src/pkg/utf8/string.go')
-rw-r--r-- | src/pkg/utf8/string.go | 211 |
1 files changed, 211 insertions, 0 deletions
diff --git a/src/pkg/utf8/string.go b/src/pkg/utf8/string.go new file mode 100644 index 000000000..83b56b944 --- /dev/null +++ b/src/pkg/utf8/string.go @@ -0,0 +1,211 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package utf8 + +// String wraps a regular string with a small structure that provides more +// efficient indexing by code point index, as opposed to byte index. +// Scanning incrementally forwards or backwards is O(1) per index operation +// (although not as fast a range clause going forwards). Random access is +// O(N) in the length of the string, but the overhead is less than always +// scanning from the beginning. +// If the string is ASCII, random access is O(1). +// Unlike the built-in string type, String has internal mutable state and +// is not thread-safe. +type String struct { + str string + numRunes int + // If width > 0, the rune at runePos starts at bytePos and has the specified width. + width int + bytePos int + runePos int + nonASCII int // byte index of the first non-ASCII rune. +} + +// NewString returns a new UTF-8 string with the provided contents. +func NewString(contents string) *String { + return new(String).Init(contents) +} + +// Init initializes an existing String to hold the provided contents. +// It returns a pointer to the initialized String. +func (s *String) Init(contents string) *String { + s.str = contents + s.bytePos = 0 + s.runePos = 0 + for i := 0; i < len(contents); i++ { + if contents[i] >= RuneSelf { + // Not ASCII. + s.numRunes = RuneCountInString(contents) + _, s.width = DecodeRuneInString(contents) + s.nonASCII = i + return s + } + } + // ASCII is simple. Also, the empty string is ASCII. + s.numRunes = len(contents) + s.width = 0 + s.nonASCII = len(contents) + return s +} + +// String returns the contents of the String. This method also means the +// String is directly printable by fmt.Print. +func (s *String) String() string { + return s.str +} + +// RuneCount returns the number of runes (Unicode code points) in the String. +func (s *String) RuneCount() int { + return s.numRunes +} + +// IsASCII returns a boolean indicating whether the String contains only ASCII bytes. +func (s *String) IsASCII() bool { + return s.width == 0 +} + +// Slice returns the string sliced at rune positions [i:j]. +func (s *String) Slice(i, j int) string { + // ASCII is easy. Let the compiler catch the indexing error if there is one. + if j < s.nonASCII { + return s.str[i:j] + } + if i < 0 || j > s.numRunes || i > j { + panic(sliceOutOfRange) + } + if i == j { + return "" + } + // For non-ASCII, after At(i), bytePos is always the position of the indexed character. + var low, high int + switch { + case i < s.nonASCII: + low = i + case i == s.numRunes: + low = len(s.str) + default: + s.At(i) + low = s.bytePos + } + switch { + case j == s.numRunes: + high = len(s.str) + default: + s.At(j) + high = s.bytePos + } + return s.str[low:high] +} + +// At returns the rune with index i in the String. The sequence of runes is the same +// as iterating over the contents with a "for range" clause. +func (s *String) At(i int) int { + // ASCII is easy. Let the compiler catch the indexing error if there is one. + if i < s.nonASCII { + return int(s.str[i]) + } + + // Now we do need to know the index is valid. + if i < 0 || i >= s.numRunes { + panic(outOfRange) + } + + var rune int + + // Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end. + // With these cases, all scans from beginning or end work in O(1) time per rune. + switch { + + case i == s.runePos-1: // backing up one rune + rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos]) + s.runePos = i + s.bytePos -= s.width + return rune + case i == s.runePos+1: // moving ahead one rune + s.runePos = i + s.bytePos += s.width + fallthrough + case i == s.runePos: + rune, s.width = DecodeRuneInString(s.str[s.bytePos:]) + return rune + case i == 0: // start of string + rune, s.width = DecodeRuneInString(s.str) + s.runePos = 0 + s.bytePos = 0 + return rune + + case i == s.numRunes-1: // last rune in string + rune, s.width = DecodeLastRuneInString(s.str) + s.runePos = i + s.bytePos = len(s.str) - s.width + return rune + } + + // We need to do a linear scan. There are three places to start from: + // 1) The beginning + // 2) bytePos/runePos. + // 3) The end + // Choose the closest in rune count, scanning backwards if necessary. + forward := true + if i < s.runePos { + // Between beginning and pos. Which is closer? + // Since both i and runePos are guaranteed >= nonASCII, that's the + // lowest location we need to start from. + if i < (s.runePos-s.nonASCII)/2 { + // Scan forward from beginning + s.bytePos, s.runePos = s.nonASCII, s.nonASCII + } else { + // Scan backwards from where we are + forward = false + } + } else { + // Between pos and end. Which is closer? + if i-s.runePos < (s.numRunes-s.runePos)/2 { + // Scan forward from pos + } else { + // Scan backwards from end + s.bytePos, s.runePos = len(s.str), s.numRunes + forward = false + } + } + if forward { + // TODO: Is it much faster to use a range loop for this scan? + for { + rune, s.width = DecodeRuneInString(s.str[s.bytePos:]) + if s.runePos == i { + break + } + s.runePos++ + s.bytePos += s.width + } + } else { + for { + rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos]) + s.runePos-- + s.bytePos -= s.width + if s.runePos == i { + break + } + } + } + return rune +} + +// We want the panic in At(i) to satisfy os.Error, because that's what +// runtime panics satisfy, but we can't import os. This is our solution. + +// error is the type of the error returned if a user calls String.At(i) with i out of range. +// It satisfies os.Error and runtime.Error. +type error string + +func (err error) String() string { + return string(err) +} + +func (err error) RunTimeError() { +} + +var outOfRange = error("utf8.String: index out of range") +var sliceOutOfRange = error("utf8.String: slice index out of range") |