diff options
Diffstat (limited to 'src/pkg/bytes/bytes.go')
| -rw-r--r-- | src/pkg/bytes/bytes.go | 287 | 
1 files changed, 192 insertions, 95 deletions
| diff --git a/src/pkg/bytes/bytes.go b/src/pkg/bytes/bytes.go index bcf7b8609..bfe2ef39d 100644 --- a/src/pkg/bytes/bytes.go +++ b/src/pkg/bytes/bytes.go @@ -3,7 +3,7 @@  // license that can be found in the LICENSE file.  // The bytes package implements functions for the manipulation of byte slices. -// Analagous to the facilities of the strings package. +// Analogous to the facilities of the strings package.  package bytes  import ( @@ -127,7 +127,21 @@ func LastIndex(s, sep []byte) int {  	return -1  } -// IndexAny interprets s as a sequence of UTF-8 encoded Unicode code points. +// IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points. +// It returns the byte index of the first occurrence in s of the given rune. +// It returns -1 if rune is not present in s. +func IndexRune(s []byte, rune int) int { +	for i := 0; i < len(s); { +		r, size := utf8.DecodeRune(s[i:]) +		if r == rune { +			return i +		} +		i += size +	} +	return -1 +} + +// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.  // It returns the byte index of the first occurrence in s of any of the Unicode  // code points in chars.  It returns -1 if chars is empty or if there is no code  // point in common. @@ -151,6 +165,25 @@ func IndexAny(s []byte, chars string) int {  	return -1  } +// LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code +// points.  It returns the byte index of the last occurrence in s of any of +// the Unicode code points in chars.  It returns -1 if chars is empty or if +// there is no code point in common. +func LastIndexAny(s []byte, chars string) int { +	if len(chars) > 0 { +		for i := len(s); i > 0; { +			rune, size := utf8.DecodeLastRune(s[0:i]) +			i -= size +			for _, m := range chars { +				if rune == m { +					return i +				} +			} +		} +	} +	return -1 +} +  // Generic split: splits after each instance of sep,  // including sepSave bytes of sep in the subarrays.  func genSplit(s, sep []byte, sepSave, n int) [][]byte { @@ -179,17 +212,22 @@ func genSplit(s, sep []byte, sepSave, n int) [][]byte {  	return a[0 : na+1]  } -// Split splits the array s around each instance of sep, returning an array of subarrays of s. -// If sep is empty, Split splits s after each UTF-8 sequence. -// If n >= 0, Split splits s into at most n subarrays; the last subarray will contain an unsplit remainder. -// Thus if n == 0, the result will ne nil. +// Split slices s into subslices separated by sep and returns a slice of +// the subslices between those separators. +// If sep is empty, Split splits after each UTF-8 sequence. +// The count determines the number of subslices to return: +//   n > 0: at most n subslices; the last subslice will be the unsplit remainder. +//   n == 0: the result is nil (zero subslices) +//   n < 0: all subslices  func Split(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) } -// SplitAfter splits the array s after each instance of sep, returning an array of subarrays of s. -// If sep is empty, SplitAfter splits s after each UTF-8 sequence. -// If n >= 0, SplitAfter splits s into at most n subarrays; the last subarray will contain an -// unsplit remainder. -// Thus if n == 0, the result will ne nil. +// SplitAfter slices s into subslices after each instance of sep and +// returns a slice of those subslices. +// If sep is empty, Split splits after each UTF-8 sequence. +// The count determines the number of subslices to return: +//   n > 0: at most n subslices; the last subslice will be the unsplit remainder. +//   n == 0: the result is nil (zero subslices) +//   n < 0: all subslices  func SplitAfter(s, sep []byte, n int) [][]byte {  	return genSplit(s, sep, len(sep), n)  } @@ -197,12 +235,20 @@ func SplitAfter(s, sep []byte, n int) [][]byte {  // Fields splits the array s around each instance of one or more consecutive white space  // characters, returning a slice of subarrays of s or an empty list if s contains only white space.  func Fields(s []byte) [][]byte { +	return FieldsFunc(s, unicode.IsSpace) +} + +// FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points. +// It splits the array s at each run of code points c satisfying f(c) and +// returns a slice of subarrays of s.  If no code points in s satisfy f(c), an +// empty slice is returned. +func FieldsFunc(s []byte, f func(int) bool) [][]byte {  	n := 0  	inField := false  	for i := 0; i < len(s); {  		rune, size := utf8.DecodeRune(s[i:])  		wasInField := inField -		inField = !unicode.IsSpace(rune) +		inField = !f(rune)  		if inField && !wasInField {  			n++  		} @@ -214,12 +260,12 @@ func Fields(s []byte) [][]byte {  	fieldStart := -1  	for i := 0; i <= len(s) && na < n; {  		rune, size := utf8.DecodeRune(s[i:]) -		if fieldStart < 0 && size > 0 && !unicode.IsSpace(rune) { +		if fieldStart < 0 && size > 0 && !f(rune) {  			fieldStart = i  			i += size  			continue  		} -		if fieldStart >= 0 && (size == 0 || unicode.IsSpace(rune)) { +		if fieldStart >= 0 && (size == 0 || f(rune)) {  			a[na] = s[fieldStart:i]  			na++  			fieldStart = -1 @@ -278,7 +324,7 @@ func HasSuffix(s, suffix []byte) bool {  // Map returns a copy of the byte array s with all its characters modified  // according to the mapping function. If mapping returns a negative value, the character is  // dropped from the string with no replacement.  The characters in s and the -// output are interpreted as UTF-8 encoded Unicode code points. +// output are interpreted as UTF-8-encoded Unicode code points.  func Map(mapping func(rune int) int, s []byte) []byte {  	// In the worst case, the array can grow when mapped, making  	// things unpleasant.  But it's so rare we barge in assuming it's @@ -298,12 +344,10 @@ func Map(mapping func(rune int) int, s []byte) []byte {  				// Grow the buffer.  				maxbytes = maxbytes*2 + utf8.UTFMax  				nb := make([]byte, maxbytes) -				for i, c := range b[0:nbytes] { -					nb[i] = c -				} +				copy(nb, b[0:nbytes])  				b = nb  			} -			nbytes += utf8.EncodeRune(rune, b[nbytes:maxbytes]) +			nbytes += utf8.EncodeRune(b[nbytes:maxbytes], rune)  		}  		i += wid  	} @@ -332,52 +376,147 @@ func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) }  // ToTitle returns a copy of the byte array s with all Unicode letters mapped to their title case.  func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) } -// TrimLeftFunc returns a subslice of s by slicing off all leading UTF-8 encoded +// ToUpperSpecial returns a copy of the byte array s with all Unicode letters mapped to their +// upper case, giving priority to the special casing rules. +func ToUpperSpecial(_case unicode.SpecialCase, s []byte) []byte { +	return Map(func(r int) int { return _case.ToUpper(r) }, s) +} + +// ToLowerSpecial returns a copy of the byte array s with all Unicode letters mapped to their +// lower case, giving priority to the special casing rules. +func ToLowerSpecial(_case unicode.SpecialCase, s []byte) []byte { +	return Map(func(r int) int { return _case.ToLower(r) }, s) +} + +// ToTitleSpecial returns a copy of the byte array s with all Unicode letters mapped to their +// title case, giving priority to the special casing rules. +func ToTitleSpecial(_case unicode.SpecialCase, s []byte) []byte { +	return Map(func(r int) int { return _case.ToTitle(r) }, s) +} + + +// isSeparator reports whether the rune could mark a word boundary. +// TODO: update when package unicode captures more of the properties. +func isSeparator(rune int) bool { +	// ASCII alphanumerics and underscore are not separators +	if rune <= 0x7F { +		switch { +		case '0' <= rune && rune <= '9': +			return false +		case 'a' <= rune && rune <= 'z': +			return false +		case 'A' <= rune && rune <= 'Z': +			return false +		case rune == '_': +			return false +		} +		return true +	} +	// Letters and digits are not separators +	if unicode.IsLetter(rune) || unicode.IsDigit(rune) { +		return false +	} +	// Otherwise, all we can do for now is treat spaces as separators. +	return unicode.IsSpace(rune) +} + +// BUG(r): The rule Title uses for word boundaries does not handle Unicode punctuation properly. + +// Title returns a copy of s with all Unicode letters that begin words +// mapped to their title case. +func Title(s []byte) []byte { +	// Use a closure here to remember state. +	// Hackish but effective. Depends on Map scanning in order and calling +	// the closure once per rune. +	prev := ' ' +	return Map( +		func(r int) int { +			if isSeparator(prev) { +				prev = r +				return unicode.ToTitle(r) +			} +			prev = r +			return r +		}, +		s) +} + +// TrimLeftFunc returns a subslice of s by slicing off all leading UTF-8-encoded  // Unicode code points c that satisfy f(c).  func TrimLeftFunc(s []byte, f func(r int) bool) []byte { -	var start, wid int -	for start = 0; start < len(s); start += wid { -		wid = 1 -		rune := int(s[start]) -		if rune >= utf8.RuneSelf { -			rune, wid = utf8.DecodeRune(s[start:]) -		} -		if !f(rune) { -			break -		} +	i := indexFunc(s, f, false) +	if i == -1 { +		return nil  	} -	return s[start:] +	return s[i:]  }  // TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8  // encoded Unicode code points c that satisfy f(c).  func TrimRightFunc(s []byte, f func(r int) bool) []byte { -	var end, wid int -	for end = len(s); end > 0; end -= wid { -		wid = 1 -		rune := int(s[end-wid]) -		if rune >= utf8.RuneSelf { -			// Back up & look for beginning of rune. Mustn't pass start. -			for wid = 2; end-wid >= 0 && !utf8.RuneStart(s[end-wid]); wid++ { -			} -			if end-wid < 0 { // invalid UTF-8 sequence; stop processing -				break -			} -			rune, wid = utf8.DecodeRune(s[end-wid : end]) -		} -		if !f(rune) { -			break -		} +	i := lastIndexFunc(s, f, false) +	if i >= 0 && s[i] >= utf8.RuneSelf { +		_, wid := utf8.DecodeRune(s[i:]) +		i += wid +	} else { +		i++  	} -	return s[0:end] +	return s[0:i]  }  // TrimFunc returns a subslice of s by slicing off all leading and trailing -// UTF-8 encoded Unicode code points c that satisfy f(c). +// UTF-8-encoded Unicode code points c that satisfy f(c).  func TrimFunc(s []byte, f func(r int) bool) []byte {  	return TrimRightFunc(TrimLeftFunc(s, f), f)  } +// IndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points. +// It returns the byte index in s of the first Unicode +// code point satisfying f(c), or -1 if none do. +func IndexFunc(s []byte, f func(r int) bool) int { +	return indexFunc(s, f, true) +} + +// LastIndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points. +// It returns the byte index in s of the last Unicode +// code point satisfying f(c), or -1 if none do. +func LastIndexFunc(s []byte, f func(r int) bool) int { +	return lastIndexFunc(s, f, true) +} + +// indexFunc is the same as IndexFunc except that if +// truth==false, the sense of the predicate function is +// inverted. +func indexFunc(s []byte, f func(r int) bool, truth bool) int { +	start := 0 +	for start < len(s) { +		wid := 1 +		rune := int(s[start]) +		if rune >= utf8.RuneSelf { +			rune, wid = utf8.DecodeRune(s[start:]) +		} +		if f(rune) == truth { +			return start +		} +		start += wid +	} +	return -1 +} + +// lastIndexFunc is the same as LastIndexFunc except that if +// truth==false, the sense of the predicate function is +// inverted. +func lastIndexFunc(s []byte, f func(r int) bool, truth bool) int { +	for i := len(s); i > 0; { +		rune, size := utf8.DecodeLastRune(s[0:i]) +		i -= size +		if f(rune) == truth { +			return i +		} +	} +	return -1 +} +  func makeCutsetFunc(cutset string) func(rune int) bool {  	return func(rune int) bool {  		for _, c := range cutset { @@ -390,71 +529,29 @@ func makeCutsetFunc(cutset string) func(rune int) bool {  }  // Trim returns a subslice of s by slicing off all leading and -// trailing UTF-8 encoded Unicode code points contained in cutset. +// trailing UTF-8-encoded Unicode code points contained in cutset.  func Trim(s []byte, cutset string) []byte {  	return TrimFunc(s, makeCutsetFunc(cutset))  }  // TrimLeft returns a subslice of s by slicing off all leading -// UTF-8 encoded Unicode code points contained in cutset. +// UTF-8-encoded Unicode code points contained in cutset.  func TrimLeft(s []byte, cutset string) []byte {  	return TrimLeftFunc(s, makeCutsetFunc(cutset))  }  // TrimRight returns a subslice of s by slicing off all trailing -// UTF-8 encoded Unicode code points that are contained in cutset. +// UTF-8-encoded Unicode code points that are contained in cutset.  func TrimRight(s []byte, cutset string) []byte {  	return TrimRightFunc(s, makeCutsetFunc(cutset))  }  // TrimSpace returns a subslice of s by slicing off all leading and -// trailing white space, as as defined by Unicode. +// trailing white space, as defined by Unicode.  func TrimSpace(s []byte) []byte {  	return TrimFunc(s, unicode.IsSpace)  } -// How big to make a byte array when growing. -// Heuristic: Scale by 50% to give n log n time. -func resize(n int) int { -	if n < 16 { -		n = 16 -	} -	return n + n/2 -} - -// Add appends the contents of t to the end of s and returns the result. -// If s has enough capacity, it is extended in place; otherwise a -// new array is allocated and returned. -func Add(s, t []byte) []byte { -	lens := len(s) -	lent := len(t) -	if lens+lent <= cap(s) { -		s = s[0 : lens+lent] -	} else { -		news := make([]byte, lens+lent, resize(lens+lent)) -		copy(news, s) -		s = news -	} -	copy(s[lens:lens+lent], t) -	return s -} - -// AddByte appends byte b to the end of s and returns the result. -// If s has enough capacity, it is extended in place; otherwise a -// new array is allocated and returned. -func AddByte(s []byte, t byte) []byte { -	lens := len(s) -	if lens+1 <= cap(s) { -		s = s[0 : lens+1] -	} else { -		news := make([]byte, lens+1, resize(lens+1)) -		copy(news, s) -		s = news -	} -	s[lens] = t -	return s -} -  // Runes returns a slice of runes (Unicode code points) equivalent to s.  func Runes(s []byte) []int {  	t := make([]int, utf8.RuneCount(s)) | 
