diff options
Diffstat (limited to 'src/pkg/unicode/utf8')
-rw-r--r-- | src/pkg/unicode/utf8/example_test.go | 196 | ||||
-rw-r--r-- | src/pkg/unicode/utf8/utf8.go | 435 | ||||
-rw-r--r-- | src/pkg/unicode/utf8/utf8_test.go | 444 |
3 files changed, 0 insertions, 1075 deletions
diff --git a/src/pkg/unicode/utf8/example_test.go b/src/pkg/unicode/utf8/example_test.go deleted file mode 100644 index 7b3e7ac74..000000000 --- a/src/pkg/unicode/utf8/example_test.go +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package utf8_test - -import ( - "fmt" - "unicode/utf8" -) - -func ExampleDecodeLastRune() { - b := []byte("Hello, 世界") - - for len(b) > 0 { - r, size := utf8.DecodeLastRune(b) - fmt.Printf("%c %v\n", r, size) - - b = b[:len(b)-size] - } - // Output: - // 界 3 - // 世 3 - // 1 - // , 1 - // o 1 - // l 1 - // l 1 - // e 1 - // H 1 -} - -func ExampleDecodeLastRuneInString() { - str := "Hello, 世界" - - for len(str) > 0 { - r, size := utf8.DecodeLastRuneInString(str) - fmt.Printf("%c %v\n", r, size) - - str = str[:len(str)-size] - } - // Output: - // 界 3 - // 世 3 - // 1 - // , 1 - // o 1 - // l 1 - // l 1 - // e 1 - // H 1 - -} - -func ExampleDecodeRune() { - b := []byte("Hello, 世界") - - for len(b) > 0 { - r, size := utf8.DecodeRune(b) - fmt.Printf("%c %v\n", r, size) - - b = b[size:] - } - // Output: - // H 1 - // e 1 - // l 1 - // l 1 - // o 1 - // , 1 - // 1 - // 世 3 - // 界 3 -} - -func ExampleDecodeRuneInString() { - str := "Hello, 世界" - - for len(str) > 0 { - r, size := utf8.DecodeRuneInString(str) - fmt.Printf("%c %v\n", r, size) - - str = str[size:] - } - // Output: - // H 1 - // e 1 - // l 1 - // l 1 - // o 1 - // , 1 - // 1 - // 世 3 - // 界 3 -} - -func ExampleEncodeRune() { - r := '世' - buf := make([]byte, 3) - - n := utf8.EncodeRune(buf, r) - - fmt.Println(buf) - fmt.Println(n) - // Output: - // [228 184 150] - // 3 -} - -func ExampleFullRune() { - buf := []byte{228, 184, 150} // 世 - fmt.Println(utf8.FullRune(buf)) - fmt.Println(utf8.FullRune(buf[:2])) - // Output: - // true - // false -} - -func ExampleFullRuneInString() { - str := "世" - fmt.Println(utf8.FullRuneInString(str)) - fmt.Println(utf8.FullRuneInString(str[:2])) - // Output: - // true - // false -} - -func ExampleRuneCount() { - buf := []byte("Hello, 世界") - fmt.Println("bytes =", len(buf)) - fmt.Println("runes =", utf8.RuneCount(buf)) - // Output: - // bytes = 13 - // runes = 9 -} - -func ExampleRuneCountInString() { - str := "Hello, 世界" - fmt.Println("bytes =", len(str)) - fmt.Println("runes =", utf8.RuneCountInString(str)) - // Output: - // bytes = 13 - // runes = 9 -} - -func ExampleRuneLen() { - fmt.Println(utf8.RuneLen('a')) - fmt.Println(utf8.RuneLen('界')) - // Output: - // 1 - // 3 -} - -func ExampleRuneStart() { - buf := []byte("a界") - fmt.Println(utf8.RuneStart(buf[0])) - fmt.Println(utf8.RuneStart(buf[1])) - fmt.Println(utf8.RuneStart(buf[2])) - // Output: - // true - // true - // false -} - -func ExampleValid() { - valid := []byte("Hello, 世界") - invalid := []byte{0xff, 0xfe, 0xfd} - - fmt.Println(utf8.Valid(valid)) - fmt.Println(utf8.Valid(invalid)) - // Output: - // true - // false -} - -func ExampleValidRune() { - valid := 'a' - invalid := rune(0xfffffff) - - fmt.Println(utf8.ValidRune(valid)) - fmt.Println(utf8.ValidRune(invalid)) - // Output: - // true - // false -} - -func ExampleValidString() { - valid := "Hello, 世界" - invalid := string([]byte{0xff, 0xfe, 0xfd}) - - fmt.Println(utf8.ValidString(valid)) - fmt.Println(utf8.ValidString(invalid)) - // Output: - // true - // false -} diff --git a/src/pkg/unicode/utf8/utf8.go b/src/pkg/unicode/utf8/utf8.go deleted file mode 100644 index 0dc859a04..000000000 --- a/src/pkg/unicode/utf8/utf8.go +++ /dev/null @@ -1,435 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package utf8 implements functions and constants to support text encoded in -// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences. -package utf8 - -// The conditions RuneError==unicode.ReplacementChar and -// MaxRune==unicode.MaxRune are verified in the tests. -// Defining them locally avoids this package depending on package unicode. - -// Numbers fundamental to the encoding. -const ( - RuneError = '\uFFFD' // the "error" Rune or "Unicode replacement character" - RuneSelf = 0x80 // characters below Runeself are represented as themselves in a single byte. - MaxRune = '\U0010FFFF' // Maximum valid Unicode code point. - UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character. -) - -// Code points in the surrogate range are not valid for UTF-8. -const ( - surrogateMin = 0xD800 - surrogateMax = 0xDFFF -) - -const ( - t1 = 0x00 // 0000 0000 - tx = 0x80 // 1000 0000 - t2 = 0xC0 // 1100 0000 - t3 = 0xE0 // 1110 0000 - t4 = 0xF0 // 1111 0000 - t5 = 0xF8 // 1111 1000 - - maskx = 0x3F // 0011 1111 - mask2 = 0x1F // 0001 1111 - mask3 = 0x0F // 0000 1111 - mask4 = 0x07 // 0000 0111 - - rune1Max = 1<<7 - 1 - rune2Max = 1<<11 - 1 - rune3Max = 1<<16 - 1 -) - -func decodeRuneInternal(p []byte) (r rune, size int, short bool) { - n := len(p) - if n < 1 { - return RuneError, 0, true - } - c0 := p[0] - - // 1-byte, 7-bit sequence? - if c0 < tx { - return rune(c0), 1, false - } - - // unexpected continuation byte? - if c0 < t2 { - return RuneError, 1, false - } - - // need first continuation byte - if n < 2 { - return RuneError, 1, true - } - c1 := p[1] - if c1 < tx || t2 <= c1 { - return RuneError, 1, false - } - - // 2-byte, 11-bit sequence? - if c0 < t3 { - r = rune(c0&mask2)<<6 | rune(c1&maskx) - if r <= rune1Max { - return RuneError, 1, false - } - return r, 2, false - } - - // need second continuation byte - if n < 3 { - return RuneError, 1, true - } - c2 := p[2] - if c2 < tx || t2 <= c2 { - return RuneError, 1, false - } - - // 3-byte, 16-bit sequence? - if c0 < t4 { - r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx) - if r <= rune2Max { - return RuneError, 1, false - } - if surrogateMin <= r && r <= surrogateMax { - return RuneError, 1, false - } - return r, 3, false - } - - // need third continuation byte - if n < 4 { - return RuneError, 1, true - } - c3 := p[3] - if c3 < tx || t2 <= c3 { - return RuneError, 1, false - } - - // 4-byte, 21-bit sequence? - if c0 < t5 { - r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) - if r <= rune3Max || MaxRune < r { - return RuneError, 1, false - } - return r, 4, false - } - - // error - return RuneError, 1, false -} - -func decodeRuneInStringInternal(s string) (r rune, size int, short bool) { - n := len(s) - if n < 1 { - return RuneError, 0, true - } - c0 := s[0] - - // 1-byte, 7-bit sequence? - if c0 < tx { - return rune(c0), 1, false - } - - // unexpected continuation byte? - if c0 < t2 { - return RuneError, 1, false - } - - // need first continuation byte - if n < 2 { - return RuneError, 1, true - } - c1 := s[1] - if c1 < tx || t2 <= c1 { - return RuneError, 1, false - } - - // 2-byte, 11-bit sequence? - if c0 < t3 { - r = rune(c0&mask2)<<6 | rune(c1&maskx) - if r <= rune1Max { - return RuneError, 1, false - } - return r, 2, false - } - - // need second continuation byte - if n < 3 { - return RuneError, 1, true - } - c2 := s[2] - if c2 < tx || t2 <= c2 { - return RuneError, 1, false - } - - // 3-byte, 16-bit sequence? - if c0 < t4 { - r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx) - if r <= rune2Max { - return RuneError, 1, false - } - if surrogateMin <= r && r <= surrogateMax { - return RuneError, 1, false - } - return r, 3, false - } - - // need third continuation byte - if n < 4 { - return RuneError, 1, true - } - c3 := s[3] - if c3 < tx || t2 <= c3 { - return RuneError, 1, false - } - - // 4-byte, 21-bit sequence? - if c0 < t5 { - r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) - if r <= rune3Max || MaxRune < r { - return RuneError, 1, false - } - return r, 4, false - } - - // error - return RuneError, 1, false -} - -// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune. -// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune. -func FullRune(p []byte) bool { - _, _, short := decodeRuneInternal(p) - return !short -} - -// FullRuneInString is like FullRune but its input is a string. -func FullRuneInString(s string) bool { - _, _, short := decodeRuneInStringInternal(s) - return !short -} - -// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes. -// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. -// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is -// out of range, or is not the shortest possible UTF-8 encoding for the -// value. No other validation is performed. -func DecodeRune(p []byte) (r rune, size int) { - r, size, _ = decodeRuneInternal(p) - return -} - -// DecodeRuneInString is like DecodeRune but its input is a string. -// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. -// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is -// out of range, or is not the shortest possible UTF-8 encoding for the -// value. No other validation is performed. -func DecodeRuneInString(s string) (r rune, size int) { - r, size, _ = decodeRuneInStringInternal(s) - return -} - -// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes. -// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. -// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is -// out of range, or is not the shortest possible UTF-8 encoding for the -// value. No other validation is performed. -func DecodeLastRune(p []byte) (r rune, size int) { - end := len(p) - if end == 0 { - return RuneError, 0 - } - start := end - 1 - r = rune(p[start]) - if r < RuneSelf { - return r, 1 - } - // guard against O(n^2) behavior when traversing - // backwards through strings with long sequences of - // invalid UTF-8. - lim := end - UTFMax - if lim < 0 { - lim = 0 - } - for start--; start >= lim; start-- { - if RuneStart(p[start]) { - break - } - } - if start < 0 { - start = 0 - } - r, size = DecodeRune(p[start:end]) - if start+size != end { - return RuneError, 1 - } - return r, size -} - -// DecodeLastRuneInString is like DecodeLastRune but its input is a string. -// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. -// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is -// out of range, or is not the shortest possible UTF-8 encoding for the -// value. No other validation is performed. -func DecodeLastRuneInString(s string) (r rune, size int) { - end := len(s) - if end == 0 { - return RuneError, 0 - } - start := end - 1 - r = rune(s[start]) - if r < RuneSelf { - return r, 1 - } - // guard against O(n^2) behavior when traversing - // backwards through strings with long sequences of - // invalid UTF-8. - lim := end - UTFMax - if lim < 0 { - lim = 0 - } - for start--; start >= lim; start-- { - if RuneStart(s[start]) { - break - } - } - if start < 0 { - start = 0 - } - r, size = DecodeRuneInString(s[start:end]) - if start+size != end { - return RuneError, 1 - } - return r, size -} - -// RuneLen returns the number of bytes required to encode the rune. -// It returns -1 if the rune is not a valid value to encode in UTF-8. -func RuneLen(r rune) int { - switch { - case r < 0: - return -1 - case r <= rune1Max: - return 1 - case r <= rune2Max: - return 2 - case surrogateMin <= r && r <= surrogateMax: - return -1 - case r <= rune3Max: - return 3 - case r <= MaxRune: - return 4 - } - return -1 -} - -// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune. -// It returns the number of bytes written. -func EncodeRune(p []byte, r rune) int { - // Negative values are erroneous. Making it unsigned addresses the problem. - switch i := uint32(r); { - case i <= rune1Max: - p[0] = byte(r) - return 1 - case i <= rune2Max: - p[0] = t2 | byte(r>>6) - p[1] = tx | byte(r)&maskx - return 2 - case i > MaxRune, surrogateMin <= i && i <= surrogateMax: - r = RuneError - fallthrough - case i <= rune3Max: - p[0] = t3 | byte(r>>12) - p[1] = tx | byte(r>>6)&maskx - p[2] = tx | byte(r)&maskx - return 3 - default: - p[0] = t4 | byte(r>>18) - p[1] = tx | byte(r>>12)&maskx - p[2] = tx | byte(r>>6)&maskx - p[3] = tx | byte(r)&maskx - return 4 - } -} - -// RuneCount returns the number of runes in p. Erroneous and short -// encodings are treated as single runes of width 1 byte. -func RuneCount(p []byte) int { - i := 0 - var n int - for n = 0; i < len(p); n++ { - if p[i] < RuneSelf { - i++ - } else { - _, size := DecodeRune(p[i:]) - i += size - } - } - return n -} - -// RuneCountInString is like RuneCount but its input is a string. -func RuneCountInString(s string) (n int) { - for _ = range s { - n++ - } - return -} - -// RuneStart reports whether the byte could be the first byte of -// an encoded rune. Second and subsequent bytes always have the top -// two bits set to 10. -func RuneStart(b byte) bool { return b&0xC0 != 0x80 } - -// Valid reports whether p consists entirely of valid UTF-8-encoded runes. -func Valid(p []byte) bool { - i := 0 - for i < len(p) { - if p[i] < RuneSelf { - i++ - } else { - _, size := DecodeRune(p[i:]) - if size == 1 { - // All valid runes of size 1 (those - // below RuneSelf) were handled above. - // This must be a RuneError. - return false - } - i += size - } - } - return true -} - -// ValidString reports whether s consists entirely of valid UTF-8-encoded runes. -func ValidString(s string) bool { - for i, r := range s { - if r == RuneError { - // The RuneError value can be an error - // sentinel value (if it's size 1) or the same - // value encoded properly. Decode it to see if - // it's the 1 byte sentinel value. - _, size := DecodeRuneInString(s[i:]) - if size == 1 { - return false - } - } - } - return true -} - -// ValidRune reports whether r can be legally encoded as UTF-8. -// Code points that are out of range or a surrogate half are illegal. -func ValidRune(r rune) bool { - switch { - case r < 0: - return false - case surrogateMin <= r && r <= surrogateMax: - return false - case r > MaxRune: - return false - } - return true -} diff --git a/src/pkg/unicode/utf8/utf8_test.go b/src/pkg/unicode/utf8/utf8_test.go deleted file mode 100644 index 758d7a0f8..000000000 --- a/src/pkg/unicode/utf8/utf8_test.go +++ /dev/null @@ -1,444 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package utf8_test - -import ( - "bytes" - "testing" - "unicode" - . "unicode/utf8" -) - -// Validate the constants redefined from unicode. -func init() { - if MaxRune != unicode.MaxRune { - panic("utf8.MaxRune is wrong") - } - if RuneError != unicode.ReplacementChar { - panic("utf8.RuneError is wrong") - } -} - -// Validate the constants redefined from unicode. -func TestConstants(t *testing.T) { - if MaxRune != unicode.MaxRune { - t.Errorf("utf8.MaxRune is wrong: %x should be %x", MaxRune, unicode.MaxRune) - } - if RuneError != unicode.ReplacementChar { - t.Errorf("utf8.RuneError is wrong: %x should be %x", RuneError, unicode.ReplacementChar) - } -} - -type Utf8Map struct { - r rune - str string -} - -var utf8map = []Utf8Map{ - {0x0000, "\x00"}, - {0x0001, "\x01"}, - {0x007e, "\x7e"}, - {0x007f, "\x7f"}, - {0x0080, "\xc2\x80"}, - {0x0081, "\xc2\x81"}, - {0x00bf, "\xc2\xbf"}, - {0x00c0, "\xc3\x80"}, - {0x00c1, "\xc3\x81"}, - {0x00c8, "\xc3\x88"}, - {0x00d0, "\xc3\x90"}, - {0x00e0, "\xc3\xa0"}, - {0x00f0, "\xc3\xb0"}, - {0x00f8, "\xc3\xb8"}, - {0x00ff, "\xc3\xbf"}, - {0x0100, "\xc4\x80"}, - {0x07ff, "\xdf\xbf"}, - {0x0800, "\xe0\xa0\x80"}, - {0x0801, "\xe0\xa0\x81"}, - {0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half. - {0xe000, "\xee\x80\x80"}, // first code point after surrogate half. - {0xfffe, "\xef\xbf\xbe"}, - {0xffff, "\xef\xbf\xbf"}, - {0x10000, "\xf0\x90\x80\x80"}, - {0x10001, "\xf0\x90\x80\x81"}, - {0x10fffe, "\xf4\x8f\xbf\xbe"}, - {0x10ffff, "\xf4\x8f\xbf\xbf"}, - {0xFFFD, "\xef\xbf\xbd"}, -} - -var surrogateMap = []Utf8Map{ - {0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1) - {0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1) -} - -var testStrings = []string{ - "", - "abcd", - "☺☻☹", - "日a本b語ç日ð本Ê語þ日¥本¼語i日©", - "日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©", - "\x80\x80\x80\x80", -} - -func TestFullRune(t *testing.T) { - for _, m := range utf8map { - b := []byte(m.str) - if !FullRune(b) { - t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r) - } - s := m.str - if !FullRuneInString(s) { - t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r) - } - b1 := b[0 : len(b)-1] - if FullRune(b1) { - t.Errorf("FullRune(%q) = true, want false", b1) - } - s1 := string(b1) - if FullRuneInString(s1) { - t.Errorf("FullRune(%q) = true, want false", s1) - } - } -} - -func TestEncodeRune(t *testing.T) { - for _, m := range utf8map { - b := []byte(m.str) - var buf [10]byte - n := EncodeRune(buf[0:], m.r) - b1 := buf[0:n] - if !bytes.Equal(b, b1) { - t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b) - } - } -} - -func TestDecodeRune(t *testing.T) { - for _, m := range utf8map { - b := []byte(m.str) - r, size := DecodeRune(b) - if r != m.r || size != len(b) { - t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b)) - } - s := m.str - r, size = DecodeRuneInString(s) - if r != m.r || size != len(b) { - t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b)) - } - - // there's an extra byte that bytes left behind - make sure trailing byte works - r, size = DecodeRune(b[0:cap(b)]) - if r != m.r || size != len(b) { - t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b)) - } - s = m.str + "\x00" - r, size = DecodeRuneInString(s) - if r != m.r || size != len(b) { - t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b)) - } - - // make sure missing bytes fail - wantsize := 1 - if wantsize >= len(b) { - wantsize = 0 - } - r, size = DecodeRune(b[0 : len(b)-1]) - if r != RuneError || size != wantsize { - t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[0:len(b)-1], r, size, RuneError, wantsize) - } - s = m.str[0 : len(m.str)-1] - r, size = DecodeRuneInString(s) - if r != RuneError || size != wantsize { - t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize) - } - - // make sure bad sequences fail - if len(b) == 1 { - b[0] = 0x80 - } else { - b[len(b)-1] = 0x7F - } - r, size = DecodeRune(b) - if r != RuneError || size != 1 { - t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1) - } - s = string(b) - r, size = DecodeRuneInString(s) - if r != RuneError || size != 1 { - t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1) - } - - } -} - -func TestDecodeSurrogateRune(t *testing.T) { - for _, m := range surrogateMap { - b := []byte(m.str) - r, size := DecodeRune(b) - if r != RuneError || size != 1 { - t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1) - } - s := m.str - r, size = DecodeRuneInString(s) - if r != RuneError || size != 1 { - t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1) - } - } -} - -// Check that DecodeRune and DecodeLastRune correspond to -// the equivalent range loop. -func TestSequencing(t *testing.T) { - for _, ts := range testStrings { - for _, m := range utf8map { - for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} { - testSequence(t, s) - } - } - } -} - -// Check that a range loop and a []int conversion visit the same runes. -// Not really a test of this package, but the assumption is used here and -// it's good to verify -func TestIntConversion(t *testing.T) { - for _, ts := range testStrings { - runes := []rune(ts) - if RuneCountInString(ts) != len(runes) { - t.Errorf("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts)) - break - } - i := 0 - for _, r := range ts { - if r != runes[i] { - t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r) - } - i++ - } - } -} - -func testSequence(t *testing.T, s string) { - type info struct { - index int - r rune - } - index := make([]info, len(s)) - b := []byte(s) - si := 0 - j := 0 - for i, r := range s { - if si != i { - t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i) - return - } - index[j] = info{i, r} - j++ - r1, size1 := DecodeRune(b[i:]) - if r != r1 { - t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r1, r) - return - } - r2, size2 := DecodeRuneInString(s[i:]) - if r != r2 { - t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], r2, r) - return - } - if size1 != size2 { - t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2) - return - } - si += size1 - } - j-- - for si = len(s); si > 0; { - r1, size1 := DecodeLastRune(b[0:si]) - r2, size2 := DecodeLastRuneInString(s[0:si]) - if size1 != size2 { - t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, si, size1, size2) - return - } - if r1 != index[j].r { - t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r1, index[j].r) - return - } - if r2 != index[j].r { - t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, si, r2, index[j].r) - return - } - si -= size1 - if si != index[j].index { - t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index) - return - } - j-- - } - if si != 0 { - t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si) - } -} - -// Check that negative runes encode as U+FFFD. -func TestNegativeRune(t *testing.T) { - errorbuf := make([]byte, UTFMax) - errorbuf = errorbuf[0:EncodeRune(errorbuf, RuneError)] - buf := make([]byte, UTFMax) - buf = buf[0:EncodeRune(buf, -1)] - if !bytes.Equal(buf, errorbuf) { - t.Errorf("incorrect encoding [% x] for -1; expected [% x]", buf, errorbuf) - } -} - -type RuneCountTest struct { - in string - out int -} - -var runecounttests = []RuneCountTest{ - {"abcd", 4}, - {"☺☻☹", 3}, - {"1,2,3,4", 7}, - {"\xe2\x00", 2}, -} - -func TestRuneCount(t *testing.T) { - for _, tt := range runecounttests { - if out := RuneCountInString(tt.in); out != tt.out { - t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out) - } - if out := RuneCount([]byte(tt.in)); out != tt.out { - t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out) - } - } -} - -type RuneLenTest struct { - r rune - size int -} - -var runelentests = []RuneLenTest{ - {0, 1}, - {'e', 1}, - {'é', 2}, - {'☺', 3}, - {RuneError, 3}, - {MaxRune, 4}, - {0xD800, -1}, - {0xDFFF, -1}, - {MaxRune + 1, -1}, - {-1, -1}, -} - -func TestRuneLen(t *testing.T) { - for _, tt := range runelentests { - if size := RuneLen(tt.r); size != tt.size { - t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size) - } - } -} - -type ValidTest struct { - in string - out bool -} - -var validTests = []ValidTest{ - {"", true}, - {"a", true}, - {"abc", true}, - {"Ж", true}, - {"ЖЖ", true}, - {"брэд-ЛГТМ", true}, - {"☺☻☹", true}, - {string([]byte{66, 250}), false}, - {string([]byte{66, 250, 67}), false}, - {"a\uFFFDb", true}, - {string("\xF4\x8F\xBF\xBF"), true}, // U+10FFFF - {string("\xF4\x90\x80\x80"), false}, // U+10FFFF+1; out of range - {string("\xF7\xBF\xBF\xBF"), false}, // 0x1FFFFF; out of range - {string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range - {string("\xc0\x80"), false}, // U+0000 encoded in two bytes: incorrect - {string("\xed\xa0\x80"), false}, // U+D800 high surrogate (sic) - {string("\xed\xbf\xbf"), false}, // U+DFFF low surrogate (sic) -} - -func TestValid(t *testing.T) { - for _, tt := range validTests { - if Valid([]byte(tt.in)) != tt.out { - t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out) - } - if ValidString(tt.in) != tt.out { - t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out) - } - } -} - -type ValidRuneTest struct { - r rune - ok bool -} - -var validrunetests = []ValidRuneTest{ - {0, true}, - {'e', true}, - {'é', true}, - {'☺', true}, - {RuneError, true}, - {MaxRune, true}, - {0xD7FF, true}, - {0xD800, false}, - {0xDFFF, false}, - {0xE000, true}, - {MaxRune + 1, false}, - {-1, false}, -} - -func TestValidRune(t *testing.T) { - for _, tt := range validrunetests { - if ok := ValidRune(tt.r); ok != tt.ok { - t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok) - } - } -} - -func BenchmarkRuneCountTenASCIIChars(b *testing.B) { - for i := 0; i < b.N; i++ { - RuneCountInString("0123456789") - } -} - -func BenchmarkRuneCountTenJapaneseChars(b *testing.B) { - for i := 0; i < b.N; i++ { - RuneCountInString("日本語日本語日本語日") - } -} - -func BenchmarkEncodeASCIIRune(b *testing.B) { - buf := make([]byte, UTFMax) - for i := 0; i < b.N; i++ { - EncodeRune(buf, 'a') - } -} - -func BenchmarkEncodeJapaneseRune(b *testing.B) { - buf := make([]byte, UTFMax) - for i := 0; i < b.N; i++ { - EncodeRune(buf, '本') - } -} - -func BenchmarkDecodeASCIIRune(b *testing.B) { - a := []byte{'a'} - for i := 0; i < b.N; i++ { - DecodeRune(a) - } -} - -func BenchmarkDecodeJapaneseRune(b *testing.B) { - nihon := []byte("本") - for i := 0; i < b.N; i++ { - DecodeRune(nihon) - } -} |