summaryrefslogtreecommitdiff
path: root/src/pkg/unicode/utf8
diff options
context:
space:
mode:
authorMichael Stapelberg <stapelberg@debian.org>2013-03-04 21:27:36 +0100
committerMichael Stapelberg <michael@stapelberg.de>2013-03-04 21:27:36 +0100
commit04b08da9af0c450d645ab7389d1467308cfc2db8 (patch)
treedb247935fa4f2f94408edc3acd5d0d4f997aa0d8 /src/pkg/unicode/utf8
parent917c5fb8ec48e22459d77e3849e6d388f93d3260 (diff)
downloadgolang-04b08da9af0c450d645ab7389d1467308cfc2db8.tar.gz
Imported Upstream version 1.1~hg20130304upstream/1.1_hg20130304
Diffstat (limited to 'src/pkg/unicode/utf8')
-rw-r--r--src/pkg/unicode/utf8/example_test.go192
-rw-r--r--src/pkg/unicode/utf8/utf8.go56
-rw-r--r--src/pkg/unicode/utf8/utf8_test.go101
3 files changed, 333 insertions, 16 deletions
diff --git a/src/pkg/unicode/utf8/example_test.go b/src/pkg/unicode/utf8/example_test.go
new file mode 100644
index 000000000..fe2037336
--- /dev/null
+++ b/src/pkg/unicode/utf8/example_test.go
@@ -0,0 +1,192 @@
+package utf8_test
+
+import (
+ "fmt"
+ "unicode/utf8"
+)
+
+func ExampleDecodeLastRune() {
+ b := []byte("Hello, 世界")
+
+ for len(b) > 0 {
+ r, size := utf8.DecodeLastRune(b)
+ fmt.Printf("%c %v\n", r, size)
+
+ b = b[:len(b)-size]
+ }
+ // Output:
+ // 界 3
+ // 世 3
+ // 1
+ // , 1
+ // o 1
+ // l 1
+ // l 1
+ // e 1
+ // H 1
+}
+
+func ExampleDecodeLastRuneInString() {
+ str := "Hello, 世界"
+
+ for len(str) > 0 {
+ r, size := utf8.DecodeLastRuneInString(str)
+ fmt.Printf("%c %v\n", r, size)
+
+ str = str[:len(str)-size]
+ }
+ // Output:
+ // 界 3
+ // 世 3
+ // 1
+ // , 1
+ // o 1
+ // l 1
+ // l 1
+ // e 1
+ // H 1
+
+}
+
+func ExampleDecodeRune() {
+ b := []byte("Hello, 世界")
+
+ for len(b) > 0 {
+ r, size := utf8.DecodeRune(b)
+ fmt.Printf("%c %v\n", r, size)
+
+ b = b[size:]
+ }
+ // Output:
+ // H 1
+ // e 1
+ // l 1
+ // l 1
+ // o 1
+ // , 1
+ // 1
+ // 世 3
+ // 界 3
+}
+
+func ExampleDecodeRuneInString() {
+ str := "Hello, 世界"
+
+ for len(str) > 0 {
+ r, size := utf8.DecodeRuneInString(str)
+ fmt.Printf("%c %v\n", r, size)
+
+ str = str[size:]
+ }
+ // Output:
+ // H 1
+ // e 1
+ // l 1
+ // l 1
+ // o 1
+ // , 1
+ // 1
+ // 世 3
+ // 界 3
+}
+
+func ExampleEncodeRune() {
+ r := '世'
+ buf := make([]byte, 3)
+
+ n := utf8.EncodeRune(buf, r)
+
+ fmt.Println(buf)
+ fmt.Println(n)
+ // Output:
+ // [228 184 150]
+ // 3
+}
+
+func ExampleFullRune() {
+ buf := []byte{228, 184, 150} // 世
+ fmt.Println(utf8.FullRune(buf))
+ fmt.Println(utf8.FullRune(buf[:2]))
+ // Output:
+ // true
+ // false
+}
+
+func ExampleFullRuneInString() {
+ str := "世"
+ fmt.Println(utf8.FullRuneInString(str))
+ fmt.Println(utf8.FullRuneInString(str[:2]))
+ // Output:
+ // true
+ // false
+}
+
+func ExampleRuneCount() {
+ buf := []byte("Hello, 世界")
+ fmt.Println("bytes =", len(buf))
+ fmt.Println("runes =", utf8.RuneCount(buf))
+ // Output:
+ // bytes = 13
+ // runes = 9
+}
+
+func ExampleRuneCountInString() {
+ str := "Hello, 世界"
+ fmt.Println("bytes =", len(str))
+ fmt.Println("runes =", utf8.RuneCountInString(str))
+ // Output:
+ // bytes = 13
+ // runes = 9
+}
+
+func ExampleRuneLen() {
+ fmt.Println(utf8.RuneLen('a'))
+ fmt.Println(utf8.RuneLen('界'))
+ // Output:
+ // 1
+ // 3
+}
+
+func ExampleRuneStart() {
+ buf := []byte("a界")
+ fmt.Println(utf8.RuneStart(buf[0]))
+ fmt.Println(utf8.RuneStart(buf[1]))
+ fmt.Println(utf8.RuneStart(buf[2]))
+ // Output:
+ // true
+ // true
+ // false
+}
+
+func ExampleValid() {
+ valid := []byte("Hello, 世界")
+ invalid := []byte{0xff, 0xfe, 0xfd}
+
+ fmt.Println(utf8.Valid(valid))
+ fmt.Println(utf8.Valid(invalid))
+ // Output:
+ // true
+ // false
+}
+
+func ExampleValidRune() {
+ valid := 'a'
+ invalid := rune(0xfffffff)
+
+ fmt.Println(utf8.ValidRune(valid))
+ fmt.Println(utf8.ValidRune(invalid))
+ // Output:
+ // true
+ // false
+}
+
+func ExampleValidString() {
+ valid := "Hello, 世界"
+ invalid := string([]byte{0xff, 0xfe, 0xfd})
+
+ fmt.Println(utf8.ValidString(valid))
+ fmt.Println(utf8.ValidString(invalid))
+ // Output:
+ // true
+ // false
+}
diff --git a/src/pkg/unicode/utf8/utf8.go b/src/pkg/unicode/utf8/utf8.go
index 57ea19e96..93d0be5e0 100644
--- a/src/pkg/unicode/utf8/utf8.go
+++ b/src/pkg/unicode/utf8/utf8.go
@@ -18,6 +18,12 @@ const (
UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character.
)
+// Code points in the surrogate range are not valid for UTF-8.
+const (
+ surrogateMin = 0xD800
+ surrogateMax = 0xDFFF
+)
+
const (
t1 = 0x00 // 0000 0000
tx = 0x80 // 1000 0000
@@ -34,7 +40,6 @@ const (
rune1Max = 1<<7 - 1
rune2Max = 1<<11 - 1
rune3Max = 1<<16 - 1
- rune4Max = 1<<21 - 1
)
func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
@@ -87,6 +92,9 @@ func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
if r <= rune2Max {
return RuneError, 1, false
}
+ if surrogateMin <= r && r <= surrogateMax {
+ return RuneError, 1, false
+ }
return r, 3, false
}
@@ -102,7 +110,7 @@ func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
// 4-byte, 21-bit sequence?
if c0 < t5 {
r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
- if r <= rune3Max {
+ if r <= rune3Max || MaxRune < r {
return RuneError, 1, false
}
return r, 4, false
@@ -162,6 +170,9 @@ func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
if r <= rune2Max {
return RuneError, 1, false
}
+ if surrogateMin <= r && r <= surrogateMax {
+ return RuneError, 1, false
+ }
return r, 3, false
}
@@ -177,7 +188,7 @@ func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
// 4-byte, 21-bit sequence?
if c0 < t5 {
r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
- if r <= rune3Max {
+ if r <= rune3Max || MaxRune < r {
return RuneError, 1, false
}
return r, 4, false
@@ -202,6 +213,9 @@ func FullRuneInString(s string) bool {
// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
func DecodeRune(p []byte) (r rune, size int) {
r, size, _ = decodeRuneInternal(p)
return
@@ -209,6 +223,9 @@ func DecodeRune(p []byte) (r rune, size int) {
// DecodeRuneInString is like DecodeRune but its input is a string.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
func DecodeRuneInString(s string) (r rune, size int) {
r, size, _ = decodeRuneInStringInternal(s)
return
@@ -216,6 +233,9 @@ func DecodeRuneInString(s string) (r rune, size int) {
// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
func DecodeLastRune(p []byte) (r rune, size int) {
end := len(p)
if end == 0 {
@@ -250,6 +270,9 @@ func DecodeLastRune(p []byte) (r rune, size int) {
// DecodeLastRuneInString is like DecodeLastRune but its input is a string.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
func DecodeLastRuneInString(s string) (r rune, size int) {
end := len(s)
if end == 0 {
@@ -283,15 +306,20 @@ func DecodeLastRuneInString(s string) (r rune, size int) {
}
// RuneLen returns the number of bytes required to encode the rune.
+// It returns -1 if the rune is not a valid value to encode in UTF-8.
func RuneLen(r rune) int {
switch {
+ case r < 0:
+ return -1
case r <= rune1Max:
return 1
case r <= rune2Max:
return 2
+ case surrogateMin <= r && r <= surrogateMax:
+ return -1
case r <= rune3Max:
return 3
- case r <= rune4Max:
+ case r <= MaxRune:
return 4
}
return -1
@@ -316,6 +344,10 @@ func EncodeRune(p []byte, r rune) int {
r = RuneError
}
+ if surrogateMin <= r && r <= surrogateMax {
+ r = RuneError
+ }
+
if uint32(r) <= rune3Max {
p[0] = t3 | byte(r>>12)
p[1] = tx | byte(r>>6)&maskx
@@ -368,7 +400,7 @@ func Valid(p []byte) bool {
} else {
_, size := DecodeRune(p[i:])
if size == 1 {
- // All valid runes of size of 1 (those
+ // All valid runes of size 1 (those
// below RuneSelf) were handled above.
// This must be a RuneError.
return false
@@ -395,3 +427,17 @@ func ValidString(s string) bool {
}
return true
}
+
+// ValidRune reports whether r can be legally encoded as UTF-8.
+// Code points that are out of range or a surrogate half are illegal.
+func ValidRune(r rune) bool {
+ switch {
+ case r < 0:
+ return false
+ case surrogateMin <= r && r <= surrogateMax:
+ return false
+ case r > MaxRune:
+ return false
+ }
+ return true
+}
diff --git a/src/pkg/unicode/utf8/utf8_test.go b/src/pkg/unicode/utf8/utf8_test.go
index 4f73c8fb8..c516871c9 100644
--- a/src/pkg/unicode/utf8/utf8_test.go
+++ b/src/pkg/unicode/utf8/utf8_test.go
@@ -56,6 +56,8 @@ var utf8map = []Utf8Map{
{0x07ff, "\xdf\xbf"},
{0x0800, "\xe0\xa0\x80"},
{0x0801, "\xe0\xa0\x81"},
+ {0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half.
+ {0xe000, "\xee\x80\x80"}, // first code point after surrogate half.
{0xfffe, "\xef\xbf\xbe"},
{0xffff, "\xef\xbf\xbf"},
{0x10000, "\xf0\x90\x80\x80"},
@@ -65,6 +67,11 @@ var utf8map = []Utf8Map{
{0xFFFD, "\xef\xbf\xbd"},
}
+var surrogateMap = []Utf8Map{
+ {0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
+ {0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
+}
+
var testStrings = []string{
"",
"abcd",
@@ -75,8 +82,7 @@ var testStrings = []string{
}
func TestFullRune(t *testing.T) {
- for i := 0; i < len(utf8map); i++ {
- m := utf8map[i]
+ for _, m := range utf8map {
b := []byte(m.str)
if !FullRune(b) {
t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
@@ -97,8 +103,7 @@ func TestFullRune(t *testing.T) {
}
func TestEncodeRune(t *testing.T) {
- for i := 0; i < len(utf8map); i++ {
- m := utf8map[i]
+ for _, m := range utf8map {
b := []byte(m.str)
var buf [10]byte
n := EncodeRune(buf[0:], m.r)
@@ -110,8 +115,7 @@ func TestEncodeRune(t *testing.T) {
}
func TestDecodeRune(t *testing.T) {
- for i := 0; i < len(utf8map); i++ {
- m := utf8map[i]
+ for _, m := range utf8map {
b := []byte(m.str)
r, size := DecodeRune(b)
if r != m.r || size != len(b) {
@@ -168,6 +172,21 @@ func TestDecodeRune(t *testing.T) {
}
}
+func TestDecodeSurrogateRune(t *testing.T) {
+ for _, m := range surrogateMap {
+ b := []byte(m.str)
+ r, size := DecodeRune(b)
+ if r != RuneError || size != 1 {
+ t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
+ }
+ s := m.str
+ r, size = DecodeRuneInString(s)
+ if r != RuneError || size != 1 {
+ t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
+ }
+ }
+}
+
// Check that DecodeRune and DecodeLastRune correspond to
// the equivalent range loop.
func TestSequencing(t *testing.T) {
@@ -284,8 +303,7 @@ var runecounttests = []RuneCountTest{
}
func TestRuneCount(t *testing.T) {
- for i := 0; i < len(runecounttests); i++ {
- tt := runecounttests[i]
+ for _, tt := range runecounttests {
if out := RuneCountInString(tt.in); out != tt.out {
t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
}
@@ -295,6 +313,32 @@ func TestRuneCount(t *testing.T) {
}
}
+type RuneLenTest struct {
+ r rune
+ size int
+}
+
+var runelentests = []RuneLenTest{
+ {0, 1},
+ {'e', 1},
+ {'é', 2},
+ {'☺', 3},
+ {RuneError, 3},
+ {MaxRune, 4},
+ {0xD800, -1},
+ {0xDFFF, -1},
+ {MaxRune + 1, -1},
+ {-1, -1},
+}
+
+func TestRuneLen(t *testing.T) {
+ for _, tt := range runelentests {
+ if size := RuneLen(tt.r); size != tt.size {
+ t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size)
+ }
+ }
+}
+
type ValidTest struct {
in string
out bool
@@ -311,15 +355,50 @@ var validTests = []ValidTest{
{string([]byte{66, 250}), false},
{string([]byte{66, 250, 67}), false},
{"a\uFFFDb", true},
+ {string("\xF4\x8F\xBF\xBF"), true}, // U+10FFFF
+ {string("\xF4\x90\x80\x80"), false}, // U+10FFFF+1; out of range
+ {string("\xF7\xBF\xBF\xBF"), false}, // 0x1FFFFF; out of range
+ {string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
+ {string("\xc0\x80"), false}, // U+0000 encoded in two bytes: incorrect
+ {string("\xed\xa0\x80"), false}, // U+D800 high surrogate (sic)
+ {string("\xed\xbf\xbf"), false}, // U+DFFF low surrogate (sic)
}
func TestValid(t *testing.T) {
- for i, tt := range validTests {
+ for _, tt := range validTests {
if Valid([]byte(tt.in)) != tt.out {
- t.Errorf("%d. Valid(%q) = %v; want %v", i, tt.in, !tt.out, tt.out)
+ t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out)
}
if ValidString(tt.in) != tt.out {
- t.Errorf("%d. ValidString(%q) = %v; want %v", i, tt.in, !tt.out, tt.out)
+ t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out)
+ }
+ }
+}
+
+type ValidRuneTest struct {
+ r rune
+ ok bool
+}
+
+var validrunetests = []ValidRuneTest{
+ {0, true},
+ {'e', true},
+ {'é', true},
+ {'☺', true},
+ {RuneError, true},
+ {MaxRune, true},
+ {0xD7FF, true},
+ {0xD800, false},
+ {0xDFFF, false},
+ {0xE000, true},
+ {MaxRune + 1, false},
+ {-1, false},
+}
+
+func TestValidRune(t *testing.T) {
+ for _, tt := range validrunetests {
+ if ok := ValidRune(tt.r); ok != tt.ok {
+ t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok)
}
}
}