summaryrefslogtreecommitdiff
path: root/src/pkg/unicode
diff options
context:
space:
mode:
Diffstat (limited to 'src/pkg/unicode')
-rw-r--r--src/pkg/unicode/letter.go4
-rw-r--r--src/pkg/unicode/letter_test.go16
-rw-r--r--src/pkg/unicode/maketables.go8
-rw-r--r--src/pkg/unicode/script_test.go2
-rw-r--r--src/pkg/unicode/tables.go65
-rw-r--r--src/pkg/unicode/utf16/utf16.go2
-rw-r--r--src/pkg/unicode/utf16/utf16_test.go48
-rw-r--r--src/pkg/unicode/utf8/example_test.go4
-rw-r--r--src/pkg/unicode/utf8/utf8.go32
9 files changed, 115 insertions, 66 deletions
diff --git a/src/pkg/unicode/letter.go b/src/pkg/unicode/letter.go
index fadaa57d8..977bd2b3b 100644
--- a/src/pkg/unicode/letter.go
+++ b/src/pkg/unicode/letter.go
@@ -74,7 +74,7 @@ const (
type d [MaxCase]rune // to make the CaseRanges text shorter
-// If the Delta field of a CaseRange is UpperLower or LowerUpper, it means
+// If the Delta field of a CaseRange is UpperLower, it means
// this CaseRange represents a sequence of the form (say)
// Upper Lower Upper Lower.
const (
@@ -316,7 +316,7 @@ type foldPair struct {
// SimpleFold iterates over Unicode code points equivalent under
// the Unicode-defined simple case folding. Among the code points
// equivalent to rune (including rune itself), SimpleFold returns the
-// smallest rune >= r if one exists, or else the smallest rune >= 0.
+// smallest rune > r if one exists, or else the smallest rune >= 0.
//
// For example:
// SimpleFold('A') = 'a'
diff --git a/src/pkg/unicode/letter_test.go b/src/pkg/unicode/letter_test.go
index e4d5572a0..4ee11fb36 100644
--- a/src/pkg/unicode/letter_test.go
+++ b/src/pkg/unicode/letter_test.go
@@ -387,32 +387,20 @@ func TestTurkishCase(t *testing.T) {
}
var simpleFoldTests = []string{
- // SimpleFold could order its returned slices in any order it wants,
- // but we know it orders them in increasing order starting at in
- // and looping around from MaxRune to 0.
+ // SimpleFold(x) returns the next equivalent rune > x or wraps
+ // around to smaller values.
// Easy cases.
"Aa",
- "aA",
"δΔ",
- "Δδ",
// ASCII special cases.
"KkK",
- "kKK",
- "KKk",
"Ssſ",
- "sſS",
- "ſSs",
// Non-ASCII special cases.
"ρϱΡ",
- "ϱΡρ",
- "Ρρϱ",
"ͅΙιι",
- "Ιιιͅ",
- "ιιͅΙ",
- "ιͅΙι",
// Extra special cases: has lower/upper but no case fold.
"İ",
diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go
index e5ed08b23..8116ab8a4 100644
--- a/src/pkg/unicode/maketables.go
+++ b/src/pkg/unicode/maketables.go
@@ -40,7 +40,7 @@ func main() {
var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
var url = flag.String("url",
- "http://www.unicode.org/Public/6.2.0/ucd/",
+ "http://www.unicode.org/Public/6.3.0/ucd/",
"URL of Unicode database directory")
var tablelist = flag.String("tables",
"all",
@@ -386,7 +386,11 @@ func loadCasefold() {
}
}
-const progHeader = `// Generated by running
+const progHeader = `// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Generated by running
// maketables --tables=%s --data=%s --casefolding=%s
// DO NOT EDIT
diff --git a/src/pkg/unicode/script_test.go b/src/pkg/unicode/script_test.go
index 395cc71a0..e2ba0011a 100644
--- a/src/pkg/unicode/script_test.go
+++ b/src/pkg/unicode/script_test.go
@@ -182,7 +182,7 @@ var inPropTest = []T{
{0x0EC4, "Logical_Order_Exception"},
{0x2FFFF, "Noncharacter_Code_Point"},
{0x065E, "Other_Alphabetic"},
- {0x2069, "Other_Default_Ignorable_Code_Point"},
+ {0x2065, "Other_Default_Ignorable_Code_Point"},
{0x0BD7, "Other_Grapheme_Extend"},
{0x0387, "Other_ID_Continue"},
{0x212E, "Other_ID_Start"},
diff --git a/src/pkg/unicode/tables.go b/src/pkg/unicode/tables.go
index 939c41dc5..5670d1c5b 100644
--- a/src/pkg/unicode/tables.go
+++ b/src/pkg/unicode/tables.go
@@ -1,11 +1,15 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
// Generated by running
-// maketables --tables=all --data=http://www.unicode.org/Public/6.2.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.2.0/ucd/CaseFolding.txt
+// maketables --tables=all --data=http://www.unicode.org/Public/6.3.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.3.0/ucd/CaseFolding.txt
// DO NOT EDIT
package unicode
// Version is the Unicode edition from which the tables are derived.
-const Version = "6.2.0"
+const Version = "6.3.0"
// Categories is the set of Unicode category tables.
var Categories = map[string]*RangeTable{
@@ -53,11 +57,12 @@ var _C = &RangeTable{
{0x007f, 0x009f, 1},
{0x00ad, 0x0600, 1363},
{0x0601, 0x0604, 1},
- {0x06dd, 0x070f, 50},
+ {0x061c, 0x06dd, 193},
+ {0x070f, 0x180e, 4351},
{0x200b, 0x200f, 1},
{0x202a, 0x202e, 1},
{0x2060, 0x2064, 1},
- {0x206a, 0x206f, 1},
+ {0x2066, 0x206f, 1},
{0xd800, 0xf8ff, 1},
{0xfeff, 0xfff9, 250},
{0xfffa, 0xfffb, 1},
@@ -85,11 +90,12 @@ var _Cf = &RangeTable{
R16: []Range16{
{0x00ad, 0x0600, 1363},
{0x0601, 0x0604, 1},
- {0x06dd, 0x070f, 50},
+ {0x061c, 0x06dd, 193},
+ {0x070f, 0x180e, 4351},
{0x200b, 0x200f, 1},
{0x202a, 0x202e, 1},
{0x2060, 0x2064, 1},
- {0x206a, 0x206f, 1},
+ {0x2066, 0x206f, 1},
{0xfeff, 0xfff9, 250},
{0xfffa, 0xfffb, 1},
},
@@ -1545,7 +1551,7 @@ var _Mc = &RangeTable{
{0x1933, 0x1938, 1},
{0x19b0, 0x19c0, 1},
{0x19c8, 0x19c9, 1},
- {0x1a19, 0x1a1b, 1},
+ {0x1a19, 0x1a1a, 1},
{0x1a55, 0x1a57, 2},
{0x1a61, 0x1a63, 2},
{0x1a64, 0x1a6d, 9},
@@ -1717,8 +1723,8 @@ var _Mn = &RangeTable{
{0x1932, 0x1939, 7},
{0x193a, 0x193b, 1},
{0x1a17, 0x1a18, 1},
- {0x1a56, 0x1a58, 2},
- {0x1a59, 0x1a5e, 1},
+ {0x1a1b, 0x1a56, 59},
+ {0x1a58, 0x1a5e, 1},
{0x1a60, 0x1a62, 2},
{0x1a65, 0x1a6c, 1},
{0x1a73, 0x1a7c, 1},
@@ -2086,6 +2092,7 @@ var _P = &RangeTable{
{0x2053, 0x205e, 1},
{0x207d, 0x207e, 1},
{0x208d, 0x208e, 1},
+ {0x2308, 0x230b, 1},
{0x2329, 0x232a, 1},
{0x2768, 0x2775, 1},
{0x27c5, 0x27c6, 1},
@@ -2183,7 +2190,8 @@ var _Pe = &RangeTable{
{0x007d, 0x0f3b, 3774},
{0x0f3d, 0x169c, 1887},
{0x2046, 0x207e, 56},
- {0x208e, 0x232a, 668},
+ {0x208e, 0x2309, 635},
+ {0x230b, 0x232a, 31},
{0x2769, 0x2775, 2},
{0x27c6, 0x27e7, 33},
{0x27e9, 0x27ef, 2},
@@ -2360,7 +2368,8 @@ var _Ps = &RangeTable{
{0x0f3c, 0x169b, 1887},
{0x201a, 0x201e, 4},
{0x2045, 0x207d, 56},
- {0x208d, 0x2329, 668},
+ {0x208d, 0x2308, 635},
+ {0x230a, 0x2329, 31},
{0x2768, 0x2774, 2},
{0x27c5, 0x27e6, 33},
{0x27e8, 0x27ee, 2},
@@ -2450,7 +2459,8 @@ var _S = &RangeTable{
{0x2141, 0x2144, 1},
{0x214a, 0x214d, 1},
{0x214f, 0x2190, 65},
- {0x2191, 0x2328, 1},
+ {0x2191, 0x2307, 1},
+ {0x230c, 0x2328, 1},
{0x232b, 0x23f3, 1},
{0x2400, 0x2426, 1},
{0x2440, 0x244a, 1},
@@ -2630,7 +2640,6 @@ var _Sm = &RangeTable{
{0x21cf, 0x21d2, 3},
{0x21d4, 0x21f4, 32},
{0x21f5, 0x22ff, 1},
- {0x2308, 0x230b, 1},
{0x2320, 0x2321, 1},
{0x237c, 0x239b, 31},
{0x239c, 0x23b3, 1},
@@ -2818,8 +2827,8 @@ var _So = &RangeTable{
var _Z = &RangeTable{
R16: []Range16{
{0x0020, 0x00a0, 128},
- {0x1680, 0x180e, 398},
- {0x2000, 0x200a, 1},
+ {0x1680, 0x2000, 2432},
+ {0x2001, 0x200a, 1},
{0x2028, 0x2029, 1},
{0x202f, 0x205f, 48},
{0x3000, 0x3000, 1},
@@ -2842,8 +2851,8 @@ var _Zp = &RangeTable{
var _Zs = &RangeTable{
R16: []Range16{
{0x0020, 0x00a0, 128},
- {0x1680, 0x180e, 398},
- {0x2000, 0x200a, 1},
+ {0x1680, 0x2000, 2432},
+ {0x2001, 0x200a, 1},
{0x202f, 0x205f, 48},
{0x3000, 0x3000, 1},
},
@@ -2902,7 +2911,7 @@ var (
)
// Generated by running
-// maketables --scripts=all --url=http://www.unicode.org/Public/6.2.0/ucd/
+// maketables --scripts=all --url=http://www.unicode.org/Public/6.3.0/ucd/
// DO NOT EDIT
// Scripts is the set of Unicode script tables.
@@ -3016,6 +3025,7 @@ var _Arabic = &RangeTable{
{0x0600, 0x0604, 1},
{0x0606, 0x060b, 1},
{0x060d, 0x061a, 1},
+ {0x061c, 0x061c, 1},
{0x061e, 0x061e, 1},
{0x0620, 0x063f, 1},
{0x0641, 0x064a, 1},
@@ -3245,7 +3255,7 @@ var _Common = &RangeTable{
{0x1cf5, 0x1cf6, 1},
{0x2000, 0x200b, 1},
{0x200e, 0x2064, 1},
- {0x206a, 0x2070, 1},
+ {0x2066, 0x2070, 1},
{0x2074, 0x207e, 1},
{0x2080, 0x208e, 1},
{0x20a0, 0x20ba, 1},
@@ -3281,6 +3291,7 @@ var _Common = &RangeTable{
{0xa700, 0xa721, 1},
{0xa788, 0xa78a, 1},
{0xa830, 0xa839, 1},
+ {0xa9cf, 0xa9cf, 1},
{0xfd3e, 0xfd3f, 1},
{0xfdfd, 0xfdfd, 1},
{0xfe10, 0xfe19, 1},
@@ -3710,7 +3721,7 @@ var _Inscriptional_Parthian = &RangeTable{
var _Javanese = &RangeTable{
R16: []Range16{
{0xa980, 0xa9cd, 1},
- {0xa9cf, 0xa9d9, 1},
+ {0xa9d0, 0xa9d9, 1},
{0xa9de, 0xa9df, 1},
},
}
@@ -4403,7 +4414,7 @@ var (
)
// Generated by running
-// maketables --props=all --url=http://www.unicode.org/Public/6.2.0/ucd/
+// maketables --props=all --url=http://www.unicode.org/Public/6.3.0/ucd/
// DO NOT EDIT
// Properties is the set of Unicode property tables.
@@ -4453,8 +4464,10 @@ var _ASCII_Hex_Digit = &RangeTable{
var _Bidi_Control = &RangeTable{
R16: []Range16{
+ {0x061c, 0x061c, 1},
{0x200e, 0x200f, 1},
{0x202a, 0x202e, 1},
+ {0x2066, 0x2069, 1},
},
}
@@ -4931,7 +4944,7 @@ var _Other_Default_Ignorable_Code_Point = &RangeTable{
{0x034f, 0x034f, 1},
{0x115f, 0x1160, 1},
{0x17b4, 0x17b5, 1},
- {0x2065, 0x2069, 1},
+ {0x2065, 0x2065, 1},
{0x3164, 0x3164, 1},
{0xffa0, 0xffa0, 1},
{0xfff0, 0xfff8, 1},
@@ -5053,6 +5066,7 @@ var _Other_Math = &RangeTable{
{0x21d5, 0x21db, 1},
{0x21dd, 0x21dd, 1},
{0x21e4, 0x21e5, 1},
+ {0x2308, 0x230b, 1},
{0x23b4, 0x23b5, 1},
{0x23b7, 0x23b7, 1},
{0x23d0, 0x23d0, 1},
@@ -5440,7 +5454,6 @@ var _White_Space = &RangeTable{
{0x0085, 0x0085, 1},
{0x00a0, 0x00a0, 1},
{0x1680, 0x1680, 1},
- {0x180e, 0x180e, 1},
{0x2000, 0x200a, 1},
{0x2028, 0x2029, 1},
{0x202f, 0x202f, 1},
@@ -5487,7 +5500,7 @@ var (
)
// Generated by running
-// maketables --data=http://www.unicode.org/Public/6.2.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.2.0/ucd/CaseFolding.txt
+// maketables --data=http://www.unicode.org/Public/6.3.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.3.0/ucd/CaseFolding.txt
// DO NOT EDIT
// CaseRanges is the table describing case mappings for all letters with
@@ -6372,7 +6385,7 @@ var foldMn = &RangeTable{
// If there is no entry for a script name, there are no such points.
var FoldScript = map[string]*RangeTable{}
-// Range entries: 3462 16-bit, 832 32-bit, 4294 total.
-// Range bytes: 20772 16-bit, 9984 32-bit, 30756 total.
+// Range entries: 3471 16-bit, 832 32-bit, 4303 total.
+// Range bytes: 20826 16-bit, 9984 32-bit, 30810 total.
// Fold orbit bytes: 63 pairs, 252 bytes
diff --git a/src/pkg/unicode/utf16/utf16.go b/src/pkg/unicode/utf16/utf16.go
index 903e4012a..c0e47c535 100644
--- a/src/pkg/unicode/utf16/utf16.go
+++ b/src/pkg/unicode/utf16/utf16.go
@@ -36,7 +36,7 @@ func IsSurrogate(r rune) bool {
// the Unicode replacement code point U+FFFD.
func DecodeRune(r1, r2 rune) rune {
if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 {
- return (rune(r1)-surr1)<<10 | (rune(r2) - surr2) + 0x10000
+ return (r1-surr1)<<10 | (r2 - surr2) + 0x10000
}
return replacementChar
}
diff --git a/src/pkg/unicode/utf16/utf16_test.go b/src/pkg/unicode/utf16/utf16_test.go
index ee16a303d..3dca472bb 100644
--- a/src/pkg/unicode/utf16/utf16_test.go
+++ b/src/pkg/unicode/utf16/utf16_test.go
@@ -99,3 +99,51 @@ func TestDecode(t *testing.T) {
}
}
}
+
+var decodeRuneTests = []struct {
+ r1, r2 rune
+ want rune
+}{
+ {0xd800, 0xdc00, 0x10000},
+ {0xd800, 0xdc01, 0x10001},
+ {0xd808, 0xdf45, 0x12345},
+ {0xdbff, 0xdfff, 0x10ffff},
+ {0xd800, 'a', 0xfffd}, // illegal, replacement rune substituted
+}
+
+func TestDecodeRune(t *testing.T) {
+ for i, tt := range decodeRuneTests {
+ got := DecodeRune(tt.r1, tt.r2)
+ if got != tt.want {
+ t.Errorf("%d: DecodeRune(%q, %q) = %v; want %v", i, tt.r1, tt.r2, got, tt.want)
+ }
+ }
+}
+
+var surrogateTests = []struct {
+ r rune
+ want bool
+}{
+ // from http://en.wikipedia.org/wiki/UTF-16
+ {'\u007A', false}, // LATIN SMALL LETTER Z
+ {'\u6C34', false}, // CJK UNIFIED IDEOGRAPH-6C34 (water)
+ {'\uFEFF', false}, // Byte Order Mark
+ {'\U00010000', false}, // LINEAR B SYLLABLE B008 A (first non-BMP code point)
+ {'\U0001D11E', false}, // MUSICAL SYMBOL G CLEF
+ {'\U0010FFFD', false}, // PRIVATE USE CHARACTER-10FFFD (last Unicode code point)
+
+ {rune(0xd7ff), false}, // surr1-1
+ {rune(0xd800), true}, // surr1
+ {rune(0xdc00), true}, // surr2
+ {rune(0xe000), false}, // surr3
+ {rune(0xdfff), true}, // surr3-1
+}
+
+func TestIsSurrogate(t *testing.T) {
+ for i, tt := range surrogateTests {
+ got := IsSurrogate(tt.r)
+ if got != tt.want {
+ t.Errorf("%d: IsSurrogate(%q) = %v; want %v", i, tt.r, got, tt.want)
+ }
+ }
+}
diff --git a/src/pkg/unicode/utf8/example_test.go b/src/pkg/unicode/utf8/example_test.go
index fe2037336..7b3e7ac74 100644
--- a/src/pkg/unicode/utf8/example_test.go
+++ b/src/pkg/unicode/utf8/example_test.go
@@ -1,3 +1,7 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
package utf8_test
import (
diff --git a/src/pkg/unicode/utf8/utf8.go b/src/pkg/unicode/utf8/utf8.go
index 93d0be5e0..0dc859a04 100644
--- a/src/pkg/unicode/utf8/utf8.go
+++ b/src/pkg/unicode/utf8/utf8.go
@@ -329,37 +329,29 @@ func RuneLen(r rune) int {
// It returns the number of bytes written.
func EncodeRune(p []byte, r rune) int {
// Negative values are erroneous. Making it unsigned addresses the problem.
- if uint32(r) <= rune1Max {
+ switch i := uint32(r); {
+ case i <= rune1Max:
p[0] = byte(r)
return 1
- }
-
- if uint32(r) <= rune2Max {
+ case i <= rune2Max:
p[0] = t2 | byte(r>>6)
p[1] = tx | byte(r)&maskx
return 2
- }
-
- if uint32(r) > MaxRune {
+ case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
r = RuneError
- }
-
- if surrogateMin <= r && r <= surrogateMax {
- r = RuneError
- }
-
- if uint32(r) <= rune3Max {
+ fallthrough
+ case i <= rune3Max:
p[0] = t3 | byte(r>>12)
p[1] = tx | byte(r>>6)&maskx
p[2] = tx | byte(r)&maskx
return 3
+ default:
+ p[0] = t4 | byte(r>>18)
+ p[1] = tx | byte(r>>12)&maskx
+ p[2] = tx | byte(r>>6)&maskx
+ p[3] = tx | byte(r)&maskx
+ return 4
}
-
- p[0] = t4 | byte(r>>18)
- p[1] = tx | byte(r>>12)&maskx
- p[2] = tx | byte(r>>6)&maskx
- p[3] = tx | byte(r)&maskx
- return 4
}
// RuneCount returns the number of runes in p. Erroneous and short