diff options
Diffstat (limited to 'src/pkg/unicode')
-rw-r--r-- | src/pkg/unicode/letter.go | 4 | ||||
-rw-r--r-- | src/pkg/unicode/letter_test.go | 16 | ||||
-rw-r--r-- | src/pkg/unicode/maketables.go | 8 | ||||
-rw-r--r-- | src/pkg/unicode/script_test.go | 2 | ||||
-rw-r--r-- | src/pkg/unicode/tables.go | 65 | ||||
-rw-r--r-- | src/pkg/unicode/utf16/utf16.go | 2 | ||||
-rw-r--r-- | src/pkg/unicode/utf16/utf16_test.go | 48 | ||||
-rw-r--r-- | src/pkg/unicode/utf8/example_test.go | 4 | ||||
-rw-r--r-- | src/pkg/unicode/utf8/utf8.go | 32 |
9 files changed, 115 insertions, 66 deletions
diff --git a/src/pkg/unicode/letter.go b/src/pkg/unicode/letter.go index fadaa57d8..977bd2b3b 100644 --- a/src/pkg/unicode/letter.go +++ b/src/pkg/unicode/letter.go @@ -74,7 +74,7 @@ const ( type d [MaxCase]rune // to make the CaseRanges text shorter -// If the Delta field of a CaseRange is UpperLower or LowerUpper, it means +// If the Delta field of a CaseRange is UpperLower, it means // this CaseRange represents a sequence of the form (say) // Upper Lower Upper Lower. const ( @@ -316,7 +316,7 @@ type foldPair struct { // SimpleFold iterates over Unicode code points equivalent under // the Unicode-defined simple case folding. Among the code points // equivalent to rune (including rune itself), SimpleFold returns the -// smallest rune >= r if one exists, or else the smallest rune >= 0. +// smallest rune > r if one exists, or else the smallest rune >= 0. // // For example: // SimpleFold('A') = 'a' diff --git a/src/pkg/unicode/letter_test.go b/src/pkg/unicode/letter_test.go index e4d5572a0..4ee11fb36 100644 --- a/src/pkg/unicode/letter_test.go +++ b/src/pkg/unicode/letter_test.go @@ -387,32 +387,20 @@ func TestTurkishCase(t *testing.T) { } var simpleFoldTests = []string{ - // SimpleFold could order its returned slices in any order it wants, - // but we know it orders them in increasing order starting at in - // and looping around from MaxRune to 0. + // SimpleFold(x) returns the next equivalent rune > x or wraps + // around to smaller values. // Easy cases. "Aa", - "aA", "δΔ", - "Δδ", // ASCII special cases. "KkK", - "kKK", - "KKk", "Ssſ", - "sſS", - "ſSs", // Non-ASCII special cases. "ρϱΡ", - "ϱΡρ", - "Ρρϱ", "ͅΙιι", - "Ιιιͅ", - "ιιͅΙ", - "ιͅΙι", // Extra special cases: has lower/upper but no case fold. "İ", diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go index e5ed08b23..8116ab8a4 100644 --- a/src/pkg/unicode/maketables.go +++ b/src/pkg/unicode/maketables.go @@ -40,7 +40,7 @@ func main() { var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt") var url = flag.String("url", - "http://www.unicode.org/Public/6.2.0/ucd/", + "http://www.unicode.org/Public/6.3.0/ucd/", "URL of Unicode database directory") var tablelist = flag.String("tables", "all", @@ -386,7 +386,11 @@ func loadCasefold() { } } -const progHeader = `// Generated by running +const progHeader = `// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Generated by running // maketables --tables=%s --data=%s --casefolding=%s // DO NOT EDIT diff --git a/src/pkg/unicode/script_test.go b/src/pkg/unicode/script_test.go index 395cc71a0..e2ba0011a 100644 --- a/src/pkg/unicode/script_test.go +++ b/src/pkg/unicode/script_test.go @@ -182,7 +182,7 @@ var inPropTest = []T{ {0x0EC4, "Logical_Order_Exception"}, {0x2FFFF, "Noncharacter_Code_Point"}, {0x065E, "Other_Alphabetic"}, - {0x2069, "Other_Default_Ignorable_Code_Point"}, + {0x2065, "Other_Default_Ignorable_Code_Point"}, {0x0BD7, "Other_Grapheme_Extend"}, {0x0387, "Other_ID_Continue"}, {0x212E, "Other_ID_Start"}, diff --git a/src/pkg/unicode/tables.go b/src/pkg/unicode/tables.go index 939c41dc5..5670d1c5b 100644 --- a/src/pkg/unicode/tables.go +++ b/src/pkg/unicode/tables.go @@ -1,11 +1,15 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + // Generated by running -// maketables --tables=all --data=http://www.unicode.org/Public/6.2.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.2.0/ucd/CaseFolding.txt +// maketables --tables=all --data=http://www.unicode.org/Public/6.3.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.3.0/ucd/CaseFolding.txt // DO NOT EDIT package unicode // Version is the Unicode edition from which the tables are derived. -const Version = "6.2.0" +const Version = "6.3.0" // Categories is the set of Unicode category tables. var Categories = map[string]*RangeTable{ @@ -53,11 +57,12 @@ var _C = &RangeTable{ {0x007f, 0x009f, 1}, {0x00ad, 0x0600, 1363}, {0x0601, 0x0604, 1}, - {0x06dd, 0x070f, 50}, + {0x061c, 0x06dd, 193}, + {0x070f, 0x180e, 4351}, {0x200b, 0x200f, 1}, {0x202a, 0x202e, 1}, {0x2060, 0x2064, 1}, - {0x206a, 0x206f, 1}, + {0x2066, 0x206f, 1}, {0xd800, 0xf8ff, 1}, {0xfeff, 0xfff9, 250}, {0xfffa, 0xfffb, 1}, @@ -85,11 +90,12 @@ var _Cf = &RangeTable{ R16: []Range16{ {0x00ad, 0x0600, 1363}, {0x0601, 0x0604, 1}, - {0x06dd, 0x070f, 50}, + {0x061c, 0x06dd, 193}, + {0x070f, 0x180e, 4351}, {0x200b, 0x200f, 1}, {0x202a, 0x202e, 1}, {0x2060, 0x2064, 1}, - {0x206a, 0x206f, 1}, + {0x2066, 0x206f, 1}, {0xfeff, 0xfff9, 250}, {0xfffa, 0xfffb, 1}, }, @@ -1545,7 +1551,7 @@ var _Mc = &RangeTable{ {0x1933, 0x1938, 1}, {0x19b0, 0x19c0, 1}, {0x19c8, 0x19c9, 1}, - {0x1a19, 0x1a1b, 1}, + {0x1a19, 0x1a1a, 1}, {0x1a55, 0x1a57, 2}, {0x1a61, 0x1a63, 2}, {0x1a64, 0x1a6d, 9}, @@ -1717,8 +1723,8 @@ var _Mn = &RangeTable{ {0x1932, 0x1939, 7}, {0x193a, 0x193b, 1}, {0x1a17, 0x1a18, 1}, - {0x1a56, 0x1a58, 2}, - {0x1a59, 0x1a5e, 1}, + {0x1a1b, 0x1a56, 59}, + {0x1a58, 0x1a5e, 1}, {0x1a60, 0x1a62, 2}, {0x1a65, 0x1a6c, 1}, {0x1a73, 0x1a7c, 1}, @@ -2086,6 +2092,7 @@ var _P = &RangeTable{ {0x2053, 0x205e, 1}, {0x207d, 0x207e, 1}, {0x208d, 0x208e, 1}, + {0x2308, 0x230b, 1}, {0x2329, 0x232a, 1}, {0x2768, 0x2775, 1}, {0x27c5, 0x27c6, 1}, @@ -2183,7 +2190,8 @@ var _Pe = &RangeTable{ {0x007d, 0x0f3b, 3774}, {0x0f3d, 0x169c, 1887}, {0x2046, 0x207e, 56}, - {0x208e, 0x232a, 668}, + {0x208e, 0x2309, 635}, + {0x230b, 0x232a, 31}, {0x2769, 0x2775, 2}, {0x27c6, 0x27e7, 33}, {0x27e9, 0x27ef, 2}, @@ -2360,7 +2368,8 @@ var _Ps = &RangeTable{ {0x0f3c, 0x169b, 1887}, {0x201a, 0x201e, 4}, {0x2045, 0x207d, 56}, - {0x208d, 0x2329, 668}, + {0x208d, 0x2308, 635}, + {0x230a, 0x2329, 31}, {0x2768, 0x2774, 2}, {0x27c5, 0x27e6, 33}, {0x27e8, 0x27ee, 2}, @@ -2450,7 +2459,8 @@ var _S = &RangeTable{ {0x2141, 0x2144, 1}, {0x214a, 0x214d, 1}, {0x214f, 0x2190, 65}, - {0x2191, 0x2328, 1}, + {0x2191, 0x2307, 1}, + {0x230c, 0x2328, 1}, {0x232b, 0x23f3, 1}, {0x2400, 0x2426, 1}, {0x2440, 0x244a, 1}, @@ -2630,7 +2640,6 @@ var _Sm = &RangeTable{ {0x21cf, 0x21d2, 3}, {0x21d4, 0x21f4, 32}, {0x21f5, 0x22ff, 1}, - {0x2308, 0x230b, 1}, {0x2320, 0x2321, 1}, {0x237c, 0x239b, 31}, {0x239c, 0x23b3, 1}, @@ -2818,8 +2827,8 @@ var _So = &RangeTable{ var _Z = &RangeTable{ R16: []Range16{ {0x0020, 0x00a0, 128}, - {0x1680, 0x180e, 398}, - {0x2000, 0x200a, 1}, + {0x1680, 0x2000, 2432}, + {0x2001, 0x200a, 1}, {0x2028, 0x2029, 1}, {0x202f, 0x205f, 48}, {0x3000, 0x3000, 1}, @@ -2842,8 +2851,8 @@ var _Zp = &RangeTable{ var _Zs = &RangeTable{ R16: []Range16{ {0x0020, 0x00a0, 128}, - {0x1680, 0x180e, 398}, - {0x2000, 0x200a, 1}, + {0x1680, 0x2000, 2432}, + {0x2001, 0x200a, 1}, {0x202f, 0x205f, 48}, {0x3000, 0x3000, 1}, }, @@ -2902,7 +2911,7 @@ var ( ) // Generated by running -// maketables --scripts=all --url=http://www.unicode.org/Public/6.2.0/ucd/ +// maketables --scripts=all --url=http://www.unicode.org/Public/6.3.0/ucd/ // DO NOT EDIT // Scripts is the set of Unicode script tables. @@ -3016,6 +3025,7 @@ var _Arabic = &RangeTable{ {0x0600, 0x0604, 1}, {0x0606, 0x060b, 1}, {0x060d, 0x061a, 1}, + {0x061c, 0x061c, 1}, {0x061e, 0x061e, 1}, {0x0620, 0x063f, 1}, {0x0641, 0x064a, 1}, @@ -3245,7 +3255,7 @@ var _Common = &RangeTable{ {0x1cf5, 0x1cf6, 1}, {0x2000, 0x200b, 1}, {0x200e, 0x2064, 1}, - {0x206a, 0x2070, 1}, + {0x2066, 0x2070, 1}, {0x2074, 0x207e, 1}, {0x2080, 0x208e, 1}, {0x20a0, 0x20ba, 1}, @@ -3281,6 +3291,7 @@ var _Common = &RangeTable{ {0xa700, 0xa721, 1}, {0xa788, 0xa78a, 1}, {0xa830, 0xa839, 1}, + {0xa9cf, 0xa9cf, 1}, {0xfd3e, 0xfd3f, 1}, {0xfdfd, 0xfdfd, 1}, {0xfe10, 0xfe19, 1}, @@ -3710,7 +3721,7 @@ var _Inscriptional_Parthian = &RangeTable{ var _Javanese = &RangeTable{ R16: []Range16{ {0xa980, 0xa9cd, 1}, - {0xa9cf, 0xa9d9, 1}, + {0xa9d0, 0xa9d9, 1}, {0xa9de, 0xa9df, 1}, }, } @@ -4403,7 +4414,7 @@ var ( ) // Generated by running -// maketables --props=all --url=http://www.unicode.org/Public/6.2.0/ucd/ +// maketables --props=all --url=http://www.unicode.org/Public/6.3.0/ucd/ // DO NOT EDIT // Properties is the set of Unicode property tables. @@ -4453,8 +4464,10 @@ var _ASCII_Hex_Digit = &RangeTable{ var _Bidi_Control = &RangeTable{ R16: []Range16{ + {0x061c, 0x061c, 1}, {0x200e, 0x200f, 1}, {0x202a, 0x202e, 1}, + {0x2066, 0x2069, 1}, }, } @@ -4931,7 +4944,7 @@ var _Other_Default_Ignorable_Code_Point = &RangeTable{ {0x034f, 0x034f, 1}, {0x115f, 0x1160, 1}, {0x17b4, 0x17b5, 1}, - {0x2065, 0x2069, 1}, + {0x2065, 0x2065, 1}, {0x3164, 0x3164, 1}, {0xffa0, 0xffa0, 1}, {0xfff0, 0xfff8, 1}, @@ -5053,6 +5066,7 @@ var _Other_Math = &RangeTable{ {0x21d5, 0x21db, 1}, {0x21dd, 0x21dd, 1}, {0x21e4, 0x21e5, 1}, + {0x2308, 0x230b, 1}, {0x23b4, 0x23b5, 1}, {0x23b7, 0x23b7, 1}, {0x23d0, 0x23d0, 1}, @@ -5440,7 +5454,6 @@ var _White_Space = &RangeTable{ {0x0085, 0x0085, 1}, {0x00a0, 0x00a0, 1}, {0x1680, 0x1680, 1}, - {0x180e, 0x180e, 1}, {0x2000, 0x200a, 1}, {0x2028, 0x2029, 1}, {0x202f, 0x202f, 1}, @@ -5487,7 +5500,7 @@ var ( ) // Generated by running -// maketables --data=http://www.unicode.org/Public/6.2.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.2.0/ucd/CaseFolding.txt +// maketables --data=http://www.unicode.org/Public/6.3.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.3.0/ucd/CaseFolding.txt // DO NOT EDIT // CaseRanges is the table describing case mappings for all letters with @@ -6372,7 +6385,7 @@ var foldMn = &RangeTable{ // If there is no entry for a script name, there are no such points. var FoldScript = map[string]*RangeTable{} -// Range entries: 3462 16-bit, 832 32-bit, 4294 total. -// Range bytes: 20772 16-bit, 9984 32-bit, 30756 total. +// Range entries: 3471 16-bit, 832 32-bit, 4303 total. +// Range bytes: 20826 16-bit, 9984 32-bit, 30810 total. // Fold orbit bytes: 63 pairs, 252 bytes diff --git a/src/pkg/unicode/utf16/utf16.go b/src/pkg/unicode/utf16/utf16.go index 903e4012a..c0e47c535 100644 --- a/src/pkg/unicode/utf16/utf16.go +++ b/src/pkg/unicode/utf16/utf16.go @@ -36,7 +36,7 @@ func IsSurrogate(r rune) bool { // the Unicode replacement code point U+FFFD. func DecodeRune(r1, r2 rune) rune { if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { - return (rune(r1)-surr1)<<10 | (rune(r2) - surr2) + 0x10000 + return (r1-surr1)<<10 | (r2 - surr2) + 0x10000 } return replacementChar } diff --git a/src/pkg/unicode/utf16/utf16_test.go b/src/pkg/unicode/utf16/utf16_test.go index ee16a303d..3dca472bb 100644 --- a/src/pkg/unicode/utf16/utf16_test.go +++ b/src/pkg/unicode/utf16/utf16_test.go @@ -99,3 +99,51 @@ func TestDecode(t *testing.T) { } } } + +var decodeRuneTests = []struct { + r1, r2 rune + want rune +}{ + {0xd800, 0xdc00, 0x10000}, + {0xd800, 0xdc01, 0x10001}, + {0xd808, 0xdf45, 0x12345}, + {0xdbff, 0xdfff, 0x10ffff}, + {0xd800, 'a', 0xfffd}, // illegal, replacement rune substituted +} + +func TestDecodeRune(t *testing.T) { + for i, tt := range decodeRuneTests { + got := DecodeRune(tt.r1, tt.r2) + if got != tt.want { + t.Errorf("%d: DecodeRune(%q, %q) = %v; want %v", i, tt.r1, tt.r2, got, tt.want) + } + } +} + +var surrogateTests = []struct { + r rune + want bool +}{ + // from http://en.wikipedia.org/wiki/UTF-16 + {'\u007A', false}, // LATIN SMALL LETTER Z + {'\u6C34', false}, // CJK UNIFIED IDEOGRAPH-6C34 (water) + {'\uFEFF', false}, // Byte Order Mark + {'\U00010000', false}, // LINEAR B SYLLABLE B008 A (first non-BMP code point) + {'\U0001D11E', false}, // MUSICAL SYMBOL G CLEF + {'\U0010FFFD', false}, // PRIVATE USE CHARACTER-10FFFD (last Unicode code point) + + {rune(0xd7ff), false}, // surr1-1 + {rune(0xd800), true}, // surr1 + {rune(0xdc00), true}, // surr2 + {rune(0xe000), false}, // surr3 + {rune(0xdfff), true}, // surr3-1 +} + +func TestIsSurrogate(t *testing.T) { + for i, tt := range surrogateTests { + got := IsSurrogate(tt.r) + if got != tt.want { + t.Errorf("%d: IsSurrogate(%q) = %v; want %v", i, tt.r, got, tt.want) + } + } +} diff --git a/src/pkg/unicode/utf8/example_test.go b/src/pkg/unicode/utf8/example_test.go index fe2037336..7b3e7ac74 100644 --- a/src/pkg/unicode/utf8/example_test.go +++ b/src/pkg/unicode/utf8/example_test.go @@ -1,3 +1,7 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + package utf8_test import ( diff --git a/src/pkg/unicode/utf8/utf8.go b/src/pkg/unicode/utf8/utf8.go index 93d0be5e0..0dc859a04 100644 --- a/src/pkg/unicode/utf8/utf8.go +++ b/src/pkg/unicode/utf8/utf8.go @@ -329,37 +329,29 @@ func RuneLen(r rune) int { // It returns the number of bytes written. func EncodeRune(p []byte, r rune) int { // Negative values are erroneous. Making it unsigned addresses the problem. - if uint32(r) <= rune1Max { + switch i := uint32(r); { + case i <= rune1Max: p[0] = byte(r) return 1 - } - - if uint32(r) <= rune2Max { + case i <= rune2Max: p[0] = t2 | byte(r>>6) p[1] = tx | byte(r)&maskx return 2 - } - - if uint32(r) > MaxRune { + case i > MaxRune, surrogateMin <= i && i <= surrogateMax: r = RuneError - } - - if surrogateMin <= r && r <= surrogateMax { - r = RuneError - } - - if uint32(r) <= rune3Max { + fallthrough + case i <= rune3Max: p[0] = t3 | byte(r>>12) p[1] = tx | byte(r>>6)&maskx p[2] = tx | byte(r)&maskx return 3 + default: + p[0] = t4 | byte(r>>18) + p[1] = tx | byte(r>>12)&maskx + p[2] = tx | byte(r>>6)&maskx + p[3] = tx | byte(r)&maskx + return 4 } - - p[0] = t4 | byte(r>>18) - p[1] = tx | byte(r>>12)&maskx - p[2] = tx | byte(r>>6)&maskx - p[3] = tx | byte(r)&maskx - return 4 } // RuneCount returns the number of runes in p. Erroneous and short |