diff options
Diffstat (limited to 'src/pkg/unicode')
-rw-r--r-- | src/pkg/unicode/letter.go | 49 | ||||
-rw-r--r-- | src/pkg/unicode/letter_test.go | 46 | ||||
-rw-r--r-- | src/pkg/unicode/maketables.go | 353 | ||||
-rw-r--r-- | src/pkg/unicode/tables.go | 634 |
4 files changed, 947 insertions, 135 deletions
diff --git a/src/pkg/unicode/letter.go b/src/pkg/unicode/letter.go index a0c55bbf7..dbd8638ea 100644 --- a/src/pkg/unicode/letter.go +++ b/src/pkg/unicode/letter.go @@ -275,3 +275,52 @@ func (special SpecialCase) ToLower(rune int) int { } return r } + +// caseOrbit is defined in tables.go as []foldPair. Right now all the +// entries fit in uint16, so use uint16. If that changes, compilation +// will fail (the constants in the composite literal will not fit in uint16) +// and the types here can change to uint32. +type foldPair struct { + From uint16 + To uint16 +} + +// SimpleFold iterates over Unicode code points equivalent under +// the Unicode-defined simple case folding. Among the code points +// equivalent to rune (including rune itself), SimpleFold returns the +// smallest r > rune if one exists, or else the smallest r >= 0. +// +// For example: +// SimpleFold('A') = 'a' +// SimpleFold('a') = 'A' +// +// SimpleFold('K') = 'k' +// SimpleFold('k') = '\u212A' (Kelvin symbol, K) +// SimpleFold('\u212A') = 'K' +// +// SimpleFold('1') = '1' +// +func SimpleFold(rune int) int { + // Consult caseOrbit table for special cases. + lo := 0 + hi := len(caseOrbit) + for lo < hi { + m := lo + (hi-lo)/2 + if int(caseOrbit[m].From) < rune { + lo = m + 1 + } else { + hi = m + } + } + if lo < len(caseOrbit) && int(caseOrbit[lo].From) == rune { + return int(caseOrbit[lo].To) + } + + // No folding specified. This is a one- or two-element + // equivalence class containing rune and ToLower(rune) + // and ToUpper(rune) if they are different from rune. 
+ if l := ToLower(rune); l != rune { + return l + } + return ToUpper(rune) +} diff --git a/src/pkg/unicode/letter_test.go b/src/pkg/unicode/letter_test.go index 4c24ffc51..c4e26df58 100644 --- a/src/pkg/unicode/letter_test.go +++ b/src/pkg/unicode/letter_test.go @@ -376,3 +376,49 @@ func TestTurkishCase(t *testing.T) { } } } + +var simpleFoldTests = []string{ + // SimpleFold could order its returned slices in any order it wants, + // but we know it orders them in increasing order starting at in + // and looping around from MaxRune to 0. + + // Easy cases. + "Aa", + "aA", + "δΔ", + "Δδ", + + // ASCII special cases. + "KkK", + "kKK", + "KKk", + "Ssſ", + "sſS", + "ſSs", + + // Non-ASCII special cases. + "ρϱΡ", + "ϱΡρ", + "Ρρϱ", + "ͅΙιι", + "Ιιιͅ", + "ιιͅΙ", + "ιͅΙι", + + // Extra special cases: has lower/upper but no case fold. + "İ", + "ı", +} + +func TestSimpleFold(t *testing.T) { + for _, tt := range simpleFoldTests { + cycle := []int(tt) + rune := cycle[len(cycle)-1] + for _, out := range cycle { + if r := SimpleFold(rune); r != out { + t.Errorf("SimpleFold(%#U) = %#U, want %#U", rune, r, out) + } + rune = out + } + } +} diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go index 655fe46e4..07b931d7e 100644 --- a/src/pkg/unicode/maketables.go +++ b/src/pkg/unicode/maketables.go @@ -24,15 +24,18 @@ import ( func main() { flag.Parse() loadChars() // always needed + loadCasefold() printCategories() printScriptOrProperty(false) printScriptOrProperty(true) printCases() printLatinProperties() + printCasefold() printSizes() } var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") +var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt") var url = flag.String("url", "http://www.unicode.org/Public/6.0.0/ucd/", "URL of Unicode database directory") @@ -70,7 +73,7 @@ var category = map[string]bool{ // UnicodeData.txt has form: // 0037;DIGIT 
SEVEN;Nd;0;EN;;7;7;7;N;;;;; // 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A -// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation +// See http://www.unicode.org/reports/tr44/ for a full explanation // The fields: const ( FCodePoint = iota @@ -78,10 +81,10 @@ const ( FGeneralCategory FCanonicalCombiningClass FBidiClass - FDecompositionType - FDecompositionMapping + FDecompositionTypeAndMapping FNumericType - FNumericValue + FNumericDigit // If a decimal digit. + FNumericValue // Includes non-decimal, e.g. U+2155=1/5 FBidiMirrored FUnicode1Name FISOComment @@ -94,21 +97,21 @@ const ( ) var fieldName = []string{ - "CodePoint", - "Name", - "GeneralCategory", - "CanonicalCombiningClass", - "BidiClass", - "DecompositionType", - "DecompositionMapping", - "NumericType", - "NumericValue", - "BidiMirrored", - "Unicode1Name", - "ISOComment", - "SimpleUppercaseMapping", - "SimpleLowercaseMapping", - "SimpleTitlecaseMapping", + FCodePoint: "CodePoint", + FName: "Name", + FGeneralCategory: "GeneralCategory", + FCanonicalCombiningClass: "CanonicalCombiningClass", + FBidiClass: "BidiClass", + FDecompositionTypeAndMapping: "DecompositionTypeAndMapping", + FNumericType: "NumericType", + FNumericDigit: "NumericDigit", + FNumericValue: "NumericValue", + FBidiMirrored: "BidiMirrored", + FUnicode1Name: "Unicode1Name", + FISOComment: "ISOComment", + FSimpleUppercaseMapping: "SimpleUppercaseMapping", + FSimpleLowercaseMapping: "SimpleLowercaseMapping", + FSimpleTitlecaseMapping: "SimpleTitlecaseMapping", } // This contains only the properties we're interested in. 
@@ -119,6 +122,8 @@ type Char struct { upperCase int lowerCase int titleCase int + foldCase int // simple case folding + caseOrbit int // next in simple case folding orbit } // Scripts.txt has form: @@ -151,7 +156,7 @@ const ( ) func parseCategory(line string) (state State) { - field := strings.Split(line, ";", -1) + field := strings.Split(line, ";") if len(field) != NumField { logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField) } @@ -248,7 +253,7 @@ func all(scripts map[string][]Script) []string { // Extract the version number from the URL func version() string { // Break on slashes and look for the first numeric field - fields := strings.Split(*url, "/", -1) + fields := strings.Split(*url, "/") for _, f := range fields { if len(f) > 0 && '0' <= f[0] && f[0] <= '9' { return f @@ -308,8 +313,53 @@ func loadChars() { resp.Body.Close() } +func loadCasefold() { + if *casefoldingURL == "" { + flag.Set("casefolding", *url+"CaseFolding.txt") + } + resp, err := http.Get(*casefoldingURL) + if err != nil { + logger.Fatal(err) + } + if resp.StatusCode != 200 { + logger.Fatal("bad GET status for CaseFolding.txt", resp.Status) + } + input := bufio.NewReader(resp.Body) + for { + line, err := input.ReadString('\n') + if err != nil { + if err == os.EOF { + break + } + logger.Fatal(err) + } + if line[0] == '#' { + continue + } + field := strings.Split(line, "; ") + if len(field) != 4 { + logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4) + } + kind := field[1] + if kind != "C" && kind != "S" { + // Only care about 'common' and 'simple' foldings. 
+ continue + } + p1, err := strconv.Btoui64(field[0], 16) + if err != nil { + logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) + } + p2, err := strconv.Btoui64(field[2], 16) + if err != nil { + logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) + } + chars[p1].foldCase = int(p2) + } + resp.Body.Close() +} + const progHeader = `// Generated by running -// maketables --tables=%s --data=%s +// maketables --tables=%s --data=%s --casefolding=%s // DO NOT EDIT package unicode @@ -322,7 +372,7 @@ func printCategories() { return } // Find out which categories to dump - list := strings.Split(*tablelist, ",", -1) + list := strings.Split(*tablelist, ",") if *tablelist == "all" { list = allCategories() } @@ -330,7 +380,7 @@ func printCategories() { fullCategoryTest(list) return } - fmt.Printf(progHeader, *tablelist, *dataURL) + fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL) fmt.Println("// Version is the Unicode edition from which the tables are derived.") fmt.Printf("const Version = %q\n\n", version()) @@ -344,7 +394,7 @@ func printCategories() { fmt.Print("}\n\n") } - decl := make(sort.StringArray, len(list)) + decl := make(sort.StringSlice, len(list)) ndecl := 0 for _, name := range list { if _, ok := category[name]; !ok { @@ -538,7 +588,7 @@ func parseScript(line string, scripts map[string][]Script) { if len(line) == 0 { return } - field := strings.Split(line, ";", -1) + field := strings.Split(line, ";") if len(field) != 2 { logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field)) } @@ -635,7 +685,7 @@ func printScriptOrProperty(doProps bool) { resp.Body.Close() // Find out which scripts to dump - list := strings.Split(flaglist, ",", -1) + list := strings.Split(flaglist, ",") if flaglist == "all" { list = all(table) } @@ -665,7 +715,7 @@ func printScriptOrProperty(doProps bool) { fmt.Print("}\n\n") } - decl := make(sort.StringArray, len(list)) + decl := make(sort.StringSlice, len(list)) ndecl := 0 for _, name := range list { if doProps { 
@@ -837,13 +887,13 @@ func printCases() { } fmt.Printf( "// Generated by running\n"+ - "// maketables --data=%s\n"+ + "// maketables --data=%s --casefolding=%s\n"+ "// DO NOT EDIT\n\n"+ "// CaseRanges is the table describing case mappings for all letters with\n"+ "// non-self mappings.\n"+ "var CaseRanges = _CaseRanges\n"+ "var _CaseRanges = []CaseRange {\n", - *dataURL) + *dataURL, *casefoldingURL) var startState *caseState // the start of a run; nil for not active var prevState = &caseState{} // the state of the previous character @@ -946,13 +996,246 @@ func printLatinProperties() { if code == ' ' { property = "pZ | pp" } - fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code) + fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code) } - fmt.Println("}") + fmt.Printf("}\n\n") } -var range16Count = 0 // Number of entries in the 16-bit range tables. -var range32Count = 0 // Number of entries in the 32-bit range tables. +func printCasefold() { + // Build list of case-folding groups attached to each canonical folded char (typically lower case). + var caseOrbit = make([][]int, MaxChar+1) + for i := range chars { + c := &chars[i] + if c.foldCase == 0 { + continue + } + orb := caseOrbit[c.foldCase] + if orb == nil { + orb = append(orb, c.foldCase) + } + caseOrbit[c.foldCase] = append(orb, i) + } + + // Insert explicit 1-element groups when assuming [lower, upper] would be wrong. + for i := range chars { + c := &chars[i] + f := c.foldCase + if f == 0 { + f = i + } + orb := caseOrbit[f] + if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) { + // Default assumption of [upper, lower] is wrong. + caseOrbit[i] = []int{i} + } + } + + // Delete the groups for which assuming [lower, upper] is right. + for i, orb := range caseOrbit { + if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] { + caseOrbit[i] = nil + } + } + + // Record orbit information in chars. 
+ for _, orb := range caseOrbit { + if orb == nil { + continue + } + sort.Ints(orb) + c := orb[len(orb)-1] + for _, d := range orb { + chars[c].caseOrbit = d + c = d + } + } + + printCaseOrbit() + + // Tables of category and script folding exceptions: code points + // that must be added when interpreting a particular category/script + // in a case-folding context. + cat := make(map[string]map[int]bool) + for name := range category { + if x := foldExceptions(inCategory(name)); len(x) > 0 { + cat[name] = x + } + } + + scr := make(map[string]map[int]bool) + for name := range scripts { + if x := foldExceptions(inScript(name)); len(x) > 0 { + scr[name] = x + } + } + + printCatFold("FoldCategory", cat) + printCatFold("FoldScript", scr) +} + +// inCategory returns a list of all the runes in the category. +func inCategory(name string) []int { + var x []int + for i := range chars { + c := &chars[i] + if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] { + x = append(x, i) + } + } + return x +} + +// inScript returns a list of all the runes in the script. +func inScript(name string) []int { + var x []int + for _, s := range scripts[name] { + for c := s.lo; c <= s.hi; c++ { + x = append(x, int(c)) + } + } + return x +} + +// foldExceptions returns the set of all the runes fold-equivalent +// to runes in class but not in class themselves. +func foldExceptions(class []int) map[int]bool { + // Create map containing class and all fold-equivalent chars. + m := make(map[int]bool) + for _, r := range class { + c := &chars[r] + if c.caseOrbit == 0 { + // Just upper and lower. + if u := c.upperCase; u != 0 { + m[u] = true + } + if l := c.lowerCase; l != 0 { + m[l] = true + } + m[r] = true + continue + } + // Otherwise walk orbit. + r0 := r + for { + m[r] = true + r = chars[r].caseOrbit + if r == r0 { + break + } + } + } + + // Remove class itself. + for _, r := range class { + m[r] = false, false + } + + // What's left is the exceptions. 
+ return m +} + +var comment = map[string]string{ + "FoldCategory": "// FoldCategory maps a category name to a table of\n" + + "// code points outside the category that are equivalent under\n" + + "// simple case folding to code points inside the category.\n" + + "// If there is no entry for a category name, there are no such points.\n", + + "FoldScript": "// FoldScript maps a script name to a table of\n" + + "// code points outside the script that are equivalent under\n" + + "// simple case folding to code points inside the script.\n" + + "// If there is no entry for a script name, there are no such points.\n", +} + +func printCaseOrbit() { + if *test { + for i := range chars { + c := &chars[i] + f := c.caseOrbit + if f == 0 { + if c.lowerCase != i && c.lowerCase != 0 { + f = c.lowerCase + } else if c.upperCase != i && c.upperCase != 0 { + f = c.upperCase + } else { + f = i + } + } + if g := unicode.SimpleFold(i); g != f { + fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f) + } + } + return + } + + fmt.Printf("var caseOrbit = []foldPair{\n") + for i := range chars { + c := &chars[i] + if c.caseOrbit != 0 { + fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) + foldPairCount++ + } + } + fmt.Printf("}\n\n") +} + +func printCatFold(name string, m map[string]map[int]bool) { + if *test { + var pkgMap map[string]*unicode.RangeTable + if name == "FoldCategory" { + pkgMap = unicode.FoldCategory + } else { + pkgMap = unicode.FoldScript + } + if len(pkgMap) != len(m) { + fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m)) + return + } + for k, v := range m { + t, ok := pkgMap[k] + if !ok { + fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k) + continue + } + n := 0 + for _, r := range t.R16 { + for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) { + if !v[c] { + fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) + } + n++ + } + } + for _, r := range t.R32 { + for c := 
int(r.Lo); c <= int(r.Hi); c += int(r.Stride) { + if !v[c] { + fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) + } + n++ + } + } + if n != len(v) { + fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v)) + } + } + return + } + + fmt.Print(comment[name]) + fmt.Printf("var %s = map[string]*RangeTable{\n", name) + for name := range m { + fmt.Printf("\t%q: fold%s,\n", name, name) + } + fmt.Printf("}\n\n") + for name, class := range m { + dumpRange( + fmt.Sprintf("var fold%s = &RangeTable{\n", name), + func(code int) bool { return class[code] }) + } +} + +var range16Count = 0 // Number of entries in the 16-bit range tables. +var range32Count = 0 // Number of entries in the 32-bit range tables. +var foldPairCount = 0 // Number of fold pairs in the exception tables. func printSizes() { if *test { @@ -963,4 +1246,6 @@ func printSizes() { range16Bytes := range16Count * 3 * 2 range32Bytes := range32Count * 3 * 4 fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) + fmt.Println() + fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2) } diff --git a/src/pkg/unicode/tables.go b/src/pkg/unicode/tables.go index 32681a8c0..a75011adb 100644 --- a/src/pkg/unicode/tables.go +++ b/src/pkg/unicode/tables.go @@ -1,5 +1,5 @@ // Generated by running -// maketables --tables=all --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt +// maketables --tables=all --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.0.0/ucd/CaseFolding.txt // DO NOT EDIT package unicode @@ -5150,7 +5150,7 @@ var ( ) // Generated by running -// maketables --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt +// maketables --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.0.0/ucd/CaseFolding.txt // DO NOT EDIT // 
CaseRanges is the table describing case mappings for all letters with @@ -5539,7 +5539,7 @@ var properties = [MaxLatin1 + 1]uint8{ 0x7C: pS | pp, // '|' 0x7D: pP | pp, // '}' 0x7E: pS | pp, // '~' - 0x7F: pC, // '\x7f' + 0x7F: pC, // '\u007f' 0x80: pC, // '\u0080' 0x81: pC, // '\u0081' 0x82: pC, // '\u0082' @@ -5573,102 +5573,534 @@ var properties = [MaxLatin1 + 1]uint8{ 0x9E: pC, // '\u009e' 0x9F: pC, // '\u009f' 0xA0: pZ, // '\u00a0' - 0xA1: pP | pp, // '\u00a1' - 0xA2: pS | pp, // '\u00a2' - 0xA3: pS | pp, // '\u00a3' - 0xA4: pS | pp, // '\u00a4' - 0xA5: pS | pp, // '\u00a5' - 0xA6: pS | pp, // '\u00a6' - 0xA7: pS | pp, // '\u00a7' - 0xA8: pS | pp, // '\u00a8' - 0xA9: pS | pp, // '\u00a9' - 0xAA: pLl | pp, // '\u00aa' - 0xAB: pP | pp, // '\u00ab' - 0xAC: pS | pp, // '\u00ac' + 0xA1: pP | pp, // '¡' + 0xA2: pS | pp, // '¢' + 0xA3: pS | pp, // '£' + 0xA4: pS | pp, // '¤' + 0xA5: pS | pp, // '¥' + 0xA6: pS | pp, // '¦' + 0xA7: pS | pp, // '§' + 0xA8: pS | pp, // '¨' + 0xA9: pS | pp, // '©' + 0xAA: pLl | pp, // 'ª' + 0xAB: pP | pp, // '«' + 0xAC: pS | pp, // '¬' 0xAD: 0, // '\u00ad' - 0xAE: pS | pp, // '\u00ae' - 0xAF: pS | pp, // '\u00af' - 0xB0: pS | pp, // '\u00b0' - 0xB1: pS | pp, // '\u00b1' - 0xB2: pN | pp, // '\u00b2' - 0xB3: pN | pp, // '\u00b3' - 0xB4: pS | pp, // '\u00b4' - 0xB5: pLl | pp, // '\u00b5' - 0xB6: pS | pp, // '\u00b6' - 0xB7: pP | pp, // '\u00b7' - 0xB8: pS | pp, // '\u00b8' - 0xB9: pN | pp, // '\u00b9' - 0xBA: pLl | pp, // '\u00ba' - 0xBB: pP | pp, // '\u00bb' - 0xBC: pN | pp, // '\u00bc' - 0xBD: pN | pp, // '\u00bd' - 0xBE: pN | pp, // '\u00be' - 0xBF: pP | pp, // '\u00bf' - 0xC0: pLu | pp, // '\u00c0' - 0xC1: pLu | pp, // '\u00c1' - 0xC2: pLu | pp, // '\u00c2' - 0xC3: pLu | pp, // '\u00c3' - 0xC4: pLu | pp, // '\u00c4' - 0xC5: pLu | pp, // '\u00c5' - 0xC6: pLu | pp, // '\u00c6' - 0xC7: pLu | pp, // '\u00c7' - 0xC8: pLu | pp, // '\u00c8' - 0xC9: pLu | pp, // '\u00c9' - 0xCA: pLu | pp, // '\u00ca' - 0xCB: pLu | pp, // '\u00cb' - 0xCC: pLu | 
pp, // '\u00cc' - 0xCD: pLu | pp, // '\u00cd' - 0xCE: pLu | pp, // '\u00ce' - 0xCF: pLu | pp, // '\u00cf' - 0xD0: pLu | pp, // '\u00d0' - 0xD1: pLu | pp, // '\u00d1' - 0xD2: pLu | pp, // '\u00d2' - 0xD3: pLu | pp, // '\u00d3' - 0xD4: pLu | pp, // '\u00d4' - 0xD5: pLu | pp, // '\u00d5' - 0xD6: pLu | pp, // '\u00d6' - 0xD7: pS | pp, // '\u00d7' - 0xD8: pLu | pp, // '\u00d8' - 0xD9: pLu | pp, // '\u00d9' - 0xDA: pLu | pp, // '\u00da' - 0xDB: pLu | pp, // '\u00db' - 0xDC: pLu | pp, // '\u00dc' - 0xDD: pLu | pp, // '\u00dd' - 0xDE: pLu | pp, // '\u00de' - 0xDF: pLl | pp, // '\u00df' - 0xE0: pLl | pp, // '\u00e0' - 0xE1: pLl | pp, // '\u00e1' - 0xE2: pLl | pp, // '\u00e2' - 0xE3: pLl | pp, // '\u00e3' - 0xE4: pLl | pp, // '\u00e4' - 0xE5: pLl | pp, // '\u00e5' - 0xE6: pLl | pp, // '\u00e6' - 0xE7: pLl | pp, // '\u00e7' - 0xE8: pLl | pp, // '\u00e8' - 0xE9: pLl | pp, // '\u00e9' - 0xEA: pLl | pp, // '\u00ea' - 0xEB: pLl | pp, // '\u00eb' - 0xEC: pLl | pp, // '\u00ec' - 0xED: pLl | pp, // '\u00ed' - 0xEE: pLl | pp, // '\u00ee' - 0xEF: pLl | pp, // '\u00ef' - 0xF0: pLl | pp, // '\u00f0' - 0xF1: pLl | pp, // '\u00f1' - 0xF2: pLl | pp, // '\u00f2' - 0xF3: pLl | pp, // '\u00f3' - 0xF4: pLl | pp, // '\u00f4' - 0xF5: pLl | pp, // '\u00f5' - 0xF6: pLl | pp, // '\u00f6' - 0xF7: pS | pp, // '\u00f7' - 0xF8: pLl | pp, // '\u00f8' - 0xF9: pLl | pp, // '\u00f9' - 0xFA: pLl | pp, // '\u00fa' - 0xFB: pLl | pp, // '\u00fb' - 0xFC: pLl | pp, // '\u00fc' - 0xFD: pLl | pp, // '\u00fd' - 0xFE: pLl | pp, // '\u00fe' - 0xFF: pLl | pp, // '\u00ff' -} - -// Range entries: 3190 16-bit, 657 32-bit, 3847 total. -// Range bytes: 19140 16-bit, 7884 32-bit, 27024 total. 
+ 0xAE: pS | pp, // '®' + 0xAF: pS | pp, // '¯' + 0xB0: pS | pp, // '°' + 0xB1: pS | pp, // '±' + 0xB2: pN | pp, // '²' + 0xB3: pN | pp, // '³' + 0xB4: pS | pp, // '´' + 0xB5: pLl | pp, // 'µ' + 0xB6: pS | pp, // '¶' + 0xB7: pP | pp, // '·' + 0xB8: pS | pp, // '¸' + 0xB9: pN | pp, // '¹' + 0xBA: pLl | pp, // 'º' + 0xBB: pP | pp, // '»' + 0xBC: pN | pp, // '¼' + 0xBD: pN | pp, // '½' + 0xBE: pN | pp, // '¾' + 0xBF: pP | pp, // '¿' + 0xC0: pLu | pp, // 'À' + 0xC1: pLu | pp, // 'Á' + 0xC2: pLu | pp, // 'Â' + 0xC3: pLu | pp, // 'Ã' + 0xC4: pLu | pp, // 'Ä' + 0xC5: pLu | pp, // 'Å' + 0xC6: pLu | pp, // 'Æ' + 0xC7: pLu | pp, // 'Ç' + 0xC8: pLu | pp, // 'È' + 0xC9: pLu | pp, // 'É' + 0xCA: pLu | pp, // 'Ê' + 0xCB: pLu | pp, // 'Ë' + 0xCC: pLu | pp, // 'Ì' + 0xCD: pLu | pp, // 'Í' + 0xCE: pLu | pp, // 'Î' + 0xCF: pLu | pp, // 'Ï' + 0xD0: pLu | pp, // 'Ð' + 0xD1: pLu | pp, // 'Ñ' + 0xD2: pLu | pp, // 'Ò' + 0xD3: pLu | pp, // 'Ó' + 0xD4: pLu | pp, // 'Ô' + 0xD5: pLu | pp, // 'Õ' + 0xD6: pLu | pp, // 'Ö' + 0xD7: pS | pp, // '×' + 0xD8: pLu | pp, // 'Ø' + 0xD9: pLu | pp, // 'Ù' + 0xDA: pLu | pp, // 'Ú' + 0xDB: pLu | pp, // 'Û' + 0xDC: pLu | pp, // 'Ü' + 0xDD: pLu | pp, // 'Ý' + 0xDE: pLu | pp, // 'Þ' + 0xDF: pLl | pp, // 'ß' + 0xE0: pLl | pp, // 'à' + 0xE1: pLl | pp, // 'á' + 0xE2: pLl | pp, // 'â' + 0xE3: pLl | pp, // 'ã' + 0xE4: pLl | pp, // 'ä' + 0xE5: pLl | pp, // 'å' + 0xE6: pLl | pp, // 'æ' + 0xE7: pLl | pp, // 'ç' + 0xE8: pLl | pp, // 'è' + 0xE9: pLl | pp, // 'é' + 0xEA: pLl | pp, // 'ê' + 0xEB: pLl | pp, // 'ë' + 0xEC: pLl | pp, // 'ì' + 0xED: pLl | pp, // 'í' + 0xEE: pLl | pp, // 'î' + 0xEF: pLl | pp, // 'ï' + 0xF0: pLl | pp, // 'ð' + 0xF1: pLl | pp, // 'ñ' + 0xF2: pLl | pp, // 'ò' + 0xF3: pLl | pp, // 'ó' + 0xF4: pLl | pp, // 'ô' + 0xF5: pLl | pp, // 'õ' + 0xF6: pLl | pp, // 'ö' + 0xF7: pS | pp, // '÷' + 0xF8: pLl | pp, // 'ø' + 0xF9: pLl | pp, // 'ù' + 0xFA: pLl | pp, // 'ú' + 0xFB: pLl | pp, // 'û' + 0xFC: pLl | pp, // 'ü' + 0xFD: pLl | pp, // 'ý' + 0xFE: pLl | pp, 
// 'þ' + 0xFF: pLl | pp, // 'ÿ' +} + +var caseOrbit = []foldPair{ + {0x004B, 0x006B}, + {0x0053, 0x0073}, + {0x006B, 0x212A}, + {0x0073, 0x017F}, + {0x00B5, 0x039C}, + {0x00C5, 0x00E5}, + {0x00DF, 0x1E9E}, + {0x00E5, 0x212B}, + {0x0130, 0x0130}, + {0x0131, 0x0131}, + {0x017F, 0x0053}, + {0x01C4, 0x01C5}, + {0x01C5, 0x01C6}, + {0x01C6, 0x01C4}, + {0x01C7, 0x01C8}, + {0x01C8, 0x01C9}, + {0x01C9, 0x01C7}, + {0x01CA, 0x01CB}, + {0x01CB, 0x01CC}, + {0x01CC, 0x01CA}, + {0x01F1, 0x01F2}, + {0x01F2, 0x01F3}, + {0x01F3, 0x01F1}, + {0x0345, 0x0399}, + {0x0392, 0x03B2}, + {0x0395, 0x03B5}, + {0x0398, 0x03B8}, + {0x0399, 0x03B9}, + {0x039A, 0x03BA}, + {0x039C, 0x03BC}, + {0x03A0, 0x03C0}, + {0x03A1, 0x03C1}, + {0x03A3, 0x03C2}, + {0x03A6, 0x03C6}, + {0x03A9, 0x03C9}, + {0x03B2, 0x03D0}, + {0x03B5, 0x03F5}, + {0x03B8, 0x03D1}, + {0x03B9, 0x1FBE}, + {0x03BA, 0x03F0}, + {0x03BC, 0x00B5}, + {0x03C0, 0x03D6}, + {0x03C1, 0x03F1}, + {0x03C2, 0x03C3}, + {0x03C3, 0x03A3}, + {0x03C6, 0x03D5}, + {0x03C9, 0x2126}, + {0x03D0, 0x0392}, + {0x03D1, 0x03F4}, + {0x03D5, 0x03A6}, + {0x03D6, 0x03A0}, + {0x03F0, 0x039A}, + {0x03F1, 0x03A1}, + {0x03F4, 0x0398}, + {0x03F5, 0x0395}, + {0x1E60, 0x1E61}, + {0x1E61, 0x1E9B}, + {0x1E9B, 0x1E60}, + {0x1E9E, 0x00DF}, + {0x1FBE, 0x0345}, + {0x2126, 0x03A9}, + {0x212A, 0x004B}, + {0x212B, 0x00C5}, + {0x2160, 0x2170}, + {0x2161, 0x2171}, + {0x2162, 0x2172}, + {0x2163, 0x2173}, + {0x2164, 0x2174}, + {0x2165, 0x2175}, + {0x2166, 0x2176}, + {0x2167, 0x2177}, + {0x2168, 0x2178}, + {0x2169, 0x2179}, + {0x216A, 0x217A}, + {0x216B, 0x217B}, + {0x216C, 0x217C}, + {0x216D, 0x217D}, + {0x216E, 0x217E}, + {0x216F, 0x217F}, + {0x2170, 0x2160}, + {0x2171, 0x2161}, + {0x2172, 0x2162}, + {0x2173, 0x2163}, + {0x2174, 0x2164}, + {0x2175, 0x2165}, + {0x2176, 0x2166}, + {0x2177, 0x2167}, + {0x2178, 0x2168}, + {0x2179, 0x2169}, + {0x217A, 0x216A}, + {0x217B, 0x216B}, + {0x217C, 0x216C}, + {0x217D, 0x216D}, + {0x217E, 0x216E}, + {0x217F, 0x216F}, + {0x24B6, 0x24D0}, + {0x24B7, 
0x24D1}, + {0x24B8, 0x24D2}, + {0x24B9, 0x24D3}, + {0x24BA, 0x24D4}, + {0x24BB, 0x24D5}, + {0x24BC, 0x24D6}, + {0x24BD, 0x24D7}, + {0x24BE, 0x24D8}, + {0x24BF, 0x24D9}, + {0x24C0, 0x24DA}, + {0x24C1, 0x24DB}, + {0x24C2, 0x24DC}, + {0x24C3, 0x24DD}, + {0x24C4, 0x24DE}, + {0x24C5, 0x24DF}, + {0x24C6, 0x24E0}, + {0x24C7, 0x24E1}, + {0x24C8, 0x24E2}, + {0x24C9, 0x24E3}, + {0x24CA, 0x24E4}, + {0x24CB, 0x24E5}, + {0x24CC, 0x24E6}, + {0x24CD, 0x24E7}, + {0x24CE, 0x24E8}, + {0x24CF, 0x24E9}, + {0x24D0, 0x24B6}, + {0x24D1, 0x24B7}, + {0x24D2, 0x24B8}, + {0x24D3, 0x24B9}, + {0x24D4, 0x24BA}, + {0x24D5, 0x24BB}, + {0x24D6, 0x24BC}, + {0x24D7, 0x24BD}, + {0x24D8, 0x24BE}, + {0x24D9, 0x24BF}, + {0x24DA, 0x24C0}, + {0x24DB, 0x24C1}, + {0x24DC, 0x24C2}, + {0x24DD, 0x24C3}, + {0x24DE, 0x24C4}, + {0x24DF, 0x24C5}, + {0x24E0, 0x24C6}, + {0x24E1, 0x24C7}, + {0x24E2, 0x24C8}, + {0x24E3, 0x24C9}, + {0x24E4, 0x24CA}, + {0x24E5, 0x24CB}, + {0x24E6, 0x24CC}, + {0x24E7, 0x24CD}, + {0x24E8, 0x24CE}, + {0x24E9, 0x24CF}, +} + +// FoldCategory maps a category name to a table of +// code points outside the category that are equivalent under +// simple case folding to code points inside the category. +// If there is no entry for a category name, there are no such points. 
+var FoldCategory = map[string]*RangeTable{ + "Ll": foldLl, + "Inherited": foldInherited, + "M": foldM, + "L": foldL, + "Mn": foldMn, + "Common": foldCommon, + "Greek": foldGreek, + "Lu": foldLu, + "Lt": foldLt, +} + +var foldLl = &RangeTable{ + R16: []Range16{ + {0x0041, 0x005a, 1}, + {0x00c0, 0x00d6, 1}, + {0x00d8, 0x00de, 1}, + {0x0100, 0x012e, 2}, + {0x0132, 0x0136, 2}, + {0x0139, 0x0147, 2}, + {0x014a, 0x0178, 2}, + {0x0179, 0x017d, 2}, + {0x0181, 0x0182, 1}, + {0x0184, 0x0186, 2}, + {0x0187, 0x0189, 2}, + {0x018a, 0x018b, 1}, + {0x018e, 0x0191, 1}, + {0x0193, 0x0194, 1}, + {0x0196, 0x0198, 1}, + {0x019c, 0x019d, 1}, + {0x019f, 0x01a0, 1}, + {0x01a2, 0x01a6, 2}, + {0x01a7, 0x01a9, 2}, + {0x01ac, 0x01ae, 2}, + {0x01af, 0x01b1, 2}, + {0x01b2, 0x01b3, 1}, + {0x01b5, 0x01b7, 2}, + {0x01b8, 0x01bc, 4}, + {0x01c4, 0x01c5, 1}, + {0x01c7, 0x01c8, 1}, + {0x01ca, 0x01cb, 1}, + {0x01cd, 0x01db, 2}, + {0x01de, 0x01ee, 2}, + {0x01f1, 0x01f2, 1}, + {0x01f4, 0x01f6, 2}, + {0x01f7, 0x01f8, 1}, + {0x01fa, 0x0232, 2}, + {0x023a, 0x023b, 1}, + {0x023d, 0x023e, 1}, + {0x0241, 0x0243, 2}, + {0x0244, 0x0246, 1}, + {0x0248, 0x024e, 2}, + {0x0345, 0x0370, 43}, + {0x0372, 0x0376, 4}, + {0x0386, 0x0388, 2}, + {0x0389, 0x038a, 1}, + {0x038c, 0x038e, 2}, + {0x038f, 0x0391, 2}, + {0x0392, 0x03a1, 1}, + {0x03a3, 0x03ab, 1}, + {0x03cf, 0x03d8, 9}, + {0x03da, 0x03ee, 2}, + {0x03f4, 0x03f7, 3}, + {0x03f9, 0x03fa, 1}, + {0x03fd, 0x042f, 1}, + {0x0460, 0x0480, 2}, + {0x048a, 0x04c0, 2}, + {0x04c1, 0x04cd, 2}, + {0x04d0, 0x0526, 2}, + {0x0531, 0x0556, 1}, + {0x10a0, 0x10c5, 1}, + {0x1e00, 0x1e94, 2}, + {0x1e9e, 0x1efe, 2}, + {0x1f08, 0x1f0f, 1}, + {0x1f18, 0x1f1d, 1}, + {0x1f28, 0x1f2f, 1}, + {0x1f38, 0x1f3f, 1}, + {0x1f48, 0x1f4d, 1}, + {0x1f59, 0x1f5f, 2}, + {0x1f68, 0x1f6f, 1}, + {0x1f88, 0x1f8f, 1}, + {0x1f98, 0x1f9f, 1}, + {0x1fa8, 0x1faf, 1}, + {0x1fb8, 0x1fbc, 1}, + {0x1fc8, 0x1fcc, 1}, + {0x1fd8, 0x1fdb, 1}, + {0x1fe8, 0x1fec, 1}, + {0x1ff8, 0x1ffc, 1}, + {0x2126, 0x212a, 4}, + {0x212b, 
0x2132, 7}, + {0x2183, 0x2c00, 2685}, + {0x2c01, 0x2c2e, 1}, + {0x2c60, 0x2c62, 2}, + {0x2c63, 0x2c64, 1}, + {0x2c67, 0x2c6d, 2}, + {0x2c6e, 0x2c70, 1}, + {0x2c72, 0x2c75, 3}, + {0x2c7e, 0x2c80, 1}, + {0x2c82, 0x2ce2, 2}, + {0x2ceb, 0x2ced, 2}, + {0xa640, 0xa66c, 2}, + {0xa680, 0xa696, 2}, + {0xa722, 0xa72e, 2}, + {0xa732, 0xa76e, 2}, + {0xa779, 0xa77d, 2}, + {0xa77e, 0xa786, 2}, + {0xa78b, 0xa78d, 2}, + {0xa790, 0xa7a0, 16}, + {0xa7a2, 0xa7a8, 2}, + {0xff21, 0xff3a, 1}, + }, + R32: []Range32{ + {0x10400, 0x10427, 1}, + }, +} + +var foldInherited = &RangeTable{ + R16: []Range16{ + {0x0399, 0x03b9, 32}, + {0x1fbe, 0x1fbe, 1}, + }, +} + +var foldM = &RangeTable{ + R16: []Range16{ + {0x0399, 0x03b9, 32}, + {0x1fbe, 0x1fbe, 1}, + }, +} + +var foldL = &RangeTable{ + R16: []Range16{ + {0x0345, 0x0345, 1}, + }, +} + +var foldMn = &RangeTable{ + R16: []Range16{ + {0x0399, 0x03b9, 32}, + {0x1fbe, 0x1fbe, 1}, + }, +} + +var foldCommon = &RangeTable{ + R16: []Range16{ + {0x039c, 0x03bc, 32}, + }, +} + +var foldGreek = &RangeTable{ + R16: []Range16{ + {0x00b5, 0x0345, 656}, + }, +} + +var foldLu = &RangeTable{ + R16: []Range16{ + {0x0061, 0x007a, 1}, + {0x00b5, 0x00df, 42}, + {0x00e0, 0x00f6, 1}, + {0x00f8, 0x00ff, 1}, + {0x0101, 0x012f, 2}, + {0x0133, 0x0137, 2}, + {0x013a, 0x0148, 2}, + {0x014b, 0x0177, 2}, + {0x017a, 0x017e, 2}, + {0x017f, 0x0180, 1}, + {0x0183, 0x0185, 2}, + {0x0188, 0x018c, 4}, + {0x0192, 0x0195, 3}, + {0x0199, 0x019a, 1}, + {0x019e, 0x01a1, 3}, + {0x01a3, 0x01a5, 2}, + {0x01a8, 0x01ad, 5}, + {0x01b0, 0x01b4, 4}, + {0x01b6, 0x01b9, 3}, + {0x01bd, 0x01bf, 2}, + {0x01c5, 0x01c6, 1}, + {0x01c8, 0x01c9, 1}, + {0x01cb, 0x01cc, 1}, + {0x01ce, 0x01dc, 2}, + {0x01dd, 0x01ef, 2}, + {0x01f2, 0x01f3, 1}, + {0x01f5, 0x01f9, 4}, + {0x01fb, 0x021f, 2}, + {0x0223, 0x0233, 2}, + {0x023c, 0x023f, 3}, + {0x0240, 0x0242, 2}, + {0x0247, 0x024f, 2}, + {0x0250, 0x0254, 1}, + {0x0256, 0x0257, 1}, + {0x0259, 0x025b, 2}, + {0x0260, 0x0263, 3}, + {0x0265, 0x0268, 3}, + {0x0269, 
0x026b, 2}, + {0x026f, 0x0271, 2}, + {0x0272, 0x0275, 3}, + {0x027d, 0x0283, 3}, + {0x0288, 0x028c, 1}, + {0x0292, 0x0345, 179}, + {0x0371, 0x0373, 2}, + {0x0377, 0x037b, 4}, + {0x037c, 0x037d, 1}, + {0x03ac, 0x03af, 1}, + {0x03b1, 0x03ce, 1}, + {0x03d0, 0x03d1, 1}, + {0x03d5, 0x03d7, 1}, + {0x03d9, 0x03ef, 2}, + {0x03f0, 0x03f2, 1}, + {0x03f5, 0x03fb, 3}, + {0x0430, 0x045f, 1}, + {0x0461, 0x0481, 2}, + {0x048b, 0x04bf, 2}, + {0x04c2, 0x04ce, 2}, + {0x04cf, 0x0527, 2}, + {0x0561, 0x0586, 1}, + {0x1d79, 0x1d7d, 4}, + {0x1e01, 0x1e95, 2}, + {0x1e9b, 0x1ea1, 6}, + {0x1ea3, 0x1eff, 2}, + {0x1f00, 0x1f07, 1}, + {0x1f10, 0x1f15, 1}, + {0x1f20, 0x1f27, 1}, + {0x1f30, 0x1f37, 1}, + {0x1f40, 0x1f45, 1}, + {0x1f51, 0x1f57, 2}, + {0x1f60, 0x1f67, 1}, + {0x1f70, 0x1f7d, 1}, + {0x1fb0, 0x1fb1, 1}, + {0x1fbe, 0x1fd0, 18}, + {0x1fd1, 0x1fe0, 15}, + {0x1fe1, 0x1fe5, 4}, + {0x214e, 0x2184, 54}, + {0x2c30, 0x2c5e, 1}, + {0x2c61, 0x2c65, 4}, + {0x2c66, 0x2c6c, 2}, + {0x2c73, 0x2c76, 3}, + {0x2c81, 0x2ce3, 2}, + {0x2cec, 0x2cee, 2}, + {0x2d00, 0x2d25, 1}, + {0xa641, 0xa66d, 2}, + {0xa681, 0xa697, 2}, + {0xa723, 0xa72f, 2}, + {0xa733, 0xa76f, 2}, + {0xa77a, 0xa77c, 2}, + {0xa77f, 0xa787, 2}, + {0xa78c, 0xa791, 5}, + {0xa7a1, 0xa7a9, 2}, + {0xff41, 0xff5a, 1}, + }, + R32: []Range32{ + {0x10428, 0x1044f, 1}, + }, +} + +var foldLt = &RangeTable{ + R16: []Range16{ + {0x01c4, 0x01c6, 2}, + {0x01c7, 0x01c9, 2}, + {0x01ca, 0x01cc, 2}, + {0x01f1, 0x01f3, 2}, + {0x1f80, 0x1f87, 1}, + {0x1f90, 0x1f97, 1}, + {0x1fa0, 0x1fa7, 1}, + {0x1fb3, 0x1fc3, 16}, + {0x1ff3, 0x1ff3, 1}, + }, +} + +// FoldScript maps a script name to a table of +// code points outside the script that are equivalent under +// simple case folding to code points inside the script. +// If there is no entry for a script name, there are no such points. +var FoldScript = map[string]*RangeTable{} + + +// Range entries: 3391 16-bit, 659 32-bit, 4050 total. +// Range bytes: 20346 16-bit, 7908 32-bit, 28254 total. 
+ +// Fold orbit bytes: 147 pairs, 588 bytes |