diff options
Diffstat (limited to 'src/pkg/unicode/maketables.go')
-rw-r--r-- | src/pkg/unicode/maketables.go | 353 |
1 files changed, 319 insertions, 34 deletions
diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go
index 655fe46e4..07b931d7e 100644
--- a/src/pkg/unicode/maketables.go
+++ b/src/pkg/unicode/maketables.go
@@ -24,15 +24,18 @@ import (
 func main() {
 	flag.Parse()
 	loadChars() // always needed
+	loadCasefold() // fills in Char.foldCase, which printCasefold reads
 	printCategories()
 	printScriptOrProperty(false)
 	printScriptOrProperty(true)
 	printCases()
 	printLatinProperties()
+	printCasefold() // emits the case-folding tables
 	printSizes()
 }
 
 var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
+var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
 var url = flag.String("url",
 	"http://www.unicode.org/Public/6.0.0/ucd/",
 	"URL of Unicode database directory")
@@ -70,7 +73,7 @@ var category = map[string]bool{
 // UnicodeData.txt has form:
 // 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
 // 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
-// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
+// See http://www.unicode.org/reports/tr44/ for a full explanation.
 // The fields:
 const (
 	FCodePoint = iota
@@ -78,10 +81,10 @@ const (
 	FGeneralCategory
 	FCanonicalCombiningClass
 	FBidiClass
-	FDecompositionType
-	FDecompositionMapping
+	FDecompositionTypeAndMapping // a single ';'-separated field in UnicodeData.txt
 	FNumericType
-	FNumericValue
+	FNumericDigit // If a decimal digit.
+	FNumericValue // Includes non-decimal, e.g. U+2155=1/5
 	FBidiMirrored
 	FUnicode1Name
 	FISOComment
@@ -94,21 +97,21 @@ const (
 )
 
 var fieldName = []string{
-	"CodePoint",
-	"Name",
-	"GeneralCategory",
-	"CanonicalCombiningClass",
-	"BidiClass",
-	"DecompositionType",
-	"DecompositionMapping",
-	"NumericType",
-	"NumericValue",
-	"BidiMirrored",
-	"Unicode1Name",
-	"ISOComment",
-	"SimpleUppercaseMapping",
-	"SimpleLowercaseMapping",
-	"SimpleTitlecaseMapping",
+	FCodePoint: "CodePoint", // keyed by the field constants so names cannot drift out of order
+	FName: "Name",
+	FGeneralCategory: "GeneralCategory",
+	FCanonicalCombiningClass: "CanonicalCombiningClass",
+	FBidiClass: "BidiClass",
+	FDecompositionTypeAndMapping: "DecompositionTypeAndMapping",
+	FNumericType: "NumericType",
+	FNumericDigit: "NumericDigit",
+	FNumericValue: "NumericValue",
+	FBidiMirrored: "BidiMirrored",
+	FUnicode1Name: "Unicode1Name",
+	FISOComment: "ISOComment",
+	FSimpleUppercaseMapping: "SimpleUppercaseMapping",
+	FSimpleLowercaseMapping: "SimpleLowercaseMapping",
+	FSimpleTitlecaseMapping: "SimpleTitlecaseMapping",
 }
 
 // This contains only the properties we're interested in.
@@ -119,6 +122,8 @@ type Char struct {
 	upperCase int
 	lowerCase int
 	titleCase int
+	foldCase  int // simple case folding
+	caseOrbit int // next in simple case folding orbit
 }
 
 // Scripts.txt has form:
@@ -151,7 +156,7 @@ const (
 )
 
 func parseCategory(line string) (state State) {
-	field := strings.Split(line, ";", -1)
+	field := strings.Split(line, ";")
 	if len(field) != NumField {
 		logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField)
 	}
@@ -248,7 +253,7 @@ func all(scripts map[string][]Script) []string {
 // Extract the version number from the URL
 func version() string {
 	// Break on slashes and look for the first numeric field
-	fields := strings.Split(*url, "/", -1)
+	fields := strings.Split(*url, "/")
 	for _, f := range fields {
 		if len(f) > 0 && '0' <= f[0] && f[0] <= '9' {
 			return f
@@ -308,8 +313,53 @@ func loadChars() {
 	resp.Body.Close()
 }
 
+func loadCasefold() { // Fetch CaseFolding.txt and record each C/S folding in chars[...].foldCase.
+	if *casefoldingURL == "" {
+		flag.Set("casefolding", *url+"CaseFolding.txt") // default: CaseFolding.txt under --url
+	}
+	resp, err := http.Get(*casefoldingURL)
+	if err != nil {
+		logger.Fatal(err)
+	}
+	if resp.StatusCode != 200 {
+		logger.Fatal("bad GET status for CaseFolding.txt: ", resp.Status) // Fatal adds no space between two strings
+	}
+	input := bufio.NewReader(resp.Body)
+	for {
+		line, err := input.ReadString('\n')
+		if err != nil {
+			if err == os.EOF {
+				break
+			}
+			logger.Fatal(err)
+		}
+		if len(strings.TrimSpace(line)) == 0 || line[0] == '#' { // skip blank lines as well as comments
+			continue
+		}
+		field := strings.Split(line, "; ") // code; status; mapping; # name
+		if len(field) != 4 {
+			logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
+		}
+		kind := field[1]
+		if kind != "C" && kind != "S" {
+			// Only care about 'common' and 'simple' foldings.
+			continue
+		}
+		p1, err := strconv.Btoui64(field[0], 16)
+		if err != nil {
+			logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
+		}
+		p2, err := strconv.Btoui64(field[2], 16)
+		if err != nil {
+			logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
+		}
+		chars[p1].foldCase = int(p2) // p1 folds to p2
+	}
+	resp.Body.Close()
+}
+
 const progHeader = `// Generated by running
-// maketables --tables=%s --data=%s
+// maketables --tables=%s --data=%s --casefolding=%s
 // DO NOT EDIT
 
 package unicode
@@ -322,7 +372,7 @@ func printCategories() {
 		return
 	}
 	// Find out which categories to dump
-	list := strings.Split(*tablelist, ",", -1)
+	list := strings.Split(*tablelist, ",")
 	if *tablelist == "all" {
 		list = allCategories()
 	}
@@ -330,7 +380,7 @@ func printCategories() {
 		fullCategoryTest(list)
 		return
 	}
-	fmt.Printf(progHeader, *tablelist, *dataURL)
+	fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
 
 	fmt.Println("// Version is the Unicode edition from which the tables are derived.")
 	fmt.Printf("const Version = %q\n\n", version())
@@ -344,7 +394,7 @@ func printCategories() {
 		fmt.Print("}\n\n")
 	}
 
-	decl := make(sort.StringArray, len(list))
+	decl := make(sort.StringSlice, len(list))
 	ndecl := 0
 	for _, name := range list {
 		if _, ok := category[name]; !ok {
@@ -538,7 +588,7 @@ func parseScript(line string, scripts map[string][]Script) {
 	if len(line) == 0 {
 		return
 	}
-	field := strings.Split(line, ";", -1)
+	field := strings.Split(line, ";")
 	if len(field) != 2 {
 		logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field))
 	}
@@ -635,7 +685,7 @@ func printScriptOrProperty(doProps bool) {
 	resp.Body.Close()
 
 	// Find out which scripts to dump
-	list := strings.Split(flaglist, ",", -1)
+	list := strings.Split(flaglist, ",")
 	if flaglist == "all" {
 		list = all(table)
 	}
@@ -665,7 +715,7 @@ func printScriptOrProperty(doProps bool) {
 		fmt.Print("}\n\n")
 	}
 
-	decl := make(sort.StringArray, len(list))
+	decl := make(sort.StringSlice, len(list))
 	ndecl := 0
 	for _, name := range list {
 		if doProps {
@@ -837,13 +887,13 @@ func printCases() {
 	}
 	fmt.Printf(
 		"// Generated by running\n"+
-			"// maketables --data=%s\n"+
+			"// maketables --data=%s --casefolding=%s\n"+
 			"// DO NOT EDIT\n\n"+
 			"// CaseRanges is the table describing case mappings for all letters with\n"+
 			"// non-self mappings.\n"+
 			"var CaseRanges = _CaseRanges\n"+
 			"var _CaseRanges = []CaseRange {\n",
-		*dataURL)
+		*dataURL, *casefoldingURL)
 
 	var startState *caseState    // the start of a run; nil for not active
 	var prevState = &caseState{} // the state of the previous character
@@ -946,13 +996,246 @@ func printLatinProperties() {
 		if code == ' ' {
 			property = "pZ | pp"
 		}
-		fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code)
+		fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code)
 	}
-	fmt.Println("}")
+	fmt.Printf("}\n\n")
 }
 
-var range16Count = 0 // Number of entries in the 16-bit range tables.
-var range32Count = 0 // Number of entries in the 32-bit range tables.
+func printCasefold() { // Compute fold orbits from Char.foldCase, then print the orbit and fold-exception tables.
+	// Build list of case-folding groups attached to each canonical folded char (typically lower case).
+	var caseOrbit = make([][]int, MaxChar+1)
+	for i := range chars {
+		c := &chars[i]
+		if c.foldCase == 0 {
+			continue
+		}
+		orb := caseOrbit[c.foldCase]
+		if orb == nil {
+			orb = append(orb, c.foldCase) // the fold target itself is the group's first member
+		}
+		caseOrbit[c.foldCase] = append(orb, i)
+	}
+
+	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
+	for i := range chars {
+		c := &chars[i]
+		f := c.foldCase
+		if f == 0 {
+			f = i
+		}
+		orb := caseOrbit[f]
+		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
+			// Default assumption of [lower, upper] is wrong.
+			caseOrbit[i] = []int{i}
+		}
+	}
+
+	// Delete the groups for which assuming [lower, upper] is right.
+	for i, orb := range caseOrbit {
+		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
+			caseOrbit[i] = nil
+		}
+	}
+
+	// Record orbit information in chars: each member points to the next, the largest wraps to the smallest.
+	for _, orb := range caseOrbit {
+		if orb == nil {
+			continue
+		}
+		sort.Ints(orb)
+		c := orb[len(orb)-1]
+		for _, d := range orb {
+			chars[c].caseOrbit = d
+			c = d
+		}
+	}
+
+	printCaseOrbit()
+
+	// Tables of category and script folding exceptions: code points
+	// that must be added when interpreting a particular category/script
+	// in a case-folding context.
+	cat := make(map[string]map[int]bool)
+	for name := range category {
+		if x := foldExceptions(inCategory(name)); len(x) > 0 {
+			cat[name] = x
+		}
+	}
+
+	scr := make(map[string]map[int]bool)
+	for name := range scripts {
+		if x := foldExceptions(inScript(name)); len(x) > 0 {
+			scr[name] = x // was cat[name]: left FoldScript empty and put script names in FoldCategory
+		}
+	}
+
+	printCatFold("FoldCategory", cat)
+	printCatFold("FoldScript", scr)
+}
+
+// inCategory returns a list of all the runes in the category; a one-letter name also matches its subcategories.
+func inCategory(name string) []int {
+	var x []int
+	for i := range chars {
+		c := &chars[i]
+		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
+			x = append(x, i)
+		}
+	}
+	return x
+}
+
+// inScript returns a list of all the runes in the script.
+func inScript(name string) []int {
+	var x []int
+	for _, s := range scripts[name] {
+		for c := s.lo; c <= s.hi; c++ {
+			x = append(x, int(c))
+		}
+	}
+	return x
+}
+
+// foldExceptions returns the set of all the runes fold-equivalent
+// to runes in class but not in class themselves.
+func foldExceptions(class []int) map[int]bool {
+	// Create map containing class and all fold-equivalent chars.
+	m := make(map[int]bool)
+	for _, r := range class {
+		c := &chars[r]
+		if c.caseOrbit == 0 {
+			// Just upper and lower.
+			if u := c.upperCase; u != 0 {
+				m[u] = true
+			}
+			if l := c.lowerCase; l != 0 {
+				m[l] = true
+			}
+			m[r] = true
+			continue
+		}
+		// Otherwise walk orbit until it returns to the starting rune.
+		r0 := r
+		for {
+			m[r] = true
+			r = chars[r].caseOrbit
+			if r == r0 {
+				break
+			}
+		}
+	}
+
+	// Remove class itself.
+	for _, r := range class {
+		m[r] = false, false // pre-Go 1 map-delete form: removes the key
+	}
+
+	// What's left is the exceptions.
+	return m
+}
+
+var comment = map[string]string{
+	"FoldCategory": "// FoldCategory maps a category name to a table of\n" +
+		"// code points outside the category that are equivalent under\n" +
+		"// simple case folding to code points inside the category.\n" +
+		"// If there is no entry for a category name, there are no such points.\n",
+
+	"FoldScript": "// FoldScript maps a script name to a table of\n" +
+		"// code points outside the script that are equivalent under\n" +
+		"// simple case folding to code points inside the script.\n" +
+		"// If there is no entry for a script name, there are no such points.\n",
+}
+
+func printCaseOrbit() { // With -test, check unicode.SimpleFold against the orbits; otherwise print the caseOrbit table.
+	if *test {
+		for i := range chars {
+			c := &chars[i]
+			f := c.caseOrbit
+			if f == 0 {
+				if c.lowerCase != i && c.lowerCase != 0 {
+					f = c.lowerCase
+				} else if c.upperCase != i && c.upperCase != 0 {
+					f = c.upperCase
+				} else {
+					f = i
+				}
+			}
+			if g := unicode.SimpleFold(i); g != f {
+				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
+			}
+		}
+		return
+	}
+
+	fmt.Printf("var caseOrbit = []foldPair{\n")
+	for i := range chars {
+		c := &chars[i]
+		if c.caseOrbit != 0 {
+			fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
+			foldPairCount++
+		}
+	}
+	fmt.Printf("}\n\n")
+}
+
+func printCatFold(name string, m map[string]map[int]bool) { // With -test, compare m to the unicode package map; otherwise print it.
+	if *test {
+		var pkgMap map[string]*unicode.RangeTable
+		if name == "FoldCategory" {
+			pkgMap = unicode.FoldCategory
+		} else {
+			pkgMap = unicode.FoldScript
+		}
+		if len(pkgMap) != len(m) {
+			fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
+			return
+		}
+		for k, v := range m {
+			t, ok := pkgMap[k]
+			if !ok {
+				fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
+				continue
+			}
+			n := 0 // code points found in the package table
+			for _, r := range t.R16 {
+				for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
+					if !v[c] {
+						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
+					}
+					n++
+				}
+			}
+			for _, r := range t.R32 {
+				for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
+					if !v[c] {
+						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
+					}
+					n++
+				}
+			}
+			if n != len(v) {
+				fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
+			}
+		}
+		return
+	}
+
+	fmt.Print(comment[name])
+	fmt.Printf("var %s = map[string]*RangeTable{\n", name)
+	for name := range m { // name shadows the parameter: here it is a category/script key
+		fmt.Printf("\t%q: fold%s,\n", name, name)
+	}
+	fmt.Printf("}\n\n")
+	for name, class := range m {
+		dumpRange(
+			fmt.Sprintf("var fold%s = &RangeTable{\n", name),
+			func(code int) bool { return class[code] })
+	}
+}
+
+var range16Count = 0  // Number of entries in the 16-bit range tables.
+var range32Count = 0  // Number of entries in the 32-bit range tables.
+var foldPairCount = 0 // Number of fold pairs in the exception tables.
 
 func printSizes() {
 	if *test {
@@ -963,4 +1246,6 @@ func printSizes() {
 	range16Bytes := range16Count * 3 * 2
 	range32Bytes := range32Count * 3 * 4
 	fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
+	fmt.Println()
+	fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
 }