diff options
Diffstat (limited to 'src/pkg/unicode/maketables.go')
-rw-r--r-- | src/pkg/unicode/maketables.go | 219 |
1 files changed, 169 insertions, 50 deletions
diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go index 0c367673e..655fe46e4 100644 --- a/src/pkg/unicode/maketables.go +++ b/src/pkg/unicode/maketables.go @@ -28,6 +28,8 @@ func main() { printScriptOrProperty(false) printScriptOrProperty(true) printCases() + printLatinProperties() + printSizes() } var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") @@ -53,7 +55,17 @@ var test = flag.Bool("test", var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`) var logger = log.New(os.Stderr, "", log.Lshortfile) -var category = map[string]bool{"letter": true} // Nd Lu etc. letter is a special case +var category = map[string]bool{ + // Nd Lu etc. + // We use one-character names to identify merged categories + "L": true, // Lu Ll Lt Lm Lo + "P": true, // Pc Pd Ps Pe Pu Pf Po + "M": true, // Mn Mc Me + "N": true, // Nd Nl No + "S": true, // Sm Sc Sk So + "Z": true, // Zs Zl Zp + "C": true, // Cc Cf Cs Co Cn +} // UnicodeData.txt has form: // 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;; @@ -246,19 +258,16 @@ func version() string { return "Unknown" } -func letterOp(code int) bool { - switch chars[code].category { - case "Lu", "Ll", "Lt", "Lm", "Lo": - return true - } - return false +func categoryOp(code int, class uint8) bool { + category := chars[code].category + return len(category) > 0 && category[0] == class } func loadChars() { if *dataURL == "" { flag.Set("data", *url+"UnicodeData.txt") } - resp, _, err := http.Get(*dataURL) + resp, err := http.Get(*dataURL) if err != nil { logger.Fatal(err) } @@ -278,16 +287,16 @@ func loadChars() { switch parseCategory(line[0 : len(line)-1]) { case SNormal: if first != 0 { - logger.Fatalf("bad state normal at U+%04X", lastChar) + logger.Fatalf("bad state normal at %U", lastChar) } case SFirst: if first != 0 { - logger.Fatalf("bad state first at U+%04X", lastChar) + logger.Fatalf("bad state first at %U", lastChar) } first = lastChar case SLast: if first == 0 { - logger.Fatalf("bad state last at U+%04X", lastChar) + logger.Fatalf("bad state last at %U", lastChar) } for i := first + 1; i <= lastChar; i++ { chars[i] = chars[first] @@ -299,6 +308,15 @@ func loadChars() { resp.Body.Close() } +const progHeader = `// Generated by running +// maketables --tables=%s --data=%s +// DO NOT EDIT + +package unicode + +` + + func printCategories() { if *tablelist == "" { return @@ -312,20 +330,14 @@ func printCategories() { fullCategoryTest(list) return } - fmt.Printf( - "// Generated by running\n"+ - "// maketables --tables=%s --data=%s\n"+ - "// DO NOT EDIT\n\n"+ - "package unicode\n\n", - *tablelist, - *dataURL) + fmt.Printf(progHeader, *tablelist, *dataURL) fmt.Println("// Version is the Unicode edition from which the tables are derived.") fmt.Printf("const Version = %q\n\n", version()) if *tablelist == "all" { fmt.Println("// Categories is the set of Unicode data tables.") - fmt.Println("var Categories = map[string] []Range {") + fmt.Println("var Categories = map[string] *RangeTable {") for k := range category { fmt.Printf("\t%q: %s,\n", k, k) } @@ -344,8 +356,27 @@ func printCategories() { // Cases deserving special comments varDecl := "" switch name { - case "letter": - varDecl = "\tLetter = letter; // Letter is the set of Unicode letters.\n" + case "C": + varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n" + varDecl += "\tC = _C\n" + case "L": + varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n" + varDecl += "\tL = _L\n" + case "M": + varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n" + varDecl += "\tM = _M\n" + case "N": + varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n" + varDecl += "\tN = _N\n" + case "P": + varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n" + varDecl += "\tP = _P\n" + case "S": + varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n" + varDecl += "\tS = _S\n" + case "Z": + varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n" + varDecl += "\tZ = _Z\n" case "Nd": varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n" case "Lu": @@ -355,21 +386,22 @@ func printCategories() { case "Lt": varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n" } - if name != "letter" { + if len(name) > 1 { varDecl += fmt.Sprintf( "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n", name, name, name, name) } decl[ndecl] = varDecl ndecl++ - if name == "letter" { // special case + if len(name) == 1 { // unified categories + decl := fmt.Sprintf("var _%s = &RangeTable{\n", name) dumpRange( - "var letter = []Range {\n", - letterOp) + decl, + func(code int) bool { return categoryOp(code, name[0]) }) continue } dumpRange( - fmt.Sprintf("var _%s = []Range {\n", name), + fmt.Sprintf("var _%s = &RangeTable{\n", name), func(code int) bool { return chars[code].category == name }) } decl.Sort() @@ -382,12 +414,15 @@ func printCategories() { type Op func(code int) bool -const format = "\t{0x%04x, 0x%04x, %d},\n" +const format = "\t\t{0x%04x, 0x%04x, %d},\n" func dumpRange(header string, inCategory Op) { fmt.Print(header) next := 0 + fmt.Print("\tR16: []Range16{\n") // one Range for each iteration + count := &range16Count + size := 16 for { // look for start of range for next < len(chars) && !inCategory(next) { @@ -427,24 +462,49 @@ func dumpRange(header string, inCategory Op) { break } } - fmt.Printf(format, lo, hi, stride) + size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count) // next range: start looking where this range ends next = hi + 1 } + fmt.Print("\t},\n") fmt.Print("}\n\n") } +func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) { + if size == 16 && hi >= 1<<16 { + if lo < 1<<16 { + if lo+stride != hi { + logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride) + } + // No range contains U+FFFF as an instance, so split + // the range into two entries. That way we can maintain + // the invariant that R32 contains only >= 1<<16. + fmt.Printf(format, lo, lo, 1) + lo = hi + stride = 1 + *count++ + } + fmt.Print("\t},\n") + fmt.Print("\tR32: []Range32{\n") + size = 32 + count = &range32Count + } + fmt.Printf(format, lo, hi, stride) + *count++ + return size, count +} + func fullCategoryTest(list []string) { for _, name := range list { if _, ok := category[name]; !ok { logger.Fatal("unknown category", name) } r, ok := unicode.Categories[name] - if !ok { - logger.Fatal("unknown table", name) + if !ok && len(name) > 1 { + logger.Fatalf("unknown table %q", name) } - if name == "letter" { - verifyRange(name, letterOp, r) + if len(name) == 1 { + verifyRange(name, func(code int) bool { return categoryOp(code, name[0]) }, r) } else { verifyRange( name, @@ -454,12 +514,17 @@ func fullCategoryTest(list []string) { } } -func verifyRange(name string, inCategory Op, table []unicode.Range) { +func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { + count := 0 for i := range chars { web := inCategory(i) pkg := unicode.Is(table, i) if web != pkg { - fmt.Fprintf(os.Stderr, "%s: U+%04X: web=%t pkg=%t\n", name, i, web, pkg) + fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg) + count++ + if count > 10 { + break + } } } } @@ -497,22 +562,22 @@ func parseScript(line string, scripts map[string][]Script) { } // The script tables have a lot of adjacent elements. Fold them together. -func foldAdjacent(r []Script) []unicode.Range { - s := make([]unicode.Range, 0, len(r)) +func foldAdjacent(r []Script) []unicode.Range32 { + s := make([]unicode.Range32, 0, len(r)) j := 0 for i := 0; i < len(r); i++ { - if j > 0 && int(r[i].lo) == s[j-1].Hi+1 { - s[j-1].Hi = int(r[i].hi) + if j > 0 && r[i].lo == s[j-1].Hi+1 { + s[j-1].Hi = r[i].hi } else { s = s[0 : j+1] - s[j] = unicode.Range{int(r[i].lo), int(r[i].hi), 1} + s[j] = unicode.Range32{uint32(r[i].lo), uint32(r[i].hi), 1} j++ } } return s } -func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts map[string][]Script) { +func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) { for _, name := range list { if _, ok := scripts[name]; !ok { logger.Fatal("unknown script", name) @@ -524,7 +589,7 @@ func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts for _, script := range scripts[name] { for r := script.lo; r <= script.hi; r++ { if !unicode.Is(installed[name], int(r)) { - fmt.Fprintf(os.Stderr, "U+%04X: not in script %s\n", r, name) + fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name) } } } @@ -549,7 +614,7 @@ func printScriptOrProperty(doProps bool) { return } var err os.Error - resp, _, err := http.Get(*url + file) + resp, err := http.Get(*url + file) if err != nil { logger.Fatal(err) } @@ -589,10 +654,10 @@ func printScriptOrProperty(doProps bool) { if flaglist == "all" { if doProps { fmt.Println("// Properties is the set of Unicode property tables.") - fmt.Println("var Properties = map[string] []Range {") + fmt.Println("var Properties = map[string] *RangeTable{") } else { fmt.Println("// Scripts is the set of Unicode script tables.") - fmt.Println("var Scripts = map[string] []Range {") + fmt.Println("var Scripts = map[string] *RangeTable{") } for k := range table { fmt.Printf("\t%q: %s,\n", k, k) @@ -613,11 +678,15 @@ func printScriptOrProperty(doProps bool) { name, name, name, name) } ndecl++ - fmt.Printf("var _%s = []Range {\n", name) + fmt.Printf("var _%s = &RangeTable {\n", name) + fmt.Print("\tR16: []Range16{\n") ranges := foldAdjacent(table[name]) + size := 16 + count := &range16Count for _, s := range ranges { - fmt.Printf(format, s.Lo, s.Hi, s.Stride) + size, count = printRange(s.Lo, s.Hi, s.Stride, size, count) } + fmt.Print("\t},\n") fmt.Print("}\n\n") } decl.Sort() @@ -808,7 +877,7 @@ func printCaseRange(lo, hi *caseState) { fmt.Printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n", lo.point, hi.point) case hi.point > lo.point && lo.isLowerUpper(): - logger.Fatalf("LowerUpper sequence: should not happen: U+%04X. If it's real, need to fix To()", lo.point) + logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point) fmt.Printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n", lo.point, hi.point) default: @@ -831,17 +900,67 @@ func fullCaseTest() { lower := unicode.ToLower(i) want := caseIt(i, c.lowerCase) if lower != want { - fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower) + fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower) } upper := unicode.ToUpper(i) want = caseIt(i, c.upperCase) if upper != want { - fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper) + fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper) } title := unicode.ToTitle(i) want = caseIt(i, c.titleCase) if title != want { - fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title) + fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title) } } } + +func printLatinProperties() { + if *test { + return + } + fmt.Println("var properties = [MaxLatin1+1]uint8{") + for code := 0; code <= unicode.MaxLatin1; code++ { + var property string + switch chars[code].category { + case "Cc", "": // NUL has no category. + property = "pC" + case "Cf": // soft hyphen, unique category, not printable. + property = "0" + case "Ll": + property = "pLl | pp" + case "Lu": + property = "pLu | pp" + case "Nd", "No": + property = "pN | pp" + case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps": + property = "pP | pp" + case "Sc", "Sk", "Sm", "So": + property = "pS | pp" + case "Zs": + property = "pZ" + default: + logger.Fatalf("%U has unknown category %q", code, chars[code].category) + } + // Special case + if code == ' ' { + property = "pZ | pp" + } + fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code) + } + fmt.Println("}") +} + +var range16Count = 0 // Number of entries in the 16-bit range tables. +var range32Count = 0 // Number of entries in the 32-bit range tables. + +func printSizes() { + if *test { + return + } + fmt.Println() + fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) + range16Bytes := range16Count * 3 * 2 + range32Bytes := range32Count * 3 * 4 + fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) +} |