diff options
Diffstat (limited to 'src/unicode/maketables.go')
-rw-r--r-- | src/unicode/maketables.go | 1376 |
1 files changed, 1376 insertions, 0 deletions
diff --git a/src/unicode/maketables.go b/src/unicode/maketables.go new file mode 100644 index 000000000..d1c9aa04a --- /dev/null +++ b/src/unicode/maketables.go @@ -0,0 +1,1376 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +// Unicode table generator. +// Data read from the web. + +package main + +import ( + "bufio" + "flag" + "fmt" + "io" + "log" + "net/http" + "os" + "os/exec" + "path/filepath" + "regexp" + "sort" + "strconv" + "strings" + "unicode" +) + +func main() { + flag.Parse() + setupOutput() + loadChars() // always needed + loadCasefold() + printCategories() + printScriptOrProperty(false) + printScriptOrProperty(true) + printCases() + printLatinProperties() + printCasefold() + printSizes() + flushOutput() +} + +var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") +var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt") +var url = flag.String("url", + "http://www.unicode.org/Public/7.0.0/ucd/", + "URL of Unicode database directory") +var tablelist = flag.String("tables", + "all", + "comma-separated list of which tables to generate; can be letter") +var scriptlist = flag.String("scripts", + "all", + "comma-separated list of which script tables to generate") +var proplist = flag.String("props", + "all", + "comma-separated list of which property tables to generate") +var cases = flag.Bool("cases", + true, + "generate case tables") +var test = flag.Bool("test", + false, + "test existing tables; can be used to compare web data with package data") +var localFiles = flag.Bool("local", + false, + "data files have been copied to current directory; for debugging only") +var outputFile = flag.String("output", + "", + "output file for generated tables; default stdout") + +var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`) +var logger = log.New(os.Stderr, "", log.Lshortfile) + +var output *bufio.Writer // points to os.Stdout or to "gofmt > outputFile" + +func setupOutput() { + output = bufio.NewWriter(startGofmt()) +} + +// startGofmt connects output to a gofmt process if -output is set. +func startGofmt() io.Writer { + if *outputFile == "" { + return os.Stdout + } + stdout, err := os.Create(*outputFile) + if err != nil { + logger.Fatal(err) + } + // Pipe output to gofmt. + gofmt := exec.Command("gofmt") + fd, err := gofmt.StdinPipe() + if err != nil { + logger.Fatal(err) + } + gofmt.Stdout = stdout + gofmt.Stderr = os.Stderr + err = gofmt.Start() + if err != nil { + logger.Fatal(err) + } + return fd +} + +func flushOutput() { + err := output.Flush() + if err != nil { + logger.Fatal(err) + } +} + +func printf(format string, args ...interface{}) { + fmt.Fprintf(output, format, args...) +} + +func print(args ...interface{}) { + fmt.Fprint(output, args...) +} + +func println(args ...interface{}) { + fmt.Fprintln(output, args...) +} + +type reader struct { + *bufio.Reader + fd *os.File + resp *http.Response +} + +func open(url string) *reader { + file := filepath.Base(url) + if *localFiles { + fd, err := os.Open(file) + if err != nil { + logger.Fatal(err) + } + return &reader{bufio.NewReader(fd), fd, nil} + } + resp, err := http.Get(url) + if err != nil { + logger.Fatal(err) + } + if resp.StatusCode != 200 { + logger.Fatalf("bad GET status for %s: %d", file, resp.Status) + } + return &reader{bufio.NewReader(resp.Body), nil, resp} + +} + +func (r *reader) close() { + if r.fd != nil { + r.fd.Close() + } else { + r.resp.Body.Close() + } +} + +var category = map[string]bool{ + // Nd Lu etc. + // We use one-character names to identify merged categories + "L": true, // Lu Ll Lt Lm Lo + "P": true, // Pc Pd Ps Pe Pu Pf Po + "M": true, // Mn Mc Me + "N": true, // Nd Nl No + "S": true, // Sm Sc Sk So + "Z": true, // Zs Zl Zp + "C": true, // Cc Cf Cs Co Cn +} + +// UnicodeData.txt has form: +// 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;; +// 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A +// See http://www.unicode.org/reports/tr44/ for a full explanation +// The fields: +const ( + FCodePoint = iota + FName + FGeneralCategory + FCanonicalCombiningClass + FBidiClass + FDecompositionTypeAndMapping + FNumericType + FNumericDigit // If a decimal digit. + FNumericValue // Includes non-decimal, e.g. U+2155=1/5 + FBidiMirrored + FUnicode1Name + FISOComment + FSimpleUppercaseMapping + FSimpleLowercaseMapping + FSimpleTitlecaseMapping + NumField + + MaxChar = 0x10FFFF // anything above this shouldn't exist +) + +var fieldName = []string{ + FCodePoint: "CodePoint", + FName: "Name", + FGeneralCategory: "GeneralCategory", + FCanonicalCombiningClass: "CanonicalCombiningClass", + FBidiClass: "BidiClass", + FDecompositionTypeAndMapping: "DecompositionTypeAndMapping", + FNumericType: "NumericType", + FNumericDigit: "NumericDigit", + FNumericValue: "NumericValue", + FBidiMirrored: "BidiMirrored", + FUnicode1Name: "Unicode1Name", + FISOComment: "ISOComment", + FSimpleUppercaseMapping: "SimpleUppercaseMapping", + FSimpleLowercaseMapping: "SimpleLowercaseMapping", + FSimpleTitlecaseMapping: "SimpleTitlecaseMapping", +} + +// This contains only the properties we're interested in. +type Char struct { + field []string // debugging only; could be deleted if we take out char.dump() + codePoint rune // if zero, this index is not a valid code point. + category string + upperCase rune + lowerCase rune + titleCase rune + foldCase rune // simple case folding + caseOrbit rune // next in simple case folding orbit +} + +// Scripts.txt has form: +// A673 ; Cyrillic # Po SLAVONIC ASTERISK +// A67C..A67D ; Cyrillic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK +// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation + +type Script struct { + lo, hi uint32 // range of code points + script string +} + +var chars = make([]Char, MaxChar+1) +var scripts = make(map[string][]Script) +var props = make(map[string][]Script) // a property looks like a script; can share the format + +var lastChar rune = 0 + +// In UnicodeData.txt, some ranges are marked like this: +// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; +// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; +// parseCategory returns a state variable indicating the weirdness. +type State int + +const ( + SNormal State = iota // known to be zero for the type + SFirst + SLast + SMissing +) + +func parseCategory(line string) (state State) { + field := strings.Split(line, ";") + if len(field) != NumField { + logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField) + } + point, err := strconv.ParseUint(field[FCodePoint], 16, 64) + if err != nil { + logger.Fatalf("%.5s...: %s", line, err) + } + lastChar = rune(point) + if point == 0 { + return // not interesting and we use 0 as unset + } + if point > MaxChar { + return + } + char := &chars[point] + char.field = field + if char.codePoint != 0 { + logger.Fatalf("point %U reused", point) + } + char.codePoint = lastChar + char.category = field[FGeneralCategory] + category[char.category] = true + switch char.category { + case "Nd": + // Decimal digit + _, err := strconv.Atoi(field[FNumericValue]) + if err != nil { + logger.Fatalf("%U: bad numeric field: %s", point, err) + } + case "Lu": + char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) + case "Ll": + char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping]) + case "Lt": + char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint]) + default: + char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping]) + } + switch { + case strings.Index(field[FName], ", First>") > 0: + state = SFirst + case strings.Index(field[FName], ", Last>") > 0: + state = SLast + } + return +} + +func (char *Char) dump(s string) { + print(s, " ") + for i := 0; i < len(char.field); i++ { + printf("%s:%q ", fieldName[i], char.field[i]) + } + print("\n") +} + +func (char *Char) letter(u, l, t string) { + char.upperCase = char.letterValue(u, "U") + char.lowerCase = char.letterValue(l, "L") + char.titleCase = char.letterValue(t, "T") +} + +func (char *Char) letterValue(s string, cas string) rune { + if s == "" { + return 0 + } + v, err := strconv.ParseUint(s, 16, 64) + if err != nil { + char.dump(cas) + logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err) + } + return rune(v) +} + +func allCategories() []string { + a := make([]string, 0, len(category)) + for k := range category { + a = append(a, k) + } + sort.Strings(a) + return a +} + +func all(scripts map[string][]Script) []string { + a := make([]string, 0, len(scripts)) + for k := range scripts { + a = append(a, k) + } + sort.Strings(a) + return a +} + +func allCatFold(m map[string]map[rune]bool) []string { + a := make([]string, 0, len(m)) + for k := range m { + a = append(a, k) + } + sort.Strings(a) + return a +} + +// Extract the version number from the URL +func version() string { + // Break on slashes and look for the first numeric field + fields := strings.Split(*url, "/") + for _, f := range fields { + if len(f) > 0 && '0' <= f[0] && f[0] <= '9' { + return f + } + } + logger.Fatal("unknown version") + return "Unknown" +} + +func categoryOp(code rune, class uint8) bool { + category := chars[code].category + return len(category) > 0 && category[0] == class +} + +func loadChars() { + if *dataURL == "" { + flag.Set("data", *url+"UnicodeData.txt") + } + input := open(*dataURL) + defer input.close() + scanner := bufio.NewScanner(input) + var first rune = 0 + for scanner.Scan() { + switch parseCategory(scanner.Text()) { + case SNormal: + if first != 0 { + logger.Fatalf("bad state normal at %U", lastChar) + } + case SFirst: + if first != 0 { + logger.Fatalf("bad state first at %U", lastChar) + } + first = lastChar + case SLast: + if first == 0 { + logger.Fatalf("bad state last at %U", lastChar) + } + for i := first + 1; i <= lastChar; i++ { + chars[i] = chars[first] + chars[i].codePoint = i + } + first = 0 + } + } + if scanner.Err() != nil { + logger.Fatal(scanner.Err()) + } +} + +func loadCasefold() { + if *casefoldingURL == "" { + flag.Set("casefolding", *url+"CaseFolding.txt") + } + input := open(*casefoldingURL) + defer input.close() + scanner := bufio.NewScanner(input) + for scanner.Scan() { + line := scanner.Text() + if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 { + continue + } + field := strings.Split(line, "; ") + if len(field) != 4 { + logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4) + } + kind := field[1] + if kind != "C" && kind != "S" { + // Only care about 'common' and 'simple' foldings. + continue + } + p1, err := strconv.ParseUint(field[0], 16, 64) + if err != nil { + logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) + } + p2, err := strconv.ParseUint(field[2], 16, 64) + if err != nil { + logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) + } + chars[p1].foldCase = rune(p2) + } + if scanner.Err() != nil { + logger.Fatal(scanner.Err()) + } +} + +const progHeader = `// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Generated by running +// maketables --tables=%s --data=%s --casefolding=%s +// DO NOT EDIT + +package unicode + +` + +func printCategories() { + if *tablelist == "" { + return + } + // Find out which categories to dump + list := strings.Split(*tablelist, ",") + if *tablelist == "all" { + list = allCategories() + } + if *test { + fullCategoryTest(list) + return + } + printf(progHeader, *tablelist, *dataURL, *casefoldingURL) + + println("// Version is the Unicode edition from which the tables are derived.") + printf("const Version = %q\n\n", version()) + + if *tablelist == "all" { + println("// Categories is the set of Unicode category tables.") + println("var Categories = map[string] *RangeTable {") + for _, k := range allCategories() { + printf("\t%q: %s,\n", k, k) + } + print("}\n\n") + } + + decl := make(sort.StringSlice, len(list)) + ndecl := 0 + for _, name := range list { + if _, ok := category[name]; !ok { + logger.Fatal("unknown category", name) + } + // We generate an UpperCase name to serve as concise documentation and an _UnderScored + // name to store the data. This stops godoc dumping all the tables but keeps them + // available to clients. + // Cases deserving special comments + varDecl := "" + switch name { + case "C": + varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n" + varDecl += "\tC = _C\n" + case "L": + varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n" + varDecl += "\tL = _L\n" + case "M": + varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n" + varDecl += "\tM = _M\n" + case "N": + varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n" + varDecl += "\tN = _N\n" + case "P": + varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n" + varDecl += "\tP = _P\n" + case "S": + varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n" + varDecl += "\tS = _S\n" + case "Z": + varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n" + varDecl += "\tZ = _Z\n" + case "Nd": + varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n" + case "Lu": + varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n" + case "Ll": + varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n" + case "Lt": + varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n" + } + if len(name) > 1 { + varDecl += fmt.Sprintf( + "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n", + name, name, name, name) + } + decl[ndecl] = varDecl + ndecl++ + if len(name) == 1 { // unified categories + decl := fmt.Sprintf("var _%s = &RangeTable{\n", name) + dumpRange( + decl, + func(code rune) bool { return categoryOp(code, name[0]) }) + continue + } + dumpRange( + fmt.Sprintf("var _%s = &RangeTable{\n", name), + func(code rune) bool { return chars[code].category == name }) + } + decl.Sort() + println("// These variables have type *RangeTable.") + println("var (") + for _, d := range decl { + print(d) + } + print(")\n\n") +} + +type Op func(code rune) bool + +const format = "\t\t{0x%04x, 0x%04x, %d},\n" + +func dumpRange(header string, inCategory Op) { + print(header) + next := rune(0) + latinOffset := 0 + print("\tR16: []Range16{\n") + // one Range for each iteration + count := &range16Count + size := 16 + for { + // look for start of range + for next < rune(len(chars)) && !inCategory(next) { + next++ + } + if next >= rune(len(chars)) { + // no characters remain + break + } + + // start of range + lo := next + hi := next + stride := rune(1) + // accept lo + next++ + // look for another character to set the stride + for next < rune(len(chars)) && !inCategory(next) { + next++ + } + if next >= rune(len(chars)) { + // no more characters + printf(format, lo, hi, stride) + break + } + // set stride + stride = next - lo + // check for length of run. next points to first jump in stride + for i := next; i < rune(len(chars)); i++ { + if inCategory(i) == (((i - lo) % stride) == 0) { + // accept + if inCategory(i) { + hi = i + } + } else { + // no more characters in this run + break + } + } + if uint32(hi) <= unicode.MaxLatin1 { + latinOffset++ + } + size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count) + // next range: start looking where this range ends + next = hi + 1 + } + print("\t},\n") + if latinOffset > 0 { + printf("\tLatinOffset: %d,\n", latinOffset) + } + print("}\n\n") +} + +func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) { + if size == 16 && hi >= 1<<16 { + if lo < 1<<16 { + if lo+stride != hi { + logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride) + } + // No range contains U+FFFF as an instance, so split + // the range into two entries. That way we can maintain + // the invariant that R32 contains only >= 1<<16. + printf(format, lo, lo, 1) + lo = hi + stride = 1 + *count++ + } + print("\t},\n") + print("\tR32: []Range32{\n") + size = 32 + count = &range32Count + } + printf(format, lo, hi, stride) + *count++ + return size, count +} + +func fullCategoryTest(list []string) { + for _, name := range list { + if _, ok := category[name]; !ok { + logger.Fatal("unknown category", name) + } + r, ok := unicode.Categories[name] + if !ok && len(name) > 1 { + logger.Fatalf("unknown table %q", name) + } + if len(name) == 1 { + verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r) + } else { + verifyRange( + name, + func(code rune) bool { return chars[code].category == name }, + r) + } + } +} + +func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { + count := 0 + for j := range chars { + i := rune(j) + web := inCategory(i) + pkg := unicode.Is(table, i) + if web != pkg { + fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg) + count++ + if count > 10 { + break + } + } + } +} + +func parseScript(line string, scripts map[string][]Script) { + comment := strings.Index(line, "#") + if comment >= 0 { + line = line[0:comment] + } + line = strings.TrimSpace(line) + if len(line) == 0 { + return + } + field := strings.Split(line, ";") + if len(field) != 2 { + logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field)) + } + matches := scriptRe.FindStringSubmatch(line) + if len(matches) != 4 { + logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches)) + } + lo, err := strconv.ParseUint(matches[1], 16, 64) + if err != nil { + logger.Fatalf("%.5s...: %s", line, err) + } + hi := lo + if len(matches[2]) > 2 { // ignore leading .. + hi, err = strconv.ParseUint(matches[2][2:], 16, 64) + if err != nil { + logger.Fatalf("%.5s...: %s", line, err) + } + } + name := matches[3] + scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name}) +} + +// The script tables have a lot of adjacent elements. Fold them together. +func foldAdjacent(r []Script) []unicode.Range32 { + s := make([]unicode.Range32, 0, len(r)) + j := 0 + for i := 0; i < len(r); i++ { + if j > 0 && r[i].lo == s[j-1].Hi+1 { + s[j-1].Hi = r[i].hi + } else { + s = s[0 : j+1] + s[j] = unicode.Range32{ + Lo: uint32(r[i].lo), + Hi: uint32(r[i].hi), + Stride: 1, + } + j++ + } + } + return s +} + +func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) { + for _, name := range list { + if _, ok := scripts[name]; !ok { + logger.Fatal("unknown script", name) + } + _, ok := installed[name] + if !ok { + logger.Fatal("unknown table", name) + } + for _, script := range scripts[name] { + for r := script.lo; r <= script.hi; r++ { + if !unicode.Is(installed[name], rune(r)) { + fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name) + } + } + } + } +} + +// PropList.txt has the same format as Scripts.txt so we can share its parser. +func printScriptOrProperty(doProps bool) { + flag := "scripts" + flaglist := *scriptlist + file := "Scripts.txt" + table := scripts + installed := unicode.Scripts + if doProps { + flag = "props" + flaglist = *proplist + file = "PropList.txt" + table = props + installed = unicode.Properties + } + if flaglist == "" { + return + } + input := open(*url + file) + scanner := bufio.NewScanner(input) + for scanner.Scan() { + parseScript(scanner.Text(), table) + } + if scanner.Err() != nil { + logger.Fatal(scanner.Err()) + } + input.close() + + // Find out which scripts to dump + list := strings.Split(flaglist, ",") + if flaglist == "all" { + list = all(table) + } + if *test { + fullScriptTest(list, installed, table) + return + } + + printf( + "// Generated by running\n"+ + "// maketables --%s=%s --url=%s\n"+ + "// DO NOT EDIT\n\n", + flag, + flaglist, + *url) + if flaglist == "all" { + if doProps { + println("// Properties is the set of Unicode property tables.") + println("var Properties = map[string] *RangeTable{") + } else { + println("// Scripts is the set of Unicode script tables.") + println("var Scripts = map[string] *RangeTable{") + } + for _, k := range all(table) { + printf("\t%q: %s,\n", k, k) + } + print("}\n\n") + } + + decl := make(sort.StringSlice, len(list)) + ndecl := 0 + for _, name := range list { + if doProps { + decl[ndecl] = fmt.Sprintf( + "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n", + name, name, name, name) + } else { + decl[ndecl] = fmt.Sprintf( + "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n", + name, name, name, name) + } + ndecl++ + printf("var _%s = &RangeTable {\n", name) + ranges := foldAdjacent(table[name]) + print("\tR16: []Range16{\n") + size := 16 + count := &range16Count + for _, s := range ranges { + size, count = printRange(s.Lo, s.Hi, s.Stride, size, count) + } + print("\t},\n") + if off := findLatinOffset(ranges); off > 0 { + printf("\tLatinOffset: %d,\n", off) + } + print("}\n\n") + } + decl.Sort() + println("// These variables have type *RangeTable.") + println("var (") + for _, d := range decl { + print(d) + } + print(")\n\n") +} + +func findLatinOffset(ranges []unicode.Range32) int { + i := 0 + for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 { + i++ + } + return i +} + +const ( + CaseUpper = 1 << iota + CaseLower + CaseTitle + CaseNone = 0 // must be zero + CaseMissing = -1 // character not present; not a valid case state +) + +type caseState struct { + point rune + _case int + deltaToUpper rune + deltaToLower rune + deltaToTitle rune +} + +// Is d a continuation of the state of c? +func (c *caseState) adjacent(d *caseState) bool { + if d.point < c.point { + c, d = d, c + } + switch { + case d.point != c.point+1: // code points not adjacent (shouldn't happen) + return false + case d._case != c._case: // different cases + return c.upperLowerAdjacent(d) + case c._case == CaseNone: + return false + case c._case == CaseMissing: + return false + case d.deltaToUpper != c.deltaToUpper: + return false + case d.deltaToLower != c.deltaToLower: + return false + case d.deltaToTitle != c.deltaToTitle: + return false + } + return true +} + +// Is d the same as c, but opposite in upper/lower case? this would make it +// an element of an UpperLower sequence. +func (c *caseState) upperLowerAdjacent(d *caseState) bool { + // check they're a matched case pair. we know they have adjacent values + switch { + case c._case == CaseUpper && d._case != CaseLower: + return false + case c._case == CaseLower && d._case != CaseUpper: + return false + } + // matched pair (at least in upper/lower). make the order Upper Lower + if c._case == CaseLower { + c, d = d, c + } + // for an Upper Lower sequence the deltas have to be in order + // c: 0 1 0 + // d: -1 0 -1 + switch { + case c.deltaToUpper != 0: + return false + case c.deltaToLower != 1: + return false + case c.deltaToTitle != 0: + return false + case d.deltaToUpper != -1: + return false + case d.deltaToLower != 0: + return false + case d.deltaToTitle != -1: + return false + } + return true +} + +// Does this character start an UpperLower sequence? +func (c *caseState) isUpperLower() bool { + // for an Upper Lower sequence the deltas have to be in order + // c: 0 1 0 + switch { + case c.deltaToUpper != 0: + return false + case c.deltaToLower != 1: + return false + case c.deltaToTitle != 0: + return false + } + return true +} + +// Does this character start a LowerUpper sequence? +func (c *caseState) isLowerUpper() bool { + // for an Upper Lower sequence the deltas have to be in order + // c: -1 0 -1 + switch { + case c.deltaToUpper != -1: + return false + case c.deltaToLower != 0: + return false + case c.deltaToTitle != -1: + return false + } + return true +} + +func getCaseState(i rune) (c *caseState) { + c = &caseState{point: i, _case: CaseNone} + ch := &chars[i] + switch ch.codePoint { + case 0: + c._case = CaseMissing // Will get NUL wrong but that doesn't matter + return + case ch.upperCase: + c._case = CaseUpper + case ch.lowerCase: + c._case = CaseLower + case ch.titleCase: + c._case = CaseTitle + } + // Some things such as roman numeral U+2161 don't describe themselves + // as upper case, but have a lower case. Second-guess them. + if c._case == CaseNone && ch.lowerCase != 0 { + c._case = CaseUpper + } + // Same in the other direction. + if c._case == CaseNone && ch.upperCase != 0 { + c._case = CaseLower + } + + if ch.upperCase != 0 { + c.deltaToUpper = ch.upperCase - i + } + if ch.lowerCase != 0 { + c.deltaToLower = ch.lowerCase - i + } + if ch.titleCase != 0 { + c.deltaToTitle = ch.titleCase - i + } + return +} + +func printCases() { + if !*cases { + return + } + if *test { + fullCaseTest() + return + } + printf( + "// Generated by running\n"+ + "// maketables --data=%s --casefolding=%s\n"+ + "// DO NOT EDIT\n\n"+ + "// CaseRanges is the table describing case mappings for all letters with\n"+ + "// non-self mappings.\n"+ + "var CaseRanges = _CaseRanges\n"+ + "var _CaseRanges = []CaseRange {\n", + *dataURL, *casefoldingURL) + + var startState *caseState // the start of a run; nil for not active + var prevState = &caseState{} // the state of the previous character + for i := range chars { + state := getCaseState(rune(i)) + if state.adjacent(prevState) { + prevState = state + continue + } + // end of run (possibly) + printCaseRange(startState, prevState) + startState = nil + if state._case != CaseMissing && state._case != CaseNone { + startState = state + } + prevState = state + } + print("}\n") +} + +func printCaseRange(lo, hi *caseState) { + if lo == nil { + return + } + if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 { + // character represents itself in all cases - no need to mention it + return + } + switch { + case hi.point > lo.point && lo.isUpperLower(): + printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n", + lo.point, hi.point) + case hi.point > lo.point && lo.isLowerUpper(): + logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point) + printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n", + lo.point, hi.point) + default: + printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n", + lo.point, hi.point, + lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle) + } +} + +// If the cased value in the Char is 0, it means use the rune itself. +func caseIt(r, cased rune) rune { + if cased == 0 { + return r + } + return cased +} + +func fullCaseTest() { + for j, c := range chars { + i := rune(j) + lower := unicode.ToLower(i) + want := caseIt(i, c.lowerCase) + if lower != want { + fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower) + } + upper := unicode.ToUpper(i) + want = caseIt(i, c.upperCase) + if upper != want { + fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper) + } + title := unicode.ToTitle(i) + want = caseIt(i, c.titleCase) + if title != want { + fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title) + } + } +} + +func printLatinProperties() { + if *test { + return + } + println("var properties = [MaxLatin1+1]uint8{") + for code := 0; code <= unicode.MaxLatin1; code++ { + var property string + switch chars[code].category { + case "Cc", "": // NUL has no category. + property = "pC" + case "Cf": // soft hyphen, unique category, not printable. + property = "0" + case "Ll": + property = "pLl | pp" + case "Lo": + property = "pLo | pp" + case "Lu": + property = "pLu | pp" + case "Nd", "No": + property = "pN | pp" + case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps": + property = "pP | pp" + case "Sc", "Sk", "Sm", "So": + property = "pS | pp" + case "Zs": + property = "pZ" + default: + logger.Fatalf("%U has unknown category %q", code, chars[code].category) + } + // Special case + if code == ' ' { + property = "pZ | pp" + } + printf("\t0x%02X: %s, // %q\n", code, property, code) + } + printf("}\n\n") +} + +type runeSlice []rune + +func (p runeSlice) Len() int { return len(p) } +func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } +func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +func printCasefold() { + // Build list of case-folding groups attached to each canonical folded char (typically lower case). + var caseOrbit = make([][]rune, MaxChar+1) + for j := range chars { + i := rune(j) + c := &chars[i] + if c.foldCase == 0 { + continue + } + orb := caseOrbit[c.foldCase] + if orb == nil { + orb = append(orb, c.foldCase) + } + caseOrbit[c.foldCase] = append(orb, i) + } + + // Insert explicit 1-element groups when assuming [lower, upper] would be wrong. + for j := range chars { + i := rune(j) + c := &chars[i] + f := c.foldCase + if f == 0 { + f = i + } + orb := caseOrbit[f] + if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) { + // Default assumption of [upper, lower] is wrong. + caseOrbit[i] = []rune{i} + } + } + + // Delete the groups for which assuming [lower, upper] is right. + for i, orb := range caseOrbit { + if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] { + caseOrbit[i] = nil + } + } + + // Record orbit information in chars. + for _, orb := range caseOrbit { + if orb == nil { + continue + } + sort.Sort(runeSlice(orb)) + c := orb[len(orb)-1] + for _, d := range orb { + chars[c].caseOrbit = d + c = d + } + } + + printCaseOrbit() + + // Tables of category and script folding exceptions: code points + // that must be added when interpreting a particular category/script + // in a case-folding context. + cat := make(map[string]map[rune]bool) + for name := range category { + if x := foldExceptions(inCategory(name)); len(x) > 0 { + cat[name] = x + } + } + + scr := make(map[string]map[rune]bool) + for name := range scripts { + if x := foldExceptions(inScript(name)); len(x) > 0 { + cat[name] = x + } + } + + printCatFold("FoldCategory", cat) + printCatFold("FoldScript", scr) +} + +// inCategory returns a list of all the runes in the category. +func inCategory(name string) []rune { + var x []rune + for j := range chars { + i := rune(j) + c := &chars[i] + if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] { + x = append(x, i) + } + } + return x +} + +// inScript returns a list of all the runes in the script. +func inScript(name string) []rune { + var x []rune + for _, s := range scripts[name] { + for c := s.lo; c <= s.hi; c++ { + x = append(x, rune(c)) + } + } + return x +} + +// foldExceptions returns a list of all the runes fold-equivalent +// to runes in class but not in class themselves. +func foldExceptions(class []rune) map[rune]bool { + // Create map containing class and all fold-equivalent chars. + m := make(map[rune]bool) + for _, r := range class { + c := &chars[r] + if c.caseOrbit == 0 { + // Just upper and lower. + if u := c.upperCase; u != 0 { + m[u] = true + } + if l := c.lowerCase; l != 0 { + m[l] = true + } + m[r] = true + continue + } + // Otherwise walk orbit. + r0 := r + for { + m[r] = true + r = chars[r].caseOrbit + if r == r0 { + break + } + } + } + + // Remove class itself. + for _, r := range class { + delete(m, r) + } + + // What's left is the exceptions. + return m +} + +var comment = map[string]string{ + "FoldCategory": "// FoldCategory maps a category name to a table of\n" + + "// code points outside the category that are equivalent under\n" + + "// simple case folding to code points inside the category.\n" + + "// If there is no entry for a category name, there are no such points.\n", + + "FoldScript": "// FoldScript maps a script name to a table of\n" + + "// code points outside the script that are equivalent under\n" + + "// simple case folding to code points inside the script.\n" + + "// If there is no entry for a script name, there are no such points.\n", +} + +func printCaseOrbit() { + if *test { + for j := range chars { + i := rune(j) + c := &chars[i] + f := c.caseOrbit + if f == 0 { + if c.lowerCase != i && c.lowerCase != 0 { + f = c.lowerCase + } else if c.upperCase != i && c.upperCase != 0 { + f = c.upperCase + } else { + f = i + } + } + if g := unicode.SimpleFold(i); g != f { + fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f) + } + } + return + } + + printf("var caseOrbit = []foldPair{\n") + for i := range chars { + c := &chars[i] + if c.caseOrbit != 0 { + printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) + foldPairCount++ + } + } + printf("}\n\n") +} + +func printCatFold(name string, m map[string]map[rune]bool) { + if *test { + var pkgMap map[string]*unicode.RangeTable + if name == "FoldCategory" { + pkgMap = unicode.FoldCategory + } else { + pkgMap = unicode.FoldScript + } + if len(pkgMap) != len(m) { + fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m)) + return + } + for k, v := range m { + t, ok := pkgMap[k] + if !ok { + fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k) + continue + } + n := 0 + for _, r := range t.R16 { + for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { + if !v[c] { + fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) + } + n++ + } + } + for _, r := range t.R32 { + for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { + if !v[c] { + fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) + } + n++ + } + } + if n != len(v) { + fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v)) + } + } + return + } + + print(comment[name]) + printf("var %s = map[string]*RangeTable{\n", name) + for _, name := range allCatFold(m) { + printf("\t%q: fold%s,\n", name, name) + } + printf("}\n\n") + for _, name := range allCatFold(m) { + class := m[name] + dumpRange( + fmt.Sprintf("var fold%s = &RangeTable{\n", name), + func(code rune) bool { return class[code] }) + } +} + +var range16Count = 0 // Number of entries in the 16-bit range tables. +var range32Count = 0 // Number of entries in the 32-bit range tables. +var foldPairCount = 0 // Number of fold pairs in the exception tables. + +func printSizes() { + if *test { + return + } + println() + printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) + range16Bytes := range16Count * 3 * 2 + range32Bytes := range32Count * 3 * 4 + printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) + println() + printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2) +} |