diff options
author | Ondřej Surý <ondrej@sury.org> | 2011-09-13 13:13:40 +0200 |
---|---|---|
committer | Ondřej Surý <ondrej@sury.org> | 2011-09-13 13:13:40 +0200 |
commit | 5ff4c17907d5b19510a62e08fd8d3b11e62b431d (patch) | |
tree | c0650497e988f47be9c6f2324fa692a52dea82e1 /src/pkg/exp/norm/maketables.go | |
parent | 80f18fc933cf3f3e829c5455a1023d69f7b86e52 (diff) | |
download | golang-upstream/60.tar.gz |
Imported Upstream version 60upstream/60
Diffstat (limited to 'src/pkg/exp/norm/maketables.go')
-rw-r--r-- | src/pkg/exp/norm/maketables.go | 855 |
1 files changed, 855 insertions, 0 deletions
diff --git a/src/pkg/exp/norm/maketables.go b/src/pkg/exp/norm/maketables.go new file mode 100644 index 000000000..e3e5700a6 --- /dev/null +++ b/src/pkg/exp/norm/maketables.go @@ -0,0 +1,855 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Normalization table generator. +// Data read from the web. + +package main + +import ( + "bufio" + "bytes" + "flag" + "fmt" + "http" + "io" + "log" + "os" + "regexp" + "strconv" + "strings" +) + +func main() { + flag.Parse() + loadUnicodeData() + loadCompositionExclusions() + completeCharFields(FCanonical) + completeCharFields(FCompatibility) + verifyComputed() + printChars() + makeTables() + testDerived() +} + +var url = flag.String("url", + "http://www.unicode.org/Public/6.0.0/ucd/", + "URL of Unicode database directory") +var tablelist = flag.String("tables", + "all", + "comma-separated list of which tables to generate; "+ + "can be 'decomp', 'recomp', 'info' and 'all'") +var test = flag.Bool("test", + false, + "test existing tables; can be used to compare web data with package data") +var verbose = flag.Bool("verbose", + false, + "write data to stdout as it is parsed") +var localFiles = flag.Bool("local", + false, + "data files have been copied to the current directory; for debugging only") + +var logger = log.New(os.Stderr, "", log.Lshortfile) + +// UnicodeData.txt has form: +// 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;; +// 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A +// See http://unicode.org/reports/tr44/ for full explanation +// The fields: +const ( + FCodePoint = iota + FName + FGeneralCategory + FCanonicalCombiningClass + FBidiClass + FDecompMapping + FDecimalValue + FDigitValue + FNumericValue + FBidiMirrored + FUnicode1Name + FISOComment + FSimpleUppercaseMapping + FSimpleLowercaseMapping + FSimpleTitlecaseMapping + NumField + + MaxChar = 0x10FFFF // anything above this shouldn't exist +) + +// Quick Check properties of runes allow us to quickly +// determine whether a rune may occur in a normal form. +// For a given normal form, a rune may be guaranteed to occur +// verbatim (QC=Yes), may or may not combine with another +// rune (QC=Maybe), or may not occur (QC=No). +type QCResult int + +const ( + QCUnknown QCResult = iota + QCYes + QCNo + QCMaybe +) + +func (r QCResult) String() string { + switch r { + case QCYes: + return "Yes" + case QCNo: + return "No" + case QCMaybe: + return "Maybe" + } + return "***UNKNOWN***" +} + +const ( + FCanonical = iota // NFC or NFD + FCompatibility // NFKC or NFKD + FNumberOfFormTypes +) + +const ( + MComposed = iota // NFC or NFKC + MDecomposed // NFD or NFKD + MNumberOfModes +) + +// This contains only the properties we're interested in. +type Char struct { + name string + codePoint int // if zero, this index is not a valid code point. + ccc uint8 // canonical combining class + excludeInComp bool // from CompositionExclusions.txt + compatDecomp bool // it has a compatibility expansion + + forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility + + state State +} + +var chars = make([]Char, MaxChar+1) + +func (c Char) String() string { + buf := new(bytes.Buffer) + + fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name) + fmt.Fprintf(buf, " ccc: %v\n", c.ccc) + fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp) + fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp) + fmt.Fprintf(buf, " state: %v\n", c.state) + fmt.Fprintf(buf, " NFC:\n") + fmt.Fprint(buf, c.forms[FCanonical]) + fmt.Fprintf(buf, " NFKC:\n") + fmt.Fprint(buf, c.forms[FCompatibility]) + + return buf.String() +} + +// In UnicodeData.txt, some ranges are marked like this: +// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; +// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; +// parseCharacter keeps a state variable indicating the weirdness. +type State int + +const ( + SNormal State = iota // known to be zero for the type + SFirst + SLast + SMissing +) + +var lastChar int = 0 + +func (c Char) isValid() bool { + return c.codePoint != 0 && c.state != SMissing +} + +type FormInfo struct { + quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed + verified [MNumberOfModes]bool // index: MComposed or MDecomposed + + combinesForward bool // May combine with rune on the right + combinesBackward bool // May combine with rune on the left + isOneWay bool // Never appears in result + inDecomp bool // Some decompositions result in this char. + decomp Decomposition + expandedDecomp Decomposition +} + +func (f FormInfo) String() string { + buf := bytes.NewBuffer(make([]byte, 0)) + + fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed]) + fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed]) + fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward) + fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward) + fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay) + fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp) + fmt.Fprintf(buf, " decomposition: %v\n", f.decomp) + fmt.Fprintf(buf, " expandedDecomp: %v\n", f.expandedDecomp) + + return buf.String() +} + +type Decomposition []int + +func (d Decomposition) String() string { + return fmt.Sprintf("%.4X", d) +} + +func openReader(file string) (input io.ReadCloser) { + if *localFiles { + f, err := os.Open(file) + if err != nil { + logger.Fatal(err) + } + input = f + } else { + path := *url + file + resp, err := http.Get(path) + if err != nil { + logger.Fatal(err) + } + if resp.StatusCode != 200 { + logger.Fatal("bad GET status for "+file, resp.Status) + } + input = resp.Body + } + return +} + +func parseDecomposition(s string, skipfirst bool) (a []int, e os.Error) { + decomp := strings.Split(s, " ") + if len(decomp) > 0 && skipfirst { + decomp = decomp[1:] + } + for _, d := range decomp { + point, err := strconv.Btoui64(d, 16) + if err != nil { + return a, err + } + a = append(a, int(point)) + } + return a, nil +} + +func parseCharacter(line string) { + field := strings.Split(line, ";") + if len(field) != NumField { + logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField) + } + x, err := strconv.Btoui64(field[FCodePoint], 16) + point := int(x) + if err != nil { + logger.Fatalf("%.5s...: %s", line, err) + } + if point == 0 { + return // not interesting and we use 0 as unset + } + if point > MaxChar { + logger.Fatalf("%5s: Rune %X > MaxChar (%X)", line, point, MaxChar) + return + } + state := SNormal + switch { + case strings.Index(field[FName], ", First>") > 0: + state = SFirst + case strings.Index(field[FName], ", Last>") > 0: + state = SLast + } + firstChar := lastChar + 1 + lastChar = int(point) + if state != SLast { + firstChar = lastChar + } + x, err = strconv.Atoui64(field[FCanonicalCombiningClass]) + if err != nil { + logger.Fatalf("%U: bad ccc field: %s", int(x), err) + } + ccc := uint8(x) + decmap := field[FDecompMapping] + exp, e := parseDecomposition(decmap, false) + isCompat := false + if e != nil { + if len(decmap) > 0 { + exp, e = parseDecomposition(decmap, true) + if e != nil { + logger.Fatalf(`%U: bad decomp |%v|: "%s"`, int(x), decmap, e) + } + isCompat = true + } + } + for i := firstChar; i <= lastChar; i++ { + char := &chars[i] + char.name = field[FName] + char.codePoint = i + char.forms[FCompatibility].decomp = exp + if !isCompat { + char.forms[FCanonical].decomp = exp + } else { + char.compatDecomp = true + } + if len(decmap) > 0 { + char.forms[FCompatibility].decomp = exp + } + char.ccc = ccc + char.state = SMissing + if i == lastChar { + char.state = state + } + } + return +} + +func loadUnicodeData() { + f := openReader("UnicodeData.txt") + defer f.Close() + input := bufio.NewReader(f) + for { + line, err := input.ReadString('\n') + if err != nil { + if err == os.EOF { + break + } + logger.Fatal(err) + } + parseCharacter(line[0 : len(line)-1]) + } +} + +var singlePointRe = regexp.MustCompile(`^([0-9A-F]+) *$`) + +// CompositionExclusions.txt has form: +// 0958 # ... +// See http://unicode.org/reports/tr44/ for full explanation +func parseExclusion(line string) int { + comment := strings.Index(line, "#") + if comment >= 0 { + line = line[0:comment] + } + if len(line) == 0 { + return 0 + } + matches := singlePointRe.FindStringSubmatch(line) + if len(matches) != 2 { + logger.Fatalf("%s: %d matches (expected 1)\n", line, len(matches)) + } + point, err := strconv.Btoui64(matches[1], 16) + if err != nil { + logger.Fatalf("%.5s...: %s", line, err) + } + return int(point) +} + +func loadCompositionExclusions() { + f := openReader("CompositionExclusions.txt") + defer f.Close() + input := bufio.NewReader(f) + for { + line, err := input.ReadString('\n') + if err != nil { + if err == os.EOF { + break + } + logger.Fatal(err) + } + point := parseExclusion(line[0 : len(line)-1]) + if point == 0 { + continue + } + c := &chars[point] + if c.excludeInComp { + logger.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint) + } + c.excludeInComp = true + } +} + +// hasCompatDecomp returns true if any of the recursive +// decompositions contains a compatibility expansion. +// In this case, the character may not occur in NFK*. +func hasCompatDecomp(rune int) bool { + c := &chars[rune] + if c.compatDecomp { + return true + } + for _, d := range c.forms[FCompatibility].decomp { + if hasCompatDecomp(d) { + return true + } + } + return false +} + +// Hangul related constants. +const ( + HangulBase = 0xAC00 + HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28) + + JamoLBase = 0x1100 + JamoLEnd = 0x1113 + JamoVBase = 0x1161 + JamoVEnd = 0x1176 + JamoTBase = 0x11A8 + JamoTEnd = 0x11C3 +) + +func isHangul(rune int) bool { + return HangulBase <= rune && rune < HangulEnd +} + +func ccc(rune int) uint8 { + return chars[rune].ccc +} + +// Insert a rune in a buffer, ordered by Canonical Combining Class. +func insertOrdered(b Decomposition, rune int) Decomposition { + n := len(b) + b = append(b, 0) + cc := ccc(rune) + if cc > 0 { + // Use bubble sort. + for ; n > 0; n-- { + if ccc(b[n-1]) <= cc { + break + } + b[n] = b[n-1] + } + } + b[n] = rune + return b +} + +// Recursively decompose. +func decomposeRecursive(form int, rune int, d Decomposition) Decomposition { + if isHangul(rune) { + return d + } + dcomp := chars[rune].forms[form].decomp + if len(dcomp) == 0 { + return insertOrdered(d, rune) + } + for _, c := range dcomp { + d = decomposeRecursive(form, c, d) + } + return d +} + +func completeCharFields(form int) { + // Phase 0: pre-expand decomposition. + for i := range chars { + f := &chars[i].forms[form] + if len(f.decomp) == 0 { + continue + } + exp := make(Decomposition, 0) + for _, c := range f.decomp { + exp = decomposeRecursive(form, c, exp) + } + f.expandedDecomp = exp + } + + // Phase 1: composition exclusion, mark decomposition. + for i := range chars { + c := &chars[i] + f := &c.forms[form] + + // Marks script-specific exclusions and version restricted. + f.isOneWay = c.excludeInComp + + // Singletons + f.isOneWay = f.isOneWay || len(f.decomp) == 1 + + // Non-starter decompositions + if len(f.decomp) > 1 { + chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0 + f.isOneWay = f.isOneWay || chk + } + + // Runes that decompose into more than two runes. + f.isOneWay = f.isOneWay || len(f.decomp) > 2 + + if form == FCompatibility { + f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint) + } + + for _, rune := range f.decomp { + chars[rune].forms[form].inDecomp = true + } + } + + // Phase 2: forward and backward combining. + for i := range chars { + c := &chars[i] + f := &c.forms[form] + + if !f.isOneWay && len(f.decomp) == 2 { + f0 := &chars[f.decomp[0]].forms[form] + f1 := &chars[f.decomp[1]].forms[form] + if !f0.isOneWay { + f0.combinesForward = true + } + if !f1.isOneWay { + f1.combinesBackward = true + } + } + } + + // Phase 3: quick check values. + for i := range chars { + c := &chars[i] + f := &c.forms[form] + + switch { + case len(f.decomp) > 0: + f.quickCheck[MDecomposed] = QCNo + case isHangul(i): + f.quickCheck[MDecomposed] = QCNo + default: + f.quickCheck[MDecomposed] = QCYes + } + switch { + case f.isOneWay: + f.quickCheck[MComposed] = QCNo + case (i & 0xffff00) == JamoLBase: + f.quickCheck[MComposed] = QCYes + if JamoVBase <= i && i < JamoVEnd { + f.quickCheck[MComposed] = QCMaybe + f.combinesBackward = true + } + if JamoTBase <= i && i < JamoTEnd { + f.quickCheck[MComposed] = QCMaybe + f.combinesBackward = true + } + case !f.combinesBackward: + f.quickCheck[MComposed] = QCYes + default: + f.quickCheck[MComposed] = QCMaybe + } + } +} + +func printBytes(b []byte, name string) { + fmt.Printf("// %s: %d bytes\n", name, len(b)) + fmt.Printf("var %s = [...]byte {", name) + for i, c := range b { + switch { + case i%64 == 0: + fmt.Printf("\n// Bytes %x - %x\n", i, i+63) + case i%8 == 0: + fmt.Printf("\n") + } + fmt.Printf("0x%.2X, ", c) + } + fmt.Print("\n}\n\n") +} + +// See forminfo.go for format. +func makeEntry(f *FormInfo) uint16 { + e := uint16(0) + if f.combinesForward { + e |= 0x8 + } + if f.quickCheck[MDecomposed] == QCNo { + e |= 0x1 + } + switch f.quickCheck[MComposed] { + case QCYes: + case QCNo: + e |= 0x2 + case QCMaybe: + e |= 0x6 + default: + log.Fatalf("Illegal quickcheck value %d.", f.quickCheck[MComposed]) + } + return e +} + +// Bits +// 0..8: CCC +// 9..12: NF(C|D) qc bits. +// 13..16: NFK(C|D) qc bits. +func makeCharInfo(c Char) uint16 { + e := makeEntry(&c.forms[FCompatibility]) + e = e<<4 | makeEntry(&c.forms[FCanonical]) + e = e<<8 | uint16(c.ccc) + return e +} + +func printCharInfoTables() int { + // Quick Check + CCC trie. + t := newNode() + for i, char := range chars { + v := makeCharInfo(char) + if v != 0 { + t.insert(i, v) + } + } + return t.printTables("charInfo") +} + +func printDecompositionTables() int { + decompositions := bytes.NewBuffer(make([]byte, 0, 10000)) + size := 0 + + // Map decompositions + positionMap := make(map[string]uint16) + + // Store the uniqued decompositions in a byte buffer, + // preceded by their byte length. + for _, c := range chars { + for f := 0; f < 2; f++ { + d := c.forms[f].expandedDecomp + s := string([]int(d)) + if _, ok := positionMap[s]; !ok { + p := decompositions.Len() + decompositions.WriteByte(uint8(len(s))) + decompositions.WriteString(s) + positionMap[s] = uint16(p) + } + } + } + b := decompositions.Bytes() + printBytes(b, "decomps") + size += len(b) + + nfcT := newNode() + nfkcT := newNode() + for i, c := range chars { + d := c.forms[FCanonical].expandedDecomp + if len(d) != 0 { + nfcT.insert(i, positionMap[string([]int(d))]) + if ccc(c.codePoint) != ccc(d[0]) { + // We assume the lead ccc of a decomposition is !=0 in this case. + if ccc(d[0]) == 0 { + logger.Fatal("Expected differing CCC to be non-zero.") + } + } + } + d = c.forms[FCompatibility].expandedDecomp + if len(d) != 0 { + nfkcT.insert(i, positionMap[string([]int(d))]) + if ccc(c.codePoint) != ccc(d[0]) { + // We assume the lead ccc of a decomposition is !=0 in this case. + if ccc(d[0]) == 0 { + logger.Fatal("Expected differing CCC to be non-zero.") + } + } + } + } + size += nfcT.printTables("nfcDecomp") + size += nfkcT.printTables("nfkcDecomp") + return size +} + +func contains(sa []string, s string) bool { + for _, a := range sa { + if a == s { + return true + } + } + return false +} + +// Extract the version number from the URL. +func version() string { + // From http://www.unicode.org/standard/versions/#Version_Numbering: + // for the later Unicode versions, data files are located in + // versioned directories. + fields := strings.Split(*url, "/") + for _, f := range fields { + if match, _ := regexp.MatchString(`[0-9]\.[0-9]\.[0-9]`, f); match { + return f + } + } + logger.Fatal("unknown version") + return "Unknown" +} + +const fileHeader = `// Generated by running +// maketables --tables=%s --url=%s +// DO NOT EDIT + +package norm + +` + +func makeTables() { + size := 0 + if *tablelist == "" { + return + } + list := strings.Split(*tablelist, ",") + if *tablelist == "all" { + list = []string{"decomp", "recomp", "info"} + } + fmt.Printf(fileHeader, *tablelist, *url) + + fmt.Println("// Version is the Unicode edition from which the tables are derived.") + fmt.Printf("const Version = %q\n\n", version()) + + if contains(list, "decomp") { + size += printDecompositionTables() + } + + if contains(list, "recomp") { + // Note that we use 32 bit keys, instead of 64 bit. + // This clips the bits of three entries, but we know + // this won't cause a collision. The compiler will catch + // any changes made to UnicodeData.txt that introduces + // a collision. + // Note that the recomposition map for NFC and NFKC + // are identical. + + // Recomposition map + nrentries := 0 + for _, c := range chars { + f := c.forms[FCanonical] + if !f.isOneWay && len(f.decomp) > 0 { + nrentries++ + } + } + sz := nrentries * 8 + size += sz + fmt.Printf("// recompMap: %d bytes (entries only)\n", sz) + fmt.Println("var recompMap = map[uint32]uint32{") + for i, c := range chars { + f := c.forms[FCanonical] + d := f.decomp + if !f.isOneWay && len(d) > 0 { + key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1])) + fmt.Printf("0x%.8X: 0x%.4X,\n", key, i) + } + } + fmt.Printf("}\n\n") + } + + if contains(list, "info") { + size += printCharInfoTables() + } + fmt.Printf("// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size) +} + +func printChars() { + if *verbose { + for _, c := range chars { + if !c.isValid() || c.state == SMissing { + continue + } + fmt.Println(c) + } + } +} + +// verifyComputed does various consistency tests. +func verifyComputed() { + for i, c := range chars { + for _, f := range c.forms { + isNo := (f.quickCheck[MDecomposed] == QCNo) + if (len(f.decomp) > 0) != isNo && !isHangul(i) { + log.Fatalf("%U: NF*D must be no if rune decomposes", i) + } + + isMaybe := f.quickCheck[MComposed] == QCMaybe + if f.combinesBackward != isMaybe { + log.Fatalf("%U: NF*C must be maybe if combinesBackward", i) + } + } + } +} + +var qcRe = regexp.MustCompile(`^([0-9A-F\.]+) *; (NF.*_QC); ([YNM]) #.*$`) + +// Use values in DerivedNormalizationProps.txt to compare against the +// values we computed. +// DerivedNormalizationProps.txt has form: +// 00C0..00C5 ; NFD_QC; N # ... +// 0374 ; NFD_QC; N # ... +// See http://unicode.org/reports/tr44/ for full explanation +func testDerived() { + if !*test { + return + } + f := openReader("DerivedNormalizationProps.txt") + defer f.Close() + input := bufio.NewReader(f) + for { + line, err := input.ReadString('\n') + if err != nil { + if err == os.EOF { + break + } + logger.Fatal(err) + } + qc := qcRe.FindStringSubmatch(line) + if qc == nil { + continue + } + rng := strings.Split(qc[1], "..") + i, err := strconv.Btoui64(rng[0], 16) + if err != nil { + log.Fatal(err) + } + j := i + if len(rng) > 1 { + j, err = strconv.Btoui64(rng[1], 16) + if err != nil { + log.Fatal(err) + } + } + var ftype, mode int + qt := strings.TrimSpace(qc[2]) + switch qt { + case "NFC_QC": + ftype, mode = FCanonical, MComposed + case "NFD_QC": + ftype, mode = FCanonical, MDecomposed + case "NFKC_QC": + ftype, mode = FCompatibility, MComposed + case "NFKD_QC": + ftype, mode = FCompatibility, MDecomposed + default: + log.Fatalf(`Unexpected quick check type "%s"`, qt) + } + var qr QCResult + switch qc[3] { + case "Y": + qr = QCYes + case "N": + qr = QCNo + case "M": + qr = QCMaybe + default: + log.Fatalf(`Unexpected quick check value "%s"`, qc[3]) + } + var lastFailed bool + // Verify current + for ; i <= j; i++ { + c := &chars[int(i)] + c.forms[ftype].verified[mode] = true + curqr := c.forms[ftype].quickCheck[mode] + if curqr != qr { + if !lastFailed { + logger.Printf("%s: %.4X..%.4X -- %s\n", + qt, int(i), int(j), line[0:50]) + } + logger.Printf("%U: FAILED %s (was %v need %v)\n", + int(i), qt, curqr, qr) + lastFailed = true + } + } + } + // Any unspecified value must be QCYes. Verify this. + for i, c := range chars { + for j, fd := range c.forms { + for k, qr := range fd.quickCheck { + if !fd.verified[k] && qr != QCYes { + m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n" + logger.Printf(m, i, j, k, qr, c.name) + } + } + } + } +} |