summary refs log tree commit diff
path: root/src/pkg/unicode/maketables.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/pkg/unicode/maketables.go')
-rw-r--r-- src/pkg/unicode/maketables.go 353
1 files changed, 319 insertions, 34 deletions
diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go
index 655fe46e4..07b931d7e 100644
--- a/src/pkg/unicode/maketables.go
+++ b/src/pkg/unicode/maketables.go
@@ -24,15 +24,18 @@ import (
func main() {
flag.Parse()
loadChars() // always needed
+ loadCasefold() // sets chars[...].foldCase from CaseFolding.txt; printCasefold below reads it
printCategories()
printScriptOrProperty(false)
printScriptOrProperty(true)
printCases()
printLatinProperties()
+ printCasefold() // runs before printSizes, which reports the foldPairCount tallied by printCaseOrbit
printSizes()
}
var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
+var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
var url = flag.String("url",
"http://www.unicode.org/Public/6.0.0/ucd/",
"URL of Unicode database directory")
@@ -70,7 +73,7 @@ var category = map[string]bool{
// UnicodeData.txt has form:
// 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
// 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
-// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
+// See http://www.unicode.org/reports/tr44/ for a full explanation
// The fields:
const (
FCodePoint = iota
@@ -78,10 +81,10 @@ const (
FGeneralCategory
FCanonicalCombiningClass
FBidiClass
- FDecompositionType
- FDecompositionMapping
+ FDecompositionTypeAndMapping
FNumericType
- FNumericValue
+ FNumericDigit // If a decimal digit.
+ FNumericValue // Includes non-decimal, e.g. U+2155=1/5
FBidiMirrored
FUnicode1Name
FISOComment
@@ -94,21 +97,21 @@ const (
)
var fieldName = []string{
- "CodePoint",
- "Name",
- "GeneralCategory",
- "CanonicalCombiningClass",
- "BidiClass",
- "DecompositionType",
- "DecompositionMapping",
- "NumericType",
- "NumericValue",
- "BidiMirrored",
- "Unicode1Name",
- "ISOComment",
- "SimpleUppercaseMapping",
- "SimpleLowercaseMapping",
- "SimpleTitlecaseMapping",
+ FCodePoint: "CodePoint", // entries are keyed by the F* field constants rather than listed positionally
+ FName: "Name",
+ FGeneralCategory: "GeneralCategory",
+ FCanonicalCombiningClass: "CanonicalCombiningClass",
+ FBidiClass: "BidiClass",
+ FDecompositionTypeAndMapping: "DecompositionTypeAndMapping",
+ FNumericType: "NumericType",
+ FNumericDigit: "NumericDigit",
+ FNumericValue: "NumericValue",
+ FBidiMirrored: "BidiMirrored",
+ FUnicode1Name: "Unicode1Name",
+ FISOComment: "ISOComment",
+ FSimpleUppercaseMapping: "SimpleUppercaseMapping",
+ FSimpleLowercaseMapping: "SimpleLowercaseMapping",
+ FSimpleTitlecaseMapping: "SimpleTitlecaseMapping",
}
// This contains only the properties we're interested in.
@@ -119,6 +122,8 @@ type Char struct {
upperCase int
lowerCase int
titleCase int
+ foldCase int // simple case folding
+ caseOrbit int // next in simple case folding orbit
}
// Scripts.txt has form:
@@ -151,7 +156,7 @@ const (
)
func parseCategory(line string) (state State) {
- field := strings.Split(line, ";", -1)
+ field := strings.Split(line, ";")
if len(field) != NumField {
logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField)
}
@@ -248,7 +253,7 @@ func all(scripts map[string][]Script) []string {
// Extract the version number from the URL
func version() string {
// Break on slashes and look for the first numeric field
- fields := strings.Split(*url, "/", -1)
+ fields := strings.Split(*url, "/")
for _, f := range fields {
if len(f) > 0 && '0' <= f[0] && f[0] <= '9' {
return f
@@ -308,8 +313,53 @@ func loadChars() {
resp.Body.Close()
}
+func loadCasefold() { // Downloads CaseFolding.txt and stores each 'C' and 'S' folding in chars[code].foldCase.
+ if *casefoldingURL == "" {
+ flag.Set("casefolding", *url+"CaseFolding.txt") // default: CaseFolding.txt under the --url directory
+ }
+ resp, err := http.Get(*casefoldingURL)
+ if err != nil {
+ logger.Fatal(err)
+ }
+ if resp.StatusCode != 200 {
+ logger.Fatal("bad GET status for CaseFolding.txt", resp.Status)
+ }
+ input := bufio.NewReader(resp.Body)
+ for {
+ line, err := input.ReadString('\n')
+ if err != nil {
+ if err == os.EOF {
+ break // any trailing text not terminated by '\n' is dropped here
+ }
+ logger.Fatal(err)
+ }
+ if strings.TrimSpace(line) == "" || line[0] == '#' { // skip blank lines and comments; a blank line would otherwise fail the field count check below
+ continue
+ }
+ field := strings.Split(line, "; ") // expected layout: code; status; mapping; # name
+ if len(field) != 4 {
+ logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
+ }
+ kind := field[1]
+ if kind != "C" && kind != "S" {
+ // Only care about 'common' and 'simple' foldings.
+ continue
+ }
+ p1, err := strconv.Btoui64(field[0], 16)
+ if err != nil {
+ logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
+ }
+ p2, err := strconv.Btoui64(field[2], 16)
+ if err != nil {
+ logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
+ }
+ chars[p1].foldCase = int(p2) // p1 folds to p2
+ }
+ resp.Body.Close()
+}
+
const progHeader = `// Generated by running
-// maketables --tables=%s --data=%s
+// maketables --tables=%s --data=%s --casefolding=%s
// DO NOT EDIT
package unicode
@@ -322,7 +372,7 @@ func printCategories() {
return
}
// Find out which categories to dump
- list := strings.Split(*tablelist, ",", -1)
+ list := strings.Split(*tablelist, ",")
if *tablelist == "all" {
list = allCategories()
}
@@ -330,7 +380,7 @@ func printCategories() {
fullCategoryTest(list)
return
}
- fmt.Printf(progHeader, *tablelist, *dataURL)
+ fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
fmt.Printf("const Version = %q\n\n", version())
@@ -344,7 +394,7 @@ func printCategories() {
fmt.Print("}\n\n")
}
- decl := make(sort.StringArray, len(list))
+ decl := make(sort.StringSlice, len(list))
ndecl := 0
for _, name := range list {
if _, ok := category[name]; !ok {
@@ -538,7 +588,7 @@ func parseScript(line string, scripts map[string][]Script) {
if len(line) == 0 {
return
}
- field := strings.Split(line, ";", -1)
+ field := strings.Split(line, ";")
if len(field) != 2 {
logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field))
}
@@ -635,7 +685,7 @@ func printScriptOrProperty(doProps bool) {
resp.Body.Close()
// Find out which scripts to dump
- list := strings.Split(flaglist, ",", -1)
+ list := strings.Split(flaglist, ",")
if flaglist == "all" {
list = all(table)
}
@@ -665,7 +715,7 @@ func printScriptOrProperty(doProps bool) {
fmt.Print("}\n\n")
}
- decl := make(sort.StringArray, len(list))
+ decl := make(sort.StringSlice, len(list))
ndecl := 0
for _, name := range list {
if doProps {
@@ -837,13 +887,13 @@ func printCases() {
}
fmt.Printf(
"// Generated by running\n"+
- "// maketables --data=%s\n"+
+ "// maketables --data=%s --casefolding=%s\n"+
"// DO NOT EDIT\n\n"+
"// CaseRanges is the table describing case mappings for all letters with\n"+
"// non-self mappings.\n"+
"var CaseRanges = _CaseRanges\n"+
"var _CaseRanges = []CaseRange {\n",
- *dataURL)
+ *dataURL, *casefoldingURL)
var startState *caseState // the start of a run; nil for not active
var prevState = &caseState{} // the state of the previous character
@@ -946,13 +996,246 @@ func printLatinProperties() {
if code == ' ' {
property = "pZ | pp"
}
- fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code)
+ fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code)
}
- fmt.Println("}")
+ fmt.Printf("}\n\n")
}
-var range16Count = 0 // Number of entries in the 16-bit range tables.
-var range32Count = 0 // Number of entries in the 32-bit range tables.
+func printCasefold() { // Derives case-folding orbits from chars[...].foldCase, then prints the orbit table and the FoldCategory/FoldScript tables.
+ // Build list of case-folding groups attached to each canonical folded char (typically lower case).
+ var caseOrbit = make([][]int, MaxChar+1)
+ for i := range chars {
+ c := &chars[i]
+ if c.foldCase == 0 {
+ continue
+ }
+ orb := caseOrbit[c.foldCase]
+ if orb == nil {
+ orb = append(orb, c.foldCase) // the folded char itself is always the first member of its group
+ }
+ caseOrbit[c.foldCase] = append(orb, i)
+ }
+
+ // Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
+ for i := range chars {
+ c := &chars[i]
+ f := c.foldCase
+ if f == 0 {
+ f = i
+ }
+ orb := caseOrbit[f]
+ if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
+ // Default assumption of [lower, upper] is wrong: i has a case mapping but no folding group.
+ caseOrbit[i] = []int{i}
+ }
+ }
+
+ // Delete the groups for which assuming [lower, upper] is right.
+ for i, orb := range caseOrbit {
+ if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
+ caseOrbit[i] = nil
+ }
+ }
+
+ // Record orbit information in chars.
+ for _, orb := range caseOrbit {
+ if orb == nil {
+ continue
+ }
+ sort.Ints(orb)
+ c := orb[len(orb)-1] // start from the largest member so it links back to the smallest
+ for _, d := range orb {
+ chars[c].caseOrbit = d
+ c = d
+ }
+ }
+
+ printCaseOrbit()
+
+ // Tables of category and script folding exceptions: code points
+ // that must be added when interpreting a particular category/script
+ // in a case-folding context.
+ cat := make(map[string]map[int]bool)
+ for name := range category {
+ if x := foldExceptions(inCategory(name)); len(x) > 0 {
+ cat[name] = x
+ }
+ }
+
+ scr := make(map[string]map[int]bool)
+ for name := range scripts {
+ if x := foldExceptions(inScript(name)); len(x) > 0 {
+ scr[name] = x // script exceptions belong in scr, not cat, so FoldScript is populated and FoldCategory holds categories only
+ }
+ }
+
+ printCatFold("FoldCategory", cat)
+ printCatFold("FoldScript", scr)
+}
+
+// inCategory returns a list of all the runes in the category. A one-letter name also matches every longer category name that starts with that letter.
+func inCategory(name string) []int {
+ var x []int // stays nil when no rune matches
+ for i := range chars {
+ c := &chars[i]
+ if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
+ x = append(x, i)
+ }
+ }
+ return x
+}
+
+// inScript returns a list of all the runes in the script, expanding every range recorded in scripts[name].
+func inScript(name string) []int {
+ var x []int // stays nil when scripts has no ranges for name
+ for _, s := range scripts[name] {
+ for c := s.lo; c <= s.hi; c++ { // lo and hi are both inclusive
+ x = append(x, int(c))
+ }
+ }
+ return x
+}
+
+// foldExceptions returns the set of all the runes fold-equivalent
+// to runes in class but not in class themselves, as a map keyed by those runes.
+func foldExceptions(class []int) map[int]bool {
+ // Create map containing class and all fold-equivalent chars.
+ m := make(map[int]bool)
+ for _, r := range class {
+ c := &chars[r]
+ if c.caseOrbit == 0 { // no explicit orbit recorded for r
+ // Just upper and lower.
+ if u := c.upperCase; u != 0 {
+ m[u] = true
+ }
+ if l := c.lowerCase; l != 0 {
+ m[l] = true
+ }
+ m[r] = true
+ continue
+ }
+ // Otherwise walk orbit.
+ r0 := r
+ for {
+ m[r] = true
+ r = chars[r].caseOrbit // follow the caseOrbit links until back at the starting rune
+ if r == r0 {
+ break
+ }
+ }
+ }
+
+ // Remove class itself.
+ for _, r := range class {
+ m[r] = false, false // two-value assignment with false deletes the key r from m
+ }
+
+ // What's left is the exceptions.
+ return m
+}
+
+var comment = map[string]string{ // doc comment text that printCatFold prints ahead of each generated table, keyed by table name
+ "FoldCategory": "// FoldCategory maps a category name to a table of\n" +
+ "// code points outside the category that are equivalent under\n" +
+ "// simple case folding to code points inside the category.\n" +
+ "// If there is no entry for a category name, there are no such points.\n",
+
+ "FoldScript": "// FoldScript maps a script name to a table of\n" +
+ "// code points outside the script that are equivalent under\n" +
+ "// simple case folding to code points inside the script.\n" +
+ "// If there is no entry for a script name, there are no such points.\n",
+}
+
+func printCaseOrbit() { // Prints the caseOrbit table of fold pairs, or in test mode checks unicode.SimpleFold against the computed orbits.
+ if *test {
+ for i := range chars {
+ c := &chars[i]
+ f := c.caseOrbit
+ if f == 0 { // no explicit orbit: expect the lower-case mapping, else the upper-case mapping, else the rune itself
+ if c.lowerCase != i && c.lowerCase != 0 {
+ f = c.lowerCase
+ } else if c.upperCase != i && c.upperCase != 0 {
+ f = c.upperCase
+ } else {
+ f = i
+ }
+ }
+ if g := unicode.SimpleFold(i); g != f {
+ fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
+ }
+ }
+ return // test mode prints no table
+ }
+
+ fmt.Printf("var caseOrbit = []foldPair{\n")
+ for i := range chars {
+ c := &chars[i]
+ if c.caseOrbit != 0 { // only chars with an explicit orbit get an entry
+ fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
+ foldPairCount++
+ }
+ }
+ fmt.Printf("}\n\n")
+}
+
+func printCatFold(name string, m map[string]map[int]bool) { // Prints the map variable called name plus one fold<key> RangeTable per entry of m, or in test mode checks the unicode package's map against m.
+ if *test {
+ var pkgMap map[string]*unicode.RangeTable
+ if name == "FoldCategory" {
+ pkgMap = unicode.FoldCategory
+ } else { // any other name is checked against FoldScript
+ pkgMap = unicode.FoldScript
+ }
+ if len(pkgMap) != len(m) {
+ fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
+ return
+ }
+ for k, v := range m {
+ t, ok := pkgMap[k]
+ if !ok {
+ fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
+ continue
+ }
+ n := 0 // number of code points found in the package's table
+ for _, r := range t.R16 {
+ for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
+ if !v[c] {
+ fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
+ }
+ n++
+ }
+ }
+ for _, r := range t.R32 {
+ for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
+ if !v[c] {
+ fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
+ }
+ n++
+ }
+ }
+ if n != len(v) { // a count mismatch means the table and the expected set differ in size
+ fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
+ }
+ }
+ return
+ }
+
+ fmt.Print(comment[name])
+ fmt.Printf("var %s = map[string]*RangeTable{\n", name)
+ for name := range m { // name here shadows the parameter and is a key of m
+ fmt.Printf("\t%q: fold%s,\n", name, name)
+ }
+ fmt.Printf("}\n\n")
+ for name, class := range m {
+ dumpRange(
+ fmt.Sprintf("var fold%s = &RangeTable{\n", name),
+ func(code int) bool { return class[code] })
+ }
+}
+
+var range16Count = 0 // Number of entries in the 16-bit range tables.
+var range32Count = 0 // Number of entries in the 32-bit range tables.
+var foldPairCount = 0 // Number of fold pairs in the exception tables.
func printSizes() {
if *test {
@@ -963,4 +1246,6 @@ func printSizes() {
range16Bytes := range16Count * 3 * 2
range32Bytes := range32Count * 3 * 4
fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
+ fmt.Println()
+ fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
}