summary refs log tree commit diff
path: root/src/pkg/unicode/maketables.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/pkg/unicode/maketables.go')
-rw-r--r-- src/pkg/unicode/maketables.go 219
1 files changed, 169 insertions, 50 deletions
diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go
index 0c367673e..655fe46e4 100644
--- a/src/pkg/unicode/maketables.go
+++ b/src/pkg/unicode/maketables.go
@@ -28,6 +28,8 @@ func main() {
printScriptOrProperty(false)
printScriptOrProperty(true)
printCases()
+ printLatinProperties()
+ printSizes()
}
var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
@@ -53,7 +55,17 @@ var test = flag.Bool("test",
var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)
var logger = log.New(os.Stderr, "", log.Lshortfile)
-var category = map[string]bool{"letter": true} // Nd Lu etc. letter is a special case
+var category = map[string]bool{ // set of recognized category names, keyed by name
+ // Two-letter names (Nd, Lu, etc.) are not listed here; presumably they are added while the data file is parsed (TODO confirm).
+ // One-character names identify merged categories: every two-letter category that starts with that letter.
+ "L": true, // Lu Ll Lt Lm Lo
+ "P": true, // Pc Pd Ps Pe Pi Pf Po
+ "M": true, // Mn Mc Me
+ "N": true, // Nd Nl No
+ "S": true, // Sm Sc Sk So
+ "Z": true, // Zs Zl Zp
+ "C": true, // Cc Cf Cs Co Cn
+}
// UnicodeData.txt has form:
// 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
@@ -246,19 +258,16 @@ func version() string {
return "Unknown"
}
-func letterOp(code int) bool {
- switch chars[code].category {
- case "Lu", "Ll", "Lt", "Lm", "Lo":
- return true
- }
- return false
+func categoryOp(code int, class uint8) bool { // categoryOp reports whether the category string of chars[code] begins with the byte class (a one-letter merged category such as 'L').
+ category := chars[code].category // local string; shadows the package-level category map inside this function
+ return len(category) > 0 && category[0] == class // length check guards entries whose category string is empty
}
func loadChars() {
if *dataURL == "" {
flag.Set("data", *url+"UnicodeData.txt")
}
- resp, _, err := http.Get(*dataURL)
+ resp, err := http.Get(*dataURL)
if err != nil {
logger.Fatal(err)
}
@@ -278,16 +287,16 @@ func loadChars() {
switch parseCategory(line[0 : len(line)-1]) {
case SNormal:
if first != 0 {
- logger.Fatalf("bad state normal at U+%04X", lastChar)
+ logger.Fatalf("bad state normal at %U", lastChar)
}
case SFirst:
if first != 0 {
- logger.Fatalf("bad state first at U+%04X", lastChar)
+ logger.Fatalf("bad state first at %U", lastChar)
}
first = lastChar
case SLast:
if first == 0 {
- logger.Fatalf("bad state last at U+%04X", lastChar)
+ logger.Fatalf("bad state last at %U", lastChar)
}
for i := first + 1; i <= lastChar; i++ {
chars[i] = chars[first]
@@ -299,6 +308,15 @@ func loadChars() {
resp.Body.Close()
}
+const progHeader = `// Generated by running
+// maketables --tables=%s --data=%s
+// DO NOT EDIT
+
+package unicode
+
+` // progHeader is the Printf template for the preamble of the generated file; printCategories fills its two %s verbs with *tablelist and *dataURL.
+
+
func printCategories() {
if *tablelist == "" {
return
@@ -312,20 +330,14 @@ func printCategories() {
fullCategoryTest(list)
return
}
- fmt.Printf(
- "// Generated by running\n"+
- "// maketables --tables=%s --data=%s\n"+
- "// DO NOT EDIT\n\n"+
- "package unicode\n\n",
- *tablelist,
- *dataURL)
+ fmt.Printf(progHeader, *tablelist, *dataURL)
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
fmt.Printf("const Version = %q\n\n", version())
if *tablelist == "all" {
fmt.Println("// Categories is the set of Unicode data tables.")
- fmt.Println("var Categories = map[string] []Range {")
+ fmt.Println("var Categories = map[string] *RangeTable {")
for k := range category {
fmt.Printf("\t%q: %s,\n", k, k)
}
@@ -344,8 +356,27 @@ func printCategories() {
// Cases deserving special comments
varDecl := ""
switch name {
- case "letter":
- varDecl = "\tLetter = letter; // Letter is the set of Unicode letters.\n"
+ case "C":
+ varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n"
+ varDecl += "\tC = _C\n"
+ case "L":
+ varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n"
+ varDecl += "\tL = _L\n"
+ case "M":
+ varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n"
+ varDecl += "\tM = _M\n"
+ case "N":
+ varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n"
+ varDecl += "\tN = _N\n"
+ case "P":
+ varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n"
+ varDecl += "\tP = _P\n"
+ case "S":
+ varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n"
+ varDecl += "\tS = _S\n"
+ case "Z":
+ varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n"
+ varDecl += "\tZ = _Z\n"
case "Nd":
varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n"
case "Lu":
@@ -355,21 +386,22 @@ func printCategories() {
case "Lt":
varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n"
}
- if name != "letter" {
+ if len(name) > 1 {
varDecl += fmt.Sprintf(
"\t%s = _%s; // %s is the set of Unicode characters in category %s.\n",
name, name, name, name)
}
decl[ndecl] = varDecl
ndecl++
- if name == "letter" { // special case
+ if len(name) == 1 { // unified categories
+ decl := fmt.Sprintf("var _%s = &RangeTable{\n", name)
dumpRange(
- "var letter = []Range {\n",
- letterOp)
+ decl,
+ func(code int) bool { return categoryOp(code, name[0]) })
continue
}
dumpRange(
- fmt.Sprintf("var _%s = []Range {\n", name),
+ fmt.Sprintf("var _%s = &RangeTable{\n", name),
func(code int) bool { return chars[code].category == name })
}
decl.Sort()
@@ -382,12 +414,15 @@ func printCategories() {
type Op func(code int) bool
-const format = "\t{0x%04x, 0x%04x, %d},\n"
+const format = "\t\t{0x%04x, 0x%04x, %d},\n"
func dumpRange(header string, inCategory Op) {
fmt.Print(header)
next := 0
+ fmt.Print("\tR16: []Range16{\n")
// one Range for each iteration
+ count := &range16Count
+ size := 16
for {
// look for start of range
for next < len(chars) && !inCategory(next) {
@@ -427,24 +462,49 @@ func dumpRange(header string, inCategory Op) {
break
}
}
- fmt.Printf(format, lo, hi, stride)
+ size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count)
// next range: start looking where this range ends
next = hi + 1
}
+ fmt.Print("\t},\n")
fmt.Print("}\n\n")
}
+func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) { // printRange prints one {lo, hi, stride} table entry and bumps *count; the first time hi reaches 1<<16 while size is 16 it closes the R16 list and opens the R32 list. It returns the size and counter to pass to the next call.
+ if size == 16 && hi >= 1<<16 { // first entry that does not fit in the 16-bit list
+ if lo < 1<<16 { // the entry starts below 1<<16 but ends at or above it
+ if lo+stride != hi { // only a range of exactly two elements can be split below
+ logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride)
+ }
+ // No range contains U+FFFF as an instance, so split
+ // the range into two entries. That way we can maintain
+ // the invariant that R32 contains only >= 1<<16.
+ fmt.Printf(format, lo, lo, 1) // low element goes into the R16 list, which is still open
+ lo = hi // the high element is printed at the bottom as {hi, hi, 1}
+ stride = 1
+ *count++ // counts the entry just printed, before count is redirected to range32Count
+ }
+ fmt.Print("\t},\n") // closes the R16 list
+ fmt.Print("\tR32: []Range32{\n")
+ size = 32 // later calls skip this whole branch
+ count = &range32Count
+ }
+ fmt.Printf(format, lo, hi, stride)
+ *count++
+ return size, count
+}
+
func fullCategoryTest(list []string) {
for _, name := range list {
if _, ok := category[name]; !ok {
logger.Fatal("unknown category", name)
}
r, ok := unicode.Categories[name]
- if !ok {
- logger.Fatal("unknown table", name)
+ if !ok && len(name) > 1 {
+ logger.Fatalf("unknown table %q", name)
}
- if name == "letter" {
- verifyRange(name, letterOp, r)
+ if len(name) == 1 {
+ verifyRange(name, func(code int) bool { return categoryOp(code, name[0]) }, r)
} else {
verifyRange(
name,
@@ -454,12 +514,17 @@ func fullCategoryTest(list []string) {
}
}
-func verifyRange(name string, inCategory Op, table []unicode.Range) {
+func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { // verifyRange checks, for every index of chars, that inCategory agrees with unicode.Is(table, i) and reports each disagreement on standard error, prefixed with name.
+ count := 0 // number of disagreements reported so far
for i := range chars {
web := inCategory(i) // answer from the supplied predicate
pkg := unicode.Is(table, i) // answer from the table in the installed unicode package
if web != pkg {
- fmt.Fprintf(os.Stderr, "%s: U+%04X: web=%t pkg=%t\n", name, i, web, pkg)
+ fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
+ count++
+ if count > 10 { // stops after the 11th report so a badly broken table does not flood the output
+ break
+ }
}
}
}
@@ -497,22 +562,22 @@ func parseScript(line string, scripts map[string][]Script) {
}
// foldAdjacent merges each entry of r that begins immediately after the previous folded range ends; the script tables have many such adjacent elements. Presumably r is sorted by lo (TODO confirm).
-func foldAdjacent(r []Script) []unicode.Range {
- s := make([]unicode.Range, 0, len(r))
+func foldAdjacent(r []Script) []unicode.Range32 {
+ s := make([]unicode.Range32, 0, len(r)) // capacity len(r) bounds the result, so the reslice below stays in range
j := 0 // number of ranges written to s so far
for i := 0; i < len(r); i++ {
- if j > 0 && int(r[i].lo) == s[j-1].Hi+1 {
- s[j-1].Hi = int(r[i].hi)
+ if j > 0 && r[i].lo == s[j-1].Hi+1 { // r[i] starts right after the last folded range
+ s[j-1].Hi = r[i].hi // extend that range instead of adding a new one
} else {
s = s[0 : j+1] // grow s by one element within its capacity
- s[j] = unicode.Range{int(r[i].lo), int(r[i].hi), 1}
+ s[j] = unicode.Range32{uint32(r[i].lo), uint32(r[i].hi), 1} // folded ranges always have stride 1
j++
}
}
return s
}
-func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts map[string][]Script) {
+func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
for _, name := range list {
if _, ok := scripts[name]; !ok {
logger.Fatal("unknown script", name)
@@ -524,7 +589,7 @@ func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts
for _, script := range scripts[name] {
for r := script.lo; r <= script.hi; r++ {
if !unicode.Is(installed[name], int(r)) {
- fmt.Fprintf(os.Stderr, "U+%04X: not in script %s\n", r, name)
+ fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
}
}
}
@@ -549,7 +614,7 @@ func printScriptOrProperty(doProps bool) {
return
}
var err os.Error
- resp, _, err := http.Get(*url + file)
+ resp, err := http.Get(*url + file)
if err != nil {
logger.Fatal(err)
}
@@ -589,10 +654,10 @@ func printScriptOrProperty(doProps bool) {
if flaglist == "all" {
if doProps {
fmt.Println("// Properties is the set of Unicode property tables.")
- fmt.Println("var Properties = map[string] []Range {")
+ fmt.Println("var Properties = map[string] *RangeTable{")
} else {
fmt.Println("// Scripts is the set of Unicode script tables.")
- fmt.Println("var Scripts = map[string] []Range {")
+ fmt.Println("var Scripts = map[string] *RangeTable{")
}
for k := range table {
fmt.Printf("\t%q: %s,\n", k, k)
@@ -613,11 +678,15 @@ func printScriptOrProperty(doProps bool) {
name, name, name, name)
}
ndecl++
- fmt.Printf("var _%s = []Range {\n", name)
+ fmt.Printf("var _%s = &RangeTable {\n", name)
+ fmt.Print("\tR16: []Range16{\n")
ranges := foldAdjacent(table[name])
+ size := 16
+ count := &range16Count
for _, s := range ranges {
- fmt.Printf(format, s.Lo, s.Hi, s.Stride)
+ size, count = printRange(s.Lo, s.Hi, s.Stride, size, count)
}
+ fmt.Print("\t},\n")
fmt.Print("}\n\n")
}
decl.Sort()
@@ -808,7 +877,7 @@ func printCaseRange(lo, hi *caseState) {
fmt.Printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
lo.point, hi.point)
case hi.point > lo.point && lo.isLowerUpper():
- logger.Fatalf("LowerUpper sequence: should not happen: U+%04X. If it's real, need to fix To()", lo.point)
+ logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point)
fmt.Printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
lo.point, hi.point)
default:
@@ -831,17 +900,67 @@ func fullCaseTest() {
lower := unicode.ToLower(i)
want := caseIt(i, c.lowerCase)
if lower != want {
- fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower)
+ fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
}
upper := unicode.ToUpper(i)
want = caseIt(i, c.upperCase)
if upper != want {
- fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper)
+ fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
}
title := unicode.ToTitle(i)
want = caseIt(i, c.titleCase)
if title != want {
- fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title)
+ fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
}
}
}
+
+func printLatinProperties() { // printLatinProperties prints Go source for the properties array, one entry per code point from 0 through unicode.MaxLatin1, each derived from chars[code].category.
+ if *test { // nothing is generated when the test flag is set
+ return
+ }
+ fmt.Println("var properties = [MaxLatin1+1]uint8{")
+ for code := 0; code <= unicode.MaxLatin1; code++ {
+ var property string // expression text written as the array element's value
+ switch chars[code].category {
+ case "Cc", "": // NUL has no category.
+ property = "pC"
+ case "Cf": // soft hyphen, unique category, not printable.
+ property = "0"
+ case "Ll":
+ property = "pLl | pp" // pp presumably marks printable characters (TODO confirm against the generated package)
+ case "Lu":
+ property = "pLu | pp"
+ case "Nd", "No":
+ property = "pN | pp"
+ case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
+ property = "pP | pp"
+ case "Sc", "Sk", "Sm", "So":
+ property = "pS | pp"
+ case "Zs":
+ property = "pZ"
+ default: // any other category in this range aborts generation
+ logger.Fatalf("%U has unknown category %q", code, chars[code].category)
+ }
+ // Special case: the ASCII space gets pp in addition to pZ, unlike the other Zs code points.
+ if code == ' ' {
+ property = "pZ | pp"
+ }
+ fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code) // keyed element: hex index, value, and the quoted character as a comment
+ }
+ fmt.Println("}")
+}
+
+var range16Count = 0 // Number of entries in the 16-bit range tables; incremented through the count pointer handed to printRange.
+var range32Count = 0 // Number of entries in the 32-bit range tables; printRange redirects its counter here when it opens an R32 list.
+
+func printSizes() { // printSizes prints two comment lines giving the number of range entries counted in range16Count and range32Count and a byte estimate for each.
+ if *test { // nothing is printed when the test flag is set
+ return
+ }
+ fmt.Println()
+ fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
+ range16Bytes := range16Count * 3 * 2 // three values per entry, 2 bytes each
+ range32Bytes := range32Count * 3 * 4 // three values per entry, 4 bytes each
+ fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
+}