diff options
| author | Ondřej Surý <ondrej@sury.org> | 2011-08-03 16:54:30 +0200 | 
|---|---|---|
| committer | Ondřej Surý <ondrej@sury.org> | 2011-08-03 16:54:30 +0200 | 
| commit | 28592ee1ea1f5cdffcf85472f9de0285d928cf12 (patch) | |
| tree | 32944e18b23f7fe4a0818a694aa2a6dfb1835463 /src/pkg/unicode/maketables.go | |
| parent | e836bee4716dc0d4d913537ad3ad1925a7ac32d0 (diff) | |
| download | golang-28592ee1ea1f5cdffcf85472f9de0285d928cf12.tar.gz | |
Imported Upstream version 59 (upstream/59)
Diffstat (limited to 'src/pkg/unicode/maketables.go')
| -rw-r--r-- | src/pkg/unicode/maketables.go | 353 | 
1 files changed, 319 insertions, 34 deletions
| diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go index 655fe46e4..07b931d7e 100644 --- a/src/pkg/unicode/maketables.go +++ b/src/pkg/unicode/maketables.go @@ -24,15 +24,18 @@ import (  func main() {  	flag.Parse()  	loadChars() // always needed +	loadCasefold()  	printCategories()  	printScriptOrProperty(false)  	printScriptOrProperty(true)  	printCases()  	printLatinProperties() +	printCasefold()  	printSizes()  }  var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") +var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")  var url = flag.String("url",  	"http://www.unicode.org/Public/6.0.0/ucd/",  	"URL of Unicode database directory") @@ -70,7 +73,7 @@ var category = map[string]bool{  // UnicodeData.txt has form:  //	0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;  //	007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A -// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation +// See http://www.unicode.org/reports/tr44/ for a full explanation  // The fields:  const (  	FCodePoint = iota @@ -78,10 +81,10 @@ const (  	FGeneralCategory  	FCanonicalCombiningClass  	FBidiClass -	FDecompositionType -	FDecompositionMapping +	FDecompositionTypeAndMapping  	FNumericType -	FNumericValue +	FNumericDigit // If a decimal digit. +	FNumericValue // Includes non-decimal, e.g. 
U+2155=1/5  	FBidiMirrored  	FUnicode1Name  	FISOComment @@ -94,21 +97,21 @@ const (  )  var fieldName = []string{ -	"CodePoint", -	"Name", -	"GeneralCategory", -	"CanonicalCombiningClass", -	"BidiClass", -	"DecompositionType", -	"DecompositionMapping", -	"NumericType", -	"NumericValue", -	"BidiMirrored", -	"Unicode1Name", -	"ISOComment", -	"SimpleUppercaseMapping", -	"SimpleLowercaseMapping", -	"SimpleTitlecaseMapping", +	FCodePoint:                   "CodePoint", +	FName:                        "Name", +	FGeneralCategory:             "GeneralCategory", +	FCanonicalCombiningClass:     "CanonicalCombiningClass", +	FBidiClass:                   "BidiClass", +	FDecompositionTypeAndMapping: "DecompositionTypeAndMapping", +	FNumericType:                 "NumericType", +	FNumericDigit:                "NumericDigit", +	FNumericValue:                "NumericValue", +	FBidiMirrored:                "BidiMirrored", +	FUnicode1Name:                "Unicode1Name", +	FISOComment:                  "ISOComment", +	FSimpleUppercaseMapping:      "SimpleUppercaseMapping", +	FSimpleLowercaseMapping:      "SimpleLowercaseMapping", +	FSimpleTitlecaseMapping:      "SimpleTitlecaseMapping",  }  // This contains only the properties we're interested in. 
@@ -119,6 +122,8 @@ type Char struct {  	upperCase int  	lowerCase int  	titleCase int +	foldCase  int // simple case folding +	caseOrbit int // next in simple case folding orbit  }  // Scripts.txt has form: @@ -151,7 +156,7 @@ const (  )  func parseCategory(line string) (state State) { -	field := strings.Split(line, ";", -1) +	field := strings.Split(line, ";")  	if len(field) != NumField {  		logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField)  	} @@ -248,7 +253,7 @@ func all(scripts map[string][]Script) []string {  // Extract the version number from the URL  func version() string {  	// Break on slashes and look for the first numeric field -	fields := strings.Split(*url, "/", -1) +	fields := strings.Split(*url, "/")  	for _, f := range fields {  		if len(f) > 0 && '0' <= f[0] && f[0] <= '9' {  			return f @@ -308,8 +313,53 @@ func loadChars() {  	resp.Body.Close()  } +func loadCasefold() { +	if *casefoldingURL == "" { +		flag.Set("casefolding", *url+"CaseFolding.txt") +	} +	resp, err := http.Get(*casefoldingURL) +	if err != nil { +		logger.Fatal(err) +	} +	if resp.StatusCode != 200 { +		logger.Fatal("bad GET status for CaseFolding.txt", resp.Status) +	} +	input := bufio.NewReader(resp.Body) +	for { +		line, err := input.ReadString('\n') +		if err != nil { +			if err == os.EOF { +				break +			} +			logger.Fatal(err) +		} +		if line[0] == '#' { +			continue +		} +		field := strings.Split(line, "; ") +		if len(field) != 4 { +			logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4) +		} +		kind := field[1] +		if kind != "C" && kind != "S" { +			// Only care about 'common' and 'simple' foldings. 
+			continue +		} +		p1, err := strconv.Btoui64(field[0], 16) +		if err != nil { +			logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) +		} +		p2, err := strconv.Btoui64(field[2], 16) +		if err != nil { +			logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err) +		} +		chars[p1].foldCase = int(p2) +	} +	resp.Body.Close() +} +  const progHeader = `// Generated by running -//	maketables --tables=%s --data=%s +//	maketables --tables=%s --data=%s --casefolding=%s  // DO NOT EDIT  package unicode @@ -322,7 +372,7 @@ func printCategories() {  		return  	}  	// Find out which categories to dump -	list := strings.Split(*tablelist, ",", -1) +	list := strings.Split(*tablelist, ",")  	if *tablelist == "all" {  		list = allCategories()  	} @@ -330,7 +380,7 @@ func printCategories() {  		fullCategoryTest(list)  		return  	} -	fmt.Printf(progHeader, *tablelist, *dataURL) +	fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL)  	fmt.Println("// Version is the Unicode edition from which the tables are derived.")  	fmt.Printf("const Version = %q\n\n", version()) @@ -344,7 +394,7 @@ func printCategories() {  		fmt.Print("}\n\n")  	} -	decl := make(sort.StringArray, len(list)) +	decl := make(sort.StringSlice, len(list))  	ndecl := 0  	for _, name := range list {  		if _, ok := category[name]; !ok { @@ -538,7 +588,7 @@ func parseScript(line string, scripts map[string][]Script) {  	if len(line) == 0 {  		return  	} -	field := strings.Split(line, ";", -1) +	field := strings.Split(line, ";")  	if len(field) != 2 {  		logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field))  	} @@ -635,7 +685,7 @@ func printScriptOrProperty(doProps bool) {  	resp.Body.Close()  	// Find out which scripts to dump -	list := strings.Split(flaglist, ",", -1) +	list := strings.Split(flaglist, ",")  	if flaglist == "all" {  		list = all(table)  	} @@ -665,7 +715,7 @@ func printScriptOrProperty(doProps bool) {  		fmt.Print("}\n\n")  	} -	decl := make(sort.StringArray, len(list)) +	decl := 
make(sort.StringSlice, len(list))  	ndecl := 0  	for _, name := range list {  		if doProps { @@ -837,13 +887,13 @@ func printCases() {  	}  	fmt.Printf(  		"// Generated by running\n"+ -			"//	maketables --data=%s\n"+ +			"//	maketables --data=%s --casefolding=%s\n"+  			"// DO NOT EDIT\n\n"+  			"// CaseRanges is the table describing case mappings for all letters with\n"+  			"// non-self mappings.\n"+  			"var CaseRanges = _CaseRanges\n"+  			"var _CaseRanges = []CaseRange {\n", -		*dataURL) +		*dataURL, *casefoldingURL)  	var startState *caseState    // the start of a run; nil for not active  	var prevState = &caseState{} // the state of the previous character @@ -946,13 +996,246 @@ func printLatinProperties() {  		if code == ' ' {  			property = "pZ | pp"  		} -		fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code) +		fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code)  	} -	fmt.Println("}") +	fmt.Printf("}\n\n")  } -var range16Count = 0 // Number of entries in the 16-bit range tables. -var range32Count = 0 // Number of entries in the 32-bit range tables. +func printCasefold() { +	// Build list of case-folding groups attached to each canonical folded char (typically lower case). +	var caseOrbit = make([][]int, MaxChar+1) +	for i := range chars { +		c := &chars[i] +		if c.foldCase == 0 { +			continue +		} +		orb := caseOrbit[c.foldCase] +		if orb == nil { +			orb = append(orb, c.foldCase) +		} +		caseOrbit[c.foldCase] = append(orb, i) +	} + +	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong. +	for i := range chars { +		c := &chars[i] +		f := c.foldCase +		if f == 0 { +			f = i +		} +		orb := caseOrbit[f] +		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) { +			// Default assumption of [upper, lower] is wrong. +			caseOrbit[i] = []int{i} +		} +	} + +	// Delete the groups for which assuming [lower, upper] is right. 
+	for i, orb := range caseOrbit { +		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] { +			caseOrbit[i] = nil +		} +	} + +	// Record orbit information in chars. +	for _, orb := range caseOrbit { +		if orb == nil { +			continue +		} +		sort.Ints(orb) +		c := orb[len(orb)-1] +		for _, d := range orb { +			chars[c].caseOrbit = d +			c = d +		} +	} + +	printCaseOrbit() + +	// Tables of category and script folding exceptions: code points +	// that must be added when interpreting a particular category/script +	// in a case-folding context. +	cat := make(map[string]map[int]bool) +	for name := range category { +		if x := foldExceptions(inCategory(name)); len(x) > 0 { +			cat[name] = x +		} +	} + +	scr := make(map[string]map[int]bool) +	for name := range scripts { +		if x := foldExceptions(inScript(name)); len(x) > 0 { +			cat[name] = x +		} +	} + +	printCatFold("FoldCategory", cat) +	printCatFold("FoldScript", scr) +} + +// inCategory returns a list of all the runes in the category. +func inCategory(name string) []int { +	var x []int +	for i := range chars { +		c := &chars[i] +		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] { +			x = append(x, i) +		} +	} +	return x +} + +// inScript returns a list of all the runes in the script. +func inScript(name string) []int { +	var x []int +	for _, s := range scripts[name] { +		for c := s.lo; c <= s.hi; c++ { +			x = append(x, int(c)) +		} +	} +	return x +} + +// foldExceptions returns a list of all the runes fold-equivalent +// to runes in class but not in class themselves. +func foldExceptions(class []int) map[int]bool { +	// Create map containing class and all fold-equivalent chars. +	m := make(map[int]bool) +	for _, r := range class { +		c := &chars[r] +		if c.caseOrbit == 0 { +			// Just upper and lower. 
+			if u := c.upperCase; u != 0 { +				m[u] = true +			} +			if l := c.lowerCase; l != 0 { +				m[l] = true +			} +			m[r] = true +			continue +		} +		// Otherwise walk orbit. +		r0 := r +		for { +			m[r] = true +			r = chars[r].caseOrbit +			if r == r0 { +				break +			} +		} +	} + +	// Remove class itself. +	for _, r := range class { +		m[r] = false, false +	} + +	// What's left is the exceptions. +	return m +} + +var comment = map[string]string{ +	"FoldCategory": "// FoldCategory maps a category name to a table of\n" + +		"// code points outside the category that are equivalent under\n" + +		"// simple case folding to code points inside the category.\n" + +		"// If there is no entry for a category name, there are no such points.\n", + +	"FoldScript": "// FoldScript maps a script name to a table of\n" + +		"// code points outside the script that are equivalent under\n" + +		"// simple case folding to code points inside the script.\n" + +		"// If there is no entry for a script name, there are no such points.\n", +} + +func printCaseOrbit() { +	if *test { +		for i := range chars { +			c := &chars[i] +			f := c.caseOrbit +			if f == 0 { +				if c.lowerCase != i && c.lowerCase != 0 { +					f = c.lowerCase +				} else if c.upperCase != i && c.upperCase != 0 { +					f = c.upperCase +				} else { +					f = i +				} +			} +			if g := unicode.SimpleFold(i); g != f { +				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f) +			} +		} +		return +	} + +	fmt.Printf("var caseOrbit = []foldPair{\n") +	for i := range chars { +		c := &chars[i] +		if c.caseOrbit != 0 { +			fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) +			foldPairCount++ +		} +	} +	fmt.Printf("}\n\n") +} + +func printCatFold(name string, m map[string]map[int]bool) { +	if *test { +		var pkgMap map[string]*unicode.RangeTable +		if name == "FoldCategory" { +			pkgMap = unicode.FoldCategory +		} else { +			pkgMap = unicode.FoldScript +		} +		if len(pkgMap) != len(m) { +			
fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m)) +			return +		} +		for k, v := range m { +			t, ok := pkgMap[k] +			if !ok { +				fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k) +				continue +			} +			n := 0 +			for _, r := range t.R16 { +				for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) { +					if !v[c] { +						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) +					} +					n++ +				} +			} +			for _, r := range t.R32 { +				for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) { +					if !v[c] { +						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) +					} +					n++ +				} +			} +			if n != len(v) { +				fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v)) +			} +		} +		return +	} + +	fmt.Print(comment[name]) +	fmt.Printf("var %s = map[string]*RangeTable{\n", name) +	for name := range m { +		fmt.Printf("\t%q: fold%s,\n", name, name) +	} +	fmt.Printf("}\n\n") +	for name, class := range m { +		dumpRange( +			fmt.Sprintf("var fold%s = &RangeTable{\n", name), +			func(code int) bool { return class[code] }) +	} +} + +var range16Count = 0  // Number of entries in the 16-bit range tables. +var range32Count = 0  // Number of entries in the 32-bit range tables. +var foldPairCount = 0 // Number of fold pairs in the exception tables.  func printSizes() {  	if *test { @@ -963,4 +1246,6 @@ func printSizes() {  	range16Bytes := range16Count * 3 * 2  	range32Bytes := range32Count * 3 * 4  	fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) +	fmt.Println() +	fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)  } | 
