summaryrefslogtreecommitdiff
path: root/src/pkg/exp/norm/triegen.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/pkg/exp/norm/triegen.go')
-rw-r--r--src/pkg/exp/norm/triegen.go211
1 files changed, 211 insertions, 0 deletions
diff --git a/src/pkg/exp/norm/triegen.go b/src/pkg/exp/norm/triegen.go
new file mode 100644
index 000000000..2b7eeee17
--- /dev/null
+++ b/src/pkg/exp/norm/triegen.go
@@ -0,0 +1,211 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Trie table generator.
+// Used by make*tables tools to generate a go file with trie data structures
+// for mapping UTF-8 to a 16-bit value. All but the last byte in a UTF-8 byte
+// sequence are used to lookup offsets in the index table to be used for the
+// next byte. The last byte is used to index into a table with 16-bit values.
+
+package main
+
+import (
+ "fmt"
+ "hash/crc32"
+ "log"
+ "utf8"
+)
+
+// Intermediate trie structure
+type trieNode struct {
+ table [256]*trieNode
+ value uint16
+ b byte
+ leaf bool
+}
+
+func newNode() *trieNode {
+ return new(trieNode)
+}
+
+func (n trieNode) String() string {
+ s := fmt.Sprint("trieNode{table: { non-nil at index: ")
+ for i, v := range n.table {
+ if v != nil {
+ s += fmt.Sprintf("%d, ", i)
+ }
+ }
+ s += fmt.Sprintf("}, value:%#x, b:%#x leaf:%v}", n.value, n.b, n.leaf)
+ return s
+}
+
+func (n trieNode) isInternal() bool {
+ internal := true
+ for i := 0; i < 256; i++ {
+ if nn := n.table[i]; nn != nil {
+ if !internal && !nn.leaf {
+ log.Fatalf("triegen: isInternal: node contains both leaf and non-leaf children (%v)", n)
+ }
+ internal = internal && !nn.leaf
+ }
+ }
+ return internal
+}
+
+func (n *trieNode) insert(rune int, value uint16) {
+ var p [utf8.UTFMax]byte
+ sz := utf8.EncodeRune(p[:], rune)
+
+ for i := 0; i < sz; i++ {
+ if n.leaf {
+ log.Fatalf("triegen: insert: node (%#v) should not be a leaf", n)
+ }
+ nn := n.table[p[i]]
+ if nn == nil {
+ nn = newNode()
+ nn.b = p[i]
+ n.table[p[i]] = nn
+ }
+ n = nn
+ }
+ n.value = value
+ n.leaf = true
+}
+
+type nodeIndex struct {
+ lookupBlocks []*trieNode
+ valueBlocks []*trieNode
+
+ lookupBlockIdx map[uint32]uint16
+ valueBlockIdx map[uint32]uint16
+}
+
+func newIndex() *nodeIndex {
+ index := &nodeIndex{}
+ index.lookupBlocks = make([]*trieNode, 0)
+ index.valueBlocks = make([]*trieNode, 0)
+ index.lookupBlockIdx = make(map[uint32]uint16)
+ index.valueBlockIdx = make(map[uint32]uint16)
+ return index
+}
+
+func computeOffsets(index *nodeIndex, n *trieNode) uint16 {
+ if n.leaf {
+ return n.value
+ }
+ hasher := crc32.New(crc32.MakeTable(crc32.IEEE))
+ // We only index continuation bytes.
+ for i := 0; i < 64; i++ {
+ var v uint16 = 0
+ if nn := n.table[0x80+i]; nn != nil {
+ v = computeOffsets(index, nn)
+ }
+ hasher.Write([]byte{uint8(v >> 8), uint8(v)})
+ }
+ h := hasher.Sum32()
+ if n.isInternal() {
+ v, ok := index.lookupBlockIdx[h]
+ if !ok {
+ v = uint16(len(index.lookupBlocks))
+ index.lookupBlocks = append(index.lookupBlocks, n)
+ index.lookupBlockIdx[h] = v
+ }
+ n.value = v
+ } else {
+ v, ok := index.valueBlockIdx[h]
+ if !ok {
+ v = uint16(len(index.valueBlocks))
+ index.valueBlocks = append(index.valueBlocks, n)
+ index.valueBlockIdx[h] = v
+ }
+ n.value = v
+ }
+ return n.value
+}
+
+func printValueBlock(nr int, n *trieNode, offset int) {
+ boff := nr * 64
+ fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
+ var printnewline bool
+ for i := 0; i < 64; i++ {
+ if i%6 == 0 {
+ printnewline = true
+ }
+ v := uint16(0)
+ if nn := n.table[i+offset]; nn != nil {
+ v = nn.value
+ }
+ if v != 0 {
+ if printnewline {
+ fmt.Printf("\n")
+ printnewline = false
+ }
+ fmt.Printf("%#04x:%#04x, ", nr*64+i, v)
+ }
+ }
+}
+
+func printLookupBlock(nr int, n *trieNode, offset int) {
+ boff := nr * 64
+ fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
+ var printnewline bool
+ for i := 0; i < 64; i++ {
+ if i%8 == 0 {
+ printnewline = true
+ }
+ v := uint16(0)
+ if nn := n.table[i+offset]; nn != nil {
+ v = nn.value
+ }
+ if v != 0 {
+ if printnewline {
+ fmt.Printf("\n")
+ printnewline = false
+ }
+ fmt.Printf("%#03x:%#02x, ", boff+i, v)
+ }
+ }
+}
+
+// printTables returns the size in bytes of the generated tables.
+func (t *trieNode) printTables(name string) int {
+ index := newIndex()
+ // Values for 7-bit ASCII are stored in first two block, followed by nil block.
+ index.valueBlocks = append(index.valueBlocks, nil, nil, nil)
+ // First byte of multi-byte UTF-8 codepoints are indexed in 4th block.
+ index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil, nil)
+ // Index starter bytes of multi-byte UTF-8.
+ for i := 0xC0; i < 0x100; i++ {
+ if t.table[i] != nil {
+ computeOffsets(index, t.table[i])
+ }
+ }
+
+ nv := len(index.valueBlocks) * 64
+ fmt.Printf("// %sValues: %d entries, %d bytes\n", name, nv, nv*2)
+ fmt.Printf("// Block 2 is the null block.\n")
+ fmt.Printf("var %sValues = [%d]uint16 {", name, nv)
+ printValueBlock(0, t, 0)
+ printValueBlock(1, t, 64)
+ printValueBlock(2, newNode(), 0)
+ for i := 3; i < len(index.valueBlocks); i++ {
+ printValueBlock(i, index.valueBlocks[i], 0x80)
+ }
+ fmt.Print("\n}\n\n")
+
+ ni := len(index.lookupBlocks) * 64
+ fmt.Printf("// %sLookup: %d bytes\n", name, ni)
+ fmt.Printf("// Block 0 is the null block.\n")
+ fmt.Printf("var %sLookup = [%d]uint8 {", name, ni)
+ printLookupBlock(0, newNode(), 0)
+ printLookupBlock(1, newNode(), 0)
+ printLookupBlock(2, newNode(), 0)
+ printLookupBlock(3, t, 0xC0)
+ for i := 4; i < len(index.lookupBlocks); i++ {
+ printLookupBlock(i, index.lookupBlocks[i], 0x80)
+ }
+ fmt.Print("\n}\n\n")
+ fmt.Printf("var %sTrie = trie{ %sLookup[:], %sValues[:] }\n\n", name, name, name)
+ return nv*2 + ni
+}