author    Ondřej Surý <ondrej@sury.org>    2011-09-13 13:13:40 +0200
committer Ondřej Surý <ondrej@sury.org>    2011-09-13 13:13:40 +0200
commit    5ff4c17907d5b19510a62e08fd8d3b11e62b431d (patch)
tree      c0650497e988f47be9c6f2324fa692a52dea82e1 /src/cmd/godoc/index.go
parent    80f18fc933cf3f3e829c5455a1023d69f7b86e52 (diff)
download  golang-upstream/60.tar.gz

Imported Upstream version 60 (tag: upstream/60)

Diffstat (limited to 'src/cmd/godoc/index.go')
-rw-r--r--  src/cmd/godoc/index.go | 986
1 file changed, 986 insertions(+), 0 deletions(-)
diff --git a/src/cmd/godoc/index.go b/src/cmd/godoc/index.go
new file mode 100644
index 000000000..9b4f31514
--- /dev/null
+++ b/src/cmd/godoc/index.go
@@ -0,0 +1,986 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains the infrastructure to create an
+// identifier and full-text index for a set of Go files.
+//
+// Algorithm for identifier index:
+// - traverse all .go files of the file tree specified by root
+// - for each word (identifier) encountered, collect all occurrences (spots)
+// into a list; this produces a list of spots for each word
+// - reduce the lists: from a list of spots to a list of FileRuns,
+// and from a list of FileRuns into a list of PakRuns
+// - make a HitList from the PakRuns
+//
+// Details:
+// - keep two lists per word: one containing package-level declarations
+// that have snippets, and one containing all other spots
+// - keep the snippets in a separate table indexed by snippet index
+// and store the snippet index in place of the line number in a SpotInfo
+// (the line number for spots with snippets is stored in the snippet)
+// - at the end, create lists of alternative spellings for a given
+// word
+//
+// Algorithm for full text index:
+// - concatenate all source code in a byte buffer (in memory)
+// - add the files to a file set in lockstep as they are added to the byte
+// buffer such that a byte buffer offset corresponds to the Pos value for
+// that file location
+// - create a suffix array from the concatenated sources
+//
+// String lookup in full text index:
+// - use the suffix array to look up a string's offsets - the offsets
+// correspond to the Pos values relative to the file set
+// - translate the Pos values back into file and line information and
+// sort the result
+
+package main
+
+import (
+ "bytes"
+ "container/vector"
+ "go/ast"
+ "go/parser"
+ "go/token"
+ "go/scanner"
+ "index/suffixarray"
+ "os"
+ "path/filepath"
+ "regexp"
+ "sort"
+ "strings"
+)
+
+// ----------------------------------------------------------------------------
+// RunList
+
+// A RunList is a vector of entries that can be sorted according to some
+// criteria. A RunList may be compressed by grouping "runs" of entries
+// which are equal (according to the sort criteria) into a new RunList of
+// runs. For instance, a RunList containing pairs (x, y) may be compressed
+// into a RunList containing pair runs (x, {y}) where each run consists of
+// a list of y's with the same x.
+type RunList struct {
+ vector.Vector
+ less func(x, y interface{}) bool
+}
+
+func (h *RunList) Less(i, j int) bool { return h.less(h.At(i), h.At(j)) }
+
+func (h *RunList) sort(less func(x, y interface{}) bool) {
+ h.less = less
+ sort.Sort(h)
+}
+
+// Compress entries which are the same according to a sort criterion
+// (specified by less) into "runs".
+func (h *RunList) reduce(less func(x, y interface{}) bool, newRun func(h *RunList, i, j int) interface{}) *RunList {
+ // create runs of entries with equal values
+ h.sort(less)
+
+ // for each run, make a new run object and collect them in a new RunList
+ var hh RunList
+ i := 0
+ for j := 0; j < h.Len(); j++ {
+ if less(h.At(i), h.At(j)) {
+ hh.Push(newRun(h, i, j))
+ i = j // start a new run
+ }
+ }
+ // add final run, if any
+ if i < h.Len() {
+ hh.Push(newRun(h, i, h.Len()))
+ }
+
+ return &hh
+}
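+
+// A minimal usage sketch (hypothetical; pair, pairRun, and both closures
+// are made up for this illustration): grouping pairs (x, y) by x with
+// reduce, as described in the RunList comment above.
+//
+//	type pair struct{ x, y int }
+//	type pairRun struct {
+//		x  int
+//		ys []int
+//	}
+//	lessPair := func(a, b interface{}) bool { return a.(pair).x < b.(pair).x }
+//	newPairRun := func(h *RunList, i, j int) interface{} {
+//		ys := make([]int, j-i)
+//		for k := range ys {
+//			ys[k] = h.At(i + k).(pair).y
+//		}
+//		return &pairRun{h.At(i).(pair).x, ys}
+//	}
+//	var h RunList
+//	h.Push(pair{1, 10})
+//	h.Push(pair{2, 30})
+//	h.Push(pair{1, 20})
+//	runs := h.reduce(lessPair, newPairRun) // runs: (1, {10, 20}), (2, {30})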
+
+// ----------------------------------------------------------------------------
+// SpotInfo
+
+// A SpotInfo value describes a particular identifier spot in a given file.
+// It encodes three values: the SpotKind (declaration or use), a line or
+// snippet index "lori", and whether the lori is a line number or a snippet
+// index.
+//
+// The following encoding is used:
+//
+// bits 32 4 1 0
+// value [lori|kind|isIndex]
+//
+type SpotInfo uint32
+
+// SpotKind describes whether an identifier is declared (and what kind of
+// declaration) or used.
+type SpotKind uint32
+
+const (
+ PackageClause SpotKind = iota
+ ImportDecl
+ ConstDecl
+ TypeDecl
+ VarDecl
+ FuncDecl
+ MethodDecl
+ Use
+ nKinds
+)
+
+func init() {
+ // sanity check: if nKinds is too large, the SpotInfo
+ // accessor functions may need to be updated
+ if nKinds > 8 {
+ panic("nKinds > 8")
+ }
+}
+
+// makeSpotInfo makes a SpotInfo.
+func makeSpotInfo(kind SpotKind, lori int, isIndex bool) SpotInfo {
+ // encode lori: bits [4..32)
+ x := SpotInfo(lori) << 4
+ if int(x>>4) != lori {
+ // lori value doesn't fit - since snippet indices are
+ // almost certainly smaller than 1<<28, this can
+ // only happen for line numbers; give it no line number (= 0)
+ x = 0
+ }
+ // encode kind: bits [1..4)
+ x |= SpotInfo(kind) << 1
+ // encode isIndex: bit 0
+ if isIndex {
+ x |= 1
+ }
+ return x
+}
+
+func (x SpotInfo) Kind() SpotKind { return SpotKind(x >> 1 & 7) }
+func (x SpotInfo) Lori() int { return int(x >> 4) }
+func (x SpotInfo) IsIndex() bool { return x&1 != 0 }
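+
+// Encoding round-trip sketch (illustrative only, using the accessors above):
+//
+//	info := makeSpotInfo(VarDecl, 42, false)
+//	info.Kind()    // == VarDecl (bits [1..4))
+//	info.Lori()    // == 42      (bits [4..32))
+//	info.IsIndex() // == false   (bit 0)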
+
+// ----------------------------------------------------------------------------
+// KindRun
+
+// Debugging support. Disable to see multiple entries per line.
+const removeDuplicates = true
+
+// A KindRun is a run of SpotInfos of the same kind in a given file.
+type KindRun struct {
+ Kind SpotKind
+ Infos []SpotInfo
+}
+
+// KindRuns are sorted by line number or index. Since the isIndex bit
+// is always the same for all infos in one list, we can compare loris.
+func (f *KindRun) Len() int { return len(f.Infos) }
+func (f *KindRun) Less(i, j int) bool { return f.Infos[i].Lori() < f.Infos[j].Lori() }
+func (f *KindRun) Swap(i, j int) { f.Infos[i], f.Infos[j] = f.Infos[j], f.Infos[i] }
+
+// FileRun contents are sorted by Kind for the reduction into KindRuns.
+func lessKind(x, y interface{}) bool { return x.(SpotInfo).Kind() < y.(SpotInfo).Kind() }
+
+// newKindRun allocates a new KindRun from the SpotInfo run [i, j) in h.
+func newKindRun(h *RunList, i, j int) interface{} {
+ kind := h.At(i).(SpotInfo).Kind()
+ infos := make([]SpotInfo, j-i)
+ k := 0
+ for ; i < j; i++ {
+ infos[k] = h.At(i).(SpotInfo)
+ k++
+ }
+ run := &KindRun{kind, infos}
+
+ // Spots were sorted by file and kind to create this run.
+ // Within this run, sort them by line number or index.
+ sort.Sort(run)
+
+ if removeDuplicates {
+ // Since both the lori and kind fields must be the
+ // same for duplicates, and since the isIndex
+ // bit is always the same for all infos in one
+ // list, we can simply compare the entire info.
+ k := 0
+ var prev SpotInfo
+ for i, x := range infos {
+ if x != prev || i == 0 {
+ infos[k] = x
+ k++
+ prev = x
+ }
+ }
+ run.Infos = infos[0:k]
+ }
+
+ return run
+}
+
+// ----------------------------------------------------------------------------
+// FileRun
+
+// A Pak describes a Go package.
+type Pak struct {
+ Path string // path of directory containing the package
+ Name string // package name as declared by package clause
+}
+
+// Paks are sorted by name (primary key) and by import path (secondary key).
+func (p *Pak) less(q *Pak) bool {
+ return p.Name < q.Name || p.Name == q.Name && p.Path < q.Path
+}
+
+// A File describes a Go file.
+type File struct {
+ Path string // complete file name
+ Pak Pak // the package to which the file belongs
+}
+
+// A Spot describes a single occurrence of a word.
+type Spot struct {
+ File *File
+ Info SpotInfo
+}
+
+// A FileRun is a list of KindRuns belonging to the same file.
+type FileRun struct {
+ File *File
+ Groups []*KindRun
+}
+
+// Spots are sorted by path for the reduction into FileRuns.
+func lessSpot(x, y interface{}) bool { return x.(Spot).File.Path < y.(Spot).File.Path }
+
+// newFileRun allocates a new FileRun from the Spot run [i, j) in h.
+func newFileRun(h0 *RunList, i, j int) interface{} {
+ file := h0.At(i).(Spot).File
+
+ // reduce the list of Spots into a list of KindRuns
+ var h1 RunList
+ h1.Vector.Resize(j-i, 0)
+ k := 0
+ for ; i < j; i++ {
+ h1.Set(k, h0.At(i).(Spot).Info)
+ k++
+ }
+ h2 := h1.reduce(lessKind, newKindRun)
+
+ // create the FileRun
+ groups := make([]*KindRun, h2.Len())
+ for i := 0; i < h2.Len(); i++ {
+ groups[i] = h2.At(i).(*KindRun)
+ }
+ return &FileRun{file, groups}
+}
+
+// ----------------------------------------------------------------------------
+// PakRun
+
+// A PakRun describes a run of *FileRuns of a package.
+type PakRun struct {
+ Pak Pak
+ Files []*FileRun
+}
+
+// Sorting support for files within a PakRun.
+func (p *PakRun) Len() int { return len(p.Files) }
+func (p *PakRun) Less(i, j int) bool { return p.Files[i].File.Path < p.Files[j].File.Path }
+func (p *PakRun) Swap(i, j int) { p.Files[i], p.Files[j] = p.Files[j], p.Files[i] }
+
+// FileRuns are sorted by package for the reduction into PakRuns.
+func lessFileRun(x, y interface{}) bool {
+ return x.(*FileRun).File.Pak.less(&y.(*FileRun).File.Pak)
+}
+
+// newPakRun allocates a new PakRun from the *FileRun run [i, j) in h.
+func newPakRun(h *RunList, i, j int) interface{} {
+ pak := h.At(i).(*FileRun).File.Pak
+ files := make([]*FileRun, j-i)
+ k := 0
+ for ; i < j; i++ {
+ files[k] = h.At(i).(*FileRun)
+ k++
+ }
+ run := &PakRun{pak, files}
+ sort.Sort(run) // files were sorted by package; sort them by file now
+ return run
+}
+
+// ----------------------------------------------------------------------------
+// HitList
+
+// A HitList describes a list of PakRuns.
+type HitList []*PakRun
+
+// PakRuns are sorted by package.
+func lessPakRun(x, y interface{}) bool { return x.(*PakRun).Pak.less(&y.(*PakRun).Pak) }
+
+func reduce(h0 *RunList) HitList {
+ // reduce a list of Spots into a list of FileRuns
+ h1 := h0.reduce(lessSpot, newFileRun)
+ // reduce a list of FileRuns into a list of PakRuns
+ h2 := h1.reduce(lessFileRun, newPakRun)
+ // sort the list of PakRuns by package
+ h2.sort(lessPakRun)
+ // create a HitList
+ h := make(HitList, h2.Len())
+ for i := 0; i < h2.Len(); i++ {
+ h[i] = h2.At(i).(*PakRun)
+ }
+ return h
+}
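+
+// Shape of the reduction (illustrative; file and package names and the
+// bracket notation are made up): a flat list of Spots such as
+//
+//	{a.go, use@10} {a.go, decl@3} {b.go, use@7}    // a.go, b.go in package p
+//
+// becomes the HitList
+//
+//	[p -> [a.go -> [decl: {3}, use: {10}], b.go -> [use: {7}]]]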
+
+func (h HitList) filter(pakname string) HitList {
+ // determine number of matching packages (most of the time just one)
+ n := 0
+ for _, p := range h {
+ if p.Pak.Name == pakname {
+ n++
+ }
+ }
+ // create filtered HitList
+ hh := make(HitList, n)
+ i := 0
+ for _, p := range h {
+ if p.Pak.Name == pakname {
+ hh[i] = p
+ i++
+ }
+ }
+ return hh
+}
+
+// ----------------------------------------------------------------------------
+// AltWords
+
+type wordPair struct {
+ canon string // canonical word spelling (all lowercase)
+ alt string // alternative spelling
+}
+
+// An AltWords describes a list of alternative spellings for a
+// canonical (all lowercase) spelling of a word.
+type AltWords struct {
+ Canon string // canonical word spelling (all lowercase)
+ Alts []string // alternative spelling for the same word
+}
+
+// wordPairs are sorted by their canonical spelling.
+func lessWordPair(x, y interface{}) bool { return x.(*wordPair).canon < y.(*wordPair).canon }
+
+// newAltWords allocates a new AltWords from the *wordPair run [i, j) in h.
+func newAltWords(h *RunList, i, j int) interface{} {
+ canon := h.At(i).(*wordPair).canon
+ alts := make([]string, j-i)
+ k := 0
+ for ; i < j; i++ {
+ alts[k] = h.At(i).(*wordPair).alt
+ k++
+ }
+ return &AltWords{canon, alts}
+}
+
+func (a *AltWords) filter(s string) *AltWords {
+ if len(a.Alts) == 1 && a.Alts[0] == s {
+ // there are no different alternatives
+ return nil
+ }
+
+ // make a new AltWords with the current spelling removed
+ alts := make([]string, len(a.Alts))
+ i := 0
+ for _, w := range a.Alts {
+ if w != s {
+ alts[i] = w
+ i++
+ }
+ }
+ return &AltWords{a.Canon, alts[0:i]}
+}
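+
+// For example (illustrative): filtering Alts == {"Foo", "FOO", "foo"}
+// with s == "Foo" leaves {"FOO", "foo"}; if Alts == {"foo"} and s == "foo",
+// filter returns nil since no different alternatives remain.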
+
+// ----------------------------------------------------------------------------
+// Indexer
+
+// Adjust these flags as seems best.
+const includeMainPackages = true
+const includeTestFiles = true
+
+type IndexResult struct {
+ Decls RunList // package-level declarations (with snippets)
+ Others RunList // all other occurrences
+}
+
+// Statistics provides statistics information for an index.
+type Statistics struct {
+ Bytes int // total size of indexed source files
+ Files int // number of indexed source files
+ Lines int // number of lines (all files)
+ Words int // number of different identifiers
+ Spots int // number of identifier occurrences
+}
+
+// An Indexer maintains the data structures and provides the machinery
+// for indexing .go files under a file tree. It implements the path.Visitor
+// interface for walking file trees, and the ast.Visitor interface for
+// walking Go ASTs.
+type Indexer struct {
+ fset *token.FileSet // file set for all indexed files
+ sources bytes.Buffer // concatenated sources
+ words map[string]*IndexResult // RunLists of Spots
+ snippets vector.Vector // vector of *Snippets, indexed by snippet indices
+ current *token.File // last file added to file set
+ file *File // AST for current file
+ decl ast.Decl // AST for current decl
+ stats Statistics
+}
+
+func (x *Indexer) addSnippet(s *Snippet) int {
+ index := x.snippets.Len()
+ x.snippets.Push(s)
+ return index
+}
+
+func (x *Indexer) visitComment(c *ast.CommentGroup) {
+ if c != nil {
+ ast.Walk(x, c)
+ }
+}
+
+func (x *Indexer) visitIdent(kind SpotKind, id *ast.Ident) {
+ if id != nil {
+ lists, found := x.words[id.Name]
+ if !found {
+ lists = new(IndexResult)
+ x.words[id.Name] = lists
+ }
+
+ if kind == Use || x.decl == nil {
+ // not a declaration or no snippet required
+ info := makeSpotInfo(kind, x.current.Line(id.Pos()), false)
+ lists.Others.Push(Spot{x.file, info})
+ } else {
+ // a declaration with snippet
+ index := x.addSnippet(NewSnippet(x.fset, x.decl, id))
+ info := makeSpotInfo(kind, index, true)
+ lists.Decls.Push(Spot{x.file, info})
+ }
+
+ x.stats.Spots++
+ }
+}
+
+func (x *Indexer) visitSpec(spec ast.Spec, isVarDecl bool) {
+ switch n := spec.(type) {
+ case *ast.ImportSpec:
+ x.visitComment(n.Doc)
+ x.visitIdent(ImportDecl, n.Name)
+ ast.Walk(x, n.Path)
+ x.visitComment(n.Comment)
+
+ case *ast.ValueSpec:
+ x.visitComment(n.Doc)
+ kind := ConstDecl
+ if isVarDecl {
+ kind = VarDecl
+ }
+ for _, n := range n.Names {
+ x.visitIdent(kind, n)
+ }
+ ast.Walk(x, n.Type)
+ for _, v := range n.Values {
+ ast.Walk(x, v)
+ }
+ x.visitComment(n.Comment)
+
+ case *ast.TypeSpec:
+ x.visitComment(n.Doc)
+ x.visitIdent(TypeDecl, n.Name)
+ ast.Walk(x, n.Type)
+ x.visitComment(n.Comment)
+ }
+}
+
+func (x *Indexer) Visit(node ast.Node) ast.Visitor {
+ // TODO(gri): methods in interface types are categorized as VarDecl
+ switch n := node.(type) {
+ case nil:
+ return nil
+
+ case *ast.Ident:
+ x.visitIdent(Use, n)
+
+ case *ast.Field:
+ x.decl = nil // no snippets for fields
+ x.visitComment(n.Doc)
+ for _, m := range n.Names {
+ x.visitIdent(VarDecl, m)
+ }
+ ast.Walk(x, n.Type)
+ ast.Walk(x, n.Tag)
+ x.visitComment(n.Comment)
+
+ case *ast.DeclStmt:
+ if decl, ok := n.Decl.(*ast.GenDecl); ok {
+ // local declarations can only be *ast.GenDecls
+ x.decl = nil // no snippets for local declarations
+ x.visitComment(decl.Doc)
+ for _, s := range decl.Specs {
+ x.visitSpec(s, decl.Tok == token.VAR)
+ }
+ } else {
+ // handle error case gracefully
+ ast.Walk(x, n.Decl)
+ }
+
+ case *ast.GenDecl:
+ x.decl = n
+ x.visitComment(n.Doc)
+ for _, s := range n.Specs {
+ x.visitSpec(s, n.Tok == token.VAR)
+ }
+
+ case *ast.FuncDecl:
+ x.visitComment(n.Doc)
+ kind := FuncDecl
+ if n.Recv != nil {
+ kind = MethodDecl
+ ast.Walk(x, n.Recv)
+ }
+ x.decl = n
+ x.visitIdent(kind, n.Name)
+ ast.Walk(x, n.Type)
+ if n.Body != nil {
+ ast.Walk(x, n.Body)
+ }
+
+ case *ast.File:
+ x.visitComment(n.Doc)
+ x.decl = nil
+ x.visitIdent(PackageClause, n.Name)
+ for _, d := range n.Decls {
+ ast.Walk(x, d)
+ }
+ // don't visit package level comments for now
+ // to avoid duplicate visiting from individual
+ // nodes
+
+ default:
+ return x
+ }
+
+ return nil
+}
+
+func pkgName(filename string) string {
+ // use a new file set each time in order to not pollute the indexer's
+ // file set (which must stay in sync with the concatenated source code)
+ file, err := parser.ParseFile(token.NewFileSet(), filename, nil, parser.PackageClauseOnly)
+ if err != nil || file == nil {
+ return ""
+ }
+ return file.Name.Name
+}
+
+// addFile adds a file to the index if possible and returns the file set file
+// and the file's AST if it was successfully parsed as a Go file. If addFile
+// failed (that is, if the file was not added), it returns file == nil.
+func (x *Indexer) addFile(filename string, goFile bool) (file *token.File, ast *ast.File) {
+ // open file
+ f, err := fs.Open(filename)
+ if err != nil {
+ return
+ }
+ defer f.Close()
+
+ // The file set's base offset and x.sources size must be in lock-step;
+ // this permits the direct mapping of suffix array lookup results to
+ // the corresponding Pos values.
+ //
+ // When a file is added to the file set, its offset base increases by
+ // the size of the file + 1; and the initial base offset is 1. Add an
+ // extra byte to the sources here.
+ x.sources.WriteByte(0)
+
+ // If the sources length doesn't match the file set base at this point
+ // the file set implementation changed or we have another error.
+ base := x.fset.Base()
+ if x.sources.Len() != base {
+ panic("internal error - file base incorrect")
+ }
+
+ // append file contents (src) to x.sources
+ if _, err := x.sources.ReadFrom(f); err == nil {
+ src := x.sources.Bytes()[base:]
+
+ if goFile {
+ // parse the file and in the process add it to the file set
+ if ast, err = parser.ParseFile(x.fset, filename, src, parser.ParseComments); err == nil {
+ file = x.fset.File(ast.Pos()) // ast.Pos() is inside the file
+ return
+ }
+ // file has parse errors, and the AST may be incorrect -
+ // set lines information explicitly and index as ordinary
+ // text file (cannot fall through to the text case below
+ // because the file has already been added to the file set
+ // by the parser)
+ file = x.fset.File(token.Pos(base)) // token.Pos(base) is inside the file
+ file.SetLinesForContent(src)
+ ast = nil
+ return
+ }
+
+ if isText(src) {
+ // only add the file to the file set (for the full text index)
+ file = x.fset.AddFile(filename, x.fset.Base(), len(src))
+ file.SetLinesForContent(src)
+ return
+ }
+ }
+
+ // discard possibly added data
+ x.sources.Truncate(base - 1) // -1 to remove added byte 0 since no file was added
+ return
+}
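+
+// The lock-step invariant in numbers (assuming the file set offset scheme
+// described above): with the initial base 1 and a first file of size s1,
+// its contents occupy x.sources offsets [1, 1+s1) and file set positions
+// [1, 1+s1); after the extra 0 byte the next file starts at offset and
+// position 1+s1+1. A suffix array hit at sources offset o is therefore
+// directly usable as token.Pos(o).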
+
+// Design note: Using an explicit white list of permitted files for indexing
+// makes sure that the important files are included and massively reduces the
+// number of files to index. The advantage over a blacklist is that unexpected
+// (non-blacklisted) files won't suddenly explode the index.
+//
+// TODO(gri): We may want to make this list customizable, perhaps via a flag.
+
+// Files are whitelisted if they have a file name or extension
+// present as key in whitelisted.
+var whitelisted = map[string]bool{
+ ".bash": true,
+ ".c": true,
+ ".css": true,
+ ".go": true,
+ ".goc": true,
+ ".h": true,
+ ".html": true,
+ ".js": true,
+ ".out": true,
+ ".py": true,
+ ".s": true,
+ ".sh": true,
+ ".txt": true,
+ ".xml": true,
+ "AUTHORS": true,
+ "CONTRIBUTORS": true,
+ "LICENSE": true,
+ "Makefile": true,
+ "PATENTS": true,
+ "README": true,
+}
+
+// isWhitelisted returns true if a file is on the list
+// of "permitted" files for indexing. The filename must
+// be the directory-local name of the file.
+func isWhitelisted(filename string) bool {
+ key := filepath.Ext(filename)
+ if key == "" {
+ // file has no extension - use entire filename
+ key = filename
+ }
+ return whitelisted[key]
+}
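+
+// For instance (illustrative): isWhitelisted("Makefile") and
+// isWhitelisted("page.html") are true (matched by name and by extension,
+// respectively), while isWhitelisted("prog.bin") is false.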
+
+func (x *Indexer) visitFile(dirname string, f FileInfo, fulltextIndex bool) {
+ if !f.IsRegular() {
+ return
+ }
+
+ filename := filepath.Join(dirname, f.Name())
+ goFile := false
+
+ switch {
+ case isGoFile(f):
+ if !includeTestFiles && (!isPkgFile(f) || strings.HasPrefix(filename, "test/")) {
+ return
+ }
+ if !includeMainPackages && pkgName(filename) == "main" {
+ return
+ }
+ goFile = true
+
+ case !fulltextIndex || !isWhitelisted(f.Name()):
+ return
+ }
+
+ file, fast := x.addFile(filename, goFile)
+ if file == nil {
+ return // addFile failed
+ }
+
+ if fast != nil {
+ // we've got a Go file to index
+ x.current = file
+ dir, _ := filepath.Split(filename)
+ pak := Pak{dir, fast.Name.Name}
+ x.file = &File{filename, pak}
+ ast.Walk(x, fast)
+ }
+
+ // update statistics
+ x.stats.Bytes += file.Size()
+ x.stats.Files++
+ x.stats.Lines += file.LineCount()
+}
+
+// ----------------------------------------------------------------------------
+// Index
+
+type LookupResult struct {
+ Decls HitList // package-level declarations (with snippets)
+ Others HitList // all other occurrences
+}
+
+type Index struct {
+ fset *token.FileSet // file set used during indexing; nil if no textindex
+ suffixes *suffixarray.Index // suffixes for concatenated sources; nil if no textindex
+ words map[string]*LookupResult // maps words to hit lists
+ alts map[string]*AltWords // maps canonical(words) to lists of alternative spellings
+ snippets []*Snippet // all snippets, indexed by snippet index
+ stats Statistics
+}
+
+func canonical(w string) string { return strings.ToLower(w) }
+
+// NewIndex creates a new index for the .go files
+// in the directories given by dirnames.
+//
+func NewIndex(dirnames <-chan string, fulltextIndex bool) *Index {
+ var x Indexer
+
+ // initialize Indexer
+ x.fset = token.NewFileSet()
+ x.words = make(map[string]*IndexResult)
+
+ // index all files in the directories given by dirnames
+ for dirname := range dirnames {
+ list, err := fs.ReadDir(dirname)
+ if err != nil {
+ continue // ignore this directory
+ }
+ for _, f := range list {
+ if !f.IsDirectory() {
+ x.visitFile(dirname, f, fulltextIndex)
+ }
+ }
+ }
+
+ if !fulltextIndex {
+ // the file set, the current file, and the sources are
+ // not needed after indexing if no text index is built -
+ // help GC and clear them
+ x.fset = nil
+ x.sources.Reset()
+ x.current = nil // contains reference to fset!
+ }
+
+ // for each word, reduce the RunLists into a LookupResult;
+ // also collect the word with its canonical spelling in a
+ // word list for later computation of alternative spellings
+ words := make(map[string]*LookupResult)
+ var wlist RunList
+ for w, h := range x.words {
+ decls := reduce(&h.Decls)
+ others := reduce(&h.Others)
+ words[w] = &LookupResult{
+ Decls: decls,
+ Others: others,
+ }
+ wlist.Push(&wordPair{canonical(w), w})
+ }
+ x.stats.Words = len(words)
+
+ // reduce the word list {canonical(w), w} into
+ // a list of AltWords runs {canonical(w), {w}}
+ alist := wlist.reduce(lessWordPair, newAltWords)
+
+ // convert alist into a map of alternative spellings
+ alts := make(map[string]*AltWords)
+ for i := 0; i < alist.Len(); i++ {
+ a := alist.At(i).(*AltWords)
+ alts[a.Canon] = a
+ }
+
+ // convert snippet vector into a list
+ snippets := make([]*Snippet, x.snippets.Len())
+ for i := 0; i < x.snippets.Len(); i++ {
+ snippets[i] = x.snippets.At(i).(*Snippet)
+ }
+
+ // create text index
+ var suffixes *suffixarray.Index
+ if fulltextIndex {
+ suffixes = suffixarray.New(x.sources.Bytes())
+ }
+
+ return &Index{x.fset, suffixes, words, alts, snippets, x.stats}
+}
+
+// Stats returns index statistics.
+func (x *Index) Stats() Statistics {
+ return x.stats
+}
+
+func (x *Index) LookupWord(w string) (match *LookupResult, alt *AltWords) {
+ match = x.words[w]
+ alt = x.alts[canonical(w)]
+ // remove current spelling from alternatives
+ // (if there is no match, the alternatives do
+ // not contain the current spelling)
+ if match != nil && alt != nil {
+ alt = alt.filter(w)
+ }
+ return
+}
+
+func isIdentifier(s string) bool {
+ var S scanner.Scanner
+ fset := token.NewFileSet()
+ S.Init(fset.AddFile("", fset.Base(), len(s)), []byte(s), nil, 0)
+ if _, tok, _ := S.Scan(); tok == token.IDENT {
+ _, tok, _ := S.Scan()
+ return tok == token.EOF
+ }
+ return false
+}
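+
+// Illustrative results: isIdentifier("foo") is true; isIdentifier("foo.bar")
+// is false (more than one token); isIdentifier("42") is false (not an IDENT).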
+
+// For a given query, which is either a single identifier or a qualified
+// identifier, Lookup returns a LookupResult and a list of alternative
+// spellings, if any. If the query syntax is wrong, an error is reported.
+func (x *Index) Lookup(query string) (match *LookupResult, alt *AltWords, err os.Error) {
+ ss := strings.Split(query, ".")
+
+ // check query syntax
+ for _, s := range ss {
+ if !isIdentifier(s) {
+ err = os.NewError("all query parts must be identifiers")
+ return
+ }
+ }
+
+ switch len(ss) {
+ case 1:
+ match, alt = x.LookupWord(ss[0])
+
+ case 2:
+ pakname := ss[0]
+ match, alt = x.LookupWord(ss[1])
+ if match != nil {
+ // found a match - filter by package name
+ decls := match.Decls.filter(pakname)
+ others := match.Others.filter(pakname)
+ match = &LookupResult{decls, others}
+ }
+
+ default:
+ err = os.NewError("query is not a (qualified) identifier")
+ }
+
+ return
+}
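+
+// Hypothetical query sketch (the index variable is made up for this
+// illustration):
+//
+//	match, alt, err := index.Lookup("Reader")      // identifier in any package
+//	match, alt, err = index.Lookup("bytes.Reader") // filtered by package bytes
+//	match, alt, err = index.Lookup("a-b")          // err != nil: not an identifier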
+
+func (x *Index) Snippet(i int) *Snippet {
+ // handle illegal snippet indices gracefully
+ if 0 <= i && i < len(x.snippets) {
+ return x.snippets[i]
+ }
+ return nil
+}
+
+type positionList []struct {
+ filename string
+ line int
+}
+
+func (list positionList) Len() int { return len(list) }
+func (list positionList) Less(i, j int) bool { return list[i].filename < list[j].filename }
+func (list positionList) Swap(i, j int) { list[i], list[j] = list[j], list[i] }
+
+// unique returns the list sorted and with duplicate entries removed
+func unique(list []int) []int {
+ sort.Ints(list)
+ var last int
+ i := 0
+ for _, x := range list {
+ if i == 0 || x != last {
+ last = x
+ list[i] = x
+ i++
+ }
+ }
+ return list[0:i]
+}
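+
+// For example (illustrative): unique([]int{3, 1, 3, 2, 1}) sorts the
+// slice in place and returns [1 2 3].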
+
+// A FileLines value specifies a file and line numbers within that file.
+type FileLines struct {
+ Filename string
+ Lines []int
+}
+
+// LookupRegexp returns the number of matches and the matches where a regular
+// expression r is found in the full text index. At most n matches are
+// returned (thus found <= n).
+//
+func (x *Index) LookupRegexp(r *regexp.Regexp, n int) (found int, result []FileLines) {
+ if x.suffixes == nil || n <= 0 {
+ return
+ }
+ // n > 0
+
+ var list positionList
+ // FindAllIndex may return matches that span file boundaries.
+ // Such matches are unlikely, but after eliminating them we may end up
+ // with fewer than n matches. If we don't have enough at the end, redo
+ // the search with an increased value n1, but only if FindAllIndex
+ // returned all the requested matches in the first place (if it
+ // returned fewer than that there cannot be more).
+ for n1 := n; found < n; n1 += n - found {
+ found = 0
+ matches := x.suffixes.FindAllIndex(r, n1)
+ // compute files, exclude matches that span file boundaries,
+ // and map offsets to file-local offsets
+ list = make(positionList, len(matches))
+ for _, m := range matches {
+ // by construction, an offset corresponds to the Pos value
+ // for the file set - use it to get the file and line
+ p := token.Pos(m[0])
+ if file := x.fset.File(p); file != nil {
+ if base := file.Base(); base <= m[1] && m[1] <= base+file.Size() {
+ // match [m[0], m[1]) is within the file boundaries
+ list[found].filename = file.Name()
+ list[found].line = file.Line(p)
+ found++
+ }
+ }
+ }
+ if found == n || len(matches) < n1 {
+ // found all matches or there's no chance to find more
+ break
+ }
+ }
+ list = list[0:found]
+ sort.Sort(list) // sort by filename
+
+ // collect matches belonging to the same file
+ var last string
+ var lines []int
+ addLines := func() {
+ if len(lines) > 0 {
+ // remove duplicate lines
+ result = append(result, FileLines{last, unique(lines)})
+ lines = nil
+ }
+ }
+ for _, m := range list {
+ if m.filename != last {
+ addLines()
+ last = m.filename
+ }
+ lines = append(lines, m.line)
+ }
+ addLines()
+
+ return
+}
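+
+// Hypothetical call sketch (the index variable and the pattern are made
+// up for this illustration; fmt is assumed to be imported by the caller):
+//
+//	found, results := index.LookupRegexp(regexp.MustCompile("TODO"), 100)
+//	for _, fl := range results {
+//		fmt.Println(fl.Filename, fl.Lines)
+//	}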