Diffstat (limited to 'src/cmd/godoc/index.go')
-rw-r--r-- | src/cmd/godoc/index.go | 295
1 file changed, 258 insertions, 37 deletions
diff --git a/src/cmd/godoc/index.go b/src/cmd/godoc/index.go
index 8745b8b0a..ba6fe9acd 100644
--- a/src/cmd/godoc/index.go
+++ b/src/cmd/godoc/index.go
@@ -3,11 +3,11 @@
 // license that can be found in the LICENSE file.

 // This file contains the infrastructure to create an
-// (identifier) index for a set of Go files.
+// identifier and full-text index for a set of Go files.
 //
-// Basic indexing algorithm:
+// Algorithm for identifier index:
 // - traverse all .go files of the file tree specified by root
-// - for each word (identifier) encountered, collect all occurences (spots)
+// - for each word (identifier) encountered, collect all occurrences (spots)
 //   into a list; this produces a list of spots for each word
 // - reduce the lists: from a list of spots to a list of FileRuns,
 //   and from a list of FileRuns into a list of PakRuns
@@ -21,17 +21,34 @@
 // (the line number for spots with snippets is stored in the snippet)
 // - at the end, create lists of alternative spellings for a given
 //   word
+//
+// Algorithm for full text index:
+// - concatenate all source code in a byte buffer (in memory)
+// - add the files to a file set in lockstep as they are added to the byte
+//   buffer such that a byte buffer offset corresponds to the Pos value for
+//   that file location
+// - create a suffix array from the concatenated sources
+//
+// String lookup in full text index:
+// - use the suffix array to look up a string's offsets - the offsets
+//   correspond to the Pos values relative to the file set
+// - translate the Pos values back into file and line information and
+//   sort the result

 package main

 import (
+	"bytes"
 	"container/vector"
 	"go/ast"
 	"go/parser"
 	"go/token"
 	"go/scanner"
+	"index/suffixarray"
+	"io/ioutil"
 	"os"
 	pathutil "path"
+	"regexp"
 	"sort"
 	"strings"
 )
@@ -231,7 +248,7 @@ type File struct {
 }

-// A Spot describes a single occurence of a word.
+// A Spot describes a single occurrence of a word.
 type Spot struct {
 	File *File
 	Info SpotInfo
@@ -419,7 +436,17 @@ const excludeTestFiles = false

 type IndexResult struct {
 	Decls  RunList // package-level declarations (with snippets)
-	Others RunList // all other occurences
+	Others RunList // all other occurrences
+}
+
+
+// Statistics provides statistics information for an index.
+type Statistics struct {
+	Bytes int // total size of indexed source files
+	Files int // number of indexed source files
+	Lines int // number of lines (all files)
+	Words int // number of different identifiers
+	Spots int // number of identifier occurrences
 }
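The lockstep invariant described in the new file comment is worth seeing concretely: because every file is registered with the token.FileSet at a base equal to the current length of the padded source buffer, a suffix array offset converts straight into a token.Pos. Below is a minimal, hypothetical sketch of that idea in present-day Go (file names and sources are invented for illustration):

    package main

    import (
        "bytes"
        "fmt"
        "go/token"
        "index/suffixarray"
    )

    func main() {
        fset := token.NewFileSet()
        var buf bytes.Buffer

        for i, src := range []string{"package a\n", "package b\n"} {
            buf.WriteByte(0) // pad: a FileSet base advances by file size + 1
            if buf.Len() != fset.Base() {
                panic("buffer and file set out of sync")
            }
            f := fset.AddFile(fmt.Sprintf("f%d.go", i), fset.Base(), len(src))
            f.SetLinesForContent([]byte(src))
            buf.WriteString(src)
        }

        // a suffix array offset into buf is directly usable as a token.Pos
        sa := suffixarray.New(buf.Bytes())
        for _, off := range sa.Lookup([]byte("package b"), -1) {
            fmt.Println(fset.Position(token.Pos(off))) // f1.go:1:1
        }
    }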
@@ -428,11 +455,14 @@ type IndexResult struct {
 // interface for walking file trees, and the ast.Visitor interface for
 // walking Go ASTs.
 type Indexer struct {
+	fset     *token.FileSet          // file set for all indexed files
+	sources  bytes.Buffer            // concatenated sources
 	words    map[string]*IndexResult // RunLists of Spots
 	snippets vector.Vector           // vector of *Snippets, indexed by snippet indices
-	file     *File                   // current file
-	decl     ast.Decl                // current decl
-	nspots   int                     // number of spots encountered
+	current  *token.File             // last file added to file set
+	file     *File                   // AST for current file
+	decl     ast.Decl                // AST for current decl
+	stats    Statistics
 }

@@ -452,24 +482,24 @@ func (x *Indexer) visitComment(c *ast.CommentGroup) {

 func (x *Indexer) visitIdent(kind SpotKind, id *ast.Ident) {
 	if id != nil {
-		lists, found := x.words[id.Name()]
+		lists, found := x.words[id.Name]
 		if !found {
 			lists = new(IndexResult)
-			x.words[id.Name()] = lists
+			x.words[id.Name] = lists
 		}

 		if kind == Use || x.decl == nil {
 			// not a declaration or no snippet required
-			info := makeSpotInfo(kind, id.Pos().Line, false)
+			info := makeSpotInfo(kind, x.current.Line(id.Pos()), false)
 			lists.Others.Push(Spot{x.file, info})
 		} else {
 			// a declaration with snippet
-			index := x.addSnippet(NewSnippet(x.decl, id))
+			index := x.addSnippet(NewSnippet(x.fset, x.decl, id))
 			info := makeSpotInfo(kind, index, true)
 			lists.Decls.Push(Spot{x.file, info})
 		}
-		x.nspots++
+		x.stats.Spots++
 	}
 }

@@ -506,7 +536,7 @@ func (x *Indexer) visitSpec(spec ast.Spec, isVarDecl bool) {
 }

-func (x *Indexer) Visit(node interface{}) ast.Visitor {
+func (x *Indexer) Visit(node ast.Node) ast.Visitor {
 	// TODO(gri): methods in interface types are categorized as VarDecl
 	switch n := node.(type) {
 	case nil:
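The Visit signature change from interface{} to ast.Node reflects the stricter ast.Visitor interface used with ast.Walk. A self-contained sketch of the same traversal pattern — collecting identifier occurrences the way visitIdent does — with illustrative names (identCollector is not part of the change):

    package main

    import (
        "fmt"
        "go/ast"
        "go/parser"
        "go/token"
    )

    type identCollector struct {
        fset  *token.FileSet
        spots map[string][]token.Position // word -> occurrences
    }

    // Visit implements ast.Visitor; returning the receiver keeps the walk going.
    func (c *identCollector) Visit(node ast.Node) ast.Visitor {
        if id, ok := node.(*ast.Ident); ok {
            c.spots[id.Name] = append(c.spots[id.Name], c.fset.Position(id.Pos()))
        }
        return c
    }

    func main() {
        fset := token.NewFileSet()
        file, err := parser.ParseFile(fset, "x.go", "package p\nvar answer = 42\n", 0)
        if err != nil {
            panic(err)
        }
        c := &identCollector{fset: fset, spots: make(map[string][]token.Position)}
        ast.Walk(c, file)
        fmt.Println(c.spots["answer"]) // [x.go:2:5]
    }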
@@ -578,16 +608,69 @@ func (x *Indexer) Visit(node interface{}) ast.Visitor {
 }

-func (x *Indexer) VisitDir(path string, f *os.FileInfo) bool {
-	return true
+func pkgName(filename string) string {
+	// use a new file set each time in order to not pollute the indexer's
+	// file set (which must stay in sync with the concatenated source code)
+	file, err := parser.ParseFile(token.NewFileSet(), filename, nil, parser.PackageClauseOnly)
+	if err != nil || file == nil {
+		return ""
+	}
+	return file.Name.Name
+}
+
+
+func (x *Indexer) addFile(filename string) *ast.File {
+	// open file
+	f, err := os.Open(filename, os.O_RDONLY, 0)
+	if err != nil {
+		return nil
+	}
+	defer f.Close()
+
+	// The file set's base offset and x.sources size must be in lock-step;
+	// this permits the direct mapping of suffix array lookup results to
+	// corresponding Pos values.
+	//
+	// When a file is added to the file set, its offset base increases by
+	// the size of the file + 1; and the initial base offset is 1. Add an
+	// extra byte to the sources here.
+	x.sources.WriteByte(0)
+
+	// If the sources length doesn't match the file set base at this point,
+	// the file set implementation changed or we have another error.
+	base := x.fset.Base()
+	if x.sources.Len() != base {
+		panic("internal error - file base incorrect")
+	}
+
+	// append file contents to x.sources
+	if _, err := x.sources.ReadFrom(f); err != nil {
+		x.sources.Truncate(base) // discard possibly added data
+		return nil               // ignore files with I/O errors
+	}
+
+	// parse the file and in the process add it to the file set
+	src := x.sources.Bytes()[base:] // no need to reread the file
+	file, err := parser.ParseFile(x.fset, filename, src, parser.ParseComments)
+	if err != nil {
+		// do not discard the added source code in this case
+		// because the file has been added to the file set and
+		// the source size must match the file set base
+		// TODO(gri): given a FileSet.RemoveFile() one might be
+		// able to discard the data here (worthwhile?)
+		return nil // ignore files with (parse) errors
+	}
+
+	return file
 }

-func (x *Indexer) VisitFile(path string, f *os.FileInfo) {
+func (x *Indexer) visitFile(dirname string, f *os.FileInfo) {
 	if !isGoFile(f) {
 		return
 	}

+	path := pathutil.Join(dirname, f.Name)
 	if excludeTestFiles && (!isPkgFile(f) || strings.HasPrefix(path, "test/")) {
 		return
 	}
@@ -596,15 +679,23 @@ func (x *Indexer) VisitFile(path string, f *os.FileInfo) {
 		return
 	}

-	file, err := parser.ParseFile(path, nil, nil, parser.ParseComments)
-	if err != nil {
-		return // ignore files with (parse) errors
+	file := x.addFile(path)
+	if file == nil {
+		return
 	}

+	// we've got a file to index
+	x.current = x.fset.File(file.Pos()) // file.Pos is in the current file
 	dir, _ := pathutil.Split(path)
-	pak := Pak{dir, file.Name.Name()}
+	pak := Pak{dir, file.Name.Name}
 	x.file = &File{path, pak}
 	ast.Walk(x, file)
+
+	// update statistics
+	// (count real file size as opposed to using the padded x.sources.Len())
+	x.stats.Bytes += x.current.Size()
+	x.stats.Files++
+	x.stats.Lines += x.current.LineCount()
 }
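Since parser.ParseFile registers the parsed file with the file set as a side effect, visitFile can recover the *token.File behind the AST from any Pos and read the per-file figures it accumulates. A small hypothetical sketch of that mechanism:

    package main

    import (
        "fmt"
        "go/parser"
        "go/token"
    )

    func main() {
        fset := token.NewFileSet()
        src := "package p\n\nfunc f() {}\n"
        // ParseFile adds the file to fset as a side effect...
        file, err := parser.ParseFile(fset, "p.go", src, parser.ParseComments)
        if err != nil {
            panic(err)
        }
        // ...so the *token.File can be looked up from any Pos in the AST,
        // yielding the statistics visitFile records (size, line count).
        tf := fset.File(file.Pos())
        fmt.Println(tf.Name(), tf.Size(), tf.LineCount()) // p.go 23 3
    }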
@@ -613,30 +704,54 @@

 type LookupResult struct {
 	Decls  HitList // package-level declarations (with snippets)
-	Others HitList // all other occurences
+	Others HitList // all other occurrences
 }

 type Index struct {
+	fset     *token.FileSet           // file set used during indexing; nil if no textindex
+	suffixes *suffixarray.Index       // suffixes for concatenated sources; nil if no textindex
 	words    map[string]*LookupResult // maps words to hit lists
 	alts     map[string]*AltWords     // maps canonical(words) to lists of alternative spellings
 	snippets []*Snippet               // all snippets, indexed by snippet index
-	nspots   int                      // number of spots indexed (a measure of the index size)
+	stats    Statistics
 }

 func canonical(w string) string { return strings.ToLower(w) }

-// NewIndex creates a new index for the file tree rooted at root.
-func NewIndex(root string) *Index {
+// NewIndex creates a new index for the .go files
+// in the directories given by dirnames.
+//
+func NewIndex(dirnames <-chan string, fulltextIndex bool) *Index {
 	var x Indexer

 	// initialize Indexer
+	x.fset = token.NewFileSet()
 	x.words = make(map[string]*IndexResult)

-	// collect all Spots
-	pathutil.Walk(root, &x, nil)
+	// index all files in the directories given by dirnames
+	for dirname := range dirnames {
+		list, err := ioutil.ReadDir(dirname)
+		if err != nil {
+			continue // ignore this directory
+		}
+		for _, f := range list {
+			if !f.IsDirectory() {
+				x.visitFile(dirname, f)
+			}
+		}
+	}
+
+	if !fulltextIndex {
+		// the file set, the current file, and the sources are
+		// not needed after indexing if no text index is built -
+		// help GC and clear them
+		x.fset = nil
+		x.sources.Reset()
+		x.current = nil // contains reference to fset!
+	}

 	// for each word, reduce the RunLists into a LookupResult;
 	// also collect the word with its canonical spelling in a
@@ -652,6 +767,7 @@
 		}
 		wlist.Push(&wordPair{canonical(w), w})
 	}
+	x.stats.Words = len(words)

 	// reduce the word list {canonical(w), w} into
 	// a list of AltWords runs {canonical(w), {w}}
@@ -670,14 +786,19 @@
 		snippets[i] = x.snippets.At(i).(*Snippet)
 	}

-	return &Index{words, alts, snippets, x.nspots}
+	// create text index
+	var suffixes *suffixarray.Index
+	if fulltextIndex {
+		suffixes = suffixarray.New(x.sources.Bytes())
+	}
+
+	return &Index{x.fset, suffixes, words, alts, snippets, x.stats}
 }

-// Size returns the number of different words and
-// spots indexed as a measure for the index size.
-func (x *Index) Size() (nwords int, nspots int) {
-	return len(x.words), x.nspots
+// Stats returns index statistics.
+func (x *Index) Stats() Statistics {
+	return x.stats
 }

@@ -696,7 +817,7 @@ func (x *Index) LookupWord(w string) (match *LookupResult, alt *AltWords) {

 func isIdentifier(s string) bool {
 	var S scanner.Scanner
-	S.Init("", []byte(s), nil, 0)
+	S.Init(token.NewFileSet(), "", []byte(s), nil, 0)
 	if _, tok, _ := S.Scan(); tok == token.IDENT {
 		_, tok, _ := S.Scan()
 		return tok == token.EOF
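The new NewIndex signature consumes directory names from a channel instead of walking a tree itself, so a producer goroutine can feed it asynchronously. A hedged sketch of that shape (indexDirs is an invented stand-in for NewIndex's loop, and os.ReadDir stands in for the era's ioutil.ReadDir):

    package main

    import (
        "fmt"
        "os"
        "path/filepath"
    )

    func indexDirs(dirnames <-chan string) {
        for dirname := range dirnames {
            list, err := os.ReadDir(dirname)
            if err != nil {
                continue // ignore unreadable directories, as NewIndex does
            }
            for _, f := range list {
                if !f.IsDir() && filepath.Ext(f.Name()) == ".go" {
                    fmt.Println("would index", filepath.Join(dirname, f.Name()))
                }
            }
        }
    }

    func main() {
        dirnames := make(chan string)
        go func() {
            defer close(dirnames) // the range loop in indexDirs ends on close
            dirnames <- "."
        }()
        indexDirs(dirnames)
    }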
@@ -707,14 +828,14 @@

 // For a given query, which is either a single identifier or a qualified
 // identifier, Lookup returns a LookupResult, and a list of alternative
-// spellings, if any. If the query syntax is wrong, illegal is set.
-func (x *Index) Lookup(query string) (match *LookupResult, alt *AltWords, illegal bool) {
+// spellings, if any. If the query syntax is wrong, an error is reported.
+func (x *Index) Lookup(query string) (match *LookupResult, alt *AltWords, err os.Error) {
 	ss := strings.Split(query, ".", -1)

 	// check query syntax
 	for _, s := range ss {
 		if !isIdentifier(s) {
-			illegal = true
+			err = os.NewError("all query parts must be identifiers")
 			return
 		}
 	}
@@ -734,7 +855,7 @@ func (x *Index) Lookup(query string) (match *LookupResult, alt *AltWords, illega
 		}

 	default:
-		illegal = true
+		err = os.NewError("query is not a (qualified) identifier")
 	}

 	return
@@ -748,3 +869,103 @@ func (x *Index) Snippet(i int) *Snippet {
 	}
 	return nil
 }
+
+
+type positionList []struct {
+	filename string
+	line     int
+}
+
+func (list positionList) Len() int           { return len(list) }
+func (list positionList) Less(i, j int) bool { return list[i].filename < list[j].filename }
+func (list positionList) Swap(i, j int)      { list[i], list[j] = list[j], list[i] }
+
+
+// unique returns the list sorted and with duplicate entries removed
+func unique(list []int) []int {
+	sort.SortInts(list)
+	var last int
+	i := 0
+	for _, x := range list {
+		if i == 0 || x != last {
+			last = x
+			list[i] = x
+			i++
+		}
+	}
+	return list[0:i]
+}
+
+
+// A FileLines value specifies a file and line numbers within that file.
+type FileLines struct {
+	Filename string
+	Lines    []int
+}
+
+
+// LookupRegexp returns the number of matches and the matches where a regular
+// expression r is found in the full text index. At most n matches are
+// returned (thus found <= n).
+//
+func (x *Index) LookupRegexp(r *regexp.Regexp, n int) (found int, result []FileLines) {
+	if x.suffixes == nil || n <= 0 {
+		return
+	}
+	// n > 0
+
+	var list positionList
+	// FindAllIndex may return matches that span across file boundaries.
+	// Such matches are unlikely, but after eliminating them we may end up
+	// with fewer than n matches. If we don't have enough at the end, redo
+	// the search with an increased value n1, but only if FindAllIndex
+	// returned all the requested matches in the first place (if it
+	// returned fewer than that there cannot be more).
+	for n1 := n; found < n; n1 += n - found {
+		found = 0
+		matches := x.suffixes.FindAllIndex(r, n1)
+		// compute files, exclude matches that span file boundaries,
+		// and map offsets to file-local offsets
+		list = make(positionList, len(matches))
+		for _, m := range matches {
+			// by construction, an offset corresponds to the Pos value
+			// for the file set - use it to get the file and line
+			p := token.Pos(m[0])
+			if file := x.fset.File(p); file != nil {
+				if base := file.Base(); base <= m[1] && m[1] <= base+file.Size() {
+					// match [m[0], m[1]) is within the file boundaries
+					list[found].filename = file.Name()
+					list[found].line = file.Line(p)
+					found++
+				}
+			}
+		}
+		if found == n || len(matches) < n1 {
+			// found all matches or there's no chance to find more
+			break
+		}
+	}
+	list = list[0:found]
+	sort.Sort(list) // sort by filename
+
+	// collect matches belonging to the same file
+	var last string
+	var lines []int
+	addLines := func() {
+		if len(lines) > 0 {
+			// remove duplicate lines
+			result = append(result, FileLines{last, unique(lines)})
+			lines = nil
+		}
+	}
+	for _, m := range list {
+		if m.filename != last {
+			addLines()
+			last = m.filename
+		}
+		lines = append(lines, m.line)
+	}
+	addLines()
+
+	return
+}
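LookupRegexp's core is suffixarray.FindAllIndex over the concatenated sources, followed by translating match offsets back through the file set. A simplified, hypothetical sketch of that pipeline (it omits the retry loop and the boundary-spanning check the real code performs; file names and sources are invented):

    package main

    import (
        "fmt"
        "go/token"
        "index/suffixarray"
        "regexp"
    )

    func main() {
        fset := token.NewFileSet()
        files := []struct{ name, src string }{
            {"a.go", "package a\nfunc Foo() {}\n"},
            {"b.go", "package b\nfunc FooBar() {}\n"},
        }

        // concatenate sources in lockstep with the file set
        var data []byte
        for _, fl := range files {
            data = append(data, 0) // padding byte keeps offsets aligned with Pos values
            f := fset.AddFile(fl.name, fset.Base(), len(fl.src))
            f.SetLinesForContent([]byte(fl.src))
            data = append(data, fl.src...)
        }

        // run the regexp over the suffix array and map offsets to file:line
        sa := suffixarray.New(data)
        for _, m := range sa.FindAllIndex(regexp.MustCompile(`Foo\w*`), -1) {
            p := token.Pos(m[0])
            if file := fset.File(p); file != nil {
                fmt.Printf("%s:%d\n", file.Name(), file.Line(p)) // a.go:2, b.go:2
            }
        }
    }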