Diffstat (limited to 'src/cmd/godoc/index.go')
-rw-r--r-- | src/cmd/godoc/index.go | 295
1 file changed, 258 insertions, 37 deletions
diff --git a/src/cmd/godoc/index.go b/src/cmd/godoc/index.go
index 8745b8b0a..ba6fe9acd 100644
--- a/src/cmd/godoc/index.go
+++ b/src/cmd/godoc/index.go
@@ -3,11 +3,11 @@
 // license that can be found in the LICENSE file.

 // This file contains the infrastructure to create an
-// (identifier) index for a set of Go files.
+// identifier and full-text index for a set of Go files.
 //
-// Basic indexing algorithm:
+// Algorithm for identifier index:
 // - traverse all .go files of the file tree specified by root
-// - for each word (identifier) encountered, collect all occurences (spots)
+// - for each word (identifier) encountered, collect all occurrences (spots)
 //   into a list; this produces a list of spots for each word
 // - reduce the lists: from a list of spots to a list of FileRuns,
 //   and from a list of FileRuns into a list of PakRuns
@@ -21,17 +21,34 @@
 // (the line number for spots with snippets is stored in the snippet)
 // - at the end, create lists of alternative spellings for a given
 //   word
+//
+// Algorithm for full text index:
+// - concatenate all source code in a byte buffer (in memory)
+// - add the files to a file set in lockstep as they are added to the byte
+//   buffer such that a byte buffer offset corresponds to the Pos value for
+//   that file location
+// - create a suffix array from the concatenated sources
+//
+// String lookup in full text index:
+// - use the suffix array to look up a string's offsets - the offsets
+//   correspond to the Pos values relative to the file set
+// - translate the Pos values back into file and line information and
+//   sort the result

 package main

 import (
+	"bytes"
 	"container/vector"
 	"go/ast"
 	"go/parser"
 	"go/token"
 	"go/scanner"
+	"index/suffixarray"
+	"io/ioutil"
 	"os"
 	pathutil "path"
+	"regexp"
 	"sort"
 	"strings"
 )
@@ -231,7 +248,7 @@ type File struct {
 }

-// A Spot describes a single occurence of a word.
+// A Spot describes a single occurrence of a word.
 type Spot struct {
 	File *File
 	Info SpotInfo
@@ -419,7 +436,17 @@ const excludeTestFiles = false

 type IndexResult struct {
 	Decls  RunList // package-level declarations (with snippets)
-	Others RunList // all other occurences
+	Others RunList // all other occurrences
+}
+
+
+// Statistics provides statistics information for an index.
+type Statistics struct {
+	Bytes int // total size of indexed source files
+	Files int // number of indexed source files
+	Lines int // number of lines (all files)
+	Words int // number of different identifiers
+	Spots int // number of identifier occurrences
 }
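The lockstep invariant described in the new file comment is worth seeing concretely: because every file is registered with the token.FileSet at a base equal to the current length of the padded source buffer, a suffix array offset converts straight into a token.Pos. Below is a minimal, hypothetical sketch of that idea in present-day Go (file names and sources are invented for illustration):

    package main

    import (
        "bytes"
        "fmt"
        "go/token"
        "index/suffixarray"
    )

    func main() {
        fset := token.NewFileSet()
        var buf bytes.Buffer

        for i, src := range []string{"package a\n", "package b\n"} {
            buf.WriteByte(0) // pad: a FileSet base advances by file size + 1
            if buf.Len() != fset.Base() {
                panic("buffer and file set out of sync")
            }
            f := fset.AddFile(fmt.Sprintf("f%d.go", i), fset.Base(), len(src))
            f.SetLinesForContent([]byte(src))
            buf.WriteString(src)
        }

        // a suffix array offset into buf is directly usable as a token.Pos
        sa := suffixarray.New(buf.Bytes())
        for _, off := range sa.Lookup([]byte("package b"), -1) {
            fmt.Println(fset.Position(token.Pos(off))) // f1.go:1:1
        }
    }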
@@ -428,11 +455,14 @@ type IndexResult struct {
 // interface for walking file trees, and the ast.Visitor interface for
 // walking Go ASTs.
 type Indexer struct {
+	fset     *token.FileSet          // file set for all indexed files
+	sources  bytes.Buffer            // concatenated sources
 	words    map[string]*IndexResult // RunLists of Spots
 	snippets vector.Vector           // vector of *Snippets, indexed by snippet indices
-	file     *File                   // current file
-	decl     ast.Decl                // current decl
-	nspots   int                     // number of spots encountered
+	current  *token.File             // last file added to file set
+	file     *File                   // AST for current file
+	decl     ast.Decl                // AST for current decl
+	stats    Statistics
 }

@@ -452,24 +482,24 @@ func (x *Indexer) visitComment(c *ast.CommentGroup) {

 func (x *Indexer) visitIdent(kind SpotKind, id *ast.Ident) {
 	if id != nil {
-		lists, found := x.words[id.Name()]
+		lists, found := x.words[id.Name]
 		if !found {
 			lists = new(IndexResult)
-			x.words[id.Name()] = lists
+			x.words[id.Name] = lists
 		}

 		if kind == Use || x.decl == nil {
 			// not a declaration or no snippet required
-			info := makeSpotInfo(kind, id.Pos().Line, false)
+			info := makeSpotInfo(kind, x.current.Line(id.Pos()), false)
 			lists.Others.Push(Spot{x.file, info})
 		} else {
 			// a declaration with snippet
-			index := x.addSnippet(NewSnippet(x.decl, id))
+			index := x.addSnippet(NewSnippet(x.fset, x.decl, id))
 			info := makeSpotInfo(kind, index, true)
 			lists.Decls.Push(Spot{x.file, info})
 		}
-		x.nspots++
+		x.stats.Spots++
 	}
 }

@@ -506,7 +536,7 @@ func (x *Indexer) visitSpec(spec ast.Spec, isVarDecl bool) {
 }

-func (x *Indexer) Visit(node interface{}) ast.Visitor {
+func (x *Indexer) Visit(node ast.Node) ast.Visitor {
 	// TODO(gri): methods in interface types are categorized as VarDecl
 	switch n := node.(type) {
 	case nil:
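The Visit signature change from interface{} to ast.Node reflects the stricter ast.Visitor interface used with ast.Walk. A self-contained sketch of the same traversal pattern — collecting identifier occurrences the way visitIdent does — with illustrative names (identCollector is not part of the change):

    package main

    import (
        "fmt"
        "go/ast"
        "go/parser"
        "go/token"
    )

    type identCollector struct {
        fset  *token.FileSet
        spots map[string][]token.Position // word -> occurrences
    }

    // Visit implements ast.Visitor; returning the receiver keeps the walk going.
    func (c *identCollector) Visit(node ast.Node) ast.Visitor {
        if id, ok := node.(*ast.Ident); ok {
            c.spots[id.Name] = append(c.spots[id.Name], c.fset.Position(id.Pos()))
        }
        return c
    }

    func main() {
        fset := token.NewFileSet()
        file, err := parser.ParseFile(fset, "x.go", "package p\nvar answer = 42\n", 0)
        if err != nil {
            panic(err)
        }
        c := &identCollector{fset: fset, spots: make(map[string][]token.Position)}
        ast.Walk(c, file)
        fmt.Println(c.spots["answer"]) // [x.go:2:5]
    }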
@@ -578,16 +608,69 @@ func (x *Indexer) Visit(node interface{}) ast.Visitor {
 }

-func (x *Indexer) VisitDir(path string, f *os.FileInfo) bool {
-	return true
+func pkgName(filename string) string {
+	// use a new file set each time in order to not pollute the indexer's
+	// file set (which must stay in sync with the concatenated source code)
+	file, err := parser.ParseFile(token.NewFileSet(), filename, nil, parser.PackageClauseOnly)
+	if err != nil || file == nil {
+		return ""
+	}
+	return file.Name.Name
+}
+
+
+func (x *Indexer) addFile(filename string) *ast.File {
+	// open file
+	f, err := os.Open(filename, os.O_RDONLY, 0)
+	if err != nil {
+		return nil
+	}
+	defer f.Close()
+
+	// The file set's base offset and x.sources size must be in lock-step;
+	// this permits the direct mapping of suffix array lookup results to
+	// corresponding Pos values.
+	//
+	// When a file is added to the file set, its offset base increases by
+	// the size of the file + 1; and the initial base offset is 1. Add an
+	// extra byte to the sources here.
+	x.sources.WriteByte(0)
+
+	// If the sources length doesn't match the file set base at this point,
+	// the file set implementation changed or we have another error.
+	base := x.fset.Base()
+	if x.sources.Len() != base {
+		panic("internal error - file base incorrect")
+	}
+
+	// append file contents to x.sources
+	if _, err := x.sources.ReadFrom(f); err != nil {
+		x.sources.Truncate(base) // discard possibly added data
+		return nil               // ignore files with I/O errors
+	}
+
+	// parse the file and in the process add it to the file set
+	src := x.sources.Bytes()[base:] // no need to reread the file
+	file, err := parser.ParseFile(x.fset, filename, src, parser.ParseComments)
+	if err != nil {
+		// do not discard the added source code in this case
+		// because the file has been added to the file set and
+		// the source size must match the file set base
+		// TODO(gri): given a FileSet.RemoveFile() one might be
+		// able to discard the data here (worthwhile?)
+		return nil // ignore files with (parse) errors
+	}
+
+	return file
 }

-func (x *Indexer) VisitFile(path string, f *os.FileInfo) {
+func (x *Indexer) visitFile(dirname string, f *os.FileInfo) {
 	if !isGoFile(f) {
 		return
 	}

+	path := pathutil.Join(dirname, f.Name)
 	if excludeTestFiles && (!isPkgFile(f) || strings.HasPrefix(path, "test/")) {
 		return
 	}
@@ -596,15 +679,23 @@ func (x *Indexer) VisitFile(path string, f *os.FileInfo) {
 		return
 	}

-	file, err := parser.ParseFile(path, nil, nil, parser.ParseComments)
-	if err != nil {
-		return // ignore files with (parse) errors
+	file := x.addFile(path)
+	if file == nil {
+		return
 	}

+	// we've got a file to index
+	x.current = x.fset.File(file.Pos()) // file.Pos is in the current file
 	dir, _ := pathutil.Split(path)
-	pak := Pak{dir, file.Name.Name()}
+	pak := Pak{dir, file.Name.Name}
 	x.file = &File{path, pak}
 	ast.Walk(x, file)
+
+	// update statistics
+	// (count real file size as opposed to using the padded x.sources.Len())
+	x.stats.Bytes += x.current.Size()
+	x.stats.Files++
+	x.stats.Lines += x.current.LineCount()
 }
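Since parser.ParseFile registers the parsed file with the file set as a side effect, visitFile can recover the *token.File behind the AST from any Pos and read the per-file figures it accumulates. A small hypothetical sketch of that mechanism:

    package main

    import (
        "fmt"
        "go/parser"
        "go/token"
    )

    func main() {
        fset := token.NewFileSet()
        src := "package p\n\nfunc f() {}\n"
        // ParseFile adds the file to fset as a side effect...
        file, err := parser.ParseFile(fset, "p.go", src, parser.ParseComments)
        if err != nil {
            panic(err)
        }
        // ...so the *token.File can be looked up from any Pos in the AST,
        // yielding the statistics visitFile records (size, line count).
        tf := fset.File(file.Pos())
        fmt.Println(tf.Name(), tf.Size(), tf.LineCount()) // p.go 23 3
    }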
@@ -613,30 +704,54 @@

 type LookupResult struct {
 	Decls  HitList // package-level declarations (with snippets)
-	Others HitList // all other occurences
+	Others HitList // all other occurrences
 }

 type Index struct {
+	fset     *token.FileSet           // file set used during indexing; nil if no textindex
+	suffixes *suffixarray.Index       // suffixes for concatenated sources; nil if no textindex
 	words    map[string]*LookupResult // maps words to hit lists
 	alts     map[string]*AltWords     // maps canonical(words) to lists of alternative spellings
 	snippets []*Snippet               // all snippets, indexed by snippet index
-	nspots   int                      // number of spots indexed (a measure of the index size)
+	stats    Statistics
 }

 func canonical(w string) string { return strings.ToLower(w) }

-// NewIndex creates a new index for the file tree rooted at root.
-func NewIndex(root string) *Index {
+// NewIndex creates a new index for the .go files
+// in the directories given by dirnames.
+//
+func NewIndex(dirnames <-chan string, fulltextIndex bool) *Index {
 	var x Indexer

 	// initialize Indexer
+	x.fset = token.NewFileSet()
 	x.words = make(map[string]*IndexResult)

-	// collect all Spots
-	pathutil.Walk(root, &x, nil)
+	// index all files in the directories given by dirnames
+	for dirname := range dirnames {
+		list, err := ioutil.ReadDir(dirname)
+		if err != nil {
+			continue // ignore this directory
+		}
+		for _, f := range list {
+			if !f.IsDirectory() {
+				x.visitFile(dirname, f)
+			}
+		}
+	}
+
+	if !fulltextIndex {
+		// the file set, the current file, and the sources are
+		// not needed after indexing if no text index is built -
+		// help GC and clear them
+		x.fset = nil
+		x.sources.Reset()
+		x.current = nil // contains reference to fset!
+	}

 	// for each word, reduce the RunLists into a LookupResult;
 	// also collect the word with its canonical spelling in a
@@ -652,6 +767,7 @@
 		}
 		wlist.Push(&wordPair{canonical(w), w})
 	}
+	x.stats.Words = len(words)

 	// reduce the word list {canonical(w), w} into
 	// a list of AltWords runs {canonical(w), {w}}
@@ -670,14 +786,19 @@
 		snippets[i] = x.snippets.At(i).(*Snippet)
 	}

-	return &Index{words, alts, snippets, x.nspots}
+	// create text index
+	var suffixes *suffixarray.Index
+	if fulltextIndex {
+		suffixes = suffixarray.New(x.sources.Bytes())
+	}
+
+	return &Index{x.fset, suffixes, words, alts, snippets, x.stats}
 }

-// Size returns the number of different words and
-// spots indexed as a measure for the index size.
-func (x *Index) Size() (nwords int, nspots int) {
-	return len(x.words), x.nspots
+// Stats returns index statistics.
+func (x *Index) Stats() Statistics {
+	return x.stats
 }

@@ -696,7 +817,7 @@ func (x *Index) LookupWord(w string) (match *LookupResult, alt *AltWords) {

 func isIdentifier(s string) bool {
 	var S scanner.Scanner
-	S.Init("", []byte(s), nil, 0)
+	S.Init(token.NewFileSet(), "", []byte(s), nil, 0)
 	if _, tok, _ := S.Scan(); tok == token.IDENT {
 		_, tok, _ := S.Scan()
 		return tok == token.EOF
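The new NewIndex signature consumes directory names from a channel instead of walking a tree itself, so a producer goroutine can feed it asynchronously. A hedged sketch of that shape (indexDirs is an invented stand-in for NewIndex's loop, and os.ReadDir stands in for the era's ioutil.ReadDir):

    package main

    import (
        "fmt"
        "os"
        "path/filepath"
    )

    func indexDirs(dirnames <-chan string) {
        for dirname := range dirnames {
            list, err := os.ReadDir(dirname)
            if err != nil {
                continue // ignore unreadable directories, as NewIndex does
            }
            for _, f := range list {
                if !f.IsDir() && filepath.Ext(f.Name()) == ".go" {
                    fmt.Println("would index", filepath.Join(dirname, f.Name()))
                }
            }
        }
    }

    func main() {
        dirnames := make(chan string)
        go func() {
            defer close(dirnames) // the range loop in indexDirs ends on close
            dirnames <- "."
        }()
        indexDirs(dirnames)
    }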
@@ -707,14 +828,14 @@

 // For a given query, which is either a single identifier or a qualified
 // identifier, Lookup returns a LookupResult, and a list of alternative
-// spellings, if any. If the query syntax is wrong, illegal is set.
-func (x *Index) Lookup(query string) (match *LookupResult, alt *AltWords, illegal bool) {
+// spellings, if any. If the query syntax is wrong, an error is reported.
+func (x *Index) Lookup(query string) (match *LookupResult, alt *AltWords, err os.Error) {
 	ss := strings.Split(query, ".", -1)

 	// check query syntax
 	for _, s := range ss {
 		if !isIdentifier(s) {
-			illegal = true
+			err = os.NewError("all query parts must be identifiers")
 			return
 		}
 	}
@@ -734,7 +855,7 @@ func (x *Index) Lookup(query string) (match *LookupResult, alt *AltWords, illega
 		}

 	default:
-		illegal = true
+		err = os.NewError("query is not a (qualified) identifier")
 	}

 	return
@@ -748,3 +869,103 @@ func (x *Index) Snippet(i int) *Snippet {
 	}
 	return nil
 }
+
+
+type positionList []struct {
+	filename string
+	line     int
+}
+
+func (list positionList) Len() int           { return len(list) }
+func (list positionList) Less(i, j int) bool { return list[i].filename < list[j].filename }
+func (list positionList) Swap(i, j int)      { list[i], list[j] = list[j], list[i] }
+
+
+// unique returns the list sorted and with duplicate entries removed
+func unique(list []int) []int {
+	sort.SortInts(list)
+	var last int
+	i := 0
+	for _, x := range list {
+		if i == 0 || x != last {
+			last = x
+			list[i] = x
+			i++
+		}
+	}
+	return list[0:i]
+}
+
+
+// A FileLines value specifies a file and line numbers within that file.
+type FileLines struct {
+	Filename string
+	Lines    []int
+}
+
+
+// LookupRegexp returns the number of matches and the matches where a regular
+// expression r is found in the full text index. At most n matches are
+// returned (thus found <= n).
+//
+func (x *Index) LookupRegexp(r *regexp.Regexp, n int) (found int, result []FileLines) {
+	if x.suffixes == nil || n <= 0 {
+		return
+	}
+	// n > 0
+
+	var list positionList
+	// FindAllIndex may return matches that span across file boundaries.
+	// Such matches are unlikely, but after eliminating them we may end up
+	// with fewer than n matches. If we don't have enough at the end, redo
+	// the search with an increased value n1, but only if FindAllIndex
+	// returned all the requested matches in the first place (if it
+	// returned fewer than that there cannot be more).
+	for n1 := n; found < n; n1 += n - found {
+		found = 0
+		matches := x.suffixes.FindAllIndex(r, n1)
+		// compute files, exclude matches that span file boundaries,
+		// and map offsets to file-local offsets
+		list = make(positionList, len(matches))
+		for _, m := range matches {
+			// by construction, an offset corresponds to the Pos value
+			// for the file set - use it to get the file and line
+			p := token.Pos(m[0])
+			if file := x.fset.File(p); file != nil {
+				if base := file.Base(); base <= m[1] && m[1] <= base+file.Size() {
+					// match [m[0], m[1]) is within the file boundaries
+					list[found].filename = file.Name()
+					list[found].line = file.Line(p)
+					found++
+				}
+			}
+		}
+		if found == n || len(matches) < n1 {
+			// found all matches or there's no chance to find more
+			break
+		}
+	}
+	list = list[0:found]
+	sort.Sort(list) // sort by filename
+
+	// collect matches belonging to the same file
+	var last string
+	var lines []int
+	addLines := func() {
+		if len(lines) > 0 {
+			// remove duplicate lines
+			result = append(result, FileLines{last, unique(lines)})
+			lines = nil
+		}
+	}
+	for _, m := range list {
+		if m.filename != last {
+			addLines()
+			last = m.filename
+		}
+		lines = append(lines, m.line)
+	}
+	addLines()
+
+	return
+}
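LookupRegexp's core is suffixarray.FindAllIndex over the concatenated sources, followed by translating match offsets back through the file set. A simplified, hypothetical sketch of that pipeline (it omits the retry loop and the boundary-spanning check the real code performs; file names and sources are invented):

    package main

    import (
        "fmt"
        "go/token"
        "index/suffixarray"
        "regexp"
    )

    func main() {
        fset := token.NewFileSet()
        files := []struct{ name, src string }{
            {"a.go", "package a\nfunc Foo() {}\n"},
            {"b.go", "package b\nfunc FooBar() {}\n"},
        }

        // concatenate sources in lockstep with the file set
        var data []byte
        for _, fl := range files {
            data = append(data, 0) // padding byte keeps offsets aligned with Pos values
            f := fset.AddFile(fl.name, fset.Base(), len(fl.src))
            f.SetLinesForContent([]byte(fl.src))
            data = append(data, fl.src...)
        }

        // run the regexp over the suffix array and map offsets to file:line
        sa := suffixarray.New(data)
        for _, m := range sa.FindAllIndex(regexp.MustCompile(`Foo\w*`), -1) {
            p := token.Pos(m[0])
            if file := fset.File(p); file != nil {
                fmt.Printf("%s:%d\n", file.Name(), file.Line(p)) // a.go:2, b.go:2
            }
        }
    }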