Imported Upstream version 60

author: Ondřej Surý <ondrej@sury.org> 2011-09-13 13:11:55 +0200
committer: Ondřej Surý <ondrej@sury.org> 2011-09-13 13:11:55 +0200
commit: 80f18fc933cf3f3e829c5455a1023d69f7b86e52 (patch)
tree: 4b825dc642cb6eb9a060e54bf8d69288fbee4904 /src/pkg/index/suffixarray
parent: 28592ee1ea1f5cdffcf85472f9de0285d928cf12 (diff)
download: golang-80f18fc933cf3f3e829c5455a1023d69f7b86e52.tar.gz
4 files changed, 0 insertions, 610 deletions
diff --git a/src/pkg/index/suffixarray/Makefile b/src/pkg/index/suffixarray/Makefile
deleted file mode 100644
index 297c4279f..000000000
--- a/src/pkg/index/suffixarray/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright 2010 The Go Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-include ../../../Make.inc
-
-TARG=index/suffixarray
-GOFILES=\
-	qsufsort.go\
-	suffixarray.go\
-
-include ../../../Make.pkg
diff --git a/src/pkg/index/suffixarray/qsufsort.go b/src/pkg/index/suffixarray/qsufsort.go
deleted file mode 100644
index 9751b5c76..000000000
--- a/src/pkg/index/suffixarray/qsufsort.go
+++ /dev/null
@@ -1,170 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This algorithm is based on "Faster Suffix Sorting"
-//   by N. Jesper Larsson and Kunihiko Sadakane
-// paper: http://www.larsson.dogma.net/ssrev-tr.pdf
-// code:  http://www.larsson.dogma.net/qsufsort.c
-
-// This algorithm computes the suffix array sa by computing its inverse.
-// Consecutive groups of suffixes in sa are labeled as sorted groups or
-// unsorted groups. For a given pass of the sorter, all suffixes are ordered
-// up to their first h characters, and sa is h-ordered. Suffixes in their
-// final positions and unambiguouly sorted in h-order are in a sorted group.
-// Consecutive groups of suffixes with identical first h characters are an
-// unsorted group. In each pass of the algorithm, unsorted groups are sorted
-// according to the group number of their following suffix.
-
-// In the implementation, if sa[i] is negative, it indicates that i is
-// the first element of a sorted group of length -sa[i], and can be skipped.
-// An unsorted group sa[i:k] is given the group number of the index of its
-// last element, k-1. The group numbers are stored in the inverse slice (inv),
-// and when all groups are sorted, this slice is the inverse suffix array.
-
-package suffixarray
-
-import "sort"
-
-func qsufsort(data []byte) []int {
-	// initial sorting by first byte of suffix
-	sa := sortedByFirstByte(data)
-	if len(sa) < 2 {
-		return sa
-	}
-	// initialize the group lookup table
-	// this becomes the inverse of the suffix array when all groups are sorted
-	inv := initGroups(sa, data)
-
-	// the index starts 1-ordered
-	sufSortable := &suffixSortable{sa, inv, 1}
-
-	for sa[0] > -len(sa) { // until all suffixes are one big sorted group
-		// The suffixes are h-ordered, make them 2*h-ordered
-		pi := 0 // pi is first position of first group
-		sl := 0 // sl is negated length of sorted groups
-		for pi < len(sa) {
-			if s := sa[pi]; s < 0 { // if pi starts sorted group
-				pi -= s // skip over sorted group
-				sl += s // add negated length to sl
-			} else { // if pi starts unsorted group
-				if sl != 0 {
-					sa[pi+sl] = sl // combine sorted groups before pi
-					sl = 0
-				}
-				pk := inv[s] + 1 // pk-1 is last position of unsorted group
-				sufSortable.sa = sa[pi:pk]
-				sort.Sort(sufSortable)
-				sufSortable.updateGroups(pi)
-				pi = pk // next group
-			}
-		}
-		if sl != 0 { // if the array ends with a sorted group
-			sa[pi+sl] = sl // combine sorted groups at end of sa
-		}
-
-		sufSortable.h *= 2 // double sorted depth
-	}
-
-	for i := range sa { // reconstruct suffix array from inverse
-		sa[inv[i]] = i
-	}
-	return sa
-}
-
-
-func sortedByFirstByte(data []byte) []int {
-	// total byte counts
-	var count [256]int
-	for _, b := range data {
-		count[b]++
-	}
-	// make count[b] equal index of first occurence of b in sorted array
-	sum := 0
-	for b := range count {
-		count[b], sum = sum, count[b]+sum
-	}
-	// iterate through bytes, placing index into the correct spot in sa
-	sa := make([]int, len(data))
-	for i, b := range data {
-		sa[count[b]] = i
-		count[b]++
-	}
-	return sa
-}
-
-
-func initGroups(sa []int, data []byte) []int {
-	// label contiguous same-letter groups with the same group number
-	inv := make([]int, len(data))
-	prevGroup := len(sa) - 1
-	groupByte := data[sa[prevGroup]]
-	for i := len(sa) - 1; i >= 0; i-- {
-		if b := data[sa[i]]; b < groupByte {
-			if prevGroup == i+1 {
-				sa[i+1] = -1
-			}
-			groupByte = b
-			prevGroup = i
-		}
-		inv[sa[i]] = prevGroup
-		if prevGroup == 0 {
-			sa[0] = -1
-		}
-	}
-	// Separate out the final suffix to the start of its group.
-	// This is necessary to ensure the suffix "a" is before "aba"
-	// when using a potentially unstable sort.
-	lastByte := data[len(data)-1]
-	s := -1
-	for i := range sa {
-		if sa[i] >= 0 {
-			if data[sa[i]] == lastByte && s == -1 {
-				s = i
-			}
-			if sa[i] == len(sa)-1 {
-				sa[i], sa[s] = sa[s], sa[i]
-				inv[sa[s]] = s
-				sa[s] = -1 // mark it as an isolated sorted group
-				break
-			}
-		}
-	}
-	return inv
-}
-
-
-type suffixSortable struct {
-	sa  []int
-	inv []int
-	h   int
-}
-
-func (x *suffixSortable) Len() int           { return len(x.sa) }
-func (x *suffixSortable) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] }
-func (x *suffixSortable) Swap(i, j int)      { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
-
-
-func (x *suffixSortable) updateGroups(offset int) {
-	bounds := make([]int, 0, 4)
-	group := x.inv[x.sa[0]+x.h]
-	for i := 1; i < len(x.sa); i++ {
-		if g := x.inv[x.sa[i]+x.h]; g > group {
-			bounds = append(bounds, i)
-			group = g
-		}
-	}
-	bounds = append(bounds, len(x.sa))
-
-	// update the group numberings after all new groups are determined
-	prev := 0
-	for _, b := range bounds {
-		for i := prev; i < b; i++ {
-			x.inv[x.sa[i]] = offset + b - 1
-		}
-		if b-prev == 1 {
-			x.sa[prev] = -1
-		}
-		prev = b
-	}
-}
diff --git a/src/pkg/index/suffixarray/suffixarray.go b/src/pkg/index/suffixarray/suffixarray.go
deleted file mode 100644
index 9d4e93217..000000000
--- a/src/pkg/index/suffixarray/suffixarray.go
+++ /dev/null
@@ -1,188 +0,0 @@
-// Copyright 2010 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package suffixarray implements substring search in logarithmic time using
-// an in-memory suffix array.
-//
-// Example use:
-//
-//	// create index for some data
-//	index := suffixarray.New(data)
-//
-//	// lookup byte slice s
-//	offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
-//	offsets2 := index.Lookup(s, 3)  // the list of at most 3 indices where s occurs in data
-//
-package suffixarray
-
-import (
-	"bytes"
-	"regexp"
-	"sort"
-)
-
-
-// Index implements a suffix array for fast substring search.
-type Index struct {
-	data []byte
-	sa   []int // suffix array for data
-}
-
-
-// New creates a new Index for data.
-// Index creation time is O(N*log(N)) for N = len(data).
-func New(data []byte) *Index {
-	return &Index{data, qsufsort(data)}
-}
-
-
-// Bytes returns the data over which the index was created.
-// It must not be modified.
-//
-func (x *Index) Bytes() []byte {
-	return x.data
-}
-
-
-func (x *Index) at(i int) []byte {
-	return x.data[x.sa[i]:]
-}
-
-
-// lookupAll returns a slice into the matching region of the index.
-// The runtime is O(log(N)*len(s)).
-func (x *Index) lookupAll(s []byte) []int {
-	// find matching suffix index range [i:j]
-	// find the first index where s would be the prefix
-	i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
-	// starting at i, find the first index at which s is not a prefix
-	j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
-	return x.sa[i:j]
-}
-
-
-// Lookup returns an unsorted list of at most n indices where the byte string s
-// occurs in the indexed data. If n < 0, all occurrences are returned.
-// The result is nil if s is empty, s is not found, or n == 0.
-// Lookup time is O(log(N)*len(s) + len(result)) where N is the
-// size of the indexed data.
-//
-func (x *Index) Lookup(s []byte, n int) (result []int) {
-	if len(s) > 0 && n != 0 {
-		matches := x.lookupAll(s)
-		if len(matches) < n || n < 0 {
-			n = len(matches)
-		}
-		if n > 0 {
-			result = make([]int, n)
-			copy(result, matches)
-		}
-	}
-	return
-}
-
-
-// FindAllIndex returns a sorted list of non-overlapping matches of the
-// regular expression r, where a match is a pair of indices specifying
-// the matched slice of x.Bytes(). If n < 0, all matches are returned
-// in successive order. Otherwise, at most n matches are returned and
-// they may not be successive. The result is nil if there are no matches,
-// or if n == 0.
-//
-func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
-	// a non-empty literal prefix is used to determine possible
-	// match start indices with Lookup
-	prefix, complete := r.LiteralPrefix()
-	lit := []byte(prefix)
-
-	// worst-case scenario: no literal prefix
-	if prefix == "" {
-		return r.FindAllIndex(x.data, n)
-	}
-
-	// if regexp is a literal just use Lookup and convert its
-	// result into match pairs
-	if complete {
-		// Lookup returns indices that may belong to overlapping matches.
-		// After eliminating them, we may end up with fewer than n matches.
-		// If we don't have enough at the end, redo the search with an
-		// increased value n1, but only if Lookup returned all the requested
-		// indices in the first place (if it returned fewer than that then
-		// there cannot be more).
-		for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
-			indices := x.Lookup(lit, n1)
-			if len(indices) == 0 {
-				return
-			}
-			sort.Ints(indices)
-			pairs := make([]int, 2*len(indices))
-			result = make([][]int, len(indices))
-			count := 0
-			prev := 0
-			for _, i := range indices {
-				if count == n {
-					break
-				}
-				// ignore indices leading to overlapping matches
-				if prev <= i {
-					j := 2 * count
-					pairs[j+0] = i
-					pairs[j+1] = i + len(lit)
-					result[count] = pairs[j : j+2]
-					count++
-					prev = i + len(lit)
-				}
-			}
-			result = result[0:count]
-			if len(result) >= n || len(indices) != n1 {
-				// found all matches or there's no chance to find more
-				// (n and n1 can be negative)
-				break
-			}
-		}
-		if len(result) == 0 {
-			result = nil
-		}
-		return
-	}
-
-	// regexp has a non-empty literal prefix; Lookup(lit) computes
-	// the indices of possible complete matches; use these as starting
-	// points for anchored searches
-	// (regexp "^" matches beginning of input, not beginning of line)
-	r = regexp.MustCompile("^" + r.String()) // compiles because r compiled
-
-	// same comment about Lookup applies here as in the loop above
-	for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
-		indices := x.Lookup(lit, n1)
-		if len(indices) == 0 {
-			return
-		}
-		sort.Ints(indices)
-		result = result[0:0]
-		prev := 0
-		for _, i := range indices {
-			if len(result) == n {
-				break
-			}
-			m := r.FindIndex(x.data[i:]) // anchored search - will not run off
-			// ignore indices leading to overlapping matches
-			if m != nil && prev <= i {
-				m[0] = i // correct m
-				m[1] += i
-				result = append(result, m)
-				prev = m[1]
-			}
-		}
-		if len(result) >= n || len(indices) != n1 {
-			// found all matches or there's no chance to find more
-			// (n and n1 can be negative)
-			break
-		}
-	}
-	if len(result) == 0 {
-		result = nil
-	}
-	return
-}
diff --git a/src/pkg/index/suffixarray/suffixarray_test.go b/src/pkg/index/suffixarray/suffixarray_test.go
deleted file mode 100644
index 385ff0e56..000000000
--- a/src/pkg/index/suffixarray/suffixarray_test.go
+++ /dev/null
@@ -1,240 +0,0 @@
-// Copyright 2010 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package suffixarray
-
-import (
-	"bytes"
-	"container/vector"
-	"regexp"
-	"sort"
-	"strings"
-	"testing"
-)
-
-
-type testCase struct {
-	name     string   // name of test case
-	source   string   // source to index
-	patterns []string // patterns to lookup
-}
-
-
-var testCases = []testCase{
-	{
-		"empty string",
-		"",
-		[]string{
-			"",
-			"foo",
-			"(foo)",
-			".*",
-			"a*",
-		},
-	},
-
-	{
-		"all a's",
-		"aaaaaaaaaa", // 10 a's
-		[]string{
-			"",
-			"a",
-			"aa",
-			"aaa",
-			"aaaa",
-			"aaaaa",
-			"aaaaaa",
-			"aaaaaaa",
-			"aaaaaaaa",
-			"aaaaaaaaa",
-			"aaaaaaaaaa",
-			"aaaaaaaaaaa", // 11 a's
-			".",
-			".*",
-			"a+",
-			"aa+",
-			"aaaa[b]?",
-			"aaa*",
-		},
-	},
-
-	{
-		"abc",
-		"abc",
-		[]string{
-			"a",
-			"b",
-			"c",
-			"ab",
-			"bc",
-			"abc",
-			"a.c",
-			"a(b|c)",
-			"abc?",
-		},
-	},
-
-	{
-		"barbara*3",
-		"barbarabarbarabarbara",
-		[]string{
-			"a",
-			"bar",
-			"rab",
-			"arab",
-			"barbar",
-			"bara?bar",
-		},
-	},
-
-	{
-		"typing drill",
-		"Now is the time for all good men to come to the aid of their country.",
-		[]string{
-			"Now",
-			"the time",
-			"to come the aid",
-			"is the time for all good men to come to the aid of their",
-			"to (come|the)?",
-		},
-	},
-
-	{
-		"godoc simulation",
-		"package main\n\nimport(\n    \"rand\"\n    ",
-		[]string{},
-	},
-}
-
-
-// find all occurrences of s in source; report at most n occurrences
-func find(src, s string, n int) []int {
-	var res vector.IntVector
-	if s != "" && n != 0 {
-		// find at most n occurrences of s in src
-		for i := -1; n < 0 || len(res) < n; {
-			j := strings.Index(src[i+1:], s)
-			if j < 0 {
-				break
-			}
-			i += j + 1
-			res.Push(i)
-		}
-	}
-	return res
-}
-
-
-func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) {
-	res := x.Lookup([]byte(s), n)
-	exp := find(tc.source, s, n)
-
-	// check that the lengths match
-	if len(res) != len(exp) {
-		t.Errorf("test %q, lookup %q (n = %d): expected %d results; got %d", tc.name, s, n, len(exp), len(res))
-	}
-
-	// if n >= 0 the number of results is limited --- unless n >= all results,
-	// we may obtain different positions from the Index and from find (because
-	// Index may not find the results in the same order as find) => in general
-	// we cannot simply check that the res and exp lists are equal
-
-	// check that each result is in fact a correct match and there are no duplicates
-	sort.Ints(res)
-	for i, r := range res {
-		if r < 0 || len(tc.source) <= r {
-			t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source))
-		} else if !strings.HasPrefix(tc.source[r:], s) {
-			t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r)
-		}
-		if i > 0 && res[i-1] == r {
-			t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r)
-		}
-	}
-
-	if n < 0 {
-		// all results computed - sorted res and exp must be equal
-		for i, r := range res {
-			e := exp[i]
-			if r != e {
-				t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r)
-			}
-		}
-	}
-}
-
-
-func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) {
-	res := x.FindAllIndex(rx, n)
-	exp := rx.FindAllStringIndex(tc.source, n)
-
-	// check that the lengths match
-	if len(res) != len(exp) {
-		t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res))
-	}
-
-	// if n >= 0 the number of results is limited --- unless n >= all results,
-	// we may obtain different positions from the Index and from regexp (because
-	// Index may not find the results in the same order as regexp) => in general
-	// we cannot simply check that the res and exp lists are equal
-
-	// check that each result is in fact a correct match and the result is sorted
-	for i, r := range res {
-		if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] {
-			t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1])
-		} else if !rx.MatchString(tc.source[r[0]:r[1]]) {
-			t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1])
-		}
-	}
-
-	if n < 0 {
-		// all results computed - sorted res and exp must be equal
-		for i, r := range res {
-			e := exp[i]
-			if r[0] != e[0] || r[1] != e[1] {
-				t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]",
-					tc.name, rx, i, e[0], e[1], r[0], r[1])
-			}
-		}
-	}
-}
-
-
-func testLookups(t *testing.T, tc *testCase, x *Index, n int) {
-	for _, pat := range tc.patterns {
-		testLookup(t, tc, x, pat, n)
-		if rx, err := regexp.Compile(pat); err == nil {
-			testFindAllIndex(t, tc, x, rx, n)
-		}
-	}
-}
-
-
-// index is used to hide the sort.Interface
-type index Index
-
-func (x *index) Len() int           { return len(x.sa) }
-func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
-func (x *index) Swap(i, j int)      { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
-func (a *index) at(i int) []byte    { return a.data[a.sa[i]:] }
-
-
-func testConstruction(t *testing.T, tc *testCase, x *Index) {
-	if !sort.IsSorted((*index)(x)) {
-		t.Errorf("testConstruction failed %s", tc.name)
-	}
-}
-
-
-func TestIndex(t *testing.T) {
-	for _, tc := range testCases {
-		x := New([]byte(tc.source))
-		testConstruction(t, &tc, x)
-		testLookups(t, &tc, x, 0)
-		testLookups(t, &tc, x, 1)
-		testLookups(t, &tc, x, 10)
-		testLookups(t, &tc, x, 2e9)
-		testLookups(t, &tc, x, -1)
-	}
-}
author	Ondřej Surý <ondrej@sury.org>	2011-09-13 13:11:55 +0200
committer	Ondřej Surý <ondrej@sury.org>	2011-09-13 13:11:55 +0200
commit	80f18fc933cf3f3e829c5455a1023d69f7b86e52 (patch)
tree	4b825dc642cb6eb9a060e54bf8d69288fbee4904 /src/pkg/index/suffixarray
parent	28592ee1ea1f5cdffcf85472f9de0285d928cf12 (diff)
download	golang-80f18fc933cf3f3e829c5455a1023d69f7b86e52.tar.gz