summaryrefslogtreecommitdiff
path: root/misc/linkcheck/linkcheck.go
diff options
context:
space:
mode:
Diffstat (limited to 'misc/linkcheck/linkcheck.go')
-rw-r--r--misc/linkcheck/linkcheck.go193
1 files changed, 193 insertions, 0 deletions
diff --git a/misc/linkcheck/linkcheck.go b/misc/linkcheck/linkcheck.go
new file mode 100644
index 000000000..d9bfd2f76
--- /dev/null
+++ b/misc/linkcheck/linkcheck.go
@@ -0,0 +1,193 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The linkcheck command finds missing links in the godoc website.
+// It crawls a URL recursively and notes URLs and URL fragments
+// that it's seen and prints a report of missing links at the end.
+package main
+
+import (
+ "errors"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "log"
+ "net/http"
+ "os"
+ "regexp"
+ "strings"
+ "sync"
+)
+
+var (
+ root = flag.String("root", "http://localhost:6060", "Root to crawl")
+ verbose = flag.Bool("verbose", false, "verbose")
+)
+
+var wg sync.WaitGroup // outstanding fetches
+var urlq = make(chan string) // URLs to crawl
+
+// urlFrag is a URL and its optional #fragment (without the #)
+type urlFrag struct {
+ url, frag string
+}
+
+var (
+ mu sync.Mutex
+ crawled = make(map[string]bool) // URL without fragment -> true
+ neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
+)
+
+var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)
+
+// Owned by crawlLoop goroutine:
+var (
+ linkSources = make(map[string][]string) // url no fragment -> sources
+ fragExists = make(map[urlFrag]bool)
+ problems []string
+)
+
+func localLinks(body string) (links []string) {
+ seen := map[string]bool{}
+ mv := aRx.FindAllStringSubmatch(body, -1)
+ for _, m := range mv {
+ ref := m[1]
+ if strings.HasPrefix(ref, "/src/") {
+ continue
+ }
+ if !seen[ref] {
+ seen[ref] = true
+ links = append(links, m[1])
+ }
+ }
+ return
+}
+
+var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)
+
+func pageIDs(body string) (ids []string) {
+ mv := idRx.FindAllStringSubmatch(body, -1)
+ for _, m := range mv {
+ ids = append(ids, m[1])
+ }
+ return
+}
+
+// url may contain a #fragment, and the fragment is then noted as needing to exist.
+func crawl(url string, sourceURL string) {
+ if strings.Contains(url, "/devel/release") {
+ return
+ }
+ mu.Lock()
+ defer mu.Unlock()
+ var frag string
+ if i := strings.Index(url, "#"); i >= 0 {
+ frag = url[i+1:]
+ url = url[:i]
+ if frag != "" {
+ uf := urlFrag{url, frag}
+ neededFrags[uf] = append(neededFrags[uf], sourceURL)
+ }
+ }
+ if crawled[url] {
+ return
+ }
+ crawled[url] = true
+
+ wg.Add(1)
+ go func() {
+ urlq <- url
+ }()
+}
+
+func addProblem(url, errmsg string) {
+ msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
+ if *verbose {
+ log.Print(msg)
+ }
+ problems = append(problems, msg)
+}
+
+func crawlLoop() {
+ for url := range urlq {
+ if err := doCrawl(url); err != nil {
+ addProblem(url, err.Error())
+ }
+ }
+}
+
+func doCrawl(url string) error {
+ defer wg.Done()
+
+ req, err := http.NewRequest("GET", url, nil)
+ if err != nil {
+ return err
+ }
+ res, err := http.DefaultTransport.RoundTrip(req)
+ if err != nil {
+ return err
+ }
+ // Handle redirects.
+ if res.StatusCode/100 == 3 {
+ newURL, err := res.Location()
+ if err != nil {
+ return fmt.Errorf("resolving redirect: %v", err)
+ }
+ if !strings.HasPrefix(newURL.String(), *root) {
+ // Skip off-site redirects.
+ return nil
+ }
+ crawl(newURL.String(), url)
+ return nil
+ }
+ if res.StatusCode != 200 {
+ return errors.New(res.Status)
+ }
+ slurp, err := ioutil.ReadAll(res.Body)
+ res.Body.Close()
+ if err != nil {
+ log.Fatalf("Error reading %s body: %v", url, err)
+ }
+ if *verbose {
+ log.Printf("Len of %s: %d", url, len(slurp))
+ }
+ body := string(slurp)
+ for _, ref := range localLinks(body) {
+ if *verbose {
+ log.Printf(" links to %s", ref)
+ }
+ dest := *root + ref
+ linkSources[dest] = append(linkSources[dest], url)
+ crawl(dest, url)
+ }
+ for _, id := range pageIDs(body) {
+ if *verbose {
+ log.Printf(" url %s has #%s", url, id)
+ }
+ fragExists[urlFrag{url, id}] = true
+ }
+ return nil
+}
+
+func main() {
+ flag.Parse()
+
+ go crawlLoop()
+ crawl(*root, "")
+
+ wg.Wait()
+ close(urlq)
+ for uf, needers := range neededFrags {
+ if !fragExists[uf] {
+ problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
+ }
+ }
+
+ for _, s := range problems {
+ fmt.Println(s)
+ }
+ if len(problems) > 0 {
+ os.Exit(1)
+ }
+}