diff options
author | Michael Stapelberg <stapelberg@debian.org> | 2013-12-03 09:43:15 +0100 |
---|---|---|
committer | Michael Stapelberg <stapelberg@debian.org> | 2013-12-03 09:43:15 +0100 |
commit | 64d2a7c8945ba05af859901f5e248f1befdd8621 (patch) | |
tree | 013fcb7e9e3296ecdda876012252c36bd6bcb063 /misc/linkcheck | |
parent | b901efe83e212f0c34c769c079e41373da12d723 (diff) | |
download | golang-64d2a7c8945ba05af859901f5e248f1befdd8621.tar.gz |
Imported Upstream version 1.2 upstream/1.2
Diffstat (limited to 'misc/linkcheck')
-rw-r--r-- | misc/linkcheck/linkcheck.go | 193 |
1 files changed, 193 insertions, 0 deletions
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively and notes URLs and URL fragments
// that it's seen and prints a report of missing links at the end.
package main

import (
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #)
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

// localLinks returns the distinct hrefs matched by aRx in body (anchors
// whose target starts with "/"), in order of first appearance. Links
// under /src/ are skipped.
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	for _, m := range aRx.FindAllStringSubmatch(body, -1) {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, ref)
		}
	}
	return links
}

var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

// pageIDs returns the value of every id attribute matched by idRx in body,
// in document order. Duplicates are not removed.
func pageIDs(body string) (ids []string) {
	for _, m := range idRx.FindAllStringSubmatch(body, -1) {
		ids = append(ids, m[1])
	}
	return ids
}

// splitFragment splits rawurl at its first '#'. It returns the part before
// the '#' and the fragment after it (without the '#'). If rawurl contains
// no '#', page is rawurl and frag is empty.
func splitFragment(rawurl string) (page, frag string) {
	if i := strings.Index(rawurl, "#"); i >= 0 {
		return rawurl[:i], rawurl[i+1:]
	}
	return rawurl, ""
}

// crawl schedules url to be fetched unless it has been scheduled before.
//
// url may contain a #fragment, and the fragment is then noted as needing
// to exist, with sourceURL recorded as the page that needs it. URLs
// containing "/devel/release" are ignored entirely.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	url, frag := splitFragment(url)

	mu.Lock()
	defer mu.Unlock()
	if frag != "" {
		uf := urlFrag{url, frag}
		neededFrags[uf] = append(neededFrags[uf], sourceURL)
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	// Count the fetch before queueing it so that wg cannot drop to zero
	// while work is still pending. The send happens in its own goroutine
	// because urlq is unbuffered and crawl is also called from the
	// goroutine that receives from it.
	wg.Add(1)
	go func() {
		urlq <- url
	}()
}

// addProblem records a failure to fetch url, together with the pages
// known to link to it. It must only be called from the crawlLoop goroutine.
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

// crawlLoop fetches every URL received on urlq until the channel is closed,
// recording failures via addProblem.
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
		// Mark the fetch finished only after any problem has been
		// recorded, so that everything this goroutine wrote is visible
		// to main once wg.Wait returns.
		wg.Done()
	}
}

// doCrawl fetches url once, without following redirects. On-site redirect
// targets are handed to crawl; off-site redirects are ignored. For a 200
// response the body is passed to notePage. Any other status, or a
// transport or read failure, is returned as an error.
//
// The caller is responsible for calling wg.Done for the fetch.
func doCrawl(url string) error {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	// RoundTrip is used directly (rather than an http.Client) so that
	// redirects are seen here instead of being followed automatically.
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Close the body on every path, including redirects and error statuses.
	defer res.Body.Close()

	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		dest, _ := splitFragment(newURL.String())
		linkSources[dest] = append(linkSources[dest], url)
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := ioutil.ReadAll(res.Body)
	if err != nil {
		// Report the failure like any other broken page rather than
		// aborting the whole crawl.
		return fmt.Errorf("reading body: %v", err)
	}
	notePage(url, string(slurp))
	return nil
}

// notePage records what a successfully fetched page at url contributes:
// each local link found in body is remembered in linkSources (keyed by the
// destination without its fragment) and scheduled via crawl, and each id
// found in body is marked in fragExists as a fragment of url.
// It must only be called from the crawlLoop goroutine.
func notePage(url, body string) {
	if *verbose {
		log.Printf("Len of %s: %d", url, len(body))
	}
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf(" links to %s", ref)
		}
		dest := *root + ref
		// addProblem looks sources up by the fragment-less URL, so the
		// fragment must not be part of the key.
		page, _ := splitFragment(dest)
		linkSources[page] = append(linkSources[page], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
}

// main crawls the site starting at -root, then prints every recorded
// problem and every needed fragment that was never seen. It exits with
// status 1 if there was at least one problem.
func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

	wg.Wait()
	close(urlq)
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}