Source file misc/linkcheck/linkcheck.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively, notes the URLs and URL fragments it
// has seen, and prints a report of missing links at the end.
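//
// The -root flag sets the root URL to crawl (default http://localhost:6060),
// and -verbose enables verbose logging.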
package main

import (
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #)
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

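// aRx matches root-relative href values in <a> tags.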
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

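// localLinks returns the deduplicated root-relative links found in body,
// skipping links under /src/.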
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, m[1])
		}
	}
	return
}

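// idRx matches id attribute values, which serve as fragment anchors.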
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

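// pageIDs returns the id attribute values found in body.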
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}

// crawl notes url as needing to be fetched and queues it for the crawl
// loop. url may contain a #fragment, and the fragment is then noted as
// needing to exist.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	if u, frag, ok := strings.Cut(url, "#"); ok {
		url = u
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	go func() {
		urlq <- url
	}()
}

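// addProblem records a problem with url, logging it immediately if
// -verbose is set.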
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

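// crawlLoop fetches URLs received on urlq until the channel is closed,
// recording any errors as problems.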
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
	}
}

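// doCrawl fetches url, follows same-site redirects via crawl, and records
// the local links and fragment anchors found on the page.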
func doCrawl(url string) error {
	defer wg.Done()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
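	// Fetch via the Transport directly so redirects are not followed
	// automatically; they are checked against *root below.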
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := io.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatalf("Error reading %s body: %v", url, err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf("  links to %s", ref)
		}
		dest := *root + ref
		linkSources[dest] = append(linkSources[dest], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

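// main seeds the crawl at *root, waits for all fetches to complete, then
// reports missing fragments and other problems, exiting non-zero if any
// were found.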
func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

	wg.Wait()
	close(urlq)
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}
