@mewmew
Last active August 9, 2016
Identify broken links by crawling a GitHub directory listing.
// Based on https://github.com/PuerkitoBio/gocrawl/blob/master/examples_test.go
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"time"

	"github.com/PuerkitoBio/gocrawl"
	"github.com/PuerkitoBio/goquery"
	"github.com/mewkiz/pkg/errutil"
)
// Only enqueue the paths related to the diary directory.
var rxOk = regexp.MustCompile(`https?://github\.com/xh3b4sd/anna/(tree|blob)/master/doc/diary.*?$`)

// Extender provides the crawler extension points. It embeds the gocrawl-
// provided DefaultExtender, since we only want to override Visit and Filter.
type Extender struct {
	gocrawl.DefaultExtender // Use the default implementation of all methods but Visit and Filter.
}
// Visit mirrors each crawled page to disk and lets gocrawl enqueue the links
// it contains.
func (x *Extender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	s, err := doc.Html()
	if err != nil {
		log.Fatal(errutil.Err(err))
	}
	// Derive the on-disk path from the URL. Directory listings (URLs without
	// a file extension) are stored as index.html within a directory that
	// mirrors the URL path.
	url := filepath.Join(doc.Url.Host, doc.Url.Path)
	dir := url
	name := "index.html"
	if ext := filepath.Ext(url); len(ext) > 0 {
		dir, name = filepath.Split(url)
	}
	path := filepath.Join(dir, name)
	if err := os.MkdirAll(dir, 0755); err != nil {
		log.Fatal(errutil.Err(err))
	}
	if err := ioutil.WriteFile(path, []byte(s), 0644); err != nil {
		log.Fatal(errutil.Err(err))
	}
	// Return nil and true; let gocrawl find the links of the page.
	return nil, true
}
// notFound caches the result of checking each URL, mapping a normalized URL
// to whether it responded with "404 Not Found".
var notFound = make(map[string]bool)

// Filter reports whether the given URL should be enqueued. As a side effect,
// it checks each URL once for broken links by issuing an HTTP GET request and
// recording 404 responses.
func (x *Extender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool {
	url := ctx.NormalizedURL().String()
	if _, ok := notFound[url]; !ok {
		resp, err := http.Get(url)
		if err != nil {
			fmt.Printf("GET ERROR; %v (from %s)\n", err, ctx.SourceURL())
			return !isVisited && rxOk.MatchString(url)
		}
		defer resp.Body.Close()
		notFound[url] = resp.StatusCode == http.StatusNotFound
	}
	if notFound[url] {
		fmt.Printf("NOT FOUND: %s (from %s)\n", url, ctx.SourceURL())
	}
	return !isVisited && rxOk.MatchString(url)
}
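
// A note on the approach: Filter issues a full GET request even though only
// the status code of the response is inspected, so each page ends up being
// downloaded twice, once for the link check and once by the crawler itself.
// The helper below is a minimal sketch of a lighter alternative (not used by
// the crawler above), assuming github.com answers HEAD requests with the same
// status codes as GET.
func headNotFound(url string) (bool, error) {
	resp, err := http.Head(url)
	if err != nil {
		return false, err
	}
	resp.Body.Close()
	return resp.StatusCode == http.StatusNotFound, nil
}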
func main() {
	// Set custom options.
	opts := gocrawl.NewOptions(new(Extender))

	// Always set the robot name, so that gocrawl looks for the most specific
	// rules possible in robots.txt.
	opts.RobotUserAgent = "CCBot"
	// Reflect the robot name in the user-agent string used to make requests,
	// ideally with a link so site owners can contact you if there is an issue.
	opts.UserAgent = "CCBot"

	opts.CrawlDelay = 1 * time.Second
	opts.LogFlags = gocrawl.LogAll
	opts.WorkerIdleTTL = 0

	// Create the crawler and start at the root of the diary directory.
	c := gocrawl.NewCrawlerWithOptions(opts)
	c.Run("https://github.com/xh3b4sd/anna/tree/master/doc/diary")
}
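
For reference, rxOk is what keeps the crawl inside the diary directory: only URLs matching the pattern are enqueued. A standalone check of the pattern (the sample URLs below are hypothetical; the pattern is the one used above):

package main

import (
	"fmt"
	"regexp"
)

var rxOk = regexp.MustCompile(`https?://github\.com/xh3b4sd/anna/(tree|blob)/master/doc/diary.*?$`)

func main() {
	// Inside doc/diary; the crawler would enqueue this URL.
	fmt.Println(rxOk.MatchString("https://github.com/xh3b4sd/anna/blob/master/doc/diary/2016/day.md")) // true
	// Outside doc/diary; the crawler would skip it.
	fmt.Println(rxOk.MatchString("https://github.com/xh3b4sd/anna/blob/master/README.md")) // false
}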