@mewmew
Last active August 9, 2016
Identify broken links by crawling a GitHub directory listing.
// Based on https://github.com/PuerkitoBio/gocrawl/blob/master/examples_test.go
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"time"

	"github.com/PuerkitoBio/gocrawl"
	"github.com/PuerkitoBio/goquery"
	"github.com/mewkiz/pkg/errutil"
)
// Only enqueue the paths related to the diary directory.
var rxOk = regexp.MustCompile(`https?://github\.com/xh3b4sd/anna/(tree|blob)/master/doc/diary.*?$`)

// Extender provides the crawler extension points. It embeds the gocrawl-
// provided DefaultExtender, since we only want to override Visit and Filter.
type Extender struct {
	gocrawl.DefaultExtender // Use the default implementation of all methods but Visit and Filter.
}
// Visit mirrors each crawled page to disk and lets gocrawl enqueue the links
// it contains.
func (x *Extender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	s, err := doc.Html()
	if err != nil {
		log.Fatal(errutil.Err(err))
	}
	// Derive the on-disk path from the URL. Directory listings (URLs without
	// a file extension) are stored as index.html within a directory that
	// mirrors the URL path.
	url := filepath.Join(doc.Url.Host, doc.Url.Path)
	dir := url
	name := "index.html"
	if ext := filepath.Ext(url); len(ext) > 0 {
		dir, name = filepath.Split(url)
	}
	path := filepath.Join(dir, name)
	if err := os.MkdirAll(dir, 0755); err != nil {
		log.Fatal(errutil.Err(err))
	}
	if err := ioutil.WriteFile(path, []byte(s), 0644); err != nil {
		log.Fatal(errutil.Err(err))
	}
	// Return nil and true; let gocrawl find the links of the page.
	return nil, true
}
// notFound caches the result of checking each URL, mapping a normalized URL
// to whether it responded with "404 Not Found".
var notFound = make(map[string]bool)

// Filter reports whether the given URL should be enqueued. As a side effect,
// it checks each URL once for broken links by issuing an HTTP GET request and
// recording 404 responses.
func (x *Extender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool {
	url := ctx.NormalizedURL().String()
	if _, ok := notFound[url]; !ok {
		resp, err := http.Get(url)
		if err != nil {
			fmt.Printf("GET ERROR; %v (from %s)\n", err, ctx.SourceURL())
			return !isVisited && rxOk.MatchString(url)
		}
		defer resp.Body.Close()
		notFound[url] = resp.StatusCode == http.StatusNotFound
	}
	if notFound[url] {
		fmt.Printf("NOT FOUND: %s (from %s)\n", url, ctx.SourceURL())
	}
	return !isVisited && rxOk.MatchString(url)
}
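
// A note on the approach: Filter issues a full GET request even though only
// the status code of the response is inspected, so each page ends up being
// downloaded twice, once for the link check and once by the crawler itself.
// The helper below is a minimal sketch of a lighter alternative (not used by
// the crawler above), assuming github.com answers HEAD requests with the same
// status codes as GET.
func headNotFound(url string) (bool, error) {
	resp, err := http.Head(url)
	if err != nil {
		return false, err
	}
	resp.Body.Close()
	return resp.StatusCode == http.StatusNotFound, nil
}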
func main() {
	// Set custom options.
	opts := gocrawl.NewOptions(new(Extender))

	// Always set the robot name, so that gocrawl looks for the most specific
	// rules possible in robots.txt.
	opts.RobotUserAgent = "CCBot"
	// Reflect the robot name in the user-agent string used to make requests,
	// ideally with a link so site owners can contact you if there is an issue.
	opts.UserAgent = "CCBot"

	opts.CrawlDelay = 1 * time.Second
	opts.LogFlags = gocrawl.LogAll
	opts.WorkerIdleTTL = 0

	// Create the crawler and start at the root of the diary directory.
	c := gocrawl.NewCrawlerWithOptions(opts)
	c.Run("https://github.com/xh3b4sd/anna/tree/master/doc/diary")
}
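
For reference, rxOk is what keeps the crawl inside the diary directory: only URLs matching the pattern are enqueued. A standalone check of the pattern (the sample URLs below are hypothetical; the pattern is the one used above):

package main

import (
	"fmt"
	"regexp"
)

var rxOk = regexp.MustCompile(`https?://github\.com/xh3b4sd/anna/(tree|blob)/master/doc/diary.*?$`)

func main() {
	// Inside doc/diary; the crawler would enqueue this URL.
	fmt.Println(rxOk.MatchString("https://github.com/xh3b4sd/anna/blob/master/doc/diary/2016/day.md")) // true
	// Outside doc/diary; the crawler would skip it.
	fmt.Println(rxOk.MatchString("https://github.com/xh3b4sd/anna/blob/master/README.md")) // false
}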