Skip to content

Instantly share code, notes, and snippets.

@mikeflynn
Created March 14, 2016 07:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mikeflynn/c70068a26d93f3b82b83 to your computer and use it in GitHub Desktop.
Save mikeflynn/c70068a26d93f3b82b83 to your computer and use it in GitHub Desktop.
YT Crawler
package main
import (
"log"
"net/http"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/gocrawl"
"github.com/PuerkitoBio/goquery"
"github.com/boltdb/bolt"
)
var (
	// boltDB is the package-wide Bolt database handle; main assigns it at
	// startup and closes it on exit.
	boltDB *bolt.DB

	// rxOk matches the URLs the crawler accepts: the YouTube host over http
	// or https, with an optional "www." prefix, followed by nothing, "/", or
	// a path beginning with "/watch" (video URLs).
	rxOk = regexp.MustCompile(`^(http|https)://(www\.)?youtube\.com(/|/watch.*)?$`)
)
// ExampleExtender is the crawler extension used by this program. It embeds
// gocrawl.DefaultExtender and overrides only the Visit and Filter methods
// declared in this file.
type ExampleExtender struct {
gocrawl.DefaultExtender
}
// Visit inspects a fetched page. It scans the document's <meta> elements for
// one whose name is "attribution" and one whose itemprop is "channelId"
// (both compared case-insensitively), then logs their content values together
// with the "v" query parameter of the visited URL.
//
// attribution defaults to "NONE" and the channel ID to the empty string; a
// matching <meta> element that has no content attribute leaves the current
// value untouched. When several elements match, the last one wins.
//
// It returns (nil, true) so that gocrawl harvests the page's links itself.
// If doc is nil there is nothing to inspect or to harvest links from, so the
// URL is logged and (nil, false) is returned.
func (x *ExampleExtender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	if doc == nil {
		// Guard against a missing document (gocrawl hands Visit a nil doc when
		// the response body could not be parsed) instead of panicking below.
		log.Println("VID: " + ctx.URL().Query().Get("v") + "; no document to inspect")
		return nil, false
	}

	attribution := "NONE"
	channelID := ""
	doc.Find("meta").Each(func(_ int, s *goquery.Selection) {
		content, ok := s.Attr("content")
		if !ok {
			// Without a content attribute there is no value to record; keep
			// whatever was found so far rather than overwriting it with "".
			return
		}

		// The second return value of Attr only reports presence; an absent
		// attribute yields "", which never matches the names below.
		name, _ := s.Attr("name")
		itemprop, _ := s.Attr("itemprop")

		switch {
		case strings.EqualFold(name, "attribution"):
			attribution = content
		case strings.EqualFold(itemprop, "channelId"):
			channelID = content
		}
	})

	log.Println("VID: " + ctx.URL().Query().Get("v") + "; CO: " + attribution + "; CID: " + channelID)

	// Return nil and true - let gocrawl find the links.
	return nil, true
}
// Filter reports whether a discovered URL should be crawled. A URL is
// accepted only when it has not been visited before and its normalized form
// matches rxOk.
func (x *ExampleExtender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool {
	if isVisited {
		return false
	}

	normalized := ctx.NormalizedURL().String()
	return rxOk.MatchString(normalized)
}
// main opens the crawler.db Bolt database, builds a gocrawl crawler around
// ExampleExtender with the options set below, and runs it with the YouTube
// home page as the only seed.
func main() {
	var err error
	boltDB, err = bolt.Open("crawler.db", 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer boltDB.Close()

	// Set custom options.
	opts := gocrawl.NewOptions(new(ExampleExtender))
	// Use the same "S71Crawler" token that appears in UserAgent so that both
	// settings identify the crawler under one name.
	opts.RobotUserAgent = "S71Crawler"
	opts.UserAgent = "Mozilla/5.0 (compatible; S71Crawler/1.0; +http://studio71.com)"
	opts.CrawlDelay = 100 * time.Millisecond
	//opts.LogFlags = gocrawl.LogAll
	opts.MaxVisits = 100
	opts.SameHostOnly = true

	// Create the crawler and start at the root of YouTube.
	c := gocrawl.NewCrawlerWithOptions(opts)
	if err := c.Run("https://www.youtube.com/"); err != nil && err != gocrawl.ErrMaxVisits {
		// ErrMaxVisits is skipped because MaxVisits is set above, so hitting
		// that limit is the expected way for this run to end. Log instead of
		// log.Fatal so the deferred boltDB.Close still executes.
		log.Printf("crawl stopped: %v", err)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment