Skip to content

Instantly share code, notes, and snippets.

@tmitz
Created December 15, 2016 03:15
Show Gist options
  • Save tmitz/3145f43d6d43bead367b93305dfdbdeb to your computer and use it in GitHub Desktop.
Save tmitz/3145f43d6d43bead367b93305dfdbdeb to your computer and use it in GitHub Desktop.
gocrawl sample
package main
import (
"fmt"
"net/http"
"regexp"
"time"
"github.com/PuerkitoBio/gocrawl"
"github.com/PuerkitoBio/goquery"
)
// URL patterns that decide which pages the crawler follows. Both are
// unanchored and are applied to the string form of ctx.NormalizedURL().
//
// NOTE(review): the host is written without "www." and the scheme accepts
// plain http even though the seed URL is https://www.amazon.co.jp/... —
// presumably because gocrawl's default URL normalization rewrites both;
// confirm against the normalization flags in use.
var (
	// rxTopReviewer matches a reviewer's member-reviews pages. Visit only
	// extracts links from pages whose URL matches this pattern.
	rxTopReviewer = regexp.MustCompile(`https?://amazon\.co\.jp/gp/cdp/member-reviews.*`)

	// rxOk matches top-reviewers listing URLs that contain "page" somewhere
	// after the "review/top-reviewers" path segment.
	rxOk = regexp.MustCompile(`https?://amazon\.co\.jp/(review/top-reviewers.*page.*).*`)
)
// Ext is the crawler extender used by this sample. It embeds
// *gocrawl.DefaultExtender and overrides only Visit and Filter; every other
// extender method comes from the embedded default.
type Ext struct {
*gocrawl.DefaultExtender
}
// Visit is called with the fetched page. For pages whose normalized URL
// matches rxTopReviewer it looks up every "span.s_star_1_0" element and, for
// each one, prints the href of the last <a> inside the last <div> sibling of
// the element's grandparent. Pages that do not match are left untouched.
//
// It always returns (nil, true).
// NOTE(review): per gocrawl's Extender contract this presumably means "no
// pre-harvested links, let the crawler discover links itself" — confirm.
func (e *Ext) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	pageURL := ctx.NormalizedURL().String()
	if !rxTopReviewer.MatchString(pageURL) {
		return nil, true
	}

	// presumably the CSS class marking a one-star rating — verify against the page markup
	stars := doc.Find("span.s_star_1_0")
	if stars.Length() == 0 {
		return nil, true
	}

	stars.Each(func(_ int, star *goquery.Selection) {
		container := star.Parent().Parent()
		anchor := container.SiblingsFiltered("div").Last().Find("a").Last()
		href, found := anchor.Attr("href")
		if !found {
			return
		}
		fmt.Printf("link: %v\n", href)
	})
	return nil, true
}
// Filter reports whether a URL should be followed: it must not have been
// visited yet, and its normalized form must match either rxOk or
// rxTopReviewer.
func (e *Ext) Filter(ctx *gocrawl.URLContext, isVisited bool) bool {
	if isVisited {
		return false
	}
	target := ctx.NormalizedURL().String()
	return rxOk.MatchString(target) || rxTopReviewer.MatchString(target)
}
// main configures a gocrawl crawler with the Ext extender and runs it from
// the top-reviewers listing page. The crawl waits 5 seconds between requests,
// logs errors only, stays on the seed's host, and stops after 10 visits.
func main() {
	const seed = "https://www.amazon.co.jp/review/top-reviewers/ref=cm_cr_tr_link_1?ie=UTF8&page=1"

	ext := &Ext{&gocrawl.DefaultExtender{}}
	opts := gocrawl.NewOptions(ext)
	opts.CrawlDelay = 5 * time.Second
	opts.LogFlags = gocrawl.LogError
	opts.SameHostOnly = true
	opts.MaxVisits = 10

	c := gocrawl.NewCrawlerWithOptions(opts)
	// Run reports why the crawl ended. Reaching MaxVisits is the expected way
	// for this sample to stop, so only other errors are surfaced.
	if err := c.Run(seed); err != nil && err != gocrawl.ErrMaxVisits {
		fmt.Printf("crawl error: %v\n", err)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment