Web crawler

A small concurrent web crawler in Go: it fetches a start URL, extracts the links on each page with goquery, and follows them up to the depth given by the -depth flag, skipping URLs it has already visited.
package main

import (
	"flag"
	"fmt"
	"net/http"
	"net/url"
	"os"

	"github.com/PuerkitoBio/goquery"
)
// Request is a crawl request for a single URL.
type Request struct {
	url   string
	depth int
}

// Result is the outcome of fetching a single URL.
type Result struct {
	err error
	url string
}

// Channels bundles the channels shared between main and the workers.
type Channels struct {
	req  chan Request
	res  chan Result
	quit chan int
}

// NewChannels returns a new set of buffered channels.
func NewChannels() *Channels {
	return &Channels{
		req:  make(chan Request, 10),
		res:  make(chan Result, 10),
		quit: make(chan int, 10),
	}
}
// Fetch retrieves the web page at the given URL and returns the list of
// URLs found in its <a> tags.
func Fetch(u string) (urls []string, err error) {
	baseUrl, err := url.Parse(u)
	if err != nil {
		return
	}
	resp, err := http.Get(baseUrl.String())
	if err != nil {
		return
	}
	defer resp.Body.Close()
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return
	}
	urls = make([]string, 0)
	doc.Find("a").Each(func(_ int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if exists {
			reqUrl, err := baseUrl.Parse(href)
			if err == nil {
				urls = append(urls, reqUrl.String())
			}
		}
	})
	return
}
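
// Note: resolving each href with baseUrl.Parse (rather than url.Parse)
// interprets relative links such as "/about" or "page.html" against the
// page's own URL, so both absolute and relative links come back as
// absolute URLs that can be fetched directly.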
// Crawl fetches one page, reports the result, and enqueues a request for
// every URL found on the page.
func Crawl(url string, depth int, ch *Channels) {
	defer func() { ch.quit <- 0 }()
	// Extract the URLs contained in the page.
	urls, err := Fetch(url)
	// Report the result.
	ch.res <- Result{
		url: url,
		err: err,
	}
	if err == nil {
		for _, url := range urls {
			// Enqueue a new request for each discovered URL.
			ch.req <- Request{
				url:   url,
				depth: depth - 1,
			}
		}
	}
}
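
// Note: a worker sends its Result and all new Requests before the deferred
// send on quit, so by the time main receives a worker's quit, everything
// that worker produced is already sitting in the buffered channels. The
// shutdown check in main below relies on this ordering.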
// Default crawl depth.
const crawlerDepthDefault = 5

// Crawl depth.
var crawlerDepth int

func main() {
	flag.IntVar(&crawlerDepth, "depth", crawlerDepthDefault, "maximum depth to crawl")
	flag.Parse()
	if len(flag.Args()) < 1 {
		fmt.Fprintln(os.Stderr, "please specify a start URL")
		os.Exit(1)
	}
	startUrl := flag.Arg(0)
	if crawlerDepth < 1 {
		crawlerDepth = crawlerDepthDefault
	}
	chs := NewChannels()
	urlMap := make(map[string]bool)

	// Enqueue the initial request.
	chs.req <- Request{
		url:   startUrl,
		depth: crawlerDepth,
	}

	// Number of running workers.
	wc := 0
	done := false
	for !done {
		select {
		case res := <-chs.res:
			if res.err == nil {
				fmt.Printf("Success %s\n", res.url)
			} else {
				fmt.Fprintf(os.Stderr, "Error %s\n%v\n", res.url, res.err)
			}
		case req := <-chs.req:
			if req.depth == 0 {
				break
			}
			if urlMap[req.url] {
				// Already fetched.
				break
			}
			urlMap[req.url] = true
			wc++
			go Crawl(req.url, req.depth, chs)
		case <-chs.quit:
			wc--
		}
		// Checking wc alone inside the quit case could end the loop too
		// early: a quit may be selected while requests or results from that
		// worker are still buffered, abandoning them. Finish only when no
		// workers remain and both queues are drained; once wc is 0 no
		// goroutine can add to either channel, so the check is safe.
		if wc == 0 && len(chs.req) == 0 && len(chs.res) == 0 {
			done = true
		}
	}
}
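
To try it out (assuming the file is saved as main.go; the depth and URL below are just examples):

	$ go get github.com/PuerkitoBio/goquery
	$ go run main.go -depth 3 https://example.com/

Each successfully fetched page is reported on stdout as "Success <url>"; failures go to stderr as "Error <url>" followed by the error.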