Skip to content

Instantly share code, notes, and snippets.

@hengfeiyang
Created September 23, 2016 13:40
Show Gist options
  • Save hengfeiyang/32e4b99e7348e953577e607727625698 to your computer and use it in GitHub Desktop.
Save hengfeiyang/32e4b99e7348e953577e607727625698 to your computer and use it in GitHub Desktop.
Use Go to crawl a website's URLs.
package main
import (
"fmt"
"net/url"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
const (
uriSuffix = ".html" // page suffix: substring that marks a URL as a crawlable page
)
// main seeds the crawler with a couple of start pages and runs a
// work-counting event loop: every in-flight fetch increments `wait`,
// each finished fetch delivers its discovered URLs on chanURL, and
// the loop exits when wait drains to zero.
func main() {
start := time.Now()
// Seed URLs the crawl starts from.
initURLs := []string{"http://xiezhenye.com/", "http://www.laruence.com/"}
// Set of URLs already scheduled (dedup). NOTE(review): the seeds
// themselves are never recorded here, so a duplicate seed would be
// fetched twice.
store := make(map[string]bool)
// Results channel: each fetch goroutine sends the link slice it found.
// Large buffer so senders rarely block while main is busy.
chanURL := make(chan []string, 10000)
// Number of fetches still outstanding; loop below ends at zero.
wait := 0
// Counting semaphore capping concurrent fetch goroutines at 10.
chanLink := make(chan struct{}, 10)
// Launch one fetch goroutine per seed URL.
// NOTE(review): loop variable `url` shadows the imported net/url
// package inside this loop (harmless here, but easy to trip over).
for _, url := range initURLs {
wait++
chanLink <- struct{}{}
go func(url string) {
chanURL <- link(url)
<-chanLink
}(url)
}
// Main loop: consume exactly one result per outstanding fetch.
for ; wait > 0; wait-- {
urls := <-chanURL
if urls == nil {
continue
}
for _, url := range urls {
// Skip URLs already scheduled for crawling.
if _, ok := store[url]; ok {
fmt.Println("repeat!!! ->", url)
continue
}
// Mark as scheduled before the fetch actually runs.
store[url] = true
// Spawn a fetch for the new URL (same semaphore pattern as the seeds).
fmt.Println("crawl -> ", url)
wait++
chanLink <- struct{}{}
go func(url string) {
chanURL <- link(url)
<-chanLink
}(url)
}
}
// Crawl finished: print every collected URL with a running index.
i := 0
for url := range store {
i++
fmt.Printf("%6d %s\n", i, url)
}
fmt.Printf("总数:%d, 耗时:%v\n", len(store), time.Since(start))
}
// link fetches the page at url and returns its discovered links,
// normalized to absolute same-site URLs and filtered down to those
// that look like crawlable pages (contain uriSuffix).
func link(url string) []string {
	found := crawl(url)
	pages := make([]string, 0, len(found))
	for _, raw := range found {
		if abs := fixURL(raw, url); strings.Contains(abs, uriSuffix) {
			pages = append(pages, abs)
		}
	}
	return pages
}
// crawl fetches the page at url and returns the raw href of every <a>
// element, in document order. A fetch or parse failure is treated as
// best-effort and yields nil (the caller skips nil results).
func crawl(url string) []string {
	dom, err := goquery.NewDocument(url)
	if err != nil {
		return nil
	}
	var urls []string
	// Iterate the matched anchors directly; the original wrapped every
	// node in a brand-new goquery.Document just to read one attribute.
	dom.Find("a").Each(func(_ int, a *goquery.Selection) {
		if href, ok := a.Attr("href"); ok {
			urls = append(urls, href)
		}
	})
	return urls
}
// fixURL turns href (as found in a page) into an absolute URL on the
// same site as uri, or returns "" when the link should be skipped.
//
// It drops empty links, pure fragments ("#..."), javascript: and
// mailto: pseudo-links, anything unparseable, and links whose scheme
// or host differs from uri's. A trailing fragment is stripped so
// "page.html#sec" dedups with "page.html".
//
// Bug fixed: the original resolved relative links by concatenating
// base.Path + "/" + href, which turned a link "child.html" on page
// "http://host/dir/page.html" into the nonsense URL
// "http://host/dir/page.html/child.html". Relative references are now
// resolved per RFC 3986 via url.ResolveReference, which also handles
// "../x.html" and scheme-relative "//host/x" forms.
func fixURL(href, uri string) string {
	if href == "" {
		return ""
	}
	// A bare fragment points back into the current page.
	if href[0] == '#' {
		return ""
	}
	// Strip a trailing fragment before resolving.
	if pos := strings.Index(href, "#"); pos > 0 {
		href = href[:pos]
	}
	// Skip script and e-mail pseudo-links.
	if strings.Contains(href, "javascript:") || strings.Contains(href, "mailto:") {
		return ""
	}
	base, err := url.Parse(uri)
	if err != nil {
		return ""
	}
	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}
	abs := base.ResolveReference(ref)
	// Stay on the seed site: drop links to other schemes or hosts.
	if abs.Scheme != base.Scheme || abs.Host != base.Host {
		return ""
	}
	return abs.String()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment