Skip to content

Instantly share code, notes, and snippets.

@lintianzhi
Last active December 20, 2015 10:59
Show Gist options
  • Save lintianzhi/6119930 to your computer and use it in GitHub Desktop.
Save lintianzhi/6119930 to your computer and use it in GitHub Desktop.
搜索一个网站内所有包含关键字的页面,并统计
package main
import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"sync"
)
// Fetch holds the state of a crawl over a single host that counts keyword
// matches on each fetched page.
type Fetch struct {
	// Host is the host name GetFullUrl prepends to every path.
	Host string
	// UrlRe finds links in a page; its capture groups yield the paths to queue.
	UrlRe *regexp.Regexp
	// KeyRe is matched against each page body to count keyword occurrences.
	KeyRe *regexp.Regexp
	// QUrl is the queue of paths waiting to be fetched.
	QUrl chan string
	// MarkUrl records the paths that have already been queued.
	MarkUrl map[string]bool
}
// NewFetch builds a Fetch that crawls host starting from the path startUrl and
// counts matches of the regular expression key on each page.
//
// startUrl is queued as the first path to fetch and is also recorded in
// MarkUrl, so a page that links back to it does not queue it a second time.
// key is compiled with regexp.MustCompile, so an invalid pattern panics.
func NewFetch(host, startUrl, key string) *Fetch {
	// Only site-relative links made of word characters, dots and slashes are
	// followed; the capture group is the path.
	const linkPattern = `href="(/[./\w]*)"`

	queue := make(chan string, 1000)
	queue <- startUrl

	return &Fetch{
		Host:    host,
		UrlRe:   regexp.MustCompile(linkPattern),
		KeyRe:   regexp.MustCompile(key),
		QUrl:    queue,
		MarkUrl: map[string]bool{startUrl: true},
	}
}
// FetchPage downloads url with an HTTP GET and returns the response body as a
// string.
//
// Any failure — a transport error, a non-2xx status, or an error while reading
// the body — is logged and reported to the caller as the empty string.
func (fc *Fetch) FetchPage(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Printf("get error: %v\n", err)
		return ""
	}
	// Deferred so the body is released on every return path below.
	defer res.Body.Close()

	// An error page is not site content: its text must not be counted as a
	// keyword hit and its links must not be crawled.
	if res.StatusCode < 200 || res.StatusCode > 299 {
		log.Printf("get %s: unexpected status %s", url, res.Status)
		return ""
	}

	page, err := ioutil.ReadAll(res.Body)
	if err != nil {
		log.Printf("%v", err)
		return ""
	}
	return string(page)
}
// markMu serializes access to Fetch.MarkUrl. Process runs ParsePage on several
// goroutines at once, and unsynchronized map access from multiple goroutines
// is a data race.
var markMu sync.Mutex

// ParsePage inspects the body of the page fetched from the path url.
//
// If KeyRe matches page at least once, the full URL and the number of matches
// are printed to standard output. Every path captured by UrlRe that has not
// been seen before is recorded in MarkUrl and sent on QUrl.
func (fc *Fetch) ParsePage(url, page string) {
	if hits := fc.KeyRe.FindAllString(page, -1); len(hits) > 0 {
		fmt.Printf("%v: %v\n", fc.GetFullUrl(url), len(hits))
	}

	for _, match := range fc.UrlRe.FindAllStringSubmatch(page, -1) {
		// match[0] is the whole matched text; only the capture groups are paths.
		for _, link := range match[1:] {
			markMu.Lock()
			seen := fc.MarkUrl[link]
			fc.MarkUrl[link] = true
			markMu.Unlock()

			// The send can block when the queue is full, so it happens
			// outside the lock to avoid stalling the other workers.
			if !seen {
				fc.QUrl <- link
			}
		}
	}
}
// Process crawls every path that arrives on QUrl, running at most n fetches
// concurrently, and terminates the program with exit status 0 once the crawl
// is complete. It never returns. Values of n below 1 are treated as 1.
//
// A single dispatcher loop (this goroutine) owns all bookkeeping: it drains
// QUrl into a local backlog, starts workers while fewer than n are running,
// and counts them down as they finish. Keeping the counter in one goroutine
// means the "nothing left to do" check cannot race with a worker that has
// taken a path but not yet been counted as running.
func (fc *Fetch) Process(n int) {
	if n < 1 {
		n = 1
	}

	done := make(chan struct{}) // one signal per finished worker
	var pending []string        // paths taken off QUrl, waiting for a free worker
	active := 0                 // number of running workers; touched only here

	for {
		// Hand backlog entries to new workers until the limit is reached.
		for active < n && len(pending) > 0 {
			path := pending[0]
			pending = pending[1:]
			active++
			go func(path string) {
				defer func() { done <- struct{}{} }()
				fc.ParsePage(path, fc.FetchPage(fc.GetFullUrl(path)))
			}(path)
		}

		// active == 0 implies the backlog is empty (the loop above would have
		// started a worker otherwise). Within this file only ParsePage, run
		// by a worker, sends on QUrl, so with no workers running an empty
		// queue stays empty and the crawl is finished.
		if active == 0 && len(fc.QUrl) == 0 {
			os.Exit(0)
		}

		// Always keep receiving from QUrl, even at the worker limit, so that
		// workers sending newly found paths are never blocked by a full queue.
		select {
		case path := <-fc.QUrl:
			pending = append(pending, path)
		case <-done:
			active--
		}
	}
}
// GetFullUrl returns the absolute http URL formed by prefixing url, a path,
// with the scheme and fc.Host.
func (fc *Fetch) GetFullUrl(url string) string {
	const scheme = "http://"
	base := scheme + fc.Host
	return base + url
}
// main crawls docs.qiniutek.com starting at /v3/api/ with five concurrent
// fetches and reports every page that mentions dev.qiniutek.com.
func main() {
	const (
		host      = "docs.qiniutek.com"
		startPath = "/v3/api/"
		keyword   = "dev.qiniutek.com"
	)

	// NewFetch compiles its key as a regular expression. Quoting the keyword
	// makes its dots match literally instead of matching any character.
	fc := NewFetch(host, startPath, regexp.QuoteMeta(keyword))
	fc.Process(5)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment