Skip to content

Instantly share code, notes, and snippets.

@technoweenie
Last active June 3, 2017 14:56
Show Gist options
  • Save technoweenie/5078118 to your computer and use it in GitHub Desktop.
go web crawler: http://tour.golang.org/#70
package main
import (
"fmt"
"github.com/moovweb/gokogiri"
"io/ioutil"
"net/http"
"net/url"
"time"
)
// Fetcher abstracts page retrieval so Crawl can be driven by the live
// urlFetcher below or by an in-memory fake in tests.
type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl pages starting with url, to a
// maximum of depth. Exactly one value is sent on finished when this call
// and every crawl it spawned are done: false when depth is exhausted or
// the fetch failed, true otherwise.
func Crawl(url string, depth int, fetcher Fetcher, finished chan bool) {
	if depth <= 0 {
		finished <- false
		return
	}
	_, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		finished <- false
		return
	}
	if len(urls) > 0 {
		fmt.Printf("found: %s %d in depth: %d\n", url, len(urls), depth)
	}
	// Crawl each discovered URL concurrently, then wait for every child
	// to report before signalling our own caller.
	// NOTE(review): this spawns one unbounded goroutine per URL and never
	// deduplicates visited pages; consider a worker pool + visited set.
	innerFinished := make(chan bool)
	for _, u := range urls {
		go Crawl(u, depth-1, fetcher, innerFinished)
	}
	for range urls {
		<-innerFinished
	}
	finished <- true
}
// main crawls golang.org three levels deep and prints the elapsed time.
func main() {
	crawler := new(urlFetcher)
	started := time.Now()
	done := make(chan bool)
	go Crawl("http://golang.org/", 3, crawler, done)
	<-done
	fmt.Printf("%v\n", time.Since(started))
}
// urlFetcher implements Fetcher against the live web. The map is shaped
// as a URL -> result cache, although nothing in this file ever writes
// to it — see Fetch.
type urlFetcher map[string]*fetcherResult

// fetcherResult holds the data kept for one fetched page.
type fetcherResult struct {
	body string   // page body (the current crawler never reads it)
	urls []string // absolute http/https URLs found on the page
}
// Fetch returns the cached result for url when one exists; otherwise it
// downloads the page and extracts its links, returning the placeholder
// body "body".
// NOTE(review): nothing ever inserts into the map, so the cache branch
// can never hit as written — every visit re-fetches the page.
func (f *urlFetcher) Fetch(url string) (string, []string, error) {
	cached, hit := (*f)[url]
	if hit {
		return cached.body, cached.urls, nil
	}
	return "body", f.scanForUrls(url), nil
}
// scanForUrls downloads url and returns the absolute http/https links
// found in its anchor tags. A failed download yields no links instead of
// handing a nil body to the HTML parser (the original ignored the error).
func (f *urlFetcher) scanForUrls(url string) []string {
	body, err := f.download(url)
	if err != nil {
		return nil
	}
	return f.validUrls(url, f.urlsInBody(body))
}
// validUrls resolves each candidate in urls against parentUrl and keeps
// only those with an http or https scheme. Candidates that fail to parse
// are skipped; an unparseable parentUrl yields no URLs instead of
// dereferencing a nil *url.URL (the original ignored that parse error
// and would panic in ResolveReference).
// NOTE(review): relative links have an empty scheme and are therefore
// filtered out before resolution — confirm that is intended.
func (f *urlFetcher) validUrls(parentUrl string, urls []string) []string {
	parent, err := url.Parse(parentUrl)
	if err != nil {
		return nil
	}
	valid := make([]string, 0, len(urls))
	for _, candidate := range urls {
		uri, err := url.Parse(candidate)
		if err != nil {
			continue
		}
		if uri.Scheme == "http" || uri.Scheme == "https" {
			valid = append(valid, parent.ResolveReference(uri).String())
		}
	}
	return valid
}
// urlsInBody parses body as HTML and returns the href attribute value of
// every anchor element, in document order.
// NOTE(review): parse and search errors are silently discarded; a nil
// doc from a failed parse would panic on Free — confirm gokogiri's
// behavior for malformed input.
func (f *urlFetcher) urlsInBody(body []byte) []string {
	doc, _ := gokogiri.ParseHtml(body)
	defer doc.Free()
	nodes, _ := doc.Search("//a/@href")
	hrefs := make([]string, len(nodes))
	for i := range nodes {
		hrefs[i] = nodes[i].String()
	}
	return hrefs
}
// download GETs url and returns the raw response body.
// NOTE(review): http.Get uses the default client, which has no timeout,
// and non-2xx statuses are not treated as errors — both worth revisiting
// for a long-running crawler.
func (f *urlFetcher) download(url string) ([]byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return ioutil.ReadAll(resp.Body)
}
@syabinminmin
Copy link

I ran this:
go get github.com/moovweb/gokogiri
and then it showed me this:

github.com/moovweb/gokogiri/xml

/tmp/go-build806676929/github.com/moovweb/gokogiri/xml/_obj/helper.o: In function `removeDefaultNamespace`: github.com/moovweb/gokogiri/xml/helper.c:183: undefined reference to `xmlFirstElementChild`
github.com/moovweb/gokogiri/xml/helper.c:183: undefined reference to `xmlNextElementSibling'
collect2: ld

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment