Skip to content

Instantly share code, notes, and snippets.

@dlintw
Created October 14, 2011 06:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dlintw/1286427 to your computer and use it in GitHub Desktop.
Save dlintw/1286427 to your computer and use it in GitHub Desktop.
go tour #69 real fetcher
package main
import (
	"bytes"
	"fmt"
	"html"
	"http"
	"io/ioutil"
	"log"
	"os"
	"sync"
)
// Fetcher fetches a web page and reports the links it contains.
type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	// A non-nil err means body and urls are meaningless.
	Fetch(url string) (body string, urls []string, err os.Error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
// At most workerCnt fetches run concurrently, and Crawl returns only
// after every spawned crawl goroutine has reported completion.
func Crawl(url string, depth, workerCnt int, fetcher Fetcher) {
	reqch := make(chan int)               // goroutine-count deltas from workers
	working := make(chan bool, workerCnt) // semaphore bounding concurrent fetches
	seen := make(map[string]bool)         // URLs already visited
	var mu sync.Mutex                     // guards seen: it is shared by every crawl goroutine

	var crawl func(string, int)
	crawl = func(url string, depth int) {
		// Always report this goroutine's completion to the coordinator.
		defer func() { reqch <- -1 }()
		if depth <= 0 {
			return
		}
		// Check-and-mark must be atomic; without the mutex this is a data
		// race (concurrent map read/write) across crawl goroutines.
		mu.Lock()
		if seen[url] {
			mu.Unlock()
			return
		}
		seen[url] = true
		mu.Unlock()
		working <- true // acquire a worker slot
		defer func() { <-working }()
		body, urls, err := fetcher.Fetch(url)
		if err != nil {
			fmt.Println(err)
			return
		}
		fmt.Printf("found: %s %q\n", url, body)
		// Announce the children before spawning them, so the outstanding
		// count can never hit zero while work remains.
		reqch <- len(urls)
		for _, u := range urls {
			go crawl(u, depth-1)
		}
	}

	go crawl(url, depth)
	// Track outstanding goroutines; exit when the last one reports done.
	actsum := 1
	for diff := range reqch {
		actsum += diff
		if actsum == 0 {
			break
		}
	}
}
// main crawls the local documentation server with a single worker
// and a crawl depth of one.
func main() {
	const startURL = "http://localhost:6060"
	Crawl(startURL, 1, 1, wfetcher)
}
// webFetcher is a Fetcher that performs real HTTP requests.
// The zero value is ready to use; it carries no state.
type webFetcher struct{}
// Fetch retrieves url over HTTP and returns the page's <title> text as
// the body, the value of every href attribute found in the document,
// and any error encountered.
func (f *webFetcher) Fetch(url string) (string, []string, os.Error) {
	r, err := http.Get(url)
	if err != nil {
		return "", nil, err
	}
	defer r.Body.Close()
	if r.StatusCode != http.StatusOK {
		// err is nil here: a completed request with a bad status is not
		// a transport error, so build an explicit error for the caller
		// instead of returning nil (which looked like success).
		err = fmt.Errorf("fetch %s: unexpected status %s", url, r.Status)
		log.Println(err)
		return "", nil, err
	}
	bs, err := ioutil.ReadAll(r.Body)
	if err != nil {
		log.Println(err)
		return "", nil, err
	}
	z := html.NewTokenizer(bytes.NewBuffer(bs))
	title := ""
	inTitle := false
	urls := []string{}
loop:
	for {
		switch z.Next() {
		case html.ErrorToken:
			// End of input (or malformed HTML): stop with what we have.
			break loop
		case html.StartTagToken:
			name, hasAttr := z.TagName()
			if string(name) == "title" {
				inTitle = true
			} else if hasAttr {
				// Scan every attribute, not just the first one:
				// href is frequently not the first attribute of a tag.
				for {
					key, val, more := z.TagAttr()
					if string(key) == "href" {
						urls = append(urls, string(val))
					}
					if !more {
						break
					}
				}
			}
		case html.EndTagToken:
			name, _ := z.TagName()
			if string(name) == "title" {
				inTitle = false
			}
		case html.TextToken:
			if inTitle {
				title += string(z.Text())
			}
		}
	}
	return title, urls, nil
}
var wfetcher = &webFetcher{}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment