@aarti · Last active December 31, 2016 07:47

Golang Tour Exercise: parallelize a web crawler. This implementation uses an HTML parser to scan hrefs.

package main

import (
    "fmt"
    "log"
    "net/http"

    "golang.org/x/net/html"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl fetches url and sends the links found there, along with the
// remaining depth, on the matched channel. Every call sends exactly
// one result, which is what lets main count pending fetches.
func Crawl(url string, depth int, fetcher Fetcher) {
    _, urls, err := fetcher.Fetch(url)
    if err != nil {
        log.Println(err)
    }
    matched <- result{depth: depth - 1, urls: urls}
}

var completed = map[string]bool{}    // URLs already seen; accessed only from main
var matched = make(chan result, 100) // buffered so main's first, synchronous Crawl can send before anyone receives
var fetcher = urlFetcher{}           // live HTTP fetcher

// result carries the links found on one page and the depth remaining
// for crawling them.
type result struct {
    depth int
    urls  []string
}

func main() {
    url := "https://www.facebook.com/"
    //url := "http://golang.org/"
    completed[url] = true
    Crawl(url, 2, fetcher) // synchronous; the channel buffer absorbs its result
    // remaining counts Crawl calls whose result has not been received
    // yet; each result may spawn more crawls, so the loop runs until
    // every in-flight fetch is accounted for.
    remaining := 1
    for remaining > 0 {
        s := <-matched
        remaining--
        for _, u := range s.urls {
            if completed[u] {
                continue // already seen this URL
            }
            completed[u] = true
            if s.depth <= 0 {
                continue // record the URL but don't crawl past max depth
            }
            remaining++
            go Crawl(u, s.depth, fetcher)
        }
    }
    fmt.Println("Crawled URLs", completed, len(completed))
}

// urlFetcher fetches pages over live HTTP. The map is never written
// in this implementation, so it serves as a receiver for Fetch rather
// than a cache.
type urlFetcher map[string]*urlResult

type urlResult struct {
    body string
    urls []string
}

func (u urlFetcher) Fetch(url string) (string, []string, error) {
    res, err := http.Get(url)
    if err != nil {
        return "", nil, fmt.Errorf("fetch failed %s: %v", url, err)
    }
    defer res.Body.Close()
    doc, err := html.Parse(res.Body)
    if err != nil {
        return "", nil, err
    }
    urls := []string{}
    // Walk the parse tree depth-first and collect the first usable
    // href on each anchor element.
    var f func(*html.Node)
    f = func(n *html.Node) {
        if n.Type == html.ElementNode && n.Data == "a" {
            for _, a := range n.Attr {
                // Skip empty or single-character values, dot-relative
                // paths, and same-page fragments.
                if a.Key == "href" && len(a.Val) > 1 && a.Val[0:1] != "." && a.Val[0:1] != "#" {
                    if a.Val[0:1] == "/" {
                        // Root-relative link: naive join with the page
                        // URL (see the resolveLink sketch below).
                        urls = append(urls, url+a.Val)
                    } else {
                        urls = append(urls, a.Val)
                    }
                    break
                }
            }
        }
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            f(c)
        }
    }
    f(doc)
    return "", urls, nil
}
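
The naive url+a.Val join above mishandles trailing slashes (e.g. "https://www.facebook.com/" + "/about" yields a double slash) and cannot cope with paths like "../about". Below is a minimal sketch of stricter resolution using the standard net/url package; the resolveLink helper is an illustration, not part of the crawler above.

package main

import (
    "fmt"
    "net/url"
)

// resolveLink interprets href relative to the page it was found on,
// so root-relative, relative, and ".."-style links all resolve per
// RFC 3986.
func resolveLink(base, href string) (string, error) {
    b, err := url.Parse(base)
    if err != nil {
        return "", err
    }
    h, err := url.Parse(href)
    if err != nil {
        return "", err
    }
    return b.ResolveReference(h).String(), nil
}

func main() {
    link, _ := resolveLink("https://www.facebook.com/help/", "../about")
    fmt.Println(link) // https://www.facebook.com/about
}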
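
One more caveat: go Crawl is launched for every new link, so a deep crawl can open an unbounded number of connections at once. A common refinement is a buffered channel used as a counting semaphore; this rough sketch (with an arbitrary limit of 10) shows the pattern in isolation.

package main

import (
    "fmt"
    "net/http"
    "sync"
)

// sem is a counting semaphore: a send acquires one of 10 slots, a
// receive releases it.
var sem = make(chan struct{}, 10)

func fetch(url string, wg *sync.WaitGroup) {
    defer wg.Done()
    sem <- struct{}{}        // acquire; blocks once 10 fetches are in flight
    defer func() { <-sem }() // release when the fetch finishes
    res, err := http.Get(url)
    if err != nil {
        fmt.Println("fetch failed:", url, err)
        return
    }
    res.Body.Close()
    fmt.Println(res.Status, url)
}

func main() {
    var wg sync.WaitGroup
    for _, u := range []string{"http://golang.org/", "https://www.facebook.com/"} {
        wg.Add(1)
        go fetch(u, &wg)
    }
    wg.Wait()
}

The crawler itself has a single non-standard dependency, installable with go get golang.org/x/net/html.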