Last active
April 25, 2016 04:58
-
-
Save alexhsamuel/5cdc10b68a06596a835c88e6c293e7cc to your computer and use it in GitHub Desktop.
"Web Crawler" exercise from A Tour of Go (https://tour.golang.org/concurrency/10)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main

import (
	"fmt"
	"sync"
)
// Fetcher abstracts page retrieval so the crawler can run against either
// a real HTTP client or the canned fakeFetcher used by this demo.
type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}
type context struct { | |
fetcher Fetcher | |
// Channel to count crawl goroutines in process: +1 when starting, -1 when ending. | |
inProgress chan int | |
// Set of crawled URLs, or rather, URLs, we're about to start to call. | |
urls map[string]bool | |
// Protects `urls`. | |
mutex sync.Mutex | |
} | |
// Checks if `url` should be crawled; if so, marks it as being crawled. | |
func (ctx context) check(url string) bool { | |
ctx.mutex.Lock() | |
defer ctx.mutex.Unlock() | |
if _, ok := ctx.urls[url]; ok { | |
return false | |
} else { | |
ctx.urls[url] = true | |
return true | |
} | |
} | |
func crawl(url string, depth int, ctx context) { | |
// Fetch this page. | |
body, urls, err := ctx.fetcher.Fetch(url) | |
if err != nil { | |
fmt.Println(err) | |
return | |
} | |
fmt.Printf("loaded: %s %q\n", url, body) | |
// Start crawlers for child pages. | |
if depth > 0 { | |
for _, u := range urls { | |
goCrawl(u, depth - 1, ctx) | |
} | |
} | |
} | |
func goCrawl(url string, depth int, ctx context) { | |
if ctx.check(url) { | |
ctx.inProgress <- 1 | |
go func(){ | |
defer func(){ ctx.inProgress <- -1 }() | |
crawl(url, depth, ctx) | |
}() | |
} | |
} | |
func Crawl(url string, depth int) { | |
if depth == 0 { return } | |
ctx := context{fetcher, make(chan int, 64), make(map[string]bool), sync.Mutex{}} | |
goCrawl(url, depth, ctx) | |
// Keep going until no more crawls are in progress. | |
for num := <-ctx.inProgress; num > 0; num += <-ctx.inProgress {} | |
} | |
func main() { | |
Crawl("http://golang.org/", 4) | |
} | |
// fakeFetcher is a Fetcher that serves canned results out of a map.
type fakeFetcher map[string]*fakeResult

// fakeResult is one canned page: its body text and outgoing links.
type fakeResult struct {
	body string
	urls []string
}

// Fetch returns the canned result for url, or an error for unknown URLs.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	res, ok := f[url]
	if !ok {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return res.body, res.urls, nil
}
// fetcher is a populated fakeFetcher. | |
var fetcher = fakeFetcher{ | |
"http://golang.org/": &fakeResult{ | |
"The Go Programming Language", | |
[]string{ | |
"http://golang.org/pkg/", | |
"http://golang.org/cmd/", | |
}, | |
}, | |
"http://golang.org/pkg/": &fakeResult{ | |
"Packages", | |
[]string{ | |
"http://golang.org/", | |
"http://golang.org/cmd/", | |
"http://golang.org/pkg/fmt/", | |
"http://golang.org/pkg/os/", | |
}, | |
}, | |
"http://golang.org/pkg/fmt/": &fakeResult{ | |
"Package fmt", | |
[]string{ | |
"http://golang.org/", | |
"http://golang.org/pkg/", | |
}, | |
}, | |
"http://golang.org/pkg/os/": &fakeResult{ | |
"Package os", | |
[]string{ | |
"http://golang.org/", | |
"http://golang.org/pkg/", | |
}, | |
}, | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment