Created
March 18, 2013 04:45
-
-
Save jtmcdole/5185145 to your computer and use it in GitHub Desktop.
Falling in love with Go! Chapter 70 of tour.golang.org asks you to re-write their crawler to use goroutines and to skip already-seen URLs. Here's my simple solution. Note: according to the documentation I read, the Go runtime will schedule threads accordingly. One could simply take the incoming URLs and queue them up for a smaller number of ca…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
) | |
// Fetcher abstracts page retrieval so Crawl can run against any
// source of pages (here, the canned fakeFetcher defined below).
type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher with goroutines to scour for | |
// more urls while skipping repeats | |
func Crawl(url string, fetcher Fetcher) { | |
crawler := func(url string, ch chan string) { | |
body, urls, err := fetcher.Fetch(url) | |
if err != nil { | |
fmt.Println(err) | |
// treat errors as done. | |
ch <- "" | |
return | |
} | |
fmt.Printf("found: %s %q\n", url, body) | |
for i := range urls { | |
ch <- urls[i] | |
} | |
ch <- "" | |
} | |
// map of already seen URLs | |
seenUrls := map[string]bool{url: true} | |
// Number of crawler threads in existence | |
crawlers := 1 | |
// Channel for all crawler threads to communicate back on; | |
// and empty string means that one has ended; who cares which. | |
c := make(chan string) | |
seenMultiple := 0 | |
go crawler(url, c) | |
for crawlers > 0 { | |
foundUrl := <-c | |
if "" == foundUrl { | |
crawlers-- | |
continue | |
} | |
if _, seen := seenUrls[foundUrl]; !seen { | |
seenUrls[foundUrl] = true | |
crawlers++ | |
go crawler(foundUrl, c) | |
} else { | |
seenMultiple++ | |
} | |
} | |
fmt.Println("seenUrls: ", seenUrls, " Repeats: ", seenMultiple) | |
return | |
} | |
// main crawls the canned fake site rooted at http://golang.org/.
func main() {
	Crawl("http://golang.org/", fetcher)
}
// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

// fakeResult is one canned page: its body text and outgoing links.
type fakeResult struct {
	body string
	urls []string
}

// Fetch returns the canned body and link list for url, or an error
// when url has no entry. A value receiver suffices here — a map is a
// reference type, so no data is copied and the explicit (*f)[url]
// indirection of a pointer receiver is unnecessary; *fakeFetcher
// still satisfies Fetcher via the value method.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	res, ok := f[url]
	if !ok {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return res.body, res.urls, nil
}
// fetcher is a populated fakeFetcher.
// It models four golang.org pages that link to one another, so a
// crawl starting at the root terminates. Note http://golang.org/cmd/
// is linked but has no entry, so fetching it exercises the error path.
var fetcher = &fakeFetcher{
	"http://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"http://golang.org/pkg/",
			"http://golang.org/cmd/",
		},
	},
	"http://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"http://golang.org/",
			"http://golang.org/cmd/",
			"http://golang.org/pkg/fmt/",
			"http://golang.org/pkg/os/",
		},
	},
	"http://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"http://golang.org/",
			"http://golang.org/pkg/",
		},
	},
	"http://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"http://golang.org/",
			"http://golang.org/pkg/",
		},
	},
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment