Skip to content

Instantly share code, notes, and snippets.

@curbol
Created June 14, 2023 23:46
Show Gist options
  • Save curbol/2e9cb7b4cc0711ad6ec2678e80fec538 to your computer and use it in GitHub Desktop.
Save curbol/2e9cb7b4cc0711ad6ec2678e80fec538 to your computer and use it in GitHub Desktop.
A Tour of Go - Exercise: Web Crawler
package main
import (
"fmt"
"sync"
)
// Fetcher is the page source used by Crawl and ReadUrl. The only
// implementation in this file is fakeFetcher, which serves canned pages.
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
// In this file a non-nil err means the page was not available
// (fakeFetcher returns one for URLs it has no entry for).
Fetch(url string) (body string, urls []string, err error)
}
// crawlResponse is the result of one Fetch call, sent by ReadUrl over a
// channel to the loop in Crawl. ReadUrl builds it positionally, so the
// field order must not change.
type crawlResponse struct {
// url is the URL that was fetched.
url string
// depth is the depth value ReadUrl was called with; Crawl follows the
// links in urls only when it is greater than zero.
depth int
// body is the page body returned by Fetch.
body string
// urls are the links returned by Fetch for this page.
urls []string
// err is the error returned by Fetch, or nil.
err error
}
// crawlState records which URLs a single Crawl call has already scheduled
// for fetching. It contains a sync.Mutex and therefore must not be copied
// after first use.
type crawlState struct {
// mu is held by Crawl while it records a URL in found.
mu sync.Mutex
// found is used as a set: Crawl only checks whether a key is present and
// always stores true as the value.
found map[string]any
}
// Crawl uses fetcher to crawl pages concurrently, starting with url.
//
// Every page is fetched in its own goroutine (see ReadUrl) and reported
// back over a channel; a single consumer goroutine prints the results and
// schedules further fetches. url itself is always fetched. Links on a page
// are followed only if that page was fetched with a remaining depth greater
// than zero, and each distinct URL is fetched at most once.
//
// For every successfully fetched page a line "found: <url> <quoted body>"
// is printed; for every failed fetch only the error is printed. The order
// of output lines is not deterministic. Crawl returns once every scheduled
// fetch has been processed.
func Crawl(url string, depth int, fetcher Fetcher) {
	ch := make(chan crawlResponse, 10)
	state := crawlState{
		found: map[string]any{url: true},
	}

	// wg counts fetches that have been started but whose response has not
	// been fully processed yet.
	var wg sync.WaitGroup

	// process handles a single response. It calls wg.Add for every fetch it
	// starts; the caller calls wg.Done for r only afterwards, so the counter
	// cannot reach zero while work is still being scheduled.
	process := func(r crawlResponse) {
		if r.err != nil {
			// A failed fetch has no body and no links: report the error
			// and do not print a "found" line for it.
			fmt.Println(r.err)
			return
		}
		fmt.Printf("found: %s %q\n", r.url, r.body)
		if r.depth <= 0 {
			return
		}
		for _, u := range r.urls {
			// Check and mark under one lock so the read and the write of
			// the set are guarded consistently.
			state.mu.Lock()
			_, seen := state.found[u]
			if !seen {
				state.found[u] = true
			}
			state.mu.Unlock()
			if seen {
				continue
			}
			wg.Add(1)
			go ReadUrl(u, r.depth-1, fetcher, ch)
		}
	}

	wg.Add(1)
	go ReadUrl(url, depth, fetcher, ch)

	// Consumer: exits when ch is closed below.
	go func() {
		for r := range ch {
			process(r)
			wg.Done()
		}
	}()

	wg.Wait()
	// All senders have finished (each response was received and processed),
	// so closing here cannot race with a send.
	close(ch)
}
// ReadUrl fetches url with fetcher and sends the outcome on ch as a
// crawlResponse tagged with url and depth. The error from Fetch is not
// handled here; it is forwarded in the response. The call blocks until the
// send on ch completes.
func ReadUrl(url string, depth int, fetcher Fetcher, ch chan crawlResponse) {
	var resp crawlResponse
	resp.url, resp.depth = url, depth
	resp.body, resp.urls, resp.err = fetcher.Fetch(url)
	ch <- resp
}
// main crawls the canned site served by fetcher, starting at the root page
// with a depth of 4.
func main() {
	const (
		startURL = "https://golang.org/"
		maxDepth = 4
	)
	Crawl(startURL, maxDepth, fetcher)
}
// fakeFetcher is a Fetcher backed by an in-memory table of canned pages,
// keyed by URL.
type fakeFetcher map[string]*fakeResult

// fakeResult is one canned page: its body and the URLs it links to.
type fakeResult struct {
	body string
	urls []string
}

// Fetch looks url up in the table. It returns the stored body and links
// with a nil error when url has an entry, and an empty body, nil links and
// a "not found: <url>" error otherwise.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	page, known := f[url]
	if !known {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return page.body, page.urls, nil
}
// fetcher is a populated fakeFetcher: a small site of four pages whose
// links point back at each other. "https://golang.org/cmd/" is linked to
// but has no entry of its own, so fetching it returns an error.
var fetcher = fakeFetcher{
	"https://golang.org/": {
		body: "The Go Programming Language",
		urls: []string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": {
		body: "Packages",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": {
		body: "Package fmt",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": {
		body: "Package os",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}
@curbol
Copy link
Author

curbol commented Jun 14, 2023

The exercise can be found in A Tour of Go: https://go.dev/tour/concurrency/10

My goals were to leave the original function signatures and the main() function unchanged, and to avoid recursion.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment