Skip to content

Instantly share code, notes, and snippets.

@Version2beta
Last active August 30, 2021 05:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Version2beta/190378e9dd41642dd911b8e112b05700 to your computer and use it in GitHub Desktop.
Save Version2beta/190378e9dd41642dd911b8e112b05700 to your computer and use it in GitHub Desktop.
A Tour of Go Exercise: Web Crawler
package main
import (
"fmt"
"math/rand"
"sync"
"time"
)
// logging enables the progress messages written by log.
const logging = true

// Fetcher retrieves a single page for the crawler.
type Fetcher interface {
	// Fetch returns the body of the page at url together with the URLs the
	// crawler should follow next, or a non-nil error if the page could not
	// be retrieved.
	Fetch(url string) (body string, urls []string, err error)
}

// Fetched holds the state of one crawl: the pages retrieved so far, the URLs
// that failed, and the bookkeeping needed to run many Fetch calls
// concurrently. It contains a mutex and a WaitGroup, so it must be used
// through a pointer and never copied.
type Fetched struct {
	// Pages maps each successfully fetched URL to its body.
	Pages map[string]string
	// Broken maps each URL that failed to fetch to the error returned for it.
	Broken map[string]error
	// Fetcher retrieves the pages; it must be set before Fetch is called.
	Fetcher Fetcher

	// mu guards Pages, Broken, numCrawlers, maxConcurrency and claimed.
	mu sync.Mutex
	// numCrawlers is the number of Fetch calls currently running.
	numCrawlers int
	// maxConcurrency is the highest value numCrawlers has reached.
	maxConcurrency int
	// claimed records every URL some Fetch call has already taken, including
	// URLs whose fetch is still in flight.
	claimed map[string]bool
	// wg counts the goroutines started by Fetch so Wait can block on them.
	wg sync.WaitGroup
}

// log writes s to standard error (via the println builtin) when logging is
// enabled.
func log(s string) {
	if logging {
		println(s)
	}
}

// Inc records that one more crawler is running, updates the concurrency
// high-water mark, and returns the new number of running crawlers.
func (f *Fetched) Inc() int {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.numCrawlers++
	if f.numCrawlers > f.maxConcurrency {
		f.maxConcurrency = f.numCrawlers
	}
	// The result is read while the lock is still held so it cannot race with
	// a concurrent Inc or Dec.
	return f.numCrawlers
}

// Dec records that one crawler has finished and returns the number of
// crawlers still running.
func (f *Fetched) Dec() int {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.numCrawlers--
	return f.numCrawlers
}

// Update stores the outcome of fetching url. A non-nil err files the URL
// under Broken; otherwise body is stored under Pages, even when it is empty.
// Nil maps are allocated on first use.
func (f *Fetched) Update(url string, body string, err error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	if err != nil {
		if f.Broken == nil {
			f.Broken = make(map[string]error)
		}
		f.Broken[url] = err
		return
	}
	if f.Pages == nil {
		f.Pages = make(map[string]string)
	}
	f.Pages[url] = body
}

// claim reports whether the caller is the first to take url. It returns false
// when the URL was already claimed or already has an entry in Pages or Broken.
func (f *Fetched) claim(url string) bool {
	f.mu.Lock()
	defer f.mu.Unlock()
	if f.claimed[url] {
		return false
	}
	if _, done := f.Pages[url]; done {
		return false
	}
	if _, done := f.Broken[url]; done {
		return false
	}
	if f.claimed == nil {
		f.claimed = make(map[string]bool)
	}
	f.claimed[url] = true
	return true
}

// Fetch retrieves url unless it has already been claimed, records the result,
// and starts one goroutine per returned link to fetch it with depth-1.
// Nothing is fetched once depth has dropped to zero or below.
//
// Fetch returns as soon as the child goroutines have been started. Call Fetch
// synchronously and then Wait to block until the whole crawl is finished.
func (f *Fetched) Fetch(url string, depth int) {
	myCrawlerNum := f.Inc()
	defer f.Dec()

	if depth <= 0 {
		return
	}
	if !f.claim(url) {
		log(fmt.Sprintf("[%v] Skipped %s", myCrawlerNum, url))
		return
	}

	body, urls, err := f.Fetcher.Fetch(url)
	f.Update(url, body, err)
	for _, u := range urls {
		// Add runs before the goroutine starts, so the WaitGroup counter can
		// never be observed at zero while links are still pending.
		f.wg.Add(1)
		go func(link string) {
			defer f.wg.Done()
			f.Fetch(link, depth-1)
		}(u)
	}
	log(fmt.Sprintf("[%v] Fetched %s", myCrawlerNum, url))
}

// Wait blocks until every goroutine started by Fetch has finished.
func (f *Fetched) Wait() {
	f.wg.Wait()
}

// Report prints the fetched pages, the broken URLs and the peak number of
// concurrent crawlers to standard output. Entries within each section appear
// in map iteration order, which is unspecified.
func (f *Fetched) Report() {
	f.mu.Lock()
	defer f.mu.Unlock()
	fmt.Println("\n# Found:")
	for k, v := range f.Pages {
		fmt.Printf("- %s: %s\n", k, v)
	}
	fmt.Println("\n# Not found:")
	for k, v := range f.Broken {
		fmt.Printf("- %s: %v\n", k, v)
	}
	fmt.Printf("\nConcurrency: %v", f.maxConcurrency)
}

// Crawl fetches url and the pages reachable from it through fetcher, down to
// the given depth, waits for every fetch to finish, and prints a report.
func Crawl(url string, depth int, fetcher Fetcher) {
	f := &Fetched{
		Fetcher: fetcher,
		Pages:   make(map[string]string),
		Broken:  make(map[string]error),
	}
	f.Fetch(url, depth)
	f.Wait()
	f.Report()
}
// main runs Crawl against the canned fetcher, starting at the golang.org
// root URL and passing a depth of 4.
func main() {
	const (
		startURL = "https://golang.org/"
		maxDepth = 4
	)
	Crawl(startURL, maxDepth, fetcher)
}
// fakeFetcher is a Fetcher that returns canned results, looked up by URL.
type fakeFetcher map[string]*fakeResult
// fakeResult is the canned response fakeFetcher returns for one URL.
type fakeResult struct {
// body is returned as the page body.
body string
// urls is returned as the list of URLs for the page.
urls []string
}
// Fetch sleeps for a random 80-179ms and then returns the canned body and
// URLs stored under url, or a "not found" error when url has no entry.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	delay := time.Duration(80+rand.Intn(100)) * time.Millisecond
	time.Sleep(delay)

	res, found := f[url]
	if !found {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return res.body, res.urls, nil
}
// fetcher is the populated fakeFetcher used by main: a small canned copy of
// part of the golang.org link graph.
var fetcher = fakeFetcher{
	"https://golang.org/": {
		body: "The Go Programming Language",
		urls: []string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": {
		body: "Packages",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": {
		body: "Package fmt",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": {
		body: "Package os",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment