Skip to content

Instantly share code, notes, and snippets.

Last active June 3, 2017 14:56
Show Gist options
  • Save technoweenie/5078118 to your computer and use it in GitHub Desktop.
Save technoweenie/5078118 to your computer and use it in GitHub Desktop.
go web crawler:
package main
import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"time"

	"github.com/moovweb/gokogiri"
)
// Fetcher is the page-retrieval dependency used by Crawl.
type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, finished chan bool) {
if depth <= 0 {
finished <- false
_, urls, err := fetcher.Fetch(url)
if err != nil {
finished <- false
urlCount := len(urls)
if urlCount > 0 {
fmt.Printf("found: %s %d in depth: %d\n", url, urlCount, depth)
innerFinished := make(chan bool)
for _, u := range urls {
go Crawl(u, depth-1, fetcher, innerFinished)
for i := 0; i < urlCount; i += 1 {
finished <- true
func main() {
fetcher := new(urlFetcher)
t0 := time.Now()
finished := make(chan bool)
go Crawl("", 3, fetcher, finished)
t1 := time.Now()
fmt.Printf("%v\n", t1.Sub(t0))
// urlFetcher maps a URL to an already-known result. Fetch consults this map
// first and only scans the live page when the URL has no entry.
type urlFetcher map[string]*fetcherResult

// fetcherResult is the stored outcome for one URL: the page body and the
// URLs associated with that page.
type fetcherResult struct {
	body string
	urls []string
}
func (f *urlFetcher) Fetch(url string) (string, []string, error) {
if res, ok := (*f)[url]; ok {
return res.body, res.urls, nil
return "body", f.scanForUrls(url), nil
func (f *urlFetcher) scanForUrls(url string) []string {
body, _ :=
urlStrings := f.urlsInBody(body)
return f.validUrls(url, urlStrings)
func (f *urlFetcher) validUrls(parentUrl string, urls []string) []string {
parent, _ := url.Parse(parentUrl)
validUrls := make([]string, len(urls))
validUrlCount := 0
for _, urlString := range urls {
uri, err := url.Parse(urlString)
if err == nil && (uri.Scheme == "http" || uri.Scheme == "https") {
validUrls[validUrlCount] = parent.ResolveReference(uri).String()
validUrlCount += 1
return validUrls[:validUrlCount]
func (f *urlFetcher) urlsInBody(body []byte) []string {
doc, _ := gokogiri.ParseHtml(body)
defer doc.Free()
nodes, _ := doc.Search("//a/@href")
urls := make([]string, len(nodes))
for n, node := range nodes {
urls[n] = node.String()
return urls
func (f *urlFetcher) download(url string) ([]byte, error) {
var resp, err = http.Get(url)
if err != nil {
return nil, err
defer resp.Body.Close()
body, err2 := ioutil.ReadAll(resp.Body)
return body, err2
Copy link

I ran:
go get
and then it showed me this:

/tmp/go-build806676929/ In function `removeDefaultNamespace': undefined reference to `xmlFirstElementChild' undefined reference to `xmlNextElementSibling'
collect2: ld

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment