go implementation of a page scraper
/**
 * Requires this third-party package:
 *   go get github.com/moovweb/gokogiri
 */
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"strings"

	"github.com/moovweb/gokogiri"
)
// Page holds a fetched URL and its raw body.
type Page struct {
	url  string
	body []byte
}

// Fetcher fetches a page and reports the links found on it.
type Fetcher interface {
	// Fetch returns the body of url and a map of URLs found on that page.
	Fetch(url string) (body []byte, urls map[int]string, err error)
}
// Crawl uses fetcher to recursively crawl pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
	// TODO: Fetch URLs in parallel.
	// TODO: Don't fetch the same URL twice.
	// This implementation doesn't do either.
	if depth <= 0 {
		return
	}
	_, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s\n", url)
	for _, linkURL := range urls {
		fmt.Printf("next: %s\n", linkURL)
		Crawl(linkURL, depth-1, fetcher)
	}
}
func main() {
	fetcher := &pageFetcher{}
	Crawl("http://google.com/", 4, fetcher)
}

// pageFetcher implements Fetcher using net/http and gokogiri.
type pageFetcher struct {
	results map[string]*pageResult
}

// pageResult caches the body and links of a fetched page (currently unused).
type pageResult struct {
	body string
	urls []string
}
/**
 * Fetch a page from a URL and return its body and the links found on it.
 */
func (f *pageFetcher) Fetch(url string) ([]byte, map[int]string, error) {
	res, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		log.Fatal(err)
	}
	res.Body.Close()
	// Extract the links; if parsing fails, report the URL as not found.
	list, err := f.getURLs(url, body)
	if err != nil {
		return nil, nil, fmt.Errorf("not found: %s", url)
	}
	return body, list, nil
}
/**
 * Get a list of all valid URLs from the HTML document.
 */
func (f *pageFetcher) getURLs(documentUrl string, body []byte) (map[int]string, error) {
	// Parse the current page's URL first so a base href can be derived
	// from it for relative links.
	urlObject, err := url.Parse(documentUrl)
	if err != nil {
		return nil, err
	}
	baseHref := urlObject.Scheme + "://" + urlObject.Host + "/"
	doc, err := gokogiri.ParseHtml(body)
	if err != nil {
		return nil, err
	}
	defer doc.Free()
	urls, err := doc.Root().Search("//a/@href")
	if err != nil {
		return nil, err
	}
	basehrefs, err := doc.Root().Search("//base/@href")
	if err != nil {
		return nil, err
	}
	// A <base href="..."> tag in the document overrides the derived base.
	if len(basehrefs) > 0 && basehrefs[0].String() != "" {
		baseHref = basehrefs[0].String()
	}
	fmt.Println(baseHref)
	list := make(map[int]string)
	for idx, link := range urls {
		// Skip javascript: pseudo-links.
		if strings.HasPrefix(link.String(), "javascript") {
			continue
		}
		list[idx] = link.String()
	}
	return list, nil
}
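
A note on baseHref: getURLs computes it (and honours any <base href> tag) but never applies it, so relative hrefs come back unresolved. Below is a minimal sketch of how they could be resolved with the standard net/url package; the resolveLink helper is hypothetical and not part of the gist.

package main

import (
	"fmt"
	"net/url"
)

// resolveLink is a hypothetical helper that resolves a possibly relative
// href against the URL of the page it was found on.
func resolveLink(pageURL, href string) (string, error) {
	base, err := url.Parse(pageURL)
	if err != nil {
		return "", err
	}
	ref, err := url.Parse(href)
	if err != nil {
		return "", err
	}
	// ResolveReference handles absolute URLs, root-relative paths and
	// plain relative paths in one call.
	return base.ResolveReference(ref).String(), nil
}

func main() {
	link, _ := resolveLink("http://google.com/intl/en/", "/policies/")
	fmt.Println(link) // http://google.com/policies/
}

Plugging something like this into getURLs before each link is stored would let the crawler follow relative links correctly.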
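The TODOs in Crawl note that it fetches sequentially and may hit the same URL twice. One way to address both, sketched under the assumption that the Fetcher interface above stays as-is, is a mutex-guarded visited map plus a sync.WaitGroup; the crawler type, CrawlConcurrently and fakeFetcher below are illustrative, not part of the gist.

package main

import (
	"fmt"
	"sync"
)

// Fetcher mirrors the interface used in the gist.
type Fetcher interface {
	Fetch(url string) (body []byte, urls map[int]string, err error)
}

// crawler remembers which URLs have already been fetched.
type crawler struct {
	mu      sync.Mutex
	visited map[string]bool
	wg      sync.WaitGroup
}

// crawl fetches url at most once and recurses into its links, each in its own goroutine.
func (c *crawler) crawl(url string, depth int, fetcher Fetcher) {
	defer c.wg.Done()
	if depth <= 0 {
		return
	}
	c.mu.Lock()
	if c.visited[url] {
		c.mu.Unlock()
		return
	}
	c.visited[url] = true
	c.mu.Unlock()
	_, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s\n", url)
	for _, linkURL := range urls {
		c.wg.Add(1)
		go c.crawl(linkURL, depth-1, fetcher)
	}
}

// CrawlConcurrently is a hypothetical replacement entry point for Crawl.
func CrawlConcurrently(url string, depth int, fetcher Fetcher) {
	c := &crawler{visited: make(map[string]bool)}
	c.wg.Add(1)
	go c.crawl(url, depth, fetcher)
	c.wg.Wait()
}

// fakeFetcher is a stand-in for pageFetcher so the sketch runs offline.
type fakeFetcher map[string][]string

func (f fakeFetcher) Fetch(url string) ([]byte, map[int]string, error) {
	links, ok := f[url]
	if !ok {
		return nil, nil, fmt.Errorf("not found: %s", url)
	}
	out := make(map[int]string)
	for i, l := range links {
		out[i] = l
	}
	return []byte("body"), out, nil
}

func main() {
	fetcher := fakeFetcher{
		"http://example.com/":  {"http://example.com/a", "http://example.com/b"},
		"http://example.com/a": {"http://example.com/"},
	}
	CrawlConcurrently("http://example.com/", 3, fetcher)
}

A channel-based design would work just as well here; the mutex-guarded map simply keeps the sketch short.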