Go implementation of a page scraper
// Requires this third-party package:
//
//	go get github.com/moovweb/gokogiri
//
// gokogiri wraps libxml2 via cgo, so the libxml2 development headers
// must also be installed.
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"strings"

	"github.com/moovweb/gokogiri"
)
type Page struct {
	url  string
	body []byte
}

type Fetcher interface {
	// Fetch returns the body of URL and a map of URLs found on that page.
	Fetch(url string) (body []byte, urls map[int]string, err error)
}
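// fakeFetcher is a minimal in-memory Fetcher, sketched here (it is not
// part of the original gist) so Crawl can be exercised without network
// access; the type name and any data fed into it are illustrative.
type fakeFetcher map[string]map[int]string

func (f fakeFetcher) Fetch(url string) ([]byte, map[int]string, error) {
	if urls, ok := f[url]; ok {
		return []byte("body of " + url), urls, nil
	}
	return nil, nil, fmt.Errorf("not found: %s", url)
}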
// Crawl uses fetcher to recursively crawl pages starting with url, to a
// maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
	// TODO: Fetch URLs in parallel.
	// TODO: Don't fetch the same URL twice.
	// This implementation does neither; see crawlConcurrent below for a
	// sketch that does both.
	if depth <= 0 {
		return
	}
	_, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s\n", url)
	for _, linkURL := range urls {
		fmt.Printf("next: %s\n", linkURL)
		Crawl(linkURL, depth-1, fetcher)
	}
}
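// crawlConcurrent is a sketch (not part of the original gist) addressing
// both TODOs above: pages are fetched in parallel goroutines, and a seen
// map skips URLs that have already been requested. Coordination uses only
// channels, so the seen map is touched by a single goroutine and needs no
// locking.
func crawlConcurrent(startURL string, depth int, fetcher Fetcher) {
	if depth <= 0 {
		return
	}
	type result struct {
		urls  map[int]string
		depth int
	}
	seen := map[string]bool{startURL: true}
	results := make(chan result)
	fetch := func(u string, d int) {
		_, urls, err := fetcher.Fetch(u)
		if err != nil {
			fmt.Println(err)
			results <- result{nil, d}
			return
		}
		fmt.Printf("found: %s\n", u)
		results <- result{urls, d}
	}
	go fetch(startURL, depth)
	// Every spawned fetch sends exactly one result, so loop until all
	// in-flight fetches have reported back.
	for inFlight := 1; inFlight > 0; {
		res := <-results
		inFlight--
		if res.depth <= 1 {
			continue
		}
		for _, u := range res.urls {
			if seen[u] {
				continue
			}
			seen[u] = true
			inFlight++
			go fetch(u, res.depth-1)
		}
	}
}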
func main() {
	fetcher := &pageFetcher{}
	Crawl("http://google.com/", 4, fetcher)
}
type pageFetcher struct {
	// results caches fetched pages per URL (see fetchCached below).
	results map[string]*pageResult
}

type pageResult struct {
	body string
	urls []string
}
// Fetch downloads the page at url and returns its body together with the
// URLs found on it.
func (f *pageFetcher) Fetch(url string) ([]byte, map[int]string, error) {
	res, err := http.Get(url)
	if err != nil {
		return nil, nil, err
	}
	defer res.Body.Close()
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, nil, err
	}
	list, err := f.getURLs(url, body)
	if err != nil {
		return nil, nil, err
	}
	return body, list, nil
}
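// fetchCached is a sketch (the name and caching policy are introduced
// here, not in the original gist) that uses the results map declared on
// pageFetcher so the same URL is only fetched once, per the second TODO
// in Crawl. pageResult stores urls as []string, so the map keys from
// Fetch are not preserved across the cache; that is harmless here since
// the crawler only ranges over the values.
func (f *pageFetcher) fetchCached(url string) ([]byte, map[int]string, error) {
	if cached, ok := f.results[url]; ok {
		urls := make(map[int]string, len(cached.urls))
		for i, u := range cached.urls {
			urls[i] = u
		}
		return []byte(cached.body), urls, nil
	}
	body, urls, err := f.Fetch(url)
	if err != nil {
		return nil, nil, err
	}
	flat := make([]string, 0, len(urls))
	for _, u := range urls {
		flat = append(flat, u)
	}
	if f.results == nil {
		f.results = make(map[string]*pageResult)
	}
	f.results[url] = &pageResult{body: string(body), urls: flat}
	return body, urls, nil
}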
// getURLs extracts the URLs from all <a href> attributes in the HTML
// document, resolving relative links against the page's base URL.
func (f *pageFetcher) getURLs(documentUrl string, body []byte) (map[int]string, error) {
	doc, err := gokogiri.ParseHtml(body)
	if err != nil {
		return nil, err
	}
	defer doc.Free()
	urls, err := doc.Root().Search("//a/@href")
	if err != nil {
		return nil, err
	}
	basehrefs, err := doc.Root().Search("//base/@href")
	if err != nil {
		return nil, err
	}
	urlObject, err := url.Parse(documentUrl)
	if err != nil {
		return nil, err
	}
	// Derive a base href from the current page for relative links, unless
	// the document declares its own <base href>.
	baseHref := urlObject.Scheme + "://" + urlObject.Host + "/"
	if len(basehrefs) > 0 && basehrefs[0].String() != "" {
		baseHref = basehrefs[0].String()
	}
	base, err := url.Parse(baseHref)
	if err != nil {
		return nil, err
	}
	list := make(map[int]string)
	for idx, link := range urls {
		href := link.String()
		if strings.HasPrefix(href, "javascript") {
			continue
		}
		ref, err := url.Parse(href)
		if err != nil {
			continue
		}
		list[idx] = base.ResolveReference(ref).String()
	}
	return list, nil
}