Skip to content

Instantly share code, notes, and snippets.

@kazu69
Last active October 20, 2015 14:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kazu69/c2851460f8463c43a716 to your computer and use it in GitHub Desktop.
Save kazu69/c2851460f8463c43a716 to your computer and use it in GitHub Desktop.
golang simple crawler
package main
import (
"crypto/tls"
"flag"
"golang.org/x/net/html"
"log"
"net/http"
"net/url"
"fmt"
"io/ioutil"
"encoding/json"
"os"
"time"
)
type (
	// Data records the outcome of fetching a single URL.
	Data struct {
		Visited bool   `json:"visited"`
		Url     string `json:"url"`
		// BUG FIX: tag was `json:status"` (missing opening quote), a
		// malformed tag that Go ignores, so Status marshaled as "Status".
		Status int `json:"status"`
	}
	// Sites maps a host name to the crawl results collected for it.
	Sites map[string][]Data
	// Visited is a set of URLs that have already been enqueued.
	Visited map[string]bool
	// Crawler embeds an HTML tokenizer and a URL so crawl and
	// URL-parsing helpers can hang off one value.
	Crawler struct {
		html.Tokenizer
		url.URL
	}
)
// Tokenizer wraps *html.Tokenizer so crawl-specific helpers
// (findLinks) can be defined on it as methods.
type Tokenizer struct {
*html.Tokenizer
}
// Package-level crawl state and command-line flags.
var(
// sites accumulates crawl results, keyed by host name.
sites Sites = Sites{}
// visited marks URLs already discovered, to avoid re-crawling them.
visited Visited = Visited{}
// crawlhost restricts recursive crawling to links on this host.
crawlhost = flag.String("host", "", "Crawling host name")
// jsonFile, when non-empty, receives the crawl log as JSON.
jsonFile = flag.String("file", "", "Output crawl log json file")
// version makes the program print its version and exit.
version = flag.Bool("version", false, "Print version and exit")
)
const(
// Version is the tool version printed by the -version flag.
Version = "0.0.1"
)
// usage writes command-line help for the crawler to stderr,
// followed by the generated flag descriptions.
func usage() {
	const banner = `
Usage of:
crawler <option> [URL]
Examples:
crawler -host github.com -file log.json https://github.com/kazu69/ https://google.com
Options
`
	fmt.Fprint(os.Stderr, banner)
	flag.PrintDefaults()
}
// main parses flags, then crawls every URL given on the command
// line and optionally dumps the collected results as JSON.
func main() {
	flag.Usage = usage
	flag.Parse()
	args := flag.Args()

	// Handle -version before doing any other work (the original
	// built a Crawler first, which was wasted effort here).
	if *version {
		fmt.Println(Version)
		return
	}

	// Show usage and exit non-zero when no URLs were supplied.
	if len(args) == 0 {
		flag.Usage()
		os.Exit(1)
	}

	crawler := Crawler{html.Tokenizer{}, url.URL{}}
	// Renamed loop variable: `url` shadowed the net/url package.
	for _, target := range args {
		crawler.Fetch(target)
	}

	if *jsonFile != "" {
		// BUG FIX: the write error was silently discarded.
		if err := writeJsonFile(*jsonFile, &sites); err != nil {
			log.Println("failed to write json file:", err)
		}
	}
	fmt.Printf("Finished all goroutines: %v\n", time.Now())
}
// Fetch downloads url, records its HTTP status under the URL's host
// in the sites map, then recursively follows the page's links that
// stay on the -host crawl host.
func (self Crawler) Fetch(url string) {
	fmt.Println("GET URL: ", url)
	transport := &http.Transport{
		TLSClientConfig: &tls.Config{
			// Crawl-only tool: accept self-signed certificates.
			InsecureSkipVerify: true,
		},
	}
	client := http.Client{
		Transport: transport,
	}
	resp, err := client.Get(url)
	if err != nil {
		log.Println("http transport error is:", err)
		return
	}
	// Close the body immediately after the error check so it is
	// released on every return path below.
	defer resp.Body.Close()

	d := Data{
		Visited: true,
		Url:     url,
		Status:  resp.StatusCode,
	}
	host := self.hostname(url)
	// BUG FIX: the original appended only when the host key already
	// existed in the map — which never happens, since nothing else
	// inserts keys — so no result was ever recorded. append on a
	// missing key is fine: sites[host] is nil.
	sites[host] = append(sites[host], d)

	tokenizer := Tokenizer{html.NewTokenizer(resp.Body)}
	links := tokenizer.findLinks(url)
	if len(links) == 0 {
		fmt.Println("crawl end...")
		return
	}
	for _, link := range links {
		if link == "" {
			continue
		}
		// Follow only links that stay on the requested crawl host.
		// Recurse on the receiver directly; the original allocated a
		// fresh zero-value Crawler per link for no benefit.
		if *crawlhost != "" && *crawlhost == self.hostname(link) {
			self.Fetch(link)
		}
	}
}
// hostname returns the host component of uri, or "" when uri cannot
// be parsed.
//
// BUG FIX: the original discarded the error from Parse; a malformed
// uri yields a nil *url.URL and the u.Host access panicked.
func (self Crawler) hostname(uri string) (host string) {
	u, err := self.Parse(uri)
	if err != nil {
		return ""
	}
	host = u.Host
	return
}
// writeJsonFile marshals the crawl results to JSON and writes them
// to path. An empty result set writes nothing and returns nil.
func writeJsonFile(path string, dataMap *Sites) error {
	if len(*dataMap) == 0 {
		return nil
	}
	// BUG FIX: the marshal error was silently dropped.
	bytes, err := json.Marshal(*dataMap)
	if err != nil {
		return err
	}
	// BUG FIX: 0755 marked the log file executable; a plain data
	// file should be 0644.
	return ioutil.WriteFile(path, bytes, os.FileMode(0644))
}
// hrefAttribute returns the value of the token's href attribute, or
// "" when no href is present. Should a tag carry several href
// attributes, the last one wins.
func hrefAttribute(token html.Token) (href string) {
	for i := range token.Attr {
		if token.Attr[i].Key == "href" {
			href = token.Attr[i].Val
		}
	}
	return
}
// isVisited reports whether uri is already present in the
// package-level visited set.
func isVisited(uri string) bool {
	_, seen := visited[uri]
	return seen
}
// findLinks scans the token stream for <a> start tags and returns
// their href values resolved against baseURL. Each resolved link is
// recorded in the visited set so it is only ever returned once.
func (self Tokenizer) findLinks(baseURL string) (foundLink []string) {
	foundLink = make([]string, 0)
	for {
		switch self.Next() {
		case html.ErrorToken:
			// End of input (or parse error): stop scanning.
			return
		case html.StartTagToken:
			tag := self.Token()
			if tag.Data != "a" {
				continue
			}
			resolved := resolvePath(hrefAttribute(tag), baseURL)
			if isVisited(resolved) {
				continue
			}
			visited[resolved] = true
			foundLink = append(foundLink, resolved)
		}
	}
}
// resolvePath resolves link (which may be relative) against base and
// returns the absolute URL as a string. If either link or base fails
// to parse, it returns "".
func resolvePath(link, base string) string {
	ref, err := url.Parse(link)
	if err != nil {
		return ""
	}
	b, err := url.Parse(base)
	if err != nil {
		return ""
	}
	return b.ResolveReference(ref).String()
}

Usage of: crawler [URL] Examples: crawler -host github.com -file log.json https://github.com/kazu69/ https://google.com Options -file string Output crawl log json file -host string Crawling host name -version Print version and exit

go run agent.go -host=example.com -file=log.json http://example.com
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment