|
package main |
|
|
|
import ( |
|
"crypto/tls" |
|
"flag" |
|
"golang.org/x/net/html" |
|
"log" |
|
"net/http" |
|
"net/url" |
|
"fmt" |
|
"io/ioutil" |
|
"encoding/json" |
|
"os" |
|
"time" |
|
) |
|
|
|
type( |
|
Data struct { |
|
Visited bool `json:"visited"` |
|
Url string `json:"url"` |
|
Status int `json:status"` |
|
} |
|
Sites map[string][]Data |
|
Visited map[string]bool |
|
Crawler struct { |
|
html.Tokenizer |
|
url.URL |
|
} |
|
) |
|
|
|
type Tokenizer struct { |
|
*html.Tokenizer |
|
} |
|
|
|
var( |
|
sites Sites = Sites{} |
|
visited Visited = Visited{} |
|
crawlhost = flag.String("host", "", "Crawling host name") |
|
jsonFile = flag.String("file", "", "Output crawl log json file") |
|
version = flag.Bool("version", false, "Print version and exit") |
|
) |
|
|
|
// Version is the release string printed when the -version flag is given.
const Version = "0.0.1"
|
|
|
// usage writes the command synopsis and an example invocation to standard
// error, then lists the registered flags with their defaults. main installs
// it as flag.Usage.
func usage() {
	const banner = `
Usage of:
crawler <option> [URL]
Examples:
crawler -host github.com -file log.json https://github.com/kazu69/ https://google.com
Options
`
	fmt.Fprint(os.Stderr, banner)
	flag.PrintDefaults()
}
|
|
|
func main() { |
|
flag.Usage = usage |
|
|
|
flag.Parse() |
|
args := flag.Args() |
|
crawler := Crawler{html.Tokenizer{}, url.URL{}} |
|
|
|
// if exist option versions |
|
if *version { |
|
fmt.Println(Version) |
|
return |
|
} |
|
|
|
// show usage if args not exist |
|
if len(args) == 0 { |
|
flag.Usage() |
|
os.Exit(1) |
|
} |
|
|
|
for _, url := range args { |
|
crawler.Fetch(url) |
|
} |
|
|
|
if *jsonFile != "" { |
|
writeJsonFile(*jsonFile, &sites) |
|
} |
|
|
|
fmt.Printf("Finished all goroutines: %v\n", time.Now()) |
|
} |
|
|
|
func (self Crawler) Fetch(url string) { |
|
fmt.Println("GET URL: ", url) |
|
|
|
transport := &http.Transport { |
|
TLSClientConfig: &tls.Config { |
|
InsecureSkipVerify: true, |
|
}, |
|
} |
|
|
|
client := http.Client { |
|
Transport: transport, |
|
} |
|
resp, err := client.Get(url) |
|
|
|
if err != nil { |
|
log.Println("http transport error is:", err) |
|
return |
|
} |
|
|
|
d := Data { |
|
Visited: true, |
|
Url: url, |
|
Status: resp.StatusCode, |
|
} |
|
|
|
host := self.hostname(url) |
|
|
|
if _, ok := sites[host]; ok { |
|
sites[host] = append(sites[host], d) |
|
} |
|
|
|
defer resp.Body.Close() |
|
|
|
token := html.NewTokenizer(resp.Body) |
|
tokenizer := Tokenizer{token} |
|
links := tokenizer.findLinks(url) |
|
|
|
if len(links) == 0 { |
|
fmt.Println("crawl end...") |
|
return |
|
} |
|
|
|
for _, link := range links { |
|
url := link |
|
if url != "" { |
|
if *crawlhost != "" { |
|
if *crawlhost == self.hostname(url) { |
|
crawler := &Crawler{} |
|
crawler.Fetch(url) |
|
} |
|
} |
|
} |
|
} |
|
|
|
return |
|
} |
|
|
|
func (self Crawler) hostname(uri string) (host string) { |
|
u, _ := self.Parse(uri) |
|
host = u.Host |
|
return |
|
} |
|
|
|
func writeJsonFile(path string, dataMap *Sites) error { |
|
if len(*dataMap) == 0 { |
|
return nil |
|
} |
|
bytes, _ := json.Marshal(*dataMap) |
|
return ioutil.WriteFile(path, bytes, os.FileMode(0755)) |
|
} |
|
|
|
func hrefAttribute(token html.Token) (href string) { |
|
for _, attr := range token.Attr { |
|
if attr.Key == "href" { |
|
href = attr.Val |
|
} |
|
} |
|
|
|
return |
|
} |
|
|
|
func isVisited(uri string) bool { |
|
if _, ok := visited[uri]; ok { |
|
return true |
|
} |
|
return false |
|
} |
|
|
|
func (self Tokenizer) findLinks(baseURL string) (foundLink []string) { |
|
foundLink = make([]string, 0) |
|
for { |
|
tokenType := self.Next() |
|
|
|
switch { |
|
case tokenType == html.ErrorToken: |
|
return |
|
case tokenType == html.StartTagToken: |
|
token := self.Token() |
|
|
|
if token.Data != "a" { |
|
continue |
|
} |
|
|
|
reference := hrefAttribute(token) |
|
|
|
path := resolvePath(reference, baseURL) |
|
if !isVisited(path) { |
|
visited[path] = true |
|
foundLink = append(foundLink, path) |
|
} |
|
default: |
|
continue |
|
} |
|
} |
|
} |
|
|
|
// resolvePath resolves link against base and returns the resulting absolute
// URL as a string. It returns the empty string when either argument cannot
// be parsed as a URL.
//
// The fragment is removed from the result: it is never sent to the server,
// so "page#a" and "page#b" name the same document and must not be crawled as
// two different URLs.
func resolvePath(link, base string) string {
	ref, err := url.Parse(link)
	if err != nil {
		return ""
	}
	baseURL, err := url.Parse(base)
	if err != nil {
		return ""
	}

	resolved := baseURL.ResolveReference(ref)
	resolved.Fragment = ""
	return resolved.String()
}