Skip to content

Instantly share code, notes, and snippets.

@ficapy
Created December 24, 2017 10:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ficapy/5bbff5b3136bd5daa70bc1f2b3ed1222 to your computer and use it in GitHub Desktop.
Save ficapy/5bbff5b3136bd5daa70bc1f2b3ed1222 to your computer and use it in GitHub Desktop.
package main
import (
"encoding/gob"
"encoding/json"
"fmt"
"os"
"strings"
"sync"
"github.com/gocolly/colly"
)
// Page is a title/URL pair.
type Page struct {
	Title, Url string
}
// roots lists the documentation entry points; main uses them both as crawl
// seeds and as the top-level nodes of the generated tree.
var roots = []string{
	"https://docs.gitlab.com/ee/ci/",
	"https://docs.gitlab.com/runner/",
}
// scrapy crawls docs.gitlab.com starting from roots and records every link it
// follows.
//
// A link is followed only when its absolute URL contains
// "docs.gitlab.com/ee/ci" or "docs.gitlab.com/runner" and does not end in
// ".png".
//
// It returns two maps:
//   - link URL -> trimmed anchor text of a link pointing to it (when several
//     anchors point to the same URL, the one recorded last wins);
//   - page URL -> the links recorded on that page, in recording order
//     (a link that appears several times on a page is listed several times).
func scrapy(roots []string) (map[string]string, map[string][]string) {
	// Field names are kept because they show up in the %+v log line below.
	type Chain struct {
		Prev string
		Next string
	}

	var (
		// mu guards chain and titles: in async mode the OnHTML callback runs
		// on several fetch goroutines at once.
		mu     sync.Mutex
		chain  []Chain
		titles = make(map[string]string)
	)

	c := colly.NewCollector()
	c.MaxDepth = 30
	c.AllowedDomains = []string{"docs.gitlab.com"}
	// Let the collector own the concurrency. In async mode every request is
	// registered with the collector before Visit returns, so c.Wait() below
	// covers the whole crawl. Spawning `go Visit(...)` by hand registers the
	// request only once the goroutine runs, which lets Wait return early.
	c.Async = true
	c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 10})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if strings.HasSuffix(link, ".png") {
			return
		}
		if !strings.Contains(link, "docs.gitlab.com/ee/ci") && !strings.Contains(link, "docs.gitlab.com/runner") {
			return
		}

		mu.Lock()
		chain = append(chain, Chain{Prev: e.Request.URL.String(), Next: link})
		titles[link] = strings.TrimSpace(e.Text)
		mu.Unlock()

		// Visit refuses already-visited or too-deep URLs with an error; that
		// is routine during a crawl, so the error is deliberately dropped.
		_ = e.Request.Visit(link)
	})

	c.OnScraped(func(r *colly.Response) {
		fmt.Println(r.Request.URL, len(r.Body))
	})

	for _, root := range roots {
		if err := c.Visit(root); err != nil {
			fmt.Fprintf(os.Stderr, "visit %s: %v\n", root, err)
		}
	}
	c.Wait()

	// All fetch goroutines are done; chain and titles are no longer shared.
	chainMap := make(map[string][]string)
	for _, link := range chain {
		if _, seen := chainMap[link.Prev]; !seen {
			// Log the first link recorded for each source page.
			fmt.Printf("%+v\n", link)
		}
		chainMap[link.Prev] = append(chainMap[link.Prev], link.Next)
	}
	return titles, chainMap
}
// Save gob-encodes obj into the file at path, creating the file or truncating
// an existing one.
//
// It returns the first error met while creating, encoding to, or closing the
// file; nil means the value was written and the file closed cleanly.
func Save(path string, obj interface{}) (err error) {
	file, err := os.Create(path)
	if err != nil {
		return err
	}
	// A failed Close can mean the data never reached disk, so report it
	// unless an earlier (more specific) error is already being returned.
	defer func() {
		if cerr := file.Close(); err == nil {
			err = cerr
		}
	}()
	return gob.NewEncoder(file).Encode(obj)
}
// Load gob-decodes the file at path into obj, which must be a pointer to a
// value compatible with what was encoded.
//
// It returns the error from opening the file or from decoding it; nil means
// obj was filled in.
func Load(path string, obj interface{}) (err error) {
	file, err := os.Open(path)
	if err != nil {
		return err
	}
	// The file is only read, so a Close error carries no information here.
	defer file.Close()
	return gob.NewDecoder(file).Decode(obj)
}
// Node is one entry of the link tree that generateTree serializes to JSON.
type Node struct {
	// Name is the title looked up for the node's URL ("name" in JSON).
	Name string `json:"name"`

	// Value is the node's URL ("value" in JSON).
	Value string `json:"value"`

	// Children are the nodes for the URLs linked from Value ("children" in JSON).
	Children []*Node `json:"children"`
}
// Cache is the set of URLs that recursion has already expanded; it is keyed by
// Node.Value and lets each URL be expanded at most once.
var Cache = make(map[string]struct{})
// recursion fills in signNode.Children from chain, depth-first.
//
// chain maps a URL to the URLs it links to, and urlMapTitle maps a URL to its
// title. A node is expanded only when its Value is a key of chain and has not
// been expanded before. Expanded URLs are remembered in the package-level
// Cache, so a URL reachable through several paths (or through a link cycle)
// gets children only at the first place it is reached.
func recursion(chain map[string][]string, signNode *Node, urlMapTitle map[string]string) {
	links, known := chain[signNode.Value]
	if !known {
		return
	}
	if _, done := Cache[signNode.Value]; done {
		return
	}
	Cache[signNode.Value] = struct{}{}

	kids := make([]*Node, 0, 10)
	for _, link := range links {
		kid := &Node{
			Name:  getTitle(urlMapTitle, link),
			Value: link,
		}
		kids = append(kids, kid)
	}
	signNode.Children = kids

	// Descend only after the whole child list is attached.
	for _, kid := range kids {
		recursion(chain, kid, urlMapTitle)
	}
}
// getTitle returns the title stored for url in urlMapTitle, or "" when the URL
// has no entry.
func getTitle(urlMapTitle map[string]string, url string) string {
	// Indexing a map with a missing key yields the zero value, i.e. "".
	return urlMapTitle[url]
}
// generateTree builds the link tree and prints it to stdout as tab-indented
// JSON.
//
// The output is a single unnamed Node whose children are the given roots, in
// order; each root is expanded with recursion using chain (URL -> linked URLs)
// and urlMapTitle (URL -> title). If the tree cannot be encoded, the error is
// written to stderr and nothing is printed to stdout.
func generateTree(chain map[string][]string, urlMapTitle map[string]string, roots []string) {
	var top Node
	for _, root := range roots {
		node := &Node{Name: getTitle(urlMapTitle, root), Value: root}
		recursion(chain, node, urlMapTitle)
		top.Children = append(top.Children, node)
	}

	out, err := json.MarshalIndent(top, "", "\t")
	if err != nil {
		// Previously this failure was swallowed, leaving empty output with
		// no hint as to why.
		fmt.Fprintln(os.Stderr, "encoding tree:", err)
		return
	}
	fmt.Println(string(out))
}
// main prints the documentation link tree as JSON.
//
// It first tries to read the crawl results from the gob files "chain" and
// "urlMapTitle" in the working directory. If either file cannot be loaded it
// crawls the sites again and rewrites both files; a failure to write them is
// reported on stderr but does not stop the tree from being printed.
func main() {
	chainMap := make(map[string][]string)
	urlMapTitle := make(map[string]string)

	// Both files are needed: a tree built from chain alone would have empty
	// titles, so a failure on either one triggers a fresh crawl.
	err := Load("chain", &chainMap)
	if err == nil {
		err = Load("urlMapTitle", &urlMapTitle)
	}
	if err != nil {
		urlMapTitle, chainMap = scrapy(roots)
		if err := Save("chain", chainMap); err != nil {
			fmt.Fprintln(os.Stderr, "saving chain:", err)
		}
		if err := Save("urlMapTitle", urlMapTitle); err != nil {
			fmt.Fprintln(os.Stderr, "saving urlMapTitle:", err)
		}
	}

	generateTree(chainMap, urlMapTitle, roots)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment