Created
January 11, 2019 22:28
-
-
Save maxsei/a7d7619037999c5fb1029b057814ae80 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package sitemap | |
import (
	"encoding/xml"
	"fmt"
	"io"
	"log"
	"net/http"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/maxsei/Gophercises/linkparser"
)
// dataStore is a mutex-guarded set of URL strings shared by the crawler
// goroutines to record which pages have already been visited.
type dataStore struct {
	sync.Mutex                 // guards cache
	cache      map[string]bool // keys are URLs; presence means "visited"
}

// newDataStore returns an empty, ready-to-use dataStore.
func newDataStore() *dataStore {
	ds := dataStore{cache: map[string]bool{}}
	return &ds
}

// set records key k with value v under the lock.
func (ds *dataStore) set(k string, v bool) {
	ds.Lock()
	defer ds.Unlock()
	ds.cache[k] = v
}

// seen reports whether k has ever been set, regardless of the stored
// value (a presence check, not a value check).
func (ds *dataStore) seen(k string) bool {
	ds.Lock()
	defer ds.Unlock()
	_, ok := ds.cache[k]
	return ok
}
// SiteNode is one page in the crawled site tree.
type SiteNode struct {
	Depth    int         // distance, in links, from the root page (root is 0)
	Url      string      // absolute URL of this page
	Children []*SiteNode // pages first discovered via links on this page
}
// url wraps a single page URL so it marshals as a <url> element.
type url struct {
	Url string `xml:"url"`
}

// urlnode is the XML form of a SiteNode: the node's own URL is emitted as
// an "id" attribute, child nodes as nested <urlnode> elements, and leaf
// links as <url> elements.
//
// Fix: the Urlnodes and Urls tags were written without quotes
// (`xml:urlnode`), which is a malformed struct tag; reflect.StructTag
// silently ignores malformed tags, so those fields marshalled under their
// Go field names ("Urlnodes", "Urls") instead of the intended element names.
type urlnode struct {
	Urlnodes []urlnode `xml:"urlnode"`
	Url      string    `xml:"id,attr"`
	Urls     []url     `xml:"url"`
}
// SiteNodesConcurrent crawls domainUrl breadth-first, fetching pages
// concurrently (one goroutine per queued page), and returns the root of the
// resulting SiteNode tree. An optional single maxDepthRaw value limits how
// many layers are expanded (-1, the default, means unlimited); a non-nil
// error is returned only if the depth argument is invalid.
//
// NOTE(review): this function contains several data races and fragile
// synchronization (flagged inline below); it generally terminates via the
// nodesWorking counter or, failing that, the 15-second timeout. Verify with
// `go test -race` before relying on it.
func SiteNodesConcurrent(domainUrl string, maxDepthRaw ...int) (*SiteNode, error) {
	maxDepth, err := maxDepthHandler(maxDepthRaw...)
	if err != nil {
		return nil, err
	}
	rootNode := SiteNode{
		Depth:    0,
		Url:      domainUrl,
		Children: make([]*SiteNode, 0),
	}
	// Mutex-guarded visited set shared by all worker goroutines.
	var visitedNodes = newDataStore()
	visitedNodes.set(rootNode.Url, true)
	// Newly discovered nodes that still need to be fetched and expanded.
	deepestNodes := make(chan *SiteNode)
	// A send here means one node finished expanding; carries its depth.
	doneNodeDepth := make(chan int)
	var nodesWorking int32 = 0
	var layerWG sync.WaitGroup
	var nodeChildren func(node *SiteNode) error
	// nodeChildren fetches node.Url, appends each not-yet-seen link as a
	// child, pushes every child onto deepestNodes, and finally reports the
	// node's depth on doneNodeDepth.
	nodeChildren = func(node *SiteNode) error {
		//fmt.Printf("Depth: %d\tLink: %s\tNodes working: %d\n", node.Depth, node.Url, nodesWorking)
		links, err := linksAtUrl(node.Url)
		if err != nil {
			return err
		}
		if len(links) == 0 {
			doneNodeDepth <- node.Depth //done using this node to find other nodes
			return nil
		}
		for _, link := range links {
			absurl := absoluteUrl(domainUrl, link.Href)
			// NOTE(review): seen+set is not atomic as a pair, so two
			// goroutines can race past seen() and both enqueue the same URL.
			if visitedNodes.seen(absurl) {
				continue
			}
			visitedNodes.set(absurl, true)
			nextNode := SiteNode{
				Depth:    node.Depth + 1,
				Url:      absurl,
				Children: make([]*SiteNode, 0),
			}
			node.Children = append(node.Children, &nextNode)
			deepestNodes <- &nextNode
		}
		doneNodeDepth <- node.Depth //done using this node to find other nodes
		return nil
	}
	currentDepth := 0
	var wg sync.WaitGroup
	// Seed the crawl with the root from a goroutine: the channel is
	// unbuffered, so a direct send before the select loop would deadlock.
	go func() { deepestNodes <- &rootNode }()
	for {
		select {
		case node := <-deepestNodes:
			go func() {
				if node.Depth == maxDepth {
					// Depth limit reached: do not expand this node.
					// NOTE(review): nothing is sent on doneNodeDepth here,
					// so when a depth limit is set nodesWorking may never
					// drain to zero and the 15s timeout below becomes the
					// only exit — confirm intended.
					return
				}
				atomic.AddInt32(&nodesWorking, 1)
				// NOTE(review): currentDepth is written by the select loop
				// goroutine and read here without synchronization — data race.
				if node.Depth > currentDepth {
					wg.Add(1)
					// NOTE(review): presumably a scheduling yield — TODO confirm intent.
					time.Sleep(time.Nanosecond)
					layerWG.Wait()
					wg.Done()
				}
				wg.Wait()
				layerWG.Add(1)
				err := nodeChildren(node) //can return an error
				if err != nil {
					// Best effort: log and keep crawling other pages.
					log.Print(err)
				}
			}()
		case depth := <-doneNodeDepth:
			currentDepth = depth
			layerWG.Done()
			atomic.AddInt32(&nodesWorking, -1)
			//fmt.Printf("%d Nodes Working\n", nodesWorking)
			// NOTE(review): read without atomic.LoadInt32 — races with the
			// AddInt32 calls in the worker goroutines.
			if nodesWorking == 0 {
				return &rootNode, nil
			}
		case <-time.After(15 * time.Second):
			// Safety valve: give up after 15s with no channel activity and
			// return whatever tree has been built so far.
			return &rootNode, nil
		}
	}
	// NOTE(review): unreachable — the for/select above only exits via return.
	return &rootNode, nil
}
func SiteNodes(domainUrl string, maxDepthRaw ...int) (*SiteNode, error) { | |
maxDepth, err := maxDepthHandler(maxDepthRaw...) | |
if err != nil { | |
return nil, err | |
} | |
rootNode := SiteNode{ | |
Depth: 0, | |
Url: domainUrl, | |
Children: make([]*SiteNode, 0), | |
} | |
var visitedNodes = map[string]int8{rootNode.Url: 1} | |
deepestNodes := append(make([]*SiteNode, 0), &rootNode) | |
var nodeChildren func(node *SiteNode) error | |
nodeChildren = func(node *SiteNode) error { | |
links, err := linksAtUrl(node.Url) | |
if err != nil { | |
return err | |
} | |
for _, link := range links { | |
absurl := absoluteUrl(domainUrl, link.Href) | |
if _, seen := visitedNodes[absurl]; seen { | |
continue | |
} | |
visitedNodes[absurl] = 1 | |
nextNode := SiteNode{ | |
Depth: node.Depth + 1, | |
Url: absurl, | |
Children: make([]*SiteNode, 0), | |
} | |
node.Children = append(node.Children, &nextNode) | |
deepestNodes = append(deepestNodes, &nextNode) | |
} | |
return nil | |
} | |
for len(deepestNodes) > 0 { | |
if deepestNodes[0].Depth == maxDepth && maxDepth != -1{ | |
return &rootNode, nil | |
} | |
//fmt.Printf("Depth: %d\tLink: %s\n", deepestNodes[0].Depth, deepestNodes[0].Url) | |
err := nodeChildren(deepestNodes[0]) | |
if err != nil { | |
return nil, err | |
} | |
deepestNodes = deepestNodes[1:] // Dequeue | |
} | |
return &rootNode, nil | |
} | |
func NodeToXML(rootNode *SiteNode, w io.Writer){ | |
var buildXML func(node *SiteNode) urlnode | |
buildXML = func(node *SiteNode) urlnode{ | |
n := urlnode{ | |
Urlnodes : make([]urlnode, 0), | |
Url : node.Url, | |
Urls : make([]url, 0), | |
} | |
if len(node.Children) == 0 { | |
return n | |
} | |
/* | |
if len(node.Children[0].Children) == 0{ | |
for _, v := range node.Children{ | |
n.Urls = append(n.Urls, url{v.Url}) | |
} | |
n.Urlnodes = nil | |
return n | |
} | |
*/ | |
for _, v := range node.Children{ | |
n.Urlnodes = append(n.Urlnodes, buildXML(v)) | |
} | |
return n | |
} | |
rootUrlNode := buildXML(rootNode) | |
enc := xml.NewEncoder(w) | |
enc.Indent("", " ") | |
if err := enc.Encode(rootUrlNode); err != nil{ | |
panic(err) | |
} | |
} | |
func NodeToStrings(rootNode *SiteNode)[]string{ | |
var result []string | |
nodeQueue := append(make([]*SiteNode, 0), rootNode) | |
for len(nodeQueue) > 0 { | |
nodeQueue = append(nodeQueue, nodeQueue[0].Children...) | |
result = append(result, nodeQueue[0].Url +" Depth: "+ strconv.Itoa( nodeQueue[0].Depth)) | |
nodeQueue = nodeQueue[1:] | |
} | |
return result | |
} | |
// maxDepthHandler normalizes the optional variadic maxDepth argument used by
// the SiteNodes* entry points. With no argument it returns -1, the sentinel
// for "unlimited depth". With exactly one argument >= -1 it returns that
// value. More than one argument, or a value below -1, is an error.
func maxDepthHandler(maxDepth ...int) (int, error) {
	if len(maxDepth) == 0 {
		return -1, nil
	}
	if len(maxDepth) != 1 {
		return 0, fmt.Errorf("%d arguments for maxDepth were provided; enter one or no arguments for max depth", len(maxDepth))
	}
	if maxDepth[0] < -1 {
		// Fix: the original reported len(maxDepth) (always 1 here) instead
		// of the offending value itself.
		return 0, fmt.Errorf("%d was entered for maxDepth; must be -1 (unlimited) or non-negative", maxDepth[0])
	}
	return maxDepth[0], nil
}
func linksAtUrl(url string) ([]linkparser.Link, error) { | |
resp, err := http.Get(url) | |
if err != nil { | |
return nil, err | |
} | |
links, err := linkparser.Parse(resp.Body) | |
if err != nil { | |
return nil, err | |
} | |
return links, nil | |
} | |
// absoluteUrl resolves a link href (branch) found on a page against the
// crawl's root domain URL. Root-relative paths ("/foo") are joined onto
// root; absolute URLs inside the same domain are kept as-is; anything too
// short, external, or unrecognized collapses to root so the crawler stays
// on-domain.
func absoluteUrl(root, branch string) string {
	// Hrefs shorter than two characters ("", "/", "#", ...) carry no
	// usable path.
	if len(branch) < 2 {
		return root
	}
	// Root-relative path: join it onto the domain. (Idiom fix: rune
	// literal instead of the original `"/"[0]`.)
	if branch[0] == '/' {
		return root + branch
	}
	// Absolute URL within the same domain: keep it. (Idiom fix: HasPrefix
	// replaces the original manual length check plus slice comparison.)
	if strings.HasPrefix(branch, root) {
		return branch
	}
	return root
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"flag" | |
"fmt" | |
"log" | |
"os" | |
"time" | |
"github.com/maxsei/Gophercises/sitemap" | |
) | |
func main() { | |
maxDepth := flag.Int("depth", -1, "How many layers deep the map will go") | |
concurrency := flag.Int("cc", 1, "run program concurrently press 1") | |
siteUrl := flag.String("url", "https://gophercises.com", "this is the home page for the website to be mapped") | |
flag.Parse() | |
start := time.Now() | |
fns := []interface{}{ | |
sitemap.SiteNodes, | |
sitemap.SiteNodesConcurrent, | |
} | |
node, err := fns[*concurrency].(func(string, ...int) (*sitemap.SiteNode, error))(*siteUrl, *maxDepth) | |
if err != nil { | |
log.Fatal(err) | |
} | |
urls := sitemap.NodeToStrings(node) | |
/* | |
for _, url := range urls{ | |
fmt.Println(url) | |
} | |
*/ | |
fmt.Printf("Total of %d links\n", len(urls)) | |
sitemap.NodeToXML(node, os.Stdout)//you can use another io.writer | |
fmt.Printf("Finished in %v\n\n", time.Since(start)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment