@maxsei
Created January 11, 2019 22:28
package sitemap

import (
	"encoding/xml"
	"fmt"
	"io"
	"log"
	"net/http"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	"github.com/maxsei/Gophercises/linkparser"
)
type dataStore struct {
	sync.Mutex // ← this mutex protects the cache below
	cache      map[string]bool
}

func newDataStore() *dataStore {
	return &dataStore{
		cache: make(map[string]bool),
	}
}

func (ds *dataStore) set(k string, v bool) {
	ds.Lock()
	defer ds.Unlock()
	ds.cache[k] = v
}

func (ds *dataStore) seen(k string) bool {
	ds.Lock()
	defer ds.Unlock()
	_, has := ds.cache[k]
	return has
}
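
// visit is a small helper that is not part of the original gist: it marks k
// as seen and reports whether it was already present. Doing the check and the
// write under a single lock closes the check-then-act race in
// SiteNodesConcurrent, where two goroutines could otherwise both pass seen()
// and enqueue the same URL.
func (ds *dataStore) visit(k string) bool {
	ds.Lock()
	defer ds.Unlock()
	_, has := ds.cache[k]
	ds.cache[k] = true
	return has
}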
type SiteNode struct {
	Depth    int
	Url      string
	Children []*SiteNode
}

type url struct {
	Url string `xml:"url"`
}

// Note: the original tags `xml:urlnode` and `xml:url` were missing quotes,
// which makes encoding/xml silently ignore them.
type urlnode struct {
	Urlnodes []urlnode `xml:"urlnode"`
	Url      string    `xml:"id,attr"`
	Urls     []url     `xml:"url"`
}
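
// SiteNodesConcurrent builds the site tree breadth-first, crawling each
// discovered page in its own goroutine. The first goroutine to reach a new
// depth holds the gate until the previous layer's fetches have drained, and
// the walk ends either when the count of in-flight nodes drops back to zero
// or, as a backstop, after 15 seconds with no channel activity. maxDepthRaw
// optionally caps the depth; omitting it (or passing -1) crawls without a
// depth limit.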
func SiteNodesConcurrent(domainUrl string, maxDepthRaw ...int) (*SiteNode, error) {
	maxDepth, err := maxDepthHandler(maxDepthRaw...)
	if err != nil {
		return nil, err
	}
	rootNode := SiteNode{
		Depth:    0,
		Url:      domainUrl,
		Children: make([]*SiteNode, 0),
	}
	var visitedNodes = newDataStore()
	visitedNodes.set(rootNode.Url, true)
	deepestNodes := make(chan *SiteNode)
	doneNodeDepth := make(chan int)
	var nodesWorking int32 = 0
	var currentDepth int32 = 0 // written in the main loop, read in goroutines, hence atomic
	var layerWG sync.WaitGroup
	var nodeChildren func(node *SiteNode) error
	nodeChildren = func(node *SiteNode) error {
		//fmt.Printf("Depth: %d\tLink: %s\tNodes working: %d\n", node.Depth, node.Url, nodesWorking)
		links, err := linksAtUrl(node.Url)
		if err != nil {
			return err
		}
		if len(links) == 0 {
			doneNodeDepth <- node.Depth // done using this node to find other nodes
			return nil
		}
		for _, link := range links {
			absurl := absoluteUrl(domainUrl, link.Href)
			// visit checks and records the URL under one lock, so no two
			// goroutines can claim the same link.
			if visitedNodes.visit(absurl) {
				continue
			}
			nextNode := SiteNode{
				Depth:    node.Depth + 1,
				Url:      absurl,
				Children: make([]*SiteNode, 0),
			}
			node.Children = append(node.Children, &nextNode)
			deepestNodes <- &nextNode
		}
		doneNodeDepth <- node.Depth // done using this node to find other nodes
		return nil
	}
	var wg sync.WaitGroup
	go func() { deepestNodes <- &rootNode }()
	for {
		select {
		case node := <-deepestNodes:
			go func() {
				if node.Depth == maxDepth {
					return
				}
				atomic.AddInt32(&nodesWorking, 1)
				// The first node of a new layer holds the gate (wg) until the
				// previous layer's fetches (layerWG) have drained.
				if int32(node.Depth) > atomic.LoadInt32(&currentDepth) {
					wg.Add(1)
					time.Sleep(time.Nanosecond) // presumably to let siblings reach wg.Wait first
					layerWG.Wait()
					wg.Done()
				}
				wg.Wait()
				layerWG.Add(1)
				if err := nodeChildren(node); err != nil {
					log.Print(err)
				}
			}()
		case depth := <-doneNodeDepth:
			atomic.StoreInt32(&currentDepth, int32(depth))
			layerWG.Done()
			//fmt.Printf("%d Nodes Working\n", nodesWorking)
			if atomic.AddInt32(&nodesWorking, -1) == 0 {
				return &rootNode, nil
			}
		case <-time.After(15 * time.Second):
			// Backstop: if nothing has arrived on either channel for 15s
			// (e.g. every remaining node sat at maxDepth), return what we have.
			return &rootNode, nil
		}
	}
}
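
// SiteNodes builds the same site tree as SiteNodesConcurrent but sequentially,
// walking pages breadth-first with a simple queue. maxDepthRaw optionally caps
// the depth; omitting it (or passing -1) crawls without a depth limit.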
func SiteNodes(domainUrl string, maxDepthRaw ...int) (*SiteNode, error) {
	maxDepth, err := maxDepthHandler(maxDepthRaw...)
	if err != nil {
		return nil, err
	}
	rootNode := SiteNode{
		Depth:    0,
		Url:      domainUrl,
		Children: make([]*SiteNode, 0),
	}
	var visitedNodes = map[string]int8{rootNode.Url: 1}
	deepestNodes := []*SiteNode{&rootNode}
	var nodeChildren func(node *SiteNode) error
	nodeChildren = func(node *SiteNode) error {
		links, err := linksAtUrl(node.Url)
		if err != nil {
			return err
		}
		for _, link := range links {
			absurl := absoluteUrl(domainUrl, link.Href)
			if _, seen := visitedNodes[absurl]; seen {
				continue
			}
			visitedNodes[absurl] = 1
			nextNode := SiteNode{
				Depth:    node.Depth + 1,
				Url:      absurl,
				Children: make([]*SiteNode, 0),
			}
			node.Children = append(node.Children, &nextNode)
			deepestNodes = append(deepestNodes, &nextNode)
		}
		return nil
	}
	for len(deepestNodes) > 0 {
		if deepestNodes[0].Depth == maxDepth && maxDepth != -1 {
			return &rootNode, nil
		}
		//fmt.Printf("Depth: %d\tLink: %s\n", deepestNodes[0].Depth, deepestNodes[0].Url)
		if err := nodeChildren(deepestNodes[0]); err != nil {
			return nil, err
		}
		deepestNodes = deepestNodes[1:] // Dequeue
	}
	return &rootNode, nil
}
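
// NodeToXML walks the tree rooted at rootNode and writes it to w as indented
// XML, one <urlnode> element per page with its URL in an id attribute.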
func NodeToXML(rootNode *SiteNode, w io.Writer) {
	var buildXML func(node *SiteNode) urlnode
	buildXML = func(node *SiteNode) urlnode {
		n := urlnode{
			Urlnodes: make([]urlnode, 0),
			Url:      node.Url,
			Urls:     make([]url, 0),
		}
		if len(node.Children) == 0 {
			return n
		}
		/*
			if len(node.Children[0].Children) == 0 {
				for _, v := range node.Children {
					n.Urls = append(n.Urls, url{v.Url})
				}
				n.Urlnodes = nil
				return n
			}
		*/
		for _, v := range node.Children {
			n.Urlnodes = append(n.Urlnodes, buildXML(v))
		}
		return n
	}
	rootUrlNode := buildXML(rootNode)
	enc := xml.NewEncoder(w)
	enc.Indent("", " ")
	if err := enc.Encode(rootUrlNode); err != nil {
		panic(err)
	}
}
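
// NodeToStrings flattens the tree rooted at rootNode into a slice of
// "<url> Depth: <n>" strings in breadth-first order.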
func NodeToStrings(rootNode *SiteNode) []string {
	var result []string
	nodeQueue := []*SiteNode{rootNode}
	for len(nodeQueue) > 0 {
		nodeQueue = append(nodeQueue, nodeQueue[0].Children...)
		result = append(result, nodeQueue[0].Url+" Depth: "+strconv.Itoa(nodeQueue[0].Depth))
		nodeQueue = nodeQueue[1:]
	}
	return result
}
func maxDepthHandler(maxDepth ...int) (int, error) {
	if len(maxDepth) == 0 {
		return -1, nil
	}
	if len(maxDepth) != 1 {
		return 0, fmt.Errorf("%d arguments were provided for maxDepth; enter either one or no arguments", len(maxDepth))
	}
	if maxDepth[0] < -1 {
		return 0, fmt.Errorf("%d was entered for maxDepth; it must be -1 (unlimited) or greater", maxDepth[0])
	}
	return maxDepth[0], nil
}
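
// linksAtUrl fetches the page at url and returns every anchor link the
// linkparser package finds in the response body.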
func linksAtUrl(url string) ([]linkparser.Link, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close() // close the body so the connection can be reused
	links, err := linkparser.Parse(resp.Body)
	if err != nil {
		return nil, err
	}
	return links, nil
}
// absoluteUrl resolves branch against the domain root: root-relative paths
// ("/foo") are joined onto root, URLs already under root pass through, and
// anything else (external or unusable links) collapses to root.
func absoluteUrl(root, branch string) string {
	if len(branch) < 2 {
		return root
	}
	if branch[0] == '/' {
		return root + branch
	}
	if len(branch) < len(root) {
		return root
	}
	if branch[:len(root)] == root {
		return branch
	}
	return root
}
package main

import (
	"flag"
	"fmt"
	"log"
	"os"
	"time"

	"github.com/maxsei/Gophercises/sitemap"
)
func main() {
	maxDepth := flag.Int("depth", -1, "how many layers deep the map will go (-1 for unlimited)")
	concurrency := flag.Int("cc", 1, "set to 1 to crawl concurrently, 0 to crawl sequentially")
	siteUrl := flag.String("url", "https://gophercises.com", "home page of the website to be mapped")
	flag.Parse()
	start := time.Now()
	fns := []interface{}{
		sitemap.SiteNodes,
		sitemap.SiteNodesConcurrent,
	}
	// Guard the index so a bad -cc value fails cleanly instead of panicking.
	if *concurrency < 0 || *concurrency >= len(fns) {
		log.Fatalf("cc must be 0 or 1, got %d", *concurrency)
	}
	node, err := fns[*concurrency].(func(string, ...int) (*sitemap.SiteNode, error))(*siteUrl, *maxDepth)
	if err != nil {
		log.Fatal(err)
	}
	urls := sitemap.NodeToStrings(node)
	/*
		for _, url := range urls {
			fmt.Println(url)
		}
	*/
	fmt.Printf("Total of %d links\n", len(urls))
	sitemap.NodeToXML(node, os.Stdout) // you can use another io.Writer
	fmt.Printf("Finished in %v\n\n", time.Since(start))
}
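
// One way to run it, using the flags defined above (the URL shown is just
// the default):
//
//	go run main.go -url https://gophercises.com -depth 2 -cc 1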