Skip to content

Instantly share code, notes, and snippets.

@antsmartian
Last active March 18, 2018 15:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save antsmartian/a74f5e5c9ccc97ffbe0775b9e5f5ce22 to your computer and use it in GitHub Desktop.
package main
import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
)
// Worker fetches a single page and reports the links it finds.
type Worker struct {
	url string // absolute URL of the page this worker will fetch
	response chan []*url.URL // channel on which discovered links are sent back to the crawler
}
// processLinks extracts every http(s) anchor href from doc, resolves it
// against the document's base URL, and returns the parsed results.
// Hrefs that are relative, fragment-only, or use another scheme are skipped.
//
// TODO: should be improved to filter versioned/duplicate URLs.
func (w *Worker) processLinks(doc *goquery.Document) (result []*url.URL) {
	doc.Find("a[href]").Each(func(i int, sel *goquery.Selection) {
		href, ok := sel.Attr("href")
		// Bug fix: the original used strings.Contains(href, "http"), which
		// also matched hrefs that merely embed "http" somewhere in a path
		// or query (e.g. "/redirect?to=http://..."). HasPrefix keeps only
		// genuine http/https URLs, which also makes the old "#" fragment
		// check unnecessary.
		if !ok || !strings.HasPrefix(href, "http") {
			return
		}
		parsed, err := url.Parse(href)
		if err != nil {
			// Unparseable href: skip it rather than abort the page.
			return
		}
		result = append(result, doc.Url.ResolveReference(parsed))
	})
	return
}
// httpClient is shared by every worker. Reusing one client lets the
// transport pool connections, and the timeout keeps a stalled server
// from pinning a worker goroutine forever.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// visitUrl fetches w.url, parses the HTML body, and sends the page's
// outgoing links on w.response. On any failure it returns an error and
// sends nothing.
func (w *Worker) visitUrl() error {
	req, err := http.NewRequest("GET", w.url, nil)
	if err != nil {
		return err
	}
	res, err := httpClient.Do(req)
	if err != nil {
		// Bug fix: the original discarded this error and only nil-checked
		// res, so transport failures were silently swallowed.
		return err
	}
	// Bug fix: the original never closed the response body, leaking the
	// underlying connection on every request.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		fmt.Println("Error, status code is not 200")
		return fmt.Errorf("fetching %s: unexpected status %d", w.url, res.StatusCode)
	}
	bd, err := ioutil.ReadAll(res.Body)
	if err != nil {
		fmt.Println("Error while reading the body")
		return err
	}
	node, err := html.Parse(bytes.NewReader(bd))
	if err != nil {
		// Bug fix: on a parse error the original fell through and called
		// processLinks with a nil document, panicking inside doc.Find.
		fmt.Println("Error parsing the body")
		return err
	}
	doc := goquery.NewDocumentFromNode(node)
	// Record the final URL (after redirects) so relative links resolve
	// against the page actually served.
	doc.Url = res.Request.URL

	// Send the discovered links back to the crawler.
	w.response <- w.processLinks(doc)
	return nil
}
// Crawler drives the crawl: it seeds the first Worker and fans out a
// new Worker for each link the workers report back.
type Crawler struct {
	url string // seed URL the crawl starts from
	response chan []*url.URL // shared channel on which all workers deliver discovered links
}
// Run starts the crawl at c.url and spawns a worker goroutine for each
// newly discovered link. Every worker runs concurrently and reports its
// links on the shared c.response channel. Run never returns: it blocks
// forever waiting for more results.
func (c *Crawler) Run() {
	c.response = make(chan []*url.URL)

	// Bug fix: track visited URLs so the same page is not fetched (and
	// its links re-announced) endlessly.
	visited := map[string]bool{c.url: true}

	seed := Worker{url: c.url, response: c.response}
	go seed.visitUrl()

	// Receive link batches from workers; ranging over the channel replaces
	// the original for/select whose trailing `break` only exited the
	// select, not the loop, and so had no effect.
	for links := range c.response {
		for _, element := range links {
			link := element.String()
			// Bug fix: the original required the prefix "http:", which
			// rejected every https link — including everything linked from
			// the https seed page — so the crawl never progressed past the
			// first page. "http" admits both http and https.
			if !strings.HasPrefix(link, "http") || visited[link] {
				continue
			}
			visited[link] = true
			fmt.Println("Visiting", link)
			w := Worker{url: link, response: c.response}
			go w.visitUrl()
		}
	}
}
// main kicks off a crawl rooted at the Groovy Wikipedia article and
// blocks inside Crawler.Run until the process is killed.
func main() {
	c := Crawler{url: "https://en.wikipedia.org/wiki/Groovy"}
	c.Run()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment