Skip to content

Instantly share code, notes, and snippets.

@thomd

thomd/crawl.go

Created Oct 3, 2020
Embed
What would you like to do?
Investigating Webscraping with colly #go
package main
import (
	"encoding/base64"
	"flag"
	"fmt"
	"os"
	"strings"

	"github.com/gocolly/colly"
	"github.com/gookit/color"
)
// Shorthands for the gookit/color foreground renderers used to colorize the
// crawler's console output.
var (
	green  = color.FgGreen.Render
	red    = color.FgRed.Render
	yellow = color.FgYellow.Render
)
// SetBasicAuth registers a request callback on c that sets an HTTP Basic
// "Authorization" header, built from user and password, on the request.
func SetBasicAuth(c *colly.Collector, user string, password string) {
	// The credentials never change, so the header value is built once here
	// instead of on each callback invocation.
	credentials := base64.StdEncoding.EncodeToString([]byte(user + ":" + password))
	headerValue := "Basic " + credentials

	c.OnRequest(func(req *colly.Request) {
		req.Headers.Set("Authorization", headerValue)
	})
}
// main crawls http://localhost, following the links it finds on each page.
//
// Flags:
//
//	-v         verbose, colorized output (link text, visited URLs, failures)
//	-visiting  print each URL as it is requested
//	-found     print each discovered link as an absolute URL
//
// If the initial visit fails, the error is written to stderr and the process
// exits with status 1.
func main() {
	verbose := flag.Bool("v", false, "verbose")
	visiting := flag.Bool("visiting", false, "print visiting links")
	found := flag.Bool("found", false, "print found links")
	flag.Parse()

	c := colly.NewCollector(
		colly.AllowedDomains("localhost"),
	)
	SetBasicAuth(c, "test", "test")

	// Called for every <a> element that has an href attribute.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		if *verbose {
			fmt.Printf("%s %s\n", strings.TrimSpace(e.Text), yellow(link))
		}

		// AbsoluteURL yields "" for fragment-only ("#...") or unparsable
		// hrefs; there is nothing to report or visit for those.
		target := e.Request.AbsoluteURL(link)
		if target == "" {
			return
		}
		if *found {
			fmt.Println(target)
		}

		// Only links within AllowedDomains are actually fetched. The error is
		// deliberately dropped: Visit rejects out-of-domain and already
		// visited URLs with an error, which is the expected filtering here,
		// and failed requests are reported through OnError.
		_ = c.Visit(target)
	})

	c.OnRequest(func(r *colly.Request) {
		if *verbose {
			fmt.Println(green("Visiting"), yellow(r.URL.String()))
		}
		if *visiting {
			fmt.Println(r.URL.String())
		}
	})

	c.OnError(func(r *colly.Response, err error) {
		if *verbose {
			fmt.Printf("Request URL %s failed with response %s (%s)\n", red(r.Request.URL), red(r.StatusCode), err)
		}
	})

	// Surface a failed start on stderr so it neither goes unnoticed nor
	// pollutes the link list printed on stdout.
	if err := c.Visit("http://localhost"); err != nil {
		fmt.Fprintln(os.Stderr, "crawl failed:", err)
		os.Exit(1)
	}
}
package main
import (
"fmt"
"log"
"net/http"
"strings"
"time"
"github.com/goji/httpauth"
"github.com/gookit/color"
)
// init configures the standard logger to prefix each entry with the date,
// the time and the short file:line of the call site.
func init() {
	// LstdFlags is the standard library's name for Ldate | Ltime.
	log.SetFlags(log.LstdFlags | log.Lshortfile)
}
// logMiddleware wraps next with request logging. For each request it prints
// the URL via color.FgRed and, when header is true, every request header as
// "Name: value1, value2", then passes the request on to next.
func logMiddleware(next http.Handler, header bool) http.Handler {
	logged := func(w http.ResponseWriter, r *http.Request) {
		color.FgRed.Println(r.URL.String())
		if header {
			for name, values := range r.Header {
				fmt.Printf("%s: %s\n", name, strings.Join(values, ", "))
			}
		}
		next.ServeHTTP(w, r)
	}
	return http.HandlerFunc(logged)
}
// cookieMiddleware wraps next so that every response carries a "foo=bar"
// cookie that expires 30 minutes after the request is handled. The cookie is
// set before next runs.
func cookieMiddleware(next http.Handler) http.Handler {
	const lifetime = 30 * time.Minute

	withCookie := func(w http.ResponseWriter, r *http.Request) {
		http.SetCookie(w, &http.Cookie{
			Name:    "foo",
			Value:   "bar",
			Expires: time.Now().Add(lifetime),
		})
		next.ServeHTTP(w, r)
	}
	return http.HandlerFunc(withCookie)
}
// main serves the files under the "content" directory on port 80, behind
// HTTP basic auth (user "test", password "test"). Every request also passes
// through cookieMiddleware and logMiddleware. If the server cannot start or
// stops, the error is logged and the process exits with a non-zero status.
func main() {
	authHandler := httpauth.SimpleBasicAuth("test", "test")
	staticFiles := http.FileServer(http.Dir("content"))

	muxer := http.NewServeMux()
	// Outermost first: basic auth, then the cookie, then request logging
	// (header dump disabled), then the static files.
	muxer.Handle("/", authHandler(cookieMiddleware(logMiddleware(staticFiles, false))))

	// ListenAndServe always returns a non-nil error (for example when the
	// port cannot be bound); report it instead of exiting silently.
	log.Fatal(http.ListenAndServe(":80", muxer))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment