Investigating web scraping with Colly #go
package main | |
import (
	"encoding/base64"
	"flag"
	"fmt"
	"os"
	"strings"

	"github.com/gocolly/colly"
	"github.com/gookit/color"
)
var green = color.FgGreen.Render | |
var red = color.FgRed.Render | |
var yellow = color.FgYellow.Render | |
func SetBasicAuth(c *colly.Collector, user string, password string) { | |
c.OnRequest(func(r *colly.Request) { | |
r.Headers.Set("Authorization", "Basic "+base64.StdEncoding.EncodeToString([]byte(user+":"+password))) | |
}) | |
} | |
func main() { | |
verbose := flag.Bool("v", false, "verbose") | |
visiting := flag.Bool("visiting", false, "print visiting links") | |
found := flag.Bool("found", false, "print found links") | |
flag.Parse() | |
c := colly.NewCollector( | |
colly.AllowedDomains("localhost"), | |
) | |
SetBasicAuth(c, "test", "test") | |
// On every a element which has href attribute call callback | |
c.OnHTML("a[href]", func(e *colly.HTMLElement) { | |
link := e.Attr("href") | |
if *verbose { | |
fmt.Printf("%s %s\n", strings.TrimSpace(e.Text), yellow(link)) | |
} | |
if *found { | |
fmt.Println(e.Request.AbsoluteURL(link)) | |
} | |
// Only those links are visited which are in AllowedDomains | |
c.Visit(e.Request.AbsoluteURL(link)) | |
}) | |
c.OnRequest(func(r *colly.Request) { | |
if *verbose { | |
fmt.Println(green("Visiting"), yellow(r.URL.String())) | |
} | |
if *visiting { | |
fmt.Println(r.URL.String()) | |
} | |
}) | |
c.OnError(func(r *colly.Response, err error) { | |
if *verbose { | |
fmt.Printf("Request URL %s failed with response %s (%s)\n", red(r.Request.URL), red(r.StatusCode), err) | |
} | |
}) | |
c.Visit("http://localhost") | |
} |
package main | |
import ( | |
"fmt" | |
"log" | |
"net/http" | |
"strings" | |
"time" | |
"github.com/goji/httpauth" | |
"github.com/gookit/color" | |
) | |
// init configures the standard logger to prefix each entry with the date,
// the time and the short file name with line number of the call site.
func init() {
	// log.LstdFlags is the standard library's shorthand for Ldate | Ltime.
	log.SetFlags(log.LstdFlags | log.Lshortfile)
}
func logMiddleware(next http.Handler, header bool) http.Handler { | |
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { | |
color.FgRed.Println(r.URL.String()) | |
if header { | |
for k, v := range r.Header { | |
fmt.Printf("%s: %s\n", k, strings.Join(v[:], ", ")) | |
} | |
} | |
next.ServeHTTP(w, r) | |
}) | |
} | |
// cookieMiddleware returns a handler that sets a cookie named "foo" with the
// value "bar", expiring 30 minutes after the request is handled, on every
// response before passing the request on to next.
func cookieMiddleware(next http.Handler) http.Handler {
	const lifetime = 30 * time.Minute

	withCookie := func(w http.ResponseWriter, r *http.Request) {
		http.SetCookie(w, &http.Cookie{
			Name:    "foo",
			Value:   "bar",
			Expires: time.Now().Add(lifetime),
		})
		next.ServeHTTP(w, r)
	}
	return http.HandlerFunc(withCookie)
}
func main() { | |
authHandler := httpauth.SimpleBasicAuth("test", "test") | |
staticFiles := http.FileServer(http.Dir("content")) | |
muxer := http.NewServeMux() | |
muxer.Handle("/", authHandler(cookieMiddleware(logMiddleware(staticFiles, false)))) | |
http.ListenAndServe(":80", muxer) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment