// Crawler.go: forked by @gmarik from yogesh-desai/Crawler.go (November 16, 2019).
package main

import (
	"bytes"
	"context"
	"flag"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"os"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/fetchbot"
	"github.com/PuerkitoBio/goquery"
	cdp "github.com/knq/chromedp"
	cdpr "github.com/knq/chromedp/runner"
)
var (
	// Protect access to dup
	mu sync.Mutex
	wg sync.WaitGroup
	// Duplicates table
	dup     = map[string]bool{}
	urls    []string
	chanURL = make(chan string)

	// Command-line flags
	// seed = flag.String("seed", "http://golang.org", "seed URL")
	seed        = flag.String("seed", "http://www.tokopedia.com/", "seed URL")
	cancelAfter = flag.Duration("cancelafter", 0, "automatically cancel the fetchbot after a given time")
	cancelAtURL = flag.String("cancelat", "", "automatically cancel the fetchbot at a given URL")
	stopAfter   = flag.Duration("stopafter", 2*time.Minute, "automatically stop the fetchbot after a given time")
	stopAtURL   = flag.String("stopat", "", "automatically stop the fetchbot at a given URL")
	memStats    = flag.Duration("memstats", 5*time.Minute, "display memory statistics at a given interval")
)
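// An example invocation using the flags defined above (the flag values here
// are illustrative, not defaults beyond what the flag declarations state):
//
//	go run Crawler.go -seed "http://www.tokopedia.com/" -stopafter 5m -memstats 1m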
// DoExtract receives discovered URLs from chanURL, records each one, and runs
// the chromedp extraction on it. It loops until the program exits.
func DoExtract(chanURL chan string) {
	time.Sleep(2 * time.Second)
	for {
		url := <-chanURL
		// Append the visited URL, then extract product info from it
		urls = append(urls, url)
		DoCDP(url)
	}
}
func main() {
	flag.Parse()
	// Parse the provided seed
	u, err := url.Parse(*seed)
	check(err, "Error in parsing the seed URL")
	log.Println("The URL: ", u)
	// Start the extractor before crawling so sends on chanURL don't block forever
	go DoExtract(chanURL)
	chanURL <- u.String()
	DoCrawl(u, chanURL)
	close(chanURL)
	log.Println("Unique URLs seen: ", len(dup), "\nProcessed URLs: ", urls)
	if _, err := os.Stat(pwd() + "/TokoProductDetails.csv"); !os.IsNotExist(err) {
		log.Println("The output TSV file location: ", pwd()+"/TokoProductDetails.csv")
	} else {
		log.Println("Required data is not present in any of the processed URLs.")
	}
	// Write the processed URLs to a file
	fmt.Println("Total no. of URLs processed: ", len(urls), "\nThe processed URLs are in the file: ", WriteProcessedUrlsToFile(urls))
}
func DoCrawl(u *url.URL, chanURL chan string) {
	// Create the muxer
	mux := fetchbot.NewMux()
	// Handle all errors the same
	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		log.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
	}))
	// Handle GET requests for HTML responses: parse the body and enqueue all
	// links as HEAD requests.
	mux.Response().Method("GET").ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			// Process the body to find the links
			doc, err := goquery.NewDocumentFromResponse(res)
			if err != nil {
				log.Printf("[ERR - GoQuery] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
				return
			}
			// Enqueue all links as HEAD requests
			enqueueLinks(ctx, doc)
		}))
	// Handle HEAD requests for HTML responses coming from the source host - we
	// don't want to crawl links from other hosts.
	mux.Response().Method("HEAD").Host(u.Host).ContentType("text/html").Handler(fetchbot.HandlerFunc(
		func(ctx *fetchbot.Context, res *http.Response, err error) {
			if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
				log.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
			} else {
				wg.Add(1)
				go func() {
					defer wg.Done() // let wg.Wait() below return once the send completes
					// Hand the URL to the extractor via chanURL
					chanURL <- ctx.Cmd.URL().String()
				}()
				wg.Wait()
			}
		}))
	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
	h := logHandler(mux)
	if *stopAtURL != "" || *cancelAtURL != "" {
		stopURL := *stopAtURL
		if *cancelAtURL != "" {
			stopURL = *cancelAtURL
		}
		h = stopHandler(stopURL, *cancelAtURL != "", logHandler(mux))
	}
	f := fetchbot.New(h)
	// First mem stat print must be right after creating the fetchbot
	if *memStats > 0 {
		// Print starting stats
		printMemStats(nil)
		// Run at regular intervals
		runMemStats(f, *memStats)
		// On exit, print ending stats after a GC
		defer func() {
			runtime.GC()
			printMemStats(nil)
		}()
	}
	// Start processing
	q := f.Start()
	// If a stop or cancel is requested after some duration, launch the goroutine
	// that will stop or cancel.
	if *stopAfter > 0 || *cancelAfter > 0 {
		after := *stopAfter
		stopFunc := q.Close
		if *cancelAfter != 0 {
			after = *cancelAfter
			stopFunc = q.Cancel
		}
		go func() {
			c := time.After(after)
			<-c
			stopFunc()
		}()
	}
	// Enqueue the seed, which is the first entry in the dup map
	dup[*seed] = true
	_, err := q.SendStringGet(*seed)
	if err != nil {
		log.Printf("[ERR] GET %s - %s\n", *seed, err)
	}
	q.Block()
}
func runMemStats(f *fetchbot.Fetcher, tick time.Duration) {
	var mu sync.Mutex
	var di *fetchbot.DebugInfo
	// Start goroutine to collect fetchbot debug info
	go func() {
		for v := range f.Debug() {
			mu.Lock()
			di = v
			mu.Unlock()
		}
	}()
	// Start ticker goroutine to print mem stats at regular intervals
	go func() {
		c := time.Tick(tick)
		for range c {
			mu.Lock()
			printMemStats(di)
			mu.Unlock()
		}
	}()
}
func printMemStats(di *fetchbot.DebugInfo) {
	var mem runtime.MemStats
	runtime.ReadMemStats(&mem)
	buf := bytes.NewBuffer(nil)
	buf.WriteString(strings.Repeat("=", 72) + "\n")
	buf.WriteString("Memory Profile:\n")
	buf.WriteString(fmt.Sprintf("\tAlloc: %d Kb\n", mem.Alloc/1024))
	buf.WriteString(fmt.Sprintf("\tTotalAlloc: %d Kb\n", mem.TotalAlloc/1024))
	buf.WriteString(fmt.Sprintf("\tNumGC: %d\n", mem.NumGC))
	buf.WriteString(fmt.Sprintf("\tGoroutines: %d\n", runtime.NumGoroutine()))
	if di != nil {
		buf.WriteString(fmt.Sprintf("\tNumHosts: %d\n", di.NumHosts))
	}
	buf.WriteString(strings.Repeat("=", 72))
	log.Println(buf.String())
}
// stopHandler stops the fetcher if the stopurl is reached. Otherwise it dispatches
// the call to the wrapped Handler.
func stopHandler(stopurl string, cancel bool, wrapped fetchbot.Handler) fetchbot.Handler {
	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		if ctx.Cmd.URL().String() == stopurl {
			log.Printf(">>>>> STOP URL %s\n", ctx.Cmd.URL())
			// It is generally not a good idea to stop/block from a handler goroutine,
			// so do it in a separate goroutine.
			go func() {
				if cancel {
					ctx.Q.Cancel()
				} else {
					ctx.Q.Close()
				}
			}()
			return
		}
		wrapped.Handle(ctx, res, err)
	})
}
// logHandler prints the fetch information and dispatches the call to the wrapped Handler.
func logHandler(wrapped fetchbot.Handler) fetchbot.Handler {
	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		if err == nil {
			log.Printf("[%d] %s %s - %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL(), res.Header.Get("Content-Type"))
		}
		wrapped.Handle(ctx, res, err)
	})
}
// enqueueLinks resolves every anchor href in the document, enqueues unseen
// links as HEAD requests, and forwards them to the extractor via chanURL.
func enqueueLinks(ctx *fetchbot.Context, doc *goquery.Document) {
	mu.Lock()
	defer mu.Unlock()
	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
		val, _ := s.Attr("href")
		// Resolve address
		u, err := ctx.Cmd.URL().Parse(val)
		if err != nil {
			log.Printf("error: resolve URL %s - %s\n", val, err)
			return
		}
		url := u.String()
		if !dup[url] {
			if _, err := ctx.Q.SendStringHead(url); err != nil {
				log.Printf("error: enqueue head %s - %s\n", u, err)
			} else {
				dup[url] = true
				wg.Add(1)
				go func() {
					defer wg.Done() // let wg.Wait() below return once the send completes
					// Enqueue the resolved link (not the current page's URL) in chanURL
					chanURL <- url
				}()
				wg.Wait()
			}
		}
	})
}
//================================================================================
//================================================================================
//================================================================================
// getProductInfo extracts the required information using the chromedp package.
func getProductInfo(urlstr, sel string, res *[]byte, pId, pUrl, url *string) cdp.Tasks {
	return cdp.Tasks{
		cdp.Navigate(urlstr),
		cdp.Sleep(5 * time.Second),
		cdp.WaitVisible(sel, cdp.ByID),
		cdp.EvaluateAsDevTools("document.getElementById('product-id').value;", pId),
		cdp.EvaluateAsDevTools("document.getElementById('product-url').value;", pUrl),
		cdp.EvaluateAsDevTools("document.getElementById('webyclip-widget-3').contentWindow.document.body.outerHTML;", res),
	}
}
// isPresent checks for the existence of the webyclip-widget-3 element.
func isPresent(url string, res *[]byte) cdp.Tasks {
	return cdp.Tasks{
		cdp.Navigate(url),
		cdp.Sleep(15 * time.Second),
		cdp.EvaluateAsDevTools("if (document.getElementById('webyclip-thumbnails')) {document.getElementById('webyclip-thumbnails').childElementCount;} else {console.log('0')}", res),
	}
}
//================================================================================
// getVideoLinks returns the YouTube video links present in the iframe
// webyclip-widget-3, as a single comma-separated string.
func getVideoLinks(buf []byte) string {
	var videoLinks string
	// Convert the byte buffer to a string
	innerDoc := string(buf)
	tmp := strings.TrimSpace(innerDoc)
	// Find the video links and build one final string
	tmpStr := strings.Fields(tmp)
	matchStr := "i.ytimg.com/vi/"
	yUrl := "https://www.youtube.com/watch?v="
	for _, v := range tmpStr {
		if strings.Contains(v, matchStr) {
			vv := strings.TrimPrefix(v, "src=\\\"//i.ytimg.com/vi/")
			id := strings.Split(vv, "/")
			youtubeLink := yUrl + id[0]
			videoLinks += youtubeLink + ","
		}
	}
	if videoLinks == "" {
		return ""
	}
	// Return the links without the trailing comma
	return videoLinks[:len(videoLinks)-1]
}
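// For example, a thumbnail fragment such as the following in the captured
// widget HTML (the video ID here is illustrative):
//
//	src=\"//i.ytimg.com/vi/dQw4w9WgXcQ/default.jpg\"
//
// is mapped to https://www.youtube.com/watch?v=dQw4w9WgXcQ in the result.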
//========================================================================================
// WriteToFile appends a record to the file at filePath, creating the file
// with a header row first if it does not exist yet.
func WriteToFile(filePath, record string) {
	f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		log.Println("File doesn't exist. It will be created with the headers before adding data.")
		// If the file does not exist, create it with the header, then write the record.
		file, err1 := os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0644)
		if err1 != nil {
			log.Println("File Open operation failed.")
			return
		}
		defer file.Close()
		header := "Product_ID" + "\t" + "Product_URL" + "\t" + "Youtube_Video_URLs"
		file.WriteString(fmt.Sprintf("%s\n", header))
		file.WriteString(fmt.Sprintf("%s\n", record))
		return
	}
	defer f.Close()
	log.Println("File already exists. Adding the data for the URL.")
	f.WriteString(fmt.Sprintf("%s\n", record))
}
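// The resulting file is tab-separated, one product per line, e.g. (the values
// below are illustrative):
//
//	Product_ID	Product_URL	Youtube_Video_URLs
//	12345	http://www.tokopedia.com/some-product	https://www.youtube.com/watch?v=abc123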
//================================================================================
// WriteProcessedUrlsToFile writes every processed URL to ProcessedURLs.csv,
// one per line, and returns the file path.
func WriteProcessedUrlsToFile(urls []string) string {
	filePath := pwd() + "/ProcessedURLs.csv"
	f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0644)
	check(err, "Error in file Open operation")
	defer f.Close()
	for _, url := range urls {
		f.WriteString(fmt.Sprintf("%s\n", url))
	}
	return filePath
}
//================================================================================
// check logs the error along with the given message and exits if err is not nil.
func check(err error, str string) {
	if err != nil {
		log.Fatalln(err, str)
	}
}

// pwd returns the current working directory from which the binary was invoked;
// used to save the CSV files.
func pwd() string {
	pwd, err := os.Getwd()
	check(err, "Error in getting current working dir.")
	return pwd
}
//================================================================================
// DoCDP drives a headless Chrome instance (via chromedp) to extract product
// details from the given URL and append them to the TSV output file.
func DoCDP(url string) {
	// create context
	ctxt, cancel := context.WithCancel(context.Background())
	defer cancel()
	// create chrome instance
	c, err := cdp.New(ctxt, cdp.WithRunnerOptions(cdpr.Flag("disable-web-security", "1")))
	check(err, "Error in creating new cdp instance")
	// run task list
	var buf, buf1 []byte
	var pId, pUrl string
	// Check for the existence of the webyclip-widget-3 on the page
	err = c.Run(ctxt, isPresent(url, &buf1))
	check(err, "Error in Run method of cdp")
	if len(buf1) == 0 || bytes.EqualFold([]byte("0"), buf1) {
		log.Println("No webyclip-widget-3 on page:\n ", url)
		// shutdown chrome
		err = c.Shutdown(ctxt)
		check(err, "Error in shutting down chrome")
		// wait for chrome to finish
		err = c.Wait()
		check(err, "Error in wait to shutdown chrome")
		return
	}
	// The widget is present: extract the product ID, product URL, and widget HTML
	err = c.Run(ctxt, getProductInfo(url, `#webyclip-widget-3`, &buf, &pId, &pUrl, &url))
	check(err, "Error in Run method of cdp")
	// shutdown chrome
	err = c.Shutdown(ctxt)
	check(err, "Error in shutting down chrome")
	// wait for chrome to finish
	err = c.Wait()
	check(err, "Error in wait to shutdown chrome")
	pLinks := getVideoLinks(buf)
	record := pId + "\t" + pUrl + "\t" + pLinks
	filePath := pwd() + "/TokoProductDetails.csv"
	WriteToFile(filePath, record)
}