Skip to content

Instantly share code, notes, and snippets.

@yogesh-desai
Created September 26, 2017 07:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yogesh-desai/bf0d203afe2d151bbefbcc98b427a09c to your computer and use it in GitHub Desktop.
Save yogesh-desai/bf0d203afe2d151bbefbcc98b427a09c to your computer and use it in GitHub Desktop.
package main
import (
"golang.org/x/net/html"
"net/http"
"net/url"
"runtime"
"strings"
"context"
"bytes"
"flag"
"sync"
"time"
"fmt"
"log"
"os"
// "github.com/PuerkitoBio/fetchbot"
// "github.com/PuerkitoBio/goquery"
cdp "github.com/knq/chromedp"
cdpr "github.com/knq/chromedp/runner"
)
// baseurl is the string form of the parsed seed URL. main assigns it once and
// exploreURL uses it as the prefix a link must carry to be queued.
var baseurl string

// Command-line flags.
var (
	// seed is the URL the crawl starts from.
	seed = flag.String("seed", "https://jeremywho.com", "seed URL")
	//seed = flag.String("seed", "https://www.tokopedia.com/", "seed URL")

	// cancelAfter and stopAfter both arm the exit timer in main; when both
	// are set, cancelAfter takes precedence.
	cancelAfter = flag.Duration("cancelafter", 0, "automatically cancel the fetchbot after a given time")
	stopAfter   = flag.Duration("stopafter", 0, "automatically stop the fetchbot after a given time")

	// cancelAtURL and stopAtURL are registered but not read anywhere in the
	// visible code.
	cancelAtURL = flag.String("cancelat", "", "automatically cancel the fetchbot at a given URL")
	stopAtURL   = flag.String("stopat", "", "automatically stop the fetchbot at a given URL")

	// memStats is the interval between memory reports; a value <= 0 disables
	// reporting in main.
	memStats = flag.Duration("memstats", 5*time.Minute, "display memory statistics at a given interval")
)
// DoExtract receives URLs from chanURL and runs the chromedp extraction
// (DoCDP) on each of them, one at a time, until chanURL is closed.
//
// It sleeps for two seconds before the first receive.
// NOTE(review): presumably this lets processURL pick up the seed URL first,
// since both goroutines read from the same channel — confirm.
func DoExtract(chanURL chan string) {
	time.Sleep(2 * time.Second)

	// Ranging over the channel ends the loop when the channel is closed. A
	// bare receive inside an endless loop would instead yield "" forever and
	// call DoCDP("") in a tight loop.
	for pageURL := range chanURL {
		DoCDP(pageURL)
	}
}
// main parses the flags, starts the crawler (processURL) and the extractor
// (DoExtract) goroutines, feeds them the seed URL and then blocks until a
// value arrives on the done channel.
//
// NOTE(review): processURL and DoExtract both receive from the same
// unbuffered channel, so each URL sent on it is delivered to only one of
// them — confirm this is intended.
func main() {
	flag.Parse()

	seedURL, err := url.Parse(*seed)
	check(err, "Error in parsing the seed url")
	log.Println("The URL: ", seedURL)
	baseurl = seedURL.String()

	urlProcessor := make(chan string)
	done := make(chan bool)

	go processURL(urlProcessor, done)
	go DoExtract(urlProcessor)

	// Blocks until one of the goroutines above receives the seed.
	urlProcessor <- seedURL.String()

	// Memory statistics: once now, then periodically, then once more after a
	// forced GC when main returns.
	if interval := *memStats; interval > 0 {
		printMemStats()
		runMemStats(interval)
		defer func() {
			runtime.GC()
			printMemStats()
		}()
	}

	// Optional exit timer. Either flag arms it; cancelAfter wins whenever it
	// is non-zero.
	if *stopAfter > 0 || *cancelAfter > 0 {
		timeout := *stopAfter
		if *cancelAfter != 0 {
			timeout = *cancelAfter
		}
		go func() {
			<-time.After(timeout)
			fmt.Println("The given timeout has occured. Exiting...")
			done <- true
		}()
	}

	<-done
	fmt.Println("Done")

	/*	if _, err := os.Stat(pwd() + "/TokoProductDetails.csv"); !os.IsNotExist(err) {
			log.Println("The output TSV file location: ", pwd() + "/TokoProductDetails.csv")
		} else {
			log.Println("Required data is not present in any of processed URLs.")
		}
		// Write the processed URLs to a file
		fmt.Println("Total no. of URLs processed: ", len(urls), "\nThe Processed URLs are in the file: ", WriteProcessedUrlsToFile(urls))
	*/
}
// runMemStats starts a background goroutine that calls printMemStats once per
// tick and returns immediately. The goroutine has no stop mechanism.
func runMemStats(tick time.Duration) {
	var mu sync.Mutex

	// report prints one snapshot while holding the lock.
	report := func() {
		mu.Lock()
		defer mu.Unlock()
		printMemStats()
	}

	go func() {
		for range time.Tick(tick) {
			report()
		}
	}()
}
// printMemStats logs a snapshot of the runtime memory statistics (current and
// cumulative allocation in Kb, number of GC runs) together with the current
// goroutine count, framed by two 72-character rules.
func printMemStats() {
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)

	rule := strings.Repeat("=", 72)

	var report bytes.Buffer
	fmt.Fprintf(&report, "%s\n", rule)
	report.WriteString("Memory Profile:\n")
	fmt.Fprintf(&report, "\tAlloc: %d Kb\n", stats.Alloc/1024)
	fmt.Fprintf(&report, "\tTotalAlloc: %d Kb\n", stats.TotalAlloc/1024)
	fmt.Fprintf(&report, "\tNumGC: %d\n", stats.NumGC)
	fmt.Fprintf(&report, "\tGoroutines: %d\n", runtime.NumGoroutine())
	report.WriteString(rule)

	log.Println(report.String())
}
// processURL is the crawl dispatcher. It receives URLs from urlProcessor,
// skips those it has already seen, and starts an exploreURL goroutine for
// every new one (which may in turn send more URLs on urlProcessor).
//
// When no URL arrives for 15 seconds it prints the number of explored pages,
// sends true on done and returns.
func processURL(urlProcessor chan string, done chan bool) {
	visited := make(map[string]bool)
	for {
		select {
		case link := <-urlProcessor:
			if visited[link] {
				continue
			}
			visited[link] = true
			go exploreURL(link, urlProcessor)
		case <-time.After(15 * time.Second):
			fmt.Printf("Explored %d pages\n", len(visited))
			done <- true
			// The crawl is over: returning avoids looping back and
			// blocking forever on a second send to done.
			return
		}
	}
}
// exploreURL fetches url with an HTTP GET, tokenizes the response body as
// HTML and sends the href value of every <a> start tag that begins with
// baseurl on urlProcessor. Fetch errors are printed and end the call; the
// function also returns when the tokenizer reports an error token (which
// includes end of input).
func exploreURL(url string, urlProcessor chan string) {
	fmt.Printf("Visiting %s.\n", url)

	resp, err := http.Get(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			return
		case html.StartTagToken:
			tok := tokenizer.Token()
			if tok.Data != "a" {
				continue
			}
			for _, attr := range tok.Attr {
				// Only links that stay under the seed URL are followed.
				if attr.Key == "href" && strings.HasPrefix(attr.Val, baseurl) {
					urlProcessor <- attr.Val
				}
			}
		}
	}
}
//================================================================================
//================================================================================
// getProductInfo builds the chromedp task list that navigates to urlstr,
// sleeps five seconds, waits for sel to become visible, and then evaluates
// three JavaScript expressions: the value of the 'product-id' element into
// pId, the value of the 'product-url' element into pUrl, and the outer HTML of
// the body inside the 'webyclip-widget-3' frame into res.
//
// The url parameter is not used by the task list.
func getProductInfo(urlstr, sel string, res *[]byte, pId, pUrl, url *string) cdp.Tasks {
	const (
		productIDExpr  = "document.getElementById('product-id').value;"
		productURLExpr = "document.getElementById('product-url').value;"
		widgetHTMLExpr = "document.getElementById('webyclip-widget-3').contentWindow.document.body.outerHTML;"
	)

	tasks := cdp.Tasks{
		cdp.Navigate(urlstr),
		cdp.Sleep(5 * time.Second),
		cdp.WaitVisible(sel, cdp.ByID),
		cdp.EvaluateAsDevTools(productIDExpr, pId),
		cdp.EvaluateAsDevTools(productURLExpr, pUrl),
		cdp.EvaluateAsDevTools(widgetHTMLExpr, res),
	}
	return tasks
}
// isPresent builds the chromedp task list that navigates to url, sleeps 15
// seconds and then evaluates a script into res: the child element count of
// the 'webyclip-thumbnails' element when that element exists, otherwise the
// script only logs '0' to the browser console.
func isPresent(url string, res *[]byte) cdp.Tasks {
	// cdp.EvaluateAsDevTools("document.getElementById('webyclip-thumbnails').childElementCount;", res),
	const thumbCountExpr = "if (document.getElementById('webyclip-thumbnails')) {document.getElementById('webyclip-thumbnails').childElementCount;} else {console.log('0')}"

	tasks := cdp.Tasks{
		cdp.Navigate(url),
		cdp.Sleep(15 * time.Second),
		cdp.EvaluateAsDevTools(thumbCountExpr, res),
	}
	return tasks
}
//================================================================================
// getVideoLinks scans buf (the HTML of the video widget) for YouTube
// thumbnail references of the form "i.ytimg.com/vi/<id>/..." and returns the
// matching watch URLs ("https://www.youtube.com/watch?v=<id>") joined by
// commas, in the order they appear.
//
// It returns the empty string when buf contains no thumbnail reference.
func getVideoLinks(buf []byte) string {
	const (
		thumbMarker = "i.ytimg.com/vi/"
		watchURL    = "https://www.youtube.com/watch?v="
	)

	var links []string
	// The markup is split on whitespace so that each attribute is inspected
	// on its own.
	for _, field := range strings.Fields(string(buf)) {
		at := strings.Index(field, thumbMarker)
		if at < 0 {
			continue
		}
		// The video ID is the path segment right after the marker. Locating
		// the marker (rather than trimming one fixed prefix) works regardless
		// of how the attribute is quoted or which scheme the URL uses.
		id := strings.SplitN(field[at+len(thumbMarker):], "/", 2)[0]
		links = append(links, watchURL+id)
	}

	// Join yields "" for no links, where slicing off a trailing comma from an
	// empty string would panic.
	return strings.Join(links, ",")
}
//========================================================================================
// WriteToFile appends record as one line to the tab-separated file at
// filePath. When the file does not exist yet it is created and a header line
// is written before the record.
//
// Failures are logged and the function returns; nothing is reported to the
// caller.
func WriteToFile(filePath, record string) {
	const header = "Product_ID" + "\t" + "Product_URL" + "\t" + "Youtube_Video_URLs"

	f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND, 0644)
	if err == nil {
		defer f.Close()
		log.Println("File exists Already. Adding the data for url.")
		if _, err := f.WriteString(record + "\n"); err != nil {
			log.Println("File write operation failed.", err)
		}
		return
	}

	// Only a missing file is a reason to create one; any other failure
	// (permissions, path is a directory, ...) is reported as it is.
	if !os.IsNotExist(err) {
		log.Println("File Open operation failed.", err)
		return
	}

	log.Println("File doesn't exists. File will be created with the headers before adding data.")
	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0644)
	if err != nil {
		log.Println("File Open operation failed.")
		return
	}
	defer file.Close()

	if _, err := file.WriteString(header + "\n" + record + "\n"); err != nil {
		log.Println("File write operation failed.", err)
	}
}
//================================================================================
// WriteProcessedUrlsToFile appends every entry of urls, one per line, to
// ProcessedURLs.csv in the current working directory (creating the file if
// needed) and returns the path of that file. The program exits through check
// if the file cannot be opened.
func WriteProcessedUrlsToFile(urls []string) string {
	outPath := pwd() + "/ProcessedURLs.csv"

	out, err := os.OpenFile(outPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
	check(err, "Error in file Open operation")
	defer out.Close()

	for i := range urls {
		out.WriteString(urls[i] + "\n")
	}
	return outPath
}
//================================================================================
// check is a no-op when err is nil. Otherwise it logs err followed by str and
// terminates the program via log.Fatalln.
func check(err error, str string) {
	if err == nil {
		return
	}
	log.Fatalln(err, str)
}
// pwd returns the current working directory of the process; the output files
// are written relative to it. The program exits through check if the
// directory cannot be determined.
func pwd() string {
	dir, err := os.Getwd()
	check(err, "Error in getting current workig dir.")
	return dir
}
//================================================================================
// DoCDP launches a Chrome instance through chromedp, loads url and, when the
// isPresent probe returns a non-empty result other than "0", extracts the
// product id, product URL and video widget HTML, then appends a tab-separated
// record to TokoProductDetails.csv in the current working directory.
//
// Chrome is shut down before the record is built. Every error is passed to
// check, which terminates the program.
func DoCDP(url string) {
	ctxt, cancel := context.WithCancel(context.Background())
	defer cancel()

	// c, err := cdp.New(ctxt, cdp.WithLog(log.Printf), cdp.WithRunnerOptions(cdpr.Flag("disable-web-security", "1")))
	c, err := cdp.New(ctxt, cdp.WithRunnerOptions(cdpr.Flag("disable-web-security", "1")))
	check(err, "Error in creating new cdp instance")

	// stopChrome shuts the browser down and waits for it to finish.
	stopChrome := func() {
		check(c.Shutdown(ctxt), "Error in shutting down chrome")
		check(c.Wait(), "Error in wait to shutdown chrome")
	}

	var widgetHTML, probe []byte
	var pId, pUrl string

	// First pass: probe the page for the video widget.
	err = c.Run(ctxt, isPresent(url, &probe))
	check(err, "Error in Run method of cdp")

	if len(probe) == 0 || bytes.EqualFold([]byte("0"), probe) {
		log.Println("No webyclip-widget-3 on page:\n ", url)
		stopChrome()
		return
	}

	// Second pass: the widget is there, pull the product details.
	err = c.Run(ctxt, getProductInfo(url, `#webyclip-widget-3`, &widgetHTML, &pId, &pUrl, &url))
	check(err, "Error in Run method of cdp")
	stopChrome()

	record := fmt.Sprint(pId + "\t" + pUrl + "\t" + getVideoLinks(widgetHTML))
	WriteToFile(pwd()+"/TokoProductDetails.csv", record)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment