Skip to content

Instantly share code, notes, and snippets.

@jamo
Created August 3, 2019 20:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jamo/74f2654d41605f69cbc13204209ca949 to your computer and use it in GitHub Desktop.
Save jamo/74f2654d41605f69cbc13204209ca949 to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"encoding/json"
"flag"
"fmt"
"os"
"strings"
"github.com/gocolly/colly"
"github.com/gocolly/colly/queue"
)
// Result is one record of the JSON-lines output file: what was learned
// (or what went wrong) for a single crawled URL.
type Result struct {
	// URL is the address of the request this record belongs to.
	URL string `json:"url"`

	// Generator is the content of the page's <meta name="generator"> tag.
	// It stays empty when no such value was found.
	Generator string `json:"generator"`

	// NextIDFound reports whether an element with id "__next" was found
	// on the page.
	NextIDFound bool `json:"nextIDFound"`

	// NextDataFound reports whether an element with id "__NEXT_DATA__"
	// was found on the page.
	NextDataFound bool `json:"nextDataFound"`

	// Error is true when the record was produced by a failed request.
	Error bool `json:"error"`

	// Status is the HTTP status code recorded for the request.
	Status int `json:"status"`
}
func main() {
urlsFileName := flag.String("urls", "urls.csv", "Name of the top 10M")
resFileName := flag.String("result", "result.jl", "Name result file")
flag.Parse()
resFile, resFileErr := os.OpenFile(*resFileName, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600)
if resFileErr != nil {
panic(resFileErr)
}
defer resFile.Close()
// Instantiate default collector
c := colly.NewCollector(
// MaxDepth is 2, so only the links on the scraped page
// and links on those pages are visited
//colly.MaxDepth(2),
//colly.Async(true),
)
//c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})
// create a request queue with 2 consumer threads
q, _ := queue.New(
20, // Number of consumer threads
&queue.InMemoryQueueStorage{MaxSize: 100000000000}, // Use default queue storage
)
c.OnRequest(func(r *colly.Request) {
fmt.Println("visiting", r.URL)
})
c.OnError(func(r *colly.Response, err error) {
fmt.Println("Request URL:", r.Request.URL, "failed with response:", "\nError:", err.Error())
retVal := Result{URL: r.Request.URL.String(), Status: r.StatusCode, Error: true}
jsonstr, _ := json.Marshal(retVal)
if _, err := resFile.WriteString(string(jsonstr) + "\n"); err != nil {
panic(err)
}
})
c.OnHTML("html", func(e *colly.HTMLElement) {
retVal := Result{URL: e.Request.URL.String(), Error: false, Status: 200}
generator := e.ChildAttr(`meta[name="generator"]`, "content")
if generator != "" {
retVal.Generator = generator
}
nextID := e.ChildAttr(`#__next`, `id`)
if nextID != "" {
retVal.NextIDFound = true
}
nextdata := e.ChildAttr(`#__NEXT_DATA__`, `id`)
if nextdata != "" {
retVal.NextDataFound = true
}
jsonstr, _ := json.Marshal(retVal)
if _, err := resFile.WriteString(string(jsonstr) + "\n"); err != nil {
panic(err)
}
//link := e.Attr("href")
// Print link
//fmt.Printf("Link found: %q -> %s\n", e.Text, link)
// Visit link found on page
// Only those links are visited which are in AllowedDomains
//c.Visit(e.Request.AbsoluteURL(link))
})
crawlFile, crawlFileErr := os.OpenFile(*urlsFileName, os.O_RDONLY, 0600)
if crawlFileErr != nil {
panic(crawlFileErr)
}
defer crawlFile.Close()
scanner := bufio.NewScanner(crawlFile)
for scanner.Scan() {
q.AddURL(`http://` + strings.ReplaceAll(strings.Split(scanner.Text(), ",")[1], `"`, ``))
}
fmt.Println(`ok, starting up`)
q.Run(c)
c.Wait()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment