Skip to content

Instantly share code, notes, and snippets.

@frankcash
Created March 14, 2018 02:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save frankcash/7bc32891b686cea0a0cf98ff6ae90b33 to your computer and use it in GitHub Desktop.
Save frankcash/7bc32891b686cea0a0cf98ff6ae90b33 to your computer and use it in GitHub Desktop.
Structured Colly Example for Reddit.com/r/programming
package main
import (
"fmt"
"time"
"github.com/gocolly/colly"
)
// item is one story scraped from a subreddit listing page.
type item struct {
	// StoryURL is the href of the story's title link.
	StoryURL string
	// Source is the URL of the listing page the story was scraped from.
	Source string
	// CrawledAt is the time at which the story element was processed.
	CrawledAt time.Time
	// Comments is the href of the story's comments link.
	Comments string
	// Title is the text of the story's title link.
	Title string
}
// main scrapes the r/programming listing page with colly, collects one item
// per story element found on it, and prints the collected items.
func main() {
	// Single source of truth for the page being scraped: it is both the URL
	// that is visited and the Source recorded on every item.
	const subredditURL = "https://www.reddit.com/r/programming/"

	var stories []item

	// Instantiate the default collector, restricted to www.reddit.com.
	c := colly.NewCollector(
		colly.AllowedDomains("www.reddit.com"),
	)

	// Run the callback for every element matching the .top-matter class
	// selector. Per the original author, this class is unique to the div that
	// holds all information about a story.
	c.OnHTML(".top-matter", func(e *colly.HTMLElement) {
		stories = append(stories, item{
			StoryURL:  e.ChildAttr("a[data-event-action=title]", "href"),
			Source:    subredditURL,
			Title:     e.ChildText("a[data-event-action=title]"),
			Comments:  e.ChildAttr("a[data-event-action=comments]", "href"),
			CrawledAt: time.Now(),
		})
	})

	// Before making a request print "Visiting ...".
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Report a failed visit instead of silently printing an empty result.
	if err := c.Visit(subredditURL); err != nil {
		fmt.Println("Visit failed:", err)
		return
	}
	c.Wait()

	fmt.Println(stories)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment