Skip to content

Instantly share code, notes, and snippets.

@techjanitor
Last active August 29, 2015 14:07
Show Gist options
  • Save techjanitor/5effc78b792cf79eb9a0 to your computer and use it in GitHub Desktop.
Scrape a website with GoQuery
// Scrape fetches the page at address, extracts data from every element
// matching the ".reply_body" CSS class, and returns the results serialized
// as CSV in a buffer. It exits the program via log.Fatal on any error.
func Scrape(address string) (b *bytes.Buffer) {
	// Fetch and parse the document (goquery handles the HTTP request and HTML parsing).
	doc, err := goquery.NewDocument(address)
	if err != nil {
		log.Fatal(err)
	}
	// Buffer for the CSV output. NOTE: this must assign (`=`) to the named
	// return parameter — the original `b := ...` redeclares `b` and does
	// not compile ("no new variables on left side of :=").
	b = &bytes.Buffer{}
	// CSV is used as the serialization format here, but it can be anything.
	writer := csv.NewWriter(b)
	// Search for the CSS class and loop through all matching elements.
	doc.Find(".reply_body").Each(func(i int, s *goquery.Selection) {
		// First href from any <a> element (empty string if none found).
		img, _ := s.Find("a").Attr("href")
		// Text content from a specific class.
		name := s.Find(".whatever").Text()
		// Get a class's contents, using Not to strip out unwanted children.
		// Renamed from `time` to avoid shadowing the time package.
		posted := s.Find(".info").Contents().Not(".name").Not(".title").Not("a").Text()
		// Inner HTML of a certain class; it comes escaped, so unescape it.
		reply, _ := s.Find(".reply").Html()
		reply = html.UnescapeString(reply)
		// Write one CSV record per matched element; don't drop write errors.
		if err := writer.Write([]string{img, name, posted, reply}); err != nil {
			log.Fatal(err)
		}
	})
	// Flush buffered records into b, then surface any deferred write error
	// (csv.Writer reports Flush failures only via Error()).
	writer.Flush()
	if err := writer.Error(); err != nil {
		log.Fatal(err)
	}
	return
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment