Skip to content

Instantly share code, notes, and snippets.

@rafakato
Created July 30, 2014 22:23
Show Gist options
  • Save rafakato/e4e352cf69e2a56e5d7b to your computer and use it in GitHub Desktop.
Save rafakato/e4e352cf69e2a56e5d7b to your computer and use it in GitHub Desktop.
Go Crawler
package main
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"gopkg.in/mgo.v2"
"gopkg.in/mgo.v2/bson"
"log"
"net/http"
"strconv"
"strings"
"sync"
"time"
)
// perror terminates the program through log.Fatal when err is non-nil.
// A nil error is a no-op.
func perror(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
// ScrapPage downloads the product page at url, extracts the product name and
// the contents of every ".product-grid table", and sends the resulting record
// on infoChannel.
//
// Exactly one value is sent per call, even when the download fails: the error
// is logged and a record with an empty Nome is sent instead, so the goroutine
// receiving from infoChannel is always released. Aborting the process on a
// single failed request would throw away the rest of the crawl.
//
// The wg.Add(1) below is balanced by the wg.Done in AddToList, which consumes
// the value sent here. Because the Add happens inside this function, a caller
// that runs ScrapPage in a goroutine must not rely on this Add alone being
// visible to a concurrent wg.Wait.
func ScrapPage(url string, infoChannel chan InformacaoNutricional) {
	wg.Add(1)
	fmt.Printf("Scraping page %s\n", url)

	doc, err := goquery.NewDocument(url)
	if err != nil {
		log.Printf("scraping %s: %v", url, err)
		// Empty Nome marks the record as unusable for the consumer.
		infoChannel <- InformacaoNutricional{LojaUrl: url}
		return
	}
	fmt.Printf("Page data received: %s\n", url)

	info := InformacaoNutricional{
		LojaUrl:      url,
		Nome:         strings.TrimSpace(doc.Find(".product-header > .product-header__heading").Text()),
		Propriedades: make(map[string]string),
	}
	doc.Find(".product-grid table").Each(func(_ int, table *goquery.Selection) {
		// The second header row carries the measure; when the page has
		// several tables the last one wins.
		info.Medida = strings.TrimSpace(table.Find("thead > tr").Eq(1).Text())
		table.Find("tbody > tr").Each(func(_ int, row *goquery.Selection) {
			// First cell is the property name, second cell its value.
			cells := row.Find("td")
			key := strings.TrimSpace(cells.Eq(0).Text())
			info.Propriedades[key] = strings.TrimSpace(cells.Eq(1).Text())
		})
	})
	infoChannel <- info
}
// infosMu guards infos. AddToList is started as one goroutine per page, so
// appends to the shared slice must be serialized.
var infosMu sync.Mutex

// AddToList receives one record from infoChannel and appends it to the
// package-level infos slice when both Nome and Medida are non-empty; other
// records are dropped. It always calls wg.Done exactly once, after the
// receive has completed.
//
// index is accepted for the caller's convenience and is not used.
func AddToList(index int, infoChannel chan InformacaoNutricional) {
	defer wg.Done()

	info := <-infoChannel
	if info.Nome == "" || info.Medida == "" {
		return
	}

	infosMu.Lock()
	infos = append(infos, info)
	infosMu.Unlock()
}
var (
	// wg counts the scrape/collect pairs still in flight for the current batch.
	wg sync.WaitGroup

	// infos holds the records collected for the current batch; its elements
	// are passed to the collection's Insert call.
	infos []interface{}
)
// main crawls product pages 1..maxID in batches of totalPerLoop. For each
// page it first checks MongoDB for an existing record and scrapes only pages
// that are not stored yet; every batch is written to the collection once all
// of its scrapes have finished.
func main() {
	http.DefaultTransport.(*http.Transport).ResponseHeaderTimeout = time.Minute * 3

	mongodbServer := "127.0.0.1:27017"
	fmt.Printf("Connecting to MongoDB on %s\n", mongodbServer)
	session, err := mgo.Dial(mongodbServer)
	perror(err)
	defer session.Close()
	fmt.Printf("Connected to MongoDB on %s\n", mongodbServer)

	alimentosCollection := session.DB("CrawlerAlimentos").C("Alimentos")

	const (
		totalPerLoop = 100
		maxID        = 10000000
	)
	infoChannel := make(chan InformacaoNutricional)

	for n := 0; n < maxID/totalPerLoop; n++ {
		start := n*totalPerLoop + 1
		end := (n + 1) * totalPerLoop
		infos = make([]interface{}, 0)

		for i := start; i <= end; i++ {
			url := "http://www.paodeacucar.com.br/produto/" + strconv.Itoa(i)

			// mgo's bson encoder stores untagged struct fields under their
			// lowercased name, so the stored key is "lojaurl"; filtering on
			// "LojaUrl" never matches and every page would be scraped again.
			var info InformacaoNutricional
			if err := alimentosCollection.Find(bson.M{"lojaurl": url}).One(&info); err != nil && err != mgo.ErrNotFound {
				// Best effort: report the failure and fall through to a scrape.
				log.Printf("looking up %s: %v", url, err)
			}

			if info.Nome == "" {
				// Hold one count that is taken before the goroutine starts,
				// so wg.Wait below cannot observe a zero counter while
				// ScrapPage has not yet registered itself. It is released
				// only after ScrapPage has handed its record over.
				wg.Add(1)
				go func() {
					defer wg.Done()
					ScrapPage(url, infoChannel)
				}()
				go AddToList(i, infoChannel)
			} else {
				fmt.Printf("Skipping page: %s\n", url)
			}
			time.Sleep(150 * time.Millisecond)
		}

		wg.Wait()
		if len(infos) > 0 {
			fmt.Println("Writing batch to db")
			if err := alimentosCollection.Insert(infos...); err != nil {
				log.Printf("writing batch %d-%d: %v", start, end, err)
			}
		}
		time.Sleep(1 * time.Second)
	}
	fmt.Println("Done!")
}
// InformacaoNutricional is the nutrition record scraped from one product page.
//
//   - LojaUrl: URL of the page the record was scraped from.
//   - Nome: product name, taken from the page's product header heading.
//   - Medida: text of the second header row of the nutrition table.
//   - Propriedades: one entry per table body row, mapping the text of the
//     first cell to the text of the second cell.
type InformacaoNutricional struct {
	LojaUrl, Nome, Medida string
	Propriedades          map[string]string
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment