// Feed server: exposes the collected Bovespa "Plantão Empresas" news as Atom
// feeds and redirects individual news URLs to the original Bovespa pages.
package main

import (
	"flag"
	"fmt"
	"log"
	"net/http"
	"regexp"
	"time"

	"github.com/globocom/tsuru/db/storage"
	"github.com/gorilla/feeds"
	"labix.org/v2/mgo"
	"labix.org/v2/mgo/bson"
)

const (
	// NewsURL is the template for the original news page on the Bovespa site.
	NewsURL = "http://www.bmfbovespa.com.br/agencia/corpo.asp?origem=exibir&id=%s"
	// Limit caps the number of items returned in each feed.
	Limit = 100
)

var (
	listenHTTP string
	regexpNews = regexp.MustCompile(`^/bovespa/(\d+)$`)
)

func init() {
	flag.StringVar(&listenHTTP, "listen", "127.0.0.1:7676", "address to listen to connections")
	flag.Parse()
}

type News struct {
	ID    string `bson:"_id"`
	Title string
	Date  time.Time
}

// URL returns the address of the original news page on the Bovespa site.
func (n *News) URL() string {
	return fmt.Sprintf(NewsURL, n.ID)
}

// collection opens the MongoDB database and returns the "news" collection,
// ensuring the indexes used by the feed queries.
func collection() (*storage.Collection, error) {
	storage, err := storage.Open("localhost:27017", "bovespa_plantao_empresas")
	if err != nil {
		return nil, err
	}
	coll := storage.Collection("news")
	coll.EnsureIndex(mgo.Index{Key: []string{"title"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"-date"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"title", "-date"}, Background: true, Sparse: true})
	return coll, nil
}
// getFeed builds an Atom feed with the most recent news matching the given
// query, sorted by date descending.
func getFeed(query bson.M, id string) (*feeds.Feed, error) {
	coll, err := collection()
	if err != nil {
		return nil, err
	}
	defer coll.Close()
	var newsList []News
	err = coll.Find(query).Sort("-date").Limit(Limit).All(&newsList)
	if err != nil {
		return nil, err
	}
	location, _ := time.LoadLocation("America/Sao_Paulo")
	updated := time.Now()
	if len(newsList) > 0 {
		updated = newsList[0].Date.In(location)
	}
	feed := &feeds.Feed{
		Title:       "Bovespa - Plantão Empresas",
		Link:        &feeds.Link{Href: "http://www.bmfbovespa.com.br/Agencia-Noticias/ListarNoticias.aspx?w=" + id},
		Description: "Notícias sobre empresas listadas na Bovespa",
		Author:      &feeds.Author{Name: "Francisco Souza", Email: "f@souza.cc"},
		Created:     time.Date(2014, 3, 20, 10, 0, 0, 0, location),
		Updated:     updated,
	}
	for _, news := range newsList {
		item := feeds.Item{
			Id:          "http://plantao.souza.cc/bovespa/" + news.ID,
			Title:       news.Title,
			Link:        &feeds.Link{Href: "http://plantao.souza.cc/bovespa/" + news.ID},
			Description: news.Title,
			Author:      &feeds.Author{Name: "Bovespa", Email: "bovespa@bmfbovespa.com.br"},
			Created:     news.Date,
			Updated:     news.Date,
		}
		feed.Items = append(feed.Items, &item)
	}
	return feed, nil
}
func feedAll(w http.ResponseWriter, r *http.Request) {
	// The negative lookahead excludes news whose title starts with "fii";
	// those are served by the /fii.atom feed instead.
	feed, err := getFeed(bson.M{"title": bson.M{"$regex": "^((?!fii))", "$options": "i"}}, "all")
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	atom, err := feed.ToAtom()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Add("Content-Type", "application/xml")
	fmt.Fprint(w, atom)
}
func feedFIIs(w http.ResponseWriter, r *http.Request) {
	feed, err := getFeed(bson.M{"title": bson.M{"$regex": "^fii", "$options": "i"}}, "fii")
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	atom, err := feed.ToAtom()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Add("Content-Type", "application/xml")
	fmt.Fprint(w, atom)
}
// redirectNews permanently redirects /bovespa/<id> to the original news page.
func redirectNews(w http.ResponseWriter, r *http.Request) {
	var newsID string
	var news News
	parts := regexpNews.FindStringSubmatch(r.URL.Path)
	if len(parts) > 1 {
		newsID = parts[1]
	} else {
		http.Error(w, "Page not found", http.StatusNotFound)
		return
	}
	coll, err := collection()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer coll.Close()
	err = coll.FindId(newsID).One(&news)
	if err == mgo.ErrNotFound {
		http.Error(w, "News not found", http.StatusNotFound)
		return
	} else if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Add("Location", news.URL())
	w.WriteHeader(http.StatusMovedPermanently)
}
func main() {
	http.Handle("/all.atom", http.HandlerFunc(feedAll))
	http.Handle("/fii.atom", http.HandlerFunc(feedFIIs))
	http.Handle("/", http.HandlerFunc(redirectNews))
	log.Fatal(http.ListenAndServe(listenHTTP, nil))
}
// Crawler: periodically fetches the Bovespa news listing, extracts each item's
// ID, title, and date, and upserts the results into MongoDB.
package main

import (
	"bytes"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"regexp"
	"strings"
	"time"

	"github.com/globocom/tsuru/db/storage"
	"labix.org/v2/mgo"
	"launchpad.net/xmlpath"
)

// BaseURL is the paginated news listing: the first %d is the filter type and
// the second is the page number.
const BaseURL = "http://www.bmfbovespa.com.br/Agencia-Noticias/ListarNoticias.aspx?idioma=pt-br&q=&tipoFiltro=%d&pg=%d"

var (
	pathLink     = xmlpath.MustCompile(`//ul[@id="linksNoticias"]/li/a`)
	pathHrefLink = xmlpath.MustCompile("./@href")
	idRegexp     = regexp.MustCompile(`^ListarNoticias.aspx\?idioma=pt-br\&idNoticia=(\d+)\&.*$`)
	// Unescaped comparison operators in the page body break the HTML parser,
	// so these byte sequences are stripped before parsing.
	replaceLessThan       = []byte{' ', '<', ' '}
	replaceGreaterThan    = []byte{' ', '>', ' '}
	replaceLessOrEqual    = []byte{' ', '<', '=', ' '}
	replaceGreaterOrEqual = []byte{' ', '>', '=', ' '}
	tickerTimer           time.Duration
	filter                int
)

func init() {
	flag.DurationVar(&tickerTimer, "interval", 10*time.Minute, "Ticker interval")
	flag.IntVar(&filter, "filter", 0, "News filter (0 for daily, 1 for weekly)")
	flag.Parse()
}

type News struct {
	ID    string `bson:"_id"`
	Title string
	Date  time.Time
}

// collection opens the MongoDB database and returns the "news" collection,
// ensuring the indexes used by the feed queries.
func collection() (*storage.Collection, error) {
	storage, err := storage.Open("localhost:27017", "bovespa_plantao_empresas")
	if err != nil {
		return nil, err
	}
	coll := storage.Collection("news")
	coll.EnsureIndex(mgo.Index{Key: []string{"title"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"-date"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"title", "-date"}, Background: true, Sparse: true})
	return coll, nil
}
// downloadContent fetches one page of the listing, sanitizes the broken HTML,
// and parses it into an xmlpath node tree.
func downloadContent(page int) (*xmlpath.Node, error) {
	url := fmt.Sprintf(BaseURL, filter, page)
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	content = bytes.Replace(content, replaceLessThan, nil, -1)
	content = bytes.Replace(content, replaceGreaterThan, nil, -1)
	content = bytes.Replace(content, replaceLessOrEqual, nil, -1)
	content = bytes.Replace(content, replaceGreaterOrEqual, nil, -1)
	return xmlpath.ParseHTML(bytes.NewBuffer(content))
}
// saveNews upserts the collected news into the database, keyed by news ID, so
// repeated crawls never duplicate an item.
func saveNews(news []News) {
	coll, err := collection()
	if err != nil {
		log.Printf("[ERROR] Failed to save news: %s", err)
		return
	}
	defer coll.Close()
	for _, n := range news {
		_, err = coll.UpsertId(n.ID, n)
		if err != nil {
			log.Printf("[ERROR] Failed to save news: %s", err)
		}
	}
}
// collectNews extracts the ID, title, and date of every news link in the
// page. Each entry is rendered as "DD/MM/YYYY HH:MM - Title".
func collectNews(node *xmlpath.Node) []News {
	location, _ := time.LoadLocation("America/Sao_Paulo")
	var err error
	var newsList []News
	iter := pathLink.Iter(node)
	for iter.Next() {
		var news News
		target, ok := pathHrefLink.String(iter.Node())
		if !ok {
			continue
		}
		parts := idRegexp.FindStringSubmatch(target)
		if len(parts) < 2 {
			// The link carries no news ID; skip it so an empty _id is never
			// upserted.
			continue
		}
		news.ID = parts[1]
		content := strings.TrimSpace(iter.Node().String())
		parts = strings.SplitN(content, " - ", 2)
		if len(parts) < 2 {
			continue
		}
		news.Title = parts[1]
		news.Date, err = time.ParseInLocation("02/01/2006 15:04", parts[0], location)
		if err != nil {
			log.Printf("[WARNING] Wrong date for news: %s", err)
			continue
		}
		newsList = append(newsList, news)
	}
	return newsList
}
// run walks the paginated listing, saving news until it reaches a page with
// no news links.
func run() {
	for i := 0; ; i++ {
		node, err := downloadContent(i + 1)
		if err != nil {
			// A failed download leaves node nil; stop instead of parsing it.
			log.Print(err)
			break
		}
		if !pathLink.Exists(node) {
			break
		}
		newsList := collectNews(node)
		if len(newsList) > 0 {
			saveNews(newsList)
		}
	}
}
func main() {
	// time.Tick only fires after the interval elapses, so the first crawl
	// happens one full interval after startup.
	for _ = range time.Tick(tickerTimer) {
		run()
	}
}