// Feed server: exposes the collected Bovespa "Plantão Empresas" news as Atom
// feeds and redirects individual news URLs to the original Bovespa pages.
package main

import (
	"flag"
	"fmt"
	"log"
	"net/http"
	"regexp"
	"time"

	"github.com/globocom/tsuru/db/storage"
	"github.com/gorilla/feeds"
	"labix.org/v2/mgo"
	"labix.org/v2/mgo/bson"
)

const (
	// NewsURL is the template for the original news page on the Bovespa site.
	NewsURL = "http://www.bmfbovespa.com.br/agencia/corpo.asp?origem=exibir&id=%s"
	// Limit caps the number of items returned in each feed.
	Limit = 100
)

var (
	listenHTTP string
	regexpNews = regexp.MustCompile(`^/bovespa/(\d+)$`)
)

func init() {
	flag.StringVar(&listenHTTP, "listen", "127.0.0.1:7676", "address to listen to connections")
	flag.Parse()
}

type News struct {
	ID    string `bson:"_id"`
	Title string
	Date  time.Time
}

// URL returns the address of the original news page on the Bovespa site.
func (n *News) URL() string {
	return fmt.Sprintf(NewsURL, n.ID)
}

// collection opens the MongoDB database and returns the "news" collection,
// ensuring the indexes used by the feed queries.
func collection() (*storage.Collection, error) {
	storage, err := storage.Open("localhost:27017", "bovespa_plantao_empresas")
	if err != nil {
		return nil, err
	}
	coll := storage.Collection("news")
	coll.EnsureIndex(mgo.Index{Key: []string{"title"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"-date"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"title", "-date"}, Background: true, Sparse: true})
	return coll, nil
}
// getFeed builds an Atom feed with the most recent news matching the given
// query, sorted by date descending.
func getFeed(query bson.M, id string) (*feeds.Feed, error) {
	coll, err := collection()
	if err != nil {
		return nil, err
	}
	defer coll.Close()
	var newsList []News
	err = coll.Find(query).Sort("-date").Limit(Limit).All(&newsList)
	if err != nil {
		return nil, err
	}
	location, _ := time.LoadLocation("America/Sao_Paulo")
	updated := time.Now()
	if len(newsList) > 0 {
		updated = newsList[0].Date.In(location)
	}
	feed := &feeds.Feed{
		Title:       "Bovespa - Plantão Empresas",
		Link:        &feeds.Link{Href: "http://www.bmfbovespa.com.br/Agencia-Noticias/ListarNoticias.aspx?w=" + id},
		Description: "Notícias sobre empresas listadas na Bovespa",
		Author:      &feeds.Author{Name: "Francisco Souza", Email: "f@souza.cc"},
		Created:     time.Date(2014, 3, 20, 10, 0, 0, 0, location),
		Updated:     updated,
	}
	for _, news := range newsList {
		item := feeds.Item{
			Id:          "http://plantao.souza.cc/bovespa/" + news.ID,
			Title:       news.Title,
			Link:        &feeds.Link{Href: "http://plantao.souza.cc/bovespa/" + news.ID},
			Description: news.Title,
			Author:      &feeds.Author{Name: "Bovespa", Email: "bovespa@bmfbovespa.com.br"},
			Created:     news.Date,
			Updated:     news.Date,
		}
		feed.Items = append(feed.Items, &item)
	}
	return feed, nil
}
func feedAll(w http.ResponseWriter, r *http.Request) {
	// The negative lookahead excludes news whose title starts with "fii";
	// those are served by the /fii.atom feed instead.
	feed, err := getFeed(bson.M{"title": bson.M{"$regex": "^((?!fii))", "$options": "i"}}, "all")
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	atom, err := feed.ToAtom()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Add("Content-Type", "application/xml")
	fmt.Fprint(w, atom)
}
func feedFIIs(w http.ResponseWriter, r *http.Request) {
	feed, err := getFeed(bson.M{"title": bson.M{"$regex": "^fii", "$options": "i"}}, "fii")
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	atom, err := feed.ToAtom()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Add("Content-Type", "application/xml")
	fmt.Fprint(w, atom)
}
// redirectNews permanently redirects /bovespa/<id> to the original news page.
func redirectNews(w http.ResponseWriter, r *http.Request) {
	var newsID string
	var news News
	parts := regexpNews.FindStringSubmatch(r.URL.Path)
	if len(parts) > 1 {
		newsID = parts[1]
	} else {
		http.Error(w, "Page not found", http.StatusNotFound)
		return
	}
	coll, err := collection()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer coll.Close()
	err = coll.FindId(newsID).One(&news)
	if err == mgo.ErrNotFound {
		http.Error(w, "News not found", http.StatusNotFound)
		return
	} else if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Header().Add("Location", news.URL())
	w.WriteHeader(http.StatusMovedPermanently)
}
func main() {
	http.Handle("/all.atom", http.HandlerFunc(feedAll))
	http.Handle("/fii.atom", http.HandlerFunc(feedFIIs))
	http.Handle("/", http.HandlerFunc(redirectNews))
	log.Fatal(http.ListenAndServe(listenHTTP, nil))
}
// Crawler: periodically fetches the Bovespa news listing, extracts each item's
// ID, title, and date, and upserts the results into MongoDB.
package main

import (
	"bytes"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"regexp"
	"strings"
	"time"

	"github.com/globocom/tsuru/db/storage"
	"labix.org/v2/mgo"
	"launchpad.net/xmlpath"
)

// BaseURL is the paginated news listing: the first %d is the filter type and
// the second is the page number.
const BaseURL = "http://www.bmfbovespa.com.br/Agencia-Noticias/ListarNoticias.aspx?idioma=pt-br&q=&tipoFiltro=%d&pg=%d"

var (
	pathLink     = xmlpath.MustCompile(`//ul[@id="linksNoticias"]/li/a`)
	pathHrefLink = xmlpath.MustCompile("./@href")
	idRegexp     = regexp.MustCompile(`^ListarNoticias.aspx\?idioma=pt-br\&idNoticia=(\d+)\&.*$`)
	// Unescaped comparison operators in the page body break the HTML parser,
	// so these byte sequences are stripped before parsing.
	replaceLessThan       = []byte{' ', '<', ' '}
	replaceGreaterThan    = []byte{' ', '>', ' '}
	replaceLessOrEqual    = []byte{' ', '<', '=', ' '}
	replaceGreaterOrEqual = []byte{' ', '>', '=', ' '}
	tickerTimer           time.Duration
	filter                int
)

func init() {
	flag.DurationVar(&tickerTimer, "interval", 10*time.Minute, "Ticker interval")
	flag.IntVar(&filter, "filter", 0, "News filter (0 for daily, 1 for weekly)")
	flag.Parse()
}

type News struct {
	ID    string `bson:"_id"`
	Title string
	Date  time.Time
}

// collection opens the MongoDB database and returns the "news" collection,
// ensuring the indexes used by the feed queries.
func collection() (*storage.Collection, error) {
	storage, err := storage.Open("localhost:27017", "bovespa_plantao_empresas")
	if err != nil {
		return nil, err
	}
	coll := storage.Collection("news")
	coll.EnsureIndex(mgo.Index{Key: []string{"title"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"-date"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"title", "-date"}, Background: true, Sparse: true})
	return coll, nil
}
// downloadContent fetches one page of the listing, sanitizes the broken HTML,
// and parses it into an xmlpath node tree.
func downloadContent(page int) (*xmlpath.Node, error) {
	url := fmt.Sprintf(BaseURL, filter, page)
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	content = bytes.Replace(content, replaceLessThan, nil, -1)
	content = bytes.Replace(content, replaceGreaterThan, nil, -1)
	content = bytes.Replace(content, replaceLessOrEqual, nil, -1)
	content = bytes.Replace(content, replaceGreaterOrEqual, nil, -1)
	return xmlpath.ParseHTML(bytes.NewBuffer(content))
}
// saveNews upserts the collected news into the database, keyed by news ID, so
// repeated crawls never duplicate an item.
func saveNews(news []News) {
	coll, err := collection()
	if err != nil {
		log.Printf("[ERROR] Failed to save news: %s", err)
		return
	}
	defer coll.Close()
	for _, n := range news {
		_, err = coll.UpsertId(n.ID, n)
		if err != nil {
			log.Printf("[ERROR] Failed to save news: %s", err)
		}
	}
}
// collectNews extracts the ID, title, and date of every news link in the
// page. Each entry is rendered as "DD/MM/YYYY HH:MM - Title".
func collectNews(node *xmlpath.Node) []News {
	location, _ := time.LoadLocation("America/Sao_Paulo")
	var err error
	var newsList []News
	iter := pathLink.Iter(node)
	for iter.Next() {
		var news News
		target, ok := pathHrefLink.String(iter.Node())
		if !ok {
			continue
		}
		parts := idRegexp.FindStringSubmatch(target)
		if len(parts) < 2 {
			// The link carries no news ID; skip it so an empty _id is never
			// upserted.
			continue
		}
		news.ID = parts[1]
		content := strings.TrimSpace(iter.Node().String())
		parts = strings.SplitN(content, " - ", 2)
		if len(parts) < 2 {
			continue
		}
		news.Title = parts[1]
		news.Date, err = time.ParseInLocation("02/01/2006 15:04", parts[0], location)
		if err != nil {
			log.Printf("[WARNING] Wrong date for news: %s", err)
			continue
		}
		newsList = append(newsList, news)
	}
	return newsList
}
// run walks the paginated listing, saving news until it reaches a page with
// no news links.
func run() {
	for i := 0; ; i++ {
		node, err := downloadContent(i + 1)
		if err != nil {
			// A failed download leaves node nil; stop instead of parsing it.
			log.Print(err)
			break
		}
		if !pathLink.Exists(node) {
			break
		}
		newsList := collectNews(node)
		if len(newsList) > 0 {
			saveNews(newsList)
		}
	}
}
func main() {
	// time.Tick only fires after the interval elapses, so the first crawl
	// happens one full interval after startup.
	for _ = range time.Tick(tickerTimer) {
		run()
	}
}