@crosbymichael
Last active December 23, 2015
#!/usr/bin/env python
import json
import os
import sys
from datetime import datetime as dt
from hashlib import md5

import rethinkdb as r
import tornado.ioloop
import tornado.web
from redis import Redis
from rq import Queue
from werkzeug.contrib.atom import AtomFeed

from rss import parse_feed

# Filled in from the environment by web()
url = ''
feed_url = ''
title = ''
def get_conns():
    """Open Redis and RethinkDB connections from environment settings."""
    redis = Redis(host=os.environ['REDIS_IP'], port=int(os.environ['REDIS_PORT']))
    conn = r.connect(os.environ['RETHINK_IP'], int(os.environ['RETHINK_PORT']))
    return conn, redis
def cron():
    conn, redis = get_conns()
    q = Queue(connection=redis)
    table = r.db('rss').table('feed')

    # Get all the feeds and push them on the queue to be updated
    for feed in table.run(conn):
        q.enqueue(parse_feed, feed['id'], feed.get('etag', None), feed.get('modified', None))
def web():
    global url, title, feed_url
    feed_url = os.environ['FEED_URL']
    url = os.environ['URL']
    title = os.environ['TITLE']

    conn, redis = get_conns()
    application = tornado.web.Application([
        (r"/", AtomHandler, {'conn': conn, 'redis_conn': redis}),
        (r"/info", InfoHandler, {'conn': conn, 'redis_conn': redis}),
    ])
    application.listen(8888)
    tornado.ioloop.IOLoop.instance().start()
def create_feeds(conn, url, feed_url):
    """Build an Atom document from the 25 most recent entries."""
    result = AtomFeed(title, feed_url=feed_url, url=url)
    # Entry documents store their timestamp under 'updated' (see get_entry
    # in the rss module below), so that is the field to sort on.
    entries = r.db('rss').table('entries').order_by(r.desc('updated')).limit(25).run(conn)
    for e in entries:
        result.add(e['title'], e['body'], content_type='html',
                   author=e['author'], url=e['id'], id=e['uid'],
                   updated=dt.fromtimestamp(e['updated']))
    return result.to_string()
class BaseHandler(tornado.web.RequestHandler):
    def initialize(self, conn, redis_conn):
        self.conn = conn
        self.redis = redis_conn
class AtomHandler(BaseHandler):
    def get(self):
        # Serve the cached feed from Redis; rebuild it on a cache miss.
        data = self.redis.hgetall('feed')
        if not data:
            data = {}
            data['blob'] = create_feeds(self.conn, url, feed_url)
            data['modified'] = dt.now()

            m = md5()
            m.update(str(data['modified']))
            data['etag'] = m.hexdigest()
            self.redis.hmset('feed', data)

        self.set_header('Content-Type', 'application/xml')
        self.set_header('ETag', data['etag'])
        self.set_header('Last-Modified', data['modified'])
        self.write(data['blob'])
class InfoHandler(BaseHandler):
    def get(self):
        data = {}
        db = r.db('rss')
        data['entry_count'] = db.table('entries').count().run(self.conn)
        data['feeds'] = [f['id'] for f in db.table('feed').with_fields('id').run(self.conn)]
        self.set_header('Content-Type', 'application/json')
        self.write(json.dumps(data))
def main(args):
    cmd = args[-1]
    if cmd == 'cron':
        cron()
    elif cmd == 'web':
        web()
    else:
        sys.stderr.write('Unknown command: %s\n' % cmd)
        sys.exit(1)


if __name__ == '__main__':
    main(sys.argv)
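main() dispatches on the last command-line argument: 'web' serves the Atom feed on port 8888, and 'cron' pushes one parse_feed job per subscribed feed onto an RQ queue. Nothing in the gist starts the RQ worker that executes those jobs; below is a minimal sketch of one, assuming rq's stock Connection/Worker API, the same REDIS_* environment variables, and rss.py on the import path (this file is not part of the gist):

# worker.py (hypothetical companion): drain the default RQ queue that cron() fills.
import os

from redis import Redis
from rq import Connection, Queue, Worker

redis = Redis(host=os.environ['REDIS_IP'], port=int(os.environ['REDIS_PORT']))
with Connection(redis):
    # Jobs are rehydrated by dotted path, so parse_feed must be importable
    # here exactly as it was enqueued (rss.parse_feed).
    Worker([Queue()]).work()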
# rss.py: the module imported above with "from rss import parse_feed"
import calendar
import os
import sys
from datetime import datetime as dt
from hashlib import md5
from time import mktime

import feedparser
import rethinkdb as r
from redis import Redis
def parse_feed(url, etag=None, modified=None):
    """Parse a feed url and save its entries to the database."""
    feeds = feedparser.parse(url, etag=etag, modified=modified)
    if feeds and feeds.get('status', -1) in (200, 301):
        conn = r.connect(os.environ['RETHINK_IP'], int(os.environ['RETHINK_PORT']))
        redis = Redis(host=os.environ['REDIS_IP'], port=int(os.environ['REDIS_PORT']))
        entries = r.db('rss').table('entries')
        meta = r.db('rss').table('feed')

        data = [get_entry(feeds.feed.title, entry) for entry in feeds.entries]
        entries.insert(data).run(conn)
        meta.get(url).update({'etag': feeds.get('etag', None),
                              'modified': unix_time(dt.now())}).run(conn)

        # Remove the hash to reset the cache
        redis.delete('feed')
    else:
        # Should do logging here; sentry to the rescue
        print feeds.get('status', 'no status')
        print len(feeds.entries)
def get_entry(title, feed):
    # The entry link doubles as the primary key; uid is its md5.
    m = md5()
    m.update(feed.link)

    t = feed.get('updated_parsed', None)
    if t is not None:
        t = dt.fromtimestamp(mktime(t))
    else:
        t = dt.now()

    return {
        'id': feed.link,
        'uid': m.hexdigest(),
        'author': feed.get('author', title),
        'feed': title,
        'title': feed.title,
        'updated': unix_time(t),
        'body': feed.get('summary_detail', {}).get('value', ''),
    }
def unix_time(d):
    return calendar.timegm(d.utctimetuple())


if __name__ == '__main__':
    parse_feed(sys.argv[-1])
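Neither file creates the RethinkDB schema it queries: both assume an 'rss' database with 'feed' and 'entries' tables, where each feed document's primary key ('id') is the feed URL itself. A one-time setup sketch under those assumptions (this file is also not part of the gist):

# setup.py (hypothetical): create the schema the scripts above expect.
import os
import rethinkdb as r

conn = r.connect(os.environ['RETHINK_IP'], int(os.environ['RETHINK_PORT']))
r.db_create('rss').run(conn)
r.db('rss').table_create('feed').run(conn)
r.db('rss').table_create('entries').run(conn)

# Subscribing to a feed is just inserting a document whose id is the url,
# matching meta.get(url) in parse_feed and feed['id'] in cron().
r.db('rss').table('feed').insert({'id': 'http://example.com/atom.xml'}).run(conn)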