Last active
October 3, 2015 15:03
-
-
Save cloverstd/67e5b4897f11694ff3c6 to your computer and use it in GitHub Desktop.
扇贝打卡 RSS 源
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
from __future__ import absolute_import, unicode_literals | |
import tornado.ioloop | |
import tornado.httpserver | |
from tornado.options import parse_command_line, define, options | |
import tornado.httpclient | |
import tornado.web | |
import tornado.gen | |
import logging | |
from bs4 import BeautifulSoup | |
import PyRSS2Gen | |
import datetime | |
import locale | |
# Command-line options registered with tornado.options; the parsed values are
# read back elsewhere in this file through ``options.<name>``.
define("debug", default=True, type=bool)
define("port", default=8088, type=int)
define("host", default="127.0.0.1", type=str)
class ShanbayRss(object):
    """Build an RSS 2.0 feed from a shanbay.com check-in record page.

    The page for one check-in record is fetched over HTTP, the check-in
    entries found in its HTML are extracted, and the result is rendered to
    XML with PyRSS2Gen.
    """

    BASE_URL = "http://shanbay.com"
    URL = BASE_URL + "/checkin/record/{record_id}/"
    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
    }

    def __init__(self, record_id):
        """Remember the record id and create the async HTTP client.

        :param record_id: id substituted into ``URL`` to locate the page.
        """
        self.record_id = record_id
        self.http_client = tornado.httpclient.AsyncHTTPClient()

    @tornado.gen.coroutine
    def get_latest_checkins(self):
        """Fetch the record page and return its parsed check-ins.

        :returns: (via ``tornado.gen.Return``) the dict produced by
            :meth:`parse_checkins`.
        """
        req = tornado.httpclient.HTTPRequest(
            url=self.URL.format(record_id=self.record_id),
            headers=self.DEFAULT_HEADERS,
        )
        resp = yield self.http_client.fetch(req)
        raise tornado.gen.Return(self.parse_checkins(resp.body))

    # Backward-compatible alias: the method was originally published under
    # this misspelled name, so existing callers keep working.
    get_latest_chekcins = get_latest_checkins

    @tornado.gen.coroutine
    def to_xml(self):
        """Render the latest check-ins as an RSS 2.0 document.

        :returns: (via ``tornado.gen.Return``) the XML produced by
            ``PyRSS2Gen.RSS2.to_xml`` with ``encoding="utf-8"``.
        """
        checkins = yield self.get_latest_checkins()
        items = [
            PyRSS2Gen.RSSItem(
                title="{date} 第{number}天打卡".format(
                    date=item["date"],
                    number=item["day"],
                ),
                link=item["link"],
                description=item["note"],
                guid=PyRSS2Gen.Guid(item["link"]),
            )
            for item in checkins["checkins"]
        ]
        rss = PyRSS2Gen.RSS2(
            title=checkins["title"],
            link="http://hui.lu",
            description="",
            lastBuildDate=datetime.datetime.now(),
            items=items,
        )
        raise tornado.gen.Return(rss.to_xml(encoding="utf-8"))

    def parse_checkins(self, content):
        """Extract the page title and the check-in entries from HTML.

        :param content: HTML of a check-in record page.
        :returns: ``{"title": <page title>, "checkins": [entry, ...]}`` where
            each entry is a dict with the keys ``day``, ``note``, ``date``
            and ``link``.
        :raises AttributeError: when an expected element (for example the
            ``checkins`` container) is missing, because ``find`` returned
            ``None``.
        """
        soup = BeautifulSoup(content, 'html.parser')
        entries = soup.find(attrs={"class": "checkins"}).find_all(
            attrs={"class": "checkin"})

        if entries:
            # ``%B`` below is parsed with locale-dependent month names.  The
            # locale only needs to be set once per parse, not once per entry,
            # and is left untouched when there is nothing to parse.
            # NOTE: setlocale changes the locale for the whole process.
            locale.setlocale(locale.LC_ALL, str('zh_CN.UTF-8'))

        checkins = []
        for entry in entries:
            note = entry.find(attrs={"class": "note"}).text.strip()
            target = entry.find(attrs={"class": "target"}).text.strip()
            # NOTE(review): the encode()/decode() round-trips around
            # strptime/strftime look Python 2 specific -- confirm before
            # running this under Python 3.
            date = datetime.datetime.strptime(
                target.encode('utf-8'), '%B %d, %Y')
            href = entry.find("a", attrs={"class": "target"})["href"]
            checkins.append({
                "day": entry.find(attrs={"class": "number"}).text.strip(),
                # Collapse the note by removing every whitespace run.
                "note": "".join(note.split()),
                "date": date.strftime(
                    '%Y年%m月%d天'.encode('utf-8')).decode('utf-8'),
                "link": self.BASE_URL + href,
            })

        return {
            "title": soup.title.text.strip(),
            "checkins": checkins,
        }
class MainHandler(tornado.web.RequestHandler):
    """Handler for the site root; answers with a short greeting."""

    @tornado.gen.coroutine
    def get(self):
        """Write the literal body ``hi`` to the response."""
        greeting = "hi"
        self.write(greeting)
class ShanbayRssHandler(tornado.web.RequestHandler):
    """Serve the RSS feed built from a single check-in record page."""

    @tornado.gen.coroutine
    def get(self, record_id):
        """Respond with the RSS XML for ``record_id``.

        The personal check-in page requires a login, so this takes a
        roundabout route: the page of a single check-in record also shows
        the latest check-in records, and that page is used instead.

        :param record_id: record id captured from the request URL.

        On any failure while building the feed the error is logged with its
        traceback and the response is a 404 with the body ``404``.
        """
        try:
            feed = yield ShanbayRss(record_id).to_xml()
        except Exception:
            # Top-level request boundary: every failure maps to a 404, but
            # the traceback is kept in the log so the cause is not lost.
            self.application.logger.exception(
                "Failed to build RSS feed for record %s", record_id)
            self.set_status(404)
            self.write('404')
        else:
            self.set_header("Content-Type", "text/xml;charset=UTF-8")
            self.write(feed)
class Application(tornado.web.Application):
    """Tornado application that maps the URL routes to their handlers."""

    def __init__(self):
        """Set up the logger, the routes and the ``debug`` setting."""
        self.logger = logging.getLogger('application')
        routes = [
            (r'/shanbay2rss/(\d+)', ShanbayRssHandler),
            (r'/', MainHandler),
        ]
        # ``debug`` is taken from the tornado options at construction time.
        super(Application, self).__init__(routes, debug=options.debug)
def main():
    """Parse the command line, start the HTTP server and run the IO loop."""
    # Parse before constructing the application: Application.__init__ reads
    # options.debug, so building it first would ignore --debug given on the
    # command line and always use the default.
    parse_command_line()
    http_server = tornado.httpserver.HTTPServer(Application())
    http_server.listen(options.port, address=options.host)
    logger = logging.getLogger('application')
    logger.info("Server listen on http://%s:%s", options.host, options.port)
    tornado.ioloop.IOLoop.current().start()
# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment