Skip to content

Instantly share code, notes, and snippets.

@qrtt1
Created October 31, 2013 16:36
Show Gist options
  • Save qrtt1/7252831 to your computer and use it in GitHub Desktop.
Save qrtt1/7252831 to your computer and use it in GitHub Desktop.
import calendar
import sys
from datetime import datetime
from email.utils import formatdate

from BeautifulSoup import BeautifulSoup as Soup
from mechanize import Browser
from soupselect import select
# Python 2-only workaround: make UTF-8 the interpreter's default codec for
# implicit str <-> unicode conversions (presumably so the non-ASCII page text
# can be mixed into the byte-string templates and printed -- confirm).
# reload(sys) is needed first because sys.setdefaultencoding is removed from
# the module during interpreter start-up.
reload(sys)
sys.setdefaultencoding("utf-8")
# Root under which both the notification index and the individual notice
# pages live.
base_url = "http://www.hinet.net/pu"
url = "%s/notify.htm" % base_url

# strftime layout for RFC 822-style timestamps. Note that %Z expands to an
# empty string when the datetime being formatted is naive.
DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"

# Download the index page once and parse it; the same browser object is
# reused later to fetch each notice page.
browser = Browser()
resp = browser.open(url)
content = resp.read()
soup = Soup(content)
# %-style templates for the three parts of the RSS 2.0 document. Each one
# starts and ends with a newline; the header's surrounding whitespace is
# stripped by the code that prints it so the XML declaration comes first.

# Channel preamble. RSS_DATE fills both <pubDate> and <lastBuildDate>.
rss_header = (
    "\n"
    '<?xml version="1.0"?>\n'
    '<rss version="2.0">\n'
    "<channel>\n"
    "<title>Hinet unofficial notification rss</title>\n"
    "<link>http://www.hinet.net/pu/notify.htm</link>\n"
    "<description>hinet rss, it just works</description>\n"
    "<language>zh-tw</language>\n"
    "<pubDate>%(RSS_DATE)s</pubDate>\n"
    "<lastBuildDate>%(RSS_DATE)s</lastBuildDate>\n"
    "<docs>http://foo.bar.com</docs>\n"
    "<generator>Weblog Editor 2.0</generator>\n"
)

# One feed entry. DESCRIPTION is embedded verbatim inside a CDATA section.
rss_item = (
    "\n"
    "<item>\n"
    "<title>%(TITLE)s</title>\n"
    "<link>%(LINK)s</link>\n"
    "<pubDate>%(DATE)s</pubDate>\n"
    "<description><![CDATA[%(DESCRIPTION)s]]></description>\n"
    "</item>\n"
)

# Closes the elements opened by rss_header.
rss_footer = (
    "\n"
    "</channel>\n"
    "</rss>\n"
)
# Write the feed to stdout: channel header, one <item> per notice linked from
# the index page, then the closing footer.
#
# Dates are produced with email.utils.formatdate(usegmt=True) rather than
# strftime: RSS 2.0 expects RFC 822 dates, and strftime's %Z expands to an
# empty string for naive datetimes, which left every date without a zone.
built_date = formatdate(usegmt=True)

# The single-argument print(...) form is also a valid Python 2 print
# statement, so the output is unchanged.
print((rss_header % {'RSS_DATE': built_date}).strip())

for link in select(soup, "a"):
    # Anchors without an href attribute must not abort the feed with KeyError.
    href = link.get('href')
    if not href or not href.startswith('notify/'):
        continue

    # The notice date is the second "_"-separated field of the link,
    # written as YYYYMMDD.
    try:
        notice_day = datetime.strptime(href.split("_")[1], "%Y%m%d")
    except (IndexError, ValueError):
        # The header is already on stdout; skipping (with a warning) keeps
        # the document well-formed instead of dying halfway through.
        sys.stderr.write("skipping %s: no YYYYMMDD date in link\n" % href)
        continue

    item_title = link.text
    item_link = base_url + "/" + href
    # Only the day is known, so the item is stamped 00:00:00 GMT of that day.
    item_pubdate = formatdate(calendar.timegm(notice_day.timetuple()),
                              usegmt=True)

    # The notice body is the first element with class "content"; a page
    # without one yields an empty description rather than an IndexError.
    bodies = select(Soup(browser.open(item_link).read()), ".content")
    item_description = str(bodies[0]) if bodies else ""
    # A literal "]]>" would close the CDATA section early; split it across
    # two adjacent CDATA sections so the text survives intact.
    item_description = item_description.replace("]]>", "]]]]><![CDATA[>")

    print(rss_item % {
        'TITLE': item_title,
        'LINK': item_link,
        'DATE': item_pubdate,
        'DESCRIPTION': item_description,
    })

print(rss_footer)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment