Created
April 21, 2016 06:25
-
-
Save bootell/df3b797905947c4679a8f071d93c2c4a to your computer and use it in GitHub Desktop.
tuicool magazines for kindleEar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding:utf-8 -*- | |
import re | |
from bs4 import BeautifulSoup | |
from base import BaseFeedBook, URLOpener, feedparser, string_of_tag | |
def getBook():
    """Return the book class defined by this module."""
    book_class = tuicoolMags
    return book_class
class tuicoolMags(BaseFeedBook):
    """KindleEar book for the weekly programming magazine on tuicool.com."""

    title = u'编程狂人'
    description = u'《编程狂人》周刊是推酷网专为IT人打造的行业技术周刊。'
    language = 'zh-cn'
    feed_encoding = "utf-8"
    page_encoding = "utf-8"
    mastheadfile = "mh_tuicoolMags.gif"
    coverfile = 'cv_tuicoolMags.jpg'
    deliver_days = ['Friday']

    def ParseFeedUrls(self):
        """Collect the article links of the latest magazine issue.

        The URL of the newest issue is taken from the first entry of the RSS
        feed. That page is fetched and split into sections, each running from
        a ``<strong>`` heading to the next closing ``</ol>``. Every link found
        under an element with class ``article-title`` is collected once.

        Returns:
            A list of ``(section_title, article_title, url, None)`` tuples in
            page order, with duplicate URLs dropped. An empty list is
            returned when the feed has no usable entry or the issue page does
            not answer with HTTP 200.
        """
        # Get the URL of the latest issue from the RSS feed.
        rssurl = 'http://www.tuicool.com/mags/rss_programming.rss'
        feedresult = feedparser.parse(rssurl)
        entries = getattr(feedresult, 'entries', None)
        if not entries:
            self.log.warn('fetch web failed:%s' % rssurl)
            return []
        mainurl = entries[0].get('link')
        if not mainurl:
            self.log.warn('fetch web failed:%s' % rssurl)
            return []

        # Fetch the content of the latest issue.
        opener = URLOpener(self.host, timeout=30)
        result = opener.open(mainurl)
        if result.status_code != 200:
            self.log.warn('fetch web failed:%s' % mainurl)
            return []
        content = result.content.decode(self.feed_encoding)

        urls = []
        urladded = set()  # URLs already emitted, to drop duplicates
        section_pattern = re.compile(r"<strong>[\s\S]*?</ol>")
        for section in section_pattern.findall(content):
            # ``section`` is a plain string; all tag lookups must go through
            # the parsed tree, not through str methods.
            soup = BeautifulSoup(section, "lxml")
            strong = soup.find('strong')
            if strong is None:
                self.log.warn('strong is empty')
                continue
            sectitle = string_of_tag(strong).strip()
            if not sectitle:
                self.log.warn('strong string is empty')
                continue

            for node in soup.find_all(class_='article-title'):
                a = node.find('a', attrs={"href": True})
                if not a:
                    continue
                url = a['href']
                title = string_of_tag(a)
                if not title or url in urladded:
                    continue
                urls.append((sectitle, title, url, None))
                urladded.add(url)

        if not urls:
            self.log.warn('len of urls is zero.')
        return urls
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment