bootell/tuicoolMags.py

## tuicoolMags.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import re
from bs4 import BeautifulSoup
from base import BaseFeedBook, URLOpener, feedparser, string_of_tag

def getBook():
    """Return the feed-book class defined in this module (the class itself, not an instance)."""
    return tuicoolMags

class tuicoolMags(BaseFeedBook):
    """Feed book that gathers the article links of the latest tuicool.com
    programming-magazine issue.

    The newest issue is located through the magazine's RSS feed; its page is
    then split into sections and the article links of every section are
    collected.
    """
    # Book metadata; presumably consumed by BaseFeedBook -- confirm against
    # the base module, which is not visible from this file.
    title = u'编程狂人'
    description = u'《编程狂人》周刊是推酷网专为IT人打造的行业技术周刊。'
    language = 'zh-cn'
    feed_encoding = "utf-8"
    page_encoding = "utf-8"
    mastheadfile = "mh_tuicoolMags.gif"
    coverfile = 'cv_tuicoolMags.jpg'
    deliver_days = ['Friday']

    def ParseFeedUrls(self):
        """Collect the article links of the latest magazine issue.

        Returns:
            A list of ``(section title, article title, url, None)`` tuples,
            one per distinct article URL, in the order they appear on the
            issue page. The list is empty when the RSS feed yields no
            entries, when the issue page does not answer with HTTP 200, or
            when no article link is found.
        """
        # Get the address of the latest issue from the RSS feed.
        rssurl = 'http://www.tuicool.com/mags/rss_programming.rss'
        feedresult = feedparser.parse(rssurl)
        entries = getattr(feedresult, 'entries', None)
        if not entries:
            # Covers both a failed parse and a feed without any entry, so
            # the entries[0] lookup below cannot raise IndexError.
            self.log.warn('fetch web failed:%s' % rssurl)
            return []
        mainurl = entries[0]['link']

        # Fetch the content of the latest issue.
        opener = URLOpener(self.host, timeout=30)
        result = opener.open(mainurl)
        if result.status_code != 200:
            self.log.warn('fetch web failed:%s' % mainurl)
            return []
        content = result.content.decode(self.feed_encoding)

        urls = []
        urladded = set()  # URLs already emitted; used to skip duplicates.
        # A section runs from its <strong> heading up to the end of the <ol>
        # that follows it.
        section_pattern = re.compile(r"<strong>[\s\S]*?</ol>")
        for section in section_pattern.findall(content):
            # findall() yields plain strings, so every lookup has to go
            # through the parsed soup rather than the raw section text.
            soup = BeautifulSoup(section, "lxml")

            strong = soup.find('strong')
            if strong is None:
                self.log.warn('strong is empty')
                continue
            # The `or ''` guards against string_of_tag returning None;
            # the helper is defined elsewhere -- confirm its contract.
            sectitle = (string_of_tag(strong) or '').strip()
            if not sectitle:
                self.log.warn('strong string is empty')
                continue

            for node in soup.find_all(class_='article-title'):
                a = node.find('a', attrs={"href": True})
                if a is None:
                    continue
                url = a['href']
                title = string_of_tag(a)
                if not title or url in urladded:
                    continue
                urls.append((sectitle, title, url, None))
                urladded.add(url)

        if not urls:
            self.log.warn('len of urls is zero.')
        return urls
	#!/usr/bin/env python
	# -*- coding:utf-8 -*-

	import re
	from bs4 import BeautifulSoup
	from base import BaseFeedBook, URLOpener, feedparser, string_of_tag

	def getBook():
	    """Return the feed-book class defined in this module (the class itself, not an instance)."""
	    return tuicoolMags

	class tuicoolMags(BaseFeedBook):
	    """Feed book that gathers the article links of the latest tuicool.com
	    programming-magazine issue.

	    The newest issue is located through the magazine's RSS feed; its page is
	    then split into sections and the article links of every section are
	    collected.
	    """
	    # Book metadata; presumably consumed by BaseFeedBook -- confirm against
	    # the base module, which is not visible from this file.
	    title = u'编程狂人'
	    description = u'《编程狂人》周刊是推酷网专为IT人打造的行业技术周刊。'
	    language = 'zh-cn'
	    feed_encoding = "utf-8"
	    page_encoding = "utf-8"
	    mastheadfile = "mh_tuicoolMags.gif"
	    coverfile = 'cv_tuicoolMags.jpg'
	    deliver_days = ['Friday']

	    def ParseFeedUrls(self):
	        """Collect the article links of the latest magazine issue.

	        Returns:
	            A list of ``(section title, article title, url, None)`` tuples,
	            one per distinct article URL, in the order they appear on the
	            issue page. The list is empty when the RSS feed yields no
	            entries, when the issue page does not answer with HTTP 200, or
	            when no article link is found.
	        """
	        # Get the address of the latest issue from the RSS feed.
	        rssurl = 'http://www.tuicool.com/mags/rss_programming.rss'
	        feedresult = feedparser.parse(rssurl)
	        entries = getattr(feedresult, 'entries', None)
	        if not entries:
	            # Covers both a failed parse and a feed without any entry, so
	            # the entries[0] lookup below cannot raise IndexError.
	            self.log.warn('fetch web failed:%s' % rssurl)
	            return []
	        mainurl = entries[0]['link']

	        # Fetch the content of the latest issue.
	        opener = URLOpener(self.host, timeout=30)
	        result = opener.open(mainurl)
	        if result.status_code != 200:
	            self.log.warn('fetch web failed:%s' % mainurl)
	            return []
	        content = result.content.decode(self.feed_encoding)

	        urls = []
	        urladded = set()  # URLs already emitted; used to skip duplicates.
	        # A section runs from its <strong> heading up to the end of the <ol>
	        # that follows it.
	        section_pattern = re.compile(r"<strong>[\s\S]*?</ol>")
	        for section in section_pattern.findall(content):
	            # findall() yields plain strings, so every lookup has to go
	            # through the parsed soup rather than the raw section text.
	            soup = BeautifulSoup(section, "lxml")

	            strong = soup.find('strong')
	            if strong is None:
	                self.log.warn('strong is empty')
	                continue
	            # The `or ''` guards against string_of_tag returning None;
	            # the helper is defined elsewhere -- confirm its contract.
	            sectitle = (string_of_tag(strong) or '').strip()
	            if not sectitle:
	                self.log.warn('strong string is empty')
	                continue

	            for node in soup.find_all(class_='article-title'):
	                a = node.find('a', attrs={"href": True})
	                if a is None:
	                    continue
	                url = a['href']
	                title = string_of_tag(a)
	                if not title or url in urladded:
	                    continue
	                urls.append((sectitle, title, url, None))
	                urladded.add(url)

	        if not urls:
	            self.log.warn('len of urls is zero.')
	        return urls