Created
February 11, 2015 13:59
-
-
Save cryptowen/826145abf1f03702417f to your computer and use it in GitHub Desktop.
使用calibre获取乌云知识库<http://drops.wooyun.org/>文章用的recipe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# encoding: utf-8
'''
本代码为使用calibre获取 乌云知识库<http://drops.wooyun.org/> 文章用的recipe,
可用来参考获取其他网站或博客电子书。
使用方法参见:
http://blog.csdn.net/yelyyely/article/details/43741739
## TODO
1. 使用calibre内置的soup无法解析得到text节点,故此代码中start_page和end_page为硬编码,需要根据实际情况进行更改;
2. 同理得到文章标题用的是 href['title'][18:], 比较粗糙,有待改善;
3. parse_index解析目录页总共40多页是单线程同步进行的,速度较慢,可以考虑改成多线程加快速度;
'''
from calibre.web.feeds.recipes import BasicNewsRecipe | |
class wooyun(BasicNewsRecipe):
    """calibre recipe that builds an e-book from <http://drops.wooyun.org/>.

    ``parse_index`` walks the paginated index under ``INDEX`` and gathers
    every article link it finds into a single feed.
    """

    title = u'乌云知识库'
    __author__ = u'无关风月'
    description = u'''乌云知识库,最专业的安全知识分享平台。本电子书由无关风月整理网站 <http://drops.wooyun.org/> 内容而来。'''
    timefmt = '[%Y-%m-%d]'
    no_stylesheets = True
    INDEX = 'http://drops.wooyun.org/'
    # NOTE: if the article structure has not been analysed by hand, setting
    # ``auto_cleanup = True`` lets calibre clean up the article body instead.
    language = 'zh-CN'
    # Keep only the content of the article's "post" element; this range was
    # determined by inspecting the site's pages manually.
    keep_only_tags = [{'class': ['post']}]
    # The default limit is 100 articles; raised so the download is complete.
    max_articles_per_feed = 10000

    # First and last index page to fetch (inclusive). These are hard-coded
    # because the pager text could not be extracted with calibre's bundled
    # soup; adjust END_PAGE to the site's current page count.
    START_PAGE = 1
    END_PAGE = 47

    # Number of leading characters dropped from each link's ``title``
    # attribute to obtain the article title.
    # NOTE(review): presumably the length of a fixed prefix such as
    # "Permanent Link to " emitted by the site's theme -- confirm.
    TITLE_PREFIX_LEN = 18

    def parse_index(self):
        """Collect the article list from every index page.

        Returns:
            A one-element list ``[(feed_title, articles)]``. Each article is
            a dict with ``'title'`` and ``'url'`` keys. The collected list is
            reversed before returning (the original author's intent: oldest
            article first).
        """
        # INDEX ends with '/', so strip it to avoid building '...org//page/N'.
        base_url = self.INDEX.rstrip('/')
        articles = []
        for page_no in range(self.START_PAGE, self.END_PAGE + 1):
            soup_page = self.index_to_soup('%s/page/%d' % (base_url, page_no))
            # Each 'entry-title' element wraps the link to one article.
            for heading in soup_page.findAll(attrs={'class': 'entry-title'}):
                link = heading.a
                if link is None:
                    # No anchor inside this heading: nothing to download.
                    continue
                articles.append({
                    'title': link['title'][self.TITLE_PREFIX_LEN:],
                    'url': link['href'],
                })
            # Use the recipe logger rather than a Python 2 print statement.
            self.log('page %d done' % page_no)
        # Reverse so the articles run in chronological order.
        articles.reverse()
        # For debugging, self.abort_recipe_processing('test') can be called
        # here to stop e-book generation early.
        return [(u'乌云知识库', articles)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment