Calibre recipe to create books from the Pact web serial: http://pactwebserial.wordpress.com/
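# Calibre news recipe that turns the Pact web serial into an e-book.
# Typical use (standard calibre tooling; the file name is your choice):
# save this as pact.recipe, then either add it in calibre via
# 'Fetch news -> Add a custom news source', or run:
#     ebook-convert pact.recipe pact.epub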
import time
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
from hashlib import md5
class PactRecipe(BasicNewsRecipe):
    title = u'Pact'
    #oldest_article = 30
    #max_articles_per_feed = 100
    # Effectively unlimited, so the whole serial archive is fetched.
    oldest_article = 10000
    max_articles_per_feed = 10000
    auto_cleanup = True
    feeds = [(u'Posts', u'https://pactwebserial.wordpress.com/feed/')]
    extra_css = 'p { margin-top: 0; margin-bottom: 0; text-indent: 1.5em } p > strong { margin-top: 1em; }'
    # Matches the labels of the inter-chapter navigation links.
    chapter_regex = re.compile(r'.*?(Last Chapter|Next Chapter).*')
    # RSS lists newest posts first; reverse to get reading order.
    reverse_article_order = True
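
    # Chapter pages link to each other with 'Last Chapter' / 'Next Chapter'
    # anchors (assumed markup: <p><a href="...">Next Chapter</a></p>).
    # preprocess_html removes them so the navigation noise does not end up
    # in the book.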
    def preprocess_html(self, soup):
        for alink in soup.findAll('a'):
            # Guard against anchors with no contents before indexing.
            if alink.contents and len(alink.contents[0]) > 0:
                if self.chapter_regex.match(repr(alink.contents)):
                    if alink.parent.name == 'p':
                        # The link sits in its own paragraph: drop the paragraph.
                        alink.parent.extract()
                    else:
                        alink.extract()
        return soup
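
    # Helper that wraps a single article in its own one-article Feed.
    # Not referenced elsewhere in this recipe.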
    def create_feed_from_article(self, article, index):
        feed = Feed()
        feed.title = article.title
        feed.id_counter = index
        feed.articles = [article]
        return feed
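
    # Run the stock feed parsing, drop chapters that were already downloaded
    # on a previous run, then rename the feed (and the book) after the
    # chapters it actually contains.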
    def parse_feeds(self):
        # Do the "official" parse_feeds first, then filter out doubles.
        feeds = self.remove_and_save_doubles(BasicNewsRecipe.parse_feeds(self))
        for curfeed in feeds:
            if len(curfeed.articles) == 0:
                self.abort_recipe_processing("No new Chapters")
                break
            # Build a combined title from the new chapters, oldest first.
            newtitle = 'Pact: '
            for curarticle in reversed(curfeed.articles):
                newtitle += curarticle.title + " & "
                curarticle.description = curarticle.title
            # Drop the trailing " & ".
            curfeed.title = newtitle[0:-3]
            self.title = curfeed.title
        return feeds
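
    # Persist a fingerprint of every article seen so far under calibre's
    # configuration directory; on the next run, any article whose fingerprint
    # is already on disk is treated as a double and removed. This follows the
    # duplicate-skipping pattern from the calibre documentation.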
    def remove_and_save_doubles(self, feeds):
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)
        for feed in feeds:
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)
            # Fingerprints of articles seen on previous runs.
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())
            cur_items = set()
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content: item_hash.update(article.content.encode('utf-8'))
                if article.summary: item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)
            # Overwrite the store with the current run's fingerprints.
            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')
        remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)
        return feeds
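
# A minimal standalone sketch (hypothetical inputs, run outside calibre) of
# the fingerprint format used by remove_and_save_doubles: the article URL
# plus the md5 of its content/summary.
if __name__ == '__main__':
    sample_url = 'https://pactwebserial.wordpress.com/example-chapter/'  # hypothetical URL
    sample_body = u'Example chapter text'
    h = md5()
    h.update(sample_body.encode('utf-8'))
    print(sample_url + ':' + h.hexdigest())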