@Tyderion
Created April 5, 2014 18:24
A calibre recipe that creates ebooks from the Pact web serial: http://pactwebserial.wordpress.com/
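To build an ebook from it, one would typically save the code below as pact.recipe and pass it to calibre's ebook-convert command line tool (the output filename here is just an example; the recipe can also be added through calibre's "Fetch news" dialog as a custom source):

    ebook-convert pact.recipe Pact.epub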
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
from hashlib import md5

class PactRecipe(BasicNewsRecipe):
    title = u'Pact'
    #oldest_article = 30
    #max_articles_per_feed = 100
    oldest_article = 10000
    max_articles_per_feed = 10000
    auto_cleanup = True
    feeds = [(u'Posts', u'https://pactwebserial.wordpress.com/feed/')]
    extra_css = 'p { margin-top: 0; margin-bottom: 0; text-indent: 1.5em } p > strong { margin-top: 1em; }'
    # Matches the "Last Chapter" / "Next Chapter" navigation links.
    chapter_regex = re.compile(r'.*?(Last Chapter|Next Chapter).*')
    reverse_article_order = True
    def preprocess_html(self, soup):
        # Strip the chapter-navigation links (and, where they sit in
        # their own paragraph, the whole paragraph) before conversion.
        for alink in soup.findAll('a'):
            if alink.contents and len(alink.contents[0]) > 0:
                if self.chapter_regex.match(repr(alink.contents)):
                    if alink.parent.name == 'p':
                        alink.parent.extract()
                    else:
                        alink.extract()
        return soup
    def create_feed_from_article(self, article, index):
        # Wrap a single article in its own Feed (one chapter per feed).
        feed = Feed()
        feed.title = article.title
        feed.id_counter = index
        feed.articles = [article]
        return feed
    def parse_feeds(self):
        # Do the "official" parse_feeds first, then drop chapters that
        # were already downloaded on a previous run.
        feeds = self.remove_and_save_doubles(BasicNewsRecipe.parse_feeds(self))
        # Build the book title from the remaining chapter titles,
        # e.g. "Pact: Bonds 1.1 & Bonds 1.2".
        for curfeed in feeds:
            if len(curfeed.articles) == 0:
                self.abort_recipe_processing("No new Chapters")
                break
            newtitle = 'Pact: '
            for curarticle in reversed(curfeed.articles):
                newtitle += curarticle.title + " & "
                curarticle.description = curarticle.title
            curfeed.title = newtitle[0:-3]  # strip the trailing " & "
            self.title = curfeed.title
        return feeds
    def remove_and_save_doubles(self, feeds):
        # Persist one hash per article under calibre's config dir and
        # drop every article whose hash was already seen on a past run.
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)
        for feed in feeds:
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())
            cur_items = set()
            for article in feed.articles[:]:
                # Key on url + md5(content + summary), so an edited
                # chapter counts as new.
                item_hash = md5()
                if article.content:
                    item_hash.update(article.content.encode('utf-8'))
                if article.summary:
                    item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)
            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')
        # Optionally discard feeds that are now empty.
        remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)
        return feeds
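
Since the deduplication state lives on disk, a minimal sketch of how one might reset it so the next run re-downloads every chapter (the 'Pact' folder name mirrors the recipe's title attribute; this helper is hypothetical, not part of the recipe itself):

    # Hypothetical reset helper: deletes the stored per-article hashes.
    import os, shutil
    from calibre.constants import config_dir
    feed_dir = os.path.join(config_dir, 'recipes', 'recipe_storage', 'Pact')
    if os.path.isdir(feed_dir):
        shutil.rmtree(feed_dir)  # next run treats all chapters as new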