Calibre recipe to create books from the Pact web serial: http://pactwebserial.wordpress.com/
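# Calibre news recipe that turns the Pact web serial into an e-book.
# Typical use (standard calibre tooling; the file name is your choice):
# save this as pact.recipe, then either add it in calibre via
# 'Fetch news -> Add a custom news source', or run:
#     ebook-convert pact.recipe pact.epub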
import time
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
from hashlib import md5
class PactRecipe(BasicNewsRecipe):
    title = u'Pact'
    #oldest_article = 30
    #max_articles_per_feed = 100
    # Effectively unlimited, so the whole serial archive is fetched.
    oldest_article = 10000
    max_articles_per_feed = 10000
    auto_cleanup = True
    feeds = [(u'Posts', u'https://pactwebserial.wordpress.com/feed/')]
    extra_css = 'p { margin-top: 0; margin-bottom: 0; text-indent: 1.5em } p > strong { margin-top: 1em; }'
    # Matches the labels of the inter-chapter navigation links.
    chapter_regex = re.compile(r'.*?(Last Chapter|Next Chapter).*')
    # RSS lists newest posts first; reverse to get reading order.
    reverse_article_order = True
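
    # Chapter pages link to each other with 'Last Chapter' / 'Next Chapter'
    # anchors (assumed markup: <p><a href="...">Next Chapter</a></p>).
    # preprocess_html removes them so the navigation noise does not end up
    # in the book.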
    def preprocess_html(self, soup):
        for alink in soup.findAll('a'):
            # Guard against anchors with no contents before indexing.
            if alink.contents and len(alink.contents[0]) > 0:
                if self.chapter_regex.match(repr(alink.contents)):
                    if alink.parent.name == 'p':
                        # The link sits in its own paragraph: drop the paragraph.
                        alink.parent.extract()
                    else:
                        alink.extract()
        return soup
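
    # Helper that wraps a single article in its own one-article Feed.
    # Not referenced elsewhere in this recipe.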
    def create_feed_from_article(self, article, index):
        feed = Feed()
        feed.title = article.title
        feed.id_counter = index
        feed.articles = [article]
        return feed
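
    # Run the stock feed parsing, drop chapters that were already downloaded
    # on a previous run, then rename the feed (and the book) after the
    # chapters it actually contains.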
    def parse_feeds(self):
        # Do the "official" parse_feeds first, then filter out doubles.
        feeds = self.remove_and_save_doubles(BasicNewsRecipe.parse_feeds(self))
        for curfeed in feeds:
            if len(curfeed.articles) == 0:
                self.abort_recipe_processing("No new Chapters")
                break
            # Build a combined title from the new chapters, oldest first.
            newtitle = 'Pact: '
            for curarticle in reversed(curfeed.articles):
                newtitle += curarticle.title + " & "
                curarticle.description = curarticle.title
            # Drop the trailing " & ".
            curfeed.title = newtitle[0:-3]
            self.title = curfeed.title
        return feeds
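
    # Persist a fingerprint of every article seen so far under calibre's
    # configuration directory; on the next run, any article whose fingerprint
    # is already on disk is treated as a double and removed. This follows the
    # duplicate-skipping pattern from the calibre documentation.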
    def remove_and_save_doubles(self, feeds):
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)
        for feed in feeds:
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)
            # Fingerprints of articles seen on previous runs.
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())
            cur_items = set()
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content: item_hash.update(article.content.encode('utf-8'))
                if article.summary: item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)
            # Overwrite the store with the current run's fingerprints.
            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')
        remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)
        return feeds
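
# A minimal standalone sketch (hypothetical inputs, run outside calibre) of
# the fingerprint format used by remove_and_save_doubles: the article URL
# plus the md5 of its content/summary.
if __name__ == '__main__':
    sample_url = 'https://pactwebserial.wordpress.com/example-chapter/'  # hypothetical URL
    sample_body = u'Example chapter text'
    h = md5()
    h.update(sample_body.encode('utf-8'))
    print(sample_url + ':' + h.hexdigest())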