Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LoveMeWithoutAll/7e5fdb0ba69e074ecf379eac14cf2c87 to your computer and use it in GitHub Desktop.
Save LoveMeWithoutAll/7e5fdb0ba69e074ecf379eac14cf2c87 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# vim:ft=python tabstop=8 expandtab shiftwidth=4 softtabstop=4
from __future__ import print_function
__version__ = '2.6.2'
"""
2.6.2: Fix. Add also removal of H1 to clean up all titles, to insert a new one
**TAGS** (list of strings or empty list: []) if [] (empty list) then the plugin will connect Pocket and fetch articles based on the configuration of the plugin.
Next, the plugin will get tags of these articles and group them into sections in the final ebook.
If TAGS has elements, e.g., TAGS = ['tag1', 'tag2'] then only these tags will be fetched from Pocket.
**TAGS_EXCEPTIONS** (list of strings or empty list: []) if [] (empty list) then the plugin will ignore it.
If TAGS_EXCEPTIONS has elements, e.g., TAGS_EXCEPTIONS = ['tag3', 'tag4'] then the articles tagged with these tags will be ignored.
That is, tag3 and tag4 won't appear as sections, and their articles won't appear in the "Untagged" section.
This variable is meant to be used with TAGS = [], as it doesn’t make any sense to specify a tag both in TAGS and in TAGS_EXCEPTIONS.
**INCLUDE_UNTAGGED** (True or False) if True then put all fetched and untagged articles in the last section 'Untagged'.
If False then skip these articles and don't create the section 'Untagged'. Bear in mind that if TAGS is populated (e.g. TAGS = ['tag1', 'tag2']),
INCLUDE_UNTAGGED = True and other tags exist in Pocket (e.g. tag3, tag4) then the Untagged section will include untagged articles
in Pocket AND articles tagged with tag3 and tag4. That behavior can be avoided using TAGS_EXCEPTIONS.
**ARCHIVE_DOWNLOADED** (True or False) do you want to archive articles after fetching
**MAX_ARTICLES_PER_FEED** (number) how many articles do you want to fetch per FEED (a FEED can also
be considered a TAG, so this value is applied to each TAG separately).
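**SORT_METHOD** ('oldest' or 'newest') the order in which the articles are requested from Pocket
**SORT_WITHIN_TAG_BY_TITLE** (True or False) if True then the articles inside each section are sorted by title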
**OLDEST_ARTICLE** (number) fetch articles added (modified) in Pocket for number of days, 7 will give you articles added/modified in Pocket for the last week
**TO_PULL** ('all' or 'unread') What articles to pull? unread only or all?
**TITLE_WITH_TAGS** (True or False) if True then the ebook filename will look like
Pocket: INVEST P2P [Sun, 05 Jan 2020]. For many tags this might be too long, but if you make a single-tag ebook this might be super fun!
"""
# CONFIGURATION ###########################################################
# Tags to fetch from Pocket. When the list is empty the recipe first queries
# Pocket and builds the tag list from the fetched articles.
TAGS = [] # [] or ['tag1', 'tag2']
# Tags whose articles are left out of the ebook and are never queued for archiving.
TAGS_EXCEPTIONS = [] # [] or ['tag3', 'tag4']
# When True an extra request with an empty tag is made; its articles form the
# "Untagged" section.
INCLUDE_UNTAGGED = True
# When True the ids of the fetched articles are sent to Pocket with an
# "archive" action once the recipe finishes.
ARCHIVE_DOWNLOADED = True
# Sent as the "count" field of every per-tag request.
MAX_ARTICLES_PER_FEED = 100
# Age limit in days; the "since" field is computed as now - 86400 * OLDEST_ARTICLE.
OLDEST_ARTICLE = 90
# Sent as the "sort" field of every request ('oldest' or 'newest').
SORT_METHOD = 'newest'
# When True the articles of each section are sorted by their title.
SORT_WITHIN_TAG_BY_TITLE = True
# Sent as the "state" field of every request ('all' or 'unread').
TO_PULL = 'unread'
# When True (and TAGS is not empty) the upper-cased tags are used to build the
# title suffix of the recipe.
TITLE_WITH_TAGS = False
#############################################################################
from calibre.constants import config_dir
from calibre.utils.config import JSONConfig
from calibre.web.feeds.news import BasicNewsRecipe
from collections import namedtuple
from os import path
from time import localtime, strftime, time
import sys
import errno
import json
import mechanize
import operator
try:
from urllib.error import HTTPError
except ImportError:
from urllib2 import HTTPError
__license__ = 'GPL v3'
__copyright__ = '2019, David Orchard'
class PocketConfig:
    """Authentication state of the recipe, persisted as a JSON file.

    The file lives at ``<config_dir>/custom_recipes/Pocket.json`` and stores
    the instance attributes ``state``, ``token`` and ``user``.
    """

    __file_path = path.join(config_dir, 'custom_recipes', 'Pocket.json')

    class AuthState:
        """Steps of the authorization handshake with Pocket."""
        FirstRun = 1
        Authorizing = 2
        Authorized = 3

    def __init__(self, state = AuthState.FirstRun, token = None, user = None):
        """Create a configuration.

        :param state: one of the ``AuthState`` values
        :param token: request token (while authorizing) or access token
        :param user: Pocket user name, known once authorized
        """
        self.state = state
        self.token = token
        self.user = user

    @staticmethod
    def from_file():
        """Return a configuration initialised from the JSON file, if it exists."""
        config = PocketConfig()
        config.load()
        return config

    def load(self):
        """Overwrite the attributes with the truthy values stored in the file.

        A missing file is not an error: the current values are kept. Any other
        I/O error is re-raised.
        """
        try:
            with open(self.__file_path) as config_file:
                stored = json.load(config_file)
        except IOError as e:
            # File not found: keep the defaults
            if e.errno != errno.ENOENT:
                raise
            return
        if not isinstance(stored, dict):
            return
        for key in list(self.__dict__):
            # .get() so that a file written without one of the keys does not
            # raise KeyError; falsy values never override the current ones.
            value = stored.get(key)
            if value:
                setattr(self, key, value)

    def save(self):
        """Write the instance attributes to the JSON file."""
        with open(self.__file_path, 'w') as config_file:
            json.dump(self.__dict__, config_file)
class Pocket(BasicNewsRecipe):
    """Calibre recipe that turns articles saved in Pocket into an ebook.

    The articles are grouped into one section per tag (plus an optional
    "Untagged" section) and, if configured, archived in Pocket afterwards.
    The OAuth state is kept in ``PocketConfig``.
    """

    config = PocketConfig.from_file()
    __author__ = 'David Orchard'
    description = (
        '\n'
        'Modified by Marcin Magnus.\n'
        'Fetches articles saved with <a href="https://getpocket.com/">Pocket</a> and archives them.<br>\n'
    ) + ((
        '\n'
        'Click <a href="https://getpocket.com/connected_applications">here</a>\n'
        'to disconnect Calibre from the Pocket account "{}".\n'
    ).format(config.user) if config.user else (
        '\n'
        "Run 'Fetch News' with this source scheduled to initiate authentication with Pocket.\n"
    ))
    publisher = 'Pocket.com'
    category = 'info, custom, Pocket'
    # User-configurable settings -----------------------------------------------
    tagsList = TAGS
    oldest_article = OLDEST_ARTICLE
    max_articles_per_feed = MAX_ARTICLES_PER_FEED
    archive_downloaded = ARCHIVE_DOWNLOADED
    include_untagged = INCLUDE_UNTAGGED
    series_name = 'Pocket'
    sort_method = SORT_METHOD
    to_pull = TO_PULL
    publication_type = 'magazine'
    title = "Pocket"
    # timefmt = '' # uncomment to remove date from the filenames, if commented then you will get something like `Pocket [Wed, 13 May 2020]`
    masthead_url = "https://github.com/mmagnus/Pocket-Plus-Calibre-Plugin/raw/master/doc/masthead.png"
    # will make square cover; this will replace text and cover of the default
    # cover_url = "https://github.com/mmagnus/Pocket-Plus-Calibre-Plugin/raw/master/doc/cover.png"
    # --------------------------------------------------------------------------
    # Inherited developer settings
    auto_cleanup = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'url'}
    # Custom developer settings
    consumer_key = '87006-2ecad30a91903f54baf0ee05'
    redirect_uri = 'https://calibre-ebook.com/'
    base_url = 'https://app.getpocket.com'
    # Class-level default only; parse_index() replaces it with a per-instance
    # list so that item ids are never shared between recipe instances.
    to_archive = []
    # Articles whose resolved URL contains one of these fragments are neither
    # downloaded nor archived. Override in a subclass to change the filter.
    excluded_url_fragments = ('naver', 'blog.me', 'twitter')
    simultaneous_downloads = 10
    # Both rules are needed; assigning extra_css twice kept only the last one.
    extra_css = (
        '.touchscreen_navbar {display: none;}\n'
        '.calibre_navbar { visibility: hidden; }'
    )
    # TITLE_WITH_TAGS: put the upper-cased tags into the title, which gives
    # e.g. "Pocket: INVEST P2P [Sun, 05 Jan 2020]" once the date is appended.
    tags_title = ' '
    if TITLE_WITH_TAGS and tagsList and tagsList[-1] != '':
        tags_title = ':' + ' '.join(tagsList).upper() + ' '
        title = 'Pocket: ' + ' '.join(tagsList).upper()

    # Pocket API helpers -------------------------------------------------------
    def _pocket_request(self, url, payload):
        """Return a JSON POST request for ``url`` carrying ``payload`` (a dict).

        The body is produced with json.dumps so that values such as tag names
        containing quotes or backslashes are escaped correctly.
        """
        return mechanize.Request(
            url,
            json.dumps(payload),
            headers={
                'Content-Type': 'application/json; charset=UTF8',
                'X-Accept': 'application/json'
            }
        )

    def _pocket_call(self, url, payload):
        """Send ``payload`` to ``url`` and return the decoded JSON response."""
        return json.load(self.browser.open(self._pocket_request(url, payload)))

    def _pocket_get(self, count, tag=None):
        """Query ``/v3/get`` and return the decoded response.

        :param count: maximum number of items to request
        :param tag: tag to filter on; ``None`` sends no "tag" field at all,
            while the empty string is sent as-is (used for "Untagged")
        :raises HTTPError: on any HTTP failure; on 401 the authorization is
            reset first because Calibre's access has been removed
        """
        payload = {
            'consumer_key': self.consumer_key,
            'access_token': self.config.token,
            # Sent as strings, exactly like the hand-written JSON used to do
            'count': str(count),
            'since': str(int(time()) - 86400 * self.oldest_article),
            'state': self.to_pull,
            'detailType': 'complete',
            'sort': self.sort_method,
        }
        if tag is not None:
            payload['tag'] = tag
        try:
            return self._pocket_call("https://getpocket.com/v3/get", payload)
        except HTTPError as e:
            if e.code == 401:
                # Calibre access has been removed
                self.reauthorize()
            raise

    def _discover_tags(self):
        """Return the tags found on the user's articles, sorted case-insensitively.

        Only the first tag of every article is taken, as before. Aborts the
        recipe when Pocket returns no articles at all.
        """
        response = self._pocket_get(self.max_articles_per_feed * 1000)  # something "unlimited"
        if not response['list']:
            self.abort_recipe_processing('No unread articles in the Pocket account "{}"'.format(self.config.user))
            return []
        tags = []
        for item in response['list'].values():
            # `tags` is absent on untagged items; dict views are not
            # indexable on Python 3, hence next(iter(...)).
            first_tag = next(iter(item.get('tags') or ()), None)
            if first_tag is not None and first_tag not in tags:
                tags.append(first_tag)
        tags.sort(key=lambda tag: tag.lower())
        return tags

    def _is_excluded_url(self, url):
        """Return True if ``url`` contains one of ``excluded_url_fragments``."""
        return any(fragment in url for fragment in self.excluded_url_fragments)

    # Authorization ------------------------------------------------------------
    def first_run(self):
        """Obtain a request token from Pocket and move to the Authorizing state."""
        response = self._pocket_call("https://getpocket.com/v3/oauth/request", {
            'consumer_key': self.consumer_key,
            'redirect_uri': self.redirect_uri,
        })
        self.config = PocketConfig(
            state=PocketConfig.AuthState.Authorizing,
            token=response['code']
        )

    def authorize(self):
        """Exchange the request token for an access token.

        :raises HTTPError: on any HTTP failure; on 403 the handshake is
            restarted first.
        """
        assert self.config.state == PocketConfig.AuthState.Authorizing, "Authorization process not yet begun"
        assert self.config.token, "No request token"
        try:
            response = self._pocket_call("https://getpocket.com/v3/oauth/authorize", {
                'consumer_key': self.consumer_key,
                'code': self.config.token,
            })
        except HTTPError as e:
            if e.code == 403:
                # The code has already been used, or the user denied access
                self.reauthorize()
            raise
        self.config = PocketConfig(
            state=PocketConfig.AuthState.Authorized,
            token=response["access_token"],
            user=response["username"],
        )

    def reauthorize(self):
        """Forget the stored credentials and restart the handshake."""
        self.config = PocketConfig()
        self.ensure_authorization()

    def ensure_authorization(self):
        """Advance the authorization handshake by one step, if needed.

        On the first run the recipe is aborted with a message that contains
        the link the user has to open to grant access.
        """
        # `==` rather than `is`: the state is a plain int that may have been
        # read back from the JSON file.
        if self.config.state == PocketConfig.AuthState.FirstRun:
            self.first_run()
            self.config.save()
            self.abort_recipe_processing((
                '\n'
                'Calibre must be granted access to your Pocket account. Please click\n'
                '<a href="https://getpocket.com/auth/authorize?request_token={0}&redirect_uri={1}">here</a>\n'
                'to authenticate via a browser, and then re-fetch the news.\n'
            ).format(self.config.token, self.redirect_uri))
        elif self.config.state == PocketConfig.AuthState.Authorizing:
            self.authorize()
            self.config.save()

    def get_browser(self, *args, **kwargs):
        """Return the browser of the base class after ensuring authorization."""
        self.browser = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        self.ensure_authorization()
        return self.browser

    # Index --------------------------------------------------------------------
    def parse_index(self):
        """Build the list of sections for calibre.

        :returns: list of ``(section title, [article dict, ...])`` tuples, one
            per tag that yielded at least one article
        """
        assert self.config.state == PocketConfig.AuthState.Authorized, "Not yet authorized"
        assert self.config.token, "No access token"

        # Work on copies: appending to self.tagsList / self.to_archive in
        # place would modify the shared TAGS list and class attribute.
        self.to_archive = []
        tags = list(self.tagsList)
        if not tags:
            tags = self._discover_tags()
        if self.include_untagged and '' not in tags:
            tags.append('')  # the empty tag yields the "Untagged" section
        self.tagsList = tags

        articles = []
        ignored_ids = set()  # excluded by tag or by URL
        queued_ids = set()   # already in self.to_archive

        for tag in tags:
            items = self._pocket_get(self.max_articles_per_feed, tag)['list']
            if not items:
                continue
            tag_excluded = tag in TAGS_EXCEPTIONS
            arts = []
            for item in items.values():
                item_id = item['item_id']
                url = item.get('resolved_url') or ''
                # An article is excluded when it is fetched through an
                # excluded tag, carries an excluded tag itself (so the result
                # does not depend on the order the tags are processed in) or
                # points to an excluded site.
                if (tag_excluded
                        or any(t in TAGS_EXCEPTIONS for t in (item.get('tags') or ()))
                        or self._is_excluded_url(url)):
                    ignored_ids.add(item_id)
                if item_id in ignored_ids:
                    continue
                try:
                    arts.append({
                        'title': item['resolved_title'],
                        'url': item['resolved_url'],
                        'date': item['time_added'],
                        'description': item['excerpt'],
                    })
                except KeyError:
                    # Incomplete item: report it and leave it unarchived
                    print(item, file=sys.stderr)
                    continue
                # Archive only what really goes into the book, and only once
                # even if the article shows up under several tags.
                if self.archive_downloaded and item_id not in queued_ids:
                    queued_ids.add(item_id)
                    self.to_archive.append(item_id)
            if SORT_WITHIN_TAG_BY_TITLE:
                arts.sort(key=operator.itemgetter('title'))
            section = tag or "Untagged"
            # No empty sections, and no sections for excluded tags
            if arts and section not in TAGS_EXCEPTIONS:
                articles.append((section, arts))

        if not articles:
            self.abort_recipe_processing('No articles in the Pocket account %s to download' % (self.config.user))
        return articles

    # Archiving ----------------------------------------------------------------
    def archive(self):
        """Send one "archive" action per queued item id to Pocket."""
        assert self.config.state == PocketConfig.AuthState.Authorized, "Not yet authorized"
        assert self.config.token, "No access token"
        if not self.to_archive:
            return
        archived_time = int(time())
        request = self._pocket_request("https://getpocket.com/v3/send", {
            'consumer_key': self.consumer_key,
            'access_token': self.config.token,
            'actions': [{
                'action': 'archive',
                'item_id': item_id,
                'time': archived_time,
            } for item_id in self.to_archive],
        })
        self.browser.open(request)
        # Nothing is left to archive should cleanup() run again
        self.to_archive = []

    def cleanup(self):
        """Archive the downloaded articles once the recipe has finished."""
        # If we're in another state, then downloading didn't complete
        # (e.g. reauthorization needed) so there is no archiving to do
        if self.config.state == PocketConfig.AuthState.Authorized:
            self.archive()

    # Post-processing ----------------------------------------------------------
    # TODO: This works with EPUB, but not mobi/azw3
    # BUG: https://bugs.launchpad.net/calibre/+bug/1838486
    def postprocess_book(self, oeb, opts, log):
        """Add ``series_name`` as the series of the generated book."""
        oeb.metadata.add('series', self.series_name)

    def postprocess_html(self, soup, first):
        """Leave exactly one heading with the article title at the top.

        Every h1/h2 whose text contains the page title is emptied and a new
        h1 with the title is inserted as the first child of the body.
        Pages without a title or without a body are returned unchanged.
        """
        title_tag = soup.find('title')
        body = soup.find('body')
        if title_tag is None or body is None:
            return soup
        title = title_tag.text
        if not title.strip():
            # An empty title is "contained" in every heading, which would
            # wipe all of them.
            return soup
        for name in ('h1', 'h2'):
            for heading in soup.findAll(name):
                if title in heading.text:
                    heading.clear()
        new_tag = soup.new_tag('h1')
        new_tag.append(title)
        body.insert(0, new_tag)
        return soup

    def default_cover(self, cover_file):
        """
        Create a generic cover for recipes that don't have a cover
        This override adds time to the cover

        :param cover_file: writable file object that receives the image data
        :returns: True on success, False if the cover could not be generated
        """
        try:
            from calibre.ebooks import calibre_cover
            title = self.title
            if isinstance(title, bytes):
                title = title.decode('utf-8', 'replace')
            now = localtime()
            date = strftime(self.timefmt, now)
            # The hour is formatted by hand (no leading zero) because the
            # '%-H' directive is a platform extension that is not available
            # everywhere.
            stamp = '{0} {1}:{2}'.format(
                strftime('%a %d %b %Y', now), now.tm_hour, strftime('%M', now))
            img_data = calibre_cover(title, date, stamp)
            cover_file.write(img_data)
            cover_file.flush()
        except Exception:
            self.log.exception('Failed to generate default cover')
            return False
        return True
@LoveMeWithoutAll
Copy link
Author

line 344 & 358~365 are excepting code

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment