Last active
February 16, 2022 09:11
-
-
Save evmn/e6608696a8e3a8724987f6fab534f2d6 to your computer and use it in GitHub Desktop.
Calibre recipe for "WEBVISION: The Organization of the Retina and Visual System". Rename Webvision.py to Webvision.recipe before use.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
import re
from datetime import datetime

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.recipes import BasicNewsRecipe
# Front page of the Webvision site; parse_index() reads the chapter list from it.
base_url = 'https://webvision.med.utah.edu/'
class Web_Vision(BasicNewsRecipe):
    """Calibre recipe that packages the Webvision online textbook.

    The chapter list on the site's front page is turned into one feed per
    "Part ..." heading by ``parse_index``.
    """

    title = 'WEBVISION: The Organization of the Retina and Visual System'
    cover_url = 'http://webvision.med.utah.edu/wp-content/uploads/2011/05/Titlez1.jpg'
    # Disabled alternative, kept for reference:
    # remove_tags_before = dict(attrs={'class':'entry-content'})
    remove_tags_before = dict(attrs={'class': 'entry-header'})
    remove_tags_after = dict(attrs={'class': 'entry-content'})
    __author__ = ''
    language = 'en'
    encoding = 'utf-8'
    timefmt = ''
    extra_css = 'h1{text-align:center}'
    preprocess_regexps = [
        (re.compile(u'<p> </p>'), u""),
        (re.compile(u'<br>'), u""),
        # Disabled rules, kept for reference:
        #(re.compile(u'<p style="text-align: center;"><a href="http://uuhsc.utah.edu/MoranEyeCenter/research/faculty/emeritus/helga_kolb.htm" target="_blank" rel="noopener">Helga Kolb</a></p>'), u""),
        #(re.compile(u'<p>(About)? the authors?</p>.*</div>', re.DOTALL | re.IGNORECASE), u"</div>"),
        #(re.compile(u'<p[^>]*?>Last Updated?: .*</div>', re.DOTALL), u"</div>"),
        # Drop the " by <author>" tail from the chapter heading.
        (re.compile(u' by .*?</h1>', re.IGNORECASE), u"</h1>"),
    ]
    no_stylesheets = True
    resolve_internal_links = True
    remove_javascript = True
    auto_cleanup = False
    delay = 1
    simultaneous_downloads = 5
    oldest_article = 999
    max_articles_per_feed = 999

    # Patterns used to classify and clean the link titles of the index page.
    # Compiled once here instead of on every parse_index() call.
    _PART_HEADING = re.compile(r'Part ')
    # Entry that is left out of the book (presumably a PDF link -- confirm).
    _SKIPPED_ENTRY = re.compile(r'How the Retina Works')
    _TITLE_NOISE = re.compile(r'(Part [IVX:]* )|( by .*)', re.IGNORECASE)
    _AUTHOR_SUFFIX = re.compile(r'( by .*)', re.IGNORECASE)

    def parse_index(self):
        """Build the feed list from the chapter list on the front page.

        Returns:
            A list of ``(section_title, articles)`` tuples where ``articles``
            is a list of ``{'title': ..., 'url': ...}`` dicts.

        Raises:
            ValueError: if the front page has no ``ul.chapter-children`` list.
        """
        soup = self.index_to_soup(base_url)
        chapter_list = soup.find('ul', {'class': 'chapter-children'})
        if chapter_list is None:
            raise ValueError(
                'No <ul class="chapter-children"> found at ' + base_url)

        entries = []
        for item in chapter_list.findAll('li', {'class': 'page_item'}):
            link = item.find('a')
            # An <li> without a usable link cannot become an article.
            if link is None or not link.get('href'):
                continue
            entries.append((link.getText(), link['href']))

        feeds = self._group_chapters(entries)
        for section_title, articles in feeds:
            self.log.debug(section_title)
            for article in articles:
                self.log.debug(' ', article['title'])
        return feeds

    @staticmethod
    def _group_chapters(entries):
        """Group ``(title, url)`` pairs into feeds, one per "Part ..." heading.

        Entries listed before the first Part heading are dropped when Part
        headings exist. A Part without sub-chapters uses its own page as its
        only article, including when it is the last entry. If there are no
        Part headings at all, every chapter goes into one feed with an empty
        title.
        """
        cls = Web_Vision
        feeds = []
        open_part = None  # (feed title, cleaned page title, url)
        articles = []

        def close_part(part, collected):
            feed_title, page_title, page_url = part
            if not collected:
                collected = [{'title': page_title, 'url': page_url}]
            return (feed_title, collected)

        for title, url in entries:
            if cls._PART_HEADING.match(title):
                if open_part is not None:
                    feeds.append(close_part(open_part, articles))
                open_part = (
                    cls._AUTHOR_SUFFIX.sub('', title),
                    cls._TITLE_NOISE.sub('', title),
                    url,
                )
                articles = []
            elif not cls._SKIPPED_ENTRY.match(title):
                articles.append(
                    {'title': cls._TITLE_NOISE.sub('', title), 'url': url})

        if open_part is not None:
            feeds.append(close_part(open_part, articles))
        elif articles:
            feeds.append(('', articles))
        return feeds
Author
evmn
commented
Feb 16, 2022
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment