# evmn/Webvision.py

## Webvision.py
#!/usr/bin/env  python
# encoding: utf-8
import re
from datetime import datetime

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.recipes import BasicNewsRecipe
# Front page of the Webvision site; parse_index() fetches this page and reads
# its 'chapter-children' list to discover the chapters.
base_url = 'https://webvision.med.utah.edu/'

class Web_Vision(BasicNewsRecipe):
	"""Calibre recipe that assembles the Webvision site into a single book.

	The chapter list on the site's front page is split into one feed per
	"Part ..." heading; the entries listed after a heading become that
	feed's articles.
	"""

	title = 'WEBVISION: The Organization of the Retina and Visual System'
	cover_url = 'http://webvision.med.utah.edu/wp-content/uploads/2011/05/Titlez1.jpg'
	# Disabled alternative, kept for reference:
	# remove_tags_before = dict(attrs={'class':'entry-content'})
	remove_tags_before = dict(attrs={'class':'entry-header'})
	remove_tags_after = dict(attrs={'class':'entry-content'})
	__author__ = ''
	language = 'en'
	encoding = 'utf-8'
	timefmt = ''
	extra_css = 'h1{text-align:center}'
	preprocess_regexps = [
		# Drop paragraphs that contain only a non-breaking space, and bare <br> tags.
		(re.compile(u'<p>&nbsp;</p>'), u""),
		(re.compile(u'<br>'), u""),
		# Disabled alternatives, kept for reference:
		#(re.compile(u'<p style="text-align: center;"><a href="http://uuhsc.utah.edu/MoranEyeCenter/research/faculty/emeritus/helga_kolb.htm" target="_blank" rel="noopener">Helga Kolb</a></p>'), u""),
		#(re.compile(u'<p>(About)? the authors?</p>.*</div>', re.DOTALL | re.IGNORECASE), u"</div>"),
		#(re.compile(u'<p[^>]*?>Last Updated?: .*</div>', re.DOTALL), u"</div>"),
		# Cut the " by <author>" suffix out of <h1> headings.
		(re.compile(u' by .*?</h1>', re.IGNORECASE), u"</h1>"),
	]

	no_stylesheets = True
	resolve_internal_links = True
	remove_javascript = True
	auto_cleanup = False
	delay = 1
	simultaneous_downloads = 5
	oldest_article = 999
	max_articles_per_feed = 999

	def parse_index(self):
		"""Build the feed list from the chapter menu on the front page.

		An entry whose title starts with "Part " opens a new feed named
		after that title (minus any " by <author>" suffix). Every following
		entry, except "How the Retina Works", is added to the open feed with
		its "Part ..." prefix and " by ..." suffix stripped. Entries that
		appear before the first "Part" heading are discarded once a heading
		is seen. A "Part" without any chapters gets its own page as its
		single article.

		Returns:
			A list of ``(feed_title, articles)`` tuples; each article is a
			dict with ``'title'`` and ``'url'`` keys.

		Raises:
			ValueError: if the front page has no 'chapter-children' list.
		"""
		part_pt = re.compile("Part ")
		pdf_pt = re.compile("How the Retina Works")
		ignore_pt = re.compile("(Part [IVX:]* )|( by .*)", re.IGNORECASE)
		author_pt = re.compile("( by .*)", re.IGNORECASE)

		soup = self.index_to_soup(base_url)
		menu = soup.find('ul', {'class':'chapter-children'})
		if menu is None:
			# Fail with a readable message instead of an AttributeError on None.
			raise ValueError('chapter list not found at ' + base_url)

		feeds = []
		sec_title = ""
		heading = None  # article dict for the page of the currently open "Part"
		section = []
		for entry in menu.findAll('li', {'class': 'page_item'}):
			link = entry.find('a')
			if link is None:
				# A list item without a link has nothing to download.
				continue
			title = link.getText()
			url = link['href']
			self.log(title)
			if part_pt.match(title):
				# Close the previous "Part". The length test is only true once a
				# heading longer than the bare "Part " prefix has been seen.
				if len(sec_title) > 5:
					feeds.append((sec_title, section or [heading]))
				sec_title = author_pt.sub("", title)
				heading = {'title': ignore_pt.sub("", title), 'url': url}
				section = []
			elif not pdf_pt.match(title):
				section.append({'title': ignore_pt.sub("", title), 'url': url})

		# Close the last section; never emit a feed without articles.
		if section:
			feeds.append((sec_title, section))
		elif len(sec_title) > 5:
			feeds.append((sec_title, [heading]))

		for feed_title, articles in feeds:
			self.log(feed_title)
			for article in articles:
				self.log("    ", article['title'])
		return feeds