Skip to content

Instantly share code, notes, and snippets.

@ggsalas
Last active November 6, 2019 01:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ggsalas/1cafe377e09927f9ca9b4814b3730d2c to your computer and use it in GitHub Desktop.
Save ggsalas/1cafe377e09927f9ca9b4814b3730d2c to your computer and use it in GitHub Desktop.
Calibre recipe for downloading news from https://www.perfil.com (Diario Perfil)
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
class AdvancedUserRecipe1572824592(BasicNewsRecipe):
    """Calibre news recipe for Diario Perfil (https://www.perfil.com).

    Articles are collected from the site's per-section RSS feeds. Automatic
    cleanup is turned off so that images are kept; instead the article is
    reduced by hand to its header, lead photo and body text, and a few
    markup tweaks are applied before and after the download.
    """

    title = 'Diario Perfil'
    oldest_article = 1
    max_articles_per_feed = 35
    # The date placeholders are filled in by calibre's strftime when the
    # class body is evaluated.
    cover_url = strftime('http://img8.kiosko.net/%Y/%m/%d/ar/ar_perfil.750.jpg')
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    publication_type = 'newspaper'

    # Image handling.
    compress_news_images = True
    scale_news_images_to_device = True
    compress_news_images_max_size = 10  # kB
    # NOTE: ``scale_news_images`` is intentionally left at its default.
    # calibre documents it as a (width, height) tuple or None; the previous
    # value ``True`` is not a valid size. ``scale_news_images_to_device``
    # above already requests scaling to the output device.
    handle_gzip = True

    # auto_cleanup is disabled to get all the data (images); the content is
    # selected manually with keep_only_tags / remove_tags instead.
    auto_cleanup = False
    keep_only_tags = [
        dict(attrs={'class': ['fotoPrincipal', 'articuloHeader']}),
        dict(attrs={'id': ['bodytext']}),
    ]
    remove_tags = [
        dict(name=['meta', 'base', 'link', 'iframe', 'embed', 'object']),
        dict(attrs={'class': [
            'relatedPost',
            'destacadoNota',
            'embed-responsive',
            'iframeContainer',
        ]}),
    ]
    remove_attributes = ['style', 'font']
    no_stylesheets = True
    extra_css = """
        .fotoPrincipal,
        .figcaption {
            font-style: italic;
            font-size: .9em;
            margin-bottom: .5em;
        }
        .articuloMeta {
            display: none;
        }
        .articuloHeader h1 {
            line-height: 1em;
            margin: 0 0 .5em 0;
        }
        .articuloHeader h2 {
            font-size: 1em;
            line-height: 1em;
            color: #666666;
            margin: 0 0 1em 0;
        }
    """

    feeds = [
        ('Ultimo momento', 'https://www.perfil.com/rss/ultimomomento.xml'),
        ('Pol\xedtica', 'https://www.perfil.com/rss/politica.xml'),
        ('Econom\xeda', 'https://www.perfil.com/rss/economia.xml'),
        ('Internacionales', 'http://www.perfil.com/rss/internacional.xml'),
        ('Opini\xf3n', 'https://www.perfil.com/rss/columnistas.xml'),
        ('Sociedad', 'https://www.perfil.com/rss/sociedad.xml'),
        ('Cultura', 'https://www.perfil.com/rss/cultura.xml'),
        # Was 'Espect\xf3culos' ("Espectóculos"); the word is "Espectáculos".
        ('Espect\xe1culos', 'https://www.perfil.com/rss/espectaculos.xml'),
        ('Ciencia', 'http://www.perfil.com/rss/ciencia.xml'),
        ('Salud', 'http://www.perfil.com/rss/salud.xml'),
        ('Tecnolog\xeda', 'https://www.perfil.com/rss/tecnologia.xml'),
    ]

    # Images on highlights view
    def populate_article_metadata(self, article, soup, first):
        """Register the first image of an article as its TOC thumbnail.

        :param article: article object handed back to ``add_toc_thumbnail``.
        :param soup: parsed HTML of the downloaded page.
        :param first: only when true is a thumbnail looked for.
        """
        # ``add_toc_thumbnail`` is probed with hasattr, so the recipe keeps
        # working on a base class that does not provide it.
        if not first or not hasattr(self, 'add_toc_thumbnail'):
            return
        image = soup.find('img')
        if image is None:
            return
        # An <img> may lack a ``src`` attribute; skip it rather than raising
        # KeyError and aborting the metadata step.
        src = image.get('src')
        if src:
            self.add_toc_thumbnail(article, src)

    def preprocess_html(self, soup):
        """Turn every ``<figcaption>`` into ``<div class="figcaption">``.

        This lets the ``.figcaption`` rule in ``extra_css`` style captions.

        :param soup: parsed HTML of the page; modified in place.
        :return: the same ``soup`` object.
        """
        for caption in soup.findAll(['figcaption']):
            caption.name = 'div'
            caption['class'] = 'figcaption'
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Replace links with bold text and drop paragraphs that are only a
        non-breaking space.

        :param soup: parsed HTML of the page; modified in place.
        :param first_fetch: not used by this recipe.
        :return: the same ``soup`` object.
        """
        # Anchors are renamed, not removed, so their text stays in the
        # article, rendered as <strong>.
        for link in soup.findAll(['a']):
            link.name = 'strong'
        non_break_space = u'\xa0'
        for empty in soup.findAll('p', string=non_break_space):
            empty.extract()
        return soup
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment