ggsalas/Diario Perfil_1003.recipe

## Diario Perfil_1003.recipe
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime

class AdvancedUserRecipe1572824592(BasicNewsRecipe):
    """Calibre news recipe for Diario Perfil (perfil.com).

    Articles come from the RSS feeds listed in ``feeds``.  Automatic
    clean-up is switched off; pages are trimmed explicitly with
    ``keep_only_tags`` / ``remove_tags`` and then adjusted by
    ``preprocess_html`` and ``postprocess_html``.
    """

    title = 'Diario Perfil'
    oldest_article = 1
    max_articles_per_feed = 35
    # strftime() fills the %Y/%m/%d placeholders when the class body runs,
    # so the cover URL points at the current day's front page image.
    cover_url = strftime('http://img8.kiosko.net/%Y/%m/%d/ar/ar_perfil.750.jpg')
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    publication_type = 'newspaper'

    # Image handling.
    compress_news_images = True
    scale_news_images_to_device = True
    compress_news_images_max_size = 10  # kB
    # NOTE(review): calibre documents scale_news_images as a (w, h) tuple;
    # True is kept as in the original -- confirm it is intended.
    scale_news_images = True
    handle_gzip = True

    # To get all the data (images)
    auto_cleanup = False

    # Only these containers are kept from each article page.
    keep_only_tags = [
        dict(attrs={'class': ['fotoPrincipal', 'articuloHeader']}),
        dict(attrs={'id': ['bodytext']}),
    ]

    # Elements dropped from whatever keep_only_tags retained.
    remove_tags = [
        dict(name=['meta', 'base', 'link', 'iframe', 'embed', 'object']),
        dict(attrs={'class': ['relatedPost', 'destacadoNota', 'embed-responsive', 'iframeContainer']}),
    ]

    remove_attributes = ['style', 'font']

    no_stylesheets = True

    extra_css = """
      .fotoPrincipal,
      .figcaption {
        font-style: italic;
        font-size: .9em;
        margin-bottom: .5em;
      }
      .articuloMeta {
        display: none;
      }
      .articuloHeader h1 {
        line-height: 1em;
        margin: 0 0 .5em 0;
      }
      .articuloHeader h2 {
        font-size: 1em;
        line-height: 1em;
        color: #666666;
        margin: 0 0 1em 0;
      }
    """

    # (section title, RSS URL) pairs.  Every feed lives on www.perfil.com and
    # is fetched over HTTPS.
    feeds = [
        ('Ultimo momento', 'https://www.perfil.com/rss/ultimomomento.xml'),
        ('Pol\xedtica', 'https://www.perfil.com/rss/politica.xml'),
        ('Econom\xeda', 'https://www.perfil.com/rss/economia.xml'),
        ('Internacionales', 'https://www.perfil.com/rss/internacional.xml'),
        ('Opini\xf3n', 'https://www.perfil.com/rss/columnistas.xml'),
        ('Sociedad', 'https://www.perfil.com/rss/sociedad.xml'),
        ('Cultura', 'https://www.perfil.com/rss/cultura.xml'),
        # \xe1 is "á" ("Espectáculos"); the previous \xf3 produced "Espectóculos".
        ('Espect\xe1culos', 'https://www.perfil.com/rss/espectaculos.xml'),
        ('Ciencia', 'https://www.perfil.com/rss/ciencia.xml'),
        ('Salud', 'https://www.perfil.com/rss/salud.xml'),
        ('Tecnolog\xeda', 'https://www.perfil.com/rss/tecnologia.xml'),
    ]

    # Images on highlights view
    def populate_article_metadata(self, article, soup, first):
        """Register the first image of the article as its TOC thumbnail.

        Does nothing unless ``first`` is true and this recipe object has an
        ``add_toc_thumbnail`` method.  An ``<img>`` without a ``src``
        attribute is ignored instead of raising ``KeyError``.
        """
        if not first or not hasattr(self, 'add_toc_thumbnail'):
            return
        img = soup.find('img')
        src = img.get('src') if img is not None else None
        if src:
            self.add_toc_thumbnail(article, src)

    def preprocess_html(self, soup):
        """Rename every ``<figcaption>`` to ``<div class="figcaption">``.

        The ``.figcaption`` class is the one styled in ``extra_css``.
        Returns the modified ``soup``.
        """
        for caption in soup.findAll('figcaption'):
            caption.name = 'div'
            caption['class'] = 'figcaption'

        return soup

    def postprocess_html(self, soup, first_fetch):
        """Turn links into bold text and drop non-breaking-space paragraphs.

        Every ``<a>`` tag is renamed to ``<strong>``, and each ``<p>`` whose
        string is exactly one non-breaking space is removed.  Returns the
        modified ``soup``.
        """
        for link in soup.findAll('a'):
            link.name = 'strong'

        non_break_space = u'\xa0'
        for empty in soup.findAll('p', string=non_break_space):
            empty.extract()

        return soup
	#!/usr/bin/env python2
	# vim:fileencoding=utf-8
	from __future__ import unicode_literals, division, absolute_import, print_function
	from calibre.web.feeds.news import BasicNewsRecipe
	from calibre import strftime

	class AdvancedUserRecipe1572824592(BasicNewsRecipe):
		"""Calibre news recipe for Diario Perfil (perfil.com).

		Articles come from the RSS feeds listed in ``feeds``.  Automatic
		clean-up is switched off; pages are trimmed explicitly with
		``keep_only_tags`` / ``remove_tags`` and then adjusted by
		``preprocess_html`` and ``postprocess_html``.
		"""

		title = 'Diario Perfil'
		oldest_article = 1
		max_articles_per_feed = 35
		# strftime() fills the %Y/%m/%d placeholders when the class body runs,
		# so the cover URL points at the current day's front page image.
		cover_url = strftime('http://img8.kiosko.net/%Y/%m/%d/ar/ar_perfil.750.jpg')
		ignore_duplicate_articles = {'title', 'url'}
		remove_empty_feeds = True
		publication_type = 'newspaper'

		# Image handling.
		compress_news_images = True
		scale_news_images_to_device = True
		compress_news_images_max_size = 10 # kB
		# NOTE(review): calibre documents scale_news_images as a (w, h) tuple;
		# True is kept as in the original -- confirm it is intended.
		scale_news_images = True
		handle_gzip = True

		# To get all the data (images)
		auto_cleanup = False

		# Only these containers are kept from each article page.
		keep_only_tags = [
			dict(attrs={'class': ['fotoPrincipal', 'articuloHeader']}),
			dict(attrs={'id': ['bodytext']}),
		]

		# Elements dropped from whatever keep_only_tags retained.
		remove_tags = [
			dict(name=['meta', 'base', 'link', 'iframe', 'embed', 'object']),
			dict(attrs={'class': ['relatedPost', 'destacadoNota', 'embed-responsive', 'iframeContainer']}),
		]

		remove_attributes = ['style', 'font']

		no_stylesheets = True

		# The whitespace inside this literal is left exactly as it was.
		extra_css = """
	.fotoPrincipal,
	.figcaption {
	font-style: italic;
	font-size: .9em;
	margin-bottom: .5em;
	}
	.articuloMeta {
	display: none;
	}
	.articuloHeader h1 {
	line-height: 1em;
	margin: 0 0 .5em 0;
	}
	.articuloHeader h2 {
	font-size: 1em;
	line-height: 1em;
	color: #666666;
	margin: 0 0 1em 0;
	}
	"""

		# (section title, RSS URL) pairs.  Every feed lives on www.perfil.com
		# and is fetched over HTTPS.
		feeds = [
			('Ultimo momento', 'https://www.perfil.com/rss/ultimomomento.xml'),
			('Pol\xedtica', 'https://www.perfil.com/rss/politica.xml'),
			('Econom\xeda', 'https://www.perfil.com/rss/economia.xml'),
			('Internacionales', 'https://www.perfil.com/rss/internacional.xml'),
			('Opini\xf3n', 'https://www.perfil.com/rss/columnistas.xml'),
			('Sociedad', 'https://www.perfil.com/rss/sociedad.xml'),
			('Cultura', 'https://www.perfil.com/rss/cultura.xml'),
			# \xe1 is "á" ("Espectáculos"); the previous \xf3 produced "Espectóculos".
			('Espect\xe1culos', 'https://www.perfil.com/rss/espectaculos.xml'),
			('Ciencia', 'https://www.perfil.com/rss/ciencia.xml'),
			('Salud', 'https://www.perfil.com/rss/salud.xml'),
			('Tecnolog\xeda', 'https://www.perfil.com/rss/tecnologia.xml'),
		]

		# Images on highlights view
		def populate_article_metadata(self, article, soup, first):
			"""Register the first image of the article as its TOC thumbnail.

			Does nothing unless ``first`` is true and this recipe object has
			an ``add_toc_thumbnail`` method.  An ``<img>`` without a ``src``
			attribute is ignored instead of raising ``KeyError``.
			"""
			if not first or not hasattr(self, 'add_toc_thumbnail'):
				return
			img = soup.find('img')
			src = img.get('src') if img is not None else None
			if src:
				self.add_toc_thumbnail(article, src)

		def preprocess_html(self, soup):
			"""Rename every ``<figcaption>`` to ``<div class="figcaption">``.

			The ``.figcaption`` class is the one styled in ``extra_css``.
			Returns the modified ``soup``.
			"""
			for caption in soup.findAll('figcaption'):
				caption.name = 'div'
				caption['class'] = 'figcaption'

			return soup

		def postprocess_html(self, soup, first_fetch):
			"""Turn links into bold text and drop non-breaking-space paragraphs.

			Every ``<a>`` tag is renamed to ``<strong>``, and each ``<p>``
			whose string is exactly one non-breaking space is removed.
			Returns the modified ``soup``.
			"""
			for link in soup.findAll('a'):
				link.name = 'strong'

			non_break_space = u'\xa0'
			for empty in soup.findAll('p', string=non_break_space):
				empty.extract()

			return soup