a10kiloham/ft.test.recipe Secret

## ft.test.recipe
#!/usr/bin/env  python2
# -*- mode: python -*-
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>'
'''
www.ft.com/todaysnewspaper/uk
'''

from calibre.web.feeds.news import BasicNewsRecipe
from urllib import unquote


def classes(classes):
    '''
    Build a tag matcher selecting elements that carry any of the given classes.

    classes: space-separated class names (split on single spaces).

    Returns a dict of the form {'attrs': {'class': predicate}}. The predicate
    receives a class attribute value; a falsy value (None, '') is handed back
    unchanged, otherwise the result is the frozenset of wanted names that
    appear in the whitespace-split value (empty when none match).
    '''
    wanted = frozenset(classes.split(' '))

    def matches(value):
        # A missing or empty class attribute is returned as is, mirroring the
        # short-circuit behaviour of ``value and ...``.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': matches}}


class FinancialTimes(BasicNewsRecipe):
    '''
    Calibre recipe that builds an e-book from the FT "today's newspaper" UK page.

    Sections and article links are scraped from INDEX. When a username and
    password are configured, get_browser() logs in through the two forms on
    LOGIN (e-mail first, then password); cleanup() opens LOGOUT afterwards.
    '''

    # Publication metadata.
    title = 'Financial Times (UK)'
    __author__ = 'Darko Miletic'
    description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."  # noqa
    publisher = 'The Financial Times Ltd.'
    category = 'news, finances, politics, UK, World'

    # Download and conversion options read by BasicNewsRecipe.
    oldest_article = 7
    language = 'en_GB'
    max_articles_per_feed = 250
    no_stylesheets = True
    use_embedded_content = False
    needs_subscription = True
    encoding = 'utf8'
    publication_type = 'newspaper'
    handle_gzip = True
    compress_news_images = True
    compress_news_images_auto_size = 50
    compress_news_images_max_size = 20
    scale_news_images_to_device = True
    simultaneous_downloads = 1
    ignore_duplicate_articles = {'url'}

    # URLs used by this recipe.
    LOGIN = 'https://accounts.ft.com/login'
    LOGOUT = 'https://myaccount.ft.com/logout'
    INDEX = 'https://www.ft.com/todaysnewspaper/uk'
    PREFIX = 'https://www.ft.com'  # prepended to site-relative article links

    # Only elements carrying one of these classes are kept from an article page.
    keep_only_tags = [
        classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body'),
        dict(attrs={'class': lambda x: x and 'main-image' in x.split()}),
        dict(attrs={'class': lambda x: x and 'article__content' in x.split()}),
    ]

    # Elements carrying one of these classes are stripped from the kept content.
    remove_tags = [
        classes('n-content-related-box tour-tip n-content-recommended n-content-video o-share-wrapper'),
        classes('article-tools article__right follow-tags comments infitite-scroll component-share'),
        # classes('onward-journey ftlabsaudioplayerholder article-info__time-byline n-content-recommended--single-story'),
        classes('onward-journey ftlabsaudioplayerholder article-print n-content-recommended--single-story'),
        # dict(attrs={'class': lambda x: x and 'ftlabsaudioplayerholder' in x.split()}),
    ]

    extra_css = '''
                body {font-family: Georgia,serif;}
                img {display:block;}
	        figcaption {font-style:italic; color:LightGray;}
                '''

    def get_browser(self):
        '''
        Return a browser prepared for fetching FT pages.

        The browser uses a desktop Firefox user agent, opens INDEX once and,
        when both username and password are set, submits the e-mail form and
        then the password form on LOGIN. Referrer headers and an
        ft-access-decision-policy cookie header are appended in every case.
        '''
        USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
        br = BasicNewsRecipe.get_browser(self, user_agent=USER_AGENT)
        # Uncomment to trace the HTTP exchange while debugging the login:
        # br.set_debug_http(True)
        # br.set_debug_responses(True)
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            # Login is a two-step flow: the e-mail form, then the password form.
            br.open(self.LOGIN)
            br.select_form(id='email-form')
            br['email'] = self.username
            br.submit()
            br.select_form(id='login-form')
            br['password'] = self.password
            br.submit()
        # NOTE(review): presumably these make requests look as if they arrive
        # from a privileged referrer -- confirm that FT still honours them.
        br.addheaders += [
            ('Referer', 'https://www.facebook.com'),
            ('referer', 'https://www.facebook.com'),
            ('referrer', 'https://www.facebook.com'),
            ('Cookie', 'ft-access-decision-policy=PRIVILEGED_REFERER_POLICY;'),
        ]
        return br

    def open_novisit(self, *args, **kwargs):
        '''
        Open a URL through a freshly prepared browser, retrying once on failure.

        All arguments are forwarded to the browser's open_novisit(). If the
        first attempt raises, the call pauses for two seconds and tries once
        more; an exception from the second attempt propagates to the caller.
        '''
        import time

        # get_browser is a method; calling it as a bare name raised NameError.
        br = self.get_browser()
        try:
            return br.open_novisit(*args, **kwargs)
        except Exception:
            # The request may have been throttled; wait a little and retry once.
            time.sleep(2)
            return br.open_novisit(*args, **kwargs)

    def get_cover_url(self):
        '''
        Return the URL of today's front-page image, or None if none is found.

        The dated kiosko.net image URL is probed first. If opening it fails,
        the kiosko.net FT page is scanned for the first <img> whose src ends
        with '750.jpg'. When that also yields nothing the failure is logged.
        '''
        from datetime import date

        # Read the date once so all path components refer to the same day.
        today = date.today()
        cover = 'http://img.kiosko.net/' + today.strftime('%Y/%m/%d') + '/uk/ft_uk.750.jpg'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except Exception:
            index = 'http://en.kiosko.net/uk/np/ft_uk.html'
            soup = self.index_to_soup(index)
            for image in soup.findAll('img', src=True):
                if image['src'].endswith('750.jpg'):
                    return image['src']
            self.log("\nCover unavailable")
            cover = None
        return cover

    def parse_index(self):
        '''
        Scrape INDEX into a list of (section title, articles) tuples.

        Each <div data-trackable="list"> is one section, titled by its first
        <h2>. Each <a data-trackable="main-link"> inside it becomes an article
        dict with 'title', 'url', 'description' and 'date' keys (the last two
        are always empty strings).
        '''
        soup = self.index_to_soup(self.INDEX)
        totalfeeds = []
        for div in soup.findAll('div', attrs={'data-trackable': 'list'}):
            current_section = self.tag_to_string(div.find('h2'))
            self.log('in section: ', current_section)
            articles = []
            for article in div.findAll('a', href=True, attrs={'data-trackable': 'main-link'}):
                href = article['href']
                # Links that are already absolute must not get PREFIX prepended.
                if href.startswith('http://') or href.startswith('https://'):
                    url = href
                else:
                    url = self.PREFIX + href
                title = self.tag_to_string(article)
                articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
                self.log('title: ', title, ' url: ', url)
            totalfeeds.append((current_section, articles))
        return totalfeeds

    def preprocess_html(self, soup):
        '''
        Give every <img> that has a srcset a plain src and return the soup.

        The first srcset candidate is taken; its last path segment, with any
        query string removed and percent-escapes decoded, becomes the src.
        '''
        for img in soup.findAll('img', srcset=True):
            src = img['srcset'].split(',')[0].strip()
            # presumably the last path segment is the percent-encoded URL of
            # the original image -- verify against a current article page
            src = unquote(src.rpartition('/')[2].partition('?')[0])
            img['src'] = src
        return soup

    def cleanup(self):
        '''Open LOGOUT with self.browser to end the session.'''
        self.browser.open(self.LOGOUT)
	#!/usr/bin/env python2
	# -*- mode: python -*-
	# -*- coding: utf-8 -*-

	__license__ = 'GPL v3'
	__copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>'
	'''
	www.ft.com/todaysnewspaper/uk
	'''

	from calibre.web.feeds.news import BasicNewsRecipe
	from urllib import unquote


	def classes(classes):
	    '''
	    Build a tag matcher selecting elements that carry any of the given classes.

	    classes: space-separated class names (split on single spaces).

	    Returns dict(attrs={'class': predicate}). The predicate hands a falsy
	    class attribute value back unchanged; otherwise it returns the
	    frozenset of wanted names found in the whitespace-split value.
	    '''
	    q = frozenset(classes.split(' '))
	    return dict(attrs={
	        'class': lambda x: x and frozenset(x.split()).intersection(q)})


	class FinancialTimes(BasicNewsRecipe):
	    '''
	    Calibre recipe that builds an e-book from the FT "today's newspaper" UK page.

	    Sections and article links are scraped from INDEX. When a username and
	    password are configured, get_browser() logs in through the two forms on
	    LOGIN (e-mail first, then password); cleanup() opens LOGOUT afterwards.
	    '''

	    # Publication metadata.
	    title = 'Financial Times (UK)'
	    __author__ = 'Darko Miletic'
	    description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."  # noqa
	    publisher = 'The Financial Times Ltd.'
	    category = 'news, finances, politics, UK, World'

	    # Download and conversion options read by BasicNewsRecipe.
	    oldest_article = 7
	    language = 'en_GB'
	    max_articles_per_feed = 250
	    no_stylesheets = True
	    use_embedded_content = False
	    needs_subscription = True
	    encoding = 'utf8'
	    publication_type = 'newspaper'
	    handle_gzip = True
	    compress_news_images = True
	    compress_news_images_auto_size = 50
	    compress_news_images_max_size = 20
	    scale_news_images_to_device = True
	    simultaneous_downloads = 1
	    ignore_duplicate_articles = {'url'}

	    # URLs used by this recipe.
	    LOGIN = 'https://accounts.ft.com/login'
	    LOGOUT = 'https://myaccount.ft.com/logout'
	    INDEX = 'https://www.ft.com/todaysnewspaper/uk'
	    PREFIX = 'https://www.ft.com'  # prepended to site-relative article links

	    # Only elements carrying one of these classes are kept from an article page.
	    keep_only_tags = [
	        classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body'),
	        dict(attrs={'class': lambda x: x and 'main-image' in x.split()}),
	        dict(attrs={'class': lambda x: x and 'article__content' in x.split()}),
	    ]

	    # Elements carrying one of these classes are stripped from the kept content.
	    remove_tags = [
	        classes('n-content-related-box tour-tip n-content-recommended n-content-video o-share-wrapper'),
	        classes('article-tools article__right follow-tags comments infitite-scroll component-share'),
	        # classes('onward-journey ftlabsaudioplayerholder article-info__time-byline n-content-recommended--single-story'),
	        classes('onward-journey ftlabsaudioplayerholder article-print n-content-recommended--single-story'),
	        # dict(attrs={'class': lambda x: x and 'ftlabsaudioplayerholder' in x.split()}),
	    ]

	    extra_css = '''
	body {font-family: Georgia,serif;}
	img {display:block;}
	figcaption {font-style:italic; color:LightGray;}
	'''

	    def get_browser(self):
	        '''
	        Return a browser prepared for fetching FT pages.

	        The browser uses a desktop Firefox user agent, opens INDEX once and,
	        when both username and password are set, submits the e-mail form and
	        then the password form on LOGIN. Referrer headers and an
	        ft-access-decision-policy cookie header are appended in every case.
	        '''
	        USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
	        br = BasicNewsRecipe.get_browser(self, user_agent=USER_AGENT)
	        # Uncomment to trace the HTTP exchange while debugging the login:
	        # br.set_debug_http(True)
	        # br.set_debug_responses(True)
	        br.open(self.INDEX)
	        if self.username is not None and self.password is not None:
	            # Login is a two-step flow: the e-mail form, then the password form.
	            br.open(self.LOGIN)
	            br.select_form(id='email-form')
	            br['email'] = self.username
	            br.submit()
	            br.select_form(id='login-form')
	            br['password'] = self.password
	            br.submit()
	        # NOTE(review): presumably these make requests look as if they arrive
	        # from a privileged referrer -- confirm that FT still honours them.
	        br.addheaders += [('Referer', 'https://www.facebook.com')]
	        br.addheaders += [('referer', 'https://www.facebook.com')]
	        br.addheaders += [('referrer', 'https://www.facebook.com')]
	        br.addheaders += [('Cookie', 'ft-access-decision-policy=PRIVILEGED_REFERER_POLICY;')]
	        return br

	    def open_novisit(self, *args, **kwargs):
	        '''
	        Open a URL through a freshly prepared browser, retrying once on failure.

	        All arguments are forwarded to the browser's open_novisit(). If the
	        first attempt raises, the call pauses for two seconds and tries once
	        more; an exception from the second attempt propagates to the caller.
	        '''
	        import time

	        # get_browser is a method; calling it as a bare name raised NameError.
	        br = self.get_browser()
	        try:
	            response = br.open_novisit(*args, **kwargs)
	        except Exception:
	            # The request may have been throttled; wait a little and retry once.
	            time.sleep(2)
	            response = br.open_novisit(*args, **kwargs)
	        return response

	    def get_cover_url(self):
	        '''
	        Return the URL of today's front-page image, or None if none is found.

	        The dated kiosko.net image URL is probed first. If opening it fails,
	        the kiosko.net FT page is scanned for the first <img> whose src ends
	        with '750.jpg'. When that also yields nothing the failure is logged.
	        '''
	        from datetime import date

	        # Read the date once so all path components refer to the same day.
	        today = date.today()
	        cover = 'http://img.kiosko.net/' + today.strftime('%Y/%m/%d') + '/uk/ft_uk.750.jpg'
	        br = BasicNewsRecipe.get_browser(self)
	        try:
	            br.open(cover)
	        except Exception:
	            index = 'http://en.kiosko.net/uk/np/ft_uk.html'
	            soup = self.index_to_soup(index)
	            for image in soup.findAll('img', src=True):
	                if image['src'].endswith('750.jpg'):
	                    return image['src']
	            self.log("\nCover unavailable")
	            cover = None
	        return cover

	    def parse_index(self):
	        '''
	        Scrape INDEX into a list of (section title, articles) tuples.

	        Each <div data-trackable="list"> is one section, titled by its first
	        <h2>. Each <a data-trackable="main-link"> inside it becomes an article
	        dict with 'title', 'url', 'description' and 'date' keys (the last two
	        are always empty strings).
	        '''
	        soup = self.index_to_soup(self.INDEX)
	        totalfeeds = []
	        for div in soup.findAll('div', attrs={'data-trackable': 'list'}):
	            articles = []
	            current_section = self.tag_to_string(div.find('h2'))
	            self.log('in section: ', current_section)
	            for article in div.findAll('a', href=True, attrs={'data-trackable': 'main-link'}):
	                href = article['href']
	                # Links that are already absolute must not get PREFIX prepended.
	                if href.startswith('http://') or href.startswith('https://'):
	                    url = href
	                else:
	                    url = self.PREFIX + href
	                title = self.tag_to_string(article)
	                articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
	                self.log('title: ', title, ' url: ', url)
	            totalfeeds.append((current_section, articles))
	        return totalfeeds

	    def preprocess_html(self, soup):
	        '''
	        Give every <img> that has a srcset a plain src and return the soup.

	        The first srcset candidate is taken; its last path segment, with any
	        query string removed and percent-escapes decoded, becomes the src.
	        '''
	        for img in soup.findAll('img', srcset=True):
	            src = img['srcset'].split(',')[0].strip()
	            # presumably the last path segment is the percent-encoded URL of
	            # the original image -- verify against a current article page
	            src = unquote(src.rpartition('/')[2].partition('?')[0])
	            img['src'] = src
	        return soup

	    def cleanup(self):
	        '''Open LOGOUT with self.browser to end the session.'''
	        self.browser.open(self.LOGOUT)