-
-
Save a10kiloham/468e68eb4acd59e03e83a51657636736 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
# -*- mode: python -*- | |
# -*- coding: utf-8 -*- | |
__license__ = 'GPL v3' | |
__copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>' | |
''' | |
www.ft.com/todaysnewspaper/uk | |
''' | |
from calibre.web.feeds.news import BasicNewsRecipe | |
from urllib import unquote | |
def classes(classes):
    '''
    Build a BeautifulSoup-style ``attrs`` matcher for CSS class names.

    ``classes`` is a space separated string of class names. The returned
    dict has the shape ``{'attrs': {'class': predicate}}``. The predicate
    receives the value of a tag's ``class`` attribute and yields a truthy
    result when that value shares at least one class name with the
    requested ones.
    '''
    wanted = frozenset(classes.split(' '))

    def shares_wanted_class(value):
        # A missing/empty class attribute is handed back unchanged (falsy).
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': shares_wanted_class}}
class FinancialTimes(BasicNewsRecipe):
    '''
    calibre recipe that downloads the UK print edition of the Financial
    Times from the "today's newspaper" index page.

    A username and password are required (``needs_subscription``); they are
    used by ``get_browser`` to log in before any article is fetched.
    '''

    title = 'Financial Times (UK)'
    __author__ = 'Darko Miletic'
    description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."  # noqa
    publisher = 'The Financial Times Ltd.'
    category = 'news, finances, politics, UK, World'
    oldest_article = 7
    language = 'en_GB'
    max_articles_per_feed = 250
    no_stylesheets = True
    use_embedded_content = False
    needs_subscription = True
    encoding = 'utf8'
    publication_type = 'newspaper'
    handle_gzip = True
    compress_news_images = True
    compress_news_images_auto_size = 50
    compress_news_images_max_size = 20
    scale_news_images_to_device = True
    simultaneous_downloads = 1
    ignore_duplicate_articles = {'url'}

    # Endpoints used by the recipe.
    LOGIN = 'https://accounts.ft.com/login'
    LOGOUT = 'https://myaccount.ft.com/logout'
    INDEX = 'https://www.ft.com/todaysnewspaper/uk'
    # Prepended to site-relative article links found on the index page.
    PREFIX = 'https://www.ft.com'

    # Only these parts of an article page are kept.
    keep_only_tags = [
        classes('topper__headline topper__standfirst n-content-image--full article__time-byline article__body'),
        dict(attrs={'class': lambda x: x and 'main-image' in x.split()}),
        dict(attrs={'class': lambda x: x and 'article__content' in x.split()}),
    ]

    # Page furniture stripped from what was kept above.
    remove_tags = [
        classes('n-content-related-box tour-tip n-content-recommended n-content-video o-share-wrapper'),
        classes('article-tools article__right follow-tags comments infitite-scroll component-share'),
        classes('onward-journey ftlabsaudioplayerholder article-print n-content-recommended--single-story'),
    ]

    extra_css = '''
    body {font-family: Georgia,serif;}
    img {display:block;}
    figcaption {font-style:italic; color:LightGray;}
    '''

    def get_browser(self):
        '''
        Return a browser with a desktop Firefox user agent, logged in to
        ft.com when credentials are available.

        The login is a two step flow: the e-mail address is submitted in
        the ``email-form`` form, then the password in the ``login-form``
        form.
        '''
        USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
        br = BasicNewsRecipe.get_browser(self, user_agent=USER_AGENT)
        # Visit the index before logging in.
        # NOTE(review): presumably done to pick up session cookies - confirm.
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(id='email-form')
            br['email'] = self.username
            br.submit()
            br.select_form(id='login-form')
            br['password'] = self.password
            br.submit()
        # Extra headers sent with every subsequent request.
        # NOTE(review): these look like an attempt to appear as a visitor
        # referred from Facebook - confirm they are still needed.
        br.addheaders += [('Referer', 'https://www.facebook.com')]
        br.addheaders += [('referer', 'https://www.facebook.com')]
        br.addheaders += [('referrer', 'https://www.facebook.com')]
        br.addheaders += [('Cookie', 'ft-access-decision-policy=PRIVILEGED_REFERER_POLICY;')]
        return br

    def open_novisit(self, *args, **kwargs):
        '''
        Open a URL through a browser obtained from ``get_browser``.

        All arguments are forwarded to the browser's ``open_novisit``. If
        the request fails with an error whose ``code`` attribute is 503,
        the call is retried once after a two second pause; any other
        error, or a failure of the retry, propagates to the caller.

        NOTE: a new browser is created (and logged in) on every call.
        '''
        import time

        br = self.get_browser()
        try:
            return br.open_novisit(*args, **kwargs)
        except Exception as e:
            if getattr(e, 'code', None) != 503:
                raise
            # The server is throttling us, wait a little and try once more.
            time.sleep(2)
            return br.open_novisit(*args, **kwargs)

    def get_cover_url(self):
        '''
        Return the URL of today's front page image on kiosko.net.

        The dated image URL is tried first. If it cannot be opened, the
        kiosko.net page for the paper is scanned for an image whose source
        ends in ``750.jpg``. Returns ``None`` when neither works.
        '''
        from datetime import date

        # Take the date once so all URL components refer to the same day.
        today = date.today()
        cover = 'http://img.kiosko.net/' + today.strftime('%Y/%m/%d') + '/uk/ft_uk.750.jpg'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except Exception:
            index = 'http://en.kiosko.net/uk/np/ft_uk.html'
            soup = self.index_to_soup(index)
            for image in soup.findAll('img', src=True):
                if image['src'].endswith('750.jpg'):
                    return image['src']
            self.log("\nCover unavailable")
            cover = None
        return cover

    def parse_index(self):
        '''
        Build the list of feeds from the index page.

        Every ``div`` with ``data-trackable="list"`` is a section whose
        title is the text of its ``h2``; every ``a`` with
        ``data-trackable="main-link"`` inside it is an article.

        Returns a list of ``(section_title, articles)`` tuples where each
        article is a dict with ``title``, ``url``, ``description`` and
        ``date`` keys.
        '''
        soup = self.index_to_soup(self.INDEX)
        totalfeeds = []
        for div in soup.findAll('div', attrs={'data-trackable': 'list'}):
            articles = []
            current_section = self.tag_to_string(div.find('h2'))
            self.log('in section: ', current_section)
            for article in div.findAll('a', href=True, attrs={'data-trackable': 'main-link'}):
                href = article['href']
                # Only site-relative links need the prefix.
                if href.startswith(('http://', 'https://')):
                    url = href
                else:
                    url = self.PREFIX + href
                title = self.tag_to_string(article)
                articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
                self.log('title: ', title, ' url: ', url)
            totalfeeds.append((current_section, articles))
        return totalfeeds

    def preprocess_html(self, soup):
        '''
        Give every image that has a ``srcset`` a plain ``src``.

        The first ``srcset`` candidate is taken; the text after its last
        ``/`` and before any ``?`` is URL-unquoted and stored as ``src``.
        '''
        for img in soup.findAll('img', srcset=True):
            src = img['srcset'].split(',')[0].strip()
            src = unquote(src.rpartition('/')[2].partition('?')[0])
            img['src'] = src
        return soup

    def cleanup(self):
        '''Log out of ft.com by opening the logout URL.'''
        self.browser.open(self.LOGOUT)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment