Last active
November 9, 2016 18:54
-
-
Save kgaut/cf62063c38e45e78aaf30e6efdd9d09b to your computer and use it in GitHub Desktop.
Recipe Mediapart pour Calibre (à placer dans le dossier .config/calibre/custom_recipes)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- mode:python -*- | |
from __future__ import unicode_literals | |
__license__ = 'GPL v3' | |
__copyright__ = '2016, Daniel Bonnery ? (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>' | |
''' | |
Mediapart | |
''' | |
__author__ = '2016, Daniel Bonnery (contact: DanielBonnery sur mobileread.com), 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>' | |
import re | |
from calibre.ebooks.BeautifulSoup import BeautifulSoup | |
from calibre.web.feeds.news import BasicNewsRecipe | |
from calibre.web.feeds import feeds_from_index | |
from datetime import date,timedelta | |
class Mediapart(BasicNewsRecipe):
    """Calibre news recipe for the French subscription newspaper Mediapart.

    The e-book is made of the site's RSS feed ("La Une") plus up to three
    extra sections ("Brèves", "Revue du Web", "Confidentiel") scraped from
    the news-wire page, because the RSS feed only carries the 10 latest
    items. Articles are downloaded in their print version, using the
    subscriber's credentials when they are provided.
    """

    # Evaluated once, when the class body runs. ``date`` (already imported by
    # this file) is used because the ``time`` module is not imported.
    title = 'Mediapart du ' + date.today().strftime("%d/%m/%Y")
    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
    author = 'Rédaction Mediapart'
    description = 'Tous les derniers articles du site Mediapart'
    publisher = 'Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2
    extra_css = 'h1 {font-size: medium;}'
    use_embedded_content = False
    no_stylesheets = True
    cover_url = 'http://i.imgur.com/0TaEcS1.jpg'

    # Cut-off date derived from ``oldest_article``. Nothing in this class
    # filters on it at the moment; it is kept for code that may read it.
    oldest_article_date = date.today() - timedelta(days=oldest_article)

    # The feed at 'http://www.mediapart.fr/articles/feed' only has the 10
    # last elements, hence the extra scraping done in my_parse_index().
    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]

    # -- print-version
    conversion_options = {'smarten_punctuation': True}
    remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]

    # Workaround for articles with scribd content that include
    # <body></body> tags _within_ the body: keep the outermost pair and drop
    # every <body>/</body> found in between.
    preprocess_regexps = [
        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
         lambda match:
             match.group(1) + re.sub(
                 re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '', match.group(2)) + '</body>')
    ]

    # (article type label found on the news-wire page, e-book section title),
    # in the order the sections are emitted.
    _SECTIONS = (
        ('Brève', 'Brèves'),
        ('Lien', 'Revue du Web'),
        ('Confidentiel', 'Confidentiel'),
    )

    # Index 0 is a placeholder so that a month's position is its number.
    _FRENCH_MONTHS = (
        None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
        'août', 'septembre', 'octobre', 'novembre', 'décembre',
    )

    def parse_feeds(self):
        """Return the RSS feeds followed by the sections scraped from the site."""
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds

    def my_parse_index(self, la_une):
        """Scrape the news-wire page into ``(section title, articles)`` pairs.

        :param la_une: feeds already parsed from RSS. Unused; kept so the
            signature stays compatible with existing callers.
        :return: list of ``(title, [article dict, ...])`` tuples, one per
            non-empty section, in the order of ``_SECTIONS``. Each article
            dict has the keys ``title``, ``author``, ``url`` and
            ``description``. An empty list is returned when the page layout
            is not recognised, so that the RSS feed is still delivered.
        """
        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
        page = soup.find('main', {'role': 'main'})
        fils = None
        if page is not None:
            fils = page.find('ul', {'class': 'post-list universe-journal'})
        if fils is None:
            self.log.warn('Mediapart: article list not found on the news-wire page')
            return []

        buckets = {label: [] for label, _ in self._SECTIONS}
        for entry in fils.findAll('li'):
            # One malformed entry must not abort the whole download, but the
            # failure is logged instead of being swallowed silently.
            try:
                parsed = self._summarize_entry(entry)
            except Exception as err:
                self.log.warn('Mediapart: skipping unreadable entry (%r)' % (err,))
                continue
            if parsed is None:
                continue
            article_type, summary = parsed
            bucket = buckets.get(article_type)
            # Entries of any other type are left out of the extra sections.
            if bucket is not None:
                bucket.append(summary)

        return [(section, buckets[label])
                for label, section in self._SECTIONS if buckets[label]]

    def _summarize_entry(self, entry):
        """Turn one ``<li>`` of the news wire into ``(type label, article dict)``.

        Returns ``None`` when the entry has no usable heading, type link or
        article link. All text is read through ``tag_to_string`` so that it
        is unicode: comparing the UTF-8 bytes returned by ``renderContents``
        with the unicode literals of this file never matches for accented
        labels such as "Brève".
        """
        heading = entry.find('h3', recursive=False)
        if heading is None or self._has_class(heading, 'title-specific'):
            return None

        type_link = entry.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')})
        if type_link is None:
            return None
        article_type = self.tag_to_string(type_link).strip()

        # Put a line break after each <span> of the heading so that its text
        # does not run into what follows.
        for span in heading('span'):
            span.replaceWith(self.tag_to_string(span) + '\n')

        link = heading.find('a', href=True)
        if link is None:
            return None

        journalists = entry.findAll('a', {'class': re.compile(r'\bjournalist\b')})
        blurb = entry.find('p')
        summary = {
            'title': self.tag_to_string(heading).strip(),
            'author': ', '.join([self.tag_to_string(a) for a in journalists]),
            'url': link['href'],
            # The publication date is deliberately not parsed here: nothing
            # used it, and a parse failure used to drop the article.
            'description': '' if blurb is None else '\n'.join(
                [self.tag_to_string(d) for d in blurb]),
        }
        self.log.debug('fil %s by %s' % (summary['title'], summary['author']))
        return article_type, summary

    @staticmethod
    def _has_class(tag, name):
        """Tell whether ``name`` is one of the CSS classes of ``tag``.

        Accepts both a space-separated string and a list of class names, and
        a tag without any ``class`` attribute.
        """
        classes = tag.get('class') or ''
        if not isinstance(classes, (list, tuple)):
            classes = classes.split()
        return name in classes

    def parse_french_date(self, date_str):
        """Parse a date such as ``"9 novembre 2016"`` without using the locale.

        (``strptime("%d %b %Y", s)`` would only work under a French locale.)

        :param date_str: text or UTF-8 bytes, starting with the day number,
            the full French month name (any case) and the year, separated by
            whitespace. Extra trailing words are ignored.
        :return: the matching ``datetime.date``.
        :raises IndexError: if fewer than three words are given.
        :raises ValueError: if the day or year is not a number, the month
            name is unknown, or the resulting date does not exist.
        """
        # Decode first: accented month names given as bytes never compare
        # equal to the unicode names in _FRENCH_MONTHS.
        if isinstance(date_str, bytes):
            date_str = date_str.decode('utf-8', 'replace')
        date_arr = date_str.lower().split()
        return date(day=int(date_arr[0]),
                    year=int(date_arr[2]),
                    month=self._FRENCH_MONTHS.index(date_arr[1]))

    def print_version(self, url):
        """Return the URL of the print version of the article at ``url``.

        The article page is fetched with the recipe's browser and searched
        for the link of its "print" tool. When that link cannot be found,
        ``url`` itself is returned, with a warning, instead of raising.
        """
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
        tools = soup.find('li', {'class': 'print'})
        link = None
        if tools is not None:
            link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
        if link is None:
            self.log.warn('Mediapart: print link not found for %s' % (url,))
            return url
        self.log.debug('Mediapart: print version %s' % (link['href'],))
        return 'https://mediapart.fr' + link['href']

    # -- Handle login
    def get_browser(self):
        """Return a browser, logged in when a username and password are set."""
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/login')
            # NOTE(review): the third form of the page is assumed to be the
            # login form - confirm against the current site markup.
            br.select_form(nr=2)
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        return br
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment