Mediapart recipe for Calibre (to be placed in the .config/calibre/custom_recipes folder)
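A quick way to test the recipe, assuming calibre's command-line tools are installed, is to point ebook-convert at the recipe file and pass your Mediapart credentials (the file names and credentials below are placeholders):

ebook-convert mediapart.recipe mediapart.epub --username=you@example.com --password=secret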
# -*- mode: python; coding: utf-8 -*-
from __future__ import print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2016, Daniel Bonnery (contact: DanielBonnery on mobileread.com); 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'

'''
Mediapart
'''

__author__ = '2016, Daniel Bonnery (contact: DanielBonnery on mobileread.com); 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'

import re
import time
from datetime import date, timedelta

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe

class Mediapart(BasicNewsRecipe):
    title = 'Mediapart du ' + time.strftime("%d/%m/%Y")
    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
    author = 'Rédaction Mediapart'
    description = 'Tous les derniers articles du site Mediapart'
    publisher = 'Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2
    extra_css = 'h1 {font-size: medium;}'
    use_embedded_content = False
    no_stylesheets = True
    cover_url = 'http://i.imgur.com/0TaEcS1.jpg'

    oldest_article_date = date.today() - timedelta(days=oldest_article)

    # -- Get the index (the feed at 'http://www.mediapart.fr/articles/feed' only
    # has the 10 most recent items :/)
    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]
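
    # The RSS feed alone is too short, so parse_feeds() appends the extra
    # sections built by my_parse_index() from the 'fil d'actualités' page.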
    def parse_feeds(self):
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds
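
    # Scrape the 'fil d'actualités' page and sort its entries into three extra
    # sections ('Brèves', 'Revue du Web', 'Confidentiel') according to the
    # article type shown on each list item.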
    def my_parse_index(self, la_une):
        articles = []

        breves = []
        liens = []
        confidentiels = []

        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
        page = soup.find('main', {'role': 'main'})
        fils = page.find('ul', {'class': 'post-list universe-journal'})

        for article in fils.findAll('li'):
            try:
                title = article.find('h3', recursive=False)
                if title is None or title['class'] == 'title-specific':
                    continue

                article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()

                for s in title('span'):
                    s.replaceWith(s.renderContents() + "\n")
                url = title.find('a', href=True)['href']

                article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
                # if article_date < self.oldest_article_date:
                #     continue  # too old

                authors = article.findAll('a', {'class': re.compile(r'\bjournalist\b')})
                authors = [self.tag_to_string(a) for a in authors]

                description = article.find('p')
                print("fil", title, "by", authors, ":", description)

                summary = {
                    'title': self.tag_to_string(title).strip(),
                    'author': ', '.join(authors),
                    'url': url,
                    # 'date': u'' + article_date.strftime("%A %d %b %Y"),
                    'description': '\n'.join([self.tag_to_string(d) for d in description]),
                }

                {
                    "Brève": breves,
                    "Lien": liens,
                    "Confidentiel": confidentiels,
                }.get(article_type).append(summary)
            except Exception:
                # Skip entries that do not have the expected markup
                pass

        articles += [('Brèves', breves)] if breves else []
        articles += [('Revue du Web', liens)] if liens else []
        articles += [('Confidentiel', confidentiels)] if confidentiels else []
        return articles
    # -- Print version
    conversion_options = {'smarten_punctuation': True}

    remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]
    # Non-locale-specific date parsing (strptime("%d %b %Y", s) would work with a French locale)
    def parse_french_date(self, date_str):
        date_arr = date_str.lower().split()
        return date(day=int(date_arr[0]),
                    year=int(date_arr[2]),
                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                           'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))
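
    # Resolve each article URL to its printer-friendly '/print/...' version,
    # which is much cleaner to convert.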
    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))

        # Filter out old articles
        # article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
        # if article_date < self.oldest_article_date:
        #     return None

        tools = soup.find('li', {'class': 'print'})
        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
        if link is None:
            print('Error: print link not found')
            return None
        print(link['href'])
        return 'https://mediapart.fr' + link['href']
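
    # Note: select_form(nr=2) below assumes the login form is the third form on
    # https://www.mediapart.fr/login; if the page layout changes, this index may
    # need adjusting.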
    # -- Handle login
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/login')
            br.select_form(nr=2)
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        return br
    # Workaround for articles with scribd content that include
    # <body></body> tags _within_ the body
    preprocess_regexps = [
        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
         lambda match:
             match.group(1) + re.sub(
                 re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '', match.group(2)) + '</body>')
    ]