Skip to content

Instantly share code, notes, and snippets.

Last active March 2, 2021 00:30
What would you like to do?
"Marc Dubuisson croque l’actu" en RSS
# $Id: 521 2020-03-08 21:03:51Z sarterm $
# Dependencies: bs4, lxml, requests
import logging
from bs4 import BeautifulSoup
import re
import requests
# Address of the page to scrape.
# NOTE(review): the URL is empty in this copy of the script (it looks like it
# was lost when the file was extracted) -- set it to the real page address
# before running; requests rejects an empty URL.
url = ""
log = logging.getLogger(__name__)

# Upper bound, in seconds, applied to each HTTP request below.  Without a
# timeout, requests waits indefinitely on a server that stops responding.
REQUEST_TIMEOUT = 30

# A single session is used for both requests so they share the same headers.
session = requests.Session()
session.headers.update({
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    # The cookie value repeats the same "pwv"/"pws" pair twice; it is kept
    # verbatim.  Presumably these are the site's cookie-consent settings --
    # TODO confirm.
    'Cookie': 'pwv=1; pws=functional|analytics|content_recommendation|targeted_advertising|social_media; pwv=1; pws=functional|analytics|content_recommendation|targeted_advertising|social_media',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
})

# First request: fetch and parse the page at `url`.
page = BeautifulSoup(session.get(url, timeout=REQUEST_TIMEOUT).content, 'lxml')

# The redirect target is the first regex match in the stringified document,
# i.e. every character that precedes the first single quote.
# NOTE(review): the pattern looks truncated (a prefix may have been lost in
# extraction) -- confirm against the original script.
redirect_url = re.findall(r"[^']*", str(page.contents))[0]

# Second request: fetch the redirect target and collect its picture components.
page = BeautifulSoup(session.get(redirect_url, timeout=REQUEST_TIMEOUT).content, 'lxml')
posts = page.find_all('div', {'class': 'article__component article__component--picture'})
<rss version="2.0">
<title>Dubuisson / 7 sur 7</title>
<description>Marc Dubuisson croque l’actu</description>
# Walk every picture component found on the page and extract the link target
# and the image's alt text from it.
for post in posts:
    link = post.find('a', {'class': 'slideshow-trigger'})
    img = post.find('img')
    # find() returns None when no matching tag exists; skip such components
    # instead of failing with a TypeError on the subscripts below.
    if link is None or img is None:
        log.debug("Skipping picture component without a link or an image")
        continue
    # NOTE(review): these two values are not used within the visible code;
    # presumably they feed the RSS item output -- confirm against the
    # original script.
    post_url = link['href']
    post_title = img['alt']
<author>Marc Dubuisson</author>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment