Skip to content

Instantly share code, notes, and snippets.

@MatthieuSarter
Last active March 2, 2021 00:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MatthieuSarter/86a14a13593db95faede44160b528060 to your computer and use it in GitHub Desktop.
Save MatthieuSarter/86a14a13593db95faede44160b528060 to your computer and use it in GitHub Desktop.
"Marc Dubuisson croque l’actu" en RSS
#!/usr/bin/python3
# $Id: dubuisson-7sur7.py 521 2020-03-08 21:03:51Z sarterm $
# Dependencies : bs4, lxml, requests
import logging
import re
from xml.sax.saxutils import escape

from bs4 import BeautifulSoup
import requests
# Script body: fetch the cartoon listing page from 7sur7.be, follow the
# privacy-gate "accept" link when the site serves one, and print an RSS 2.0
# feed (one <item> per picture block) on standard output.

# Page that lists the cartoons; also emitted as the channel <link>.
url = "https://www.7sur7.be/extra/marc-dubuisson-croque-l-actu~a7839cb5/"

# Seconds to wait for the site before failing instead of hanging forever.
REQUEST_TIMEOUT = 30

# Diagnostics go through a StreamHandler (stderr by default), so they never
# end up inside the feed that is printed on stdout.
log = logging.getLogger(__name__)
log.addHandler(logging.StreamHandler())

# A single session is reused for both requests below.
session = requests.Session()
session.headers.update({
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Cookie': 'pwv=1; pws=functional|analytics|content_recommendation|targeted_advertising|social_media; pwv=1; pws=functional|analytics|content_recommendation|targeted_advertising|social_media',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
})

page = BeautifulSoup(session.get(url, timeout=REQUEST_TIMEOUT).content, 'lxml')

# The script expects the first response to embed a privacy-gate "accept" URL
# that must be fetched to reach the real article. If no such URL is present,
# keep the page we already have instead of crashing with IndexError.
gate = re.search(r"https://www.7sur7.be/privacy-gate/accept[^']*", str(page.contents))
if gate:
    redirect_url = gate.group(0)
    page = BeautifulSoup(
        session.get(redirect_url, timeout=REQUEST_TIMEOUT).content, 'lxml')
else:
    log.warning("No privacy-gate link found; parsing the first response as-is")

posts = page.find_all('div', {'class': 'article__component article__component--picture'})

print(f"""
<rss version="2.0">
<channel>
<title>Dubuisson / 7 sur 7</title>
<link>{escape(url)}</link>
<description>Marc Dubuisson croque l’actu</description>
""")

for post in posts:
    link = post.find('a', {'class': 'slideshow-trigger'})
    img = post.find('img')
    # A picture block without its anchor or image cannot produce a usable
    # item; skip it rather than abort the whole feed.
    if link is None or img is None or not link.get('href'):
        log.warning("Skipping a picture block without a link or an image")
        continue
    # Everything scraped from the page is XML-escaped before interpolation:
    # a raw '&' or '<' would make the feed ill-formed, and the <img> markup
    # must be entity-encoded to be carried as HTML inside <description>.
    post_url = escape(link['href'])
    post_title = escape(img.get('alt', ''))
    post_description = escape(str(img))
    print(f"""
<item>
<title>{post_title}</title>
<link>{post_url}</link>
<author>Marc Dubuisson</author>
<description>{post_description}</description>
</item>
""")

print("""
</channel>
</rss>
""")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment