Created
April 4, 2021 17:17
-
-
Save pbeaudequin/2c009bef60c4d37fbe508da1cac45023 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
class litteratureaudioScrapper(object):
    """Build a podcast-style RSS feed from litteratureaudio.com's ranking page.

    The ranking page lists links to book pages; each book page is scanned
    for links to ``.mp3`` files, and every such file becomes one feed entry
    with an ``audio/mpeg`` enclosure.
    """

    # Seconds to wait on the server before a request is abandoned; without a
    # timeout, requests would block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # Page whose "entrybody2" div holds the links to the book pages.
        self.bootstrap_url = 'http://www.litteratureaudio.com/classement-de-nos-livres-audio-gratuits-les-plus-vus'

    def _fetch_soup(self, url):
        """Download *url* and return it parsed with the 'html.parser' backend."""
        html_doc = requests.get(url, timeout=self.REQUEST_TIMEOUT).content
        return BeautifulSoup(html_doc, 'html.parser')

    @staticmethod
    def _chapter_name(link_text):
        """Return *link_text* without a trailing '.mp3', if it has one."""
        suffix = '.mp3'
        if link_text.endswith(suffix):
            return link_text[:-len(suffix)]
        return link_text

    def getTopBooks(self):
        """Return ``(title, url)`` pairs for the books on the ranking page.

        Returns:
            A list of tuples in page order. The list is empty when the
            page has no ``div`` with class ``entrybody2``. Anchors without
            an ``href`` are skipped, and hrefs are resolved against the
            ranking page URL so relative links become absolute.
        """
        soup = self._fetch_soup(self.bootstrap_url)
        links_div = soup.find("div", {"class": "entrybody2"})
        if links_div is None:
            return []

        books = []
        for anchor in links_div.find_all("a"):
            href = anchor.get("href")
            if not href:
                continue
            # .string is None when the anchor wraps nested markup; fall back
            # to the concatenated text so the title is never None.
            title = anchor.string or anchor.get_text(strip=True)
            books.append((title, urljoin(self.bootstrap_url, href)))
        return books

    def parseBook(self, title, url):
        """Return ``(chapter_name, mp3_url)`` pairs found on a book page.

        Args:
            title: Book title. Unused here; kept for interface
                compatibility with existing callers.
            url: Address of the book page to scan.

        Returns:
            A list of tuples in page order. Anchors whose text itself ends
            in ``.mp3`` are preferred; if there are none, every anchor that
            points at an ``.mp3`` file and has text is used. The ``.mp3``
            suffix is removed from the chapter name only when present, so
            names taken from the fallback set are not truncated.
        """
        soup = self._fetch_soup(url)
        mp3_links = [
            a for a in soup.find_all("a")
            if a.get("href") and a["href"].endswith(".mp3") and a.string
        ]
        named_like_files = [a for a in mp3_links if a.string.endswith(".mp3")]
        chosen = named_like_files or mp3_links
        return [
            (self._chapter_name(a.string), urljoin(url, a["href"]))
            for a in chosen
        ]

    def generateRSS(self, topx=10, filename='la.xml'):
        """Write an RSS file covering the first *topx* books of the ranking.

        Each book title is printed as it is processed.

        Args:
            topx: Maximum number of books taken from the ranking page.
            filename: Path of the RSS file to write.

        Returns:
            The path the feed was written to.
        """
        fg = FeedGenerator()
        fg.id(self.bootstrap_url)
        fg.title('Litterature Audio - Top Books Podcast')
        fg.description('Litterature Audio - Top Books Podcast')
        fg.author({'name': 'Philippe Beaudequin', 'email': 'tyboon@gmail.com'})
        fg.link(href=self.bootstrap_url, rel='self')
        fg.language('fr')

        for title, book_url in self.getTopBooks()[:topx]:
            print(title)
            for chapter_name, mp3 in self.parseBook(title, book_url):
                fe = fg.add_entry()
                fe.id(mp3)
                fe.title(f'{title} - {chapter_name}')
                fe.link(href=mp3)
                # The file size is not known at this point, so the
                # enclosure length is written as 0.
                fe.enclosure(mp3, 0, 'audio/mpeg')

        fg.rss_file(filename)
        return filename
# Script entry: build the feed from up to 1000 books of the ranking page.
litteratureaudioScrapper().generateRSS(topx=1000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment