Last active
July 31, 2021 18:59
-
-
Save jdstroy/ab91871b07de099b4da2f26772c41b46 to your computer and use it in GitHub Desktop.
Scrape A Prairie Home Companion website for MP3s.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
from bs4 import BeautifulSoup; | |
import urllib3; | |
import urllib.parse as urlparse; | |
import time; | |
import random; | |
class Main:
    """Scrape the A Prairie Home Companion website for audio clip URLs.

    The scraper walks a list of listing pages (the start page followed by
    the numbered ``shows/page/N.html`` pages), follows every show link
    found on them, and collects the ``data-src`` attribute of each audio
    player element on each show page.
    """

    def __init__(self, startURL="https://www.prairiehome.org/", lastPage=114):
        """Create the HTTP connection pool and the list of listing pages.

        Args:
            startURL: Site root; it is also the first listing page scanned.
            lastPage: Number of the last ``shows/page/N.html`` page to scan.
                Numbered pages start at 2, so a value below 2 scans only
                ``startURL``.
        """
        self.http = urllib3.PoolManager()
        self.urls = self._pageUrls(startURL, lastPage)

    @staticmethod
    def _pageUrls(startURL, lastPage):
        """Return ``startURL`` followed by listing pages 2..``lastPage``."""
        urls = [startURL]
        urls.extend(
            urlparse.urljoin(startURL, f"shows/page/{index}.html")
            for index in range(2, lastPage + 1)
        )
        return urls

    def _fetchSoup(self, url):
        """GET ``url`` and return its body parsed with ``html.parser``."""
        response = self.http.request("GET", url)
        # Replace undecodable bytes instead of raising, so one badly encoded
        # page does not abort the whole crawl.
        document = response.data.decode("utf-8", errors="replace")
        return BeautifulSoup(document, "html.parser")

    def getAllAudio(self):
        """Yield every audio clip URL found, in listing-page order.

        This is a generator: pages are fetched lazily as the caller
        iterates, one listing page and then each of its show pages in turn.
        """
        for url in self.urls:
            for show in self.findAllShowPages(url):
                yield from self.getAudioPlayers(show)

    def findAllShowPages(self, url):
        """Return absolute URLs of the show pages linked from a listing page.

        Args:
            url: Listing page to fetch; also the base for relative links.

        Returns:
            A list with one absolute URL per ``<a class="mod_header">``
            element that has an ``href``.
        """
        soup = self._fetchSoup(url)
        anchors = soup.find_all(
            name="a", attrs={"class": "mod_header"}, recursive=True
        )
        # Skip anchors without an href: urljoin(url, None) returns the
        # listing page itself, which would then be crawled as a show page.
        return [
            urlparse.urljoin(url, anchor.get("href"))
            for anchor in anchors
            if anchor.get("href")
        ]

    def getAudioPlayers(self, showPage):
        """Return the audio source URLs embedded in a show page.

        Args:
            showPage: URL of the show page to fetch.

        Returns:
            A list of the ``data-src`` values of every
            ``<div data-playlist="#story-playlist">`` element. Elements
            without a ``data-src`` are skipped so that no ``None`` entries
            reach the caller.
        """
        soup = self._fetchSoup(showPage)
        players = soup.find_all(
            name="div", attrs={"data-playlist": "#story-playlist"}, recursive=True
        )
        return [
            player.get("data-src") for player in players if player.get("data-src")
        ]
# Write an extended-M3U playlist of every scraped clip URL to stdout.
# The header must be exactly "#EXTM3U" for players to recognise the format.
print("#EXTM3U")
for clipUrl in Main().getAllAudio():
    # Wait a little bit so that we don't get banned.
    # The pause (0, 1 or 2 whole seconds) happens once per clip yielded.
    time.sleep(random.randint(0, 2))
    print(clipUrl)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
beautifulsoup4==4.9.3 | |
soupsieve==2.2.1 | |
urllib3==1.26.6 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment