Skip to content

Instantly share code, notes, and snippets.

@jdstroy
Last active Jul 31, 2021
Embed
What would you like to do?
Scrape A Prairie Home Companion website for MP3s.
#!/usr/bin/env python3
from bs4 import BeautifulSoup;
import urllib3;
import urllib.parse as urlparse;
import time;
import random;
class Main:
    """Crawl the A Prairie Home Companion site and collect audio clip URLs.

    The crawl has three levels: listing pages (the start page plus the
    numbered ``/shows/page/N.html`` pages), the show pages each listing
    links to, and the audio player elements found on each show page.
    """

    def __init__(self, startURL="https://www.prairiehome.org/", lastPage=114, http=None):
        """Prepare the HTTP client and the list of listing pages to crawl.

        Args:
            startURL: First listing page; the numbered listing pages are
                resolved against its host.
            lastPage: Highest numbered listing page to visit. Numbering
                starts at 2, so any value below 2 leaves only ``startURL``.
            http: Object exposing a urllib3-style ``request(method, url)``
                method. A new ``urllib3.PoolManager`` is created when omitted.
        """
        self.http = urllib3.PoolManager() if http is None else http
        self.urls = [startURL]
        self.urls.extend(
            urlparse.urljoin(startURL, f"/shows/page/{index}.html")
            for index in range(2, lastPage + 1)
        )

    def getAllAudio(self):
        """Lazily yield every audio URL found, in crawl order.

        Pages are only fetched as the generator is consumed, so a caller can
        pace the crawl by pausing between items.
        """
        for url in self.urls:
            for show in self.findAllShowPages(url):
                yield from self.getAudioPlayers(show)

    def _fetchSoup(self, url):
        """Download *url* and return it parsed with the ``html.parser`` backend."""
        response = self.http.request("GET", url)
        # Replace undecodable bytes instead of raising, so that one badly
        # encoded page does not abort the whole crawl.
        document = response.data.decode("utf-8", errors="replace")
        return BeautifulSoup(document, 'html.parser')

    def findAllShowPages(self, url):
        """Return absolute URLs of the show pages linked from a listing page.

        Args:
            url: Listing page to fetch; relative links are resolved against it.

        Returns:
            A list with one URL per ``<a class="mod_header">`` element that
            carries a non-empty ``href``.
        """
        soup = self._fetchSoup(url)
        anchors = soup.find_all(name='a', attrs={'class': 'mod_header'}, recursive=True)
        hrefs = (anchor.get('href') for anchor in anchors)
        # urljoin() returns the base URL unchanged for an empty/missing href,
        # which would make the listing page itself be crawled as a show page.
        return [urlparse.urljoin(url, href) for href in hrefs if href]

    def getAudioPlayers(self, showPage):
        """Return the audio source URLs advertised on a show page.

        Args:
            showPage: URL of the show page to fetch.

        Returns:
            A list of the ``data-src`` values of every
            ``<div data-playlist="#story-playlist">`` element; elements
            without a ``data-src`` are skipped rather than reported as None.
        """
        soup = self._fetchSoup(showPage)
        players = soup.find_all(name='div', attrs={'data-playlist': "#story-playlist"}, recursive=True)
        sources = (player.get('data-src') for player in players)
        return [source for source in sources if source]
def main():
    """Crawl the site and print an extended M3U playlist of its audio to stdout."""
    # "#EXTM3U" is the header line that marks a playlist as extended M3U.
    print("#EXTM3U")
    for clip in Main().getAllAudio():
        # Wait a little bit so that we don't get banned.
        time.sleep(random.randint(0, 2))
        print(clip)


# Only crawl when executed as a script, so importing this module has no
# network side effects.
if __name__ == "__main__":
    main()
beautifulsoup4==4.9.3
soupsieve==2.2.1
urllib3==1.26.6
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment