Last active
March 28, 2022 18:23
-
-
Save axi0m/be404e2aae6cd78b4db7fe1ed7b2d3c5 to your computer and use it in GitHub Desktop.
Download podcast episodes from RSS feed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
import argparse
import sys
from os import listdir
from os.path import isfile, join
from pathlib import Path
from pathlib import PurePath

import feedparser
import requests
from rich.console import Console
from rich.markup import escape
# Globals
# RSS feed URLs to process; main() iterates over this list and stores each
# feed's episodes in a sub-directory named after the feed title.
podcasts = [
    "https://darknetdiaries.com/feedfree.xml",
    "https://feeds.mozilla-podcasts.org/irl",
    "https://realpython.com/podcasts/rpp/feed",
    "https://feeds.eff.org/howtofixtheinternet",
]
# Shared rich Console instance; all status output in this script goes through it
console = Console()

# References
# https://dusty.phillips.codes/2018/08/13/python-loading-pathlib-paths-with-argparse/
def parse_rss_url(rss_url):
    """Parse the RSS feed at ``rss_url`` and return feedparser's result object.

    :param rss_url: URL of the RSS feed, passed straight to ``feedparser.parse``
    :returns: whatever ``feedparser.parse`` returns for that URL
    """
    return feedparser.parse(rss_url)
# Single-pass translation table: spaces become underscores, and the listed
# characters are deleted. '|' was added for the EFF podcast titles; '/' and
# '\\' are removed because the title comes from the feed and must never be
# able to introduce extra path components into the download path.
_FILENAME_TRANSLATION = str.maketrans(' ', '_', ':-?+@,|"/\\')


def format_filenames(title):
    """Turn an episode title into the file name used for the download.

    The ``.mp3`` extension is appended, spaces are replaced with ``_`` and the
    characters ``: - ? + @ , | " / \\`` are removed.

    :param title: episode title string from the RSS feed metadata
    :returns: sanitised file name ending in ``.mp3``
    """
    # The extension contains none of the translated characters, so appending
    # it before or after translating gives the same result.
    return (title + '.mp3').translate(_FILENAME_TRANSLATION)
def map_files(directory):
    """Return the names of all entries directly inside ``directory``.

    :param directory: pathlib path object of the directory to list
    :returns: list of entry names (final path components only), in the order
              ``iterdir()`` yields them
    """
    return [entry.name for entry in directory.iterdir()]
# (connect, read) timeouts in seconds so a stalled server cannot hang the run
_REQUEST_TIMEOUT = (10, 60)
# Size of the chunks streamed to disk; avoids holding a whole episode in memory
_CHUNK_SIZE = 256 * 1024


def _fetch_to_file(url, destination):
    """Stream ``url`` into ``destination``; return True on success, else False.

    The data is first written to a sibling ``<name>.part`` file and only renamed
    to ``destination`` once the download completed, so an interrupted or failed
    transfer never leaves a truncated file under the final name.
    """
    partial_path = destination.with_name(destination.name + '.part')
    try:
        with requests.get(url, stream=True, timeout=_REQUEST_TIMEOUT) as response:
            # Without this an HTTP error page would be saved as the episode
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=_CHUNK_SIZE):
                    f.write(chunk)
        partial_path.replace(destination)
        return True
    except requests.RequestException as err:
        console.print(f'[!] ERROR - Download failed for {escape(destination.name)}: {escape(str(err))}', style="bold yellow")
        return False
    finally:
        # After a successful rename the .part file is gone and this is a no-op;
        # on any failure (including Ctrl-C) it removes the incomplete file.
        try:
            partial_path.unlink()
        except FileNotFoundError:
            pass


def download_episodes(feed_dict, existing_files, podcast_absolute_path, warnings):
    """Download every not-yet-present ``audio/mpeg`` episode of a feed.

    :param feed_dict: parsed feed; its ``.entries`` are the individual episodes
    :param existing_files: file names already present in the podcast directory
    :param podcast_absolute_path: pathlib path of the destination directory
    :param warnings: when truthy, report episodes that are skipped because a
                     file with the same name already exists
    :returns: None

    Failed downloads are reported on the console and skipped; the remaining
    episodes are still processed.
    """
    # Local set: O(1) membership tests, and files written during this run can
    # be recorded without mutating the caller's list.
    known_files = set(existing_files)

    # .entries is easy way to access all individual episodes in an RSS feed
    for episode in feed_dict.entries:
        # .links usually has multiple links, a text/html link with description
        # and an audio/mpeg link with the actual audio file
        for link in episode.links:
            # .get() because a link is not guaranteed to carry a 'type' key
            if link.get('type') != 'audio/mpeg':
                continue

            # Format the filename by stripping non-compliant characters
            filename = format_filenames(episode.title)
            # Escaped copy for display, so square brackets in a title are
            # printed literally instead of being parsed as rich markup
            shown_name = escape(filename)

            if filename in known_files:
                # Already on disk: only mention it when warnings are enabled
                if warnings:
                    console.print(f'[-] WARN - Episode already downloaded [green]{shown_name}[/green] skipping...', style="bold red")
                continue

            console.print(f'[+] INFO - New episode, downloading [blue]{shown_name}[/blue]', style="bold green")
            download_path = podcast_absolute_path.joinpath(filename)
            if _fetch_to_file(link['href'], download_path):
                known_files.add(filename)
def check_directory(directory):
    """Return ``directory`` if it is an existing directory, otherwise ``None``.

    :param directory: pathlib path object to test
    :returns: the same path object, or None when ``is_dir()`` is false
    """
    return directory if directory.is_dir() else None
def make_directory(absolute_path):
    """Create the podcast directory at ``absolute_path``.

    Only the final path component is created (``Path.mkdir()`` without
    ``parents``); if its parent does not exist a message is printed instead
    of raising.

    :param absolute_path: pathlib path object of the directory to create
    :returns: None in every case
    :raises FileExistsError: propagated from ``mkdir()`` when the path already
                             exists
    """
    try:
        absolute_path.mkdir()
    except FileNotFoundError:
        # Report the folder we were asked to create (the original message
        # referenced an undefined variable and raised NameError here)
        console.print(f'[!] Parent directory was not found for provided folder name: {absolute_path.name}', style="bold yellow")
    return None
def main():
    """Download all podcast episodes not already downloaded.

    Command line:
      --directory  parent directory that holds one sub-directory per podcast
      --warnings   also report episodes skipped because they already exist

    Exits with status 1 when ``--directory`` is missing or is not an existing
    directory. Feeds that cannot be read, or whose directory cannot be
    created, are reported and skipped so the remaining feeds are still
    processed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--directory", type=Path, action="store", dest="parent_directory", help="Parent directory for your local Podcast files")
    parser.add_argument("--warnings", action="store_true", dest="warnings", help="Enable warning messages")
    parser.set_defaults(warnings=False)
    args = parser.parse_args()

    parent_directory = args.parent_directory
    warnings = args.warnings

    if not parent_directory:
        parser.print_help()
        sys.exit(1)

    if check_directory(parent_directory) is None:
        console.print('[!] Directory supplied is not a valid existing directory', style="bold yellow")
        sys.exit(1)

    # Iterate over all of the defined podcast RSS URLs
    for podcast in podcasts:
        # Process the RSS feed and get a list of entries for the podcast
        feed = parse_rss_url(podcast)

        # The feed title names the podcast directory; a feed that could not
        # be fetched or parsed has none, so skip it instead of crashing
        podcast_directory_name = feed['feed'].get('title')
        if not podcast_directory_name:
            console.print(f'[!] Could not read a feed title from {podcast}, skipping...', style="bold yellow")
            continue

        # Join the podcast directory name with parent provided by user
        podcast_absolute_path = parent_directory.joinpath(Path(podcast_directory_name))

        # If the podcast directory does not exist, create it
        if check_directory(podcast_absolute_path) is None:
            make_directory(podcast_absolute_path)
            # make_directory() always returns None, so verify the result
            if check_directory(podcast_absolute_path) is None:
                console.print(f'[!] Could not create a directory for {podcast}, skipping...', style="bold yellow")
                continue

        # Always list the directory: a freshly created one simply yields an
        # empty list, so existing_files is defined for every feed
        existing_files = map_files(podcast_absolute_path)

        download_episodes(feed, existing_files, podcast_absolute_path, warnings)
# Run the downloader only when this file is executed as a script
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment