Skip to content

Instantly share code, notes, and snippets.

@axi0m
Last active March 28, 2022 18:23
Show Gist options
  • Save axi0m/be404e2aae6cd78b4db7fe1ed7b2d3c5 to your computer and use it in GitHub Desktop.
Save axi0m/be404e2aae6cd78b4db7fe1ed7b2d3c5 to your computer and use it in GitHub Desktop.
Download podcast episodes from RSS feed
#!/usr/bin/python3
import feedparser
import argparse
import requests
import sys
from rich.console import Console
from os import listdir
from os.path import isfile, join
from pathlib import Path
from pathlib import PurePath
# Globals
# RSS feed URLs that main() iterates over
podcasts = [
    "https://darknetdiaries.com/feedfree.xml",
    "https://feeds.mozilla-podcasts.org/irl",
    "https://realpython.com/podcasts/rpp/feed",
    "https://feeds.eff.org/howtofixtheinternet",
]
# Shared rich Console used for every status message in this script
console = Console()
# References
# https://dusty.phillips.codes/2018/08/13/python-loading-pathlib-paths-with-argparse/
def parse_rss_url(rss_url):
    ''' Hand rss_url to feedparser.parse and return the parsed feed object it produces '''
    # :input: RSS feed URL string
    # :returns: Whatever feedparser.parse returns for that URL
    return feedparser.parse(rss_url)
def format_filenames(title):
    ''' Build a filesystem-safe .mp3 filename from an Episode Title

    Spaces become underscores and the characters : - ? + @ , | " are dropped.
    Path separators (/ and \\) are dropped too: the title comes from the RSS
    feed, and a separator left in place would make the download path point
    into a sub-directory that does not exist (or outside the podcast folder).
    '''
    # :input: Episode title string from the RSS feed metadata
    # :returns: Sanitised filename string ending in .mp3
    removed_characters = ':-?+@,|"' + '/\\'
    # One translation table: map space -> underscore, delete everything in removed_characters
    table = str.maketrans(' ', '_', removed_characters)
    # The extension contains none of the affected characters, so appending it last is equivalent
    return title.translate(table) + '.mp3'
def map_files(directory):
    ''' Return the names of every entry directly inside the given directory '''
    # :input: Path object for the directory to inspect
    # :returns: List of entry names (name only, no parent path), in iterdir() order
    return [entry.name for entry in directory.iterdir()]
def download_episodes(feed_dict, existing_files, podcast_absolute_path, warnings):
    ''' Download every audio/mpeg enclosure in the feed that is not already on disk

    A failed download is reported on the console and skipped; it does not
    abort the remaining episodes.
    '''
    # :input: feed_dict - parsed feed object exposing .entries
    # :input: existing_files - iterable of filenames already present in the podcast directory
    # :input: podcast_absolute_path - Path of the directory episodes are written to
    # :input: warnings - when True, report episodes that are skipped because they already exist
    # :returns: None
    # Set gives O(1) membership tests and lets us record files downloaded during this run
    known_files = set(existing_files)
    # .entries is easy way to access all individual episodes in an RSS feed
    for episode in feed_dict.entries:
        title = episode.get('title')
        if not title:
            # Without a title there is no filename to write to
            continue
        # Format the filename by stripping non-compliant characters
        filename = format_filenames(title)
        download_path = podcast_absolute_path.joinpath(filename)
        # .links usually has multiple links, a text/html link with description and a audio/mpeg with actual audio file
        for link in episode.get('links', []):
            # Links are not guaranteed to carry a type or an href, so never index them directly
            if link.get('type') != 'audio/mpeg' or not link.get('href'):
                continue
            if filename in known_files:
                if warnings:
                    console.print(f'[-] WARN - Episode already downloaded [green]{filename}[/green] skipping...', style="bold red")
                continue
            console.print(f'[+] INFO - New episode, downloading [blue]{filename}[/blue]', style="bold green")
            try:
                # Stream to disk so a large episode is never held fully in memory;
                # the timeout stops a stalled server from hanging the whole run
                with requests.get(link['href'], stream=True, timeout=(10, 60)) as response:
                    # Do not save an HTTP error page as if it were audio
                    response.raise_for_status()
                    with open(download_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=64 * 1024):
                            f.write(chunk)
            except (requests.RequestException, OSError) as err:
                # markup=False: the error text may contain square brackets
                console.print(f'[!] ERROR - Failed to download {filename}: {err}', style="bold red", markup=False)
                # Remove any partial file, otherwise the next run would treat it as downloaded
                try:
                    download_path.unlink()
                except OSError:
                    # Nothing was written (or it cannot be removed); nothing more to do here
                    pass
                continue
            # Remember it so a duplicate enclosure or title is not downloaded twice
            known_files.add(filename)
def check_directory(directory):
    ''' Return the given path when it is an existing directory, otherwise None '''
    # :input: Path object to test
    # :returns: The same Path object, or None if it is missing or not a directory
    if not directory.is_dir():
        return None
    return directory
def make_directory(absolute_path):
    ''' Create podcast directory

    Creating a directory that already exists is not an error. A missing
    parent directory is reported on the console rather than raised.
    '''
    # :input: Path object of the directory to create
    # :returns: None in every case
    try:
        # exist_ok tolerates the directory having been created in the meantime
        absolute_path.mkdir(exist_ok=True)
    except FileNotFoundError:
        # The message previously referenced an undefined variable, which raised NameError here
        console.print(f'[!] Parent directory was not found for provided folder name: {absolute_path}', style="bold yellow")
    return None
def main():
    ''' Download all podcasts episodes not already downloaded

    Parses --directory / --warnings from the command line, then for each URL
    in the module-level podcasts list: parses the feed, makes sure a
    sub-directory named after the feed title exists under the parent
    directory, and downloads the episodes missing from it. Exits with
    status 1 when the parent directory is missing or invalid.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--directory", type=Path, action="store", dest="parent_directory", help="Parent directory for your local Podcast files")
    parser.add_argument("--warnings", action="store_true", dest="warnings", help="Enable warning messages")
    parser.set_defaults(warnings=False)
    args = parser.parse_args()
    parent_directory = args.parent_directory
    warnings = args.warnings
    if parent_directory is None:
        parser.print_help()
        sys.exit(1)
    if check_directory(parent_directory) is None:
        console.print('[!] Directory supplied is not a valid existing directory', style="bold yellow")
        sys.exit(1)
    # Iterate over all of the defined podcast RSS URLs
    for podcast in podcasts:
        # Process the RSS feed and get a list of entries for the podcast
        feed = parse_rss_url(podcast)
        # The feed title names the directory; a feed that failed to load may not have one
        podcast_directory_name = feed.get('feed', {}).get('title')
        if not podcast_directory_name:
            console.print(f'[!] Could not read a feed title from {podcast}, skipping...', style="bold yellow")
            continue
        # Join the podcast directory name with parent provided by user
        podcast_absolute_path = parent_directory.joinpath(Path(podcast_directory_name))
        if check_directory(podcast_absolute_path) is None:
            # New podcast: create its directory and start with an empty file list
            # (previously existing_files was unbound, or stale from the prior feed, on this path)
            make_directory(podcast_absolute_path)
            if check_directory(podcast_absolute_path) is None:
                # Creation failed, so there is nowhere to write episodes to
                console.print(f'[!] Could not create podcast directory {podcast_absolute_path}, skipping...', style="bold yellow")
                continue
            existing_files = []
        else:
            # Otherwise generate a list of all the files in the existing directory
            existing_files = map_files(podcast_absolute_path)
        download_episodes(feed, existing_files, podcast_absolute_path, warnings)
# Run the downloader only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment