# axi0m/download_podcast_episodes.py

## download_podcast_episodes.py
#!/usr/bin/python3
import feedparser
import argparse
import requests
import sys
from rich.console import Console
from os import listdir
from os.path import isfile, join
from pathlib import Path
from pathlib import PurePath

# Globals
# RSS feed URLs of the podcasts this script downloads; main() walks the list
# in order, one feed per entry.
podcasts = [
    "https://darknetdiaries.com/feedfree.xml",
    "https://feeds.mozilla-podcasts.org/irl",
    "https://realpython.com/podcasts/rpp/feed",
    "https://feeds.eff.org/howtofixtheinternet",
]

# Shared rich Console; the status, warning and error messages printed by the
# functions below all go through console.print().
console = Console()

# References
# https://dusty.phillips.codes/2018/08/13/python-loading-pathlib-paths-with-argparse/

def parse_rss_url(rss_url):
    ''' Hand rss_url to feedparser.parse and return the parsed result unchanged '''
    return feedparser.parse(rss_url)

def format_filenames(title):
    ''' Turn an episode title into a filesystem-safe filename ending in .mp3 '''
    # :input: Episode title string taken from the RSS feed
    # :returns: Filename string with the .mp3 extension appended

    # Spaces become underscores; every character listed below is dropped.
    # The first group is the punctuation this function has always stripped.
    # The second group adds the path separators and the remaining
    # Windows-reserved characters: titles come from a remote feed, and a
    # separator in the name would make the download land outside the podcast
    # directory (or fail to open at all).
    always_stripped = ':-?+@,|"'
    separators_and_reserved = '/\\*<>'
    table = str.maketrans(' ', '_', always_stripped + separators_and_reserved)

    # None of the characters in '.mp3' are touched by the table, so cleaning
    # the title first and appending the extension afterwards gives the same
    # result as the reverse order.
    return title.translate(table) + '.mp3'


def map_files(directory):
    ''' Return the name of every entry found directly inside directory '''
    # :input: Path object of the directory to scan
    # :returns: List of entry names (final path component only)

    return [entry.name for entry in directory.iterdir()]


def _fetch_to_file(url, destination):
    ''' Stream url into destination; return True on success, False on failure '''

    # Download into a sibling ".part" file first so an interrupted transfer
    # never leaves a truncated file under the final episode name.
    partial_path = destination.with_name(destination.name + '.part')
    try:
        # stream=True avoids holding a whole episode in memory; the timeout
        # keeps a stalled server from hanging the run indefinitely.
        with requests.get(url, stream=True, timeout=30) as response:
            # Refuse to save an HTTP error page as if it were audio
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        partial_path.replace(destination)
    except (requests.RequestException, OSError) as error:
        # markup=False: the error text may contain square brackets
        console.print(f'[!] ERROR - Download of {destination.name} failed: {error}', style="bold red", markup=False)
        try:
            partial_path.unlink()
        except OSError:
            # Best-effort cleanup; the partial file may never have been created
            pass
        return False
    return True


def download_episodes(feed_dict, existing_files, podcast_absolute_path, warnings):
    ''' Download every audio/mpeg link in the feed that is not on disk yet '''
    # :input: feed_dict - parsed feed object exposing .entries
    # :input: existing_files - filenames already present in the podcast directory
    # :input: podcast_absolute_path - Path of the directory episodes are written to
    # :input: warnings - when True, report episodes skipped as already downloaded
    # :returns: None

    # Private set: constant-time lookups, and names fetched during this run
    # can be remembered without mutating the caller's list.
    known_files = set(existing_files)

    # .entries is easy way to access all individual episodes in an RSS feed
    for episode in feed_dict.entries:
        # .links usually has multiple links, a text/html link with description and a audio/mpeg with actual audio file
        for link in episode.links:
            # Not every link carries a 'type' key, so look it up defensively
            if link.get('type') != 'audio/mpeg':
                continue

            # Format the filename by stripping non-compliant characters
            filename = format_filenames(episode.title)

            if filename in known_files:
                # If we have warnings enabled and file already downloaded, write to console
                if warnings:
                    console.print(f'[-] WARN - Episode already downloaded [green]{filename}[/green] skipping...', style="bold red")
                continue

            url = link.get('href')
            if not url:
                # An audio link without a target cannot be downloaded
                continue

            console.print(f'[+] INFO - New episode, downloading [blue]{filename}[/blue]', style="bold green")
            download_path = podcast_absolute_path.joinpath(filename)
            if _fetch_to_file(url, download_path):
                known_files.add(filename)


def check_directory(directory):
    ''' Return directory when it is an existing directory on disk, otherwise None '''

    return directory if directory.is_dir() else None


def make_directory(absolute_path):
    ''' Create the podcast directory at absolute_path '''
    # :input: Path object of the directory to create
    # :returns: absolute_path once the directory exists, None when its parent is missing

    try:
        # exist_ok makes the call a no-op when the directory is already there
        absolute_path.mkdir(exist_ok=True)
    except FileNotFoundError:
        # mkdir() does not create missing parents; report the folder that
        # could not be created (the message previously formatted an
        # undefined variable and crashed with NameError instead).
        console.print(f'[!] Parent directory was not found for provided folder name: {absolute_path.name}', style="bold yellow")
        return None
    return absolute_path


def main():
    ''' Download all podcasts episodes not already downloaded '''
    # Reads --directory (required in practice) and --warnings from the
    # command line, then processes every feed listed in `podcasts`.
    # Exits with status 1 when the directory is missing or not a directory.

    parser = argparse.ArgumentParser()
    parser.add_argument("--directory", type=Path, action="store", dest="parent_directory", help="Parent directory for your local Podcast files")
    # store_true already defaults to False, no set_defaults() needed
    parser.add_argument("--warnings", action="store_true", dest="warnings", help="Enable warning messages")
    args = parser.parse_args()
    parent_directory = args.parent_directory
    warnings = args.warnings

    if not parent_directory:
        parser.print_help()
        sys.exit(1)

    if check_directory(parent_directory) is None:
        console.print('[!] Directory supplied is not a valid existing directory', style="bold yellow")
        sys.exit(1)

    # Iterate over all of the defined podcast RSS URLs
    for podcast in podcasts:
        # Process the RSS feed and get a list of entries for the podcast
        feed = parse_rss_url(podcast)

        # A feed that could not be fetched or parsed carries no title; skip
        # it instead of letting a KeyError abort the remaining podcasts.
        podcast_directory_name = feed.get('feed', {}).get('title')
        if not podcast_directory_name:
            console.print(f'[!] Could not read a title from feed {podcast}, skipping...', style="bold yellow")
            continue

        # Join the podcast directory name with parent provided by user
        podcast_absolute_path = parent_directory.joinpath(podcast_directory_name)

        # If the podcast directory does not exist, create it
        if check_directory(podcast_absolute_path) is None:
            make_directory(podcast_absolute_path)
            # Re-check after creating: a brand-new directory must still get
            # its episodes downloaded in this same run.
            if check_directory(podcast_absolute_path) is None:
                continue

        # Generate a list of all the files in the directory and fetch the rest
        existing_files = map_files(podcast_absolute_path)
        download_episodes(feed, existing_files, podcast_absolute_path, warnings)

# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
	#!/usr/bin/python3
	import feedparser
	import argparse
	import requests
	import sys
	from rich.console import Console
	from os import listdir
	from os.path import isfile, join
	from pathlib import Path
	from pathlib import PurePath

	# Globals
	# RSS feed URLs of the podcasts this script downloads; main() walks the list
	# in order, one feed per entry.
	podcasts = [
	    "https://darknetdiaries.com/feedfree.xml",
	    "https://feeds.mozilla-podcasts.org/irl",
	    "https://realpython.com/podcasts/rpp/feed",
	    "https://feeds.eff.org/howtofixtheinternet",
	]

	# Shared rich Console; the status, warning and error messages printed by the
	# functions below all go through console.print().
	console = Console()

	# References
	# https://dusty.phillips.codes/2018/08/13/python-loading-pathlib-paths-with-argparse/

	def parse_rss_url(rss_url):
	    ''' Hand rss_url to feedparser.parse and return the parsed result unchanged '''
	    return feedparser.parse(rss_url)

	def format_filenames(title):
	    ''' Turn an episode title into a filesystem-safe filename ending in .mp3 '''
	    # :input: Episode title string taken from the RSS feed
	    # :returns: Filename string with the .mp3 extension appended

	    # Spaces become underscores; every character listed below is dropped.
	    # The first group is the punctuation this function is meant to strip
	    # (including a plain | for the EFF podcast titles). The second group
	    # adds the path separators and the remaining Windows-reserved
	    # characters: titles come from a remote feed, and a separator in the
	    # name would make the download land outside the podcast directory
	    # (or fail to open at all).
	    always_stripped = ':-?+@,|"'
	    separators_and_reserved = '/\\*<>'
	    table = str.maketrans(' ', '_', always_stripped + separators_and_reserved)

	    # None of the characters in '.mp3' are touched by the table, so cleaning
	    # the title first and appending the extension afterwards gives the same
	    # result as the reverse order.
	    return title.translate(table) + '.mp3'


	def map_files(directory):
	    ''' Return the name of every entry found directly inside directory '''
	    # :input: Path object of the directory to scan
	    # :returns: List of entry names (final path component only)

	    return [entry.name for entry in directory.iterdir()]


	def _fetch_to_file(url, destination):
	    ''' Stream url into destination; return True on success, False on failure '''

	    # Download into a sibling ".part" file first so an interrupted transfer
	    # never leaves a truncated file under the final episode name.
	    partial_path = destination.with_name(destination.name + '.part')
	    try:
	        # stream=True avoids holding a whole episode in memory; the timeout
	        # keeps a stalled server from hanging the run indefinitely.
	        with requests.get(url, stream=True, timeout=30) as response:
	            # Refuse to save an HTTP error page as if it were audio
	            response.raise_for_status()
	            with open(partial_path, 'wb') as f:
	                for chunk in response.iter_content(chunk_size=1024 * 1024):
	                    f.write(chunk)
	        partial_path.replace(destination)
	    except (requests.RequestException, OSError) as error:
	        # markup=False: the error text may contain square brackets
	        console.print(f'[!] ERROR - Download of {destination.name} failed: {error}', style="bold red", markup=False)
	        try:
	            partial_path.unlink()
	        except OSError:
	            # Best-effort cleanup; the partial file may never have been created
	            pass
	        return False
	    return True


	def download_episodes(feed_dict, existing_files, podcast_absolute_path, warnings):
	    ''' Download every audio/mpeg link in the feed that is not on disk yet '''
	    # :input: feed_dict - parsed feed object exposing .entries
	    # :input: existing_files - filenames already present in the podcast directory
	    # :input: podcast_absolute_path - Path of the directory episodes are written to
	    # :input: warnings - when True, report episodes skipped as already downloaded
	    # :returns: None

	    # Private set: constant-time lookups, and names fetched during this run
	    # can be remembered without mutating the caller's list.
	    known_files = set(existing_files)

	    # .entries is easy way to access all individual episodes in an RSS feed
	    for episode in feed_dict.entries:
	        # .links usually has multiple links, a text/html link with description and a audio/mpeg with actual audio file
	        for link in episode.links:
	            # Not every link carries a 'type' key, so look it up defensively
	            if link.get('type') != 'audio/mpeg':
	                continue

	            # Format the filename by stripping non-compliant characters
	            filename = format_filenames(episode.title)

	            if filename in known_files:
	                # If we have warnings enabled and file already downloaded, write to console
	                if warnings:
	                    console.print(f'[-] WARN - Episode already downloaded [green]{filename}[/green] skipping...', style="bold red")
	                continue

	            url = link.get('href')
	            if not url:
	                # An audio link without a target cannot be downloaded
	                continue

	            console.print(f'[+] INFO - New episode, downloading [blue]{filename}[/blue]', style="bold green")
	            download_path = podcast_absolute_path.joinpath(filename)
	            if _fetch_to_file(url, download_path):
	                known_files.add(filename)


	def check_directory(directory):
	    ''' Return directory when it is an existing directory on disk, otherwise None '''

	    return directory if directory.is_dir() else None


	def make_directory(absolute_path):
	    ''' Create the podcast directory at absolute_path '''
	    # :input: Path object of the directory to create
	    # :returns: absolute_path once the directory exists, None when its parent is missing

	    try:
	        # exist_ok makes the call a no-op when the directory is already there
	        absolute_path.mkdir(exist_ok=True)
	    except FileNotFoundError:
	        # mkdir() does not create missing parents; report the folder that
	        # could not be created (the message previously formatted an
	        # undefined variable and crashed with NameError instead).
	        console.print(f'[!] Parent directory was not found for provided folder name: {absolute_path.name}', style="bold yellow")
	        return None
	    return absolute_path


	def main():
	    ''' Download all podcasts episodes not already downloaded '''
	    # Reads --directory (required in practice) and --warnings from the
	    # command line, then processes every feed listed in `podcasts`.
	    # Exits with status 1 when the directory is missing or not a directory.

	    parser = argparse.ArgumentParser()
	    parser.add_argument("--directory", type=Path, action="store", dest="parent_directory", help="Parent directory for your local Podcast files")
	    # store_true already defaults to False, no set_defaults() needed
	    parser.add_argument("--warnings", action="store_true", dest="warnings", help="Enable warning messages")
	    args = parser.parse_args()
	    parent_directory = args.parent_directory
	    warnings = args.warnings

	    if not parent_directory:
	        parser.print_help()
	        sys.exit(1)

	    if check_directory(parent_directory) is None:
	        console.print('[!] Directory supplied is not a valid existing directory', style="bold yellow")
	        sys.exit(1)

	    # Iterate over all of the defined podcast RSS URLs
	    for podcast in podcasts:
	        # Process the RSS feed and get a list of entries for the podcast
	        feed = parse_rss_url(podcast)

	        # A feed that could not be fetched or parsed carries no title; skip
	        # it instead of letting a KeyError abort the remaining podcasts.
	        podcast_directory_name = feed.get('feed', {}).get('title')
	        if not podcast_directory_name:
	            console.print(f'[!] Could not read a title from feed {podcast}, skipping...', style="bold yellow")
	            continue

	        # Join the podcast directory name with parent provided by user
	        podcast_absolute_path = parent_directory.joinpath(podcast_directory_name)

	        # If the podcast directory does not exist, create it
	        if check_directory(podcast_absolute_path) is None:
	            make_directory(podcast_absolute_path)
	            # Re-check after creating: a brand-new directory must still get
	            # its episodes downloaded in this same run.
	            if check_directory(podcast_absolute_path) is None:
	                continue

	        # Generate a list of all the files in the directory and fetch the rest
	        existing_files = map_files(podcast_absolute_path)
	        download_episodes(feed, existing_files, podcast_absolute_path, warnings)


	# Run the downloader only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    main()