Skip to content

Instantly share code, notes, and snippets.

@woky
Last active September 2, 2021 15:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save woky/9ba28b53556ffbe138fcf9e9082dc17e to your computer and use it in GitHub Desktop.
Save woky/9ba28b53556ffbe138fcf9e9082dc17e to your computer and use it in GitHub Desktop.
podcastdl
# https://thehistoryofrome.typepad.com/ # finished
#https://feeds.feedburner.com/TheHistoryOfRome
# https://thehistoryofrome.typepad.com/revolutions_podcast/
https://revolutionspodcast.libsyn.com/rss
# https://thehistoryofbyzantium.com/
https://rss.acast.com/thehistoryofbyzantium
# https://darknetdiaries.com/
https://feeds.megaphone.fm/darknetdiaries
# https://hubermanlab.libsyn.com/
https://hubermanlab.libsyn.com/rss
# https://routingtable.cloud/
https://anchor.fm/s/1a3cf0b8/podcast/rss
# http://www.astronomycast.com/
https://astronomycast.libsyn.com/rss
# https://americanbiography.webs.com/
#https://rss.acast.com/americanbiography
# http://ethnopolis.co.uk/
#https://historyofyugoslavia.libsyn.com/rss
# https://historyofenglishpodcast.com/
#https://historyofenglishpodcast.com/feed/podcast/
# https://therealmiddleages.com/
#https://therealmiddleages.libsyn.com/rss
# https://thehistoryofvikings.com/ # Noah Tetzner
#https://feeds.captivate.fm/thehistoryofvikings/
# http://podcast.storiesofthesecondworldwar.com/ # Noah Tetzner
#https://feeds.acast.com/public/shows/stories-of-the-second-world-war
# https://corecursive.com/
https://corecursive.libsyn.com/feed
# https://www.acast.com/historyofthepapacy
#https://rss.acast.com/historyofthepapacy
# https://thegreatwarpodcast.podbean.com/ # finished
#https://feed.podbean.com/thegreatwarpodcast/feed.xml
# https://barrystrauss.com/podcast/
#https://antiquitas.castos.com/feed
# https://adspthepodcast.com/
https://feeds.buzzsprout.com/1501960.rss
# https://www.arraycast.com/
https://www.arraycast.com/episodes?format=rss
# https://handmade.network/podcast
#https://handmade.network/podcast/podcast.xml
# https://wondery.com/shows/tides-of-history/
https://rss.art19.com/tides-of-history
#!/usr/bin/env python3
# Dependencies:
# apt install python3-lxml python3-requests python3-dateutil
# pacman -S python-lxml python-requests python-dateutil
# pip install lxml requests python-dateutil
import argparse
import sys, os, os.path, re
import fnmatch
from pathlib import Path
from urllib.parse import urlparse
import dateutil.parser
import requests
from lxml import etree, html
# Directory under which one sub-directory per podcast channel is created.
DEFAULT_OUTPUT_DIR = '/mnt/storage/podcasts'
# Fallback feed-list file used when no URLs are given on the command line.
DEFAULT_LIST_FILE = os.path.expanduser('~/.podcasts.lst')
# fnmatch patterns (host + path, no scheme/port) for URL shapes known to be
# podcast RSS feeds; checked against anchors scraped from HTML pages.
RSS_PODCAST_URLS = (
    # These are unused now because the regex below matches them all
    'feeds.feedburner.com/*',
    'rss.acast.com/*',
    'feeds.acast.com/public/shows/*',
    '*.libsyn.com/rss',
    'feeds.megaphone.fm/*',
    'anchor.fm/s/*/podcast/rss',
    'feeds.captivate.fm/*',
)
# ---- command-line handling ------------------------------------------------
argp = argparse.ArgumentParser()
argp.add_argument('url', nargs='*')
argp.add_argument('--output-dir', '-o', type=str, default=DEFAULT_OUTPUT_DIR)
argp.add_argument('--dry-run', '-n', action='store_true')
argp.add_argument('--verbose', '-v', action='store_true')
argp.add_argument('--print-full-path', '-P', action='store_true')
args = argp.parse_args()

url_args = args.url
if not url_args:
    # No URLs on the command line: fall back to the default list file,
    # or bail out with usage if it does not exist.
    if not os.path.exists(DEFAULT_LIST_FILE):
        argp.print_usage()
        sys.exit(1)
    url_args = ['@' + DEFAULT_LIST_FILE]

# Expand '@file' arguments: one URL per line, '#' starts a comment.
urls = []
for arg in url_args:
    if not arg.startswith('@'):
        urls.append(arg)
        continue
    with open(arg[1:], 'r') as listfile:
        for raw_line in listfile:
            entry = re.sub('#.*', '', raw_line).strip()
            if entry:
                urls.append(entry)
def text2filename(text: str):
    """Turn arbitrary text into a safe filename component.

    Every run of non-alphanumeric characters is collapsed into a single
    underscore; a leading/trailing underscore (at most one of each can
    remain after collapsing) is then trimmed off.
    """
    cleaned = re.sub('[^0-9A-Za-z]+', '_', text)
    return cleaned.strip('_')
# Crawl the URL queue. Seed URLs may be plain HTML pages: feed links
# discovered on them are queued right behind the current page. RSS feeds
# get each audio enclosure downloaded to <output-dir>/<channel>/<date>-<title>.
top_urls = set(urls)        # only user-supplied pages may be scraped for links
added_urls = set(top_urls)  # every URL ever queued, to avoid duplicates
while urls:
    rss_url, urls = urls[0], urls[1:]
    if args.verbose:
        print('<<', rss_url)
    try:
        rss_resp = requests.get(rss_url, timeout=15)
        rss_resp.raise_for_status()
    # requests exceptions derive from OSError (IOError), so this catches
    # connection/timeout/HTTP-status failures.
    except OSError as e:
        print(rss_url, 'failed to fetch rss/html:', repr(e), file=sys.stderr)
        continue
    # Fix: .get() instead of [] — a response without a Content-Type header
    # previously raised KeyError and aborted the whole run.
    ctype = rss_resp.headers.get('content-type', '')
    ctype = re.sub(';.*', '', ctype).strip()  # drop ";charset=..." parameters
    if ctype == 'text/html':
        # Only scrape HTML when it is a top-level, user-supplied page.
        if rss_url not in top_urls:
            continue
        tree = html.fromstring(rss_resp.text)
        i = 0
        def insert_url(new_url):
            # Queue the discovered feed just after the current page, keeping
            # on-page order (hence the advancing insertion index).
            global i
            urls.insert(i, new_url)
            i += 1
            added_urls.add(new_url)
        # 1) explicit <link type="application/rss+xml"> feed advertisements
        for new_url in tree.xpath('//link[@type="application/rss+xml"]/@href'):
            if new_url not in added_urls:
                insert_url(new_url)
        # 2) anchors whose URL looks like a feed
        for new_url in tree.xpath('//a/@href'):
            if new_url in added_urls:
                continue
            if re.search(r'(?i)\b(?:rss|feeds?)\b', new_url):
                insert_url(new_url)
                continue
            # Otherwise match host+path against the known feed-URL patterns.
            _url = urlparse(new_url)
            _url = re.sub(':.*', '', _url.netloc) + _url.path  # strip port
            for pattern in RSS_PODCAST_URLS:
                if fnmatch.fnmatch(_url, pattern):
                    insert_url(new_url)
                    break
    elif ctype in ('application/rss+xml', 'text/xml', 'application/xml'):
        tree = etree.fromstring(rss_resp.content)
        if tree.tag != 'rss':
            continue
        for chan in tree.xpath('/rss/channel'):
            chan_title = chan.xpath('title/text()')
            if not chan_title:
                print(rss_url, 'channel has no title', file=sys.stderr)
                continue
            chan_title = text2filename(chan_title[0])
            chan_dir = os.path.join(args.output_dir, chan_title)
            # Feeds list newest first; reverse so downloads are chronological.
            for item in chan.xpath('item')[::-1]:
                item_title = item.xpath('title/text()')
                if not item_title:
                    print(rss_url, 'item has no title', file=sys.stderr)
                    continue
                item_title = text2filename(item_title[0])
                item_time = item.xpath('pubDate/text()')
                if not item_time:
                    print(rss_url, 'item has no pubDate', file=sys.stderr)
                    continue
                try:
                    item_time = dateutil.parser.parse(item_time[0])
                # Fix: dateutil can raise OverflowError (not just ValueError)
                # on out-of-range dates.
                except (ValueError, OverflowError):
                    print(rss_url, 'item has invalid pubDate', item_time[0], file=sys.stderr)
                    continue
                item_time = item_time.strftime('%Y-%m-%d')
                # Pick the first usable audio enclosure (MP3 only for now).
                for enclosure in item.xpath('enclosure[starts-with(@type, "audio/")]'):
                    media_type = enclosure.get('type')
                    if media_type == 'audio/mpeg':
                        ext = '.mp3'
                    else:
                        continue
                    break
                else:
                    continue  # no supported enclosure -> skip item
                media_url = enclosure.get('url')
                media_file = os.path.join(chan_dir, item_time + '-' + item_title + ext)
                if os.path.exists(media_file):
                    continue  # already downloaded
                if args.print_full_path:
                    print(media_file)
                else:
                    print(os.path.relpath(media_file, args.output_dir))
                if args.dry_run:
                    continue
                # Fix: makedirs(exist_ok=True) also creates a missing output
                # root; os.mkdir failed unless args.output_dir already existed.
                os.makedirs(chan_dir, exist_ok=True)
                # Stream to a temp name; rename only after a complete write so
                # an interrupted download never leaves a truncated final file.
                media_file_tmp = media_file + '.tmp'
                try:
                    media_resp = requests.get(media_url, stream=True, timeout=15)
                    media_resp.raise_for_status()
                    with open(media_file_tmp, 'wb') as f:
                        for chunk in media_resp.iter_content(chunk_size=None):
                            f.write(chunk)
                except OSError as e:
                    print(rss_url, f'failed to download {media_url}:', repr(e), file=sys.stderr)
                    if os.path.exists(media_file_tmp):
                        os.unlink(media_file_tmp)
                    continue
                os.rename(media_file_tmp, media_file)
0 4 * * * systemd-cat -t podcastdl ~/podcastdl
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment