Created
March 5, 2022 20:20
-
-
Save tahnok/3a353d870eed641b002d273d0885ffcf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Python 3.7+ script to download episodes of radio three sixty | |
Reads from radiothreesixtyacc.xml, a copy of the iTunes RSS feed. Fetched from archive.org | |
Episodes are saved in a '$EP_NUMBER - Radio Three Sixty $SHOW_TITLE.m4a' format | |
Metadata about the files are written to manifest. This includes: | |
- show title | |
- show subtitle | |
- description (usually including track list) | |
- sha1 hash of the original file | |
- archived_at date | |
- published_on date | |
- episode duration | |
Written by Wesley Ellis <tahnok@gmail.com> on 2022-03-05 | |
""" | |
from typing import Dict | |
from datetime import datetime | |
import os.path | |
import hashlib | |
import json | |
import re | |
import requests | |
import xml.etree.ElementTree as ET | |
# Archived snapshots of the show's iTunes RSS feed; together they cover
# the full catalogue of episodes.
FEED_URLS = [
    # episodes 1-29
    "https://web.archive.org/web/20070311065745if_/http://www.threesixtyrecords.net:80/podcasts/radiothreesixtyacc.xml",
    # episodes 30-111
    "https://web.archive.org/web/20210125034103if_/http://www.threesixtyrecords.net/podcasts/radiothreesixtyacc.xml",
]

# Namespace map for the "itunes:"-prefixed elements in the feed XML,
# required by ElementTree's find() when querying those tags.
XML_NS = {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}
def hash_file(file: str) -> str:
    """Return the SHA-1 hex digest of *file*.

    Reads the file in fixed-size chunks so arbitrarily large files can be
    hashed without loading them fully into memory.
    (Adapted from https://stackoverflow.com/a/22058673/344151)
    """
    chunk_size = 65536  # 64 KiB per read keeps memory use flat
    digest = hashlib.sha1()
    with open(file, "rb") as handle:
        # iter() with a b"" sentinel stops cleanly at end-of-file.
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
def item_as_dict(item: ET.Element) -> Dict[str, object]:
    """Convert an iTunes RSS <item> element to a manifest entry.

    Returns a dict with number (int), title, subtitle, summary,
    published_on, duration, url and filename. The "sha1" and
    "archived_on" keys are added later, once the episode file has
    actually been downloaded.

    Note: the return type was previously annotated Dict[str, str], but
    "number" is an int, so Dict[str, object] is the honest annotation.
    """
    url = item.find("enclosure").get("url")
    subtitle = item.find("itunes:subtitle", XML_NS).text
    title = item.find("title").text.strip()
    # Episodes 13+ embed the episode number in the enclosure URL.
    match = re.search(r"radiothreesixtypart([0-9]+)", url)
    if match:
        number = int(match.group(1))
    else:
        # Episodes 1-12 only state the number in the subtitle.
        subtitle_match = re.search(r"Part ([0-9]+) by DJ Darkhorse", subtitle)
        if subtitle_match:
            number = int(subtitle_match.group(1))
        else:
            # Fallback: the very first episode carries no number anywhere.
            number = 1
    file_title = title.replace(":", "")  # ":" is not filesystem-safe
    filename = f"{number:03d} - {file_title}.m4a"
    # Some early entries lack an itunes:duration tag entirely.
    maybe_duration = item.find("itunes:duration", XML_NS)
    duration = maybe_duration.text if maybe_duration is not None else "missing"
    return {
        "number": number,
        "title": title,
        "subtitle": subtitle,
        "summary": item.find("itunes:summary", XML_NS).text,
        "published_on": item.find("pubDate").text,
        "duration": duration,
        "url": url,
        "filename": filename,
    }
def download_to_file(url: str, filename: str, timeout: float = None) -> bool:
    """Download *url* to *filename*. Return True on success, False on a
    non-200 response.

    Streams the response body to disk in 64 KiB chunks instead of
    buffering the whole file in memory first (the episodes are
    multi-megabyte m4a files; the previous non-streaming get() loaded
    each one fully into RAM before writing).

    timeout: optional requests timeout in seconds. The default None
    waits indefinitely, matching the original behaviour.
    """
    # stream=True defers the body; the with-block ensures the connection
    # is released even when we bail out on a bad status code.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != requests.codes.ok:
            return False
        with open(filename, "wb") as fd:
            for chunk in response.iter_content(chunk_size=65536):
                fd.write(chunk)
        return True
def _record_archive_metadata(episode: Dict[str, object], filename: str) -> None:
    """Stamp an episode entry with the integrity metadata of its local file."""
    episode["sha1"] = hash_file(filename)
    episode["archived_on"] = datetime.now().isoformat()


def main():
    """Fetch the archived RSS feeds, download every episode, and write
    manifest.json (plus failed.json when any download fails)."""
    print("Begin")

    # Parse every feed snapshot into a flat list of episode dicts.
    episodes = []
    for i, feed_url in enumerate(FEED_URLS):
        path = f"radiothreesixtyacc_part{i}.xml"
        if os.path.isfile(path):
            print(f"{feed_url} already downloaded to {path}")
        else:
            download_to_file(feed_url, path)
        root = ET.parse(path).getroot().find("channel")
        new_episodes = [item_as_dict(item) for item in root.findall("item")]
        episodes.extend(new_episodes)
        print(f"Extracted {len(episodes)} episodes from {path}")

    success = []
    failed = []
    for episode in episodes:
        filename = episode["filename"]
        # Re-running the script resumes cleanly: files already on disk
        # are hashed and recorded without being re-downloaded.
        if os.path.isfile(filename):
            print(f"{filename} already saved, skipping download")
            _record_archive_metadata(episode, filename)
            success.append(episode)
        elif download_to_file(episode["url"], filename):
            print(f"{filename} saved")
            _record_archive_metadata(episode, filename)
            success.append(episode)
        else:
            print(f"{filename} failed to download {episode}")
            failed.append(episode)

    with open("manifest.json", "w") as manifest:
        json.dump(success, manifest, indent=2)
    print("Wrote manifest.json")

    if failed:
        print("Some episodes failed to download, details in failed.json")
        with open("failed.json", "w") as fail_out:
            json.dump(failed, fail_out, indent=2)
    print("done")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment