Skip to content

Instantly share code, notes, and snippets.

@tahnok
Created March 5, 2022 20:20
Show Gist options
  • Save tahnok/3a353d870eed641b002d273d0885ffcf to your computer and use it in GitHub Desktop.
Save tahnok/3a353d870eed641b002d273d0885ffcf to your computer and use it in GitHub Desktop.
"""
Python 3.7+ script to download episodes of radio three sixty
Reads copies of the iTunes RSS feed (radiothreesixtyacc.xml) fetched from archive.org and cached locally as radiothreesixtyacc_part<N>.xml
Episodes are saved in a '$EP_NUMBER - Radio Three Sixty $SHOW_TITLE.m4a' format
Metadata about the files is written to manifest.json. This includes:
- show title
- show subtitle
- description (usually including track list)
- sha1 hash of the original file
- archived_on date
- published_on date
- episode duration
Written by Wesley Ellis <tahnok@gmail.com> on 2022-03-05
"""
from typing import Dict
from datetime import datetime
import os.path
import hashlib
import json
import re
import requests
import xml.etree.ElementTree as ET
# Archived snapshots (web.archive.org) of the podcast's iTunes RSS feed.
# Two snapshots are listed; the per-entry notes below record which episodes each one covers.
FEED_URLS = [
# episodes 1-29
"https://web.archive.org/web/20070311065745if_/http://www.threesixtyrecords.net:80/podcasts/radiothreesixtyacc.xml",
# episodes 30-111
"https://web.archive.org/web/20210125034103if_/http://www.threesixtyrecords.net/podcasts/radiothreesixtyacc.xml",
]
# Prefix-to-URI mapping handed to ElementTree's find() so that
# "itunes:"-prefixed tags (subtitle, summary, duration) can be looked up.
XML_NS = {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}
def hash_file(file: str) -> str:
    """Return the SHA-1 hex digest of the file at path ``file``.

    The file is opened in binary mode and consumed in 64 KiB pieces, so it is
    never held in memory as a whole.  Approach adapted from
    https://stackoverflow.com/a/22058673/344151
    """
    chunk_bytes = 64 * 1024
    digest = hashlib.sha1()
    with open(file, "rb") as stream:
        # iter() with a sentinel stops on the first empty read, i.e. at EOF.
        for piece in iter(lambda: stream.read(chunk_bytes), b""):
            digest.update(piece)
    return digest.hexdigest()
def _child_text(item: ET.Element, path: str, default: str = "missing"):
    """Return the text of the first ``path`` child of ``item``.

    ``default`` is returned when the child element is absent.  When the
    element exists but is empty, its text (``None``) is returned unchanged.
    """
    child = item.find(path, XML_NS)
    return default if child is None else child.text


def item_as_dict(item: ET.Element) -> Dict[str, object]:
    """Convert an iTunes feed ``<item>`` to a manifest entry, minus sha1 and archived_on.

    The returned dict has the keys ``number`` (int), ``title``, ``subtitle``,
    ``summary``, ``published_on``, ``duration``, ``url`` and ``filename``.
    Optional elements that are absent from the item are recorded as the
    string ``"missing"``.

    Raises:
        ValueError: if the item has no ``<enclosure>`` with a ``url``
            attribute, since there would be nothing to download.
    """
    enclosure = item.find("enclosure")
    url = None if enclosure is None else enclosure.get("url")
    if not url:
        raise ValueError("feed item has no enclosure url, nothing to download")

    subtitle = _child_text(item, "itunes:subtitle")
    title = (_child_text(item, "title") or "").strip()

    # The episode number is in the file name for episode 13+; episodes 1-12
    # only mention it in the subtitle.  Default to 1 when neither matches.
    match = re.search(r"radiothreesixtypart([0-9]+)", url)
    if match is None:
        match = re.search(r"Part ([0-9]+) by DJ Darkhorse", subtitle or "")
    number = int(match.group(1)) if match else 1

    # Titles come from the feed: drop ":" as before, and never let a path
    # separator leak into the local file name.
    file_title = title.replace(":", "")
    for separator in ("/", "\\"):
        file_title = file_title.replace(separator, "-")
    filename = f"{number:03d} - {file_title}.m4a"

    return {
        "number": number,
        "title": title,
        "subtitle": subtitle,
        "summary": _child_text(item, "itunes:summary"),
        "published_on": _child_text(item, "pubDate"),
        "duration": _child_text(item, "itunes:duration"),
        "url": url,
        "filename": filename,
    }
def download_to_file(url: str, filename: str, timeout: float = 60.0) -> bool:
    """Download ``url`` to ``filename`` and return True on success, False on failure.

    The response is streamed to ``<filename>.part`` and only renamed to
    ``filename`` once it has been written completely, so an interrupted
    download never leaves a truncated file that a later run would mistake for
    a finished one.

    Args:
        url: address to fetch.
        filename: destination path; replaced if it already exists.
        timeout: seconds to wait for the connection and for each read before
            giving up.

    Returns:
        True if the server answered 200 and the whole body was saved; False on
        any other status code or on a network error.
    """
    partial = f"{filename}.part"
    try:
        # stream=True keeps the body out of memory until it is iterated.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != requests.codes.ok:
                return False
            with open(partial, "wb") as fd:
                for chunk in response.iter_content(chunk_size=65536):
                    fd.write(chunk)
        os.replace(partial, filename)
        return True
    except requests.RequestException as error:
        print(f"{url} could not be downloaded: {error}")
        return False
    finally:
        # After a successful rename the partial file no longer exists.
        if os.path.exists(partial):
            os.remove(partial)
def main():
    """Fetch the feeds, download every episode and write the manifest files.

    Each URL in FEED_URLS is cached locally as radiothreesixtyacc_part<i>.xml
    and parsed into episode entries.  Every episode is then downloaded unless
    its file already exists.  Episodes that are present on disk are written to
    manifest.json with their sha1 and archived_on timestamp; episodes that
    could not be downloaded are written to failed.json.

    Raises:
        SystemExit: if a feed is not cached locally and cannot be downloaded.
    """
    print("Begin")
    episodes = []
    for i, feed_url in enumerate(FEED_URLS):
        path = f"radiothreesixtyacc_part{i}.xml"
        if os.path.isfile(path):
            print(f"{feed_url} already downloaded to {path}")
        elif not download_to_file(feed_url, path):
            # Without the feed there is nothing to parse; stop with a clear
            # message instead of failing later inside ET.parse.
            raise SystemExit(f"Could not download feed {feed_url}")
        channel = ET.parse(path).getroot().find("channel")
        new_episodes = [item_as_dict(item) for item in channel.findall("item")]
        episodes.extend(new_episodes)
        # Report the count for this feed only, not the running total.
        print(f"Extracted {len(new_episodes)} episodes from {path}")

    success = []
    failed = []
    for episode in episodes:
        filename = episode["filename"]
        if os.path.isfile(filename):
            print(f"{filename} already saved, skipping download")
        elif download_to_file(episode["url"], filename):
            print(f"{filename} saved")
        else:
            print(f"{filename} failed to download {episode}")
            failed.append(episode)
            continue
        # Reached only when the episode file is on disk.
        episode["sha1"] = hash_file(filename)
        episode["archived_on"] = datetime.now().isoformat()
        success.append(episode)

    with open("manifest.json", "w") as manifest:
        json.dump(success, manifest, indent=2)
    print("Wrote manifest.json")
    if failed:
        print("Some episodes failed to download, details in failed.json")
        with open("failed.json", "w") as fail_out:
            json.dump(failed, fail_out, indent=2)
    print("done")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment