Created
March 5, 2022 20:20
-
-
Save tahnok/3a353d870eed641b002d273d0885ffcf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Python 3.7+ script to download episodes of radio three sixty | |
Reads from radiothreesixtyacc.xml, a copy of the iTunes RSS feed. Fetched from archive.org | |
Episodes are saved in a '$EP_NUMBER - Radio Three Sixty $SHOW_TITLE.m4a' format | |
Metadata about the files are written to manifest. This includes: | |
- show title | |
- show subtitle | |
- description (usually including track list) | |
- sha1 hash of the original file | |
- archived_at date | |
- published_on date | |
- episode duration | |
Written by Wesley Ellis <tahnok@gmail.com> on 2022-03-05 | |
""" | |
from typing import Dict | |
from datetime import datetime | |
import os.path | |
import hashlib | |
import json | |
import re | |
import requests | |
import xml.etree.ElementTree as ET | |
# Archived snapshots of the show's iTunes RSS feed; together they cover
# the full catalogue of episodes.
FEED_URLS = [
    # episodes 1-29
    "https://web.archive.org/web/20070311065745if_/http://www.threesixtyrecords.net:80/podcasts/radiothreesixtyacc.xml",
    # episodes 30-111
    "https://web.archive.org/web/20210125034103if_/http://www.threesixtyrecords.net/podcasts/radiothreesixtyacc.xml",
]

# Namespace map for the "itunes:"-prefixed elements in the feed XML,
# required by ElementTree's find() when querying those tags.
XML_NS = {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}
def hash_file(file: str) -> str:
    """Return the SHA-1 hex digest of *file*.

    Reads the file in fixed-size chunks so arbitrarily large files can be
    hashed without loading them fully into memory.
    (Adapted from https://stackoverflow.com/a/22058673/344151)
    """
    chunk_size = 65536  # 64 KiB per read keeps memory use flat
    digest = hashlib.sha1()
    with open(file, "rb") as handle:
        # iter() with a b"" sentinel stops cleanly at end-of-file.
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
def item_as_dict(item: ET.Element) -> Dict[str, object]:
    """Convert an iTunes RSS <item> element to a manifest entry.

    Returns a dict with number (int), title, subtitle, summary,
    published_on, duration, url and filename. The "sha1" and
    "archived_on" keys are added later, once the episode file has
    actually been downloaded.

    Note: the return type was previously annotated Dict[str, str], but
    "number" is an int, so Dict[str, object] is the honest annotation.
    """
    url = item.find("enclosure").get("url")
    subtitle = item.find("itunes:subtitle", XML_NS).text
    title = item.find("title").text.strip()
    # Episodes 13+ embed the episode number in the enclosure URL.
    match = re.search(r"radiothreesixtypart([0-9]+)", url)
    if match:
        number = int(match.group(1))
    else:
        # Episodes 1-12 only state the number in the subtitle.
        subtitle_match = re.search(r"Part ([0-9]+) by DJ Darkhorse", subtitle)
        if subtitle_match:
            number = int(subtitle_match.group(1))
        else:
            # Fallback: the very first episode carries no number anywhere.
            number = 1
    file_title = title.replace(":", "")  # ":" is not filesystem-safe
    filename = f"{number:03d} - {file_title}.m4a"
    # Some early entries lack an itunes:duration tag entirely.
    maybe_duration = item.find("itunes:duration", XML_NS)
    duration = maybe_duration.text if maybe_duration is not None else "missing"
    return {
        "number": number,
        "title": title,
        "subtitle": subtitle,
        "summary": item.find("itunes:summary", XML_NS).text,
        "published_on": item.find("pubDate").text,
        "duration": duration,
        "url": url,
        "filename": filename,
    }
def download_to_file(url: str, filename: str, timeout: float = None) -> bool:
    """Download *url* to *filename*. Return True on success, False on a
    non-200 response.

    Streams the response body to disk in 64 KiB chunks instead of
    buffering the whole file in memory first (the episodes are
    multi-megabyte m4a files; the previous non-streaming get() loaded
    each one fully into RAM before writing).

    timeout: optional requests timeout in seconds. The default None
    waits indefinitely, matching the original behaviour.
    """
    # stream=True defers the body; the with-block ensures the connection
    # is released even when we bail out on a bad status code.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != requests.codes.ok:
            return False
        with open(filename, "wb") as fd:
            for chunk in response.iter_content(chunk_size=65536):
                fd.write(chunk)
        return True
def _record_archive_metadata(episode: Dict[str, object], filename: str) -> None:
    """Stamp an episode entry with the integrity metadata of its local file."""
    episode["sha1"] = hash_file(filename)
    episode["archived_on"] = datetime.now().isoformat()


def main():
    """Fetch the archived RSS feeds, download every episode, and write
    manifest.json (plus failed.json when any download fails)."""
    print("Begin")

    # Parse every feed snapshot into a flat list of episode dicts.
    episodes = []
    for i, feed_url in enumerate(FEED_URLS):
        path = f"radiothreesixtyacc_part{i}.xml"
        if os.path.isfile(path):
            print(f"{feed_url} already downloaded to {path}")
        else:
            download_to_file(feed_url, path)
        root = ET.parse(path).getroot().find("channel")
        new_episodes = [item_as_dict(item) for item in root.findall("item")]
        episodes.extend(new_episodes)
        print(f"Extracted {len(episodes)} episodes from {path}")

    success = []
    failed = []
    for episode in episodes:
        filename = episode["filename"]
        # Re-running the script resumes cleanly: files already on disk
        # are hashed and recorded without being re-downloaded.
        if os.path.isfile(filename):
            print(f"{filename} already saved, skipping download")
            _record_archive_metadata(episode, filename)
            success.append(episode)
        elif download_to_file(episode["url"], filename):
            print(f"{filename} saved")
            _record_archive_metadata(episode, filename)
            success.append(episode)
        else:
            print(f"{filename} failed to download {episode}")
            failed.append(episode)

    with open("manifest.json", "w") as manifest:
        json.dump(success, manifest, indent=2)
    print("Wrote manifest.json")

    if failed:
        print("Some episodes failed to download, details in failed.json")
        with open("failed.json", "w") as fail_out:
            json.dump(failed, fail_out, indent=2)
    print("done")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment