duggan/ Podcache.md

## Podcache.md

      
    Raw
  

               Podcache.md
            
          
    Podcache

Back up podcasts with metadata from a feed URL. Progress indicator and resuming.
Hacky, only really tested against FeedBurner/libsyn feeds.
When you run it, you get a directory named after the podcast with a list of mp3 files and metadata.json files.
tree /mnt/backups/podcast/
/mnt/backups/podcast/
├── bestshowever1.mp3.metadata.json
├── bestshowever1.mp3
├── bestshowever2.mp3.metadata.json
├── bestshowever2.mp3
├── bestshowever3.mp3.metadata.json
├── bestshowever3.mp3

The metadata.json files look like:
{
    "text": "This is probably the best show yet!",
    "href": "http://bestshowever.cachefly.net/bestshowever/bestshowever42.mp3",
    "title": "Best Show Ever 42: All The Answers"
}

Usage


Clone into a directory on your machine with git clone https://gist.github.com/23c43febdc8fc393822d.git podcache
Install dependencies with pip install -r requirements.txt
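Run with python podcache.py --feed <feed URL> (the --feed option is required; run python podcache.py --help to see the other options)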

Works with Python 2 and 3.
Ignores file

Drop a file named .ignores into the download directory of a particular podcast to skip downloading particular episodes.
Useful for skipping over broken / missing episodes. An example of the syntax is in the .ignores file with this repo.

  
## .ignores
title: Some Busted Entry No Longer Available
# Comments start with a '#' and continue to the end of the line
title: Some Other Broken Entry

## podcache.py
import os
import time
import re
import argparse
import signal
import json
from threading import Event
from collections import deque
from pprint import pprint
import feedparser
import requests
import progressbar

# Number of bytes requested per chunk while streaming a download (100 KiB).
DEFAULT_CHUNK_SIZE = 100 * 1024
# Name of the optional file, inside a podcast's download directory, that
# lists episodes to skip.
IGNORES_FILE = ".ignores"

# Event for signalling.
# It is set by the signal handler below and polled by the download loops so
# they can stop cleanly between chunks/episodes.
shutdown = Event()


def shutdown_handler(x, y):
    """Trigger the shutdown event on receipt of a signal.

    Args:
        x: Signal number handed over by the ``signal`` module (unused).
        y: Current stack frame handed over by the ``signal`` module (unused).
    """
    shutdown.set()


# Register some signals with our shutdown handler. SIGQUIT is not defined on
# every platform (e.g. Windows), so only hook the signals that exist here
# instead of failing with AttributeError at start-up.
for _signal_name in ("SIGTERM", "SIGQUIT", "SIGINT"):
    _signal_number = getattr(signal, _signal_name, None)
    if _signal_number is not None:
        signal.signal(_signal_number, shutdown_handler)


# Command-line interface: only the feed URL is mandatory, every other option
# falls back to None (or '.mp3' for the file extension).
parser = argparse.ArgumentParser(description='Podcast downloader.')
parser.add_argument('-f', '--feed', help="RSS feed url", required=True)
parser.add_argument('-n', '--name', help="Alternate name for feed")
parser.add_argument('-o', '--output', help="Location for downloaded items")
parser.add_argument(
    '-t', '--type',
    default='.mp3',
    help="File extension to look for in feed items"
)
parser.add_argument(
    '-i', '--filter',
    help="Apply a regular expression filter on titles"
)

opts = parser.parse_args()

# Working settings derived from the parsed options.
chunk_size = DEFAULT_CHUNK_SIZE
feed_name = opts.name
download_directory = opts.output

# Optional title filter: compile the user's regular expression once, up
# front, so a bad pattern is reported before the feed is fetched.
matcher = None
if opts.filter:
    try:
        matcher = re.compile(opts.filter, re.UNICODE)
    except (re.error, OverflowError) as e:
        # re.error: malformed pattern; OverflowError: repetition count too
        # large for the regex engine.
        print("Problem with filter:")
        print(e)
        # Raise SystemExit directly: the exit() helper is injected by the
        # site module and is not guaranteed to exist (e.g. "python -S").
        raise SystemExit(1)

# Fetch and parse the feed, then reduce every entry to the fields we keep:
# "title", "text" (description, when available) and "href" (audio URL).
f = feedparser.parse(opts.feed)

podcasts = []
for entry in f["entries"]:
    podcast = {}
    podcast["title"] = entry["title"]

    if "subtitle_detail" in entry:
        podcast["text"] = entry["subtitle_detail"]["value"]
    elif "subtitle" in entry:
        podcast["text"] = entry["subtitle"]
    else:
        # The episode is still kept; it simply has no "text" field.
        print("Could not figure out description, continuing without one")

    # Use the first link whose URL ends with the requested extension. Links
    # that carry no "href" at all are ignored rather than raising KeyError.
    links = []
    if "links" in entry:
        links = [item for item in entry["links"]
                 if item.get("href", "").endswith(opts.type)]

    if not links:
        # An entry without a usable audio link aborts the whole run.
        print("Could not figure out audio URL")
        if "links" in entry:
            pprint(entry["links"])
        raise SystemExit(1)

    podcast["href"] = links[0]["href"]

    # Without a filter every episode is kept; with one, only matching titles.
    if matcher is None or matcher.search(podcast["title"]):
        podcasts.append(podcast)

# Decide where downloads go: an explicit --output wins, then --name, and
# otherwise a directory named after the feed title.
if opts.output:
    download_directory = opts.output
elif opts.name:
    download_directory = opts.name
else:
    feed_title = f["feed"].get("title")
    if not feed_title:
        # Happens when the feed could not be fetched/parsed or has no title.
        print("Could not figure out feed title, please pass --name or --output")
        raise SystemExit(1)
    # Raw string: "\W" in a normal literal is an invalid escape sequence and
    # produces a warning on recent Python versions.
    pattern = re.compile(r'[\W_]+', re.UNICODE)
    # Strip everything except letters and digits, then lowercase.
    download_directory = "./%s" % pattern.sub('', feed_title).lower()

# Create directory for podcast
if not os.path.exists(download_directory):
    os.makedirs(download_directory)

# Load the optional ignore rules for this podcast. Each rule is a one-entry
# dict such as {"title": "Some Busted Entry"}.
ignores = []
ignores_file = os.path.join(download_directory, IGNORES_FILE)
if os.path.exists(ignores_file):
    try:
        with open(ignores_file, 'r') as ignores_handle:
            for line in ignores_handle:
                line = line.strip()
                # Blank lines and comment lines carry no rule.
                if not line or line.startswith("#"):
                    continue
                # Split on the first colon only, so values may contain colons.
                field, separator, value = line.partition(":")
                if not separator:
                    # Report the malformed line but keep reading the rest.
                    print('Invalid ignores file.')
                    print('Should be one entry per line, title:foo bar baz')
                    continue
                ignores.append({field.strip(): value.strip()})
    except (IOError, OSError, UnicodeError) as e:
        # Best effort: an unreadable ignores file must not stop the backup.
        print('Invalid ignores file.')
        print(e)

# Download every selected episode in feed order. A shutdown signal stops the
# loop between chunks/episodes; a partial file left behind is resumed (when
# the server supports Range requests) the next time the script runs.
for podcast in podcasts:
    if shutdown.is_set():
        print("Stopping...")
        break

    write_properties = 'wb'
    local_filename = os.path.join(download_directory, podcast["href"].split('/')[-1])
    metadata_filename = "%s.metadata.json" % local_filename

    # An episode is ignored when any rule's field equals the episode's value.
    ignore = any(
        k in podcast and podcast[k] == v
        for rule in ignores
        for k, v in rule.items()
    )

    if ignore:
        print('Ignoring "%s" from %s file...' % (podcast["title"], IGNORES_FILE))
        continue

    print("Processing episode: %s" % podcast["title"])
    print("URL: %s" % podcast["href"])

    # Write some metadata alongside
    print("Writing metadata to %s" % metadata_filename)
    with open(metadata_filename, 'w') as metadata_file:
        json.dump(podcast, metadata_file, indent=4)

    try:
        r = requests.get(podcast["href"], stream=True)
    except requests.exceptions.RequestException as e:
        print("--- ERROR:")
        print(e)
        print("---")
        continue

    # Every 4xx/5xx status is a failure, including 400 itself.
    if r.status_code >= 400:
        print("--- ERROR:")
        print("--- Could not download this podast :( (Status %d)" % r.status_code)
        print("---")
        r.close()
        continue

    # 0 means "size unknown": no skip/resume check and no progress bar.
    expected_size = 0
    if 'content-length' in r.headers:
        try:
            expected_size = int(r.headers['content-length'])
        except ValueError:
            expected_size = 0

    progress = 0
    # Check whether already (partially) downloaded
    if expected_size > 0 and os.path.isfile(local_filename):
        size_on_disk = os.path.getsize(local_filename)
        if size_on_disk == expected_size:
            print("Already downloaded, skipping...")
            r.close()
            continue

        print("%s downloaded, but mismatched file size (%d/%d)" % (local_filename, size_on_disk, expected_size))
        if 0 < size_on_disk < expected_size:
            # Ask only for the missing tail. The range is left open-ended
            # because Range end positions are inclusive.
            r.close()
            try:
                r = requests.get(podcast["href"], stream=True,
                                 headers={'Range': 'bytes=%d-' % size_on_disk})
            except requests.exceptions.RequestException as e:
                print("--- ERROR:")
                print(e)
                print("---")
                continue

            if r.status_code == 206:
                print("Resuming...")
                write_properties = 'ab'
                progress = size_on_disk
            elif r.status_code == 200:
                # Server ignored the Range header and sends the whole file;
                # opening with 'wb' below truncates the partial copy.
                print("Redownloading...")
            else:
                # Never write an error body into the episode file.
                print("--- ERROR:")
                print("--- Could not download this podast :( (Status %d)" % r.status_code)
                print("---")
                r.close()
                continue
        else:
            # Local file is empty or larger than the remote one: overwrite it
            # with the full response that is already open.
            print("Redownloading...")

    bar = None
    if expected_size == 0:
        print("Warning: could not determine file size, no progress will be indicated.")
    else:
        bar = progressbar.ProgressBar(maxval=expected_size,
                        widgets=[progressbar.Bar('#', '[', ']'), ' ',
                                progressbar.Percentage(), ' ',
                                progressbar.FileTransferSpeed()])

    print("Saving to: %s" % local_filename)
    try:
        with open(local_filename, write_properties) as episode_file:
            for chunk in r.iter_content(chunk_size=chunk_size):
                # check for shutdown
                if shutdown.is_set():
                    print("Stopping...")
                    break
                if chunk:
                    episode_file.write(chunk)
                    # Count the bytes actually received; the last chunk is
                    # usually shorter than chunk_size.
                    progress += len(chunk)
                    if bar is not None:
                        bar.update(min(progress, expected_size))
    except requests.exceptions.RequestException as e:
        # A broken connection only fails this episode; the partial file stays
        # on disk so a later run can resume it.
        print("--- ERROR:")
        print(e)
        print("---")
    finally:
        # Streamed responses hold their connection until closed.
        r.close()

    if bar is not None and progress >= expected_size:
        bar.finish()


## requirements.txt
feedparser
requests
progressbar2
	title: Some Busted Entry No Longer Available
	# Comments start with a '#' and continue to the end of the line
	title: Some Other Broken Entry
	import os
	import time
	import re
	import argparse
	import signal
	import json
	from threading import Event
	from collections import deque
	from pprint import pprint
	import feedparser
	import requests
	import progressbar

	DEFAULT_CHUNK_SIZE = 1024 * 100
	IGNORES_FILE = '.ignores'

	# Event for signalling.
	shutdown = Event()

	# Triggers the shutdown event on receipt of a signal.
	def shutdown_handler(x,y):
	shutdown.set()

	# Register some signals with our shutdown handler.
	signal.signal(signal.SIGTERM, shutdown_handler)
	signal.signal(signal.SIGQUIT, shutdown_handler)
	signal.signal(signal.SIGINT, shutdown_handler)


	parser = argparse.ArgumentParser(description='Podcast downloader.')
	parser.add_argument('-f', '--feed', required=True,
	help="RSS feed url")
	parser.add_argument('-n', '--name', default=None, required=False,
	help="Alternate name for feed")
	parser.add_argument('-o', '--output', default=None, required=False,
	help="Location for downloaded items")
	parser.add_argument('-t', '--type', default='.mp3', required=False,
	help="File extension to look for in feed items")
	parser.add_argument('-i', '--filter', default=None, required=False,
	help="Apply a regular expression filter on titles")

	opts = parser.parse_args()

	download_directory = opts.output
	feed_name = opts.name
	chunk_size = DEFAULT_CHUNK_SIZE

	matcher = None
	if opts.filter:
	try:
	matcher = re.compile('%s' % opts.filter, re.UNICODE)
	except Exception as e:
	print("Problem with filter:")
	print(e)
	exit(1)

	f = feedparser.parse(opts.feed)

	podcasts = []
	for entry in f["entries"]:
	podcast = {}
	podcast["title"] = entry["title"]

	if "subtitle_detail" in entry:
	podcast["text"] = entry["subtitle_detail"]["value"]
	elif "subtitle" in entry:
	podcast["text"] = entry["subtitle"]
	else:
	print("Could not figure out description, skipping")

	if "links" in entry:
	links = [item for item in entry["links"] if item["href"].endswith(opts.type)]
	if len(links):
	podcast["href"] = links[0]["href"]
	else:
	print("Could not figure out audio URL")
	pprint(entry["links"])
	exit(1)
	else:
	print("Could not figure out audio URL")
	exit(1)

	if matcher:
	if matcher.search(podcast["title"]):
	podcasts.append(podcast)
	else:
	podcasts.append(podcast)

	if opts.output:
	download_directory = opts.output
	elif opts.name:
	download_directory = opts.name
	else:
	pattern = re.compile('[\W_]+', re.UNICODE)
	download_directory = "./%s" % pattern.sub('', f["feed"]["title"]).lower()

	# Create directory for podcast
	if not os.path.exists(download_directory):
	os.makedirs(download_directory)

	ignores = []
	ignores_file = os.path.join(download_directory, IGNORES_FILE)
	if os.path.exists(ignores_file):
	try:
	# Horrible little parser
	with open(ignores_file, 'r') as f:
	for line in f:
	# ignore comments
	if not line.startswith("#"):
	raw = line.rstrip().split(":")
	field = raw[0]
	value = raw[1]
	if len(raw) > 2:
	value = ":".join(raw[1:])
	ignores.append({field.strip(): value.strip()})
	except:
	print('Invalid ignores file.')
	print('Should be one entry per line, title:foo bar baz')

	for podcast in podcasts:
	if shutdown.is_set():
	print("Stopping...")
	break

	write_properties = 'wb'
	local_filename = os.path.join(download_directory, podcast["href"].split('/')[-1])
	metadata_filename = "%s.metadata.json" % local_filename

	ignore = False
	for rule in ignores:
	for k, v in rule.items():
	if k in podcast:
	if podcast[k] == v:
	ignore = True

	if ignore:
	print('Ignoring "%s" from %s file...' % (podcast["title"], IGNORES_FILE))
	continue

	print("Processing episode: %s" % podcast["title"])
	print("URL: %s" % podcast["href"])

	# Write some metadata alongside
	print("Writing metadata to %s" % metadata_filename)
	with open(metadata_filename, 'w') as f:
	json.dump(podcast, f, indent = 4)

	try:
	r = requests.get(podcast["href"], stream=True)
	if r.status_code > 400:
	print("--- ERROR:")
	print("--- Could not download this podast :( (Status %d)" % r.status_code)
	print("---")
	continue
	except requests.exceptions.RequestException as e:
	print("--- ERROR:")
	print(e)
	print("---")
	continue

	expected_size = 0
	if 'content-length' in r.headers:
	expected_size = int(r.headers['content-length'])

	progress = 0
	# Couldn't get a content-length from server
	if expected_size > 0:
	# Check whether already downloaded
	if os.path.isfile(local_filename):
	size_on_disk = os.path.getsize(local_filename)
	if size_on_disk == expected_size:
	print("Already downloaded, skipping...")
	continue
	else:
	print("%s downloaded, but mismatched file size (%d/%d)" % (local_filename, size_on_disk, expected_size))
	r = requests.get(podcast["href"], stream=True, headers={'Range': 'bytes=%d-%d' % (size_on_disk, expected_size)})
	if r.status_code == 206:
	print("Resuming...")
	write_properties = 'ab'
	progress = size_on_disk
	else:
	print("Redownloading...")
	os.remove(local_filename)

	if expected_size == 0:
	print("Warning: could not determine file size, no progress will be indicated.")

	print("Saving to: %s" % local_filename)
	bar = progressbar.ProgressBar(maxval=expected_size, \
	widgets=[progressbar.Bar('#', '[', ']'), ' ',
	progressbar.Percentage(), ' ',
	progressbar.FileTransferSpeed()])

	with open(local_filename, write_properties) as f:
	for chunk in r.iter_content(chunk_size=chunk_size):
	# check for shutdown
	if shutdown.is_set():
	print("Stopping...")
	break
	if chunk:
	f.write(chunk)
	# do progress
	progress += chunk_size
	if progress <= expected_size:
	bar.update(progress)
	if progress >= expected_size:
	bar.finish()