Skip to content

Instantly share code, notes, and snippets.

@44213
Forked from jmcantrell/podcatch
Created September 18, 2021 13:25
Show Gist options
  • Save 44213/42dc4c08e9293b20145e0bc8f1d4dc63 to your computer and use it in GitHub Desktop.
Save 44213/42dc4c08e9293b20145e0bc8f1d4dc63 to your computer and use it in GitHub Desktop.
simple podcast downloader
#!/usr/bin/env bash
#
# Simple podcast downloader: keeps a directory of audio files (and m3u
# playlists) in sync with the enclosures found in a set of feeds.
#
# External tools used further down: curl, xml (xmlstarlet), recode, file,
# md5sum and, optionally, logger.

set -e

# name the script was invoked as; also names the cache and config directories
me=${0##*/}
cache=${XDG_CACHE_HOME:-$HOME/.cache}/$me
config=${XDG_CONFIG_HOME:-$HOME/.config}/$me
# list of configured feeds, one "<url> [name]" per line
urls=$config/urls

usage="Update a directory with audio found in RSS/Atom feeds.
Usage: $me [-h] [-d dir] [-p dir] [-q pattern] [source...]
-h show this help
-d directory save audio files to directory
-p directory save playlists to directory
default: same directory as audio files
-q pattern only update configured feeds whose url or name matches pattern
Where source can be any number of:
feed url
xml feed file
opml subscription file
text file containing feed urls
If the source is a text file with urls, this format is expected:
<url>\\t[name]
The file might have lines like:
http://feeds.wnyc.org/radiolab
https://onbeing.org/programs/feed/\\tOn Being
http://www.ttbook.org/book/radio/rss/feed
Examples:
# Look at all the ways you can import feeds!
podcatch *.opml *.xml
podcatch /path/to/urls.txt
podcatch https://www.wnyc.org/feeds/shows/otm
# By default, running $me will update all feeds in:
$urls
# Format of above is the same as the text file described earlier.
# Audio files will be stored in the current directory.
# The directory can be changed with -d or \$PODCASTS.
# The playlists similarly with -p or \$PLAYLISTS.
# This alias will update all the configured feeds,
# putting the audio files in ~/Podcasts/<feed name>/,
# and updating a playlist at ~/Playlists/<feed name>.m3u.
# Without -p it would be ~/Podcasts/<feed name>/playlist.m3u.
alias podcatch='podcatch -d ~/Podcasts -p ~/Playlists'
Environment variables:
export PODCASTS=/path/to/podcasts
export PLAYLISTS=/path/to/playlists
"

# decode html entities like: &amp; => &
# (kept as a string on purpose: it is expanded unquoted and word-split)
unhtml='recode -q html..ascii'

# follow redirects and exit non-zero on HTTP errors
# (progress output is NOT silenced here; callers add --silent when needed)
# (kept as a string on purpose: it is expanded unquoted and word-split)
curl="curl --fail --location"
# has NAME...
# Quietly ask `type -p` about the given command names; the exit status is
# whatever `type -p` reports and nothing is printed.
has() {
  type -p "$@" >/dev/null 2>&1
}
# Command used to mirror messages to syslog, stored as an array so the tag is
# passed as one clean argument (a string with embedded quotes would hand
# logger a tag containing literal quote characters). Left unset when logger
# is not installed. Unset first so an inherited environment variable named
# "log" cannot be mistaken for the command.
unset -v log
has logger && log=(logger -t "$me")

# log MESSAGE...
# Write MESSAGE to stderr and, when logger is available, to syslog as well.
# Nothing is written to stdout, so functions that return a value on stdout
# can log freely.
log() {
  if (( ${#log[@]} )); then
    # best effort: a syslog failure must not abort the script under set -e
    "${log[@]}" "$@" || true
  fi
  printf '%s\n' "$*" >&2
}
# Private directory holding every temporary file of this run; it is removed
# when the script exits. The cleanup trap has to be installed here, in the
# main shell: temp() is always called as $(temp), i.e. in a subshell, and an
# EXIT trap set there fires as soon as that subshell ends, deleting the file
# before the caller can use it.
temp_dir=$(mktemp -d -t "${me:-podcatch}.XXXXXXXXXX") || exit 1
trap '[[ -d $temp_dir ]] && rm -rf -- "$temp_dir"' EXIT

# temp
# Create an empty temporary file and print its path on stdout.
# Returns non-zero when the file cannot be created.
temp() {
  mktemp "$temp_dir/XXXXXXXXXX"
}
# input [LABEL [DEFAULT]]
# Prompt for a value on stdin and print the answer on stdout.
#   LABEL    what is being asked for (default: "value")
#   DEFAULT  answer used when the reply is empty (just pressing enter)
# Returns non-zero when the final answer is empty.
input() {
  local value=${1:-value}
  local default=$2
  local reply
  # -r keeps backslashes typed by the user; a failed read (end of input)
  # simply falls through to the default
  read -r -p "Enter $value${default:+ [$default]}: " reply || true
  [[ $reply ]] || reply=$default
  printf '%s\n' "$reply"
  [[ $reply ]]
}
# get URL
# Download URL into a fresh temporary file and print that file's path on
# stdout (capture it: file=$(get "$url") || ...).
# Returns non-zero when the temporary file cannot be created or the download
# fails; in the latter case the temporary file is removed again.
get() {
  local url=${1:?missing url}
  # declared separately so a failing temp() is not hidden by `local`
  local temp
  temp=$(temp) || return 1
  log "Getting $url"
  # $curl is deliberately unquoted: it holds the command plus its options
  if ! $curl -o "$temp" "$url"; then
    log "Unable to get $url"
    rm -f -- "$temp"
    return 1
  fi
  # should be captured in a var
  echo "$temp"
}
# head_value URL HEADER
# Print the value of response header HEADER for URL, taken from a HEAD
# request. Parameters after the first ';' (e.g. "; charset=...") are dropped.
# When redirects produce several header blocks, the last occurrence wins.
# Prints nothing when the header is absent.
head_value() {
  local url=${1:?missing url}
  local header=${2:?missing header}
  # - header lines end in CR LF; strip the CR so it does not end up in the value
  # - header names are case-insensitive (HTTP/2 responses show them in
  #   lowercase), hence grep -i
  $curl --silent --head "$url" |
    tr -d '\r' |
    grep -i "^$header:" | tail -n1 | # ensure only one
    cut -d' ' -f2- | cut -d';' -f1   # cut out value
}
# content_type URL
# Print the Content-Type response header reported for URL (via head_value).
content_type() {
  local target=$1
  head_value "$target" "Content-Type"
}
# last_modified URL
# Print the Last-Modified response header reported for URL (via head_value).
last_modified() {
  local target=$1
  head_value "$target" "Last-Modified"
}
# get_feed_xml URL NAME
# Make sure an up-to-date copy of the feed at URL is cached as
# $cache/NAME/feed.xml and print that path on stdout.
# The download is skipped only when a cached copy exists AND the server's
# Last-Modified value is non-empty and equal to the one recorded in
# $cache/NAME/last; in every other case (first run, changed feed, server
# without Last-Modified) the feed is fetched again.
# Returns non-zero when the feed cannot be fetched; the recorded value is
# only updated after success.
get_feed_xml() {
  local url=${1:?missing url}
  local name=${2:?missing feed name}
  local xml=$cache/$name/feed.xml
  local lastfile=$cache/$name/last
  # declared separately so failures are not hidden by `local`
  local last temp
  last=$(last_modified "$url") || return 1

  local fresh=
  if [[ -f $xml && -f $lastfile && $last ]] &&
    grep -q -x -F -- "$last" "$lastfile"; then
    # if values match, no update is needed
    fresh=1
  fi

  mkdir -p "$cache/$name"
  if [[ ! $fresh ]]; then
    temp=$(get "$url") || return 1
    mv -f "$temp" "$xml"
  fi
  printf '%s\n' "$last" >"$lastfile"
  echo "$xml"
}
# is_url STRING
# Succeeds when STRING starts with http:// or https://.
# Uses the shell's own regex match instead of spawning the obsolescent egrep.
is_url() {
  local re='^https?://'
  [[ $1 =~ $re ]]
}
# is_audio MIMETYPE
# Succeeds when MIMETYPE starts with "audio/".
# Uses the shell's own pattern match instead of spawning the obsolescent egrep.
is_audio() {
  [[ $1 == audio/* ]]
}
# xpath FILE QUERY
# Run QUERY against FILE with `xml sel`, printing each result's value, and
# pipe the output through the $unhtml filter.
xpath() {
  local doc=${1:?missing xml file}
  local expr=${2:?missing xpath query}
  local -a select=(xml sel -t -v "$expr" -n "$doc")
  # $unhtml is deliberately unquoted: it holds a command plus its options
  "${select[@]}" | $unhtml
}
# audio_extension FILE
# Print the filename extension matching FILE's detected audio type, as
# reported by `file --mime-type`.
# Returns non-zero (after logging why) when the type cannot be determined,
# is not audio, or is an audio type without a known extension.
audio_extension() {
  local file=${1:?missing audio file}
  # declared separately so a failing `file` is not hidden by `local`
  local filetype
  filetype=$(file --mime-type -b "$file") || filetype=
  if [[ ! $filetype ]]; then
    log "Unable to get type for file $file"
    return 1
  fi
  if ! is_audio "$filetype"; then
    log "The file $file is not audio"
    return 1
  fi
  local ext
  case ${filetype##audio/} in
    speex) ext=spx ;;
    ogg | x-ogg | vorbis) ext=ogg ;;
    opus) ext=opus ;;
    mpeg | mp3 | x-mpeg) ext=mp3 ;;
    mp4 | m4a | x-m4a) ext=m4a ;;
    aac | x-aac | x-hx-aac-adts) ext=aac ;;
    flac | x-flac) ext=flac ;;
    wav | x-wav | wave | vnd.wave) ext=wav ;;
    *)
      log "Unrecognized file type $filetype"
      return 1
      ;;
  esac
  echo "$ext"
}
# _mark_seen FILE URL
# Append URL to the seen-list FILE, creating its directory if needed.
_mark_seen() {
  mkdir -p "$(dirname "$1")"
  printf '%s\n' "$2" >>"$1"
}

# download URL NAME TITLE
# Fetch the episode at URL for feed NAME into $podcasts/NAME/<md5 of url>.<ext>
# and add it to the feed's playlist under TITLE.
#
# URLs listed in $cache/NAME/seen are skipped. A URL is recorded there once
# the server reports a non-audio content type for it, or once its data has
# been fetched; a failed transfer is NOT recorded, so it is retried on the
# next run.
#
# Returns non-zero for an invalid URL, a failed transfer, or a downloaded
# file that is not usable audio.
download() {
  local url=${1:?missing audio url}
  local name=${2:?missing feed name}
  local title=${3:?missing episode title}
  if ! is_url "$url"; then
    log "Invalid url $url"
    return 1
  fi
  local dir=$podcasts/$name
  local seen=$cache/$name/seen

  # if this url has been encountered before, skip it
  # (exact literal line match: a url is full of regex metacharacters)
  if [[ -f $seen ]] && grep -q -x -F -- "$url" "$seen"; then
    return 0
  fi

  # hash helps avoid collisions
  # easy solution for sane filenames
  local id
  id=$(md5sum <<<"$url" | cut -f1 -d' ')

  # check for existing file (any extension)
  local audio= candidate
  for candidate in "$dir/$id".*; do
    if [[ -f $candidate ]]; then
      audio=$candidate
      break
    fi
  done

  if [[ ! $audio ]]; then
    # declared separately so failures are not hidden by `local`
    local ctype temp ext
    # eliminate non-audio early
    ctype=$(content_type "$url") || ctype=
    if ! is_audio "$ctype"; then
      # only remember the url when the server actually named a type;
      # an empty answer may just be a transient failure
      if [[ $ctype ]]; then
        _mark_seen "$seen" "$url"
      fi
      return 0
    fi
    temp=$(get "$url") || return 1
    # the data arrived: record the url now so that a file which turns out
    # to be unusable is not fetched again on every run
    _mark_seen "$seen" "$url"
    if ! ext=$(audio_extension "$temp"); then
      rm -f -- "$temp"
      return 1
    fi
    audio=$dir/$id.$ext
    mkdir -p "$dir"
    mv -f "$temp" "$audio"
    log "Saved to $audio"
    # try to avoid getting banned
    # pause briefly between downloads
    sleep 0.5
  fi
  add_to_m3u "$audio" "$name" "$title"
}
# add_to_m3u AUDIO NAME TITLE
# Make sure the playlist of feed NAME contains AUDIO, labelled "NAME: TITLE".
#   - when $playlists is an existing directory the playlist is
#     $playlists/NAME.m3u and entries are full paths;
#   - otherwise it is $podcasts/NAME/playlist.m3u and entries are bare
#     file names (the playlist sits next to the audio).
# The playlist is created with an #EXTM3U header when missing; an entry that
# is already present is not added again.
add_to_m3u() {
  local audio=${1:?missing audio file}
  local name=${2:?missing feed name}
  local title=${3:?missing episode title}
  # playlist go with the audio or separate?
  local m3u entry
  if [[ -d $playlists ]]; then
    m3u=$playlists/$name.m3u
    # playlists live apart from audio, so full path
    entry=$audio
  else
    m3u=$podcasts/$name/playlist.m3u
    # if no playlist dir, no need for full path
    entry=${audio##*/}
  fi
  if [[ ! -f $m3u ]]; then
    mkdir -p "$(dirname "$m3u")"
    echo "#EXTM3U" >"$m3u"
  fi
  # exact literal line match: file names contain regex metacharacters ('.')
  # and one entry can be a substring of another
  if ! grep -q -x -F -- "$entry" "$m3u"; then
    log "Adding $title to $m3u"
    # printf keeps backslashes in titles intact (echo -e would expand them)
    printf '#EXTINF:0,%s: %s\n%s\n' "$name" "$title" "$entry" >>"$m3u"
  fi
}
# _xpath_literal STRING
# Print STRING as an XPath 1.0 string literal. XPath has no escape
# character, so the quote style is chosen to suit the content and
# concat() is used when both quote characters occur.
_xpath_literal() {
  local s=$1 q="'"
  if [[ $s != *"$q"* ]]; then
    printf "'%s'" "$s"
  elif [[ $s != *'"'* ]]; then
    printf '"%s"' "$s"
  else
    local sep="$q, \"$q\", $q"
    printf "concat('%s')" "${s//$q/$sep}"
  fi
}

# update_url URL NAME
# Refresh the cached feed for URL and download every enclosure of every
# <item> that has a <guid>, storing them under feed NAME.
# Returns non-zero when the feed itself cannot be fetched. A failing episode
# is logged and skipped so it cannot block the rest of the feed.
update_url() {
  local url=${1:?missing feed url}
  local name=${2:?missing feed name}
  log "Checking $name"
  # declared separately so a failing get_feed_xml is not hidden by `local`
  local xml
  xml=$(get_feed_xml "$url" "$name") || return 1
  local guid title item enclosure
  # both loops are pipeline stages and therefore run in subshells:
  # variables set inside them are not visible after the loop
  xpath "$xml" "//item/guid" |
    while read -r guid; do
      [[ $guid ]] || continue
      item="//item[guid=$(_xpath_literal "$guid")]"
      title=$(xpath "$xml" "$item/title")
      if [[ ! $title ]]; then
        log "Item $guid has no title"
        title=$guid # better than nothing
      fi
      xpath "$xml" "$item//enclosure/@url" |
        while read -r enclosure; do
          # no enclosures for item
          [[ $enclosure ]] || continue
          if ! is_url "$enclosure"; then
            log "Invalid enclosure url $enclosure"
            continue
          fi
          if ! download "$enclosure" "$name" "$title"; then
            log "Unable to download $enclosure"
            continue
          fi
        done
    done
  return 0
}
# import_urls FILE
# Import every feed listed in the text file FILE (one "<url> [name]" per
# line; blank lines and lines starting with '#' are ignored).
# Stops and returns non-zero at the first feed that cannot be imported, or
# when FILE is not readable.
import_urls() {
  # NOTE: this local must not be called "urls". bash locals are visible to
  # called functions, so such a name would make import_url read and write
  # FILE instead of the configured urls file.
  local file=${1:?missing urls file}
  log "Importing urls file $file"
  if [[ ! -r $file ]]; then
    log "Unable to read urls file $file"
    return 1
  fi
  local url name
  # the list is read on fd 3 so that stdin stays available to the
  # interactive prompt issued while importing each feed
  while read -r url name <&3 || [[ $url ]]; do
    [[ $url && $url != \#* ]] || continue
    import_url "$url" "$name" || return 1
  done 3<"$file"
}
# import_url URL [NAME]
# Add the feed at URL to the configured urls file ($urls).
#   - a URL that is already listed is left alone (returns 0);
#   - the feed is fetched once to prove it is reachable and, when NAME is
#     not given, to suggest the channel title as the name;
#   - the user is asked to confirm or change the name.
# Returns non-zero when the feed cannot be fetched or no name was obtained.
import_url() {
  local url=${1:?missing feed url}
  local name=$2
  log "Importing $url"

  # check first, before any network access or prompt; the comparison is a
  # literal one on the first field because urls contain regex metacharacters
  local known
  if [[ -f $urls ]]; then
    while read -r known _ || [[ $known ]]; do
      if [[ $known == "$url" ]]; then
        log "Already imported $url"
        return 0
      fi
    done <"$urls"
  fi

  # declared separately so a failing get is not hidden by `local`
  local temp
  temp=$(get "$url") || return 1
  # try to get title if it wasn't explicitly set
  [[ $name ]] || name=$(xpath "$temp" "//channel/title")
  rm -f -- "$temp"

  # ask user to make any changes
  name=$(input "feed name" "$name") || name=
  if [[ ! $name ]]; then
    log "Unable to get feed name for $url"
    return 1
  fi
  mkdir -p "$(dirname "$urls")"
  # printf keeps backslashes in the name intact (echo -e would expand them)
  printf '%s\t%s\n' "$url" "$name" >>"$urls"
}
# import_opml FILE
# Import every feed referenced by an <outline xmlUrl="..."> element of the
# OPML file FILE, suggesting the outline's text attribute as the feed name.
# A feed that cannot be imported is logged and skipped.
import_opml() {
  local opml=${1:?missing opml file}
  log "Importing opml file $opml"
  local url name
  # the list is read on fd 3 so that stdin stays available to the
  # interactive prompt issued while importing each feed
  while read -r url <&3; do
    is_url "$url" || continue
    name=$(xpath "$opml" "//outline[@xmlUrl='$url']/@text")
    import_url "$url" "$name" || log "Unable to import $url"
  done 3< <(xpath "$opml" "//outline/@xmlUrl")
}
# where audio files go: $PODCASTS, else the current directory
podcasts=${PODCASTS:-$PWD}
# where playlists go: $PLAYLISTS (may be empty: playlists then live next to
# the audio files)
playlists=$PLAYLISTS

# parse command line options
#   -d dir      overrides $PODCASTS
#   -p dir      overrides $PLAYLISTS
#   -q pattern  restricts the update to matching configured feeds
#   -h          prints the usage text
# anything else (unknown option, missing argument) prints the usage text
# and exits with status 1
unset -v query # do not inherit a value from the environment
OPTIND=1
while getopts ":hd:p:q:" option; do
  case $option in
    q) query=$OPTARG ;;
    d) podcasts=$OPTARG ;;
    p) playlists=$OPTARG ;;
    h) echo "$usage" >&2; exit 0 ;;
    *) echo "$usage" >&2; exit 1 ;;
  esac
done
# leave only the source arguments in "$@"
shift $((OPTIND - 1))
# With source arguments: import each of them.
# Without: update every feed listed in the configured urls file
# (optionally restricted by the -q pattern).
if (( $# > 0 )); then
  for arg in "$@"; do
    # reset for every argument: a value left over from the previous one
    # would be reused for an unusable argument, and an empty value would
    # make the shell execute the argument itself as a command
    import=
    if [[ -f $arg ]]; then
      filetype=$(file --mime-type -bL "$arg")
      case $filetype in
        text/*ml) import=import_opml ;;
        text/plain) import=import_urls ;;
        *) log "Unusable type $filetype for file $arg" ;;
      esac
    else
      case "$arg" in
        http*) import=import_url ;;
        *) log "Unusable argument $arg" ;;
      esac
    fi
    [[ $import ]] || continue
    "$import" "$arg" || log "Unable to import $arg"
  done
elif [[ ! -r $urls ]]; then
  log "No feeds configured in $urls"
else
  # read the list on fd 3 so nothing run inside the loop can swallow it
  while read -r url name <&3 || [[ $url ]]; do
    # skip blank lines and comments
    [[ $url && $url != \#* ]] || continue
    if [[ $query ]]; then
      grep -q -- "$query" <<<"$url $name" || continue
    fi
    update_url "$url" "$name" || log "Unable to update $url"
  done 3<"$urls"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment