|
#!/usr/bin/env python |
|
#
# [Overcast](https://overcast.fm/) on iOS is my primary podcast
# listening method, but I have an occasional need to download podcasts for
# archival or offline listening purposes. This script takes advantage of
# Overcast's permalink and show page to get the podcast author and title
# before downloading the podcast itself from the original page.
#
# Usage: python download_from_overcast.py <overcast_url>
# e.g. python download_from_overcast.py https://overcast.fm/+MWUwqlFc
|
|
|
import os
import re
import sys

try:
    # Python 3: urllib2 / urllib / urlparse live under the urllib package.
    from urllib.parse import urlsplit
    from urllib.request import Request
    from urllib.request import urlopen
    from urllib.request import urlretrieve
except ImportError:
    # Python 2: the original import locations.
    from urllib2 import Request
    from urllib2 import urlopen
    from urllib import urlretrieve
    from urlparse import urlsplit
|
|
|
|
|
def get_title(html_str):
    """Extract the episode title from the page's ``og:title`` meta tag.

    Args:
        html_str: HTML source of the page, as text.

    Returns:
        The title with em dashes (either the ``&mdash;`` entity or the
        literal character) replaced by ``-``, or ``None`` unless exactly one
        ``og:title`` meta tag is found.
    """
    # [^"]+ stops at the attribute's closing quote; a greedy ``.+`` would
    # swallow any further attributes or tags that share the same line.
    titles = re.findall(r'<meta name="og:title" content="([^"]+)"', html_str)
    if len(titles) != 1:
        return None
    title = titles[0]

    # The em dash is written as an escape so this file stays pure ASCII
    # (Python 2 rejects non-ASCII source bytes without a coding declaration).
    # When the page is a Python 2 byte string, compare against UTF-8 bytes.
    em_dash = u"\u2014"
    if not isinstance(title, type(em_dash)):
        em_dash = em_dash.encode("utf-8")
    return title.replace("&mdash;", "-").replace(em_dash, "-")
|
|
|
|
|
def get_description(html_str):
    """Extract the episode description from the ``og:description`` meta tag.

    Args:
        html_str: HTML source of the page, as text.

    Returns:
        The raw ``content`` attribute value, or ``None`` unless exactly one
        ``og:description`` meta tag is found.
    """
    # [^"]+ stops at the attribute's closing quote; a greedy ``.+`` would
    # run on to the last quote on the line and capture unrelated markup.
    desc_re = r'<meta name="og:description" content="([^"]+)"'
    descriptions = re.findall(desc_re, html_str)
    if len(descriptions) != 1:
        return None
    return descriptions[0]
|
|
|
|
|
def get_url(html_string):
    """Find the audio URL in the page's ``<audio><source src="...">`` tag.

    Args:
        html_string: HTML source of the page, as text.

    Returns:
        The ``src`` URL without its fragment, or ``None`` unless exactly one
        ``<source src=...>`` tag is found.
    """
    sources = re.findall(r'<source src="(.+?)"', html_string)
    if len(sources) != 1:
        return None

    # Drop the fragment (e.g. "#t=0"), which urlretrieve flags as invalid.
    # Splitting at "#" instead of chopping a fixed 4 characters keeps URLs
    # intact when the fragment is absent or has a different length.
    url = sources[0].split("#", 1)[0]

    # Inside an HTML attribute "&" is written as "&amp;"; restore it so
    # query strings reach the server unchanged.
    return url.replace("&amp;", "&")
|
|
|
|
|
def _output_extension(url):
    """Return the extension (dot included) of *url*'s path, or ``.mp3``."""
    # Only the path component is inspected so query strings and fragments
    # cannot be mistaken for (or hide) the file extension.
    ext = os.path.splitext(urlsplit(url).path)[1]
    return ext if ext[1:].isalnum() else ".mp3"


def _safe_filename(name):
    """Replace path separators in *name* so it stays a single file name."""
    for separator in ("/", "\\"):
        name = name.replace(separator, "-")
    return name


def download(source_url):
    """Given an Overcast source URL, fetch the audio file it points to.

    The page at *source_url* is fetched, the episode title and audio URL are
    extracted with ``get_title`` and ``get_url``, and the audio is saved in
    the current directory as ``<title><extension>``.

    Args:
        source_url: URL of the Overcast episode page.

    Raises:
        SystemExit: if the title or the audio URL cannot be found.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.11 (KHTML, like Gecko) "
        "Chrome/23.0.1271.64 Safari/537.11",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "Accept-Encoding": "none",
        "Accept-Language": "en-US,en;q=0.8",
        "Connection": "keep-alive",
    }
    req = Request(source_url, None, headers)

    # Close the connection explicitly; try/finally works on Python 2, where
    # the response object is not a context manager.
    response = urlopen(req)
    try:
        source_data = response.read()
    finally:
        response.close()

    # Python 3 returns bytes, but the regex helpers use text patterns.
    # NOTE(review): assumes the page is UTF-8 encoded - confirm.
    if not isinstance(source_data, str):
        source_data = source_data.decode("utf-8", "replace")

    title = get_title(source_data)
    url = get_url(source_data)

    if url is None or title is None:
        sys.exit("Could not find parse URL")

    # The title comes from the remote page, so strip path separators before
    # using it as a local file name.
    output_file = "{}{}".format(_safe_filename(title), _output_extension(url))
    urlretrieve(url, output_file)
|
|
|
|
|
if __name__ == "__main__":
    # Exactly one positional argument is expected: the Overcast permalink.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        # Wrong argument count: exit with the usage line.
        sys.exit("{} <overcast_url>".format(__file__))
    download(cli_args[0])