Last active
April 24, 2018 05:34
-
-
Save gvanem/23f28cf8b66ad09926f5ed2cee557624 to your computer and use it in GitHub Desktop.
Jan Olsen Podcast/MP3 download Python script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# !/usr/bin/env python | |
# -*- coding: iso-8859-1 -*- | |
r""" | |
A Python script to download all MP3 files for the character Jan Olsen | |
in the NRK P1 radio-show Lønsj. This will download from NRK's podcast | |
location at: | |
%s (an XML-file),
unless this RSS-file already exists in %s. This should
cause all the 153 MP3 files to be downloaded into the current directory | |
if not already present. | |
NOTE: Needs Wget, curl or PyCurl package to actually download the MP3 files. | |
""" | |
from __future__ import print_function | |
# | |
# This script is also here:
# https://gist.githubusercontent.com/gvanem/23f28cf8b66ad09926f5ed2cee557624/raw/b9690a99729d8f537582a7133b213e814ccf7d97/Jan-Olsen-MP3.py | |
# | |
# Hence, it's possible to launch it from Github directly. E.g.: | |
# curl -s https://gist.githubusercontent.com/gvanem/23f28cf8b66ad09926f5ed2cee557624/raw/b9690a99729d8f537582a7133b213e814ccf7d97/Jan-Olsen-MP3.py | python - | |
# | |
import getopt
import inspect
import os
import subprocess
import sys
import tempfile
if sys.version[0] >= '3': | |
from xml.etree import ElementTree | |
xmllib = ElementTree | |
else: | |
import xmllib | |
# Optional colour support: use the 'colorama' package when it is installed.
# Without it the colour names become empty strings, so the '%s' colour
# slots in the format strings below simply print nothing.
try:
    from colorama import init, Fore, Style
    init()
    _RESET = Style.RESET_ALL
    _YELLOW = Fore.YELLOW + Style.BRIGHT
    _RED = Fore.RED + Style.BRIGHT
    _GREEN = Fore.GREEN + Style.BRIGHT
except ImportError:
    _GREEN = _RED = _YELLOW = _RESET = ''
# Directory holding the cached RSS file.  Prefer the platform's customary
# environment variable, but fall back to tempfile.gettempdir() when it is
# unset or empty: os.getenv() then returns None and os.path.join() below
# would raise a TypeError (TMP is frequently unset on Unix).
if sys.platform == 'win32':
    tmp = os.getenv ("TEMP")
else:
    tmp = os.getenv ("TMP")
if not tmp:
    tmp = tempfile.gettempdir()

#
# Globals:
#
rss_url = 'http://podkast.nrk.no/program/jan_olsen_fra_loensj.rss'
rss_file = os.path.join (tmp, 'jan_olsen_fra_loensj.rss')
debug_level = 0    # incremented by each '-d' option
dry_run = 0        # set by '-n'
use_curl = 0       # set by '-c'
use_pycurl = 0     # set by '-p'
use_wget = 1       # the default download method
this_prog = sys.argv[0]
def trace (s, level = 1):
    """Print the message 's' when the global 'debug_level' is >= 'level'.

    s:     the text to print; no newline is appended.
    level: the minimum 'debug_level' needed for the message to show.
           Level 0 messages are always shown.

    When debugging is enabled (debug_level >= 1) the message is prefixed
    with "[<debug_level>/<line>]: ", where <line> is the line number of the
    caller.  A leading newline in 's' is printed before that prefix.
    """
    if debug_level < level:
        return

    if debug_level >= 1:
        # startswith() instead of s[0]: an empty message must not raise.
        if s.startswith ('\n'):
            print ("")
            s = s[1:]
        line = sys._getframe (1).f_lineno    # frame 1 is our caller
        print ("%s[%d/%3d]: " % (_GREEN, debug_level, line), end="")
    print ("%s%s%s" % (_YELLOW, s, _RESET), end="")
def fatal (s):
    """Show 's' via trace() at level 0, then terminate with exit status 1."""
    trace (s, level = 0)
    raise SystemExit (1)
def parse_cmd_line ():
    """Parse sys.argv and update the global option flags.

    Options:
      -h  show the usage text (usage() exits the program).
      -d  increase 'debug_level' by one; may be repeated.
      -n  enable 'dry_run'.
      -c  download with the external 'curl' program.
      -p  download with the PyCurl package.

    The download methods are mutually exclusive; when several are given the
    last one on the command line wins.  An unknown option is reported
    through usage().
    """
    # All flags are module globals.  Declaring them here (in particular
    # 'use_wget', which used to be assigned as a mere local) makes every
    # assignment below reach the module-level variable.
    global debug_level, dry_run, use_curl, use_pycurl, use_wget

    try:
        opts, _ = getopt.getopt (sys.argv[1:], "hcdnp")
    except getopt.GetoptError as e:
        usage ("%sError:%s %s" % (_RED, _RESET, e.msg))   # does not return

    for o, _ in opts:
        if o == '-h':
            usage()
        elif o == '-d':
            debug_level += 1
        elif o == '-n':
            dry_run = 1
        elif o == '-c':
            use_curl = 1
            use_pycurl = 0     # otherwise '-p -c' would leave both set
            use_wget = 0
        elif o == '-p':
            use_pycurl = 1
            use_curl = 0
            use_wget = 0
# | |
# Parse the rss_file and extract all "enclosure" tags like: | |
# <enclosure url="http://podkast.nrk.no/fil/jan_olsen_fra_loensj/jan_olsen_fra_loensj_2016-3-11_913_3202.MP3?stat=1" | |
# length="6354361" type="audio/mpeg"> | |
# | |
# Remove anything after a '?' and append to the 'self.mp3_URLs[]'. Like: | |
# https://podkast.nrk.no/fil/jan_olsen_fra_loensj/DA0_D925167EF4DB4D51B4F135D895F52881.MP3 | |
# | |
# Return array of URLs with these MP3-files. | |
# | |
# Note: Not sorted on date or name. | |
# | |
def parse_rss_file (f):
    """Extract the MP3 URLs and their sizes from an RSS document.

    f: an open file-like object; f.read() must return the whole XML
       document (bytes or text).

    Every <enclosure> element with a 'url' attribute adds one entry to each
    of the two returned lists, so they always have the same length.
    Anything from the first '?' on is removed from the URL.  A missing or
    non-numeric 'length' attribute is recorded as size 0.

    Returns the tuple (urls, sizes), in document order (not sorted).
    Calls fatal() if the document is not well-formed XML.
    """
    # Imported locally so this works the same on Python 2.7 and 3.x.  The
    # old 'xmllib' style callbacks (unknown_starttag) are never invoked by
    # xml.etree.ElementTree.XMLParser, which left the lists empty on Python 3.
    from xml.etree import ElementTree

    try:
        root = ElementTree.fromstring (f.read())
    except ElementTree.ParseError as e:
        fatal ("Failed to parse the RSS-file: %s\n" % e)

    mp3_URLs = []
    mp3_sizes = []
    for enclosure in root.iter ("enclosure"):
        for name, value in enclosure.attrib.items():
            trace (' %s=%s\n' % (name, value), 2)

        # partition() leaves a URL without any '?' untouched.
        url = (enclosure.get ("url") or "").partition ('?')[0]
        if not url:
            continue
        try:
            size = int (enclosure.get ("length", 0))
        except ValueError:
            size = 0
        mp3_URLs.append (url)
        mp3_sizes.append (size)

    if debug_level >= 2:
        for url, size in zip (mp3_URLs, mp3_sizes):
            trace ("%5d bytes: %s\n" % (size, url))
    return mp3_URLs, mp3_sizes
# | |
# Download a single file using one of the methods: | |
# the external program 'curl'. | |
# the external program 'wget'. | |
# the Python package 'pycurl'. | |
# | |
# Do it quietly unless 'debug_level' is set. | |
# Returns 0 on success. | |
# | |
def _remove_partial_file (fname):
    """Delete 'fname' if possible; a file that cannot be removed is ignored."""
    try:
        os.remove (fname)
    except OSError:
        pass    # best effort: most likely the file was never created


def _download_with_pycurl (url, outf):
    """Fetch 'url' into the file 'outf' with PyCurl; return 0 on success, else 1."""
    def debug_func (t, buf):
        # Report only libcurl's informational text (0) and received headers (1).
        if t == 0 or t == 1:
            if not isinstance (buf, str):     # bytes on Python 3
                buf = buf.decode ('utf-8', 'replace')
            trace ("dbg: t:%d, %s\n" % (t, buf.rstrip ('\r\n')))

    curl = pycurl.Curl()
    rc = 1
    try:
        with open (outf, "wb") as f:
            curl.setopt (pycurl.URL, url)
            curl.setopt (pycurl.FOLLOWLOCATION, 1)
            curl.setopt (pycurl.MAXREDIRS, 5)
            curl.setopt (pycurl.CONNECTTIMEOUT, 30)
            curl.setopt (pycurl.TIMEOUT, 300)
            curl.setopt (pycurl.WRITEDATA, f)
            curl.setopt (pycurl.VERBOSE, debug_level)
            curl.setopt (pycurl.DEBUGFUNCTION, debug_func)
            curl.perform()
        rc = 0
    except pycurl.error:
        sys.stderr.flush()
    except KeyboardInterrupt:
        _remove_partial_file (outf)
        fatal ('Aborting %s.\n' % "pycurl")
    finally:
        curl.close()

    if rc != 0:
        _remove_partial_file (outf)
    return rc


def download_file (url, outf, exp_size=0):
    """Download 'url' into the file 'outf'.

    The method is selected by the global flags: the external program 'curl'
    ('use_curl'), the Python package 'pycurl' ('use_pycurl') or otherwise
    the external program 'wget'.  The download is quiet unless 'debug_level'
    is set.  In dry-run mode nothing is downloaded and 0 is returned.

    url:      the URL to fetch.
    outf:     name of the file to write.
    exp_size: the expected size in bytes; currently not used.

    Returns 0 on success and a non-zero value on failure.  A partially
    written 'outf' is removed on failure so that a later run does not
    mistake it for a finished download.
    """
    if use_pycurl and not use_curl:
        if dry_run > 0:
            return 0
        return _download_with_pycurl (url, outf)

    # The environment variables that were cleared with 'set VAR= &' in the
    # old shell command line are removed from the child's environment.
    env = os.environ.copy()
    if use_curl:
        prog = "curl"
        if debug_level == 0:
            env.pop ("CURL_MEMDEBUG", None)
            args = ["curl", "-sLo", outf]
        else:
            args = ["curl", "-Lo", outf]
    else:
        prog = "wget"
        env.pop ("WSOCK_TRACE_LEVEL", None)
        args = ["wget", "-o", os.devnull, "-O", outf]
        if debug_level == 0:
            args.append ("-q")
    args.append (url)

    trace ("\ncmd: '%s'\n" % " ".join (args), 1)
    if dry_run > 0:
        return 0

    # An argument list (no shell) is used because 'url' comes from the
    # downloaded RSS feed and must never be interpreted by a shell.
    try:
        rc = subprocess.call (args, env=env)
    except KeyboardInterrupt:
        _remove_partial_file (outf)
        fatal ('Aborting %s.\n' % prog)
    except OSError as e:
        # E.g. the program is not installed or not on the PATH.
        trace ("Failed to run %s: %s\n" % (prog, e), 0)
        return 1

    if rc != 0:
        _remove_partial_file (outf)
    return rc
def download_rss_file (url, fname):
    """Fetch the RSS feed at 'url' into the file 'fname'.

    Reports the number of bytes downloaded (or '??' in dry-run mode, where
    no file is written).  Returns 1 on success and 0 on failure.
    """
    if download_file (url, fname) != 0:
        return 0

    if dry_run != 0:
        trace ("Downloaded ?? bytes for %s\n" % fname, 0)
    else:
        nbytes = os.path.getsize (fname)
        trace ("Downloaded %d bytes for %s\n" % (nbytes, fname), 0)
    return 1
def get_MP3_file_from_url (url):
    """Return the file-name part (the text after the last '/') of an MP3 URL.

    url: a URL that must end in '.mp3' (in any letter case).

    Calls fatal(), which exits the program, when 'url' does not end in
    '.mp3' or contains no '/' after its first character.
    """
    if not url.upper().endswith (".MP3"):
        fatal ("Unexpected URL: '%s'\n" % url)

    # rfind() returns -1 when there is no '/'.  The former rindex() raised
    # a ValueError instead, so the fatal() below could never report it.
    i = url.rfind ('/')
    if i > 0:
        return url [i+1:]
    fatal ("Found no '/*.MP3' in URL: '%s'\n" % url)
def download_mp3_files (urls, sizes):
    """Download every MP3 file in 'urls' into the current directory.

    urls:  list of MP3 URLs.
    sizes: list of the expected sizes in bytes, parallel to 'urls'.  A URL
           without a matching entry is given the size 0.

    A file that already exists in the current directory is skipped.  The
    loop stops at the first download that fails.
    """
    num = len (urls)
    for i, u in enumerate (urls, 1):
        # Do not raise IndexError if 'sizes' is shorter than 'urls'.
        size = sizes[i-1] if i <= len (sizes) else 0
        f = get_MP3_file_from_url (u)
        trace ("%3d/%d: %s... " % (i, num, f), 0)
        if os.path.exists (f):
            print ("Skipping.")
            continue
        if download_file (u, f, size):    # non-zero means failure
            break
        if debug_level == 0:
            print (" size: %d" % size)
############################################## | |
def main():
    """Parse the command line, fetch the RSS file if needed and download the MP3 files.

    The RSS file is downloaded only when 'rss_file' does not exist yet.
    Exits with status 1 if PyCurl was requested but cannot be imported, or
    if downloading the RSS file fails.
    """
    global pycurl

    parse_cmd_line()
    if use_pycurl:
        try:
            import pycurl
        except ImportError:
            fatal ('Failed to import PyCurl.\n')

    if not os.path.exists (rss_file):
        trace ("RSS-file '%s' not found.\n" % rss_file, 0)
        if download_rss_file (rss_url, rss_file) == 0:
            sys.exit (1)

    # In dry-run mode nothing was downloaded, so the file may still be absent.
    if not os.path.exists (rss_file):
        trace ("RSS-file '%s' still not found!!\n" % rss_file, 0)
        return

    # Read as bytes so the XML parser sees the document exactly as stored;
    # the file is closed again before the (long) MP3 downloads start.
    with open (rss_file, 'rb') as f:
        urls, sizes = parse_rss_file (f)
    download_mp3_files (urls, sizes)
def help_line (opt, h_str):
    """Print one line of the option help: the highlighted flag '-<opt>' and its text 'h_str'."""
    fields = (_YELLOW, opt, _RESET, h_str)
    print (" %s-%c%s: %s" % fields)
def usage (e = None):
    """Print the usage text and exit the program.

    e: an optional error message.  When given it is printed first, the
       long description (the module docstring) is left out and the exit
       status is 1.  Without it the full help is printed and the exit
       status is 0.
    """
    if e:
        print (e)
    print ("Usage: %s: %s" % (this_prog, "[-cdnph]"))
    help_line ('c', "use curl to download files (wget is default).")
    help_line ('d', "sets debug level (-dd for level 2).")
    help_line ('n', "run in dry-run mode.")
    help_line ('p', "use PyCurl to download files.")
    help_line ('h', "this help.")
    if not e:
        print (__doc__ % (rss_url, tmp), end="")
    # A command-line error must not look like success to the caller.
    sys.exit (1 if e else 0)
# Run the downloader only when executed as a script (not when imported)
# and finish with exit status 0.
if __name__ == '__main__':
    main()
    raise SystemExit (0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment