Skip to content

Instantly share code, notes, and snippets.

@gvanem
Last active April 24, 2018 05:34
Show Gist options
  • Save gvanem/23f28cf8b66ad09926f5ed2cee557624 to your computer and use it in GitHub Desktop.
Save gvanem/23f28cf8b66ad09926f5ed2cee557624 to your computer and use it in GitHub Desktop.
Jan Olsen Podcast/MP3 download Python script
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
r"""
A Python script to download all MP3 files for the character Jan Olsen
in the NRK P1 radio-show Lønsj. This will download from NRK's podcast
location at:
%s (an XML-file),
unless this RSS-file already exists in %s. This should
cause all the 153 MP3 files to be downloaded into the current directory
if not already present.
NOTE: Needs Wget, curl or PyCurl package to actually download the MP3 files.
"""
from __future__ import print_function
#
# This script is also here:
# https://gist.githubusercontent.com/gvanem/23f28cf8b66ad09926f5ed2cee557624/raw/b9690a99729d8f537582a7133b213e814ccf7d97/Jan-Olsen-MP3.py
#
# Hence, it's possible to launch it from Github directly. E.g.:
# curl -s https://gist.githubusercontent.com/gvanem/23f28cf8b66ad09926f5ed2cee557624/raw/b9690a99729d8f537582a7133b213e814ccf7d97/Jan-Olsen-MP3.py | python -
#
import os, sys, getopt, inspect
# Bind the name 'xmllib' to an XML module for the running interpreter.
# Compare the numeric major version: the string test 'sys.version[0] >= "3"'
# only looks at the first character and would misfire on a two-digit major
# version.
# NOTE(review): ElementTree.XMLParser does not offer the callback API of the
# old Python 2 'xmllib.XMLParser' (e.g. 'unknown_starttag'); code that
# subclasses 'xmllib.XMLParser' should be checked on Python 3.
if sys.version_info[0] >= 3:
    from xml.etree import ElementTree
    xmllib = ElementTree
else:
    import xmllib
# Optional colour support: when the 'colorama' package is available the
# '_GREEN', '_RED' and '_YELLOW' prefixes are bright ANSI colours and '_RESET'
# restores the default style. Without it all four are empty strings, so they
# can be embedded in output unconditionally.
try:
    from colorama import init, Fore, Style
    init()
    _GREEN, _RED, _YELLOW = (Fore.GREEN + Style.BRIGHT,
                             Fore.RED + Style.BRIGHT,
                             Fore.YELLOW + Style.BRIGHT)
    _RESET = Style.RESET_ALL
except ImportError:
    _GREEN = _RED = _YELLOW = _RESET = ''
# Directory in which the downloaded RSS-file is cached: %TEMP% on Windows,
# $TMP elsewhere.
import tempfile
if sys.platform == 'win32':
    tmp = os.getenv ("TEMP")
else:
    tmp = os.getenv ("TMP")
if not tmp:
    # The variable need not be set, and os.path.join() below cannot take
    # None, so fall back to the system's default temporary directory.
    tmp = tempfile.gettempdir()
#
# Globals:
#
rss_url = 'http://podkast.nrk.no/program/jan_olsen_fra_loensj.rss'
rss_file = os.path.join (tmp, 'jan_olsen_fra_loensj.rss')
debug_level = 0     # incremented once per '-d' option
dry_run = 0         # set by '-n'
use_curl = 0        # set by '-c'
use_pycurl = 0      # set by '-p'
use_wget = 1        # the default download method
this_prog = sys.argv[0]
def trace (s, level = 1):
    """Print the message 's' when the global 'debug_level' is at least 'level'.

    Nothing is printed otherwise. No newline is appended; callers put '\\n'
    in 's' themselves. When 'debug_level' is 1 or more the message is
    prefixed with '[debug_level/line]: ', where 'line' is the line number
    of the caller. A leading newline in 's' is then emitted before that
    prefix rather than after it.
    """
    if debug_level < level:
        return
    if debug_level >= 1:
        # Frame 1 is whoever called trace().
        line = sys._getframe (1).f_lineno
        # startswith() also copes with an empty message, which 's[0]' does not.
        if s.startswith ('\n'):
            print ("")
            s = s[1:]
        print ("%s[%d/%3d]: " % (_GREEN, debug_level, line), end="")
    print ("%s%s%s" % (_YELLOW, s, _RESET), end="")
def fatal (s):
    """Print 's' regardless of the debug level, then exit with status 1."""
    trace (s, level = 0)
    raise SystemExit (1)
def parse_cmd_line ():
    """Parse the options in 'sys.argv' and update the global settings.

    -h  print the usage text (via 'usage()').
    -d  increment 'debug_level'; may be repeated.
    -n  enable 'dry_run'.
    -c  download with curl.
    -p  download with PyCurl.

    An unknown option is reported through 'usage()'.
    """
    # One declaration for every setting assigned below. 'use_wget' used to
    # be missing, so its assignments only created a dead local.
    global debug_level, dry_run, use_curl, use_pycurl, use_wget
    try:
        opts, _args = getopt.getopt (sys.argv[1:], "hcdnp")
    except getopt.GetoptError as e:
        usage ("%sError:%s %s" % (_RED, _RESET, e.msg))
    for opt, _value in opts:
        if opt == '-h':
            usage()
        elif opt == '-d':
            debug_level += 1
        elif opt == '-n':
            dry_run = 1
        # The download methods are mutually exclusive; the last option given
        # wins, so exactly one method is ever selected.
        elif opt == '-c':
            use_curl, use_pycurl, use_wget = 1, 0, 0
        elif opt == '-p':
            use_curl, use_pycurl, use_wget = 0, 1, 0
#
# Parse the rss_file and extract all "enclosure" tags like:
#   <enclosure url="http://podkast.nrk.no/fil/jan_olsen_fra_loensj/jan_olsen_fra_loensj_2016-3-11_913_3202.MP3?stat=1"
#              length="6354361" type="audio/mpeg">
#
# Remove anything after a '?' in the 'url' attribute. Like:
#   http://podkast.nrk.no/fil/jan_olsen_fra_loensj/jan_olsen_fra_loensj_2016-3-11_913_3202.MP3
#
# Return an array of URLs with these MP3-files and a parallel array with
# their sizes ('length' attribute; 0 when missing or not a number).
#
# Note: Not sorted on date or name.
#
def parse_rss_file (f):
    """Return '(urls, sizes)' for all enclosures in the open RSS-file 'f'.

    'f' is a file object with a 'read()' method. The two lists always have
    the same length. Enclosures without a 'url' attribute are ignored.
    Malformed XML does not raise: parsing stops at the error and whatever
    was collected up to that point is returned.
    """
    # Imported here because the module-level name 'xmllib' refers to
    # different modules on Python 2 and 3; ElementTree exists in both.
    from xml.etree import ElementTree

    mp3_URLs = []
    mp3_sizes = []
    try:
        # The attributes of an element are complete at its "start" event,
        # so there is no need to wait for the whole tree.
        for _event, elem in ElementTree.iterparse (f, events = ("start",)):
            # Drop a '{namespace}' prefix, if any, before matching the tag.
            tag = elem.tag.rsplit ('}', 1)[-1]
            if not elem.attrib or not tag.startswith ("enclosure"):
                continue
            for name, value in elem.attrib.items():
                trace (' %s=%s\n' % (name, value), 2)
            url = elem.attrib.get ("url")
            if not url:
                continue
            # split() also handles URLs without any '?' part.
            mp3_URLs.append (url.split ('?', 1)[0])
            try:
                size = int (elem.attrib.get ("length", 0))
            except ValueError:
                size = 0
            # Exactly one size per URL keeps the two lists in step.
            mp3_sizes.append (size)
    except ElementTree.ParseError as e:
        trace ("Stopped parsing the RSS-file; malformed XML: %s\n" % e, 1)
    if debug_level >= 2:
        for url, size in zip (mp3_URLs, mp3_sizes):
            trace ("%5d bytes: %s\n" % (size, url))
    return mp3_URLs, mp3_sizes
#
# Download a single file using one of the methods:
#   the external program 'curl'.
#   the external program 'wget'.
#   the Python package 'pycurl'.
#
# Do it quietly unless 'debug_level' is set.
# Returns 0 on success.
#
def _remove_quietly (path):
    """Delete 'path'; a file that is missing or cannot be removed is ignored."""
    try:
        os.remove (path)
    except OSError:
        pass

def _pycurl_download (url, outf):
    """Fetch 'url' into the file 'outf' using the global 'pycurl' module.

    Returns 0 on success and 1 when PyCurl raised an error. An incomplete
    'outf' is removed on failure. Ctrl-C removes it too and ends the script
    through 'fatal()'.
    """
    def debug_func (t, buf):
        # Only types 0 and 1 are reported.
        # NOTE(review): presumably libcurl's informational text and incoming
        # headers -- confirm against the PycURL documentation.
        if t == 0 or t == 1:
            # NOTE(review): assumes PyCurl passes bytes on Python 3 --
            # confirm. They are decoded here before the text handling.
            if not isinstance (buf, str):
                buf = buf.decode ('iso-8859-1')
            trace ("dbg: t:%d, %s\n" % (t, buf.rstrip ('\r\n')))

    rc = 1
    interrupted = False
    f = open (outf, "wb")
    curl = pycurl.Curl()
    try:
        curl.setopt (pycurl.URL, url)
        curl.setopt (pycurl.FOLLOWLOCATION, 1)
        curl.setopt (pycurl.MAXREDIRS, 5)
        curl.setopt (pycurl.CONNECTTIMEOUT, 30)
        curl.setopt (pycurl.TIMEOUT, 300)
        curl.setopt (pycurl.WRITEDATA, f)
        curl.setopt (pycurl.VERBOSE, debug_level)
        curl.setopt (pycurl.DEBUGFUNCTION, debug_func)
        curl.perform()
        rc = 0
    except pycurl.error as e:
        sys.stderr.flush()
        trace ("PyCurl error: %s\n" % e, 1)
    except KeyboardInterrupt:
        interrupted = True
    finally:
        # Release both handles on every path; the file must be closed before
        # it can be removed on Windows.
        curl.close()
        f.close()
    if rc != 0:
        _remove_quietly (outf)
    if interrupted:
        fatal ('Aborting %s.\n' % "pycurl")
    return rc

def download_file (url, outf, exp_size=0):
    """Download 'url' to the file 'outf'; return 0 on success, non-zero on failure.

    The method follows the global flags: curl if 'use_curl' is set, else
    PyCurl if 'use_pycurl' is set, else wget. In dry-run mode nothing is
    downloaded and 0 is returned. A partial 'outf' is removed on failure so
    that a later run does not mistake it for a finished download.
    'exp_size' is accepted for callers but not used.
    """
    # Local import; this module is not imported at file level.
    import subprocess

    if use_pycurl and not use_curl:
        if dry_run > 0:
            return 0
        return _pycurl_download (url, outf)

    # The programs are started without a shell, with the URL as a separate
    # argument: the URL comes from a downloaded feed and must not be
    # interpreted by a command interpreter. The variables that the old
    # 'set VAR= &' prefix cleared are dropped from the child's environment.
    env = dict (os.environ)
    if use_curl:
        prog = "curl"
        args = ["curl", "-Lo", outf]
        if debug_level == 0:
            args[1] = "-sLo"
            env.pop ("CURL_MEMDEBUG", None)
    else:
        prog = "wget"
        env.pop ("WSOCK_TRACE_LEVEL", None)
        args = ["wget", "-o", os.devnull, "-O", outf]
        if debug_level == 0:
            args.append ("-q")
    args.append (url)
    trace ("\ncmd: '%s'\n" % ' '.join (args), 1)
    if dry_run > 0:
        return 0
    try:
        rc = subprocess.call (args, env = env)
    except OSError as e:
        # E.g. the program is not installed or not on the PATH.
        trace ("Failed to run %s: %s\n" % (prog, e), 0)
        rc = 1
    except KeyboardInterrupt:
        _remove_quietly (outf)
        fatal ('Aborting %s.\n' % prog)
    if rc != 0:
        _remove_quietly (outf)
    return rc
def download_rss_file (url, fname):
    """Download the RSS-file at 'url' to 'fname'.

    Returns 1 on success, after printing the number of bytes downloaded
    ('??' in dry-run mode, where no file is written), and 0 on failure.
    """
    if download_file (url, fname) != 0:
        return 0
    if dry_run == 0:
        msg = "Downloaded %d bytes for %s\n" % (os.path.getsize(fname), fname)
    else:
        msg = "Downloaded ?? bytes for %s\n" % fname
    trace (msg, 0)
    return 1
def get_MP3_file_from_url (url):
    """Return the file-name part (the text after the last '/') of an MP3 URL.

    Ends the script through 'fatal()' if 'url' does not end in '.MP3'
    (any letter case) or has no '/' before the file-name.
    """
    if not url.upper().endswith (".MP3"):
        fatal ("Unexpected URL: '%s'\n" % url)
    # rfind() returns -1 when there is no '/', so the error below is
    # reached; rindex() raised ValueError instead.
    i = url.rfind ('/')
    if i > 0:
        return url [i+1:]
    fatal ("Found no '/*.MP3' in URL: '%s'\n" % url)
def download_mp3_files (urls, sizes):
    """Download every MP3 in 'urls' into the current directory.

    'sizes' holds the expected size of each URL at the same index; a
    missing entry is treated as 0. Files that already exist are skipped.
    The loop stops at the first failed download.
    """
    num = len (urls)
    for i, u in enumerate (urls, 1):
        # Tolerate a 'sizes' list that is shorter than 'urls'.
        size = sizes[i - 1] if i <= len (sizes) else 0
        f = get_MP3_file_from_url (u)
        trace ("%3d/%d: %s... " % (i, num, f), 0)
        if os.path.exists (f):
            print ("Skipping.")
            continue
        if download_file (u, f, size):
            # Finish the progress line instead of stopping silently.
            print ("Failed.")
            break
        if debug_level == 0:
            print (" size: %d" % size)
##############################################
def main():
    """Script entry: fetch the RSS-file if it is not cached, then the MP3 files.

    Exits with status 1 when PyCurl was requested but cannot be imported,
    or when the RSS-file cannot be downloaded.
    """
    # Makes the imported module available to the download code as a global.
    global pycurl
    parse_cmd_line()
    if use_pycurl:
        try:
            import pycurl
        except ImportError:
            fatal ('Failed to import PyCurl.\n')
    if not os.path.exists (rss_file):
        trace ("RSS-file '%s' not found.\n" % rss_file, 0)
        if download_rss_file (rss_url, rss_file) == 0:
            sys.exit (1)
    if not os.path.exists (rss_file):
        # Reached e.g. in dry-run mode, where nothing was downloaded.
        trace ("RSS-file '%s' still not found!!\n" % rss_file, 0)
        return
    # Binary mode leaves the decoding to the XML parser, which follows the
    # encoding declared in the file rather than the locale's. The file is
    # closed again before the downloads start.
    with open (rss_file, 'rb') as f:
        urls, sizes = parse_rss_file (f)
    download_mp3_files (urls, sizes)
def help_line (opt, h_str):
    """Print one help line for the option letter 'opt' with the text 'h_str'."""
    fields = (_YELLOW, opt, _RESET, h_str)
    print (" %s-%c%s: %s" % fields)
def usage (e = None):
    """Print the option summary and exit.

    With an error message 'e', the message is printed first and the exit
    status is 1. Without it, the module's description is printed after the
    options and the exit status is 0.
    """
    if e:
        print (e)
    print ("Usage: %s: %s" % (this_prog, "[-cdnph]"))
    help_line ('c', "use curl to download files (wget is default).")
    help_line ('d', "sets debug level (-dd for level 2).")
    help_line ('n', "run in dry-run mode.")
    help_line ('p', "use PyCurl to download files.")
    help_line ('h', "this help.")
    # '__doc__' is None when Python runs with docstrings stripped (-OO).
    if not e and __doc__:
        print (__doc__ % (rss_url, tmp), end="")
    # A usage error must not look like success to the calling shell.
    sys.exit (1 if e else 0)
# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
    raise SystemExit (0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment