gvanem/Jan-Olsen-MP3.py

## Jan-Olsen-MP3.py
# !/usr/bin/env python
# -*- coding: iso-8859-1 -*-
r"""
  A Python script to download all MP3 files for the character Jan Olsen
  in the NRK P1 radio-show Lønsj. This will download from NRK's podcast
  location at:
    %s  (a XML-file).

  unless this RSS-file already exists in %s. This should
  cause all the 153 MP3 files to be downloaded into the current directory
  if not already present.

  NOTE: Needs Wget, curl or PyCurl package to actually download the MP3 files.
"""

from __future__ import print_function

#
# This script is also here:
#   https://gist.githubusercontent.com/gvanem/23f28cf8b66ad09926f5ed2cee557624/raw/b9690a99729d8f537582a7133b213e814ccf7d97/Jan-Olsen-MP3.py
#
# Hence, it's possible to launch it from Github directly. E.g.:
#   curl -s https://gist.githubusercontent.com/gvanem/23f28cf8b66ad09926f5ed2cee557624/raw/b9690a99729d8f537582a7133b213e814ccf7d97/Jan-Olsen-MP3.py | python -
#
import getopt
import inspect
import os
import sys
import tempfile

# Pick the XML module.  The major version is compared numerically; the old
# string test `sys.version[0] >= '3'` would mis-classify Python 10 and later.
if sys.version_info[0] >= 3:
  from xml.etree import ElementTree
  xmllib = ElementTree
else:
  import xmllib

# Colours are optional: when colorama cannot be imported every colour
# prefix is an empty string, so the format strings below still work.
try:
  from colorama import init, Fore, Style
  init()
  _GREEN  = Fore.GREEN  + Style.BRIGHT
  _RED    = Fore.RED    + Style.BRIGHT
  _YELLOW = Fore.YELLOW + Style.BRIGHT
  _RESET  = Style.RESET_ALL

except ImportError:
  _GREEN = _RED = _YELLOW = _RESET = ''

# Directory holding the cached RSS file.  The environment variable may be
# unset (TMP usually is on POSIX); os.getenv() then returns None and
# os.path.join() would raise, so fall back to the platform default.
tmp = os.getenv ("TEMP" if sys.platform == 'win32' else "TMP") or tempfile.gettempdir()

#
# Globals:
#
rss_url     = 'http://podkast.nrk.no/program/jan_olsen_fra_loensj.rss'
rss_file    = os.path.join (tmp, 'jan_olsen_fra_loensj.rss')
debug_level = 0                 # raised by one for every '-d'
dry_run     = 0                 # set by '-n'
use_curl    = 0                 # set by '-c'
use_pycurl  = 0                 # set by '-p'
use_wget    = 1                 # the default download method
this_prog   = sys.argv[0]

def trace (s, level = 1):
  """
  Print the message `s` when the global `debug_level` is at least `level`.

  No newline is appended; callers put their own '\n' in `s`.
  When `debug_level` is 1 or more, the message is prefixed with
  "[debug_level/line]: " where `line` is the line number of the caller.
  A leading '\n' in `s` is emitted before that prefix.
  An empty `s` is accepted.
  """
  if debug_level < level:
    return

  if debug_level >= 1:
    # Frame 1 is the caller of trace().
    line = sys._getframe (1).f_lineno
    # startswith() rather than s[0], which raises IndexError on ''.
    if s.startswith ('\n'):
      print ("")
      s = s[1:]
    print ("%s[%d/%3d]: " % (_GREEN, debug_level, line), end="")
  print ("%s%s%s" % (_YELLOW, s, _RESET), end="")

def fatal (s):
  """
  Show the message `s` regardless of the debug level, then terminate
  the program with exit status 1.
  """
  trace (s, level = 0)
  raise SystemExit (1)

def parse_cmd_line ():
  """
  Parse the options in `sys.argv[1:]` and update the global settings.

    -h   show the usage text and exit.
    -d   raise `debug_level` by one (may be repeated).
    -n   turn on `dry_run`.
    -c   download with the external 'curl' program.
    -p   download with the 'pycurl' package.

  The download methods are mutually exclusive: when both -c and -p are
  given the last one wins.  An unknown option is reported through usage(),
  which exits.
  """
  # Every flag written below must be declared global; an undeclared one
  # would silently become a function-local variable.
  global debug_level, dry_run, use_curl, use_pycurl, use_wget

  try:
    opts, _args = getopt.getopt (sys.argv[1:], "hcdnp")
  except getopt.GetoptError as e:
    usage ("%sError:%s %s" % (_RED, _RESET, e.msg))
    return      # only reached if usage() ever stops exiting

  for o, _value in opts:
    if o == '-h':
      usage()
    elif o == '-d':
      debug_level += 1
    elif o == '-n':
      dry_run = 1
    elif o == '-c':
      use_curl, use_pycurl, use_wget = 1, 0, 0
    elif o == '-p':
      use_curl, use_pycurl, use_wget = 0, 1, 0


#
# Parse the rss_file and extract all "enclosure" tags like:
#   <enclosure url="http://podkast.nrk.no/fil/jan_olsen_fra_loensj/jan_olsen_fra_loensj_2016-3-11_913_3202.MP3?stat=1"
#    length="6354361" type="audio/mpeg">
#
# Anything from a '?' onwards is removed from the URL. Like:
#   https://podkast.nrk.no/fil/jan_olsen_fra_loensj/DA0_D925167EF4DB4D51B4F135D895F52881.MP3
#
# Return one list with the URLs of these MP3-files and one list with their sizes.
#
# Note: Not sorted on date or name.
#
def parse_rss_file (f):
  """
  Collect the MP3 enclosures from the RSS document in the open file `f`.

  `f` may be opened in text or in binary mode.
  Returns the tuple `(urls, sizes)`; the two lists always have the same
  length and `sizes[i]` is the 'length' attribute belonging to `urls[i]`
  (0 when that attribute is missing or not a number).  Enclosures without
  a 'url' attribute are skipped.

  A document that is not well-formed does not raise: everything found
  before the error is returned.
  """
  # Imported under its real name so the function does not depend on the
  # module-level 'xmllib' alias.
  from xml.etree import ElementTree as ET

  urls  = []
  sizes = []
  try:
    # The attributes of an element are already set at its "start" event.
    for _event, elem in ET.iterparse (f, events = ("start",)):
      # Drop a possible '{namespace}' prefix before looking at the name.
      if not elem.tag.rsplit ('}', 1)[-1].startswith ("enclosure"):
        continue

      if debug_level >= 2:
        for name, value in elem.attrib.items():
          trace (' %s=%s\n' % (name, value), 2)

      # split() copes with URLs that have no '?' at all.
      url = elem.get ("url", "").split ('?', 1)[0]
      if not url:
        continue

      try:
        size = int (elem.get ("length", 0))
      except ValueError:
        size = 0

      # Appended together so the two lists cannot get out of step.
      urls.append (url)
      sizes.append (size)

  except ET.ParseError as e:
    trace ("XML error in RSS-file: %s\n" % e, 1)

  if debug_level >= 2:
    for url, size in zip (urls, sizes):
      trace ("%5d bytes: %s\n" % (size, url))

  return urls, sizes

#
# Download a single file using one of the methods:
#   the external program 'curl'.
#   the external program 'wget'.
#   the Python package 'pycurl'.
#
# Do it quietly unless 'debug_level' is set.
# Returns 0 on success.
#
def _remove_partial (path):
  """Delete `path`; it is not an error when the file does not exist."""
  try:
    os.remove (path)
  except OSError:
    pass


def _download_with_pycurl (url, outf):
  """
  Fetch `url` into the file `outf` with the 'pycurl' package.
  Returns 0 on success and 1 when pycurl reports an error.
  """
  def debug_func (t, buf):
    # Only info-types 0 and 1 are traced, as in the original code;
    # presumably libcurl's text and incoming-header types (confirm in the pycurl docs).
    if t in (0, 1):
      if isinstance (buf, bytes):
        buf = buf.decode ('iso-8859-1')
      trace ("dbg: t:%d, %s\n" % (t, buf.rstrip ('\r\n')))

  rc = 0
  curl = pycurl.Curl()
  try:
    # 'with' closes the output file on every path out of this block.
    with open (outf, "wb") as f:
      curl.setopt (pycurl.URL, url)
      curl.setopt (pycurl.FOLLOWLOCATION, 1)
      curl.setopt (pycurl.MAXREDIRS, 5)
      curl.setopt (pycurl.CONNECTTIMEOUT, 30)
      curl.setopt (pycurl.TIMEOUT, 300)
      curl.setopt (pycurl.WRITEDATA, f)
      curl.setopt (pycurl.VERBOSE, debug_level)
      curl.setopt (pycurl.DEBUGFUNCTION, debug_func)
      curl.perform()
  except pycurl.error:
    sys.stderr.flush()
    rc = 1
  except KeyboardInterrupt:
    _remove_partial (outf)
    fatal ('Aborting pycurl.\n')
  finally:
    curl.close()

  if rc != 0:
    _remove_partial (outf)
  return rc


def download_file (url, outf, exp_size=0):
  """
  Download `url` into the file `outf`.

  The method follows the global flags: 'curl' when `use_curl` is set,
  otherwise the pycurl package when `use_pycurl` is set, otherwise 'wget'.
  In dry-run mode nothing is downloaded and 0 is returned.

  `exp_size` is accepted for the callers' benefit but is currently unused.

  Returns 0 on success and a non-zero value on failure.  A file left
  behind by a failed or interrupted download is removed, so a later run
  does not mistake it for a finished one.
  """
  import subprocess

  unset_var = None

  if use_curl:
    prog = "curl"
    if debug_level == 0:
      args = ["curl", "-sLo", outf, url]
      unset_var = "CURL_MEMDEBUG"
    else:
      args = ["curl", "-Lo", outf, url]

  elif use_pycurl:
    trace ("\npycurl: '%s' -> '%s'\n" % (url, outf), 1)
    if dry_run > 0:
      return 0
    return _download_with_pycurl (url, outf)

  else:
    prog = "wget"
    args = ["wget", "-o", os.devnull, "-O", outf]
    if debug_level == 0:
      args.append ("-q")
    args.append (url)
    unset_var = "WSOCK_TRACE_LEVEL"

  trace ("\ncmd: '%s'\n" % " ".join (args), 1)

  if dry_run > 0:
    return 0

  # The program gets an argument list and no shell is involved, so
  # characters in a URL taken from the feed cannot be run as commands.
  # Dropping the variable from the child's environment replaces the
  # old 'set VAR= &' command prefix.
  env = os.environ.copy()
  if unset_var:
    env.pop (unset_var, None)

  try:
    rc = subprocess.call (args, env = env)

  except OSError as e:          # e.g. the program is not installed
    trace ("Failed to run %s: %s\n" % (prog, e), 0)
    rc = 1

  except KeyboardInterrupt:
    _remove_partial (outf)
    fatal ('Aborting %s.\n' % prog)

  if rc != 0:
    _remove_partial (outf)
  return rc

def download_rss_file (url, fname):
  """
  Fetch the RSS document at `url` into the file `fname` and report the
  result.  Returns 1 when download_file() succeeded, otherwise 0.
  """
  status = download_file (url, fname)
  if status != 0:
    return 0

  # In dry-run mode there is no file whose size could be shown.
  if dry_run != 0:
    message = "Downloaded ?? bytes for %s\n" % fname
  else:
    message = "Downloaded %d bytes for %s\n" % (os.path.getsize (fname), fname)
  trace (message, 0)
  return 1

def get_MP3_file_from_url (url):
  """
  Return the file-name part of `url`, i.e. everything after its last '/'.

  The URL must end in ".MP3" (in any letter case) and must contain a '/';
  otherwise the program is ended through fatal().
  """
  if not url.upper().endswith (".MP3"):
    fatal ("Unexpected URL: '%s'\n" % url)

  # rfind() returns -1 when there is no '/', where rindex() would raise
  # ValueError and skip the error message below.
  i = url.rfind ('/')
  if i < 0:
    fatal ("Found no '/*.MP3' in URL: '%s'\n" % url)
  return url [i+1:]

def download_mp3_files (urls, sizes):
  """
  Download every URL in `urls` into the current directory.

  `sizes[i]` is the size announced for `urls[i]`; a missing entry counts
  as 0.  A file that already exists is skipped.  The first failed
  download is reported and ends the loop.
  """
  num = len (urls)
  for i, u in enumerate (urls, 1):
    # Do not index past the end when 'sizes' is the shorter list.
    size = sizes[i-1] if i <= len (sizes) else 0
    f = get_MP3_file_from_url (u)
    trace ("%3d/%d: %s... " % (i, num, f), 0)
    if os.path.exists (f):
      print ("Skipping.")
      continue

    if download_file (u, f, size):
      print ("Failed; giving up.")
      break
    if debug_level == 0:
      print ("  size: %d" % size)

##############################################

def main():
  """
  Program entry: parse the command line, make sure the RSS file is in
  place (downloading it when missing), then download the MP3 files it
  lists.  Exits with status 1 when the RSS file cannot be downloaded.
  """
  global pycurl

  parse_cmd_line()

  if use_pycurl:
    # pycurl is only needed for '-p', hence the late import.
    try:
      import pycurl
    except ImportError:
      fatal ('Failed to import PyCurl.\n')

  if not os.path.exists (rss_file):
    trace ("RSS-file '%s' not found.\n" % rss_file, 0)
    if download_rss_file (rss_url, rss_file) == 0:
      sys.exit (1)

  # Still missing after a "successful" download happens in dry-run mode.
  if not os.path.exists (rss_file):
    trace ("RSS-file '%s' still not found!!\n" % rss_file, 0)
    return

  # Binary mode hands the raw bytes to the XML parser instead of decoding
  # them with the locale's encoding first.  The file is closed again
  # before the downloads start.
  with open (rss_file, 'rb') as f:
    urls, sizes = parse_rss_file (f)
  download_mp3_files (urls, sizes)

def help_line (opt, h_str):
  """Print one line of the option help: the coloured '-<opt>' and its text `h_str`."""
  flag = "%s-%c%s" % (_YELLOW, opt, _RESET)
  print ("  %s:  %s" % (flag, h_str))

def usage (e = None):
  """
  Print the usage text and exit.

  With an error message `e`, that message is printed first and the exit
  status is 1.  Without it, the module's doc-string (filled in with
  `rss_url` and `tmp`) is printed as well and the exit status is 0.
  """
  if e:
    print (e)
  print ("Usage: %s: %s" % (this_prog, "[-cdnph]"))

  help_line ('c', "use curl to download files (wget is default).")
  help_line ('d', "sets debug level (-dd for level 2).")
  help_line ('n', "run in dry-run mode.")
  help_line ('p', "use PyCurl to download files.")
  help_line ('h', "this help.")

  if e:
    sys.exit (1)

  # __doc__ is None when the doc-string is absent (e.g. under 'python -OO').
  if __doc__:
    print (__doc__ % (rss_url, tmp), end="")
  sys.exit (0)

# Run main() only when executed as a script; leave with status 0 afterwards.
if __name__ == '__main__':
  main()
  raise SystemExit (0)
	# !/usr/bin/env python
	# -- coding: iso-8859-1 --
	r"""
	A Python script to download all MP3 files for the character Jan Olsen
	in the NRK P1 radio-show Lønsj. This will download from NRK's podcast
	location at:
	%s (a XML-file).

	unless this RSS-file already exists in %s. This should
	cause all the 153 MP3 files to be downloaded into the current directory
	if not already present.

	NOTE: Needs Wget, curl or PyCurl package to actually download the MP3 files.
	"""

	from __future__ import print_function

	#
	# This script is also here:
	# https://gist.githubusercontent.com/gvanem/23f28cf8b66ad09926f5ed2cee557624/raw/b9690a99729d8f537582a7133b213e814ccf7d97/Jan-Olsen-MP3.py
	#
	# Hence, it's possible to launch it from Github directly. E.g.:
	# curl -s https://gist.githubusercontent.com/gvanem/23f28cf8b66ad09926f5ed2cee557624/raw/b9690a99729d8f537582a7133b213e814ccf7d97/Jan-Olsen-MP3.py \| python -
	#
	# NOTE(review): from here on the script's indentation is flattened to a
	# single tab per line, so the nesting of the statements is not visible.
	# Imports, optional colour setup and the module-level settings.
	import os, sys, getopt, inspect

	# Choose the XML module from the first character of sys.version.
	if sys.version[0] >= '3':
	from xml.etree import ElementTree
	xmllib = ElementTree
	else:
	import xmllib

	# Colour prefixes are built from colorama; the ImportError handler
	# sets all of them to empty strings instead.
	try:
	from colorama import init, Fore, Style
	init()
	_GREEN = Fore.GREEN + Style.BRIGHT
	_RED = Fore.RED + Style.BRIGHT
	_YELLOW = Fore.YELLOW + Style.BRIGHT
	_RESET = Style.RESET_ALL

	except ImportError as e:
	_GREEN = ''
	_RED = ''
	_YELLOW = ''
	_RESET = ''
	pass

	# 'tmp' is read from TEMP on win32 and from TMP elsewhere.
	# NOTE(review): os.getenv() returns None when the variable is unset.
	if sys.platform == 'win32':
	tmp = os.getenv ("TEMP")
	else:
	tmp = os.getenv ("TMP")

	#
	# Globals:
	#
	rss_url = 'http://podkast.nrk.no/program/jan_olsen_fra_loensj.rss'
	rss_file = os.path.join (tmp, 'jan_olsen_fra_loensj.rss')
	debug_level = 0
	dry_run = 0
	use_curl = 0
	use_pycurl = 0
	use_wget = 1
	this_prog = sys.argv[0]

	# Print 's' without a trailing newline when 'debug_level' >= 'level'.
	# With 'debug_level' >= 1 a "[debug_level/line]: " prefix is printed,
	# where 'line' is the caller's line number.
	# NOTE(review): 'file' is assigned but never used; 's[0]' fails on ''.
	def trace (s, level = 1):
	if debug_level >= level:
	if debug_level >= 1:
	frame = sys._getframe (0)
	line = frame.f_back.f_lineno
	file = inspect.getsourcefile (frame.f_back)
	if s[0] == '\n':
	print ("")
	s = s[1:]
	print ("%s[%d/%3d]: " % (_GREEN, debug_level, line), end="")
	print ("%s%s%s" % (_YELLOW, s, _RESET), end="")

	# Print 's' through trace() at level 0, then exit with status 1.
	def fatal (s):
	trace (s, 0)
	sys.exit (1)

	# Parse the options "hcdnp" from sys.argv[1:] and set the matching
	# global flags; a getopt error is passed to usage().
	# NOTE(review): 'use_wget' has no 'global' statement in this function.
	def parse_cmd_line ():
	try:
	opts, args = getopt.getopt (sys.argv[1:], "hcdnp")
	except getopt.GetoptError as e:
	usage ("%sError:%s %s" % (_RED, _RESET, e.msg))

	for o, a in opts:
	if o == '-h':
	usage()
	elif o == '-d':
	global debug_level
	debug_level += 1
	elif o == '-n':
	global dry_run
	dry_run = 1
	elif o == '-c':
	global use_curl
	use_curl = 1
	use_wget = 0
	elif o == '-p':
	global use_pycurl
	use_pycurl = 1
	use_wget = 0
	use_curl = 0


	#
	# Parse the rss_file and extract all "enclosure" tags like:
	# <enclosure url="http://podkast.nrk.no/fil/jan_olsen_fra_loensj/jan_olsen_fra_loensj_2016-3-11_913_3202.MP3?stat=1"
	# length="6354361" type="audio/mpeg">
	#
	# Remove anything after a '?' and append to the 'self.mp3_URLs[]'. Like:
	# https://podkast.nrk.no/fil/jan_olsen_fra_loensj/DA0_D925167EF4DB4D51B4F135D895F52881.MP3
	#
	# Return array of URLs with these MP3-files.
	#
	# Note: Not sorted on date or name.
	#
	# Feed the contents of the open file 'f' to an XMLParser subclass that
	# records the 'url' and 'length' attributes of "enclosure" tags.
	# Returns the list of URLs and the list of sizes.
	def parse_rss_file (f):
	class Jan_Olsen_XML (xmllib.XMLParser):

	def __init__ (self, **kw):
	self.mp3_URLs = []
	self.mp3_sizes = []
	xmllib.XMLParser.__init__ (self, **kw)

	# Syntax errors are ignored.
	def syntax_error (self, message):
	# trace (message + '\n', 3)
	pass

	# Trace every collected URL together with its size.
	def dump (self):
	i = 0
	for f in self.mp3_URLs:
	size = self.mp3_sizes[i]
	trace ("%5d bytes: %s\n" % (size, f))
	i += 1

	# Handle a start-tag; only tags whose name starts with "enclosure"
	# and that carry attributes are looked at.
	def unknown_starttag (self, tag, attrs):
	if not attrs:
	return
	if not tag.startswith("enclosure"):
	return

	for name, value in attrs.items():
	trace (' %s=%s\n' % (name, value), 2)
	if name == "url":
	# NOTE(review): index() raises ValueError when 'value' has no '?'.
	i = value.index('?')
	if i > 0:
	value = value [:i]
	self.mp3_URLs.append (value)
	elif name == 'length':
	self.mp3_sizes.append (int(value))

	p = Jan_Olsen_XML()
	p.feed (f.read())
	p.close()
	if debug_level >= 2:
	p.dump()

	return p.mp3_URLs, p.mp3_sizes

	#
	# Download a single file using one of the methods:
	# the external program 'curl'.
	# the external program 'wget'.
	# the Python package 'pycurl'.
	#
	# Do it quietly unless 'debug_level' is set.
	# Returns 0 on success.
	#
	# Download 'url' into the file 'outf' with curl, pycurl or wget,
	# depending on the global 'use_curl' / 'use_pycurl' flags.
	# The curl and wget commands are run with os.system(); its result is returned.
	# NOTE(review): 'exp_size' and 'size_got' are never read.
	def download_file (url, outf, exp_size=0):

	if use_curl:
	prog = "curl"
	if debug_level == 0:
	cmd = "set CURL_MEMDEBUG= & curl -sLo %s " % outf
	else:
	cmd = "curl -Lo %s " % outf

	elif use_pycurl:
	# Debug callback handed to pycurl below.
	# NOTE(review): 'quit' is not defined in this file; presumably it
	# resolves to the interpreter's builtin 'quit' - confirm the intent.
	def debug_func (t, buf):
	if quit:
	fatal ("Quitting")
	if t == 0 or t == 1:
	buf = buf.rstrip ('\r\n')
	trace ("dbg: t:%d, %s\n" % (t, buf))

	size_got = 0
	f = open (outf, "wb")
	curl = pycurl.Curl()
	curl.setopt (pycurl.URL, url)
	curl.setopt (pycurl.FOLLOWLOCATION, 1)
	curl.setopt (pycurl.MAXREDIRS, 5)
	curl.setopt (pycurl.CONNECTTIMEOUT, 30)
	curl.setopt (pycurl.TIMEOUT, 300)
	curl.setopt (pycurl.WRITEDATA, f)
	curl.setopt (pycurl.VERBOSE, debug_level)
	curl.setopt (pycurl.DEBUGFUNCTION, debug_func)

	# Any exception from the transfer is turned into the result 1.
	try:
	curl.perform()
	curl.close()
	rc = 0
	except:
	sys.stderr.flush()
	rc = 1
	f.close()

	else:
	prog = "wget"
	cmd = "set WSOCK_TRACE_LEVEL= & wget -o %s -O %s " % (os.devnull, outf)
	if debug_level == 0:
	cmd += "-q "

	if use_pycurl:
	return rc

	cmd += url
	trace ("\ncmd: '%s'\n" % cmd, 1)

	# In dry-run mode the command is traced but not run.
	if dry_run > 0:
	return 0

	try:
	return os.system (cmd)

	# On Ctrl-C the output file is removed before exiting.
	except KeyboardInterrupt:
	os.remove (outf)
	fatal ('Aborting %s.\n' % prog)

	# Download 'url' into 'fname' with download_file() and report the size.
	# Returns 1 when download_file() returned 0, otherwise 0.
	def download_rss_file (url, fname):
	if download_file(url,fname) == 0: # Success
	if dry_run == 0:
	trace ("Downloaded %d bytes for %s\n" % (os.path.getsize(fname),fname), 0)
	else:
	trace ("Downloaded ?? bytes for %s\n" % fname, 0)
	return 1
	return 0

	# Return the part of 'url' after its last '/'.
	# fatal() is called when the URL does not end in ".MP3" (any case).
	# NOTE(review): rindex() raises ValueError when 'url' has no '/'.
	def get_MP3_file_from_url (url):
	u = url.upper()
	if not u.endswith(".MP3"):
	fatal ("Unexpected URL: '%s'\n" % url)

	i = url.rindex ('/')
	if i > 0:
	return url [i+1:]
	fatal ("Found no '/*.MP3' in URL: '%s'\n" % url)

	# Download each URL in 'urls' to a file named after the URL's last
	# path component; 'sizes' holds the matching sizes.
	# Existing files are skipped; a non-zero result from download_file()
	# ends the loop.
	def download_mp3_files (urls, sizes):
	num = len(urls)
	i = 0
	for u in urls:
	size = sizes[i]
	i += 1
	f = get_MP3_file_from_url (u)
	trace ("%3d/%d: %s... " % (i, num, f), 0)
	if os.path.exists(f):
	print ("Skipping.")
	continue

	if download_file (u, f, size):
	break
	if debug_level == 0:
	print (" size: %d" % size)

	##############################################

	# Parse the command line, import pycurl when it was selected, download
	# the RSS file when it does not exist, then parse it and download the
	# MP3 files it lists.
	def main():
	parse_cmd_line()

	if use_pycurl:
	try:
	global pycurl
	import pycurl
	except:
	fatal ('Failed to import PyCurl.\n')

	# Exit with status 1 when the RSS file cannot be downloaded.
	if not os.path.exists (rss_file):
	trace ("RSS-file '%s' not found.\n" % rss_file, 0)
	if download_rss_file(rss_url, rss_file) == 0:
	sys.exit (1)

	if os.path.exists (rss_file):
	f = open (rss_file, 'r')
	urls, sizes = parse_rss_file (f)
	download_mp3_files (urls, sizes)
	f.close()
	else:
	trace ("RSS-file '%s' still not found!!\n" % rss_file, 0)

	# Print one option ('opt', a single character) with its help text 'h_str'.
	def help_line (opt, h_str):
	print (" %s-%c%s: %s" % (_YELLOW, opt, _RESET, h_str))

	# Print the optional message 'e', the usage line and the option help.
	# Without 'e' the module doc-string, filled in with 'rss_url' and
	# 'tmp', is printed too.  Ends with sys.exit (0).
	def usage (e = None):
	if e:
	print (e)
	print ("Usage: %s: %s" % (this_prog, "[-cdnph]"))

	help_line ('c', "use curl to download files (wget is default).")
	help_line ('d', "sets debug level (-dd for level 2).")
	help_line ('n', "run in dry-run mode.")
	help_line ('p', "use PyCurl to download files.")
	help_line ('h', "this help.")

	if not e:
	print (__doc__ % (rss_url, tmp), end="")
	sys.exit (0)

	# Script entry: run main() and exit with status 0.
	if __name__ == '__main__':
	main()
	sys.exit(0)