Skip to content

Instantly share code, notes, and snippets.

@nathanrosspowell
Created May 5, 2012 15:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nathanrosspowell/2603334 to your computer and use it in GitHub Desktop.
Save nathanrosspowell/2603334 to your computer and use it in GitHub Desktop.
Mass podcast downloader.
import datetime
import email.utils
import os
import sys
import time
import urllib
import xml.dom.minidom
def filterName(node, name):
    """Return True when *node* is an element whose local name is *name*.

    Nodes that are not elements (text, comments, ...) never match.
    """
    return node.nodeType == node.ELEMENT_NODE and node.localName == name
def getNode(node, name):
    """Return the first direct child element of *node* named *name*.

    Returns None when no child element has that local name.
    """
    for elem in node.childNodes:
        if filterName(elem, name):
            return elem
    return None
def genNodes(node, name):
    """Yield each direct child element of *node* named *name*, in order."""
    for elem in node.childNodes:
        if filterName(elem, name):
            yield elem
def getAttribute(node, name):
    """Return the text value of attribute *name* on element *node*.

    Raises:
        KeyError: if *node* has no attribute called *name*.
    """
    # Attr.value holds the attribute text directly; there is no need to walk
    # down to the attribute's text child.
    return node.attributes[name].value
def _parsePubDate(text):
    """Turn an RSS <pubDate> string into a naive datetime, or None.

    Only the wall-clock fields are kept; any UTC offset or zone name in the
    text ("+0000", "-0500", "GMT", ...) is ignored.
    """
    parsed = email.utils.parsedate(text.strip())
    if parsed is None:
        return None
    try:
        return datetime.datetime(*parsed[:6])
    except ValueError:
        # Fields were syntactically present but out of range (e.g. day 0).
        return None


def itemDict(node):
    """Collect title, publication date and enclosure URL from an RSS <item>.

    Args:
        node: DOM element for a single <item>.

    Returns:
        dict that may contain "title" (stripped text of <title>), "date"
        (naive datetime.datetime built from <pubDate>) and "url" (stripped
        "url" attribute of <enclosure>). A key is left out when its element
        is missing, empty or cannot be parsed.
    """
    data = {}
    for child in node.childNodes:
        tag = child.localName
        if tag == "enclosure":
            # Handled before the firstChild check: an enclosure is normally
            # empty, but it may also carry whitespace between its tags.
            url = child.getAttribute("url").strip()
            if url:
                data["url"] = url
        elif child.firstChild is None:
            continue
        elif tag == "pubDate":
            date = _parsePubDate(child.firstChild.wholeText)
            if date is not None:
                data["date"] = date
        elif tag == "title":
            data["title"] = child.firstChild.wholeText.strip()
    return data
def genItems(doc):
    """Return a generator over the <item> elements at rss/channel/item.

    Only direct children are followed at each level: <rss> under *doc*,
    <channel> under <rss>, <item> under <channel>.
    """
    return (
        item
        for rss in doc.childNodes
        if filterName(rss, "rss")
        for chan in rss.childNodes
        if filterName(chan, "channel")
        for item in chan.childNodes
        if filterName(item, "item")
    )
def feedOpener(rss):
    """Open the feed at *rss* and return it parsed as a minidom Document.

    Uses urllib.FancyURLopener, which lives at this location in Python 2's
    urllib. The opener is created with an empty proxies dict, which the
    urllib documentation describes as turning proxies off.
    """
    opener = urllib.FancyURLopener({})
    rssFile = opener.open(rss)
    try:
        return xml.dom.minidom.parse(rssFile)
    finally:
        # minidom.parse() does not close file objects handed to it.
        rssFile.close()
def getFileName(data):
    """Build the local file name for one episode.

    Args:
        data: dict with "date" (datetime), "title", "url", "dateFormat"
            (strftime format), "nameFormat" (expression text producing two
            values), "seperator", and optionally "space" (replacement for
            blanks, "_" when absent).

    Returns:
        "<first><seperator><second><extension>", where the two name parts
        come from evaluating "nameFormat" and the extension is taken from the
        URL. " <seperator> " and ", " collapse to the separator and remaining
        blanks are replaced by the "space" string.
    """
    dateFormat = data["date"].strftime(data["dateFormat"])
    title = data["title"].title()
    seperator = data["seperator"]
    # NOTE(security): nameFormat is evaluated as Python code. It comes from
    # the settings file, so that file must be trusted. The names offered to
    # the expression are passed explicitly so they survive local refactoring.
    namespace = {"title": title, "dateFormat": dateFormat, "data": data}
    nameFormat = eval("(%s)" % (data["nameFormat"],), globals(), namespace)
    name = "%s%s%s" % (nameFormat[0], seperator, nameFormat[1])
    # Drop any fragment/query string before looking for the extension, and let
    # splitext ignore dots that belong to the host or to a directory name.
    urlPath = data["url"].split("#", 1)[0].split("?", 1)[0]
    fileExtension = os.path.splitext(urlPath)[1]
    fileName = "%s%s" % (name, fileExtension)
    fileName = fileName.replace(" %s " % seperator, seperator)
    fileName = fileName.replace(", ", seperator)
    return fileName.replace(" ", data.get("space", "_"))
def gotAllData(data):
    """Return True when *data* has the "title", "url" and "date" keys."""
    # "in" replaces dict.has_key(), which no longer exists in Python 3.
    return all(key in data for key in ("title", "url", "date"))
def addExtraData(data, feedDict):
    """Copy the feed's naming options into *data* and add its file name.

    Each option is read from *feedDict* and falls back to the default listed
    below. *data* is modified in place; on return it also holds "fileName",
    computed by getFileName() from the completed dict.
    """
    optionDefaults = (
        ("nameFormat", "( title, dateFormat )"),
        ("dateFormat", "%Y-%m-%d_%H-%M-%S"),
        ("seperator", "-"),
        ("space", "_"),
    )
    for option, default in optionDefaults:
        data[option] = feedDict.get(option, default)
    data["fileName"] = getFileName(data)
def getRssData(feedDict):
    """Yield one data dict per usable item of the feed at feedDict["rss"].

    Items lacking a title, URL or date (see gotAllData) are skipped; the
    rest are completed with addExtraData() before being yielded.
    """
    doc = feedOpener(feedDict["rss"])
    for itemNode in genItems(doc):
        data = itemDict(itemNode)
        if not gotAllData(data):
            continue
        addExtraData(data, feedDict)
        yield data
def getSettingsAndPriority(xmlFile):
    """Read the podcast settings file.

    Args:
        xmlFile: file name or file object accepted by xml.dom.minidom.parse.

    Returns:
        (settings, priority). settings["feeds"] maps each feed's "name"
        attribute to a dict of its remaining attributes; every other child
        element of <podcast> is stored as settings[<tag>] = its "value"
        attribute. priority lists the feed names in document order.

    Raises:
        ValueError: if the document has no <podcast> element at top level.
        KeyError: if a non-<feeds> child element has no "value" attribute.
    """
    doc = xml.dom.minidom.parse(xmlFile)
    root = getNode(doc, "podcast")
    if root is None:
        raise ValueError("no <podcast> root element in %s" % (xmlFile,))
    settings = {"feeds": {}}
    priority = []
    for child in root.childNodes:
        if filterName(child, "feeds"):
            for feed in child.childNodes:
                if not filterName(feed, "feed"):
                    continue
                key = None
                attributeData = {}
                for i in range(feed.attributes.length):
                    attributeName = feed.attributes.item(i).localName
                    if attributeName == "name":
                        key = getAttribute(feed, "name")
                    else:
                        attributeData[attributeName] = getAttribute(feed, attributeName)
                if key:
                    # A repeated name replaces the earlier entry but must not
                    # be queued twice, or the feed would be processed twice.
                    if key not in settings["feeds"]:
                        priority.append(key)
                    settings["feeds"][key] = attributeData
                else:
                    print("ERROR, no 'name' attribute")
        elif child.nodeType == child.ELEMENT_NODE and child.localName:
            settings[child.localName] = getAttribute(child, "value")
    return settings, priority
def listNewFiles(folder, feedDict):
    """Return the feed's items whose target file does not exist yet.

    The target of an item is folder/feedDict["folder"]/<fileName>. Each
    returned data dict gets that path stored under "fullFilePath".
    """
    feedFolder = os.path.join(folder, feedDict["folder"])
    new = []
    for data in getRssData(feedDict):
        fullFilePath = os.path.join(feedFolder, data["fileName"])
        if not os.path.exists(fullFilePath):
            data["fullFilePath"] = fullFilePath
            new.append(data)
    return new
def newFileStatus(newDict, priority, verbose):
    """Print how many new files each feed has and report whether any exist.

    Args:
        newDict: maps feed name to a list of item dicts (each with "fileName").
        priority: feed names in the order they should be reported.
        verbose: when true, also print every new file name.

    Returns:
        True if at least one feed in *priority* has new files, else False
        (in which case an "up to date" message is printed).
    """
    files = False
    for key in priority:
        value = newDict.get(key)
        if not value:
            continue
        files = True
        print("%d NEW: %s" % (len(value), key))
        if verbose:
            for podcast in value:
                print("\t%s" % (podcast["fileName"],))
    if not files:
        print("All RSS feeds are up to date.")
    return files
def _removePartialFile(filePath):
    """Delete *filePath* if it exists (discards an incomplete download)."""
    if os.path.exists(filePath):
        os.remove(filePath)


def download(url, filePath, verbose, dryRun):
    """Download *url* to *filePath*, creating the target directory if needed.

    Args:
        url: address of the file to fetch.
        filePath: where to store it.
        verbose: when true, report the clean-up done on Ctrl-C.
        dryRun: when true, only announce the download; nothing is fetched
            and the file system is left untouched.

    A failed download is reported and its partial file removed, without
    raising. KeyboardInterrupt is re-raised after the same clean-up.
    The fetch uses urllib.urlretrieve (Python 2 urllib).
    """
    dirName = os.path.dirname(filePath)
    # dirName is "" for a bare file name; os.makedirs("") would fail.
    if dirName and not dryRun and not os.path.exists(dirName):
        os.makedirs(dirName)
    print("\tStarting %s as %s ..." % (url, filePath))
    if dryRun:
        return
    try:
        urllib.urlretrieve(url, filePath)
    except KeyboardInterrupt:
        if verbose:
            print("\nKeyboardInterrupt, removing: %s\n" % (filePath,))
        _removePartialFile(filePath)
        raise
    except Exception as error:
        print("\nERROR : removing %s (%s)" % (filePath, error))
        _removePartialFile(filePath)
    else:
        print("\t\tDownloaded!")
def downloadAll(newDict, priority, verbose, dryRun):
    """Download every new file, feed by feed in *priority* order.

    Within a feed the items are fetched newest first (by their "date").
    Feeds with no entry, or an empty entry, in *newDict* are skipped.
    """
    for key in priority:
        value = newDict.get(key)
        if not value:
            continue
        print("Downloading %s from: %s" % (len(value), key))
        # sorted() leaves the caller's list in its original order.
        for item in sorted(value, key=lambda v: v["date"], reverse=True):
            download(item["url"], item["fullFilePath"], verbose, dryRun)
def main(xmlFile, verbose, dryRun):
    """Check every configured feed and download whatever is new.

    Args:
        xmlFile: settings file passed to getSettingsAndPriority().
        verbose: forwarded to the status report and the downloader.
        dryRun: forwarded to the downloader.
    """
    settings, priority = getSettingsAndPriority(xmlFile)
    newFiles = {}
    # items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for key, value in settings["feeds"].items():
        new = listNewFiles(settings["folder"], value)
        if new:
            newFiles[key] = new
    if newFileStatus(newFiles, priority, verbose):
        downloadAll(newFiles, priority, verbose, dryRun)
def _parseFlag(argv, index):
    """Return the optional "1 or 0" command-line flag at argv[index] as a bool.

    A missing argument, an empty string or "0" is False; any other value is
    True. (Passing the raw string on would make "0" count as true.)
    """
    if index >= len(argv):
        return False
    return argv[index].strip() not in ("", "0")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: <podcast: xml file> <verbose: 1 or 0: optional> <dryrun: 1 or 0: optional>")
    else:
        main(sys.argv[1], _parseFlag(sys.argv, 2), _parseFlag(sys.argv, 3))
<?xml version="1.0" encoding="UTF-8"?>
<podcast>
<!-- Root folder to download to. -->
<folder value="Z:\Media\Podcast\" />
<!-- RSS feeds to download, listed from highest priority to lowest.
Required attributes:
name= display name of the feed, printed in the status output
folder= sub-folder used
rss= RSS URI
Optional attributes:
seperator= '-'
space= '_'
dateFormat= how the datetime will be displayed (strftime format)
nameFormat= tuple for custom file name format: %s+seperator+%s
can use title (string) and dateFormat (string)
e.g. "'JRE' + title, dateFormat"
-->
<feeds>
<feed name="Lavender Hour"
folder="Lavender Hour"
rss="http://lavenderhour.libsyn.com/rss"
nameFormat="dateFormat, title"
dateFormat="%Y-%m-%d"
/>
<feed name="Joe Rogan Experience"
folder="JRE"
rss="http://joeroganexp.joerogan.libsynpro.com/irss"
nameFormat="title[ title.find('#')+1: ], dateFormat"
/>
</feeds>
</podcast>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment