Created
May 5, 2012 15:29
-
-
Save nathanrosspowell/2603334 to your computer and use it in GitHub Desktop.
Mass podcast downloader.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.dom.minidom | |
import datetime | |
import time | |
import os | |
import urllib | |
import sys | |
def filterName(node, name):
    """Tell whether *node* is an element node whose local name is *name*.

    Nodes of any other type (text, comments, ...) never match.
    """
    if node.nodeType != node.ELEMENT_NODE:
        return False
    return node.localName == name
def getNode(node, name):
    """Return the first child element of *node* named *name*.

    Children are examined in document order; None is returned when no
    child matches.
    """
    matching = (child for child in node.childNodes if filterName(child, name))
    return next(matching, None)
def genNodes(node, name):
    """Yield, in document order, each child element of *node* named *name*."""
    for child in node.childNodes:
        if not filterName(child, name):
            continue
        yield child
def getAttribute(node, name):
    """Return the text of attribute *name* on element *node*.

    The attribute map is indexed directly, so an absent attribute raises
    KeyError.
    """
    attribute = node.attributes[name]
    textNode = attribute.firstChild
    return textNode.wholeText
def itemDict(node):
    """Collect the title, publication date and enclosure URL of one RSS item.

    Args:
        node: DOM element whose children are the item's fields
            (``title``, ``pubDate``, ``enclosure``, ...).

    Returns:
        dict holding any of:
          * ``"title"`` - stripped text of ``<title>``
          * ``"date"``  - naive ``datetime`` built from the wall-clock fields
            of ``<pubDate>``; the zone/offset suffix is ignored
          * ``"url"``   - stripped ``url`` attribute of ``<enclosure>``
        A key is left out when its element is missing, empty or unparsable,
        so callers can detect incomplete items by checking the keys.
    """
    # Function-scope import keeps this block self-contained.  parsedate()
    # accepts RFC 822 dates with any zone suffix ("+0000", "-0500", "GMT")
    # and does not depend on the process locale, unlike strptime("%a ... %b").
    import email.utils

    data = {}
    for child in node.childNodes:
        tag = child.localName
        if not tag:
            continue  # text, comment and similar nodes carry no field
        if tag == "enclosure":
            # Enclosures are normally empty elements, so they are handled
            # before the "has content" check below.
            url = child.getAttribute("url").strip()
            if url:
                data["url"] = url
        elif child.firstChild is None:
            continue  # empty <title/> or <pubDate/>: nothing to read
        elif tag == "pubDate":
            parsed = email.utils.parsedate(child.firstChild.wholeText.strip())
            if parsed is None:
                continue
            try:
                data["date"] = datetime.datetime(*parsed[:6])
            except ValueError:
                # Fields parsed but are out of range (e.g. day 32).
                continue
        elif tag == "title":
            data["title"] = child.firstChild.wholeText.strip()
    return data
def genItems(doc):
    """Return a lazy iterator over the ``rss/channel/item`` elements of *doc*.

    Only direct children are followed at each level: ``rss`` children of
    *doc*, ``channel`` children of those, then ``item`` children of each
    channel, all in document order.
    """
    rssNodes = (top for top in doc.childNodes if filterName(top, "rss"))
    channels = (
        chan
        for rss in rssNodes
        for chan in rss.childNodes
        if filterName(chan, "channel")
    )
    return (
        item
        for chan in channels
        for item in chan.childNodes
        if filterName(item, "item")
    )
def feedOpener(rss):
    """Open the feed at URL *rss* and return it parsed as a minidom document.

    Args:
        rss: URL of the RSS feed.

    Returns:
        The document produced by ``xml.dom.minidom.parse``.

    Errors raised while opening the URL or parsing the XML propagate to the
    caller.
    """
    opener = urllib.FancyURLopener({})
    rssFile = opener.open(rss)
    try:
        return xml.dom.minidom.parse(rssFile)
    finally:
        # The document is fully built by parse(); release the connection
        # even when parsing fails.
        rssFile.close()
def getFileName(data):
    """Build the local file name for one podcast item.

    Args:
        data: dict with the keys ``"date"`` (datetime), ``"dateFormat"``
            (strftime pattern), ``"title"``, ``"nameFormat"`` (expression
            text yielding two values), ``"seperator"`` and ``"url"``;
            ``"space"`` is optional and defaults to ``"_"``.

    Returns:
        ``<first><seperator><second><extension>`` where the two parts come
        from the nameFormat expression and the extension from the URL path.
        ``" <seperator> "`` and ``", "`` collapse to the seperator, and each
        remaining space becomes the ``"space"`` string.
    """
    # `title` and `dateFormat` must remain local names: nameFormat
    # expressions from the settings file refer to them.
    dateFormat = data["date"].strftime(data["dateFormat"])
    title = data["title"].title()
    # NOTE(security): nameFormat is taken from the settings XML and passed to
    # eval(), so it can run arbitrary code.  Only use settings files you trust.
    nameFormat = eval("(%s)" % (data["nameFormat"],))
    seperator = data["seperator"]
    name = "%s%s%s" % (nameFormat[0], seperator, nameFormat[1])

    # Take the extension from the URL path only: a query string or fragment
    # must not leak into the name, and a URL without an extension yields ""
    # rather than everything after the host name's last dot.
    url = data["url"]
    urlPath = url.split("#", 1)[0].split("?", 1)[0]
    fileExtension = os.path.splitext(urlPath)[1]

    fileName = "%s%s" % (name, fileExtension)
    fileName = fileName.replace(" %s " % seperator, seperator)
    fileName = fileName.replace(", ", seperator)
    # Honour the configurable space replacement instead of a fixed "_".
    return fileName.replace(" ", data.get("space", "_"))
def gotAllData(data):
    """Return True when *data* has every key needed to download an item.

    The required keys are ``"title"``, ``"url"`` and ``"date"``.
    """
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    return all(key in data for key in ("title", "url", "date"))
def addExtraData(data, feedDict):
    """Fill *data* in place with the feed's naming options and file name.

    Each option is read from *feedDict*, falling back to its default, and
    stored in *data*; ``data["fileName"]`` is then computed from the result.
    """
    optionDefaults = (
        ("nameFormat", "( title, dateFormat )"),
        ("dateFormat", "%Y-%m-%d_%H-%M-%S"),
        ("seperator", "-"),
        ("space", "_"),
    )
    for option, fallback in optionDefaults:
        data[option] = feedDict.get(option, fallback)
    data["fileName"] = getFileName(data)
def getRssData(feedDict):
    """Yield a completed data dict for each usable item of the feed.

    The feed at ``feedDict["rss"]`` is fetched and walked item by item.
    Items lacking any required field are skipped; the rest are extended
    with the feed's naming options before being yielded.
    """
    document = feedOpener(feedDict["rss"])
    for itemNode in genItems(document):
        data = itemDict(itemNode)
        if not gotAllData(data):
            continue
        addExtraData(data, feedDict)
        yield data
def getSettingsAndPriority(xmlFile):
    """Read the settings XML and return ``(settings, priority)``.

    Args:
        xmlFile: file name or file object accepted by
            ``xml.dom.minidom.parse``.

    Returns:
        settings: dict with ``"feeds"`` mapping each feed's ``name`` to a
            dict of its other attributes, plus one entry per other child
            element of ``<podcast>`` (element name -> its ``value``
            attribute).
        priority: feed names in document order.

    Raises:
        ValueError: the document has no ``<podcast>`` element at top level.
        KeyError: a non-``feeds`` child of ``<podcast>`` has no ``value``
            attribute.
    """
    doc = xml.dom.minidom.parse(xmlFile)
    root = getNode(doc, "podcast")
    if root is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("settings file has no <podcast> root element")

    settings = {"feeds": {}}
    priority = []
    for child in root.childNodes:
        if filterName(child, "feeds"):
            for feed in child.childNodes:
                if not filterName(feed, "feed"):
                    continue
                key = None
                attributeData = {}
                # range() instead of the Python 2-only xrange(); the value
                # is read from the attribute node itself so prefixed
                # attributes cannot miss on a by-name lookup.
                for i in range(feed.attributes.length):
                    attribute = feed.attributes.item(i)
                    if attribute.localName == "name":
                        key = attribute.value
                    else:
                        attributeData[attribute.localName] = attribute.value
                if key:
                    priority.append(key)
                    settings["feeds"][key] = attributeData
                else:
                    print("ERROR, no 'name' attribute")
        elif child.nodeType == child.ELEMENT_NODE and child.localName:
            settings[child.localName] = getAttribute(child, "value")
    return settings, priority
def listNewFiles(folder, feedDict):
    """Return the feed's items whose target file does not exist yet.

    The target path is ``folder/<feedDict["folder"]>/<item fileName>``.
    Each returned item dict gains a ``"fullFilePath"`` key with that path.
    """
    new = []
    for data in getRssData(feedDict):
        feedFolder = os.path.join(folder, feedDict["folder"])
        fullFilePath = os.path.join(feedFolder, data["fileName"])
        if os.path.exists(fullFilePath):
            continue
        data["fullFilePath"] = fullFilePath
        new.append(data)
    return new
def newFileStatus(newDict, priority, verbose):
    """Print how many new files each feed has and report whether any exist.

    Args:
        newDict: feed name -> list of new item dicts.
        priority: feed names in the order they should be reported; names
            missing from *newDict* or mapped to an empty list are skipped.
        verbose: when true, also list each new item's ``"fileName"``.

    Returns:
        True if at least one feed in *priority* has new files, else False.
    """
    # Single-argument print(...) gives the same output on Python 2 and 3.
    files = False
    for key in priority:
        value = newDict.get(key)
        if not value:
            continue
        files = True
        print("%d NEW: %s" % (len(value), key))
        if verbose:
            for podcast in value:
                print("\t%s" % (podcast["fileName"],))
    if not files:
        print("All RSS feeds are up to date.")
    return files
def download(url, filePath, verbose, dryRun):
    """Download *url* to *filePath*, removing the file again on failure.

    Args:
        url: address of the file to fetch.
        filePath: destination path; its directory is created when missing.
        verbose: when true, announce the file removed on Ctrl-C.
        dryRun: when true, only print what would be fetched - nothing is
            downloaded and no directory is created.

    Raises:
        KeyboardInterrupt: re-raised after the partial file is removed.

    Any other download error is reported on stdout and swallowed so the
    remaining downloads can continue.
    """
    dirName = os.path.dirname(filePath)
    # An empty dirName means the current directory (makedirs("") would
    # fail), and a dry run must leave the filesystem untouched.
    if dirName and not dryRun and not os.path.exists(dirName):
        os.makedirs(dirName)
    print("\tStarting %s as %s ..." % (url, filePath))
    try:
        if not dryRun:
            urllib.urlretrieve(url, filePath)
            print("\t\tDownloaded!")
    except KeyboardInterrupt:
        if verbose:
            print("\nKeyboardInterrupt, removing: %s\n" % (filePath,))
        if os.path.exists(filePath):
            os.remove(filePath)
        raise
    except Exception as error:
        # Best effort: drop the partial file and carry on, but say why.
        print("\nERROR : removing %s" % (filePath,))
        print("\t%s" % (error,))
        if os.path.exists(filePath):
            os.remove(filePath)
def downloadAll(newDict, priority, verbose, dryRun):
    """Download every new file, feed by feed in *priority* order.

    Args:
        newDict: feed name -> list of item dicts carrying ``"date"``,
            ``"url"`` and ``"fullFilePath"``.
        priority: feed names in download order; names missing from
            *newDict* or mapped to an empty list are skipped.
        verbose, dryRun: passed through to ``download``.

    Within a feed, items are fetched newest first.
    """
    for key in priority:
        value = newDict.get(key)
        if not value:
            continue
        print("Downloading %s from: %s" % (len(value), key))
        # sorted() leaves the caller's list untouched (list.sort would
        # reorder it in place).
        newestFirst = sorted(value, key=lambda v: v["date"], reverse=True)
        for item in newestFirst:
            download(item["url"], item["fullFilePath"], verbose, dryRun)
def main(xmlFile, verbose, dryRun):
    """Check every configured feed and download whatever is new.

    Args:
        xmlFile: settings XML naming the root ``folder`` and the feeds.
        verbose: passed to the status report and the downloader.
        dryRun: passed to the downloader.
    """
    settings, priority = getSettingsAndPriority(xmlFile)
    newFiles = {}
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    for key, value in settings["feeds"].items():
        new = listNewFiles(settings["folder"], value)
        if new:
            newFiles[key] = new
    if newFileStatus(newFiles, priority, verbose):
        downloadAll(newFiles, priority, verbose, dryRun)
def flagFromArgs(argv, index):
    """Return the boolean value of the optional flag at ``argv[index]``.

    A missing argument is False.  "", "0", "false", "no" and "off" (in any
    letter case) are False; every other value is True.  Raw argv strings
    cannot be used directly as flags because the string "0" is truthy.
    """
    try:
        text = argv[index]
    except IndexError:
        return False
    return text.strip().lower() not in ("", "0", "false", "no", "off")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: <podcast: xml file> <verbose: 1 or 0: optional> <dryrun: 1 or 0: optional>")
    else:
        main(sys.argv[1], flagFromArgs(sys.argv, 2), flagFromArgs(sys.argv, 3))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<podcast> | |
<!-- Root folder to download to. --> | |
<folder value="Z:\Media\Podcast\" /> | |
<!-- RSS feeds to download, listed highest priority to lowest. | |
Fields to set: | |
name= print out name of the feed | |
folder= sub folder used | |
rss= RSS URI | |
Optional Fields: | |
seperator= '-' | |
space= '_' | |
dateFormat= how the datetime will be displayed | |
nameFormat= tuple for custom file name format: %s+seperator+%s | |
can use title (string) and dateFormat (string) | |
e.g. "'JRE'+title, dateFormat" | |
--> | |
<feeds> | |
<feed name="Lavender Hour" | |
folder="Lavender Hour" | |
rss="http://lavenderhour.libsyn.com/rss" | |
nameFormat="dateFormat, title" | |
dateFormat="%Y-%m-%d" | |
/> | |
<feed name="Joe Rogan Experience" | |
folder="JRE" | |
rss="http://joeroganexp.joerogan.libsynpro.com/irss" | |
nameFormat="title[ title.find('#')+1: ], dateFormat" | |
/> | |
</feeds> | |
</podcast> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment