Skip to content

Instantly share code, notes, and snippets.

@yabberyabber
Last active December 21, 2020 19:05
Show Gist options
  • Save yabberyabber/f274cac126206267675908ad5cf2430b to your computer and use it in GitHub Desktop.
Save yabberyabber/f274cac126206267675908ad5cf2430b to your computer and use it in GitHub Desktop.
Given the RSS feed for a podcast, download it in its entirety.
#!/usr/bin/python3
import os
import sys
import pathlib
import shutil
import tempfile
import urllib.request
import concurrent.futures
from xml.dom import minidom
def textFromNode( node ):
    """Return the text directly contained in ``node``.

    Collects the data of every immediate child that is a text node or a
    CDATA section and joins the pieces with a single space. Child elements
    are not descended into.

    RSS feeds commonly wrap titles in ``<![CDATA[...]]>``. minidom reports
    those children as CDATA_SECTION_NODE rather than TEXT_NODE, so both
    kinds are accepted; otherwise such titles would come back empty.

    Args:
        node: A DOM node (anything exposing ``childNodes``).

    Returns:
        The joined text, or ``''`` when the node has no text children.
    """
    textTypes = ( node.TEXT_NODE, node.CDATA_SECTION_NODE )
    return ' '.join(
        n.data for n in node.childNodes if n.nodeType in textTypes )
def scrape( dest_dir, dom ):
    """Download every episode listed in an RSS feed into ``dest_dir``.

    Each ``<item>`` in ``dom`` becomes a directory ``dest_dir/<title>``
    holding ``metadata.xml`` (the pretty-printed item), ``<title>.mp3``
    (the enclosure) and, when the item has an ``<itunes:image>``,
    ``thumbnail.png``. Items whose directory already exists are skipped, so
    an interrupted run can simply be restarted.

    Downloads run on up to three worker threads. A failing episode is
    reported on stderr and does not stop the remaining ones.

    Args:
        dest_dir: Existing directory that receives one sub-directory per
            episode.
        dom: Parsed feed document, e.g. the result of ``minidom.parse``.
    """
    items = dom.getElementsByTagName( 'item' )

    def safe_name( title ):
        # The title doubles as a file and directory name, so path
        # separators (and NUL) must not survive: "A/B" would otherwise
        # point outside the intended directory or make mkdtemp fail.
        for bad in ( os.sep, os.altsep, '\0' ):
            if bad:
                title = title.replace( bad, '_' )
        return title

    def scrape_item( item ):
        titles = item.getElementsByTagName( 'title' )
        name = safe_name( textFromNode( titles[ 0 ] ) ) if titles else ''
        if not name.strip() or name in ( '.', '..' ):
            raise ValueError( 'item has no usable <title>' )

        dest = os.path.join( dest_dir, name )
        if os.path.exists( dest ):
            return

        enclosures = item.getElementsByTagName( 'enclosure' )
        if not enclosures:
            raise ValueError( '{!r} has no <enclosure>'.format( name ) )
        mp3Url = enclosures[ 0 ].getAttribute( 'url' )

        # Build the episode in a scratch directory and move it into place
        # only once complete, so dest never holds a partial download.
        tmpDir = tempfile.mkdtemp( suffix=name )
        try:
            metaPath = os.path.join( tmpDir, 'metadata.xml' )
            # Explicit UTF-8: the locale default may be unable to encode
            # characters that appear in the feed.
            with open( metaPath, 'w', encoding='utf-8' ) as metaFile:
                metaFile.write( item.toprettyxml() )

            urllib.request.urlretrieve(
                mp3Url, os.path.join( tmpDir, name + '.mp3' ) )

            # Artwork is optional; not every item carries its own image.
            images = item.getElementsByTagName( 'itunes:image' )
            if images:
                urllib.request.urlretrieve(
                    images[ 0 ].getAttribute( 'href' ),
                    os.path.join( tmpDir, 'thumbnail.png' ) )

            shutil.move( tmpDir, dest )
        except BaseException:
            # Do not leave half-downloaded episodes behind in the temp dir.
            shutil.rmtree( tmpDir, ignore_errors=True )
            raise

    with concurrent.futures.ThreadPoolExecutor( max_workers=3 ) as pool:
        pending = [ pool.submit( scrape_item, item ) for item in items ]
        # Inspect every future: an unconsumed pool.map() would silently
        # discard any exception raised in a worker.
        for future in concurrent.futures.as_completed( pending ):
            error = future.exception()
            if error is not None:
                print( 'Episode download failed: {!r}'.format( error ),
                       file=sys.stderr )
def main( args ):
    """Command-line entry point: ``PROG DEST_DIR FEED.rss``.

    Parses the feed file, creates ``DEST_DIR`` (including parents) if
    needed and downloads every episode into it.

    Args:
        args: Full argument vector including the program name, i.e. the
            same shape as ``sys.argv``.

    Raises:
        SystemExit: With status 2, after printing a usage line to stderr,
            when the number of arguments is wrong.
    """
    if len( args ) != 3:
        prog = args[ 0 ] if args else sys.argv[ 0 ]
        print( 'Usage: {} DEST_DIR FEED.rss'.format( prog ), file=sys.stderr )
        # sys.exit, not os.exit: the os module has no exit() function.
        sys.exit( 2 )
    _, dest_dir, rssFile = args
    dom = minidom.parse( rssFile )
    dest_dir = os.path.abspath( dest_dir )
    pathlib.Path( dest_dir ).mkdir( parents=True, exist_ok=True )
    scrape( dest_dir, dom )
# Run the downloader only when this file is executed as a script, not when
# it is imported; main() receives the raw argument vector.
if __name__ == '__main__':
    main( sys.argv )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment