Last active
December 21, 2020 19:05
-
-
Save yabberyabber/f274cac126206267675908ad5cf2430b to your computer and use it in GitHub Desktop.
Given the RSS feed for a podcast, download it in its entirety.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import os | |
import sys | |
import pathlib | |
import shutil | |
import tempfile | |
import urllib.request | |
import concurrent.futures | |
from xml.dom import minidom | |
def textFromNode( node ):
    """Return the text held directly by ``node`` as one string.

    Only the immediate children of ``node`` are inspected; text nested inside
    child elements is ignored.  Plain text nodes and CDATA sections are both
    collected, and the pieces are joined with a single space.

    Args:
        node: A DOM node exposing ``childNodes`` (e.g. a minidom element).

    Returns:
        The joined text, or an empty string when ``node`` has no textual
        children.
    """
    # minidom gives <![CDATA[...]]> content its own node type, so matching
    # TEXT_NODE alone would return '' for a CDATA-wrapped value.
    textTypes = ( node.TEXT_NODE, node.CDATA_SECTION_NODE )
    return ' '.join(
        n.data for n in node.childNodes if n.nodeType in textTypes )
def scrape( dest_dir, dom ):
    """Download every ``<item>`` of a parsed podcast feed into ``dest_dir``.

    Each item gets a sub-directory named after its title containing
    ``metadata.xml`` (the item's XML), ``<title>.mp3`` (the enclosure) and,
    when the item has an ``itunes:image`` element, ``thumbnail.png``.  An item
    whose directory already exists is skipped, so a run can be repeated.  Up
    to three items are processed concurrently.

    A failure while processing one item is reported on stderr and does not
    stop the remaining items.

    Args:
        dest_dir: Directory that receives one sub-directory per item.
        dom: DOM node (e.g. a minidom document) containing the items.

    Returns:
        None.
    """
    items = dom.getElementsByTagName( 'item' )

    def safe_name( title ):
        # The title comes from the feed; keep it from being read as a path
        # (separators, NUL, or the special names '.' and '..').
        name = title.replace( '\0', '_' )
        for sep in ( os.sep, os.altsep ):
            if sep:
                name = name.replace( sep, '_' )
        if name in ( '', '.', '..' ):
            raise ValueError( 'item has no usable title: {!r}'.format( title ) )
        return name

    def scrape_item( item ):
        title = textFromNode( item.getElementsByTagName( 'title' )[ 0 ] )
        name = safe_name( title )
        dest = os.path.join( dest_dir, name )
        if os.path.exists( dest ):
            return

        # Build the episode in a scratch directory and move it into place
        # last, so dest only appears once every file has been fetched.
        tmpDir = tempfile.mkdtemp( suffix=name )
        try:
            metaPath = os.path.join( tmpDir, 'metadata.xml' )
            with open( metaPath, 'w', encoding='utf-8' ) as metaFile:
                metaFile.write( item.toprettyxml() )

            mp3Url = item.getElementsByTagName( 'enclosure' )[ 0 ].getAttribute( 'url' )
            urllib.request.urlretrieve( mp3Url, os.path.join( tmpDir, name + '.mp3' ) )

            # The thumbnail is optional: an item without one must not lose
            # the audio that was just downloaded.
            images = item.getElementsByTagName( 'itunes:image' )
            if images:
                thumbnailUrl = images[ 0 ].getAttribute( 'href' )
                urllib.request.urlretrieve(
                    thumbnailUrl, os.path.join( tmpDir, 'thumbnail.png' ) )

            shutil.move( tmpDir, dest )
        except BaseException:
            # Do not leave half-downloaded scratch directories behind.
            shutil.rmtree( tmpDir, ignore_errors=True )
            raise

    with concurrent.futures.ThreadPoolExecutor( max_workers=3 ) as pool:
        futures = { pool.submit( scrape_item, item ): index
                    for index, item in enumerate( items, 1 ) }
        # Inspect every future: an exception raised in a worker is otherwise
        # dropped without any trace.
        for future in concurrent.futures.as_completed( futures ):
            error = future.exception()
            if error is not None:
                print( 'Failed to download item #{}: {!r}'.format(
                    futures[ future ], error ), file=sys.stderr )
def main( args ):
    """Command-line entry point: ``PROG DEST_DIR FEED.rss``.

    Parses the feed file, creates ``DEST_DIR`` (including parents) when it
    does not exist, and downloads every item of the feed into it.

    Args:
        args: Argument vector shaped like ``sys.argv``: program name,
            destination directory, path of the feed file.

    Raises:
        SystemExit: With status 2, after printing a usage message, when the
            number of arguments is wrong.
    """
    if len( args ) != 3:
        prog = args[ 0 ] if args else sys.argv[ 0 ]
        print( 'Usage: {} DEST_DIR FEED.rss'.format( prog ) )
        # The os module has no exit(); sys.exit is the call that terminates
        # with the given status.
        sys.exit( 2 )

    _, dest_dir, rssFile = args
    dom = minidom.parse( rssFile )

    dest_dir = os.path.abspath( dest_dir )
    pathlib.Path( dest_dir ).mkdir( parents=True, exist_ok=True )

    scrape( dest_dir, dom )
# Run the downloader only when this file is executed as a script; importing
# it as a module just defines the functions.
if __name__ == '__main__':
    main( sys.argv )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment