#!/usr/bin/env python2.7
"""
Parallel RSS-Feed Downloader
----------------------------
Download contents by grabbing links from a given RSS-feed.
Works transparently over HTTP, FTP, and HTTPS.
Currently works on Linux, Mac OS X, and possibly other Unix systems as well,
but _NOT_ on Windows yet.
__project__ = "Parallel RSS-Feed Downloader"
__author__ = "Khaled Monsoor <k@kmonsoor.com>"
__license__ = "MIT"
__version__ = "1.0"
__python__ = "2.7.*"
Usage:
------
python parallel_rss_download.py --feed=<feed_url>
Pre-requisites:
---------------
* OS: Linux / Mac OS / Unix (Windows support NOT implemented)
* Python version: 2.7.0 +
* Required Modules:
* URLGrabber < http://urlgrabber.baseurl.org >
- Installation: "sudo pip install urlgrabber"
* PycURL < http://pycurl.sourceforge.net/ >
- Installation: "sudo apt-get install python-pycurl"
Features:
---------
* Transparency over protocols like HTTP, HTTPS, or FTP
* Multi-threaded simultaneous downloading
* Completion of previous partial downloads
* Skipping of previously completed downloads
* Automatic retry mechanism
* Exception handling
* Proxy support
TODO
----
* Update the logging mechanism to also log to a file
* Implement progress meter using URLGrabber's hook
* Custom download location
* Check a file's timestamp and checksum
  to verify the completion of a download
* Utilize callback hooks
* Handle KeyboardInterrupt (the user's CTRL+C) gracefully
* Smarter feed parsing
"""
from urlgrabber.grabber import URLGrabber, URLGrabError
import xml.etree.ElementTree as xmlparse
import multiprocessing as mp
import datetime as dt
import urllib2 as u2
import signal
import argparse
import sys
import os
# ===============================
# Default Download configurations
# ===============================
default_proxy = None
default_timeout = 300
default_retry = 3
thread_count = 5
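# Hedged sketch (an assumption, not taken from the original script): URLGrabber's
# `proxies` argument is understood to take a urllib-style dict mapping a protocol
# to a proxy URL, so enabling the "Proxy support" feature would look roughly like:
#
#     default_proxy = {'http': 'http://proxy.example.com:3128',
#                      'ftp':  'http://proxy.example.com:3128'}
#
# The host and port above are placeholders; keeping default_proxy = None means
# direct connections.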
# ===============================
# Not utilized yet
def init_worker():
    # make worker processes ignore CTRL+C (SIGINT), so only the parent handles it
    signal.signal(signal.SIGINT, signal.SIG_IGN)
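# Hedged sketch for the CTRL+C TODO (an assumption; this is not wired in yet):
# init_worker could be passed to the pool as its initializer so worker processes
# ignore SIGINT, while the parent catches KeyboardInterrupt and tears the pool down.
#
#     pool = mp.Pool(thread_count, initializer=init_worker)
#     try:
#         # map_async + get(timeout) lets the parent actually receive KeyboardInterrupt
#         pool.map_async(threaded_download, all_downloads).get(timeout=default_timeout * 100)
#     except KeyboardInterrupt:
#         pool.terminate()
#     else:
#         pool.close()
#     finally:
#         pool.join()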
def threaded_download(single_download, logfile=None):
    """
    Download a single URL; meant to run as a worker from a process pool.
    On its own it is not thread-safe; concurrency has to be managed by the caller.
    Download location: <Current Directory>

    single_download --> complete download link
    logfile         --> use the default logfile if not supplied.
    """
    # registering CTRL+C as UserInterrupt
    # signal.signal(signal.SIGINT, signal.SIG_IGN)
    response = "Not Downloaded"
    try:
        download_size = int(u2.urlopen(single_download).info().getheaders("Content-Length")[0])
        print "Starting: " + str(single_download) + " :: Download target's size: %s KB" % (download_size / 1024)
        g = URLGrabber(reget='simple', retry=default_retry, timeout=default_timeout, proxies=default_proxy)
        response = g.urlgrab(single_download)
        print "Completed: " + response
    except URLGrabError as ue:
        print str(ue) + "\nSkipping: " + single_download
    else:
        return response  # response --> the downloaded file's name, if the download succeeded
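# Hedged sketch for the progress-meter TODO (an assumption about urlgrabber's API):
# urlgrabber is believed to ship a text progress meter in urlgrabber.progress that
# can be handed to URLGrabber via its `progress_obj` keyword, e.g.:
#
#     from urlgrabber.progress import TextMeter
#     g = URLGrabber(reget='simple', retry=default_retry, timeout=default_timeout,
#                    proxies=default_proxy, progress_obj=TextMeter())
#
# With several pool workers printing at once the meters would interleave on stdout,
# so a per-worker log file (the unused `logfile` parameter) may be preferable.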
def download(feed_url):
    try:
        tree = xmlparse.parse(u2.urlopen(feed_url))
    except u2.URLError:
        print "Error: URL not found"
        return
    except ValueError:
        print "Error: Invalid URL"
        return
    except xmlparse.ParseError:
        print "Error: Invalid feed"
        return
    # check the RSS feed's validity by inspecting the root element
    if str(tree.getroot()).find("rss") > 0:
        # parse the feed for the download links
        all_downloads = [item.findtext('link') for item in tree.iterfind('channel/item')]
    else:
        print "Sorry: The given URL is not a valid RSS feed."
        exit(0)
    print "Feed URL grabbed and parsed successfully.\n" \
          "List of targeted downloads\n" \
          "--------------------------"
    for x in all_downloads:
        print x
    print "\n===============================" \
          "\n Starting Download\n==============================="
    # parallel downloading through a tricky use of Pool.map()
    my_pool = mp.Pool(thread_count)  # mp.Pool(thread_count, init_worker)
    results = my_pool.map(threaded_download, all_downloads)
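def is_already_complete(single_download, local_file):
    """
    Hedged sketch for the timestamp/checksum TODO (an illustrative assumption;
    not called anywhere yet): treat a download as complete when the local file's
    size matches the server-reported Content-Length, mirroring the size lookup
    already done in threaded_download().
    """
    expected = int(u2.urlopen(single_download).info().getheaders("Content-Length")[0])
    return os.path.exists(local_file) and os.path.getsize(local_file) == expected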
if __name__ == '__main__':
    cli = argparse.ArgumentParser(description='Download contents by grabbing links from a given RSS feed')
    cli.add_argument('--feed', dest='feed_url', action='store', help='URL of the RSS feed')
    # cli.add_argument('--output', dest='local_location', action='store', help='local folder where the files are saved')
    parsed_arguments = cli.parse_args(sys.argv[1:])
    if parsed_arguments.feed_url is None:
        print "Error: Sorry, the feed URL cannot be empty. Quitting the requested job ..."
        exit(0)
    else:
        feed_url = parsed_arguments.feed_url
        download(feed_url)