"""
RSS Downloader
~~~~~~~~~~~~~~
The application parses rss feeds from a given url and downloads the files
from the feed to a given output directory.
Features:
~ Resume partially completed download
~ Supports http, and ftp protocols
Logging:
~ All logs are generated in the rssdownloader.log file
Dependencies:
~ feedparser (https://pypi.python.org/pypi/feedparser)
Usage:
python downloader.py --feed=<RSS-Feed-URL> --output=<PATH-TO-DIRECTORY>
:Author: Musa Nasrullah
:Email: musa.dhk@gmail.com
:Website: http://www.redmoses.org
"""
## Code Starts ##
from __future__ import print_function
import sys
import getopt
import logging
import os
import urllib

import feedparser  # feedparser package for parsing rss feeds

# default file downloader for the HTTP, HTTPS, and FTP protocols
def default_downloader(url, download_location, logger):
    # get the file name from the url
    file_name = url.split('/')[-1]
    # build the temporary and the final file paths
    temp_file_path = os.path.join(download_location, file_name + ".tmp")
    file_path = os.path.join(download_location, file_name)
    # initiate the url opener
    url_opener = urllib.FancyURLopener()
    # check if the file already exists in the location
    if not os.path.exists(file_path):
        file_size_dl = 0
        # check if a temporary file exists
        if os.path.exists(temp_file_path):
            # open the file in append mode for resuming the download
            temp_file = open(temp_file_path, 'ab')
            # set the downloaded amount to the existing size
            file_size_dl = os.path.getsize(temp_file_path)
            # ask the server for the remaining byte range only
            url_opener.addheader("Range", "bytes=%s-" % file_size_dl)
        else:  # temporary file doesn't exist so just open it for writing
            temp_file = open(temp_file_path, 'wb')
        # open a connection for downloading the file
        remote_file = url_opener.open(url)
        # check if the url is valid (getcode() returns None for ftp)
        code = remote_file.getcode()
        if code is not None and code >= 400:
            temp_file.close()
            return
        # get the file meta information and the reported size in bytes
        meta = remote_file.info()
        file_size = float(meta.getheaders("Content-Length")[0])
        # a 206 (Partial Content) reply means the server honoured the Range
        # header and the download can resume; anything else starts over
        resume_support = (code == 206)
        if not resume_support:  # server doesn't support resume
            if os.path.isfile(temp_file_path):
                # truncate the temporary file and restart the download
                temp_file.close()
                temp_file = open(temp_file_path, 'wb')
            # reset the downloaded size
            file_size_dl = 0
        # on a resumed download Content-Length covers only the remaining
        # bytes, so add the part that is already on disk
        total_size = file_size_dl + file_size if resume_support else file_size
        # start the download
        print("\n")
        logger.info("Downloading: %s [%3.1f bytes]" % (file_name, total_size))
        # read the remote file in chunks of 8192 bytes
        block_sz = 8192
        # keep looping until the remote file is read completely
        while True:
            remote_data = remote_file.read(block_sz)
            if not remote_data:
                break
            file_size_dl += len(remote_data)
            # write the downloaded data
            temp_file.write(remote_data)
            # calculate the percentage downloaded
            status = r"%10d bytes [%3.1f%%]" % (file_size_dl, file_size_dl * 100. / total_size)
            # print the new status message, replacing the previous one
            print(status, end='\r')
        # close the files
        temp_file.close()
        remote_file.close()
        # rename the temporary file only when the download completed
        if file_size_dl == total_size:
            logger.info("\nDownload completed: %s [%3.1f bytes]" % (file_name, total_size))
            os.rename(temp_file_path, file_path)
    else:  # file already downloaded
        logger.info("%s already downloaded." % file_name)

# determine the protocol and download the file accordingly
def download_file(url, download_location, protocol, logger):
    if protocol in ("http", "https", "ftp"):  # use the default downloader
        default_downloader(url, download_location, logger)
    # elif protocol == 'sftp':
    #     call an sftp downloader here

# get the download links from the feed
def download_feed(feed_url, download_location, logger):
    logger.info("Retrieving download links from the feed...")
    # parse the rss feed
    feed = feedparser.parse(feed_url)
    logger.info("Downloading files...")
    # loop through all the items in the feed
    for item in feed.entries:
        file_url = item.link
        # get the url protocol, e.g. "http" from "http://..."
        protocol = file_url.split(':')[0].lower()
        download_file(file_url, download_location, protocol, logger)
    logger.info("Operations completed")

# show usage function
def show_usage():
    print('Usage example: downloader.py --feed=<RSS-Feed-URL> --output=<PATH-TO-DIRECTORY>')

# the main function
def main(argv):
    # initiate the parameters
    feed_url = ''
    output_location = ''
    try:
        # define the accepted parameters
        params, args = getopt.getopt(argv, "hf:o:", ["feed=", "output="])
    except getopt.GetoptError:
        # an invalid parameter was supplied, so show usage
        show_usage()
        sys.exit(2)
    for param, arg in params:
        # help
        if param == '-h':
            show_usage()
            sys.exit()
        # get the feed url
        elif param in ("-f", "--feed"):
            feed_url = arg
        # get the output location
        elif param in ("-o", "--output"):
            output_location = arg
    # configure the logger
    logger = logging.getLogger('RSS_Downloader')
    # configure the log file
    file_handler = logging.FileHandler('downloader.log')
    # set the log formatting
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(formatter)
    # log messages both to the console and to the log file
    logger.addHandler(logging.StreamHandler())
    logger.addHandler(file_handler)
    # set the logging level
    logger.setLevel(logging.INFO)
    # both parameters are required
    if feed_url != '' and output_location != '':
        logger.info('Feed URL: %s | Download Destination: %s' % (feed_url, output_location))
        # download the feed
        download_feed(feed_url, output_location, logger)
    else:
        logger.error('Error: Required parameters are missing.')
        show_usage()
        sys.exit(2)


if __name__ == "__main__":
    main(sys.argv[1:])