Bulk-download images linked from a given page, scoped by a CSS class or ID selector.
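Usage sketch (the script filename and the URL/selector values below are placeholders, not from the gist):

    python download_images.py -u http://example.com/gallery.html -c .main-content

The -c value takes a jQuery-style selector: a leading . is treated as a class, a leading # as an id.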
#!/usr/bin/python
"""
Download images that are linked from a given page, scoped by a CSS selector.
(Written for Python 2: urlparse, urllib.urlretrieve, print statements.)
"""
import optparse
from bs4 import BeautifulSoup
import requests
from urlparse import urlsplit, urljoin
import urllib
import os

def get_options():
    """
    Define and retrieve options from the command line
    """
    parser = optparse.OptionParser()
    parser.add_option('-u',
                      help='URL with table of contents to parse (required)',
                      dest='url')
    parser.add_option('-c',
                      help='CSS class or ID to parse, e.g. main-content (required)',
                      dest='css_class')
    (opts, args) = parser.parse_args()
    # make sure all mandatory options are set:
    mandatory_options = ['url', 'css_class']
    for m in mandatory_options:
        if getattr(opts, m) is None:
            print "A mandatory option is missing!\n"
            parser.print_help()
            exit(-1)
    return opts, args

def get_soup(url):
    """
    Retrieve HTML soup from the given URL
    """
    r = requests.get(url)
    # name the parser explicitly so BeautifulSoup does not have to guess:
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def get_links(soup, selector, url):
    """
    Parse the 'table of contents' page and collect the links found
    inside the element that matches the selector.
    """
    url_list = []
    # choose the element(s) to search, depending on the selector type:
    if selector[0] == 'id':
        divs = soup.find_all(id=selector[1])
    elif selector[0] == 'class':
        divs = soup.find_all(class_=selector[1], limit=1)
    else:
        divs = soup.find_all('body', limit=1)
    # then retrieve all links:
    for div in divs:
        for link in div.find_all('a'):
            href = str(link.get('href'))
            # ignore empty links, anchors, missing hrefs, and mailto:
            if href != '' and href[0] != '#' and 'None' not in href and 'mailto:' not in href:
                url_list.append(sanitize_url(url, link.get('href')))
    print 'Found %s links (Selector: %s).' % (len(url_list), selector)
    # drop duplicates while preserving the original order:
    seen = set()
    unique_urls = []
    for u in url_list:
        if u not in seen:
            seen.add(u)
            unique_urls.append(u)
    print 'Removing duplicates, the list was reduced to %s links.' % len(unique_urls)
    return unique_urls

def sanitize_url(url, current_url):
    """
    Account for relative links: if a link has no scheme/netloc of its own,
    resolve it against the page URL that is being parsed.
    """
    if 'http' in current_url:
        # absolute link: keep its host and path, drop query and fragment
        current_url_parts = urlsplit(current_url)
        sanitized_url = 'http://' + current_url_parts.netloc + current_url_parts.path
    else:
        # relative link: resolve it against the base URL
        sanitized_url = urljoin(url, current_url)
    return sanitized_url
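
# For illustration (hypothetical URLs):
#   sanitize_url('http://example.com/toc.html', 'abc.html')
#   -> 'http://example.com/abc.html'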

def class_or_id(selector):
    """
    Differentiate between classes and ids in the way jQuery does (#id, .class)
    """
    if selector[0] == '.':
        soup_selector = 'class'
    elif selector[0] == '#':
        soup_selector = 'id'
    else:
        soup_selector = ''
    return [soup_selector, selector[1:]]
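
# For reference, the mapping this produces:
#   class_or_id('.main-content') -> ['class', 'main-content']
#   class_or_id('#content')      -> ['id', 'content']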

def download_file(url):
    """
    Download a single file, named after the last path segment of its URL
    """
    filename = url[url.rfind('/')+1:]
    urllib.urlretrieve(url, filename)
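
# For illustration (hypothetical URL): a file at
# 'http://example.com/images/photo.jpg' is saved locally as 'photo.jpg'.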

def download_files(url_list, category):
    """
    Download every URL in the list into a directory named after the category
    """
    if not os.path.isdir(category):
        os.mkdir(category)
    os.chdir(category)
    for num, url in enumerate(url_list):
        download_file(url)
        # derive the filename again, just for the status message:
        filename = url[url.rfind('/')+1:]
        print '- [%s] Downloaded %s' % (num+1, filename)

def main():
    """
    Main function that wires everything together
    """
    # get options:
    (opts, args) = get_options()
    url = opts.url
    selector = class_or_id(opts.css_class)
    # start the parser:
    soup = get_soup(url)
    url_list = get_links(soup, selector, url)
    # DEBUG (only download part of the links):
    #url_list = url_list[21:25]
    # name the download directory after the last path segment of the URL:
    category = url.rstrip('/').split('/')[-1]
    download_files(url_list, category)


if __name__ == '__main__':
    main()