gibbbone/soupget.py

## soupget.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import (division, absolute_import, print_function, unicode_literals)
import sys, os, argparse, time
from bs4 import BeautifulSoup

# from: https://stackoverflow.com/a/16518224/6332373
if sys.version_info >= (3,):
    import urllib.request as urllib2
    import urllib.parse as urlparse
else:
    import urllib2
    import urlparse

def download_file(url, dest=None):
    """
    Download the file at ``url`` and save it, optionally inside ``dest``.

    The local file name is the last component of the URL path; when the
    path has no final component, 'downloaded.file' is used instead.
    The body is streamed in 8 KiB blocks and a progress line is printed
    to stdout after every block.

    Args:
        url: Address of the file to fetch (anything ``urlopen`` accepts).
        dest: Directory to save the file into. It is created when it does
            not exist yet. When falsy, the file is written relative to
            the current working directory.

    Returns:
        The path of the file that was written.

    Raises:
        Whatever ``urlopen`` raises when the URL cannot be opened, and
        IOError/OSError when the destination cannot be created or written.
    """
    # Function-scope import: keeps the block self-contained.
    from contextlib import closing

    # closing() releases the connection on both Python 2 and Python 3,
    # even if writing the file fails half-way through.
    with closing(urllib2.urlopen(url)) as u:
        filename = os.path.basename(urlparse.urlsplit(url).path)
        if not filename:
            filename = 'downloaded.file'
        if dest:
            if not os.path.isdir(dest):
                try:
                    os.makedirs(dest)
                except OSError:
                    # Another worker may have created the directory in the
                    # meantime; only fail if it is really still missing.
                    if not os.path.isdir(dest):
                        raise
            filename = os.path.join(dest, filename)

        meta = u.info()
        # Python 2 header objects expose getheaders(), Python 3 ones get_all()
        meta_func = meta.getheaders if hasattr(meta, 'getheaders') else meta.get_all
        meta_length = meta_func("Content-Length")
        file_size = None
        if meta_length:
            try:
                file_size = int(meta_length[0])
            except ValueError:
                # Malformed header: download anyway, just without percentages
                file_size = None
        print("Downloading: {0} Bytes: {1}".format(url, file_size))

        file_size_dl = 0
        block_sz = 8192
        with open(filename, 'wb') as f:
            # An empty read marks the end of the stream
            for buffer in iter(lambda: u.read(block_sz), b''):
                file_size_dl += len(buffer)
                f.write(buffer)

                status = "{0:16}".format(file_size_dl)
                if file_size:
                    status += "   [{0:6.2f}%]".format(file_size_dl * 100.0 / file_size)
                # Carriage return: the next status overwrites this one
                status += chr(13)
                print(status, end="")
        print()

    return filename

def collect_all_url(page_url, extensions):
    """
    Collect the targets of all ``<a href=...>`` links found in ``page_url``
    that mention one of the desired extensions.

    A link is kept when any entry of ``extensions`` occurs anywhere in its
    href (plain substring test, not a strict suffix check). Links lacking a
    scheme or a network location are resolved against ``page_url``.

    Args:
        page_url: Address of the HTML page to scan.
        extensions: Iterable of strings to look for in each href. ``None``
            or an empty iterable matches nothing.

    Returns:
        A list of absolute URLs in page order, each URL listed once.
    """
    # Function-scope import: keeps the block self-contained.
    from contextlib import closing

    # closing() releases the connection on both Python 2 and Python 3
    with closing(urllib2.urlopen(page_url)) as conn:
        html = conn.read()
    soup = BeautifulSoup(html, 'lxml')

    wanted = list(extensions or ())
    results = []
    seen = set()
    for tag in soup.find_all('a'):
        link = tag.get('href', None)
        # any() keeps a link matching several extensions from being
        # collected (and later downloaded) once per matching extension.
        if link is None or not any(e in link for e in wanted):
            continue
        # Fallback for badly defined links:
        # a missing scheme or netloc means the href is relative.
        parts = urlparse.urlparse(link)
        if not (parts.scheme and parts.netloc):
            link = urlparse.urljoin(page_url, link)
        # The same URL always maps to the same local file name, so
        # fetching it twice would only overwrite the first download.
        if link not in seen:
            seen.add(link)
            results.append(link)
    return results

if __name__ == "__main__":  # Only run if this file is called directly
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Download all files from a webpage.')
    # The url and the extensions are mandatory: without them the helpers
    # below would fail with an obscure error instead of a usage message.
    parser.add_argument(
        '-u', '--url',
        required=True,
        help='Page url to request')
    parser.add_argument(
        '-e', '--ext',
        nargs='+',
        required=True,
        help='Extension(s) to find')
    parser.add_argument(
        '-d', '--dest',
        default=None,
        help='Destination where to save the files')
    parser.add_argument(
        '-p', '--par',
        action='store_true', default=False,
        help="Turns on parallel download")
    args = parser.parse_args()

    t1 = time.time()
    # Recover files to download
    all_links = collect_all_url(args.url, args.ext)

    # Download
    if not args.par:
        for link in all_links:
            try:
                download_file(link, args.dest)
                print(link)
            except Exception as e:
                # Best effort: report the failure and go on with the next link
                print("Error while downloading: {}".format(e))
    else:
        # from: https://markhneedham.com/blog/2018/07/15/python-parallel-download-files-requests/
        from multiprocessing.pool import ThreadPool

        def _fetch(link):
            # Return the error instead of raising it, so that one failing
            # link does not abort the whole batch (same best-effort policy
            # as the sequential branch).
            try:
                return download_file(link, args.dest), None
            except Exception as e:
                return None, e

        pool = ThreadPool(10)
        try:
            for saved, error in pool.imap_unordered(_fetch, all_links):
                if error is None:
                    print(saved)
                else:
                    print("Error while downloading: {}".format(error))
        finally:
            # Release the worker threads even if printing is interrupted
            pool.close()
            pool.join()
    t2 = time.time()
    print("Elapsed time: {}".format(t2-t1))
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	from __future__ import (division, absolute_import, print_function, unicode_literals)
	import sys, os, argparse, time
	from bs4 import BeautifulSoup

	# from: https://stackoverflow.com/a/16518224/6332373
	if sys.version_info >= (3,):
	import urllib.request as urllib2
	import urllib.parse as urlparse
	else:
	import urllib2
	import urlparse

	def download_file(url, dest=None):
	    """
	    Download the file at ``url`` and save it, optionally inside ``dest``.

	    The local file name is the last component of the URL path; when the
	    path has no final component, 'downloaded.file' is used instead.
	    The body is streamed in 8 KiB blocks and a progress line is printed
	    to stdout after every block.

	    Args:
	        url: Address of the file to fetch (anything ``urlopen`` accepts).
	        dest: Directory to save the file into. It is created when it does
	            not exist yet. When falsy, the file is written relative to
	            the current working directory.

	    Returns:
	        The path of the file that was written.

	    Raises:
	        Whatever ``urlopen`` raises when the URL cannot be opened, and
	        IOError/OSError when the destination cannot be created or written.
	    """
	    # Function-scope import: keeps the block self-contained.
	    from contextlib import closing

	    # closing() releases the connection on both Python 2 and Python 3,
	    # even if writing the file fails half-way through.
	    with closing(urllib2.urlopen(url)) as u:
	        filename = os.path.basename(urlparse.urlsplit(url).path)
	        if not filename:
	            filename = 'downloaded.file'
	        if dest:
	            if not os.path.isdir(dest):
	                try:
	                    os.makedirs(dest)
	                except OSError:
	                    # Another worker may have created the directory in the
	                    # meantime; only fail if it is really still missing.
	                    if not os.path.isdir(dest):
	                        raise
	            filename = os.path.join(dest, filename)

	        meta = u.info()
	        # Python 2 header objects expose getheaders(), Python 3 ones get_all()
	        meta_func = meta.getheaders if hasattr(meta, 'getheaders') else meta.get_all
	        meta_length = meta_func("Content-Length")
	        file_size = None
	        if meta_length:
	            try:
	                file_size = int(meta_length[0])
	            except ValueError:
	                # Malformed header: download anyway, just without percentages
	                file_size = None
	        print("Downloading: {0} Bytes: {1}".format(url, file_size))

	        file_size_dl = 0
	        block_sz = 8192
	        with open(filename, 'wb') as f:
	            # An empty read marks the end of the stream
	            for buffer in iter(lambda: u.read(block_sz), b''):
	                file_size_dl += len(buffer)
	                f.write(buffer)

	                status = "{0:16}".format(file_size_dl)
	                if file_size:
	                    status += " [{0:6.2f}%]".format(file_size_dl * 100.0 / file_size)
	                # Carriage return: the next status overwrites this one
	                status += chr(13)
	                print(status, end="")
	        print()

	    return filename

	def collect_all_url(page_url, extensions):
	    """
	    Collect the targets of all ``<a href=...>`` links found in ``page_url``
	    that mention one of the desired extensions.

	    A link is kept when any entry of ``extensions`` occurs anywhere in its
	    href (plain substring test, not a strict suffix check). Links lacking a
	    scheme or a network location are resolved against ``page_url``.

	    Args:
	        page_url: Address of the HTML page to scan.
	        extensions: Iterable of strings to look for in each href. ``None``
	            or an empty iterable matches nothing.

	    Returns:
	        A list of absolute URLs in page order, each URL listed once.
	    """
	    # Function-scope import: keeps the block self-contained.
	    from contextlib import closing

	    # closing() releases the connection on both Python 2 and Python 3
	    with closing(urllib2.urlopen(page_url)) as conn:
	        html = conn.read()
	    soup = BeautifulSoup(html, 'lxml')

	    wanted = list(extensions or ())
	    results = []
	    seen = set()
	    for tag in soup.find_all('a'):
	        link = tag.get('href', None)
	        # any() keeps a link matching several extensions from being
	        # collected (and later downloaded) once per matching extension.
	        if link is None or not any(e in link for e in wanted):
	            continue
	        # Fallback for badly defined links:
	        # a missing scheme or netloc means the href is relative.
	        parts = urlparse.urlparse(link)
	        if not (parts.scheme and parts.netloc):
	            link = urlparse.urljoin(page_url, link)
	        # The same URL always maps to the same local file name, so
	        # fetching it twice would only overwrite the first download.
	        if link not in seen:
	            seen.add(link)
	            results.append(link)
	    return results

	if __name__ == "__main__":  # Only run if this file is called directly
	    # Command line arguments
	    parser = argparse.ArgumentParser(
	        description='Download all files from a webpage.')
	    # The url and the extensions are mandatory: without them the helpers
	    # below would fail with an obscure error instead of a usage message.
	    parser.add_argument(
	        '-u', '--url',
	        required=True,
	        help='Page url to request')
	    parser.add_argument(
	        '-e', '--ext',
	        nargs='+',
	        required=True,
	        help='Extension(s) to find')
	    parser.add_argument(
	        '-d', '--dest',
	        default=None,
	        help='Destination where to save the files')
	    parser.add_argument(
	        '-p', '--par',
	        action='store_true', default=False,
	        help="Turns on parallel download")
	    args = parser.parse_args()

	    t1 = time.time()
	    # Recover files to download
	    all_links = collect_all_url(args.url, args.ext)

	    # Download
	    if not args.par:
	        for link in all_links:
	            try:
	                download_file(link, args.dest)
	                print(link)
	            except Exception as e:
	                # Best effort: report the failure and go on with the next link
	                print("Error while downloading: {}".format(e))
	    else:
	        # from: https://markhneedham.com/blog/2018/07/15/python-parallel-download-files-requests/
	        from multiprocessing.pool import ThreadPool

	        def _fetch(link):
	            # Return the error instead of raising it, so that one failing
	            # link does not abort the whole batch (same best-effort policy
	            # as the sequential branch).
	            try:
	                return download_file(link, args.dest), None
	            except Exception as e:
	                return None, e

	        pool = ThreadPool(10)
	        try:
	            for saved, error in pool.imap_unordered(_fetch, all_links):
	                if error is None:
	                    print(saved)
	                else:
	                    print("Error while downloading: {}".format(error))
	        finally:
	            # Release the worker threads even if printing is interrupted
	            pool.close()
	            pool.join()
	    t2 = time.time()
	    print("Elapsed time: {}".format(t2-t1))