Created
December 4, 2014 10:33
-
-
Save anonymous/1c06e7fcec9b1b7c3db3 to your computer and use it in GitHub Desktop.
/home/romster/bin/lsurl.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import sys | |
import argparse | |
import urlparse | |
import re | |
import urllib2 | |
from BeautifulSoup import BeautifulSoup | |
import ftplib | |
from filtertool.util import dprint, set_synchronous_debug, set_prefix | |
# Archive/package suffixes commonly found in upstream download directories;
# used by --common-ext as a built-in extension filter.
common_extensions = [
    '.tar.gz',
    '.tar.Z',
    '.tgz',
    '.tar.bz2',
    '.tbz2',
    '.tbz',
    '.tar.lzma',
    '.tar.xz',
    '.zip',
    '.bin',
    '.rpm',
    '.deb',
]
# Command-line interface: positional URLs plus filtering/input options.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
    'urls', nargs='*',
    help='URL to fetch from')
arg_parser.add_argument(
    '--timeout', type=int, default=25,
    help='network timeout. defaults to 25 seconds.')
arg_parser.add_argument(
    '--stdin', action='store_true',
    help='read URLs from standard input (exhaustive read is performed before URLs are being processed!)')
arg_parser.add_argument(
    '--synchronous-debug', action='store_true',
    help='print debug messages to stdout so they stay in proper order with the main output')
arg_parser.add_argument(
    '--ext', action='append', default=[],
    help='semicolon separated list of suffixes. if present, only URLs with a matching suffix will be added to the output. may be specified multiple times.')
arg_parser.add_argument(
    '--common-ext', action='store_true',
    help='shorthand for --ext "' + ';'.join(common_extensions) + '"')
arg_parser.add_argument(
    '--strip',
    help='string suffix. if present at the end of the URL, it is removed.')
args = arg_parser.parse_args()
# Configure debug output before anything else is printed.
if args.synchronous_debug:
    set_synchronous_debug(True)
set_prefix('lsurl')

# Read URLs from stdin if requested (strip surrounding whitespace and drop
# blank lines). Iterating the file object avoids materializing readlines()
# only to filter it afterwards.
if args.stdin:
    args.urls.extend(line.strip() for line in sys.stdin if line.strip())
# Example inputs:
# "http://mirrors.ibiblio.org/wine/source/1.4/"
# "ftp://ftp.mutt.org/pub/mutt/contrib/"
# "http://www.crummy.com/software/BeautifulSoup/download/3.x/"

# Flatten every --ext occurrence (each a semicolon-separated list) into one
# list of suffixes. The original code shadowed the name `x` three times here;
# distinct names make the two nesting levels explicit.
all_ext = []
for spec in args.ext:
    stripped = [part.strip() for part in spec.split(';')]
    all_ext.extend(part for part in stripped if part)
if args.common_ext:
    all_ext.extend(common_extensions)
# Per-URL error messages are buffered here and reported after all normal
# output, so the listing itself stays clean.
errors = {}

def push_error(key, message):
    """Record an error message under the given URL key.

    key -- the URL (as given on the command line) that failed
    message -- human-readable description of the failure
    """
    # setdefault replaces the check-then-insert dance with a single lookup.
    errors.setdefault(key, []).append(message)
def strip_end(text, suffix):
    """Return text with suffix removed from its end, or text unchanged.

    Uses len()-based slicing rather than a negative index so that an
    empty suffix leaves the text intact.
    """
    if text.endswith(suffix):
        return text[:len(text) - len(suffix)]
    return text
# Main work: for each URL, obtain a directory listing (FTP) or scrape anchor
# hrefs (HTTP/HTTPS), resolve each entry against the base URL, then apply the
# --strip and extension filters before printing.
for x in args.urls:
    url = urlparse.urlparse(x)
    canonical_url = url.geturl()
    results = []
    if url.scheme == 'ftp':
        try:
            ftp = ftplib.FTP(url.netloc,
                             url.username if url.username is not None else "anonymous",
                             url.password if url.password is not None else "user@example.com",
                             timeout=args.timeout)
            try:
                listing = ftp.nlst(url.path)
            finally:
                # FIX: the control connection was previously never closed.
                ftp.close()
            results.extend(urlparse.urljoin(canonical_url, f) for f in listing)
        # FIX: catching only error_perm let temporary/socket errors abort the
        # whole run; all_errors covers every ftplib failure mode.
        except ftplib.all_errors as e:
            push_error(x, str(e))
            continue
    elif url.scheme == 'http' or url.scheme == 'https':
        try:
            # FIX: assigning urllib2.addheaders set a module attribute and
            # never affected any request; the header must go on a Request.
            request = urllib2.Request(canonical_url,
                                      headers={'User-agent': 'curl/7.39.0'})
            page = urllib2.urlopen(request, timeout=args.timeout).read()
        # FIX: URLError (DNS failure, refused connection) also subsumes
        # HTTPError, which was the only exception caught before.
        except urllib2.URLError as e:
            push_error(x, str(e))
            continue
        soup = BeautifulSoup(page)
        for anchor in soup.findAll('a', href=True):
            results.append(urlparse.urljoin(canonical_url, anchor['href']))
    else:
        push_error(x, "unknown URL scheme")
    for r in results:
        # strip configured suffix, if present
        if args.strip is not None:
            r = strip_end(r, args.strip)
        # optionally filter by extension; an empty filter list means "print all"
        if not all_ext:
            print(r)
        elif r.endswith(tuple(all_ext)):
            print(r)
# Emit the buffered per-URL error messages after all normal output.
for failed_url, messages in errors.items():
    for message in messages:
        dprint("Error: [" + failed_url + "] " + message)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment