/home/romster/bin/lsurl.py
#!/usr/bin/python
# lsurl: print the files linked from HTTP(S) directory indexes and FTP directory listings
import sys
import argparse
import urlparse
import re
import urllib2
from BeautifulSoup import BeautifulSoup
import ftplib
from filtertool.util import dprint, set_synchronous_debug, set_prefix
common_extensions = [
    '.tar.gz', '.tar.Z', '.tgz', '.tar.bz2', '.tbz2', '.tbz',
    '.tar.lzma', '.tar.xz', '.zip', '.bin', '.rpm', '.deb',
]
# pick apart command line
parser = argparse.ArgumentParser()
parser.add_argument('urls', nargs='*',
                    help='URLs to fetch from')
parser.add_argument('--timeout', type=int, default=25,
                    help='network timeout. defaults to 25 seconds.')
parser.add_argument('--stdin', action='store_true',
                    help='read URLs from standard input (stdin is read to EOF before any URL is processed!)')
parser.add_argument('--synchronous-debug', action='store_true',
                    help='print debug messages to stdout so they stay in proper order with the main output')
parser.add_argument('--ext', action='append', default=[],
                    help='semicolon-separated list of suffixes. if present, only URLs with a matching suffix will be added to the output. may be specified multiple times.')
parser.add_argument('--common-ext', action='store_true',
                    help='shorthand for --ext "' + ';'.join(common_extensions) + '"')
parser.add_argument('--strip',
                    help='string suffix. if present at the end of a URL, it is removed.')
args = parser.parse_args()
if args.synchronous_debug: set_synchronous_debug(True)
set_prefix('lsurl')
# read URLs from stdin if requested (strip surrounding whitespace and drop blank lines)
if args.stdin:
    args.urls.extend(x.strip() for x in sys.stdin.readlines() if x.strip())
# "http://mirrors.ibiblio.org/wine/source/1.4/"
# "ftp://ftp.mutt.org/pub/mutt/contrib/"
# "http://www.crummy.com/software/BeautifulSoup/download/3.x/"
# collect the suffix filter list
all_ext = []
for spec in args.ext:
    all_ext.extend(e.strip() for e in spec.split(';') if e.strip())
if args.common_ext:
    all_ext.extend(common_extensions)
# buffer errors for a summary printed at the end
errors = {}

def push_error(key, message):
    errors.setdefault(key, []).append(message)

def strip_end(text, suffix):
    # slice with len() rather than a negative index so that an empty
    # suffix does not truncate the whole string
    if not text.endswith(suffix):
        return text
    return text[:len(text) - len(suffix)]
# do the magic!
for x in args.urls:
    url = urlparse.urlparse(x)
    canonical_url = url.geturl()
    results = []
    if url.scheme == 'ftp':
        try:
            # use url.hostname rather than url.netloc so that user:pass@host
            # URLs do not confuse ftplib
            ftp = ftplib.FTP(url.hostname,
                             url.username if url.username is not None else "anonymous",
                             url.password if url.password is not None else "user@example.com",
                             timeout=args.timeout)
            listing = ftp.nlst(url.path)
            results.extend(urlparse.urljoin(canonical_url, f) for f in listing)
        except ftplib.all_errors as e:
            # all_errors also covers timeouts and connection failures,
            # not just permission errors
            push_error(x, str(e))
            continue
    elif url.scheme in ('http', 'https'):
        try:
            # headers must be set on the Request object; assigning to
            # urllib2.addheaders at module level has no effect
            request = urllib2.Request(canonical_url,
                                      headers={'User-Agent': 'curl/7.39.0'})
            page = urllib2.urlopen(request, timeout=args.timeout).read()
            soup = BeautifulSoup(page)
        except urllib2.URLError as e:
            # URLError also catches HTTPError, its subclass
            push_error(x, str(e))
            continue
        # resolve every anchor on the page against the base URL
        for anchor in soup.findAll('a', href=True):
            results.append(urlparse.urljoin(canonical_url, anchor['href']))
    else:
        push_error(x, "unknown URL scheme")
    for r in results:
        # strip the suffix, if requested and present
        if args.strip is not None:
            r = strip_end(r, args.strip)
        # no --ext filter means every result is printed
        if not all_ext:
            print r
        elif any(r.endswith(e) for e in all_ext):
            print r
# print the buffered error summary
for key in errors:
    for message in errors[key]:
        dprint("Error: [" + key + "] " + message)