Created
December 4, 2014 10:33
-
-
Save anonymous/1c06e7fcec9b1b7c3db3 to your computer and use it in GitHub Desktop.
/home/romster/bin/lsurl.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import sys | |
import argparse | |
import urlparse | |
import re | |
import urllib2 | |
from BeautifulSoup import BeautifulSoup | |
import ftplib | |
from filtertool.util import dprint, set_synchronous_debug, set_prefix | |
# Archive/package suffixes commonly found in upstream download directories;
# used by --common-ext as a built-in extension filter.
common_extensions = [
    '.tar.gz',
    '.tar.Z',
    '.tgz',
    '.tar.bz2',
    '.tbz2',
    '.tbz',
    '.tar.lzma',
    '.tar.xz',
    '.zip',
    '.bin',
    '.rpm',
    '.deb',
]
# Command-line interface: positional URLs plus filtering/input options.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
    'urls', nargs='*',
    help='URL to fetch from')
arg_parser.add_argument(
    '--timeout', type=int, default=25,
    help='network timeout. defaults to 25 seconds.')
arg_parser.add_argument(
    '--stdin', action='store_true',
    help='read URLs from standard input (exhaustive read is performed before URLs are being processed!)')
arg_parser.add_argument(
    '--synchronous-debug', action='store_true',
    help='print debug messages to stdout so they stay in proper order with the main output')
arg_parser.add_argument(
    '--ext', action='append', default=[],
    help='semicolon separated list of suffixes. if present, only URLs with a matching suffix will be added to the output. may be specified multiple times.')
arg_parser.add_argument(
    '--common-ext', action='store_true',
    help='shorthand for --ext "' + ';'.join(common_extensions) + '"')
arg_parser.add_argument(
    '--strip',
    help='string suffix. if present at the end of the URL, it is removed.')
args = arg_parser.parse_args()
# Configure debug output before anything else is printed.
if args.synchronous_debug:
    set_synchronous_debug(True)
set_prefix('lsurl')

# Read URLs from stdin if requested (strip surrounding whitespace and drop
# blank lines). Iterating the file object avoids materializing readlines()
# only to filter it afterwards.
if args.stdin:
    args.urls.extend(line.strip() for line in sys.stdin if line.strip())
# Example inputs:
# "http://mirrors.ibiblio.org/wine/source/1.4/"
# "ftp://ftp.mutt.org/pub/mutt/contrib/"
# "http://www.crummy.com/software/BeautifulSoup/download/3.x/"

# Flatten every --ext occurrence (each a semicolon-separated list) into one
# list of suffixes. The original code shadowed the name `x` three times here;
# distinct names make the two nesting levels explicit.
all_ext = []
for spec in args.ext:
    stripped = [part.strip() for part in spec.split(';')]
    all_ext.extend(part for part in stripped if part)
if args.common_ext:
    all_ext.extend(common_extensions)
# Per-URL error messages are buffered here and reported after all normal
# output, so the listing itself stays clean.
errors = {}

def push_error(key, message):
    """Record an error message under the given URL key.

    key -- the URL (as given on the command line) that failed
    message -- human-readable description of the failure
    """
    # setdefault replaces the check-then-insert dance with a single lookup.
    errors.setdefault(key, []).append(message)
def strip_end(text, suffix):
    """Return text with suffix removed from its end, or text unchanged.

    Uses len()-based slicing rather than a negative index so that an
    empty suffix leaves the text intact.
    """
    if text.endswith(suffix):
        return text[:len(text) - len(suffix)]
    return text
# Main work: for each URL, obtain a directory listing (FTP) or scrape anchor
# hrefs (HTTP/HTTPS), resolve each entry against the base URL, then apply the
# --strip and extension filters before printing.
for x in args.urls:
    url = urlparse.urlparse(x)
    canonical_url = url.geturl()
    results = []
    if url.scheme == 'ftp':
        try:
            ftp = ftplib.FTP(url.netloc,
                             url.username if url.username is not None else "anonymous",
                             url.password if url.password is not None else "user@example.com",
                             timeout=args.timeout)
            try:
                listing = ftp.nlst(url.path)
            finally:
                # FIX: the control connection was previously never closed.
                ftp.close()
            results.extend(urlparse.urljoin(canonical_url, f) for f in listing)
        # FIX: catching only error_perm let temporary/socket errors abort the
        # whole run; all_errors covers every ftplib failure mode.
        except ftplib.all_errors as e:
            push_error(x, str(e))
            continue
    elif url.scheme == 'http' or url.scheme == 'https':
        try:
            # FIX: assigning urllib2.addheaders set a module attribute and
            # never affected any request; the header must go on a Request.
            request = urllib2.Request(canonical_url,
                                      headers={'User-agent': 'curl/7.39.0'})
            page = urllib2.urlopen(request, timeout=args.timeout).read()
        # FIX: URLError (DNS failure, refused connection) also subsumes
        # HTTPError, which was the only exception caught before.
        except urllib2.URLError as e:
            push_error(x, str(e))
            continue
        soup = BeautifulSoup(page)
        for anchor in soup.findAll('a', href=True):
            results.append(urlparse.urljoin(canonical_url, anchor['href']))
    else:
        push_error(x, "unknown URL scheme")
    for r in results:
        # strip configured suffix, if present
        if args.strip is not None:
            r = strip_end(r, args.strip)
        # optionally filter by extension; an empty filter list means "print all"
        if not all_ext:
            print(r)
        elif r.endswith(tuple(all_ext)):
            print(r)
# Emit the buffered per-URL error messages after all normal output.
for failed_url, messages in errors.items():
    for message in messages:
        dprint("Error: [" + failed_url + "] " + message)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment