# icedraco/url-extractor.py

## url-extractor.py
#!/usr/bin/python

### Simple RegExp-Driven URL Extractor (20141012.0118) by IceDragon
#

import re, sys, os

# Captures the href value of an <a ...> tag whose href is wrapped in double quotes.
RE_ANCHOR_DQUOTES = re.compile(r'<a [^>]*href="([^"]*)"[^>]*>')
# Same, for an href wrapped in single quotes. The delimiters must be the quote
# character itself; the previous pattern used a literal "s" on both sides and
# therefore never matched single-quoted hrefs.
RE_ANCHOR_SQUOTES = re.compile(r"<a [^>]*href='([^']*)'[^>]*>")

def get_url_list(input):
        """Return the anchor hrefs found in the text read from *input*.

        *input* is any object with a read() method returning the whole
        document as one string. The result is a list holding every match of
        RE_ANCHOR_DQUOTES followed by every match of RE_ANCHOR_SQUOTES, so it
        is grouped by pattern rather than kept in document order.
        """
        html = input.read()
        urls = RE_ANCHOR_DQUOTES.findall(html)
        urls.extend(RE_ANCHOR_SQUOTES.findall(html))
        return urls


def main(argv):
        """Print every URL extracted from the named files, or from STDIN.

        *argv* is the full argument vector with the program name first. When
        it names one or more files, each existing file is scanned in order and
        paths that do not exist are skipped silently; with no file arguments
        the input is read from STDIN instead. One URL is printed per line.

        Returns 0, which the caller uses as the process exit status.
        """
        urls = []

        # Either extract URLs from a list of files, or from STDIN otherwise
        if len(argv) > 1:
                # Walk the argv we were handed (not sys.argv) so that a
                # caller-supplied argument list is actually honoured.
                for filename in argv[1:]:
                        if os.path.exists(filename):
                                # Close each file as soon as it has been read.
                                with open(filename, 'r') as source:
                                        urls += get_url_list(source)
        else:
                urls += get_url_list(sys.stdin)

        # Print all resulting URLs. The parenthesised form produces the same
        # output under Python 2 and is also valid Python 3.
        for url in urls:
                print(url)

        return 0


# Script entry point: run main() on the real command line and use its return
# value as the process exit status.
if __name__ == "__main__":
        sys.exit(main(sys.argv))
	#!/usr/bin/python

	### Simple RegExp-Driven URL Extractor (20141012.0118) by IceDragon
	#

	import re, sys, os

	# Captures the href value of an <a ...> tag whose href is wrapped in double
	# quotes. The "*" quantifiers are required: without them the pattern only
	# accepts a single character before href and a one-character URL.
	RE_ANCHOR_DQUOTES = re.compile(r'<a [^>]*href="([^"]*)"[^>]*>')
	# Same, for an href wrapped in single quotes; the delimiters are the quote
	# character itself rather than a literal "s".
	RE_ANCHOR_SQUOTES = re.compile(r"<a [^>]*href='([^']*)'[^>]*>")

	def get_url_list(input):
		"""Return the anchor hrefs found in the text read from *input*.

		*input* is any object with a read() method returning the whole
		document as one string. The result is a list holding every match of
		RE_ANCHOR_DQUOTES followed by every match of RE_ANCHOR_SQUOTES, so it
		is grouped by pattern rather than kept in document order.
		"""
		# The patterns are only read here, so no "global" declaration is needed.
		html = input.read()
		return RE_ANCHOR_DQUOTES.findall(html) + RE_ANCHOR_SQUOTES.findall(html)


	def main(argv):
		"""Print every URL extracted from the named files, or from STDIN.

		*argv* is the full argument vector with the program name first. When
		it names one or more files, each existing file is scanned in order and
		paths that do not exist are skipped silently; with no file arguments
		the input is read from STDIN instead. One URL is printed per line.

		Returns 0, which the caller uses as the process exit status.
		"""
		urls = []

		# Either extract URLs from a list of files, or from STDIN otherwise
		if len(argv) > 1:
			# Walk the argv we were handed (not sys.argv) so that a
			# caller-supplied argument list is actually honoured.
			for filename in argv[1:]:
				if os.path.exists(filename):
					# Close each file as soon as it has been read.
					with open(filename, 'r') as source:
						urls += get_url_list(source)
		else:
			urls += get_url_list(sys.stdin)

		# Print all resulting URLs. The parenthesised form produces the same
		# output under Python 2 and is also valid Python 3.
		for url in urls:
			print(url)

		return 0


	# Script entry point: run main() on the real command line and use its return
	# value as the process exit status.
	if __name__ == "__main__":
		raise SystemExit(main(sys.argv))