Skip to content

Instantly share code, notes, and snippets.

@icedraco
Created November 27, 2014 09:50
Show Gist options
  • Save icedraco/f96d4e20e968c09033fa to your computer and use it in GitHub Desktop.
Save icedraco/f96d4e20e968c09033fa to your computer and use it in GitHub Desktop.
Simple regexp-driven URL extractor that looks for anchor tags in an HTML file and returns the HREF contents back
#!/usr/bin/python
### Simple RegExp-Driven URL Extractor (20141012.0118) by IceDragon
#
import re, sys, os
# Patterns capturing the HREF value of an anchor tag. Both require a literal
# lowercase "<a " opener and an "href=" that is immediately followed by the
# quote character; the captured group is everything up to the closing quote.
RE_ANCHOR_DQUOTES = re.compile(r'<a [^>]*href="([^"]*)"[^>]*>')
# The value must be delimited by single quotes here (the previous pattern used
# a literal "s" on each side, so it never matched href='...').
RE_ANCHOR_SQUOTES = re.compile(r"<a [^>]*href='([^']*)'[^>]*>")
def get_url_list(input):
    """Return the HREF values of all anchor tags found in *input*.

    *input* is any object exposing ``read()``; its entire contents are read
    in one call. The result is a list holding every RE_ANCHOR_DQUOTES match
    followed by every RE_ANCHOR_SQUOTES match (so it is grouped by pattern,
    not ordered by position in the document).
    """
    html = input.read()
    dquoted = RE_ANCHOR_DQUOTES.findall(html)
    squoted = RE_ANCHOR_SQUOTES.findall(html)
    return dquoted + squoted
def main(argv):
    """Print every anchor HREF found in the given files, or in STDIN.

    argv -- full argument vector, program name first. When it carries at
            least one extra entry, each entry is treated as a filename to
            scan; otherwise the text is read from STDIN.

    Filenames that do not exist are skipped without an error. Each URL is
    printed on its own line. Always returns 0 (used as the exit status).
    """
    output = []

    # Either extract URLs from a list of files, or from STDIN otherwise
    if len(argv) > 1:
        # Use the argv we were handed rather than the global sys.argv, so
        # callers passing their own argument list get what they asked for.
        for filename in argv[1:]:
            if os.path.exists(filename):
                # Close the file as soon as it has been scanned.
                with open(filename, 'r') as handle:
                    output += get_url_list(handle)
    else:
        output += get_url_list(sys.stdin)

    # Print all resulting URLs
    for url in output:
        # Parenthesised single-argument form works on Python 2 and 3 alike.
        print(url)

    return 0
# Script entry point: run main() on the command line and use its return
# value as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment