Created
November 27, 2014 09:50
-
-
Save icedraco/f96d4e20e968c09033fa to your computer and use it in GitHub Desktop.
Simple regexp-driven URL extractor that looks for anchor tags in an HTML file and returns their HREF contents.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
### Simple RegExp-Driven URL Extractor (20141012.0118) by IceDragon
#
import re, sys, os | |
# Opening anchor tag whose href value is wrapped in double quotes;
# group 1 captures the URL between the quotes.
RE_ANCHOR_DQUOTES = re.compile(r'<a [^>]*href="([^"]*)"[^>]*>')
# Opening anchor tag whose href value is wrapped in single quotes;
# group 1 captures the URL between the quotes. The delimiters must be
# literal apostrophes (a literal "s" here would never match real markup).
RE_ANCHOR_SQUOTES = re.compile(r"<a [^>]*href='([^']*)'[^>]*>")
def get_url_list(input):
    """Collect the href captures found in the text read from *input*.

    *input* is any object exposing ``read()``; its entire content is read
    in one call. The result is the list of RE_ANCHOR_DQUOTES matches
    followed by the list of RE_ANCHOR_SQUOTES matches, so the order is
    grouped by pattern rather than by position in the text.
    """
    html = input.read()
    dquote_hits = RE_ANCHOR_DQUOTES.findall(html)
    squote_hits = RE_ANCHOR_SQUOTES.findall(html)
    return dquote_hits + squote_hits
def main(argv):
    """Print every URL extracted from the given files, or from STDIN.

    Args:
        argv: Argument vector. argv[0] is treated as the program name;
            any further entries are paths of files to scan. When there
            are no further entries, STDIN is scanned instead.

    Returns:
        0, always; the caller uses it as the process exit status.
    """
    output = []

    # Either extract URLs from a list of files, or from STDIN otherwise
    if len(argv) > 1:
        # Iterate the argv we were handed (not sys.argv) so callers that
        # pass their own argument list get the files they asked for.
        for filename in argv[1:]:
            # Paths that do not exist are skipped without any message.
            if os.path.exists(filename):
                # The with-block closes the file as soon as it is read.
                with open(filename, 'r') as handle:
                    output += get_url_list(handle)
    else:
        output += get_url_list(sys.stdin)

    # Print all resulting URLs, one per line. The parenthesised form
    # prints the same text on Python 2 and is valid on Python 3.
    for url in output:
        print(url)

    return 0
# Script entry point: pass the command line to main() and exit with the
# status code it returns.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment