Created
July 21, 2012 07:31
-
-
Save elundmark/3154973 to your computer and use it in GitHub Desktop.
Extract URLs from a webpage as a list with Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python -tt | |
# from: http://www.techgaun.com/2012/03/extracting-all-hyperlinks-from-webpages.html | |
import re
import sys
import urllib2

from urlparse import urljoin
def main():
    ''' Usage: url_extractor.py "http://example.com/"
    Fetch the page at the given URL, extract every <a href=...> target,
    resolve relative links ('abc', './abc', '/abc', '../abc') against
    the page URL, and print the resulting links one per line.
    Returns None (output goes to stdout); prints a notice and falls
    back to a default URL when no argument is supplied.
    '''
    if len(sys.argv) != 2:
        print('No URL specified. Taking default URL for link extraction')
        url = 'http://www.example.com/'
    else:
        url = str(sys.argv[1])
    # [^>]* (not .*) keeps the match inside a single <a ...> tag;
    # a greedy .* could swallow a '>' and skip anchors that share a line.
    links_regex = re.compile(r'<a\s+[^>]*href=[\'"]?([^\'" >]+)', re.IGNORECASE)
    try:
        opener = urllib2.build_opener()
        # Some sites refuse Python's default User-Agent, so present a browser one.
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.43 Safari/536.11')]
        response = opener.open(url)
        html = response.read()
        # urljoin resolves './', '/', '../' and protocol-relative '//' links
        # per RFC 3986, unlike the former regex substitution, which mangled
        # '//host/...' links and left '../' untranslated.
        fixed_links = [urljoin(url, link) for link in links_regex.findall(html)]
        print('\n'.join(fixed_links))
    except urllib2.URLError:
        print('Can\'t Connect to the website')
# Script entry point: run only when executed directly, not on import.
# main() returns None, so sys.exit() yields exit status 0 on success.
if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment