@elundmark
Created July 21, 2012 07:31
Extract URLs from a webpage as a list with Python
#!/usr/bin/python -tt
# from: http://www.techgaun.com/2012/03/extracting-all-hyperlinks-from-webpages.html
import re
import sys
import urllib2


def main():
    '''Usage: url_extractor.py "http://example.com/"

    NOTICE: Intended for root URLs, i.e. no */file or /subfolder/*;
    for those cases you need to edit this file first.
    './abc' and '/abc' are translated to 'http://example.com/abc'
    ('../' is not translated).
    Output: the extracted links, one per line.
    '''
    if len(sys.argv) != 2:
        print 'No URL specified. Taking default URL for link extraction'
        url = 'http://www.example.com/'
    else:
        url = str(sys.argv[1])
    # Capture the href value of every anchor tag ([^>]* keeps the match
    # inside a single tag, unlike a greedy .*).
    links_regex = re.compile(r'<a\s+[^>]*href=[\'"]?([^\'" >]+)', re.IGNORECASE)
    try:
        # Some sites refuse requests without a browser-like User-Agent.
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (X11; Linux i686) AppleWebKit/536.11 '
                              '(KHTML, like Gecko) Chrome/20.0.1132.43 Safari/536.11')]
        response = opener.open(url)
        html = response.read()
        links = links_regex.findall(html)
        fixed_links = []
        for link in links:
            # Prefix relative links ('./abc', '/abc', '//abc') with the base URL.
            full_url = re.sub(r'^\.?/{1,2}', url, link, count=1)
            fixed_links.append(full_url)
        print '\n'.join(fixed_links)
    except urllib2.URLError:
        print 'Can\'t connect to the website'


if __name__ == '__main__':
    sys.exit(main())
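The script above targets Python 2; urllib2 was split into urllib.request and urllib.error in Python 3. A minimal Python 3 sketch of the same approach (not part of the original gist) could look like the following, where urllib.parse.urljoin replaces the regex substitution and also resolves '../' and absolute links correctly:

#!/usr/bin/env python3
# Sketch only: same link-extraction idea as the gist, ported to Python 3.
import re
import sys
import urllib.error
import urllib.request
from urllib.parse import urljoin

LINKS_REGEX = re.compile(r'<a\s+[^>]*href=[\'"]?([^\'" >]+)', re.IGNORECASE)


def main():
    url = sys.argv[1] if len(sys.argv) == 2 else 'http://www.example.com/'
    # A browser-like User-Agent, as in the original script.
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8', errors='replace')
    except urllib.error.URLError:
        print("Can't connect to the website")
        return 1
    # Resolve every extracted href against the page URL.
    print('\n'.join(urljoin(url, link) for link in LINKS_REGEX.findall(html)))


if __name__ == '__main__':
    sys.exit(main())

Using urljoin instead of the hand-rolled re.sub means non-root URLs work too, which removes the restriction noted in the original docstring.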