Created
January 22, 2010 19:41
-
-
Save matagus/284068 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Given one or more URLs on the command line, find every external link on each page and print it as a (text, url) tuple."""
from mechanize import Browser | |
from urlparse import urlparse | |
import sys | |
def is_external_link(link_url, base_netloc):
    """Return True if ``link_url`` points at a different host than ``base_netloc``.

    Args:
        link_url: Absolute URL of the link being examined.
        base_netloc: Network location (``host[:port]``) of the page that
            contains the link.

    Returns:
        bool: True only when ``link_url`` has a network location of its own
        and that location differs from ``base_netloc``.
    """
    link_netloc = urlparse(link_url)[1]  # index 1 is the netloc component
    if not link_netloc:
        # Schemes such as mailto: or javascript: carry no host at all, so
        # they cannot point to an external site.
        return False
    # Host names are case-insensitive, so compare them case-folded.
    return link_netloc.lower() != base_netloc.lower()


def extract_external_links(browser, url):
    """Open ``url`` in ``browser`` and collect the links that leave its host.

    Args:
        browser: Object providing ``open(url)`` and ``links()``; the script
            passes a ``mechanize.Browser``.
        url: Address of the page to scan.

    Returns:
        set: Unique ``(link.text, link.absolute_url)`` tuples for every link
        on the page whose host differs from the host of ``url``.
    """
    base_netloc = urlparse(url)[1]  # get the netloc part of the url
    browser.open(url)
    external_links = set()
    for link in browser.links():
        if is_external_link(link.absolute_url, base_netloc):
            external_links.add((link.text, link.absolute_url))
    return external_links


if __name__ == "__main__":
    br = Browser()
    # Every command-line argument is treated as a page URL to scan.
    for url in sys.argv[1:]:
        for external_link in extract_external_links(br, url):
            # Parenthesised single-argument form prints the same tuple on
            # Python 2 and is also valid Python 3 syntax.
            print(external_link)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment