@zach-klippenstein
Created December 9, 2009 19:51
An attempt at a very simple, breadth-first web crawler. As the test file shows, it was supposed to work on Wikipedia; however, it doesn't (Wikipedia seems to send HTML that doesn't actually contain the article content).
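One possible contributor (an assumption, not verified against the 2009 behaviour) is that Wikipedia serves a stripped-down page to clients using urllib's default User-Agent. A minimal sketch of fetching a page with an explicit User-Agent via urllib2, in the same Python 2 style as the gist; the header string and the fetch() helper are purely illustrative:

import urllib2

def fetch(url):
    """Fetch url with an explicit User-Agent instead of urllib's default."""
    # 'SimpleWikiCrawler/0.1' is a made-up identifier for illustration only.
    req = urllib2.Request(url, headers={'User-Agent': 'SimpleWikiCrawler/0.1'})
    return urllib2.urlopen(req).read()

If this were the cause, getLinks() below could call fetch() instead of urllib.urlopen().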
# linkparser.py
from HTMLParser import HTMLParser

class LinkParser(HTMLParser):
    """Collects the href value of every <a> tag fed to it in self.links."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
    #end __init__()

    def reset(self):
        HTMLParser.reset(self)
        self.links = []
    #end reset()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attr in attrs:
                if attr[0] == 'href':
                    self.links.append(attr[1])
    #end handle_starttag()
# Test script for LinkParser and WikiCrawler.
import linkparser
import urllib
from wikicrawler import WikiCrawler

# Sanity check: pull the links out of a page that is known to render fully.
lp = linkparser.LinkParser()
f = urllib.urlopen('http://docs.python.org/library/urllib.html')
lp.feed(f.read())
print(lp.links)

print('Testing wikicrawler...')
print('')

search = 'disambiguation'
filterRegex = 'wikimediafoundation.org|wikipedia.org'

print("Searching for '" + search + "' filtering by '" + filterRegex + "'")
c = WikiCrawler('http://en.wikipedia.org/wiki/United_Nations', filterRegex)
c.crawlTo(search)
# wikicrawler.py
from linkparser import LinkParser
import urllib
import re

class WikiCrawler:
    """Breadth-first crawler that follows links from a start URL until one
    matching a target pattern is found."""

    def __init__(self, url, filterRegex=''):
        self.startUrl = url
        self.filterRegex = re.compile(filterRegex)
    #end __init__()

    def crawlTo(self, regex):
        """Crawls breadth-first from startUrl and returns the number of pages
        processed when the first URL matching regex is found, or None if the
        queue is exhausted without a match."""
        curUrl = self.startUrl
        queue = [curUrl]
        visitedPages = [curUrl]
        counter = 0
        matcher = re.compile(regex)
        while len(queue) > 0:
            counter += 1
            curUrl = queue.pop(0)
            print("Processing '" + self.getPlainUrl(curUrl) + "'...")
            if matcher.search(self.getPlainUrl(curUrl)) is not None:
                return counter
            # add the links from the current node onto the queue
            links = self.getLinks(curUrl)
            for link in links:
                if self.filterUrl(link) and link not in visitedPages:
                    queue.append(link)
                    visitedPages.append(link)
        #end while
        return None
    #end crawlTo()

    def filterUrl(self, url):
        """Returns True if the url should be processed."""
        allow = True
        plainUrl = self.getPlainUrl(url)
        if not url.startswith("http://"):
            allow = False
        if self.filterRegex.search(plainUrl) is None:
            allow = False
        return allow
    #end filterUrl()

    def getLinks(self, url):
        """Fetches url and returns the hrefs found in it (empty on any error)."""
        parser = LinkParser()
        if url is not None and len(url) > 0:
            try:
                f = urllib.urlopen(url)
                parser.feed(f.read())
            except Exception:
                pass
        #end if
        return parser.links
    #end getLinks()

    def getPlainUrl(self, url):
        """Returns url with any #fragment or ? GET arguments removed"""
        plainUrl = url.partition('#')[0]
        plainUrl = plainUrl.partition('?')[0]
        plainUrl = urllib.unquote(plainUrl)
        return plainUrl
    #end getPlainUrl()
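A separate limitation worth noting: links in Wikipedia article HTML are typically relative (e.g. '/wiki/United_Nations_Security_Council'), and filterUrl() drops anything that does not start with 'http://', so even a fully rendered page would leave the queue empty. A sketch of one possible fix, under the assumption that relative links are part of the problem, using the standard urlparse.urljoin to resolve each href against the page it came from before filtering; resolveLinks() is a hypothetical helper, not part of the gist:

import urlparse  # Python 2 module; urllib.parse in Python 3

def resolveLinks(baseUrl, links):
    """Turns relative hrefs into absolute URLs so filterUrl() can keep them."""
    return [urlparse.urljoin(baseUrl, link) for link in links]

# Hypothetical use inside crawlTo(), replacing the getLinks() call:
#     links = resolveLinks(curUrl, self.getLinks(curUrl))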