@zach-klippenstein
Created December 9, 2009 19:51
An attempt at a very simple, breadth-first web crawler. As the test file shows, it was supposed to work on Wikipedia; however, it doesn't (Wikipedia seems to send HTML that doesn't actually contain the article content).
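One possible contributor (an assumption, not verified against the 2009 behaviour) is that Wikipedia serves a stripped-down page to clients using urllib's default User-Agent. A minimal sketch of fetching a page with an explicit User-Agent via urllib2, in the same Python 2 style as the gist; the header string and the fetch() helper are purely illustrative:

import urllib2

def fetch(url):
    """Fetch url with an explicit User-Agent instead of urllib's default."""
    # 'SimpleWikiCrawler/0.1' is a made-up identifier for illustration only.
    req = urllib2.Request(url, headers={'User-Agent': 'SimpleWikiCrawler/0.1'})
    return urllib2.urlopen(req).read()

If this were the cause, getLinks() below could call fetch() instead of urllib.urlopen().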
# linkparser.py
from HTMLParser import HTMLParser

class LinkParser(HTMLParser):
    """Collects the href value of every <a> tag fed to it in self.links."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
    #end __init__()

    def reset(self):
        HTMLParser.reset(self)
        self.links = []
    #end reset()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attr in attrs:
                if attr[0] == 'href':
                    self.links.append(attr[1])
    #end handle_starttag()
# Test script for LinkParser and WikiCrawler.
import linkparser
import urllib
from wikicrawler import WikiCrawler

# Sanity check: pull the links out of a page that is known to render fully.
lp = linkparser.LinkParser()
f = urllib.urlopen('http://docs.python.org/library/urllib.html')
lp.feed(f.read())
print(lp.links)

print('Testing wikicrawler...')
print('')

search = 'disambiguation'
filterRegex = 'wikimediafoundation.org|wikipedia.org'

print("Searching for '" + search + "' filtering by '" + filterRegex + "'")
c = WikiCrawler('http://en.wikipedia.org/wiki/United_Nations', filterRegex)
c.crawlTo(search)
# wikicrawler.py
from linkparser import LinkParser
import urllib
import re

class WikiCrawler:
    """Breadth-first crawler that follows links from a start URL until one
    matching a target pattern is found."""

    def __init__(self, url, filterRegex=''):
        self.startUrl = url
        self.filterRegex = re.compile(filterRegex)
    #end __init__()

    def crawlTo(self, regex):
        """Crawls breadth-first from startUrl and returns the number of pages
        processed when the first URL matching regex is found, or None if the
        queue is exhausted without a match."""
        curUrl = self.startUrl
        queue = [curUrl]
        visitedPages = [curUrl]
        counter = 0
        matcher = re.compile(regex)
        while len(queue) > 0:
            counter += 1
            curUrl = queue.pop(0)
            print("Processing '" + self.getPlainUrl(curUrl) + "'...")
            if matcher.search(self.getPlainUrl(curUrl)) is not None:
                return counter
            # add the links from the current node onto the queue
            links = self.getLinks(curUrl)
            for link in links:
                if self.filterUrl(link) and link not in visitedPages:
                    queue.append(link)
                    visitedPages.append(link)
        #end while
        return None
    #end crawlTo()

    def filterUrl(self, url):
        """Returns True if the url should be processed."""
        allow = True
        plainUrl = self.getPlainUrl(url)
        if not url.startswith("http://"):
            allow = False
        if self.filterRegex.search(plainUrl) is None:
            allow = False
        return allow
    #end filterUrl()

    def getLinks(self, url):
        """Fetches url and returns the hrefs found in it (empty on any error)."""
        parser = LinkParser()
        if url is not None and len(url) > 0:
            try:
                f = urllib.urlopen(url)
                parser.feed(f.read())
            except Exception:
                pass
        #end if
        return parser.links
    #end getLinks()

    def getPlainUrl(self, url):
        """Returns url with any #fragment or ? GET arguments removed"""
        plainUrl = url.partition('#')[0]
        plainUrl = plainUrl.partition('?')[0]
        plainUrl = urllib.unquote(plainUrl)
        return plainUrl
    #end getPlainUrl()
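A separate limitation worth noting: links in Wikipedia article HTML are typically relative (e.g. '/wiki/United_Nations_Security_Council'), and filterUrl() drops anything that does not start with 'http://', so even a fully rendered page would leave the queue empty. A sketch of one possible fix, under the assumption that relative links are part of the problem, using the standard urlparse.urljoin to resolve each href against the page it came from before filtering; resolveLinks() is a hypothetical helper, not part of the gist:

import urlparse  # Python 2 module; urllib.parse in Python 3

def resolveLinks(baseUrl, links):
    """Turns relative hrefs into absolute URLs so filterUrl() can keep them."""
    return [urlparse.urljoin(baseUrl, link) for link in links]

# Hypothetical use inside crawlTo(), replacing the getLinks() call:
#     links = resolveLinks(curUrl, self.getLinks(curUrl))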