
@csytan
Created May 11, 2009 06:37
crawler.py
import urllib2
from urlparse import urljoin
import time
import BeautifulSoup

class Crawler:
    def __init__(self, parser):
        self.parser = parser
        self.queue = set()
        self.visited = set()

    def crawl(self, link, delay=1):
        if link:
            self.queue.add(link)
        # Breadth-first crawl: visit the whole frontier, then build the next one
        while self.queue:
            next_queue = set()
            for link in self.queue:
                try:
                    print "Visiting: " + link
                    response = urllib2.urlopen(link)
                except urllib2.URLError:
                    print "Could not open page"
                    continue
                html = unicode(response.read(), errors='ignore')
                soup = BeautifulSoup.BeautifulSoup(html)
                self.visited.add(link)
                # Let the parser extract data and suggest extra links to crawl
                new_links = self.parser.parse_page(response.geturl(), soup)
                if new_links:
                    for new_link in new_links:
                        if new_link not in self.visited:
                            next_queue.add(new_link)
                # Queue every on-page anchor that the parser accepts
                for page_link in soup('a'):
                    if 'href' not in dict(page_link.attrs):
                        continue
                    url = urljoin(link, page_link['href'])
                    url = url.split('#')[0]  # drop the fragment
                    if url.startswith('http'):
                        if self.parser.handle_link(url) and url not in self.visited:
                            next_queue.add(url)
            self.queue = next_queue
            time.sleep(delay)  # be polite between rounds

class VideoParser:
    def __init__(self):
        self.videos = {}

    def handle_link(self, url):
        """Return True if the crawler should follow this URL."""
        return False

    def parse_page(self, url, soup):
        """Parses a page. Optionally returns a list of links to be added to the queue."""

if __name__ == '__main__':
    parser = VideoParser()
    c = Crawler(parser)
    try:
        c.crawl('http://google.com')
    except KeyboardInterrupt:
        pass
    print parser.videos
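
The VideoParser above is only a stub: handle_link always returns False, so the crawl never moves past the seed page. Below is a minimal sketch of a concrete parser, assuming this file is saved as crawler.py; the youtube.com scope check and the <title> scraping are illustrative assumptions, not part of the original gist.

from crawler import Crawler


class YouTubeParser:
    """Illustrative parser: records video pages and stays on youtube.com."""
    def __init__(self):
        self.videos = {}

    def handle_link(self, url):
        # Hypothetical scope rule: only follow links within youtube.com
        return 'youtube.com' in url

    def parse_page(self, url, soup):
        # Record the page title for anything that looks like a video page
        if '/watch' in url:
            title = soup.find('title')
            self.videos[url] = title.string if title else None
        # Returning None adds no extra links beyond the page's own anchors


if __name__ == '__main__':
    parser = YouTubeParser()
    try:
        Crawler(parser).crawl('http://www.youtube.com')
    except KeyboardInterrupt:
        pass
    print parser.videos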