crawler.py
import urllib2
from urlparse import urljoin
import time

import BeautifulSoup


class Crawler:
    def __init__(self, parser):
        self.parser = parser
        self.queue = set()
        self.visited = set()

    def crawl(self, link, delay=1):
        if link:
            self.queue.add(link)
        # Breadth-first crawl; runs until interrupted
        while True:
            next_queue = set()
            for link in self.queue:
                try:
                    print "Visiting: " + link
                    response = urllib2.urlopen(link)
                except urllib2.URLError:
                    print "Could not open page"
                    continue
                # Decode leniently; a badly encoded page shouldn't kill the crawl
                string = unicode(response.read(), errors='ignore')
                soup = BeautifulSoup.BeautifulSoup(string)
                self.visited.add(link)
                # Use the final URL (after redirects) as the base for relative links
                base_url = response.geturl()
                # The parser may return extra links to enqueue
                new_links = self.parser.parse_page(base_url, soup)
                if new_links:
                    for new_link in new_links:
                        if new_link not in self.visited:
                            next_queue.add(new_link)
                # Follow every <a href> on the page
                for page_link in soup('a'):
                    if 'href' not in dict(page_link.attrs):
                        continue
                    url = urljoin(base_url, page_link['href'])
                    url = url.split('#')[0]  # strip fragment identifiers
                    if url.startswith('http'):
                        if self.parser.handle_link(url) and url not in self.visited:
                            next_queue.add(url)
            self.queue = next_queue
            time.sleep(delay)


class VideoParser:
    def __init__(self):
        self.videos = {}

    def handle_link(self, url):
        """Return True if the crawler should enqueue this link."""
        return False

    def parse_page(self, url, soup):
        """Parses a page. Optionally returns a list of links to be added to the queue."""


if __name__ == '__main__':
    parser = VideoParser()
    c = Crawler(parser)
    try:
        c.crawl('http://google.com')
    except KeyboardInterrupt:
        print parser.videos
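
The gist leaves VideoParser as a stub, so here is a minimal sketch of how its two hooks might be filled in, building on the classes above. The example.com domain filter and the .mp4/.flv extensions are illustrative assumptions, not part of the original gist.

# Hypothetical subclass of the stub above. handle_link gates which URLs
# the crawler enqueues; parse_page records data from fetched pages.
class ExampleVideoParser(VideoParser):
    def handle_link(self, url):
        # Assumption: restrict the crawl to a single made-up domain
        return url.startswith('http://example.com/')

    def parse_page(self, url, soup):
        # Assumption: video pages link directly to .mp4 or .flv files
        for a in soup('a'):
            href = dict(a.attrs).get('href', '')
            if href.endswith('.mp4') or href.endswith('.flv'):
                self.videos[urljoin(url, href)] = a.string
        # Returning nothing (None) adds no extra links; the crawler
        # still follows the page's <a> tags on its own

# Usage: Crawler(ExampleVideoParser()).crawl('http://example.com/')

Splitting the policy this way keeps Crawler generic: handle_link is the natural place for domain or URL-pattern restrictions, while parse_page only ever sees pages that were actually fetched.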