|
# -*- coding: utf-8 -*-
# filename: crawler.py

import sqlite3
import urllib2
from HTMLParser import HTMLParser
from urlparse import urlparse

|
class HREFParser(HTMLParser):
    """
    Parser that extracts hrefs from anchor tags.
    """
    def __init__(self):
        HTMLParser.__init__(self)
        # keep the set on the instance: a class-level attribute would be
        # shared by every parser, so links would leak between pages
        self.hrefs = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            dict_attrs = dict(attrs)
            if dict_attrs.get('href'):
                self.hrefs.add(dict_attrs['href'])

|
def get_local_links(html, domain):
    """
    Read through HTML content and return the set of links
    internal to the given domain.
    """
    hrefs = set()
    parser = HREFParser()
    parser.feed(html)
    for href in parser.hrefs:
        u_parse = urlparse(href)
        if href.startswith('/'):
            # purposefully keeping the path only: no query, no fragment
            hrefs.add(u_parse.path)
        else:
            # only keep the local urls
            if u_parse.netloc == domain:
                hrefs.add(u_parse.path)
    return hrefs

|
class CrawlerCache(object):
    """
    Crawler data caching per relative URL and domain, backed by SQLite.
    """
    def __init__(self, db_file):
        self.conn = sqlite3.connect(db_file)
        c = self.conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS sites
            (domain text, url text, content text)''')
        self.conn.commit()
        self.cursor = self.conn.cursor()

    def set(self, domain, url, data):
        """
        Store the content for a given domain and relative url.
        """
        self.cursor.execute("INSERT INTO sites VALUES (?,?,?)",
            (domain, url, data))
        self.conn.commit()

    def get(self, domain, url):
        """
        Return the content for a given domain and relative url,
        or None if it has not been cached yet.
        """
        self.cursor.execute("SELECT content FROM sites WHERE domain=? and url=?",
            (domain, url))
        row = self.cursor.fetchone()
        if row:
            return row[0]

    def get_urls(self, domain):
        """
        Return all the URLs cached for a domain.
        """
        self.cursor.execute("SELECT url FROM sites WHERE domain=?", (domain,))
        # fetchall rather than fetchone/yield, so the cursor is free
        # again as soon as the call returns
        return [row[0] for row in self.cursor.fetchall()]

|
class Crawler(object):
    def __init__(self, cache=None, depth=2):
        """
        depth: how many times it will bounce from the starting page (optional)
        cache: a basic cache controller (optional)
        """
        self.depth = depth
        self.content = {}
        self.cache = cache

    def crawl(self, url, no_cache=None):
        """
        url: where we start crawling; should be a complete URL like
        'http://www.intel.com/news/'
        no_cache: function returning True if the url should be refreshed
        """
        u_parse = urlparse(url)
        self.domain = u_parse.netloc
        self.content[self.domain] = {}
        self.scheme = u_parse.scheme
        self.no_cache = no_cache
        self._crawl([u_parse.path], self.depth)

    def set(self, url, html):
        self.content[self.domain][url] = html
        if self.is_cacheable(url):
            self.cache.set(self.domain, url, html)

    def get(self, url):
        page = None
        if self.is_cacheable(url):
            page = self.cache.get(self.domain, url)
        if page is None:
            page = self.curl(url)
        else:
            print "cached url... [%s] %s" % (self.domain, url)
        return page

    def is_cacheable(self, url):
        # caching only kicks in when both a cache object and a no_cache
        # callback were provided, and the callback lets this url be cached
        return self.cache and self.no_cache \
            and not self.no_cache(url)

    def _crawl(self, urls, max_depth):
        n_urls = set()
        if max_depth:
            for url in urls:
                # do not crawl the same page twice
                if url not in self.content[self.domain]:
                    html = self.get(url)
                    self.set(url, html)
                    n_urls = n_urls.union(get_local_links(html, self.domain))
            self._crawl(n_urls, max_depth - 1)

    def curl(self, url):
        """
        Return the content at url.
        Return an empty string if the response raises an HTTPError
        (not found, 500, ...).
        """
        try:
            print "retrieving url... [%s] %s" % (self.domain, url)
            req = urllib2.Request('%s://%s%s' % (self.scheme, self.domain, url))
            response = urllib2.urlopen(req)
            return response.read().decode('ascii', 'ignore')
        except urllib2.HTTPError, e:
            print "error [%s] %s: %s" % (self.domain, url, e)
            return ''
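
For reference, here is one way to drive the classes above. This is only a sketch, not part of crawler.py: the file name run.py, the database name crawler.db, the depth and the start URL are all placeholders. The no_cache callback forces the root page '/' to always be re-fetched, while every other page may be served from the SQLite cache.

# filename: run.py (hypothetical driver, not part of the original crawler.py)
import re

from crawler import Crawler, CrawlerCache

if __name__ == '__main__':
    # cache pages in a local SQLite file so a re-run does not re-download them
    cache = CrawlerCache('crawler.db')
    crawler = Crawler(cache, depth=2)
    # always refresh the root page; everything else may come from the cache
    root_re = re.compile('^/$').match
    crawler.crawl('http://example.com/', no_cache=root_re)
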
Sorry to disturb you. I have just started learning to write crawlers in Python and imitated your code for practice. Can you give me some tips on how to output the HTML content of the crawled URLs, or store it in a document? Thanks so much!
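
Once crawl() has finished, the HTML lives in two places: in memory in crawler.content[domain] (a dict mapping each relative URL to its HTML), and, if both a CrawlerCache and a no_cache callback were passed in (see is_cacheable above), in the SQLite database. Below is a sketch of reading the pages back from the cache and writing each one to its own .html file; the domain, the database name crawler.db and the output directory pages/ are placeholders to adapt to your own run.

# filename: dump_pages.py (hypothetical, assumes crawler.py is importable
# and crawler.db was produced by a previous crawl)
import os

from crawler import CrawlerCache

if __name__ == '__main__':
    domain = 'example.com'          # placeholder: the domain you crawled
    cache = CrawlerCache('crawler.db')

    out_dir = 'pages'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for url in cache.get_urls(domain):
        html = cache.get(domain, url)
        # turn the relative URL into a file name,
        # e.g. '/news/politics' -> 'news_politics.html', '/' -> 'index.html'
        name = url.strip('/').replace('/', '_') or 'index'
        path = os.path.join(out_dir, name + '.html')
        with open(path, 'w') as f:
            f.write(html.encode('utf-8'))
        print "saved %s -> %s" % (url, path)
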