"""Crawler and Searcher based on Segaran, Programming Collective Intelligence,
O'Reilly, 2007, Chapter 4."""
__author__ = 'Juerg Gutknecht <juerg.gutknecht@students.ffhs.ch>'

import re
import urllib2

from BeautifulSoup import BeautifulSoup
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite

# Common words to ignore when indexing
stopwords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
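
# Overview of the SQLite schema used below (created in
# Crawler.create_index_tables):
#   urllist(url)                          -- one row per known URL
#   wordlist(word)                        -- one row per distinct word
#   wordlocation(urlid, wordid, location) -- word positions within each page
#   link(fromid, toid)                    -- directed links between URLs
#   linkwords(wordid, linkid)             -- words appearing in link anchor text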

class Crawler:
    # Initialize the crawler with the name of the database
    def __init__(self, db):
        self.connection = sqlite.connect(db)

    def __del__(self):
        self.connection.close()

    def commit(self):
        self.connection.commit()

    # Return the rowid of an entry, creating it first if it does not exist
    def get_entry_id(self, table, field, value, create_new=True):
        cur = self.connection.execute(
            "SELECT rowid FROM %s WHERE %s = '%s'" % (table, field, value)
        )
        res = cur.fetchone()
        if res is None:
            cur = self.connection.execute(
                "INSERT INTO %s (%s) VALUES ('%s')" % (table, field, value)
            )
            return cur.lastrowid
        else:
            return res[0]

    # Index a page: record every non-stopword together with its position
    def add_to_index(self, url, soup):
        if self.is_indexed(url):
            return
        print('Indexing ' + url)
        # Get the individual words
        text = self.get_text_only(soup)
        words = self.separate_words(text)
        # Get the URL id
        url_id = self.get_entry_id('urllist', 'url', url)
        # Link each word to this URL
        for i, word in enumerate(words):
            if word in stopwords:
                continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.connection.execute(
                "INSERT INTO wordlocation(urlid, wordid, location) VALUES (%d, %d, %d)" % (url_id, word_id, i)
            )

    # Extract the text from a page (no tags), recursing into child elements
    def get_text_only(self, soup):
        v = soup.string
        if v is None:
            result_text = ''
            for t in soup.contents:
                result_text += self.get_text_only(t) + '\n'
            return result_text
        else:
            return v.strip()

    # Split text on any run of non-word characters
    def separate_words(self, text):
        splitter = re.compile(r'\W+')
        return [s.lower() for s in splitter.split(text) if s != '']

    # A URL counts as indexed only if at least one word location is stored for it
    def is_indexed(self, url):
        u = self.connection.execute(
            "SELECT rowid FROM urllist WHERE url = '%s'" % url
        ).fetchone()
        if u is not None:
            # Check if it has actually been crawled
            v = self.connection.execute(
                "SELECT * FROM wordlocation WHERE urlid = %d" % u[0]
            ).fetchone()
            if v is not None:
                return True
        return False

    # Record a link between two pages along with the words in its anchor text
    def add_link_ref(self, url_from, url_to, link_text):
        words = self.separate_words(link_text)
        from_id = self.get_entry_id('urllist', 'url', url_from)
        to_id = self.get_entry_id('urllist', 'url', url_to)
        if from_id == to_id:
            return
        cur = self.connection.execute(
            "INSERT INTO link(fromid, toid) VALUES (%d, %d)" % (from_id, to_id)
        )
        link_id = cur.lastrowid
        for word in words:
            if word in stopwords:
                continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.connection.execute(
                "INSERT INTO linkwords(linkid, wordid) VALUES (%d, %d)" % (link_id, word_id)
            )

    # Breadth-first crawl starting from the given pages, down to the given depth
    def crawl(self, pages, depth=2):
        for i in range(depth):
            new_pages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except Exception:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                self.add_to_index(page, soup)
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        # Skip URLs containing quotes, which would break the SQL strings
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # Remove the fragment part
                        if url[0:4] == 'http' and not self.is_indexed(url):
                            new_pages.add(url)
                        link_text = self.get_text_only(link)
                        self.add_link_ref(page, url, link_text)
                self.commit()
            pages = new_pages

    # Iteratively compute PageRank: every page starts at 1.0 and is repeatedly
    # updated as PR(p) = 0.15 + 0.85 * sum(PR(q) / outlinks(q)) over all pages q
    # that link to p, until the scores settle (20 iterations is usually enough)
    def calculate_page_rank(self, iterations=20):
        # Clear out the current page rank tables
        self.connection.execute('DROP TABLE IF EXISTS pagerank')
        self.connection.execute('CREATE TABLE pagerank(urlid primary key, score)')
        # Initialize every URL with a PageRank of 1
        self.connection.execute(
            "INSERT INTO pagerank SELECT rowid, 1.0 FROM urllist"
        )
        self.commit()
        for i in range(iterations):
            print("Iteration %d" % i)
            for (url_id,) in self.connection.execute('SELECT rowid FROM urllist'):
                rank = 0.15
                # Loop through all the pages that link to this one
                for (linker,) in self.connection.execute(
                    "SELECT DISTINCT fromid FROM link WHERE toid=%d" % url_id
                ):
                    # Get the page rank of the linker
                    linking_rank = self.connection.execute(
                        "SELECT score FROM pagerank WHERE urlid=%d" % linker
                    ).fetchone()[0]
                    # Get the total number of links from the linker
                    linking_count = self.connection.execute(
                        "SELECT COUNT(*) FROM link WHERE fromid=%d" % linker
                    ).fetchone()[0]
                    rank += 0.85 * (linking_rank / linking_count)
                self.connection.execute(
                    "UPDATE pagerank SET score=%f WHERE urlid=%d" % (rank, url_id)
                )
            self.commit()

    # Create the database tables and indices
    def create_index_tables(self):
        self.connection.execute('CREATE TABLE urllist(url)')
        self.connection.execute('CREATE TABLE wordlist(word)')
        self.connection.execute('CREATE TABLE wordlocation(urlid, wordid, location)')
        self.connection.execute('CREATE TABLE link(fromid INTEGER, toid INTEGER)')
        self.connection.execute('CREATE TABLE linkwords(wordid, linkid)')
        self.connection.execute('CREATE INDEX wordidx ON wordlist(word)')
        self.connection.execute('CREATE INDEX urlidx ON urllist(url)')
        self.connection.execute('CREATE INDEX wordurlidx ON wordlocation(wordid)')
        self.connection.execute('CREATE INDEX urltoidx ON link(toid)')
        self.connection.execute('CREATE INDEX urlfromidx ON link(fromid)')
        self.commit()
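
# A minimal usage sketch (not part of the original gist; the database filename
# and seed URL below are placeholders):
#
#   crawler = Crawler('searchindex.db')
#   crawler.create_index_tables()  # first run only
#   crawler.crawl(['http://example.com/'], depth=2)
#   crawler.calculate_page_rank()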

class Searcher:
    def __init__(self, db):
        self.connection = sqlite.connect(db)

    def __del__(self):
        self.connection.close()

    # Build and run a query that finds URLs containing all the query words,
    # returning one row per combination of word locations
    def get_match_rows(self, query):
        # Strings to build the query
        field_list = 'w0.urlid'
        table_list = ''
        clause_list = ''
        word_ids = []
        # Split the words by spaces
        words = query.split(' ')
        table_number = 0
        for word in words:
            # Get the word id
            word_row = self.connection.execute(
                "SELECT rowid FROM wordlist WHERE word = '%s'" % word
            ).fetchone()
            if word_row is not None:
                word_id = word_row[0]
                word_ids.append(word_id)
                if table_number > 0:
                    table_list += ','
                    clause_list += ' AND '
                    clause_list += 'w%d.urlid = w%d.urlid AND ' % (table_number - 1, table_number)
                field_list += ', w%d.location' % table_number
                table_list += 'wordlocation w%d' % table_number
                clause_list += 'w%d.wordid = %d' % (table_number, word_id)
                table_number += 1
        # No known words means no matches (and an invalid SQL query otherwise)
        if not word_ids:
            return [], []
        # Create the query
        full_query = 'SELECT %s FROM %s WHERE %s' % (field_list, table_list, clause_list)
        cur = self.connection.execute(full_query)
        rows = [row for row in cur]
        return rows, word_ids

    # Combine the individual scoring functions into one weighted total per URL
    def get_scored_lists(self, rows, word_ids):
        total_scores = dict([(row[0], 0) for row in rows])
        weights = [
            (1.0, self.frequency_score(rows)),
            (1.0, self.location_score(rows)),
            (1.0, self.page_rank_score(rows)),
            (1.0, self.link_text_score(rows, word_ids))
        ]
        for (weight, scores) in weights:
            for url in total_scores:
                total_scores[url] += weight * scores[url]
        return total_scores

    def get_url_name(self, url_id):
        return self.connection.execute(
            "SELECT url FROM urllist WHERE rowid = %d" % url_id
        ).fetchone()[0]

    # Run a query and print the ten best-scoring URLs
    def query(self, query):
        rows, word_ids = self.get_match_rows(query)
        scores = self.get_scored_lists(rows, word_ids)
        ranked_scores = sorted(
            [(score, url) for (url, score) in scores.items()], reverse=True
        )
        for (score, url_id) in ranked_scores[0:10]:
            print('%f\t%s' % (score, self.get_url_name(url_id)))

    # Scale scores to the range 0..1; if smaller raw values are better,
    # invert them so that 1 is still the best score
    def normalize_scores(self, scores, small_is_better=0):
        v_small = 0.00001  # Avoid division by zero
        if small_is_better:
            min_score = min(scores.values())
            return dict([(u, float(min_score) / max(v_small, s)) for (u, s) in scores.items()])
        else:
            max_score = max(scores.values())
            if max_score == 0:
                max_score = v_small
            return dict([(u, float(c) / max_score) for (u, c) in scores.items()])

    # Score by how often the query words occur on the page
    def frequency_score(self, rows):
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalize_scores(counts)

    # Score by how early in the document the query words appear
    def location_score(self, rows):
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normalize_scores(locations, 1)

    # Score by how close together the query words appear on the page
    def distance_score(self, rows):
        # With only one word, every result gets the same score
        if len(rows[0]) <= 2:
            return dict([(row[0], 1.0) for row in rows])
        min_distance = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
            if dist < min_distance[row[0]]:
                min_distance[row[0]] = dist
        return self.normalize_scores(min_distance, 1)

    # Score by the precomputed PageRank of each URL
    def page_rank_score(self, rows):
        ranks = dict([(row[0], self.connection.execute(
            "SELECT score FROM pagerank WHERE urlid=%d" % row[0]
        ).fetchone()[0]) for row in rows])
        max_rank = max(ranks.values())
        return dict([(u, float(r) / max_rank) for (u, r) in ranks.items()])

    # Score by the PageRank of the pages whose link text contains the query words
    def link_text_score(self, rows, word_ids):
        link_scores = dict([(row[0], 0) for row in rows])
        for word_id in word_ids:
            cur = self.connection.execute(
                "SELECT link.fromid, link.toid FROM linkwords,link"
                + " WHERE wordid=" + str(word_id)
                + " AND linkwords.linkid=link.rowid"
            )
            for (from_id, to_id) in cur:
                if to_id in link_scores:
                    rank = self.connection.execute(
                        "SELECT score FROM pagerank WHERE urlid=%d" % from_id
                    ).fetchone()[0]
                    link_scores[to_id] += rank
        max_score = max(link_scores.values())
        if max_score == 0:
            max_score = 0.00001  # Avoid division by zero when no links match
        return dict([(u, float(s) / max_score) for (u, s) in link_scores.items()])
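
# A minimal usage sketch (not part of the original gist; the database filename
# and query string below are placeholders):
#
#   searcher = Searcher('searchindex.db')
#   searcher.query('python search')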