"""Crawler and Searcher based on Segaran, Programming Collective Intelligence,
O'Reilly, 2007, Chapter 4.
"""
__author__ = 'Juerg Gutknecht <juerg.gutknecht@students.ffhs.ch>'

import re
import urllib2

from BeautifulSoup import BeautifulSoup
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite

# Common words that are ignored when indexing
stopwords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
class Crawler:
    # Initialize the crawler with the name of the database
    def __init__(self, db):
        self.connection = sqlite.connect(db)

    def __del__(self):
        self.connection.close()

    def commit(self):
        self.connection.commit()
    def get_entry_id(self, table, field, value, create_new=True):
        # Return the rowid for value in table, inserting a new row if necessary
        cur = self.connection.execute(
            "SELECT rowid FROM %s WHERE %s = '%s'" % (table, field, value)
        )
        res = cur.fetchone()
        if res is None:
            cur = self.connection.execute(
                "INSERT INTO %s (%s) VALUES ('%s')" % (table, field, value)
            )
            return cur.lastrowid
        else:
            return res[0]
    def add_to_index(self, url, soup):
        if self.is_indexed(url):
            return
        print('Indexing ' + url)
        # Get the individual words
        text = self.get_text_only(soup)
        words = self.separate_words(text)
        # Get the URL id
        url_id = self.get_entry_id('urllist', 'url', url)
        # Link each word to this URL
        for i in range(len(words)):
            word = words[i]
            if word in stopwords:
                continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.connection.execute(
                "INSERT INTO wordlocation(urlid, wordid, location) VALUES (%d, %d, %d)" % (url_id, word_id, i)
            )
    def get_text_only(self, soup):
        # Recursively extract the text from a page, stripped of tags
        v = soup.string
        if v is None:
            c = soup.contents
            result_text = ''
            for t in c:
                subtext = self.get_text_only(t)
                result_text += subtext + '\n'
            return result_text
        else:
            return v.strip()
    def separate_words(self, text):
        # Split the text on any run of non-word characters
        splitter = re.compile(r'\W*')
        return [s.lower() for s in splitter.split(text) if s != '']
    def is_indexed(self, url):
        u = self.connection.execute(
            "SELECT rowid FROM urllist WHERE url = '%s'" % url
        ).fetchone()
        if u is not None:
            # Check if it has actually been crawled
            v = self.connection.execute(
                "SELECT * FROM wordlocation WHERE urlid = %d" % u[0]
            ).fetchone()
            if v is not None:
                return True
        return False
    def add_link_ref(self, url_from, url_to, link_text):
        words = self.separate_words(link_text)
        from_id = self.get_entry_id('urllist', 'url', url_from)
        to_id = self.get_entry_id('urllist', 'url', url_to)
        if from_id == to_id:
            return
        cur = self.connection.execute(
            "INSERT INTO link(fromid, toid) VALUES (%d, %d)" % (from_id, to_id)
        )
        link_id = cur.lastrowid
        for word in words:
            if word in stopwords:
                continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.connection.execute(
                "INSERT INTO linkwords(linkid, wordid) VALUES (%d, %d)" % (link_id, word_id)
            )
    def crawl(self, pages, depth=2):
        # Breadth-first crawl starting from pages, down to the given depth
        for i in range(depth):
            new_pages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except Exception:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                self.add_to_index(page, soup)
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # Remove the fragment portion
                        if url[0:4] == 'http' and not self.is_indexed(url):
                            new_pages.add(url)
                        link_text = self.get_text_only(link)
                        self.add_link_ref(page, url, link_text)
                self.commit()
            pages = new_pages
    def calculate_page_rank(self, iterations=20):
        # PageRank: score(u) = 0.15 + 0.85 * sum(score(v) / outlinks(v))
        # over all pages v that link to u, iterated until it converges
        # Clear out the current page rank tables
        self.connection.execute('DROP TABLE IF EXISTS pagerank')
        self.connection.execute('CREATE TABLE pagerank(urlid PRIMARY KEY, score)')
        # Initialize every URL with a PageRank of 1
        self.connection.execute(
            "INSERT INTO pagerank SELECT rowid, 1.0 FROM urllist"
        )
        self.commit()
        for i in range(iterations):
            print("Iteration %d" % i)
            for (url_id,) in self.connection.execute('SELECT rowid FROM urllist'):
                rank = 0.15
                # Loop through all the pages that link to this one
                for (linker,) in self.connection.execute(
                    "SELECT DISTINCT fromid FROM link WHERE toid=%d" % url_id
                ):
                    # Get the page rank of the linker
                    linking_rank = self.connection.execute(
                        "SELECT score FROM pagerank WHERE urlid=%d" % linker
                    ).fetchone()[0]
                    # Get the total number of links from the linker
                    linking_count = self.connection.execute(
                        "SELECT COUNT(*) FROM link WHERE fromid=%d" % linker
                    ).fetchone()[0]
                    rank += 0.85 * (linking_rank / linking_count)
                self.connection.execute(
                    "UPDATE pagerank SET score=%f WHERE urlid=%d" % (rank, url_id)
                )
            self.commit()
    def create_index_tables(self):
        self.connection.execute('CREATE TABLE urllist(url)')
        self.connection.execute('CREATE TABLE wordlist(word)')
        self.connection.execute('CREATE TABLE wordlocation(urlid, wordid, location)')
        self.connection.execute('CREATE TABLE link(fromid INTEGER, toid INTEGER)')
        self.connection.execute('CREATE TABLE linkwords(wordid, linkid)')
        self.connection.execute('CREATE INDEX wordidx ON wordlist(word)')
        self.connection.execute('CREATE INDEX urlidx ON urllist(url)')
        self.connection.execute('CREATE INDEX wordurlidx ON wordlocation(wordid)')
        self.connection.execute('CREATE INDEX urltoidx ON link(toid)')
        self.connection.execute('CREATE INDEX urlfromidx ON link(fromid)')
        self.commit()
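
# Example usage for the crawler side (a sketch; 'searchindex.db' and the seed
# URL are placeholders, not part of the original gist):
#
#   crawler = Crawler('searchindex.db')
#   crawler.create_index_tables()   # Only needed once, for a fresh database
#   crawler.crawl(['http://example.com/'])
#   crawler.calculate_page_rank()
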
class Searcher:
    def __init__(self, db):
        self.connection = sqlite.connect(db)

    def __del__(self):
        self.connection.close()
    def get_match_rows(self, query):
        # Strings to build the query
        field_list = 'w0.urlid'
        table_list = ''
        clause_list = ''
        word_ids = []
        # Split the words by spaces
        words = query.split(' ')
        table_number = 0
        for word in words:
            # Get the word id
            word_row = self.connection.execute(
                "SELECT rowid FROM wordlist WHERE word = '%s'" % word
            ).fetchone()
            if word_row is not None:
                word_id = word_row[0]
                word_ids.append(word_id)
                if table_number > 0:
                    table_list += ','
                    clause_list += ' AND '
                    clause_list += 'w%d.urlid = w%d.urlid AND ' % (table_number - 1, table_number)
                field_list += ', w%d.location' % table_number
                table_list += 'wordlocation w%d' % table_number
                clause_list += 'w%d.wordid = %d' % (table_number, word_id)
                table_number += 1
        # Create the query from the separate parts
        full_query = 'SELECT %s FROM %s WHERE %s' % (field_list, table_list, clause_list)
        cur = self.connection.execute(full_query)
        rows = [row for row in cur]
        return rows, word_ids
    def get_scored_lists(self, rows, word_ids):
        total_scores = dict([(row[0], 0) for row in rows])
        weights = [
            (1.0, self.frequency_score(rows)),
            (1.0, self.location_score(rows)),
            (1.0, self.page_rank_score(rows)),
            (1.0, self.link_text_score(rows, word_ids))
        ]
        for (weight, scores) in weights:
            for url in total_scores:
                total_scores[url] += weight * scores[url]
        return total_scores
    def get_url_name(self, id):
        return self.connection.execute(
            "SELECT url FROM urllist WHERE rowid = %d" % id
        ).fetchone()[0]

    def query(self, query):
        rows, word_ids = self.get_match_rows(query)
        scores = self.get_scored_lists(rows, word_ids)
        ranked_scores = sorted(
            [(score, url) for (url, score) in scores.items()], reverse=True
        )
        for (score, url_id) in ranked_scores[0:10]:
            print('%f\t%s' % (score, self.get_url_name(url_id)))
    def normalize_scores(self, scores, small_is_better=0):
        # Normalize scores to the range 0..1, where 1 is always the best score
        v_small = 0.00001  # Avoid division by zero
        if small_is_better:
            min_score = min(scores.values())
            return dict([(u, float(min_score) / max(v_small, l)) for (u, l) in scores.items()])
        else:
            max_score = max(scores.values())
            if max_score == 0:
                max_score = v_small
            return dict([(u, float(c) / max_score) for (u, c) in scores.items()])
    def frequency_score(self, rows):
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalize_scores(counts)

    def location_score(self, rows):
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normalize_scores(locations, 1)
    def distance_score(self, rows):
        if len(rows[0]) <= 2:
            return dict([(row[0], 1.0) for row in rows])
        min_distance = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
            if dist < min_distance[row[0]]:
                min_distance[row[0]] = dist
        return self.normalize_scores(min_distance, 1)
    def page_rank_score(self, rows):
        ranks = dict([(row[0], self.connection.execute(
            "SELECT score FROM pagerank WHERE urlid=%d" % row[0]
        ).fetchone()[0]) for row in rows])
        max_rank = max(ranks.values())
        normalized_scores = dict([(u, float(l) / max_rank) for (u, l) in ranks.items()])
        return normalized_scores
    def link_text_score(self, rows, word_ids):
        # Score pages by the PageRank of the pages linking to them with the query words
        link_scores = dict([(row[0], 0) for row in rows])
        for word_id in word_ids:
            cur = self.connection.execute(
                "SELECT link.fromid, link.toid FROM linkwords,link"
                + " WHERE wordid=" + str(word_id)
                + " AND linkwords.linkid=link.rowid"
            )
            for (from_id, to_id) in cur:
                if to_id in link_scores:
                    rank = self.connection.execute(
                        "SELECT score FROM pagerank WHERE urlid=%d" % from_id
                    ).fetchone()[0]
                    link_scores[to_id] += rank
        max_score = max(link_scores.values())
        if max_score == 0:
            max_score = 0.00001  # Avoid division by zero when no links match
        normalized_scores = dict([
            (u, float(l) / max_score) for (u, l) in link_scores.items()
        ])
        return normalized_scores
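
# Example usage for the searcher side (a sketch; assumes a database built by
# the Crawler above, with an arbitrary two-word query):
#
#   searcher = Searcher('searchindex.db')
#   searcher.query('functional programming')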