"""Crawler and Searcher based on Segaran, Programming Collective Intelligence,
O'Reilly, 2007, Chapter 4.
"""
__author__ = 'Juerg Gutknecht <juerg.gutknecht@students.ffhs.ch>'

import re
import urllib2

from BeautifulSoup import BeautifulSoup
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite

# Common words that are ignored when indexing
stopwords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
class Crawler:
    # Initialize the crawler with the name of the database
    def __init__(self, db):
        self.connection = sqlite.connect(db)

    def __del__(self):
        self.connection.close()

    def commit(self):
        self.connection.commit()
    def get_entry_id(self, table, field, value, create_new=True):
        # Return the rowid for value in table, inserting a new row if necessary
        cur = self.connection.execute(
            "SELECT rowid FROM %s WHERE %s = '%s'" % (table, field, value)
        )
        res = cur.fetchone()
        if res is None:
            cur = self.connection.execute(
                "INSERT INTO %s (%s) VALUES ('%s')" % (table, field, value)
            )
            return cur.lastrowid
        else:
            return res[0]
    def add_to_index(self, url, soup):
        if self.is_indexed(url):
            return
        print('Indexing ' + url)
        # Get the individual words
        text = self.get_text_only(soup)
        words = self.separate_words(text)
        # Get the URL id
        url_id = self.get_entry_id('urllist', 'url', url)
        # Link each word to this URL
        for i in range(len(words)):
            word = words[i]
            if word in stopwords:
                continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.connection.execute(
                "INSERT INTO wordlocation(urlid, wordid, location) VALUES (%d, %d, %d)" % (url_id, word_id, i)
            )
    def get_text_only(self, soup):
        # Recursively extract the text from a page, stripped of tags
        v = soup.string
        if v is None:
            c = soup.contents
            result_text = ''
            for t in c:
                subtext = self.get_text_only(t)
                result_text += subtext + '\n'
            return result_text
        else:
            return v.strip()
    def separate_words(self, text):
        # Split the text on any run of non-word characters
        splitter = re.compile(r'\W*')
        return [s.lower() for s in splitter.split(text) if s != '']
    def is_indexed(self, url):
        u = self.connection.execute(
            "SELECT rowid FROM urllist WHERE url = '%s'" % url
        ).fetchone()
        if u is not None:
            # Check if it has actually been crawled
            v = self.connection.execute(
                "SELECT * FROM wordlocation WHERE urlid = %d" % u[0]
            ).fetchone()
            if v is not None:
                return True
        return False
    def add_link_ref(self, url_from, url_to, link_text):
        words = self.separate_words(link_text)
        from_id = self.get_entry_id('urllist', 'url', url_from)
        to_id = self.get_entry_id('urllist', 'url', url_to)
        if from_id == to_id:
            return
        cur = self.connection.execute(
            "INSERT INTO link(fromid, toid) VALUES (%d, %d)" % (from_id, to_id)
        )
        link_id = cur.lastrowid
        for word in words:
            if word in stopwords:
                continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.connection.execute(
                "INSERT INTO linkwords(linkid, wordid) VALUES (%d, %d)" % (link_id, word_id)
            )
    def crawl(self, pages, depth=2):
        # Breadth-first crawl starting from pages, down to the given depth
        for i in range(depth):
            new_pages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except Exception:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                self.add_to_index(page, soup)
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # Remove the fragment portion
                        if url[0:4] == 'http' and not self.is_indexed(url):
                            new_pages.add(url)
                        link_text = self.get_text_only(link)
                        self.add_link_ref(page, url, link_text)
                self.commit()
            pages = new_pages
    def calculate_page_rank(self, iterations=20):
        # PageRank: score(u) = 0.15 + 0.85 * sum(score(v) / outlinks(v))
        # over all pages v that link to u, iterated until it converges
        # Clear out the current page rank tables
        self.connection.execute('DROP TABLE IF EXISTS pagerank')
        self.connection.execute('CREATE TABLE pagerank(urlid PRIMARY KEY, score)')
        # Initialize every URL with a PageRank of 1
        self.connection.execute(
            "INSERT INTO pagerank SELECT rowid, 1.0 FROM urllist"
        )
        self.commit()
        for i in range(iterations):
            print("Iteration %d" % i)
            for (url_id,) in self.connection.execute('SELECT rowid FROM urllist'):
                rank = 0.15
                # Loop through all the pages that link to this one
                for (linker,) in self.connection.execute(
                    "SELECT DISTINCT fromid FROM link WHERE toid=%d" % url_id
                ):
                    # Get the page rank of the linker
                    linking_rank = self.connection.execute(
                        "SELECT score FROM pagerank WHERE urlid=%d" % linker
                    ).fetchone()[0]
                    # Get the total number of links from the linker
                    linking_count = self.connection.execute(
                        "SELECT COUNT(*) FROM link WHERE fromid=%d" % linker
                    ).fetchone()[0]
                    rank += 0.85 * (linking_rank / linking_count)
                self.connection.execute(
                    "UPDATE pagerank SET score=%f WHERE urlid=%d" % (rank, url_id)
                )
            self.commit()
    def create_index_tables(self):
        self.connection.execute('CREATE TABLE urllist(url)')
        self.connection.execute('CREATE TABLE wordlist(word)')
        self.connection.execute('CREATE TABLE wordlocation(urlid, wordid, location)')
        self.connection.execute('CREATE TABLE link(fromid INTEGER, toid INTEGER)')
        self.connection.execute('CREATE TABLE linkwords(wordid, linkid)')
        self.connection.execute('CREATE INDEX wordidx ON wordlist(word)')
        self.connection.execute('CREATE INDEX urlidx ON urllist(url)')
        self.connection.execute('CREATE INDEX wordurlidx ON wordlocation(wordid)')
        self.connection.execute('CREATE INDEX urltoidx ON link(toid)')
        self.connection.execute('CREATE INDEX urlfromidx ON link(fromid)')
        self.commit()
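
# Example usage for the crawler side (a sketch; 'searchindex.db' and the seed
# URL are placeholders, not part of the original gist):
#
#   crawler = Crawler('searchindex.db')
#   crawler.create_index_tables()   # Only needed once, for a fresh database
#   crawler.crawl(['http://example.com/'])
#   crawler.calculate_page_rank()
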
class Searcher:
    def __init__(self, db):
        self.connection = sqlite.connect(db)

    def __del__(self):
        self.connection.close()
    def get_match_rows(self, query):
        # Strings to build the query
        field_list = 'w0.urlid'
        table_list = ''
        clause_list = ''
        word_ids = []
        # Split the words by spaces
        words = query.split(' ')
        table_number = 0
        for word in words:
            # Get the word id
            word_row = self.connection.execute(
                "SELECT rowid FROM wordlist WHERE word = '%s'" % word
            ).fetchone()
            if word_row is not None:
                word_id = word_row[0]
                word_ids.append(word_id)
                if table_number > 0:
                    table_list += ','
                    clause_list += ' AND '
                    clause_list += 'w%d.urlid = w%d.urlid AND ' % (table_number - 1, table_number)
                field_list += ', w%d.location' % table_number
                table_list += 'wordlocation w%d' % table_number
                clause_list += 'w%d.wordid = %d' % (table_number, word_id)
                table_number += 1
        # Create the query from the separate parts
        full_query = 'SELECT %s FROM %s WHERE %s' % (field_list, table_list, clause_list)
        cur = self.connection.execute(full_query)
        rows = [row for row in cur]
        return rows, word_ids
    def get_scored_lists(self, rows, word_ids):
        total_scores = dict([(row[0], 0) for row in rows])
        weights = [
            (1.0, self.frequency_score(rows)),
            (1.0, self.location_score(rows)),
            (1.0, self.page_rank_score(rows)),
            (1.0, self.link_text_score(rows, word_ids))
        ]
        for (weight, scores) in weights:
            for url in total_scores:
                total_scores[url] += weight * scores[url]
        return total_scores
    def get_url_name(self, id):
        return self.connection.execute(
            "SELECT url FROM urllist WHERE rowid = %d" % id
        ).fetchone()[0]

    def query(self, query):
        rows, word_ids = self.get_match_rows(query)
        scores = self.get_scored_lists(rows, word_ids)
        ranked_scores = sorted(
            [(score, url) for (url, score) in scores.items()], reverse=True
        )
        for (score, url_id) in ranked_scores[0:10]:
            print('%f\t%s' % (score, self.get_url_name(url_id)))
    def normalize_scores(self, scores, small_is_better=0):
        # Normalize scores to the range 0..1, where 1 is always the best score
        v_small = 0.00001  # Avoid division by zero
        if small_is_better:
            min_score = min(scores.values())
            return dict([(u, float(min_score) / max(v_small, l)) for (u, l) in scores.items()])
        else:
            max_score = max(scores.values())
            if max_score == 0:
                max_score = v_small
            return dict([(u, float(c) / max_score) for (u, c) in scores.items()])
    def frequency_score(self, rows):
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalize_scores(counts)

    def location_score(self, rows):
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normalize_scores(locations, 1)
    def distance_score(self, rows):
        if len(rows[0]) <= 2:
            return dict([(row[0], 1.0) for row in rows])
        min_distance = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
            if dist < min_distance[row[0]]:
                min_distance[row[0]] = dist
        return self.normalize_scores(min_distance, 1)
    def page_rank_score(self, rows):
        ranks = dict([(row[0], self.connection.execute(
            "SELECT score FROM pagerank WHERE urlid=%d" % row[0]
        ).fetchone()[0]) for row in rows])
        max_rank = max(ranks.values())
        normalized_scores = dict([(u, float(l) / max_rank) for (u, l) in ranks.items()])
        return normalized_scores
    def link_text_score(self, rows, word_ids):
        # Score pages by the PageRank of the pages linking to them with the query words
        link_scores = dict([(row[0], 0) for row in rows])
        for word_id in word_ids:
            cur = self.connection.execute(
                "SELECT link.fromid, link.toid FROM linkwords,link"
                + " WHERE wordid=" + str(word_id)
                + " AND linkwords.linkid=link.rowid"
            )
            for (from_id, to_id) in cur:
                if to_id in link_scores:
                    rank = self.connection.execute(
                        "SELECT score FROM pagerank WHERE urlid=%d" % from_id
                    ).fetchone()[0]
                    link_scores[to_id] += rank
        max_score = max(link_scores.values())
        if max_score == 0:
            max_score = 0.00001  # Avoid division by zero when no links match
        normalized_scores = dict([
            (u, float(l) / max_score) for (u, l) in link_scores.items()
        ])
        return normalized_scores
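
# Example usage for the searcher side (a sketch; assumes a database built by
# the Crawler above, with an arbitrary two-word query):
#
#   searcher = Searcher('searchindex.db')
#   searcher.query('functional programming')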