-
-
Save davidbgk/805600 to your computer and use it in GitHub Desktop.
Double metaphone
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# http://atomboy.isa-geek.com/plone/Members/acoil/programing/double-metaphone | |
from metaphone import dm as double_metaphone | |
# get the Redis connection | |
from jellybean.core import redis | |
import models | |
# Words which should not be indexed (compared against the lower-cased word)
STOP_WORDS = ("the", "of", "to", "and", "a", "in", "is", "it", "you", "that")

# Do not index any words shorter than this many characters
MIN_WORD_LENGTH = 3

# Characters treated as punctuation: each one is replaced with a space
# before the text is split into words.
# NOTE(review): the literal contains non-ASCII characters (the pound sign and
# an en dash); under Python 2 the file needs a source-encoding declaration
# for this line to compile -- confirm one exists at the top of the module.
PUNCTUATION_CHARS = ".,;:!?@£$%^&*()-–<>[]{}\\|/`~'\""

# Redis key of the set holding every metaphone present in a project
# (format with a dict containing an integer "project_id")
REDIS_KEY_METAPHONES = "project_id:%(project_id)d:fulltext_search:metaphones"

# Redis key of the set holding the IDs of the items which contain the given
# metaphone within the given project (format with "project_id" and "metaphone")
REDIS_KEY_METAPHONE = "project_id:%(project_id)d:fulltext_search:metaphone:%(metaphone)s"
class FullTextIndex(object):
    """Provide full-text indexing of project items using Redis sets.

    Text is split into words and each word is reduced to its double-metaphone
    code(s).  The item ID is then added to one Redis set per
    (project, metaphone) pair, and the metaphone is recorded in a per-project
    set so that the whole project index can be removed again by
    ``reindex_project``.
    """

    def __init__(self):
        # re.escape() keeps characters such as "]", "^", "-" and "\" literal
        # inside the character class.
        self.punctuation_regex = re.compile(r"[%s]" % re.escape(PUNCTUATION_CHARS))
        super(FullTextIndex, self).__init__()

    def get_words_from_text(self, text):
        """Extract the list of words to index from the given text.

        Punctuation characters are replaced with spaces and the result is
        split on whitespace.  Words shorter than ``MIN_WORD_LENGTH`` and words
        whose lower-cased form is in ``STOP_WORDS`` are dropped.  The original
        casing, order and duplicates of the remaining words are kept.

        Returns an empty list when ``text`` is empty or ``None``.
        """
        if not text:
            return []
        text = self.punctuation_regex.sub(" ", text)
        return [
            word for word in text.split()
            if len(word) >= MIN_WORD_LENGTH and word.lower() not in STOP_WORDS
        ]

    def index_item(self, item):
        """Extract content from the given item and add it to the index.

        The subject, body, milestone name, type name and tags of the item are
        indexed.  A missing (``None``) milestone or empty/``None`` tags are
        skipped instead of raising.
        """
        # TODO: Add item users to index
        milestone = item.milestone
        tags = item.tags
        fields = (
            item.subject,
            item.body,
            milestone.name if milestone is not None else None,
            item.type_name,
            " ".join(tags) if tags else None,
        )
        words = []
        for field in fields:
            words.extend(self.get_words_from_text(field))
        self._index_words(item, words)

    def index_item_content(self, item, content):
        """Index a specific bit of content against the given item."""
        self._index_words(item, self.get_words_from_text(content))

    def _index_words(self, item, words):
        """Link the item to the metaphone(s) of every word in ``words``."""
        for metaphone in self.get_metaphones(words):
            self._link_item_and_metaphone(item, metaphone)

    def _link_item_and_metaphone(self, item, metaphone):
        """Record in Redis that ``item`` contains ``metaphone``."""
        # Add the item to the metaphone key
        redis_key = REDIS_KEY_METAPHONE % {"project_id": item.project_id, "metaphone": metaphone}
        redis.sadd(redis_key, item.item_id)
        # Make sure we record that this project contains this metaphone
        redis_key = REDIS_KEY_METAPHONES % {"project_id": item.project_id}
        redis.sadd(redis_key, metaphone)

    def get_metaphones(self, words):
        """Get the set of metaphones for a given list of words.

        Both codes returned by ``double_metaphone`` are used for each word.
        Codes that are missing or empty after stripping whitespace are
        skipped, so a word without a usable code never produces an empty
        metaphone (and hence never an index key with an empty suffix).
        """
        metaphones = set()
        for word in words:
            # ``unicode`` is the Python 2 text type.
            # NOTE(review): for byte strings it decodes with the default
            # (ASCII) codec -- confirm callers pass unicode text.
            codes = double_metaphone(unicode(word))
            for code in (codes[0], codes[1]):
                code = code.strip() if code else ""
                if code:
                    metaphones.add(code)
        return metaphones

    def reindex_project(self, project_id):
        """Reindex an entire project, removing the existing index first.

        Deletes the project's set of known metaphones and every
        per-metaphone set listed in it, then indexes each item of the
        project again.  Always returns ``True``.
        """
        # Remove all the existing index data
        redis_key = REDIS_KEY_METAPHONES % {"project_id": project_id}
        project_metaphones = redis.smembers(redis_key) or []
        redis.delete(redis_key)
        for project_metaphone in project_metaphones:
            redis.delete(REDIS_KEY_METAPHONE % {"project_id": project_id, "metaphone": project_metaphone})
        # Now index each item
        project = models.Project(project_id)
        for item in project.items:
            self.index_item(item)
        return True
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment