Skip to content

Instantly share code, notes, and snippets.

@davidbgk
Forked from adamcharnock/gist:389875
Created February 1, 2011 08:46
Show Gist options
  • Save davidbgk/805600 to your computer and use it in GitHub Desktop.
Save davidbgk/805600 to your computer and use it in GitHub Desktop.
Double metaphone
import re
# http://atomboy.isa-geek.com/plone/Members/acoil/programing/double-metaphone
from metaphone import dm as double_metaphone
# get the Redis connection
from jellybean.core import redis
import models
# Common words that carry no search value and are therefore never indexed
STOP_WORDS = (
    "the",
    "of",
    "to",
    "and",
    "a",
    "in",
    "is",
    "it",
    "you",
    "that",
)
# Words with fewer characters than this are left out of the index
MIN_WORD_LENGTH = 3
# Characters treated as punctuation; each one is turned into a space before the text is split into words
PUNCTUATION_CHARS = ".,;:!?@£$%^&*()-–<>[]{}\\|/`~'\""
# Redis key of the per-project collection of metaphones that occur in that project
REDIS_KEY_METAPHONES = "project_id:%(project_id)d:fulltext_search:metaphones"
# Redis key of the collection of item IDs that contain the given metaphone within the given project
REDIS_KEY_METAPHONE = "project_id:%(project_id)d:fulltext_search:metaphone:%(metaphone)s"
class FullTextIndex(object):
    """Full-text index of project items, stored in Redis as sets of metaphones.

    Every indexed word is reduced to its double-metaphone code(s).  For each
    code, the item's ID is added to a per-project, per-metaphone Redis set
    (``REDIS_KEY_METAPHONE``), and the code itself is recorded in a
    per-project registry set (``REDIS_KEY_METAPHONES``) so that
    ``reindex_project`` can find and drop the whole index again.
    """

    def __init__(self):
        # Compiled once per instance: a character class matching any single
        # character of PUNCTUATION_CHARS (escaped so that "]", "^", "-" and
        # "\\" are taken literally inside the class).
        self.punctuation_regex = re.compile(r"[%s]" % re.escape(PUNCTUATION_CHARS))
        super(FullTextIndex, self).__init__()

    def get_words_from_text(self, text):
        """Extract the list of words to index from the given text.

        Punctuation is replaced with spaces, the text is split on whitespace,
        and words shorter than ``MIN_WORD_LENGTH`` or present in
        ``STOP_WORDS`` (compared in lower case) are dropped.  The remaining
        words keep their original case and order; duplicates are kept.

        Returns an empty list when ``text`` is empty or ``None``.
        """
        if not text:
            return []
        text = self.punctuation_regex.sub(" ", text)
        return [
            word
            for word in text.split()
            if len(word) >= MIN_WORD_LENGTH and word.lower() not in STOP_WORDS
        ]

    def index_item(self, item):
        """Extract content from the given item and add it to the index.

        The subject, body, milestone name, type name and tags of the item
        are indexed.
        """
        # TODO: Add item users to the index
        # NOTE(review): assumes every item has a milestone and an iterable of
        # string tags -- confirm against the item model.
        texts = (
            item.subject,
            item.body,
            item.milestone.name,
            item.type_name,
            " ".join(item.tags),
        )
        words = []
        for text in texts:
            words.extend(self.get_words_from_text(text))
        self._index_words(item, words)

    def index_item_content(self, item, content):
        """Index one specific piece of content (a string) for the given item."""
        self._index_words(item, self.get_words_from_text(content))

    def _index_words(self, item, words):
        """Link the item to the metaphone of every word in ``words``."""
        for metaphone in self.get_metaphones(words):
            self._link_item_and_metaphone(item, metaphone)

    def _link_item_and_metaphone(self, item, metaphone):
        """Record in Redis that ``item`` contains ``metaphone``."""
        project_id = item.project_id
        # Add the item to the set of items carrying this metaphone
        redis_key = REDIS_KEY_METAPHONE % {"project_id": project_id, "metaphone": metaphone}
        redis.sadd(redis_key, item.item_id)
        # Make sure we record that this project contains this metaphone, so
        # that reindex_project() can later find the key written above
        redis_key = REDIS_KEY_METAPHONES % {"project_id": project_id}
        redis.sadd(redis_key, metaphone)

    def get_metaphones(self, words):
        """Return the set of double-metaphone codes for a list of words.

        For each word both the primary code and, when there is one, the
        secondary code are included, stripped of surrounding whitespace.
        Codes that are missing or empty after stripping are skipped, so a
        word without any phonetic code (a purely numeric token, for
        instance) is never indexed under an empty metaphone.
        """
        metaphones = set()
        for word in words:
            # double_metaphone() is used as returning a (primary, secondary)
            # pair.  ``unicode`` is the Python 2 builtin this module targets.
            codes = double_metaphone(unicode(word))
            for code in (codes[0], codes[1]):
                code = code.strip() if code else ""
                if code:
                    metaphones.add(code)
        return metaphones

    def reindex_project(self, project_id):
        """Reindex an entire project, removing the existing index for the project.

        Always returns ``True``.
        """
        # Remove all the existing index data
        registry_key = REDIS_KEY_METAPHONES % {"project_id": project_id}
        project_metaphones = redis.smembers(registry_key) or ()
        # Drop the per-metaphone sets before the registry that lists them: if
        # this is interrupted part-way, the registry still names whatever is
        # left, so a later reindex can finish the clean-up instead of leaving
        # orphaned keys behind.
        for project_metaphone in project_metaphones:
            redis.delete(REDIS_KEY_METAPHONE % {"project_id": project_id, "metaphone": project_metaphone})
        redis.delete(registry_key)
        # Now index each item
        project = models.Project(project_id)
        for item in project.items:
            self.index_item(item)
        return True
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment