Skip to content

Instantly share code, notes, and snippets.

@paulorsbrito
Last active December 25, 2015 07:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save paulorsbrito/6939028 to your computer and use it in GitHub Desktop.
Save paulorsbrito/6939028 to your computer and use it in GitHub Desktop.
This custom django-haystack highlighter applies Whoosh's CharsetFilter to both the keyword and the text, so the keyword 'café' gets highlighted in the text 'have a cup of cafe' as well as in 'drink some café'. As you can see, it is strongly coupled to the Whoosh backend.
# -*- coding: utf-8 -*-
from django.utils.html import strip_tags
from haystack.utils import Highlighter
from whoosh.analysis import CharsetFilter, RegexTokenizer
from whoosh.support.charset import accent_map
"""
This custom django-haystack highlighter applies Whoosh's CharsetFilter to both
the keyword and the text, so the keyword 'café' gets highlighted in the text
'have a cup of cafe' as well as in 'drink some café'. As you can see, it is
strongly coupled to the Whoosh backend.
Put this file on python path and add the following on settings.py:
HAYSTACK_CUSTOM_HIGHLIGHTER = 'path.to.lib.CustomHighlighter'
https://gist.github.com/paulorsbrito/6939028
"""
class CustomHighlighter(Highlighter):
    """Accent-insensitive highlighter for django-haystack on a Whoosh backend.

    Query words and the text to highlight are both folded with Whoosh's
    ``CharsetFilter(accent_map)`` before matching, so a query for 'café'
    matches 'cafe' and 'café' alike. Passing ``max_length`` as ``-1`` (int or
    string) disables windowing: the whole tag-stripped text is rendered.
    """

    def __init__(self, query, **kwargs):
        """Fold the query words and record whether full-text mode is on.

        ``query`` and ``kwargs`` are forwarded untouched to ``Highlighter``.
        """
        super(CustomHighlighter, self).__init__(query, **kwargs)
        self.query_words = {self.filter(word) for word in self.query_words}
        # Accept both -1 and "-1": the value may arrive as a number or as a
        # string depending on how the caller spelled it.
        self.full_text = kwargs.get('max_length') in (-1, "-1")

    def filter(self, text):
        """Return ``text`` with every word token folded through ``accent_map``.

        Characters outside the tokens found by ``RegexTokenizer`` (spaces,
        punctuation) are copied through unchanged.
        """
        tokens = CharsetFilter(accent_map)(RegexTokenizer()(text, chars=True))
        parts = []
        cursor = 0
        # Splice by character offsets instead of str.replace(): with replace,
        # a token that is a substring of a longer token rewrites part of it,
        # the longer token then no longer matches, and its remaining accents
        # survive (e.g. "éa éaé" would become "ea eaé").
        for token in tokens:
            parts.append(text[cursor:token.startchar])
            parts.append(token.text)
            cursor = token.endchar
        parts.append(text[cursor:])
        return "".join(parts)

    def highlight(self, text_block):
        """Return ``text_block`` as highlighted HTML (tags stripped first).

        Matching runs on the folded text; rendering uses the unfolded text so
        the reader still sees the original accents.
        """
        plain_text = strip_tags(text_block)
        folded_text = self.filter(plain_text)

        # The inherited search helpers read self.text_block, so they must see
        # the folded text to match the folded query words.
        self.text_block = folded_text
        highlight_locations = self.find_highlightable_words()
        start_offset, end_offset = self.find_window(highlight_locations)

        # Offsets were computed on the folded text. They are only valid on the
        # unfolded text when folding kept every character position, so fall
        # back to rendering the folded text if the lengths differ.
        if len(folded_text) == len(plain_text):
            self.text_block = plain_text

        if self.full_text:
            start_offset = 0
            end_offset = len(self.text_block)

        return self.render_html(highlight_locations, start_offset, end_offset)

    def render_html(self, highlight_locations=None, start_offset=None, end_offset=None):
        """Render the ``[start_offset:end_offset]`` window with matches wrapped.

        ``highlight_locations`` maps each term to the list of offsets (into
        ``self.text_block``) where it was found. Unlike a literal comparison,
        the text at a location is not required to equal the term, because the
        term is accent-folded while the rendered text may not be.
        Returns the HTML string, with '...' added on each truncated side.
        """
        if highlight_locations is None:
            highlight_locations = {}
        full_length = len(self.text_block)
        if start_offset is None:
            start_offset = 0
        if end_offset is None:
            end_offset = full_length

        # Start by chopping the block down to the proper window.
        text = self.text_block[start_offset:end_offset]

        # Invert highlight_locations into window-relative (location, term)
        # pairs, ordered by location.
        loc_to_term = sorted(
            (loc - start_offset, term)
            for term, locations in highlight_locations.items()
            for loc in locations
        )

        # Prepare the highlight template
        if self.css_class:
            hl_start = '<%s class="%s">' % (self.html_tag, self.css_class)
        else:
            hl_start = '<%s>' % (self.html_tag)
        hl_end = '</%s>' % self.html_tag

        pieces = []
        copied_to = 0  # end of the last stretch of text already emitted
        for cur, term in loc_to_term:
            # Skip matches that start before the window or overlap the
            # previous highlight (cur < copied_to), and matches that start
            # past the window end, which would only yield empty tags.
            if cur < copied_to or cur >= len(text):
                continue
            # This can differ from term in case and in accents.
            actual_term = text[cur:cur + len(term)]
            pieces.append(text[copied_to:cur])
            pieces.append(hl_start + actual_term + hl_end)
            copied_to = cur + len(actual_term)

        # Don't forget the chunk after the last term
        pieces.append(text[copied_to:])
        highlighted_chunk = "".join(pieces)

        if start_offset > 0:
            highlighted_chunk = '...%s' % highlighted_chunk
        if end_offset < full_length:
            highlighted_chunk = '%s...' % highlighted_chunk
        return highlighted_chunk
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment