Last active
December 25, 2015 07:28
-
-
Save paulorsbrito/6939028 to your computer and use it in GitHub Desktop.
This custom django-haystack highlighter applies Whoosh's CharsetFilter to both
the keyword and the text, so the keyword 'café' gets highlighted in the text
'have a cup of cafe' as well as in 'drink some café'. As you can see, it is
strongly coupled to the Whoosh backend.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from django.utils.html import strip_tags | |
from haystack.utils import Highlighter | |
from whoosh.analysis import CharsetFilter, RegexTokenizer | |
from whoosh.support.charset import accent_map | |
"""
This custom django-haystack highlighter applies Whoosh's CharsetFilter to both
the keyword and the text, so the keyword 'café' gets highlighted in the text
'have a cup of cafe' as well as in 'drink some café'. As you can see, it is
strongly coupled to the Whoosh backend.

Put this file on the Python path and add the following to settings.py:

    HAYSTACK_CUSTOM_HIGHLIGHTER = 'path.to.lib.CustomHighlighter'

https://gist.github.com/paulorsbrito/6939028
"""
class CustomHighlighter(Highlighter):
    """Accent-insensitive highlighter for django-haystack on the Whoosh backend.

    Both the query words and the text being highlighted are passed through
    Whoosh's ``CharsetFilter(accent_map)``, so a query for ``café`` also
    matches ``cafe`` in the text (and vice versa).

    Passing ``max_length=-1`` (integer or string) disables windowing: the
    whole tag-stripped text is returned with every match highlighted.
    """

    # ``unicode`` only exists on Python 2; fall back to ``str`` on Python 3 so
    # the class works on both.
    try:
        _text_type = unicode
    except NameError:
        _text_type = str

    def __init__(self, query, **kwargs):
        """Fold the query words and detect "full text" mode.

        :param query: the search query, forwarded to ``Highlighter.__init__``.
        :param kwargs: forwarded unchanged to ``Highlighter.__init__``;
            ``max_length`` is additionally inspected here, where ``-1``
            selects full-text mode.
        """
        super(CustomHighlighter, self).__init__(query, **kwargs)
        self.query_words = set(self.filter(word) for word in self.query_words)
        # Accept both -1 and "-1": template tags may hand the value over either
        # as an already-resolved integer or as a raw string.
        self.full_text = (
            'max_length' in kwargs and int(kwargs['max_length']) == -1
        )

    def filter(self, text):
        """Return ``text`` with every token folded through ``accent_map``.

        Tokens are substituted at their own character offsets; everything
        between tokens (whitespace, punctuation) is copied verbatim. A global
        ``text.replace(token, folded)`` is deliberately avoided: folding an
        early token (e.g. ``é``) would rewrite part of a later one
        (``élève`` -> ``elève``), which then no longer matches its own
        replacement and keeps its remaining accents.

        :param text: text to fold (the Whoosh tokenizer expects unicode text).
        :returns: the folded text.
        """
        tokenizer = RegexTokenizer()
        # chars=True makes the tokenizer record startchar/endchar on each token.
        folded_tokens = CharsetFilter(accent_map)(tokenizer(text, chars=True))

        pieces = []
        copied_upto = 0
        for token in folded_tokens:
            # Read the attributes immediately: the token object may be reused
            # by the tokenizer for the next iteration.
            pieces.append(text[copied_upto:token.startchar])
            pieces.append(token.text)
            copied_upto = token.endchar
        pieces.append(text[copied_upto:])
        return ''.join(pieces)

    def highlight(self, text_block):
        """Return ``text_block`` as HTML with the query words highlighted.

        Matches are located on the accent-folded, tag-stripped text, but the
        result is rendered from the tag-stripped *original* text so the reader
        still sees the original characters (``café`` rather than ``cafe``).

        :param text_block: the text (possibly containing markup) to highlight.
        :returns: the highlighted excerpt, or the whole text in full-text mode.
        """
        plain_text = self._text_type(strip_tags(text_block))
        folded_text = self.filter(plain_text)

        # The inherited helpers are expected to search ``self.text_block``, so
        # point it at the folded text while locating matches.
        self.text_block = folded_text
        highlight_locations = self.find_highlightable_words()
        start_offset, end_offset = self.find_window(highlight_locations)

        # Offsets found on the folded text are only valid on the original when
        # folding preserved its length; otherwise keep rendering the folded
        # text so highlights stay aligned.
        if len(plain_text) == len(folded_text):
            self.text_block = plain_text

        if self.full_text:
            start_offset = 0
            end_offset = len(self.text_block)

        return self.render_html(highlight_locations, start_offset, end_offset)

    def render_html(self, highlight_locations=None, start_offset=None, end_offset=None):
        """Render the ``[start_offset:end_offset]`` window of ``self.text_block``.

        :param highlight_locations: mapping of term -> list of offsets into
            ``self.text_block``; ``None`` is treated as "no matches".
        :param start_offset: window start; ``None`` means the start of the text.
        :param end_offset: window end; ``None`` means the end of the text.
        :returns: the window with each match wrapped in ``self.html_tag``,
            prefixed/suffixed with ``...`` when the window is cut short.
        """
        if highlight_locations is None:
            highlight_locations = {}
        if start_offset is None:
            start_offset = 0
        if end_offset is None:
            end_offset = len(self.text_block)

        # Start by chopping the block down to the proper window.
        text = self._text_type(self.text_block[start_offset:end_offset])

        # Invert highlight_locations to a sorted (window-relative location,
        # term) list.
        loc_to_term = sorted(
            (loc - start_offset, term)
            for term, locations in highlight_locations.items()
            for loc in locations
        )

        # Prepare the highlight template.
        if self.css_class:
            hl_start = '<%s class="%s">' % (self.html_tag, self.css_class)
        else:
            hl_start = '<%s>' % (self.html_tag)
        hl_end = '</%s>' % self.html_tag

        pieces = []
        copied_upto = 0
        for cur, term in loc_to_term:
            term_end = cur + len(term)
            # Matches that fall (even partly) outside the window would yield
            # empty or truncated highlight tags; skip them.
            if cur < 0 or term_end > len(text):
                continue
            # Skip matches overlapping one that was already emitted.
            if cur < copied_upto:
                continue
            pieces.append(text[copied_upto:cur])
            # Slice from ``text`` rather than using ``term``: the displayed
            # text can differ in case/accents from the folded term.
            pieces.append(hl_start + text[cur:term_end] + hl_end)
            copied_upto = term_end

        # Don't forget the chunk after the last term.
        pieces.append(text[copied_upto:])
        highlighted_chunk = ''.join(pieces)

        if start_offset > 0:
            highlighted_chunk = '...%s' % highlighted_chunk

        if end_offset < len(self.text_block):
            highlighted_chunk = '%s...' % highlighted_chunk

        return highlighted_chunk
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment