paulorsbrito/custom_highlighter.py

## custom_highlighter.py
# -*- coding: utf-8 -*-

from django.utils.html import strip_tags
from haystack.utils import Highlighter
from whoosh.analysis import CharsetFilter, RegexTokenizer
from whoosh.support.charset import accent_map

"""
This custom django-haystack highlighter applies Whoosh's CharsetFilter to both
the keyword and the text, so the keyword 'café' gets highlighted in the text
'have a cup of cafe' as well as in 'drink some café'. As you can see, it is
tightly coupled to the Whoosh backend.

Put this file on the Python path and add the following to settings.py:

HAYSTACK_CUSTOM_HIGHLIGHTER = 'path.to.lib.CustomHighlighter'


https://gist.github.com/paulorsbrito/6939028

"""

class CustomHighlighter(Highlighter):
    """Haystack highlighter that ignores accents when matching query words.

    Both the query words and the text are passed through Whoosh's
    ``CharsetFilter(accent_map)`` before matching, so accented and
    unaccented spellings of a word highlight each other. The rendered
    output is built from the tag-stripped text with its accents intact.

    Passing ``max_length=-1`` (int or string) highlights the whole text
    instead of a window around the matches.
    """

    def __init__(self, query, **kwargs):
        """Set up the base highlighter, then fold accents in the query words.

        :param query: the raw search query string.
        :param kwargs: options forwarded to ``Highlighter.__init__``;
            ``max_length`` is additionally inspected here to enable
            full-text mode.
        """
        super(CustomHighlighter, self).__init__(query, **kwargs)

        # Fold accents in the query words so they compare equal to the
        # accent-folded text that highlight() searches in.
        self.query_words = {self.filter(word) for word in self.query_words}

        # Accept the int -1 as well as the string "-1": the value may come
        # from a template literal or from a context variable.
        self.full_text = str(kwargs.get('max_length')) == "-1"

    def filter(self, text):
        """Return ``text`` with every token replaced by its accent-folded form.

        The text is tokenized with ``RegexTokenizer``; each token is run
        through ``CharsetFilter(accent_map)`` and substituted back into the
        text. Characters outside tokens are left untouched.

        :param text: text to fold.
        :returns: the folded text.
        """
        tokenizer = RegexTokenizer()
        accent_filter = CharsetFilter(accent_map)

        original_tokens = [t.text for t in tokenizer(text)]
        folded_tokens = [t.text for t in accent_filter(tokenizer(text))]

        for original, folded in zip(original_tokens, folded_tokens):
            # Most tokens carry no accent; skip the no-op replace for them.
            if original != folded:
                text = text.replace(original, folded)

        return text

    def highlight(self, text_block):
        """Return ``text_block`` with the query words wrapped in highlight tags.

        Matching is done on an accent-folded copy of the tag-stripped text;
        rendering is done on the tag-stripped text itself so the original
        accents are preserved in the output.

        :param text_block: the text to highlight; markup is stripped first.
        :returns: the highlighted HTML fragment.
        """
        # Tag-free text with accents intact: this is what gets rendered.
        # Offsets are computed on the folded copy of this same string, so
        # both must be derived from the stripped text to stay aligned.
        display_text = strip_tags(text_block)

        # NOTE(review): assumes accent folding maps every character to
        # exactly one character, so offsets in the folded text are valid
        # in display_text -- confirm against whoosh's accent_map.
        self.text_block = self.filter(display_text)

        highlight_locations = self.find_highlightable_words()
        start_offset, end_offset = self.find_window(highlight_locations)

        if self.full_text:
            start_offset = 0
            end_offset = len(display_text)

        # Render against the accented text, not the folded one.
        self.text_block = display_text
        return self.render_html(highlight_locations, start_offset, end_offset)

    def render_html(self, highlight_locations=None, start_offset=None, end_offset=None):
        """Render the ``[start_offset:end_offset]`` window with matches tagged.

        :param highlight_locations: mapping of term -> list of offsets into
            ``self.text_block``; ``None`` is treated as no matches.
        :param start_offset: window start; ``None`` means 0.
        :param end_offset: window end; ``None`` means the end of the text.
        :returns: the window text with each match wrapped in
            ``self.html_tag``, prefixed/suffixed with ``...`` when the
            window does not reach the start/end of the text.
        """
        if start_offset is None:
            start_offset = 0
        if end_offset is None:
            end_offset = len(self.text_block)

        # Start by chopping the block down to the proper window.
        text = self.text_block[start_offset:end_offset]
        window_length = len(text)

        # Invert highlight_locations to a sorted list of
        # (location relative to the window, term) pairs.
        loc_to_term = sorted(
            (loc - start_offset, term)
            for term, locations in (highlight_locations or {}).items()
            for loc in locations
        )

        # Prepare the highlight template
        if self.css_class:
            hl_start = '<%s class="%s">' % (self.html_tag, self.css_class)
        else:
            hl_start = '<%s>' % (self.html_tag)

        hl_end = '</%s>' % self.html_tag

        pieces = []
        copied_up_to = 0

        for cur, term in loc_to_term:
            term_end = cur + len(term)

            # highlight_locations covers the whole text; ignore matches that
            # are not fully inside the window. A negative index would slice
            # from the end of the text and one past the end would emit empty
            # highlight tags.
            if cur < 0 or term_end > window_length:
                continue

            # Skip matches overlapping the one already emitted.
            if cur < copied_up_to:
                continue

            # The text between the previous match and this one, then the
            # match itself as it appears in the text (case and accents may
            # differ from the folded term).
            pieces.append(text[copied_up_to:cur])
            pieces.append(hl_start + text[cur:term_end] + hl_end)
            copied_up_to = term_end

        # Don't forget the chunk after the last term
        pieces.append(text[copied_up_to:])
        highlighted_chunk = "".join(pieces)

        if start_offset > 0:
            highlighted_chunk = '...%s' % highlighted_chunk

        if end_offset < len(self.text_block):
            highlighted_chunk = '%s...' % highlighted_chunk

        return highlighted_chunk
	# -- coding: utf-8 --

	from django.utils.html import strip_tags
	from haystack.utils import Highlighter
	from whoosh.analysis import CharsetFilter, RegexTokenizer
	from whoosh.support.charset import accent_map

	"""
	This custom django-haystack highlighter applies Whoosh's CharsetFilter to both
	the keyword and the text, so the keyword 'café' gets highlighted in the text
	'have a cup of cafe' as well as in 'drink some café'. As you can see, it is
	tightly coupled to the Whoosh backend.

	Put this file on the Python path and add the following to settings.py:

	HAYSTACK_CUSTOM_HIGHLIGHTER = 'path.to.lib.CustomHighlighter'


	https://gist.github.com/paulorsbrito/6939028

	"""

	class CustomHighlighter(Highlighter):
	    """Haystack highlighter that ignores accents when matching query words.

	    Both the query words and the text are passed through Whoosh's
	    ``CharsetFilter(accent_map)`` before matching, so accented and
	    unaccented spellings of a word highlight each other. The rendered
	    output is built from the tag-stripped text with its accents intact.

	    Passing ``max_length=-1`` (int or string) highlights the whole text
	    instead of a window around the matches.
	    """

	    def __init__(self, query, **kwargs):
	        """Set up the base highlighter, then fold accents in the query words.

	        :param query: the raw search query string.
	        :param kwargs: options forwarded to ``Highlighter.__init__``;
	            ``max_length`` is additionally inspected here to enable
	            full-text mode.
	        """
	        super(CustomHighlighter, self).__init__(query, **kwargs)

	        # Fold accents in the query words so they compare equal to the
	        # accent-folded text that highlight() searches in.
	        self.query_words = {self.filter(word) for word in self.query_words}

	        # Accept the int -1 as well as the string "-1": the value may come
	        # from a template literal or from a context variable.
	        self.full_text = str(kwargs.get('max_length')) == "-1"

	    def filter(self, text):
	        """Return ``text`` with every token replaced by its accent-folded form.

	        The text is tokenized with ``RegexTokenizer``; each token is run
	        through ``CharsetFilter(accent_map)`` and substituted back into the
	        text. Characters outside tokens are left untouched.

	        :param text: text to fold.
	        :returns: the folded text.
	        """
	        tokenizer = RegexTokenizer()
	        accent_filter = CharsetFilter(accent_map)

	        original_tokens = [t.text for t in tokenizer(text)]
	        folded_tokens = [t.text for t in accent_filter(tokenizer(text))]

	        for original, folded in zip(original_tokens, folded_tokens):
	            # Most tokens carry no accent; skip the no-op replace for them.
	            if original != folded:
	                text = text.replace(original, folded)

	        return text

	    def highlight(self, text_block):
	        """Return ``text_block`` with the query words wrapped in highlight tags.

	        Matching is done on an accent-folded copy of the tag-stripped text;
	        rendering is done on the tag-stripped text itself so the original
	        accents are preserved in the output.

	        :param text_block: the text to highlight; markup is stripped first.
	        :returns: the highlighted HTML fragment.
	        """
	        # Tag-free text with accents intact: this is what gets rendered.
	        # Offsets are computed on the folded copy of this same string, so
	        # both must be derived from the stripped text to stay aligned.
	        display_text = strip_tags(text_block)

	        # NOTE(review): assumes accent folding maps every character to
	        # exactly one character, so offsets in the folded text are valid
	        # in display_text -- confirm against whoosh's accent_map.
	        self.text_block = self.filter(display_text)

	        highlight_locations = self.find_highlightable_words()
	        start_offset, end_offset = self.find_window(highlight_locations)

	        if self.full_text:
	            start_offset = 0
	            end_offset = len(display_text)

	        # Render against the accented text, not the folded one.
	        self.text_block = display_text
	        return self.render_html(highlight_locations, start_offset, end_offset)

	    def render_html(self, highlight_locations=None, start_offset=None, end_offset=None):
	        """Render the ``[start_offset:end_offset]`` window with matches tagged.

	        :param highlight_locations: mapping of term -> list of offsets into
	            ``self.text_block``; ``None`` is treated as no matches.
	        :param start_offset: window start; ``None`` means 0.
	        :param end_offset: window end; ``None`` means the end of the text.
	        :returns: the window text with each match wrapped in
	            ``self.html_tag``, prefixed/suffixed with ``...`` when the
	            window does not reach the start/end of the text.
	        """
	        if start_offset is None:
	            start_offset = 0
	        if end_offset is None:
	            end_offset = len(self.text_block)

	        # Start by chopping the block down to the proper window.
	        text = self.text_block[start_offset:end_offset]
	        window_length = len(text)

	        # Invert highlight_locations to a sorted list of
	        # (location relative to the window, term) pairs.
	        loc_to_term = sorted(
	            (loc - start_offset, term)
	            for term, locations in (highlight_locations or {}).items()
	            for loc in locations
	        )

	        # Prepare the highlight template
	        if self.css_class:
	            hl_start = '<%s class="%s">' % (self.html_tag, self.css_class)
	        else:
	            hl_start = '<%s>' % (self.html_tag)

	        hl_end = '</%s>' % self.html_tag

	        pieces = []
	        copied_up_to = 0

	        for cur, term in loc_to_term:
	            term_end = cur + len(term)

	            # highlight_locations covers the whole text; ignore matches that
	            # are not fully inside the window. A negative index would slice
	            # from the end of the text and one past the end would emit empty
	            # highlight tags.
	            if cur < 0 or term_end > window_length:
	                continue

	            # Skip matches overlapping the one already emitted.
	            if cur < copied_up_to:
	                continue

	            # The text between the previous match and this one, then the
	            # match itself as it appears in the text (case and accents may
	            # differ from the folded term).
	            pieces.append(text[copied_up_to:cur])
	            pieces.append(hl_start + text[cur:term_end] + hl_end)
	            copied_up_to = term_end

	        # Don't forget the chunk after the last term
	        pieces.append(text[copied_up_to:])
	        highlighted_chunk = "".join(pieces)

	        if start_offset > 0:
	            highlighted_chunk = '...%s' % highlighted_chunk

	        if end_offset < len(self.text_block):
	            highlighted_chunk = '%s...' % highlighted_chunk

	        return highlighted_chunk