# karolyi/search_highlight.py

## search_highlight.py
import re
from functools import lru_cache

from bs4 import BeautifulSoup as bs
from bs4 import Tag
from bs4.element import NavigableString
from django.utils.functional import cached_property
from django.utils.html import escape
from unidecode import unidecode

from . import memoized_method

@lru_cache(maxsize=5000)
def cached_unidecode_char(char: str) -> str:
    """
    Return the lowercased `unidecode` transliteration of one character.

    Up to 5000 results are kept in an LRU cache.
    """
    transliterated = unidecode(string=char)
    return transliterated.lower()


@lru_cache(maxsize=5000)
def cached_unidecode(string: str) -> str:
    """
    Return the lowercased `unidecode` transliteration of `string`.

    Up to 5000 results are kept in an LRU cache.
    """
    transliterated = unidecode(string=string)
    return transliterated.lower()


class HtmlHighlighter:
    """
    Highlighter of HTML-based search results.

    The search term is split on whitespace; every part that is at least
    `term_min_len` characters long is unidecoded and then looked up in
    the text nodes of the HTML passed to `highlight`. The matching
    parts of the words get wrapped between `hl_pre` and `hl_post`.
    """
    # Minimum length of a search term part to be taken into account
    term_min_len = 2
    # Default markup inserted before and after a highlighted part
    hl_pre = '<span class="search-result-highlight">'
    hl_post = '</span>'

    def __init__(self, term: str, hl_pre: str = None, hl_post: str = None):
        """
        Store the search `term` and the optional highlight markup.

        Falsy `hl_pre`/`hl_post` values fall back to the class defaults.
        """
        self.term = term
        self.hl_pre = hl_pre or self.hl_pre
        self.hl_post = hl_post or self.hl_post

    @cached_property
    def unidecoded_terms(self) -> tuple:
        """
        Return the unique unidecoded forms of the usable search terms.

        Terms shorter than `term_min_len` are ignored, as are terms
        that unidecode to an empty string, since an empty term would be
        found in every word.
        """
        unidecoded = {
            cached_unidecode(string=term) for term in self.term.split()
            if len(term) >= self.term_min_len}
        unidecoded.discard('')
        return tuple(unidecoded)

    @memoized_method(maxsize=30)
    def _reduce_overlapping_terms(self, terms: tuple) -> tuple:
        """
        Return the `terms` that are not contained in any other term.

        When found search terms overlap, only the longest overlapping
        ones are kept. The order of the kept terms follows `terms`.
        """
        if len(terms) < 2:
            # Zero or one element, nothing can overlap
            return terms
        term_set = set(terms)
        return tuple(
            term for term in terms
            if not any(term in other for other in term_set - {term}))

    @staticmethod
    def _match_term_at(decoded_letters: list, start: int, terms: tuple):
        """
        Return the end index of a term matching from `start`, or `None`.

        `decoded_letters` holds the unidecoded form of every letter of
        the word. The returned index is exclusive, so the matching
        letters are `[start:end]`.
        """
        candidate = ''
        for end in range(start, len(decoded_letters)):
            candidate += decoded_letters[end]
            if candidate in terms:
                return end + 1
            if not any(term.startswith(candidate) for term in terms):
                # Not even a partial match, no term starts here
                return None
        # Ran out of letters while still in a partial match
        return None

    @memoized_method(maxsize=30)
    def _get_highlighted_words(
            self, word: str, found_unidecoded_terms: tuple) -> str:
        """
        Return `word` as escaped HTML with the matched parts highlighted.

        Every stretch of letters whose unidecoded form equals one of
        `found_unidecoded_terms` is wrapped between `hl_pre` and
        `hl_post`; the rest is only escaped. A match is looked for at
        every letter position, so a term is found even when it starts
        inside a failed partial match (e.g. `ab` in `aab`).
        """
        decoded_letters = [
            cached_unidecode_char(char=letter) for letter in word]
        pieces = []
        position = 0
        while position < len(word):
            match_end = self._match_term_at(
                decoded_letters, position, found_unidecoded_terms)
            if match_end is None:
                # No term starts at this letter, emit it as is
                pieces.append(escape(text=word[position]))
                position += 1
                continue
            word_part = word[position:match_end]
            pieces.append(
                f'{self.hl_pre}{escape(text=word_part)}{self.hl_post}')
            position = match_end
        return ''.join(pieces)

    def _replace_text(self, text: NavigableString, result: str):
        """
        Replace the `text` node with the parsed nodes of `result`.

        `result` is an HTML fragment; it is parsed on its own and its
        top-level nodes take the place of `text` in the original tree.
        """
        html = bs(markup=f'<body>{result}</body>', features='lxml')
        children = list(html.body.children)
        last_child = children[-1]
        # Passed positionally: `replace_with` takes `*args` in newer
        # Beautiful Soup releases and rejects the keyword form
        text.replace_with(last_child)
        # Inserting each one right before the last child keeps the order
        for item in children[:-1]:
            last_child.insert_before(item)

    def _find_in_text(self, text: NavigableString) -> None:
        """
        Highlight the search terms within one text node.

        The text is split into words and whitespace runs. Each word is
        unidecoded and checked against the search terms, and the
        matching parts get the highlight HTML. The node is replaced in
        the tree only when something matched; the whitespace between
        the words is kept as it was.
        """
        pieces = []
        is_modified = False
        # The capturing group keeps the whitespace runs in the result
        for chunk in re.split(r'(\s+)', text):
            if not chunk or chunk.isspace():
                pieces.append(chunk)
                continue
            unidecoded_chunk = cached_unidecode(string=chunk)
            found_unidecoded_terms = tuple(
                term for term in self.unidecoded_terms
                if term in unidecoded_chunk)
            if not found_unidecoded_terms:
                pieces.append(escape(text=chunk))
                continue
            # Match!
            is_modified = True
            found_unidecoded_terms = self._reduce_overlapping_terms(
                terms=found_unidecoded_terms)
            pieces.append(self._get_highlighted_words(
                word=chunk, found_unidecoded_terms=found_unidecoded_terms))
        if is_modified:
            self._replace_text(text=text, result=''.join(pieces))

    def _find_text(self):
        """Run the highlighting on the eligible text nodes of the body."""
        for navigable_str in self.bs.body.find_all(
                string=True):  # type: NavigableString
            if type(navigable_str) is not NavigableString:
                # Exact type check on purpose: it can be a comment
                continue
            if navigable_str.parent.name in ('script', 'style'):
                # Skip highlighting script and style content
                continue
            self._find_in_text(text=navigable_str)

    def highlight(self, html: str) -> str:
        """
        Highlight the terms in the HTML input.

        The input is wrapped in a `<body>` element for parsing; the
        returned string is the content of that body after highlighting.
        """
        html = '<body>' + html + '</body>'
        self.bs = bs(markup=html, features='lxml')  # type: Tag
        self._find_text()
        return self.bs.body.decode_contents()
	from functools import lru_cache

	from bs4 import BeautifulSoup as bs
	from bs4 import Tag
	from bs4.element import NavigableString
	from django.utils.html import escape
	from django.utils.functional import cached_property
	from unidecode import unidecode

	from . import memoized_method

	@lru_cache(maxsize=5000)
	def cached_unidecode_char(char: str) -> str:
	    """
	    Return the lowercased `unidecode` transliteration of one character.

	    Up to 5000 results are kept in an LRU cache.
	    """
	    return unidecode(string=char).lower()


	@lru_cache(maxsize=5000)
	def cached_unidecode(string: str) -> str:
	    """
	    Return the lowercased `unidecode` transliteration of `string`.

	    Up to 5000 results are kept in an LRU cache.
	    """
	    return unidecode(string=string).lower()


	class HtmlHighlighter:
	    """
	    Highlighter of HTML-based search results.

	    The search term is split on whitespace; every part that is at least
	    `term_min_len` characters long is unidecoded and then looked up in
	    the text nodes of the HTML passed to `highlight`. The matching
	    parts of the words get wrapped between `hl_pre` and `hl_post`.
	    """
	    # Minimum length of a search term part to be taken into account
	    term_min_len = 2
	    # Default markup inserted before and after a highlighted part
	    hl_pre = '<span class="search-result-highlight">'
	    hl_post = '</span>'

	    def __init__(self, term: str, hl_pre: str = None, hl_post: str = None):
	        """
	        Store the search `term` and the optional highlight markup.

	        Falsy `hl_pre`/`hl_post` values fall back to the class defaults.
	        """
	        self.term = term
	        self.hl_pre = hl_pre or self.hl_pre
	        self.hl_post = hl_post or self.hl_post

	    @cached_property
	    def unidecoded_terms(self) -> tuple:
	        """
	        Return the unique unidecoded forms of the usable search terms.

	        Terms shorter than `term_min_len` are ignored, as are terms
	        that unidecode to an empty string, since an empty term would be
	        found in every word.
	        """
	        unidecoded = {
	            cached_unidecode(string=term) for term in self.term.split()
	            if len(term) >= self.term_min_len}
	        unidecoded.discard('')
	        return tuple(unidecoded)

	    @memoized_method(maxsize=30)
	    def _reduce_overlapping_terms(self, terms: tuple) -> tuple:
	        """
	        Return the `terms` that are not contained in any other term.

	        When found search terms overlap, only the longest overlapping
	        ones are kept. The order of the kept terms follows `terms`.
	        """
	        if len(terms) < 2:
	            # Zero or one element, nothing can overlap
	            return terms
	        term_set = set(terms)
	        return tuple(
	            term for term in terms
	            if not any(term in other for other in term_set - {term}))

	    @staticmethod
	    def _match_term_at(decoded_letters: list, start: int, terms: tuple):
	        """
	        Return the end index of a term matching from `start`, or `None`.

	        `decoded_letters` holds the unidecoded form of every letter of
	        the word. The returned index is exclusive, so the matching
	        letters are `[start:end]`.
	        """
	        candidate = ''
	        for end in range(start, len(decoded_letters)):
	            candidate += decoded_letters[end]
	            if candidate in terms:
	                return end + 1
	            if not any(term.startswith(candidate) for term in terms):
	                # Not even a partial match, no term starts here
	                return None
	        # Ran out of letters while still in a partial match
	        return None

	    @memoized_method(maxsize=30)
	    def _get_highlighted_words(
	            self, word: str, found_unidecoded_terms: tuple) -> str:
	        """
	        Return `word` as escaped HTML with the matched parts highlighted.

	        Every stretch of letters whose unidecoded form equals one of
	        `found_unidecoded_terms` is wrapped between `hl_pre` and
	        `hl_post`; the rest is only escaped. A match is looked for at
	        every letter position, so a term is found even when it starts
	        inside a failed partial match (e.g. `ab` in `aab`).
	        """
	        decoded_letters = [
	            cached_unidecode_char(char=letter) for letter in word]
	        pieces = []
	        position = 0
	        while position < len(word):
	            match_end = self._match_term_at(
	                decoded_letters, position, found_unidecoded_terms)
	            if match_end is None:
	                # No term starts at this letter, emit it as is
	                pieces.append(escape(text=word[position]))
	                position += 1
	                continue
	            word_part = word[position:match_end]
	            pieces.append(
	                f'{self.hl_pre}{escape(text=word_part)}{self.hl_post}')
	            position = match_end
	        return ''.join(pieces)

	    def _replace_text(self, text: NavigableString, result: str):
	        """
	        Replace the `text` node with the parsed nodes of `result`.

	        `result` is an HTML fragment; it is parsed on its own and its
	        top-level nodes take the place of `text` in the original tree.
	        """
	        html = bs(markup=f'<body>{result}</body>', features='lxml')
	        children = list(html.body.children)
	        last_child = children[-1]
	        # Passed positionally: `replace_with` takes `*args` in newer
	        # Beautiful Soup releases and rejects the keyword form
	        text.replace_with(last_child)
	        # Inserting each one right before the last child keeps the order
	        for item in children[:-1]:
	            last_child.insert_before(item)

	    def _find_in_text(self, text: NavigableString) -> None:
	        """
	        Highlight the search terms within one text node.

	        The text is split into words and whitespace runs. Each word is
	        unidecoded and checked against the search terms, and the
	        matching parts get the highlight HTML. The node is replaced in
	        the tree only when something matched; the whitespace between
	        the words is kept as it was.
	        """
	        pieces = []
	        is_modified = False
	        # The capturing group keeps the whitespace runs in the result
	        for chunk in re.split(r'(\s+)', text):
	            if not chunk or chunk.isspace():
	                pieces.append(chunk)
	                continue
	            unidecoded_chunk = cached_unidecode(string=chunk)
	            found_unidecoded_terms = tuple(
	                term for term in self.unidecoded_terms
	                if term in unidecoded_chunk)
	            if not found_unidecoded_terms:
	                pieces.append(escape(text=chunk))
	                continue
	            # Match!
	            is_modified = True
	            found_unidecoded_terms = self._reduce_overlapping_terms(
	                terms=found_unidecoded_terms)
	            pieces.append(self._get_highlighted_words(
	                word=chunk, found_unidecoded_terms=found_unidecoded_terms))
	        if is_modified:
	            self._replace_text(text=text, result=''.join(pieces))

	    def _find_text(self):
	        """Run the highlighting on the eligible text nodes of the body."""
	        for navigable_str in self.bs.body.find_all(
	                string=True):  # type: NavigableString
	            if type(navigable_str) is not NavigableString:
	                # Exact type check on purpose: it can be a comment
	                continue
	            if navigable_str.parent.name in ('script', 'style'):
	                # Skip highlighting script and style content
	                continue
	            self._find_in_text(text=navigable_str)

	    def highlight(self, html: str) -> str:
	        """
	        Highlight the terms in the HTML input.

	        The input is wrapped in a `<body>` element for parsing; the
	        returned string is the content of that body after highlighting.
	        """
	        html = '<body>' + html + '</body>'
	        self.bs = bs(markup=html, features='lxml')  # type: Tag
	        self._find_text()
	        return self.bs.body.decode_contents()