Skip to content

Instantly share code, notes, and snippets.

@karolyi
Last active June 10, 2019 17:09
Show Gist options
  • Save karolyi/02510555e45f9050c945096233458351 to your computer and use it in GitHub Desktop.
Save karolyi/02510555e45f9050c945096233458351 to your computer and use it in GitHub Desktop.
Highlighter of search terms in HTML
import re
from functools import lru_cache
from typing import Optional

from bs4 import BeautifulSoup as bs
from bs4 import Tag
from bs4.element import NavigableString
from django.utils.functional import cached_property
from django.utils.html import escape
from unidecode import unidecode

from . import memoized_method
@lru_cache(maxsize=5000)
def cached_unidecode_char(char: str) -> str:
    """
    Return `char` run through `unidecode` and lowercased.

    Results are memoized (up to 5000 distinct inputs) so repeated
    characters are only translated once.
    """
    transliterated = unidecode(string=char)
    return transliterated.lower()
@lru_cache(maxsize=5000)
def cached_unidecode(string: str) -> str:
    """
    Return `string` run through `unidecode` and lowercased.

    Results are memoized (up to 5000 distinct inputs) so recurring
    words and terms are only translated once.
    """
    transliterated = unidecode(string=string)
    return transliterated.lower()
class HtmlHighlighter(object):
    """
    Highlighter of HTML-based search results.

    The search terms and the words of the document are compared after
    both went through `unidecode` and lowercasing. Every matching part
    of a text node is wrapped between `hl_pre` and `hl_post`.

    The document parsed by the last `highlight()` call is kept on the
    instance as `self.bs`.
    """

    # Whitespace-separated terms shorter than this are ignored.
    term_min_len = 2
    # Default markup emitted before/after every highlighted text part.
    hl_pre = '<span class="search-result-highlight">'
    hl_post = '</span>'

    def __init__(
            self, term: str, hl_pre: Optional[str] = None,
            hl_post: Optional[str] = None):
        """
        Store the search expression and the highlight markup.

        `term` is the raw search expression, it is split on whitespace
        into individual terms later. `hl_pre` and `hl_post` override
        the class-level defaults when they are passed (and truthy).
        """
        self.term = term
        self.hl_pre = hl_pre or self.hl_pre
        self.hl_post = hl_post or self.hl_post

    @cached_property
    def unidecoded_terms(self) -> tuple:
        """
        Return the distinct, unidecoded and lowercased search terms.

        Terms shorter than `term_min_len` are skipped.
        """
        terms = {x for x in self.term.split() if len(x) >= self.term_min_len}
        unidecoded = {cached_unidecode(string=term) for term in terms}
        # An empty string is contained in every word, it would flag
        # every text node as a match without highlighting anything.
        unidecoded.discard('')
        return tuple(unidecoded)

    @memoized_method(maxsize=30)
    def _reduce_overlapping_terms(self, terms: tuple) -> tuple:
        """
        Reduce and return the found search terms that may overlap, to
        the longest overlapping ones.

        A term is dropped when it is contained in any *other* term of
        the passed tuple. The order of the kept terms is preserved.
        """
        if len(terms) < 2:
            # Zero or one element, nothing can overlap
            return terms
        term_set = set(terms)
        return tuple(
            term for term in terms
            if not any(term in other for other in term_set - {term}))

    @memoized_method(maxsize=30)
    def _get_highlighted_words(
            self, word: str, found_unidecoded_terms: tuple) -> str:
        """
        Return `word` as escaped HTML, with every part that matches
        one of `found_unidecoded_terms` wrapped in the highlight
        markup.

        The terms are expected to be unidecoded and lowercased
        already. Matching is done on the unidecoded form of `word`,
        but the original letters are the ones that get emitted.
        """
        # Longest first, so the longest term wins when more than one
        # could start at the same letter. Empty terms are skipped, they
        # would produce zero-length matches.
        terms = sorted(
            {term for term in found_unidecoded_terms if term},
            key=len, reverse=True)
        pieces = [cached_unidecode_char(char=letter) for letter in word]
        unidecoded_word = ''.join(pieces)
        # The unidecoded form of a letter is not necessarily exactly
        # one character long, so keep track of where every original
        # letter starts within `unidecoded_word`.
        offsets = [0]
        for piece in pieces:
            offsets.append(offsets[-1] + len(piece))
        # Map an offset in `unidecoded_word` back to the (exclusive)
        # index in `word` where a match ending at that offset ends.
        end_index = {offset: idx for idx, offset in enumerate(offsets)}
        result = []
        idx = 0
        while idx < len(word):
            start = offsets[idx]
            match_end = None
            for term in terms:
                if not unidecoded_word.startswith(term, start):
                    continue
                # Only accept matches that end on a letter boundary,
                # a fraction of an original letter can't be wrapped.
                match_end = end_index.get(start + len(term))
                if match_end is not None:
                    break
            if match_end is None:
                # Nothing starts here, emit the letter and retry from
                # the very next one so no later match is skipped.
                result.append(escape(text=word[idx]))
                idx += 1
                continue
            matched = escape(text=word[idx:match_end])
            result.append(f'{self.hl_pre}{matched}{self.hl_post}')
            idx = match_end
        return ''.join(result)

    def _replace_text(self, text: NavigableString, result: str):
        """
        Replace the passed `text` node in its document with the nodes
        parsed from the `result` HTML string.
        """
        # Whitespace at the edges is re-attached as plain text nodes
        # instead of being sent through the parser, so it survives
        # regardless of how the parser treats blank text around tags.
        core = result.strip()
        leading = result[:len(result) - len(result.lstrip())]
        trailing = result[len(result.rstrip()):]
        html = bs(markup=f'<body>{core}</body>', features='lxml')
        nodes = list(html.body.children)
        if leading:
            nodes.insert(0, NavigableString(leading))
        if trailing:
            nodes.append(NavigableString(trailing))
        if not nodes:
            return
        # Positional call: newer bs4 releases accept `*args` here and
        # reject the `replace_with=` keyword.
        text.replace_with(nodes[-1])
        for item in nodes[:-1]:
            nodes[-1].insert_before(item)

    def _find_in_text(self, text: NavigableString) -> None:
        """
        Look for the search terms in the words of the passed `text`
        node, and replace the node with its highlighted version when
        anything matched. Nodes without a match are left untouched.
        """
        parts = []
        is_modified = False
        # The capturing group makes re.split() return the whitespace
        # runs too (at the odd indexes), so the original spacing can be
        # restored verbatim.
        for index, chunk in enumerate(re.split(r'(\s+)', text)):
            if index % 2 or not chunk:
                parts.append(chunk)
                continue
            unidecoded_str = cached_unidecode(string=chunk)
            found_unidecoded_terms = tuple(
                term for term in self.unidecoded_terms
                if term in unidecoded_str)
            if not found_unidecoded_terms:
                parts.append(escape(text=chunk))
                continue
            # Match!
            is_modified = True
            found_unidecoded_terms = self._reduce_overlapping_terms(
                terms=found_unidecoded_terms)
            parts.append(self._get_highlighted_words(
                word=chunk, found_unidecoded_terms=found_unidecoded_terms))
        if is_modified:
            self._replace_text(text=text, result=''.join(parts))

    def _find_text(self) -> None:
        """Run the highlighting on every text node of the document."""
        for navigable_str in self.bs.body.find_all(
                string=True):  # type: NavigableString
            # Exact type check on purpose: find_all() also returns
            # other string node types (it can be a comment).
            if type(navigable_str) is not NavigableString:
                continue
            if navigable_str.parent.name in ('script', 'style'):
                # Skip highlighting script and style content
                continue
            self._find_in_text(text=navigable_str)

    def highlight(self, html: str) -> str:
        """
        Highlight the terms in the HTML input and return the result.

        The input is returned unchanged when the parser produces no
        `<body>` element for it.
        """
        self.bs = bs(
            markup=f'<body>{html}</body>', features='lxml')  # type: Tag
        if self.bs.body is None:
            return html
        self._find_text()
        return self.bs.body.decode_contents()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment