Skip to content

Instantly share code, notes, and snippets.

@premchalmeti
Created December 11, 2022 13:51
Show Gist options
  • Save premchalmeti/d2e4ab72f2bde462a113d6d8327d88b3 to your computer and use it in GitHub Desktop.
Save premchalmeti/d2e4ab72f2bde462a113d6d8327d88b3 to your computer and use it in GitHub Desktop.
Elasticsearch: custom highlighter implementation for proximity search using a slop query
# Proximity-highlighter demo.
#
# Walks the document word by word and collects the search words as they
# appear. Each time every search word has been seen once, the span running
# from the first collected word to the current word is wrapped in <b>...</b>
# and one full copy of the document containing that single highlighted span
# is appended to `highlights`.
doc = 'The quick brown fox jumps over the lazy dog. The quick brown fox over the lazy dog. The quick brown fox jumps over the lazy dog.'
search_words = ['quick', 'fox', 'over']

# NOTE(review): `distance` (presumably the slop value) and `matched_pair_pos`
# are assigned but never read by the loop below -- confirm whether a distance
# limit was meant to be enforced.
distance = 2
matched_pair_pos = []

start_pos = 0        # index of the first search word in the current span
end_pos = 0          # exclusive end index of the last completed span
tracked_words = []   # search words seen so far in the current span
highlights = []
splitted_words = doc.split(' ')

for cur_pos, word in enumerate(splitted_words):
    if word in search_words and word not in tracked_words:
        if not tracked_words:
            # First search word of a new span: remember where it starts.
            start_pos = cur_pos
        tracked_words.append(word)

    if len(tracked_words) == len(search_words):
        # Every search word has been seen: close the span and start over.
        end_pos = cur_pos + 1
        tracked_words = []
        cur_highlight = " ".join(splitted_words[start_pos:end_pos])
        pre_highlight = " ".join(splitted_words[:start_pos])
        post_highlight = " ".join(splitted_words[end_pos:])
        highlights.append("%s <b>%s</b> %s" %
                          (pre_highlight, cur_highlight, post_highlight))

# Parenthesised single-argument form runs under both Python 2 and Python 3.
print(highlights)
@premchalmeti
Copy link
Author

# Proximity re-highlighter demo.
#
# For every highlight fragment of every field in `default_highlights`, walks
# the fragment word by word and collects the search words as they appear.
# Each time every search word has been seen once, the span from the first
# collected word to the current word is wrapped in `pre_tags`/`post_tags`
# and one full copy of the fragment containing that single highlighted span
# is appended to `proximity_highlights[field]`.
import string

default_highlights = {
    "info.FullContent": [
        "The generated Lorem Ipsum is therefore always free from repetition, injected humour, or non-characteristic words etc."
    ]
}

search_words = ['repetition', 'humour']
# NOTE(review): the original tag literals were lost (the markup inside the
# quotes was stripped, leaving an empty string and an unterminated one).
# <b>/</b> is a placeholder choice -- confirm the intended tags.
pre_tags = "<b>"
post_tags = "</b>"
# NOTE(review): `distance` (presumably the slop value) is assigned but never
# read by the loop below -- confirm whether a limit was meant to be enforced.
distance = 2

proximity_highlights = {}

# `.items()` (rather than `.iteritems()`) works on both Python 2 and 3.
for field, highlights in default_highlights.items():
    proximity_highlights[field] = []
    for highlight in highlights:
        splitted_words = highlight.split(' ')

        start_pos = 0        # index of the first search word in the span
        end_pos = 0          # exclusive end index of the last closed span
        tracked_words = []   # search words seen so far in the current span

        for cur_pos, word in enumerate(splitted_words):
            # Compare without surrounding punctuation: splitting on spaces
            # leaves tokens such as 'repetition,' which would otherwise
            # never equal the search word 'repetition'.
            token = word.strip(string.punctuation)
            if token in search_words and token not in tracked_words:
                if not tracked_words:
                    # First search word of a new span: remember its index.
                    start_pos = cur_pos
                tracked_words.append(token)

            if len(tracked_words) == len(search_words):
                # Every search word has been seen: close the span, restart.
                end_pos = cur_pos + 1
                tracked_words = []
                pre_highlight = " ".join(splitted_words[:start_pos])
                cur_highlight = " ".join(splitted_words[start_pos:end_pos])
                post_highlight = " ".join(splitted_words[end_pos:])
                complete_highlight = """%s %s%s%s %s""" % (
                    pre_highlight, pre_tags, cur_highlight, post_tags, post_highlight
                )
                proximity_highlights[field].append(complete_highlight)

# Parenthesised single-argument form runs under both Python 2 and Python 3.
print(proximity_highlights)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment