Skip to content

Instantly share code, notes, and snippets.

@TheSithPadawan
Last active April 15, 2022 03:11
Show Gist options
  • Save TheSithPadawan/a5f966143d59e1a5d76492a7af10077e to your computer and use it in GitHub Desktop.
Save TheSithPadawan/a5f966143d59e1a5d76492a7af10077e to your computer and use it in GitHub Desktop.
CFLT onsite prep -- inverted index phrase search
# data modeling {word: {doc_id: {positions}}}
from collections import defaultdict
def build_index(sentences):
index = defaultdict(lambda: defaultdict(set))
for docid, sentence in enumerate(sentences):
wordlist = sentence.split(' ')
for pos, word in enumerate(wordlist):
index[word][docid].add(pos)
return index
def search_index(index, phrase):
"""
input: list of phrase
output: document ids
"""
wordlist = phrase.split(' ')
result = []
if not wordlist:
return result
if len(wordlist) == 1:
return list(index[wordlist[0]].keys())
docs = index[wordlist[0]]
for docid in docs.keys():
pos = docs[docid]
docfound = True
for p in pos:
count = 0
for i in range(1, len(wordlist)):
if docid not in index[wordlist[i]]:
# check the next document
docfound = False
break
if p + i not in index[wordlist[i]][docid]:
break
# otherwise check off the current word in current doc
count += 1
if not docfound:
break
if count == len(wordlist) - 1: # everyone exists
result.append(docid)
# check for the next doc
break
return result
"""
Example usage
Output:
[0, 1, 2, 3]
[0]
[0, 3]
"""
index = build_index(['cloud computing is great', 'cloud needs end to end', 'cloud is easy computing is not', 'learn about cloud computing', 'on prem service great'])
print (search_index(index, 'cloud'))
print (search_index(index, 'cloud computing is'))
print (search_index(index, 'cloud computing'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment