Last active
April 15, 2022 03:11
-
-
Save TheSithPadawan/a5f966143d59e1a5d76492a7af10077e to your computer and use it in GitHub Desktop.
CFLT onsite prep -- inverted index phrase search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# data modeling {word: {doc_id: {positions}}} | |
from collections import defaultdict | |
def build_index(sentences): | |
index = defaultdict(lambda: defaultdict(set)) | |
for docid, sentence in enumerate(sentences): | |
wordlist = sentence.split(' ') | |
for pos, word in enumerate(wordlist): | |
index[word][docid].add(pos) | |
return index | |
def search_index(index, phrase): | |
""" | |
input: list of phrase | |
output: document ids | |
""" | |
wordlist = phrase.split(' ') | |
result = [] | |
if not wordlist: | |
return result | |
if len(wordlist) == 1: | |
return list(index[wordlist[0]].keys()) | |
docs = index[wordlist[0]] | |
for docid in docs.keys(): | |
pos = docs[docid] | |
docfound = True | |
for p in pos: | |
count = 0 | |
for i in range(1, len(wordlist)): | |
if docid not in index[wordlist[i]]: | |
# check the next document | |
docfound = False | |
break | |
if p + i not in index[wordlist[i]][docid]: | |
break | |
# otherwise check off the current word in current doc | |
count += 1 | |
if not docfound: | |
break | |
if count == len(wordlist) - 1: # everyone exists | |
result.append(docid) | |
# check for the next doc | |
break | |
return result | |
""" | |
Example usage | |
Output: | |
[0, 1, 2, 3] | |
[0] | |
[0, 3] | |
""" | |
index = build_index(['cloud computing is great', 'cloud needs end to end', 'cloud is easy computing is not', 'learn about cloud computing', 'on prem service great']) | |
print (search_index(index, 'cloud')) | |
print (search_index(index, 'cloud computing is')) | |
print (search_index(index, 'cloud computing')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment