Last active
May 14, 2022 01:34
-
-
Save jhidajat/17dece18649477ec3313d32c1bbe968a to your computer and use it in GitHub Desktop.
Search Phrase
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Implement an data structure that takes in a dictionary [doc_id] = doc_content | |
which contains a method search(word) and returns a list of document IDs containing the word. | |
Followup: update the data structure such that it also contains a method search_phrase(phrase) that | |
returns the document ids containing the phrase. | |
""" | |
from collections import defaultdict | |
def _parse_content_to_dict(contents:list[str]) -> dict: | |
document_indexes = defaultdict(lambda: defaultdict(set)) | |
for i, content in enumerate(contents): | |
for wi, word in enumerate(content.split()): | |
document_indexes[word.lower()][i+1].add(wi) | |
return document_indexes | |
def search_phrase(document_indexes:dict, phrase:str) -> list[str]: | |
common_document_ids = None | |
for unprocessed_word in phrase.split(): | |
word = unprocessed_word.lower() | |
if not common_document_ids: | |
if word in document_indexes: | |
common_document_ids = document_indexes[word] | |
else: | |
new_common_document_ids = defaultdict(set) | |
for doc_id in common_document_ids: | |
indices = common_document_ids[doc_id] | |
for i in indices: | |
if i+1 in document_indexes[word][doc_id]: | |
new_common_document_ids[doc_id].add(i+1) | |
common_document_ids = new_common_document_ids | |
return list(common_document_ids.keys()) | |
def test_fn_example(): | |
document_contents = [ | |
"Cloud Computing is booming in the market", | |
"I am going to introduce what is Cloud Monitoring in following paragraphs. I have been working in cloud industry for 10 years.", | |
"Scientist has investigated Venus Monitoring Camera images and try to identify the possibility of bacteria living in cloud tops." | |
] | |
phrase = "cloud monitoring" | |
document_indexes = _parse_content_to_dict(contents=document_contents) | |
res = search_phrase(document_indexes, phrase=phrase) | |
assert 2 in res | |
assert len(res) == 1 | |
def test_fn_example2(): | |
document_contents = [ | |
"cloud computing", | |
"i love cloud Computing", | |
"Cloud is computing" | |
"computing cloud" | |
] | |
phrase = "cloud computing" | |
document_indexes = _parse_content_to_dict(contents=document_contents) | |
res = search_phrase(document_indexes, phrase=phrase) | |
assert 1 in res | |
assert 2 in res | |
assert len(res) == 2 | |
test_fn_example() | |
test_fn_example2() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment