Skip to content

Instantly share code, notes, and snippets.

@jhidajat
Last active May 14, 2022 01:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jhidajat/17dece18649477ec3313d32c1bbe968a to your computer and use it in GitHub Desktop.
Save jhidajat/17dece18649477ec3313d32c1bbe968a to your computer and use it in GitHub Desktop.
Search Phrase
"""
Implement an data structure that takes in a dictionary [doc_id] = doc_content
which contains a method search(word) and returns a list of document IDs containing the word.
Followup: update the data structure such that it also contains a method search_phrase(phrase) that
returns the document ids containing the phrase.
"""
from collections import defaultdict
def _parse_content_to_dict(contents:list[str]) -> dict:
document_indexes = defaultdict(lambda: defaultdict(set))
for i, content in enumerate(contents):
for wi, word in enumerate(content.split()):
document_indexes[word.lower()][i+1].add(wi)
return document_indexes
def search_phrase(document_indexes:dict, phrase:str) -> list[str]:
common_document_ids = None
for unprocessed_word in phrase.split():
word = unprocessed_word.lower()
if not common_document_ids:
if word in document_indexes:
common_document_ids = document_indexes[word]
else:
new_common_document_ids = defaultdict(set)
for doc_id in common_document_ids:
indices = common_document_ids[doc_id]
for i in indices:
if i+1 in document_indexes[word][doc_id]:
new_common_document_ids[doc_id].add(i+1)
common_document_ids = new_common_document_ids
return list(common_document_ids.keys())
def test_fn_example():
document_contents = [
"Cloud Computing is booming in the market",
"I am going to introduce what is Cloud Monitoring in following paragraphs. I have been working in cloud industry for 10 years.",
"Scientist has investigated Venus Monitoring Camera images and try to identify the possibility of bacteria living in cloud tops."
]
phrase = "cloud monitoring"
document_indexes = _parse_content_to_dict(contents=document_contents)
res = search_phrase(document_indexes, phrase=phrase)
assert 2 in res
assert len(res) == 1
def test_fn_example2():
document_contents = [
"cloud computing",
"i love cloud Computing",
"Cloud is computing"
"computing cloud"
]
phrase = "cloud computing"
document_indexes = _parse_content_to_dict(contents=document_contents)
res = search_phrase(document_indexes, phrase=phrase)
assert 1 in res
assert 2 in res
assert len(res) == 2
test_fn_example()
test_fn_example2()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment