jhidajat/search_phrase.py

## search_phrase.py
"""
Implement an data structure that takes in a dictionary [doc_id] = doc_content
which contains a method search(word) and returns a list of document IDs containing the word.

Followup: update the data structure such that it also contains a method search_phrase(phrase) that
            returns the document ids containing the phrase.

"""

from collections import defaultdict

def _parse_content_to_dict(contents:list[str]) -> dict:
    document_indexes = defaultdict(lambda: defaultdict(set))
    for i, content in enumerate(contents):
        for wi, word in enumerate(content.split()):
            document_indexes[word.lower()][i+1].add(wi)
    return document_indexes

def search_phrase(document_indexes:dict, phrase:str) -> list[str]:
    common_document_ids = None
    for unprocessed_word in phrase.split():
        word = unprocessed_word.lower()
        if not common_document_ids:
            if word in document_indexes:
                common_document_ids = document_indexes[word]
        else:
            new_common_document_ids = defaultdict(set)
            for doc_id in common_document_ids:
                indices = common_document_ids[doc_id]
                for i in indices:
                    if i+1 in document_indexes[word][doc_id]:
                        new_common_document_ids[doc_id].add(i+1)
            common_document_ids = new_common_document_ids
    return list(common_document_ids.keys())

def test_fn_example():
    document_contents = [
        "Cloud Computing is booming in the market",
        "I am going to introduce what is Cloud Monitoring in following paragraphs. I have been working in cloud industry for 10 years.",
        "Scientist has investigated Venus Monitoring Camera images and try to identify the possibility of bacteria living in cloud tops."
    ]
    phrase = "cloud monitoring"
    document_indexes = _parse_content_to_dict(contents=document_contents)
    res = search_phrase(document_indexes, phrase=phrase)
    assert 2 in res
    assert len(res) == 1

def test_fn_example2():
    document_contents = [
        "cloud computing",
        "i love cloud Computing",
        "Cloud is computing"
        "computing cloud"
    ]
    phrase = "cloud computing"
    document_indexes = _parse_content_to_dict(contents=document_contents)
    res = search_phrase(document_indexes, phrase=phrase)
    assert 1 in res
    assert 2 in res
    assert len(res) == 2

test_fn_example()
test_fn_example2()
	"""
	Implement an data structure that takes in a dictionary [doc_id] = doc_content
	which contains a method search(word) and returns a list of document IDs containing the word.

	Followup: update the data structure such that it also contains a method search_phrase(phrase) that
	returns the document ids containing the phrase.

	"""

	from collections import defaultdict

	def _parse_content_to_dict(contents:list[str]) -> dict:
	document_indexes = defaultdict(lambda: defaultdict(set))
	for i, content in enumerate(contents):
	for wi, word in enumerate(content.split()):
	document_indexes[word.lower()][i+1].add(wi)
	return document_indexes

	def search_phrase(document_indexes:dict, phrase:str) -> list[str]:
	common_document_ids = None
	for unprocessed_word in phrase.split():
	word = unprocessed_word.lower()
	if not common_document_ids:
	if word in document_indexes:
	common_document_ids = document_indexes[word]
	else:
	new_common_document_ids = defaultdict(set)
	for doc_id in common_document_ids:
	indices = common_document_ids[doc_id]
	for i in indices:
	if i+1 in document_indexes[word][doc_id]:
	new_common_document_ids[doc_id].add(i+1)
	common_document_ids = new_common_document_ids
	return list(common_document_ids.keys())

	def test_fn_example():
	document_contents = [
	"Cloud Computing is booming in the market",
	"I am going to introduce what is Cloud Monitoring in following paragraphs. I have been working in cloud industry for 10 years.",
	"Scientist has investigated Venus Monitoring Camera images and try to identify the possibility of bacteria living in cloud tops."
	]
	phrase = "cloud monitoring"
	document_indexes = _parse_content_to_dict(contents=document_contents)
	res = search_phrase(document_indexes, phrase=phrase)
	assert 2 in res
	assert len(res) == 1

	def test_fn_example2():
	document_contents = [
	"cloud computing",
	"i love cloud Computing",
	"Cloud is computing"
	"computing cloud"
	]
	phrase = "cloud computing"
	document_indexes = _parse_content_to_dict(contents=document_contents)
	res = search_phrase(document_indexes, phrase=phrase)
	assert 1 in res
	assert 2 in res
	assert len(res) == 2

	test_fn_example()
	test_fn_example2()