This report summarizes the work I did during Google Summer of Code 2021, along with the results, scope for improvement, and future work.
It also serves as the final project report and lists all of my contributions.
[] |
# IN: documents, keywords
# Map each keyword to a tuple of the ids of the documents whose content
# contains it. A keyword that matches no document maps to an empty tuple.
inverted_index = {
    term: tuple(doc_id for doc_id, body in documents.items() if term in body)
    for term in keywords
}
# OUT: inverted_index
# IN: documents
# Collect the distinct items found across all documents.
keywords: set[str] = set()
for document in documents.values():
    # Grow the set in place: update() accepts any iterable, so there is no
    # need to build a temporary set and a brand-new union on every pass.
    keywords.update(document)
# OUT: keywords
# IN: string
# Turn every listed punctuation character into a space in one pass (the
# replacement is always a space, so this matches replacing them one by one),
# then lower-case the text before tokenizing it.
punctuation = "!()-[]{};:, <>./?@#$%^&*_~'\"\\"
string = string.translate(str.maketrans(punctuation, " " * len(punctuation)))
string = string.lower()
tokens = word_tokenize(string)
# Load the stop words once instead of once per token, and keep them in a set
# so each membership test is a hash lookup rather than a list scan.
stop_words = set(stopwords.words())
tokens = [token for token in tokens if token not in stop_words]
# OUT: tokens
# IN: events
# Build a mapping from event id to message body, keeping only
# "m.room.message" events that carry a "body" inside their "content".
messages: dict[str, str] = {}
for event in events:
    if event["type"] != "m.room.message":
        continue
    if "content" not in event or "body" not in event["content"]:
        continue
    messages[event["event_id"]] = event["content"]["body"]
# OUT: messages
# IN: file_path
# Read the list stored under the "messages" key of the JSON file at
# file_path. When the path does not exist, events stays an empty list.
events: list[dict[str, Any]] = []
if path.exists(file_path):
    with open(file_path, encoding="utf-8") as file:
        # Plain assignment: the name is already annotated above, and a second
        # annotation in the same scope is reported as a redefinition by type
        # checkers.
        events = json.load(file)["messages"]
# OUT: events